├── docs
│   ├── src
│   │   ├── index.md
│   │   └── tutorials
│   │       ├── intro1.png
│   │       └── common.jl
│   ├── .gitignore
│   ├── Project.toml
│   └── make.jl
├── deps
│   ├── .gitignore
│   └── build.jl
├── bors.toml
├── .gitignore
├── src
│   ├── deprecated.jl
│   ├── forwarddiff.jl
│   ├── dnn
│   │   ├── error.jl
│   │   ├── compat.jl
│   │   ├── CUDNN.jl
│   │   ├── nnlib.jl
│   │   ├── helpers.jl
│   │   └── libcudnn_types.jl
│   ├── blas
│   │   ├── util.jl
│   │   ├── error.jl
│   │   ├── README.md
│   │   ├── CUBLAS.jl
│   │   ├── libcublas_types.jl
│   │   └── highlevel.jl
│   ├── fft
│   │   ├── CUFFT.jl
│   │   ├── genericfft.jl
│   │   ├── error.jl
│   │   ├── fft.jl
│   │   ├── libcufft_types.jl
│   │   ├── libcufft.jl
│   │   ├── highlevel.jl
│   │   └── wrappers.jl
│   ├── indexing.jl
│   ├── nnlib.jl
│   ├── rand
│   │   ├── CURAND.jl
│   │   ├── error.jl
│   │   ├── libcurand_types.jl
│   │   ├── highlevel.jl
│   │   └── libcurand.jl
│   ├── sparse
│   │   ├── CUSPARSE.jl
│   │   ├── error.jl
│   │   ├── highlevel.jl
│   │   ├── libcusparse.jl
│   │   ├── libcusparse_types.jl
│   │   └── array.jl
│   ├── subarray.jl
│   ├── solver
│   │   ├── error.jl
│   │   ├── CUSOLVER.jl
│   │   ├── libcusolver_types.jl
│   │   ├── highlevel.jl
│   │   └── libcusolver.jl
│   ├── accumulate.jl
│   ├── broadcast.jl
│   ├── utils.jl
│   ├── gpuarray_interface.jl
│   ├── matmul.jl
│   ├── CuArrays.jl
│   ├── mapreduce.jl
│   └── array.jl
├── test
│   ├── util.jl
│   ├── runtests.jl
│   ├── rand.jl
│   ├── dnn.jl
│   ├── sparse_solver.jl
│   ├── fft.jl
│   └── base.jl
├── .github
│   └── ISSUE_TEMPLATE
│       ├── feature_request.md
│       └── bug_report.md
├── LICENSE.md
├── Project.toml
├── .gitlab-ci.yml
└── README.md

/docs/src/index.md:
--------------------------------------------------------------------------------
# CuArrays.jl
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
src/**/generated/
--------------------------------------------------------------------------------
/deps/.gitignore:
--------------------------------------------------------------------------------
ext.jl.bak
build.log
--------------------------------------------------------------------------------
/bors.toml:
--------------------------------------------------------------------------------
status = [
  "ci/gitlab/%"
]
delete_merged_branches = true
--------------------------------------------------------------------------------
/docs/src/tutorials/intro1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/CuArrays.jl/master/docs/src/tutorials/intro1.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.jl.cov
*.jl.*.cov
*.jl.mem
deps/ext.jl
Manifest.toml
tutorials/build/
docs/build/
--------------------------------------------------------------------------------
/docs/Project.toml:
--------------------------------------------------------------------------------
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
--------------------------------------------------------------------------------
/src/deprecated.jl:
--------------------------------------------------------------------------------
# Deprecated functionality

import Base: @deprecate_binding

@deprecate_binding BLAS CUBLAS
@deprecate_binding FFT CUFFT

@deprecate cuzeros CuArrays.zeros
@deprecate cuones CuArrays.ones
@deprecate cufill CuArrays.fill
--------------------------------------------------------------------------------
/test/util.jl:
--------------------------------------------------------------------------------
macro grab_output(ex)
    quote
        mktemp() do fname, fout
            ret = nothing
            open(fname, "w") do fout
                redirect_stdout(fout) do
                    ret = $(esc(ex))
                end
            end
            ret, read(fname, String)
        end
    end
end
--------------------------------------------------------------------------------
/docs/make.jl:
--------------------------------------------------------------------------------
using Documenter
using Literate

using CuArrays

# generate tutorials
OUTPUT = joinpath(@__DIR__, "src/tutorials/generated")
Literate.markdown(joinpath(@__DIR__, "src/tutorials/intro.jl"), OUTPUT)

makedocs(
    modules = [CuArrays],
    format = Documenter.HTML(prettyurls = get(ENV, "CI", nothing) == "true"),
    sitename = "CuArrays.jl",
    pages = [
        "Home" => "index.md",
        "Tutorials" => [
            "tutorials/generated/intro.md"
        ],
    ],
    doctest = true
)
--------------------------------------------------------------------------------
/docs/src/tutorials/common.jl:
--------------------------------------------------------------------------------
# function to run a Julia script outside of the current environment
function script(code; wrapper=``, args=``)
    if Base.JLOptions().project != C_NULL
        args = `$args --project=$(unsafe_string(Base.JLOptions().project))`
    end
    mktemp() do path, io
        write(io, code)
        flush(io)
        cmd = `$wrapper $(Base.julia_cmd()) $args $path`
        # redirect stderr to stdout to have it picked up by Weave.jl
        run(pipeline(ignorestatus(cmd), stderr=stdout))
    end
    nothing
end
--------------------------------------------------------------------------------
/src/forwarddiff.jl:
--------------------------------------------------------------------------------
# ForwardDiff integration

for f in libdevice
    if haskey(ForwardDiff.DiffRules.DEFINED_DIFFRULES, (:Base,f,1))
        f == :tanh && continue
        diffrule = ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:Base,f,1)]
        ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative,f,1)] =
            (args...) -> replace_device(diffrule(args...))
        eval(ForwardDiff.unary_dual_definition(:CUDAnative, f))
    end
end

ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :tanh, 1)] = x ->
    replace_device(:(1-tanh(x)^2))
eval(ForwardDiff.unary_dual_definition(:CUDAnative, :tanh))
--------------------------------------------------------------------------------
/test/runtests.jl:
--------------------------------------------------------------------------------
using Test

include("util.jl")

using Random
Random.seed!(1)

using CuArrays

using GPUArrays
import GPUArrays: allowscalar, @allowscalar

testf(f, xs...; kwargs...) = GPUArrays.TestSuite.compare(f, CuArray, xs...; kwargs...)
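# Illustrative use: testf(x -> sum(x .^ 2), rand(Float32, 10)) runs the closure
# on both the host Array and its CuArray copy and checks that the results agree.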

allowscalar(false)

@testset "CuArrays" begin

include("base.jl")
include("blas.jl")
include("rand.jl")
include("fft.jl")
include("sparse.jl")
include("solver.jl")
include("sparse_solver.jl")
include("dnn.jl")

CuArrays.pool_status()
CuArrays.pool_timings()

end
--------------------------------------------------------------------------------
/src/dnn/error.jl:
--------------------------------------------------------------------------------
export CUDNNError

struct CUDNNError <: Exception
    code::cudnnStatus_t
    msg::AbstractString
end
Base.show(io::IO, err::CUDNNError) = print(io, "CUDNNError(code $(err.code), $(err.msg))")

function CUDNNError(status::cudnnStatus_t)
    msg = unsafe_string(cudnnGetErrorString(status))
    return CUDNNError(status, msg)
end

macro check(dnn_func)
    quote
        local err::cudnnStatus_t
        err = $(esc(dnn_func))
        if err != CUDNN_STATUS_SUCCESS
            throw(CUDNNError(err))
        end
        err
    end
end
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: enhancement
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
--------------------------------------------------------------------------------
/src/blas/util.jl:
--------------------------------------------------------------------------------
# convert matrix to band storage
function band(A::AbstractMatrix,kl,ku)
    m, n = size(A)
    AB = zeros(eltype(A),kl+ku+1,n)
    for j = 1:n
        for i = max(1,j-ku):min(m,j+kl)
            AB[ku+1-j+i,j] = A[i,j]
        end
    end
    return AB
end

# convert band storage to general matrix
function unband(AB::AbstractMatrix,m,kl,ku)
    bm, n = size(AB)
    A = zeros(eltype(AB),m,n)
    for j = 1:n
        for i = max(1,j-ku):min(m,j+kl)
            A[i,j] = AB[ku+1-j+i,j]
        end
    end
    return A
end

# zero out elements not on matrix bands
function bandex(A::AbstractMatrix,kl,ku)
    m, n = size(A)
    AB = band(A,kl,ku)
    B = unband(AB,m,kl,ku)
    return B
end
--------------------------------------------------------------------------------
/src/fft/CUFFT.jl:
--------------------------------------------------------------------------------
module CUFFT

import CUDAapi

using ..CuArrays
using ..CuArrays: libcufft, configured

import AbstractFFTs: plan_fft, plan_fft!, plan_bfft, plan_bfft!,
    plan_rfft, plan_brfft, plan_inv, normalization, fft, bfft, ifft, rfft,
    Plan, ScaledPlan
import Base: show, *, convert, unsafe_convert, size, strides, ndims
import Base.Sys: WORD_SIZE

using LinearAlgebra
import LinearAlgebra: mul!
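# Usage sketch (illustrative): for a CuArray `x`, `fft(x)` or `plan_fft(x) * x`
# runs the transform on the GPU through the AbstractFFTs interface imported above.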

include("libcufft_types.jl")
include("error.jl")

include("libcufft.jl")
include("genericfft.jl")
include("fft.jl")
include("wrappers.jl")
include("highlevel.jl")

version() = VersionNumber(cufftGetProperty(CUDAapi.MAJOR_VERSION),
                          cufftGetProperty(CUDAapi.MINOR_VERSION),
                          cufftGetProperty(CUDAapi.PATCH_LEVEL))

end
--------------------------------------------------------------------------------
/src/dnn/compat.jl:
--------------------------------------------------------------------------------
# Compatibility shims until users upgrade to new NNlib format
function conv!(y::CuArray{T}, x::CuArray{T}, w::CuArray{T}; pad=0, stride=1, flipkernel=0, dilation=1, kwargs...) where {T<:CUDNNFloat}
    cdims = DenseConvDims(x, w; padding=pad, stride=stride, flipkernel=flipkernel, dilation=dilation)
    return conv!(y, x, w, cdims; kwargs...)
end

function ∇conv_filter!(dw::CuArray{T}, dy::CuArray{T}, x::CuArray{T}; pad=0, stride=1, flipkernel=0, dilation=1, kwargs...) where {T<:CUDNNFloat}
    cdims = DenseConvDims(x, dw; padding=pad, stride=stride, flipkernel=flipkernel, dilation=dilation)
    # NOTE!!! This compat shim re-arranges the argument order!
    return ∇conv_filter!(dw, x, dy, cdims; kwargs...)
end

function maxpool!(y::CuArray{T}, x::CuArray{T}, k; pad=map(_->0,k), stride=k) where {T<:CUDNNFloat}
    pdims = PoolDims(x, k; padding=pad, stride=stride)
    return maxpool!(y, x, pdims)
end

function meanpool!(y::CuArray{T}, x::CuArray{T}, k; pad=map(_->0,k), stride=k) where {T<:CUDNNFloat}
    pdims = PoolDims(x, k; padding=pad, stride=stride)
    return meanpool!(y, x, pdims)
end
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
The CuArrays.jl package is licensed under the MIT "Expat" License:

> Copyright (c) 2017: Mike J Innes.
>
> Permission is hereby granted, free of charge, to any person obtaining a copy
> of this software and associated documentation files (the "Software"), to deal
> in the Software without restriction, including without limitation the rights
> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> copies of the Software, and to permit persons to whom the Software is
> furnished to do so, subject to the following conditions:
>
> The above copyright notice and this permission notice shall be included in all
> copies or substantial portions of the Software.
>
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> SOFTWARE.
>
--------------------------------------------------------------------------------
/Project.toml:
--------------------------------------------------------------------------------
name = "CuArrays"
uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
version = "2.0.0"

[deps]
AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"

[extras]
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "FFTW", "ForwardDiff"]

[compat]
julia = "1.0"
CUDAnative = "2.0"
CUDAdrv = "3.0"
CUDAapi = "0.5.3, 0.6, 1.0"
NNlib = "0.6"
GPUArrays = "0.7"
Adapt = "0.4"
--------------------------------------------------------------------------------
/src/indexing.jl:
--------------------------------------------------------------------------------
import GPUArrays: allowscalar, @allowscalar

function _getindex(xs::CuArray{T}, i::Integer) where T
    buf = Array{T}(undef)
    copyto!(buf, 1, xs, i, 1)
    buf[]
end

function _setindex!(xs::CuArray{T}, v::T, i::Integer) where T
    copyto!(xs, i, T[v], 1, 1)
end


## logical indexing

Base.getindex(xs::CuArray, bools::AbstractArray{Bool}) = getindex(xs, CuArray(bools))

function Base.getindex(xs::CuArray{T}, bools::CuArray{Bool}) where {T}
    bools = reshape(bools, prod(size(bools)))
    indices = cumsum(bools)  # unique indices for elements that are true

    n = _getindex(indices, length(indices)) # number that are true
    ys = CuArray{T}(undef, n)

    if n > 0
        num_threads = min(n, 256)
        num_blocks = ceil(Int, length(indices) / num_threads)

        function kernel(ys::CuDeviceArray{T}, xs::CuDeviceArray{T}, bools, indices)
            i = threadIdx().x + (blockIdx().x - 1) * blockDim().x

            if i <= length(xs) && bools[i]
                b = indices[i]  # new position
                ys[b] = xs[i]
            end

            return
        end

        @cuda blocks=num_blocks threads=num_threads kernel(ys, xs, bools, indices)
    end

    return ys
end
--------------------------------------------------------------------------------
/src/nnlib.jl:
--------------------------------------------------------------------------------
using NNlib
import NNlib: conv!, ∇conv_filter!, ∇conv_data!,
    maxpool!, meanpool!, ∇maxpool!, ∇meanpool!,
    softmax, softmax!, ∇softmax!, logsoftmax, logsoftmax!, ∇logsoftmax
using CUDAnative

# Activation functions
@cufunc σ(x) = ifelse(x < -80, zero(x), one(x) / (one(x) + exp(-x)))

@cufunc function logσ(x)
    max_v = max(zero(x), -x)
    z = exp(-max_v) + exp(-x-max_v)
    -(max_v + log(z))
end
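# (The @cufunc macro from src/broadcast.jl defines a CUDAnative-compatible version
# of each of these activations, e.g. cuσ, and registers it so that broadcasting
# over a CuArray, like σ.(xs), picks up the GPU version. The x < -80 cutoff in σ
# guards against Float32 overflow of exp(-x) for very negative inputs.)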

@cufunc elu(x, α = one(x)) =
    ifelse(x ≥ 0, x/1, α * (exp(x) - one(x)))

@cufunc swish(x) = x * σ(x)

@cufunc function gelu(x)
    λ = oftype(x/1, √(2/π))
    α = oftype(x/1, 0.044715)
    h = oftype(x/1, 0.5)
    h * x * (one(x) + tanh(λ * (x + α * x^3)))
end

@cufunc function selu(x)
    λ = oftype(x/1, 1.0507009873554804934193349852946)
    α = oftype(x/1, 1.6732632423543772848170429916717)
    λ * ifelse(x > 0, x/1, α * (exp(x) - 1))
end

@cufunc softplus(x) = log1p(exp(x))

if !@isdefined CUDNN
    function conv!(y::CuArray, x::CuArray, w::CuArray; kw...)
        error("CUDNN is not installed.")
    end
    function softmax!(out::CuVecOrMat, xs::CuVecOrMat)
        error("CUDNN is not installed.")
    end
    function logsoftmax!(out::CuVecOrMat, xs::CuVecOrMat)
        error("CUDNN is not installed.")
    end
end
--------------------------------------------------------------------------------
/src/dnn/CUDNN.jl:
--------------------------------------------------------------------------------
module CUDNN

import CUDAapi

import CUDAdrv: CUDAdrv, CuContext, CuPtr, CU_NULL

using ..CuArrays
using ..CuArrays: libcudnn, active_context, configured, unsafe_free!
using ..CuArrays: CuVecOrMat, CuVector
using NNlib
import NNlib: conv!, ∇conv_filter!, ∇conv_data!, stride, dilation, flipkernel,
    maxpool!, meanpool!, ∇maxpool!, ∇meanpool!, spatial_dims, padding, kernel_size,
    softmax, softmax!, ∇softmax!, logsoftmax, logsoftmax!, ∇logsoftmax
using CUDAnative
include("libcudnn_types.jl")
include("error.jl")

const _handles = Dict{CuContext,cudnnHandle_t}()
const _handle = Ref{cudnnHandle_t}(C_NULL)

function handle()
    if _handle[] == C_NULL
        @assert isassigned(active_context) # some other call should have initialized CUDA
        _handle[] = get!(_handles, active_context[]) do
            context = active_context[]
            handle = cudnnCreate()
            atexit(()->CUDAdrv.isvalid(context) && cudnnDestroy(handle))
            handle
        end
    end

    return _handle[]
end

include("libcudnn.jl")
include("helpers.jl")
include("nnlib.jl")
include("compat.jl")

version() = VersionNumber(cudnnGetProperty(CUDAapi.MAJOR_VERSION),
                          cudnnGetProperty(CUDAapi.MINOR_VERSION),
                          cudnnGetProperty(CUDAapi.PATCH_LEVEL))

end
--------------------------------------------------------------------------------
/src/rand/CURAND.jl:
--------------------------------------------------------------------------------
module CURAND

import CUDAdrv: CUDAdrv, CuContext, CuPtr
import CUDAapi

using ..CuArrays
using ..CuArrays: libcurand, active_context

using GPUArrays

using Random

export rand_logn!, rand_poisson!
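# These mirror Random.rand!/randn!: they fill an existing CuArray in place,
# e.g. (illustrative) rand_logn!(CuArray{Float32}(undef, 100)).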

include("libcurand_types.jl")
include("error.jl")

const _generators = Dict{CuContext,RNG}()
const _generator = Ref{Union{Nothing,RNG}}(nothing)

function generator()
    if _generator[] == nothing
        @assert isassigned(active_context) # some other call should have initialized CUDA
        _generator[] = get!(_generators, active_context[]) do
            context = active_context[]
            generator = create_generator()
            # FIXME: crashes
            #atexit(()->CUDAdrv.isvalid(context) && destroy_generator(generator))
            generator
        end
    end

    return _generator[]::RNG
end

include("libcurand.jl")
include("highlevel.jl")

version() = VersionNumber(curandGetProperty(CUDAapi.MAJOR_VERSION),
                          curandGetProperty(CUDAapi.MINOR_VERSION),
                          curandGetProperty(CUDAapi.PATCH_LEVEL))

end

const rand = CURAND.rand
const randn = CURAND.randn
const rand_logn = CURAND.rand_logn
const rand_poisson = CURAND.rand_poisson

@deprecate curand CuArrays.rand
@deprecate curandn CuArrays.randn
--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
variables:
  CI_IMAGE_TAG: 'cuda'
  JULIA_NUM_THREADS: '4'

include:
  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/common.yml'
  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/test_v1.0.yml'
  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/test_v1.1.yml'
  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/test_dev.yml'
  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/coverage_v1.1.yml'
  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/documentation_v1.1.yml'

test:dev:
  allow_failure: true

coverage:
  allow_failure: true
  only:
    - master

pages:
  stage: deploy
  script:
    - mv docs/build public
  artifacts:
    paths:
      - public
  only:
    - master

flux:
  stage: test
  image: "juliagpu/julia:v1.1-cuda"
  script:
    - mkdir $JULIA_DEPOT_PATH # Pkg.jl#325
    - julia -e 'using Pkg;
                Pkg.develop([PackageSpec(path=pwd());
                             [PackageSpec(name=pkg)
                              for pkg in split(get(ENV,"CI_DEV_PKGS",""))]]);
                Pkg.build("CuArrays")'
    - julia -e 'using Pkg;
                Pkg.add("Flux");
                Pkg.test("Flux")'
  allow_failure: true
--------------------------------------------------------------------------------
/src/sparse/CUSPARSE.jl:
--------------------------------------------------------------------------------
module CUSPARSE

import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL
import CUDAapi

using ..CuArrays
using ..CuArrays: libcusparse, active_context, unsafe_free!
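# This module wraps cuSPARSE's storage formats (CSR, CSC, BSR, HYB) as GPU array
# types; see array.jl (included below) for the constructors, which can be used
# to upload host data such as a SparseArrays.SparseMatrixCSC.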

using SparseArrays
using LinearAlgebra

import Base.one
import Base.zero

const SparseChar = Char

export CuSparseMatrixCSC, CuSparseMatrixCSR,
       CuSparseMatrixHYB, CuSparseMatrixBSR,
       CuSparseMatrix, AbstractCuSparseMatrix,
       CuSparseVector

include("libcusparse_types.jl")
include("error.jl")

const _handles = Dict{CuContext,cusparseHandle_t}()
const _handle = Ref{cusparseHandle_t}(C_NULL)

function handle()
    if _handle[] == C_NULL
        @assert isassigned(active_context) # some other call should have initialized CUDA
        _handle[] = get!(_handles, active_context[]) do
            context = active_context[]
            handle = cusparseCreate()
            atexit(()->CUDAdrv.isvalid(context) && cusparseDestroy(handle))
            handle
        end
    end

    return _handle[]
end

include("libcusparse.jl")
include("array.jl")
include("util.jl")
include("wrappers.jl")
include("highlevel.jl")

version() = VersionNumber(cusparseGetProperty(CUDAapi.MAJOR_VERSION),
                          cusparseGetProperty(CUDAapi.MINOR_VERSION),
                          cusparseGetProperty(CUDAapi.PATCH_LEVEL))

end
--------------------------------------------------------------------------------
/src/subarray.jl:
--------------------------------------------------------------------------------
import Base: view

using Base: ScalarIndex, ViewIndex, Slice, @_inline_meta, @boundscheck,
    to_indices, compute_offset1, unsafe_length, _maybe_reshape_parent, index_ndims

struct Contiguous end
struct NonContiguous end

# Detect whether the view is contiguous or not
CuIndexStyle() = Contiguous()
CuIndexStyle(I...) = NonContiguous()
CuIndexStyle(i1::Colon, ::ScalarIndex...) = Contiguous()
CuIndexStyle(i1::AbstractUnitRange, ::ScalarIndex...) = Contiguous()
CuIndexStyle(i1::Colon, I...) = CuIndexStyle(I...)

cuviewlength() = ()
cuviewlength(::Real, I...) = (@_inline_meta; cuviewlength(I...)) # skip scalars
cuviewlength(i1::AbstractUnitRange, I...) = (@_inline_meta; (unsafe_length(i1), cuviewlength(I...)...))
cuviewlength(i1::AbstractUnitRange, ::ScalarIndex...) = (@_inline_meta; (unsafe_length(i1),))

view(A::CuArray, I::Vararg{Any,N}) where {N} = (@_inline_meta; _cuview(A, I, CuIndexStyle(I...)))

function _cuview(A, I, ::Contiguous)
    @_inline_meta
    J = to_indices(A, I)
    @boundscheck checkbounds(A, J...)
    _cuview(_maybe_reshape_parent(A, index_ndims(J...)), J, cuviewlength(J...))
end

# for contiguous views just return a new CuArray
_cuview(A::CuArray{T}, I::NTuple{N,ViewIndex}, dims::NTuple{M,Integer}) where {T,N,M} =
    CuArray{T,M}(A.buf, dims; offset=A.offset + compute_offset1(A, 1, I) * sizeof(T), own=A.own)

# fallback to SubArray when the view is not contiguous
_cuview(A, I, ::NonContiguous) =
    invoke(view, Tuple{AbstractArray, typeof(I).parameters...}, A, I...)
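# Illustrative consequence of the rules above: for A = CuArray(rand(4, 4)),
# view(A, :, 2:3) is contiguous and yields a plain CuArray sharing A's buffer,
# while view(A, 2:3, :) is non-contiguous and falls back to a SubArray wrapper.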
--------------------------------------------------------------------------------
/src/solver/error.jl:
--------------------------------------------------------------------------------
export CUSOLVERError

struct CUSOLVERError <: Exception
    code::cusolverStatus_t
    msg::AbstractString
end
Base.show(io::IO, err::CUSOLVERError) = print(io, "CUSOLVERError(code $(err.code), $(err.msg))")

function CUSOLVERError(code::cusolverStatus_t)
    msg = status_message(code)
    return CUSOLVERError(code, msg)
end

function status_message(status)
    if status == CUSOLVER_STATUS_SUCCESS
        return "the operation completed successfully"
    elseif status == CUSOLVER_STATUS_NOT_INITIALIZED
        return "the library was not initialized"
    elseif status == CUSOLVER_STATUS_ALLOC_FAILED
        return "the resource allocation failed"
    elseif status == CUSOLVER_STATUS_INVALID_VALUE
        return "an invalid value was used as an argument"
    elseif status == CUSOLVER_STATUS_ARCH_MISMATCH
        return "an absent device architectural feature is required"
    elseif status == CUSOLVER_STATUS_EXECUTION_FAILED
        return "the GPU program failed to execute"
    elseif status == CUSOLVER_STATUS_INTERNAL_ERROR
        return "an internal operation failed"
    elseif status == CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED
        return "the matrix type is not supported"
    else
        return "unknown status"
    end
end

macro check(solver_func)
    quote
        local err::cusolverStatus_t
        err = $(esc(solver_func::Expr))
        if err != CUSOLVER_STATUS_SUCCESS
            throw(CUSOLVERError(err))
        end
        err
    end
end
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: bug
assignees: ''

---

**Sanity checks (read this first, then remove this section)**
Make sure you're reporting *a bug*; for general questions, please use Discourse.

If you're dealing with a performance issue, make sure you **disable scalar iteration** (`CuArrays.allowscalar(false)`). Only file an issue if that shows scalar iteration happening within Base or CuArrays, as opposed to your own code.

If you're seeing an error message, **follow the error message instructions**, if any (eg. `inspect code with @device_code_warntype`). If you can't solve the problem using that information, make sure to post it as part of the issue.

If your bug is still valid, please go ahead and fill out the template below.

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
The Minimal Working Example (MWE) for this bug:
```julia
# some code here
```

**Expected behavior**
A clear and concise description of what you expected to happen.

**Build log**
```
# post the output of Pkg.build()
# make sure the error still reproduces after that.
```

**Environment details (please complete this section)**
Details on Julia:
```
# please post the output of:
versioninfo()
```

Julia packages:
 - CuArrays.jl:
 - CUDAnative.jl:
 - ...

CUDA: toolkit and driver version


**Additional context**
Add any other context about the problem here.
--------------------------------------------------------------------------------
/src/sparse/error.jl:
--------------------------------------------------------------------------------
export CUSPARSEError

struct CUSPARSEError <: Exception
    code::cusparseStatus_t
    msg::AbstractString
end
Base.show(io::IO, err::CUSPARSEError) = print(io, "CUSPARSEError(code $(err.code), $(err.msg))")

function CUSPARSEError(code::cusparseStatus_t)
    msg = status_message(code)
    return CUSPARSEError(code, msg)
end


function status_message(status)
    if status == CUSPARSE_STATUS_SUCCESS
        return "cusparse success"
    end
    if status == CUSPARSE_STATUS_NOT_INITIALIZED
        return "cusparse not initialized"
    end
    if status == CUSPARSE_STATUS_ALLOC_FAILED
        return "cusparse allocation failed"
    end
    if status == CUSPARSE_STATUS_INVALID_VALUE
        return "cusparse invalid value"
    end
    if status == CUSPARSE_STATUS_ARCH_MISMATCH
        return "cusparse architecture mismatch"
    end
    if status == CUSPARSE_STATUS_MAPPING_ERROR
        return "cusparse mapping error"
    end
    if status == CUSPARSE_STATUS_EXECUTION_FAILED
        return "cusparse execution failed"
    end
    if status == CUSPARSE_STATUS_INTERNAL_ERROR
        return "cusparse internal error"
    end
    if status == CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED
        return "cusparse matrix type not supported"
    end
    return "unknown status"
end

macro check(sparse_func)
    quote
        local err = $(esc(sparse_func::Expr))
        if err != CUSPARSE_STATUS_SUCCESS
            throw(CUSPARSEError(cusparseStatus_t(err)))
        end
        err
    end
end
--------------------------------------------------------------------------------
/test/rand.jl:
--------------------------------------------------------------------------------
@testset "CURAND" begin

using CuArrays.CURAND

CURAND.seed!()

# in-place
for (f,T) in ((rand!,Float32),
              (randn!,Float32),
              (rand_logn!,Float32),
              (rand_poisson!,Cuint)),
    d in (2, (2,2), (2,2,2))
    A = CuArray{T}(undef, d)
    f(A)
end

# out-of-place, with implicit type
for (f,T) in ((CuArrays.rand,Float32), (CuArrays.randn,Float32),
              (CuArrays.rand_logn,Float32), (CuArrays.rand_poisson,Cuint),
              (rand,Float64), (randn,Float64)),
    args in ((2,), (2, 2))
    A = f(args...)
    @test eltype(A) == T
end

# out-of-place, with type specified
for (f,T) in ((CuArrays.rand,Float32), (CuArrays.randn,Float32), (CuArrays.rand_logn,Float32),
              (CuArrays.rand,Float64), (CuArrays.randn,Float64), (CuArrays.rand_logn,Float64),
              (CuArrays.rand_poisson,Cuint),
              (rand,Float32), (randn,Float32),
              (rand,Float64), (randn,Float64)),
    args in ((T, 2), (T, 2, 2), (T, (2, 2)))
    A = f(args...)
    @test eltype(A) == T
end

# unsupported types that fall back to GPUArrays
for (f,T) in ((CuArrays.rand,Int64),),
    args in ((T, 2), (T, 2, 2), (T, (2, 2)))
    A = f(args...)
    @test eltype(A) == T
end
for (f,T) in ((rand!,Int64),),
    d in (2, (2,2), (2,2,2))
    A = CuArray{T}(undef, d)
    f(A)
end

@test_throws ErrorException randn!(CuArray{Cuint}(undef, 10))
@test_throws ErrorException rand_logn!(CuArray{Cuint}(undef, 10))
@test_throws ErrorException rand_poisson!(CuArray{Float64}(undef, 10))

end
--------------------------------------------------------------------------------
/src/accumulate.jl:
--------------------------------------------------------------------------------
# Implements the Hillis--Steele algorithm using global memory
# See algorithm 1 at https://en.wikipedia.org/wiki/Prefix_sum#Parallel_algorithm

# TODO: features
# - init::Some
# - CuMatrix
# - pairwise

# TODO: performance
# - shared memory / shuffle (see CUDAnative.jl/examples/scan)

function Base._accumulate!(op::Function, vout::CuVector{T}, v::CuVector, dims::Int,
                           init::Nothing) where {T}
    if dims != 1
        return copyto!(vout, v)
    end

    return Base._accumulate!(op, vout, v, nothing, nothing)
end

function Base._accumulate!(op::Function, vout::CuVector{T}, v::CuVector, dims::Nothing,
                           init::Nothing) where {T}
    vin = T.(v)  # convert to vector with eltype T

    Δ = 1        # Δ = 2^d
    n = ceil(Int, log2(length(v)))

    num_threads = 256
    num_blocks = ceil(Int, length(v) / num_threads)

    for d in 0:n # passes through data
        @cuda blocks=num_blocks threads=num_threads _partial_accumulate!(op, vout, vin, Δ)

        vin, vout = vout, vin
        Δ *= 2
    end

    return vin
end

function _partial_accumulate!(op, vout, vin, Δ)
    @inbounds begin
        k = threadIdx().x + (blockIdx().x - 1) * blockDim().x

        if k <= length(vin)
            if k > Δ
                vout[k] = op(vin[k - Δ], vin[k])
            else
                vout[k] = vin[k]
            end
        end
    end

    return
end

Base.accumulate_pairwise!(op, result::CuVector, v::CuVector) = accumulate!(op, result, v)
--------------------------------------------------------------------------------
/src/fft/genericfft.jl:
--------------------------------------------------------------------------------
cufftfloat(x) = _cufftfloat(float(x))
_cufftfloat(::Type{T}) where {T<:cufftReals} = T
_cufftfloat(::Type{Float16}) = Float32
_cufftfloat(::Type{Complex{T}}) where {T} = Complex{_cufftfloat(T)}
_cufftfloat(::Type{T}) where {T} = error("type $T not supported")
_cufftfloat(x::T) where {T} = _cufftfloat(T)(x)

complexfloat(x::CuArray{Complex{<:cufftReals}}) = x
realfloat(x::CuArray{<:cufftReals}) = x

complexfloat(x::CuArray{T}) where {T<:Complex} = copy1(typeof(cufftfloat(zero(T))), x)
complexfloat(x::CuArray{T}) where {T<:Real} = copy1(typeof(complex(cufftfloat(zero(T)))), x)

realfloat(x::CuArray{T}) where {T<:Real} = copy1(typeof(cufftfloat(zero(T))), x)

function copy1(::Type{T}, x) where T
    y = CuArray{T}(undef, map(length, axes(x)))
    #copy!(y, x)
    y .= broadcast(xi->convert(T,xi),x)
end

# promote to a complex floating-point type (out-of-place only),
# so implementations only need Complex{Float} methods
for f in (:fft, :bfft, :ifft)
    pf = Symbol("plan_", f)
    @eval begin
        $f(x::CuArray{<:Real}, region=1:ndims(x)) = $f(complexfloat(x), region)
        $pf(x::CuArray{<:Real}, region) = $pf(complexfloat(x), region)
        $f(x::CuArray{<:Complex{<:Union{Integer,Rational}}}, region=1:ndims(x)) = $f(complexfloat(x), region)
        $pf(x::CuArray{<:Complex{<:Union{Integer,Rational}}}, region) = $pf(complexfloat(x), region)
    end
end
rfft(x::CuArray{<:Union{Integer,Rational}}, region=1:ndims(x)) = rfft(realfloat(x), region)
plan_rfft(x::CuArray{<:Real}, region) = plan_rfft(realfloat(x), region)

*(p::Plan{T}, x::CuArray) where {T} = p * copy1(T, x)
*(p::ScaledPlan, x::CuArray) = rmul!(p.p * x, p.scale)
--------------------------------------------------------------------------------
/src/blas/error.jl:
--------------------------------------------------------------------------------
export CUBLASError

struct CUBLASError <: Exception
    code::cublasStatus_t
    msg::AbstractString
end
Base.show(io::IO, err::CUBLASError) = print(io, "CUBLASError(code $(err.code), $(err.msg))")

function CUBLASError(code::cublasStatus_t)
    msg = status_message(code)
    return CUBLASError(code, msg)
end

function status_message(status)
    if status == CUBLAS_STATUS_SUCCESS
        return "the operation completed successfully"
    elseif status == CUBLAS_STATUS_NOT_INITIALIZED
        return "the library was not initialized"
    elseif status == CUBLAS_STATUS_ALLOC_FAILED
        return "the resource allocation failed"
    elseif status == CUBLAS_STATUS_INVALID_VALUE
        return "an invalid value was used as an argument"
    elseif status == CUBLAS_STATUS_ARCH_MISMATCH
        return "an absent device architectural feature is required"
    elseif status == CUBLAS_STATUS_MAPPING_ERROR
        return "an access to GPU memory space failed"
    elseif status == CUBLAS_STATUS_EXECUTION_FAILED
        return "the GPU program failed to execute"
    elseif status == CUBLAS_STATUS_INTERNAL_ERROR
        return "an internal operation failed"
    elseif status == CUBLAS_STATUS_NOT_SUPPORTED
        return "the requested feature is not supported"
    elseif status == CUBLAS_STATUS_LICENSE_ERROR
        return "error detected trying to check the license"
    else
        return "unknown status"
    end
end

macro check(blas_func)
    quote
        local err::cublasStatus_t
        err = $(esc(blas_func::Expr))
        if err != CUBLAS_STATUS_SUCCESS
            throw(CUBLASError(err))
        end
        err
    end
end
--------------------------------------------------------------------------------
/src/broadcast.jl:
--------------------------------------------------------------------------------
import Base.Broadcast: Broadcasted, Extruded, BroadcastStyle, ArrayStyle

BroadcastStyle(::Type{<:CuArray}) = ArrayStyle{CuArray}()

function Base.similar(bc::Broadcasted{ArrayStyle{CuArray}}, ::Type{T}) where T
    similar(CuArray{T}, axes(bc))
end


# replace base functions with libdevice alternatives
# TODO: do this with Cassette.jl

cufunc(f) = f
cufunc(::Type{T}) where T = (x...) -> T(x...) # broadcasting type ctors isn't GPU compatible

Broadcast.broadcasted(::ArrayStyle{CuArray}, f, args...) =
    Broadcasted{ArrayStyle{CuArray}}(cufunc(f), args, nothing)

libdevice = :[
  cos, cospi, sin, sinpi, tan, acos, asin, atan,
  cosh, sinh, tanh, acosh, asinh, atanh,
  log, log10, log1p, log2, logb, ilogb,
  exp, exp2, exp10, expm1, ldexp,
  erf, erfinv, erfc, erfcinv, erfcx,
  brev, clz, ffs, byte_perm, popc,
  isfinite, isinf, isnan, nearbyint,
  nextafter, signbit, copysign, abs,
  sqrt, rsqrt, cbrt, rcbrt, pow,
  ceil, floor, saturate,
  lgamma, tgamma,
  j0, j1, jn, y0, y1, yn,
  normcdf, normcdfinv, hypot,
  fma, sad, dim, mul24, mul64hi, hadd, rhadd, scalbn].args

for f in libdevice
    isdefined(Base, f) || continue
    @eval cufunc(::typeof(Base.$f)) = CUDAnative.$f
end

using MacroTools

const _cufuncs = copy(libdevice)
cufuncs() = (global _cufuncs; _cufuncs)

function replace_device(ex)
    global _cufuncs
    MacroTools.postwalk(ex) do x
        x in _cufuncs ? :(CuArrays.cufunc($x)) : x
    end
end

macro cufunc(ex)
    global _cufuncs
    def = MacroTools.splitdef(ex)
    f = def[:name]
    def[:name] = Symbol(:cu, f)
    def[:body] = replace_device(def[:body])
    push!(_cufuncs, f)
    quote
        $(esc(MacroTools.combinedef(def)))
        CuArrays.cufunc(::typeof($(esc(f)))) = $(esc(def[:name]))
    end
end
--------------------------------------------------------------------------------
/src/solver/CUSOLVER.jl:
--------------------------------------------------------------------------------
module CUSOLVER

import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL
import CUDAapi

using ..CuArrays
using ..CuArrays: libcusolver, active_context, _getindex, unsafe_free!
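# cuSOLVER exposes separate dense and sparse sub-APIs; one handle of each kind
# is cached per CUDA context (see dense_handle() and sparse_handle() below).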

using LinearAlgebra
using SparseArrays

import Base.one
import Base.zero
import CuArrays.CUSPARSE.CuSparseMatrixCSR
import CuArrays.CUSPARSE.CuSparseMatrixCSC
import CuArrays.CUSPARSE.cusparseMatDescr_t

include("libcusolver_types.jl")
include("error.jl")

const _dense_handles = Dict{CuContext,cusolverDnHandle_t}()
const _dense_handle = Ref{cusolverDnHandle_t}(C_NULL)
const _sparse_handles = Dict{CuContext,cusolverSpHandle_t}()
const _sparse_handle = Ref{cusolverSpHandle_t}(C_NULL)

function dense_handle()
    if _dense_handle[] == C_NULL
        @assert isassigned(active_context) # some other call should have initialized CUDA
        _dense_handle[] = get!(_dense_handles, active_context[]) do
            context = active_context[]
            handle = cusolverDnCreate()
            atexit(()->CUDAdrv.isvalid(context) && cusolverDnDestroy(handle))
            handle
        end
    end
    return _dense_handle[]
end

function sparse_handle()
    if _sparse_handle[] == C_NULL
        @assert isassigned(active_context) # some other call should have initialized CUDA
        _sparse_handle[] = get!(_sparse_handles, active_context[]) do
            context = active_context[]
            handle = cusolverSpCreate()
            atexit(()->CUDAdrv.isvalid(context) && cusolverSpDestroy(handle))
            handle
        end
    end
    return _sparse_handle[]
end

include("libcusolver.jl")
include("sparse.jl")
include("dense.jl")
include("highlevel.jl")

version() = VersionNumber(cusolverGetProperty(CUDAapi.MAJOR_VERSION),
                          cusolverGetProperty(CUDAapi.MINOR_VERSION),
                          cusolverGetProperty(CUDAapi.PATCH_LEVEL))

end
--------------------------------------------------------------------------------
/src/rand/error.jl:
--------------------------------------------------------------------------------
export CURANDError

struct CURANDError <: Exception
    code::curandStatus_t
    msg::AbstractString
end
Base.show(io::IO, err::CURANDError) = print(io, "CURANDError(code $(err.code), $(err.msg))")

function CURANDError(code::curandStatus_t)
    msg = status_message(code)
    return CURANDError(code, msg)
end

function status_message(status)
    if status == CURAND_STATUS_SUCCESS
        return "generator was created successfully"
    elseif status == CURAND_STATUS_VERSION_MISMATCH
        return "Header file and linked library version do not match"
    elseif status == CURAND_STATUS_NOT_INITIALIZED
        return "Generator not initialized"
    elseif status == CURAND_STATUS_ALLOCATION_FAILED
        return "Memory allocation failed"
    elseif status == CURAND_STATUS_TYPE_ERROR
        return "Generator is wrong type"
    elseif status == CURAND_STATUS_OUT_OF_RANGE
        return "Argument out of range"
    elseif status == CURAND_STATUS_LENGTH_NOT_MULTIPLE
        return "Length requested is not a multiple of dimension"
    elseif status == CURAND_STATUS_DOUBLE_PRECISION_REQUIRED
        return "GPU does not have double precision required by MRG32k3a"
    elseif status == CURAND_STATUS_LAUNCH_FAILURE
        return "Kernel launch failure"
    elseif status == CURAND_STATUS_PREEXISTING_FAILURE
        return "Preexisting failure on library entry"
    elseif status == CURAND_STATUS_INITIALIZATION_FAILED
        return "Initialization of CUDA failed"
    elseif status == CURAND_STATUS_ARCH_MISMATCH
        return "Architecture mismatch, GPU does not support requested feature"
    elseif status == CURAND_STATUS_INTERNAL_ERROR
        return "Internal library error"
    else
        return "unknown status"
    end
end

macro check(func)
    quote
        local err::curandStatus_t
        err = $(esc(func::Expr))
        if err != CURAND_STATUS_SUCCESS
            throw(CURANDError(err))
        end
        err
    end
end
--------------------------------------------------------------------------------
/src/blas/README.md:
--------------------------------------------------------------------------------
# CUBLAS implementation progress

The following sections list the CUBLAS functions shown on the CUBLAS
documentation page:

http://docs.nvidia.com/cuda/cublas/index.html

## Level 1 (13 functions)

CUBLAS functions:

* [x] amax
* [x] amin
* [x] asum
* [x] axpy
* [x] copy
* [x] dot, dotc, dotu
* [x] nrm2
* [ ] rot (not implemented in julia blas.jl)
* [ ] rotg (not implemented in julia blas.jl)
* [ ] rotm (not implemented in julia blas.jl)
* [ ] rotmg (not implemented in julia blas.jl)
* [x] scal
* [ ] swap (not implemented in julia blas.jl)

## Level 2

Key:
* `ge`: general
* `gb`: general banded
* `sy`: symmetric
* `sb`: symmetric banded
* `sp`: symmetric packed
* `tr`: triangular
* `tb`: triangular banded
* `tp`: triangular packed
* `he`: hermitian
* `hb`: hermitian banded
* `hp`: hermitian packed

CUBLAS functions:

* [x] gbmv (in julia/blas.jl)
* [x] gemv (in julia/blas.jl)
* [x] ger (in julia/blas.jl)
* [x] sbmv (in julia/blas.jl)
* [ ] spmv
* [ ] spr
* [ ] spr2
* [x] symv (in julia/blas.jl)
* [x] syr (in julia/blas.jl)
* [ ] syr2
* [x] tbmv
* [x] tbsv
* [ ] tpmv
* [ ] tpsv
* [x] trmv (in julia/blas.jl)
* [x] trsv (in julia/blas.jl)
* [x] hemv (in julia/blas.jl)
* [x] hbmv
* [ ] hpmv
* [x] her (in julia/blas.jl)
* [x] her2
* [ ] hpr
* [ ] hpr2

## Level 3

CUBLAS functions:

* [x] gemm (in julia/blas.jl)
* [x] gemmBatched
* [x] symm (in julia/blas.jl)
* [x] syrk (in julia/blas.jl)
* [x] syr2k (in julia/blas.jl)
* [ ] syrkx
* [x] trmm (in julia/blas.jl)
* [x] trsm (in julia/blas.jl)
* [x] trsmBatched
* [x] hemm
* [x] herk (in julia/blas.jl)
* [x] her2k (in julia/blas.jl)
* [ ] herkx

## BLAS-like extensions

* [x] geam
* [x] dgmm
* [x] getrfBatched
* [x] getriBatched
* [x] geqrfBatched
* [x] gelsBatched
* [ ] tpttr
* [ ] trttp
--------------------------------------------------------------------------------
/src/blas/CUBLAS.jl:
--------------------------------------------------------------------------------
module CUBLAS

import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL, devices
import CUDAapi

using ..CuArrays
using ..CuArrays: libcublas, active_context, unsafe_free!
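# Wrapped routines (see README.md above for coverage) follow the Base BLAS
# calling convention, e.g. (illustrative) CUBLAS.gemm!('N', 'N', alpha, A, B,
# beta, C), and are also reached through LinearAlgebra methods like mul!.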
using LinearAlgebra

include("libcublas_types.jl")
include("error.jl")

const _handles = Dict{CuContext,cublasHandle_t}()
const _xt_handles = Dict{CuContext,cublasXtHandle_t}()
const _handle = Ref{cublasHandle_t}(C_NULL)
const _xt_handle = Ref{cublasXtHandle_t}(C_NULL)

function handle()
    if _handle[] == C_NULL
        @assert isassigned(active_context) # some other call should have initialized CUDA
        _handle[] = get!(_handles, active_context[]) do
            context = active_context[]
            handle = cublasCreate_v2()

            # enable tensor math mode if our device supports it, and fast math is enabled
            dev = CUDAdrv.device(context)
            if Base.JLOptions().fast_math == 1 && CUDAdrv.capability(dev) >= v"7.0"
                cublasSetMathMode(CUBLAS_TENSOR_OP_MATH, handle)
            end

            atexit(()->CUDAdrv.isvalid(context) && cublasDestroy_v2(handle))
            handle
        end
    end

    return _handle[]
end

function xt_handle()
    if _xt_handle[] == C_NULL
        @assert isassigned(active_context) # some other call should have initialized CUDA
        _xt_handle[] = get!(_xt_handles, active_context[]) do
            context = active_context[]
            handle = cublasXtCreate()
            devs = convert.(Cint, CUDAdrv.devices())
            cublasXtDeviceSelect(handle, length(devs), devs)
            atexit(()->CUDAdrv.isvalid(context) && cublasXtDestroy(handle))
            handle
        end
    end
    return _xt_handle[]
end

include("libcublas.jl")
include("util.jl")
include("wrappers.jl")
include("highlevel.jl")

version() = VersionNumber(cublasGetProperty(CUDAapi.MAJOR_VERSION),
                          cublasGetProperty(CUDAapi.MINOR_VERSION),
                          cublasGetProperty(CUDAapi.PATCH_LEVEL))

end
--------------------------------------------------------------------------------
/src/utils.jl:
--------------------------------------------------------------------------------
using Base.Cartesian

function cudims(n::Integer)
    threads = min(n, 256)
    ceil(Int, n / threads), threads
end

cudims(a::AbstractArray) = cudims(length(a))

@inline ind2sub_(a::AbstractArray{T,0}, i) where T = ()
@inline ind2sub_(a, i) = Tuple(CartesianIndices(a)[i])

macro cuindex(A)
    quote
        A = $(esc(A))
        i = (blockIdx().x-1) * blockDim().x + threadIdx().x
        i > length(A) && return
        ind2sub_(A, i)
    end
end


@generated function nindex(i::T, ls::NTuple{N,T}) where {N,T}
    na = one(i)
    quote
        Base.@_inline_meta
        $(foldr((n, els) -> :(i ≤ ls[$n] ? ($n, i) : (i -= ls[$n]; $els)), one(i):i(N); init=:($na, $na)))
    end
end

@inline function catindex(dim, I::NTuple{N}, shapes) where N
    @inbounds x, i = nindex(I[dim], getindex.(shapes, dim))
    x, ntuple(n -> n == dim ? i : I[n], Val(N))
end

function growdims(dim, x)
    if ndims(x) >= dim
        x
    else
        reshape(x, size.((x,), 1:dim)...)
    end
end

function _cat(dim, dest, xs...)
    function kernel(dim, dest, xs)
        I = @cuindex dest
        @inbounds n, I′ = catindex(dim, Int.(I), size.(xs))
        @inbounds dest[I...] = xs[n][I′...]
        return
    end
    xs = growdims.(dim, xs)
    blk, thr = cudims(dest)
    @cuda blocks=blk threads=thr kernel(dim, dest, xs)
    return dest
end

function Base.cat_t(dims::Integer, T::Type, x::CuArray, xs::CuArray...)
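    # compute the concatenated shape, allocate the result on the GPU, and fill
    # it with the _cat kernel defined above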
    catdims = Base.dims2cat(dims)
    shape = Base.cat_shape(catdims, (), size.((x, xs...))...)
    dest = Base.cat_similar(x, T, shape)
    _cat(dims, dest, x, xs...)
end

Base.vcat(xs::CuArray...) = cat(xs..., dims=1)
Base.hcat(xs::CuArray...) = cat(xs..., dims=2)


"""
    @sync ex

Run expression `ex` and synchronize the GPU afterwards. This is a CPU-friendly
synchronization, i.e. it performs a blocking synchronization without increasing CPU load. As
such, this operation is preferred over implicit synchronization (e.g. when performing a
memory copy) for high-performance applications.

It is also useful for timing code that executes asynchronously.
"""
macro sync(ex)
    quote
        local e = CuEvent(CUDAdrv.EVENT_BLOCKING_SYNC | CUDAdrv.EVENT_DISABLE_TIMING)
        local ret = $(esc(ex))
        CUDAdrv.record(e)
        CUDAdrv.synchronize(e)
        ret
    end
end
--------------------------------------------------------------------------------
/src/gpuarray_interface.jl:
--------------------------------------------------------------------------------
import GPUArrays

struct CuArrayBackend <: GPUArrays.GPUBackend end
GPUArrays.backend(::Type{<:CuArray}) = CuArrayBackend()


# Abstract GPU interface
struct CuKernelState end

@inline function GPUArrays.LocalMemory(::CuKernelState, ::Type{T}, ::Val{N}, ::Val{id}) where {T, N, id}
    ptr = CUDAnative._shmem(Val(id), T, Val(N))
    CuDeviceArray(N, DevicePtr{T, CUDAnative.AS.Shared}(ptr))
end

GPUArrays.AbstractDeviceArray(A::CUDAnative.CuDeviceArray, shape) = CUDAnative.CuDeviceArray(shape, pointer(A))

@inline GPUArrays.synchronize_threads(::CuKernelState) = CUDAnative.sync_threads()

GPUArrays.blas_module(::CuArray) = CuArrays.CUBLAS
GPUArrays.blasbuffer(x::CuArray) = x

"""
Blocks until all operations are finished on `A`
"""
GPUArrays.synchronize(A::CuArray) =
    CUDAdrv.synchronize()

for (i, sym) in enumerate((:x, :y, :z))
    for (f, fcu) in (
            (:blockidx, :blockIdx),
            (:blockdim, :blockDim),
            (:threadidx, :threadIdx),
            (:griddim, :gridDim)
        )
        fname = Symbol(string(f, '_', sym))
        cufun = Symbol(string(fcu, '_', sym))
        @eval GPUArrays.$fname(::CuKernelState) = CUDAnative.$cufun()
    end
end

# devices() = CUDAdrv.devices()
GPUArrays.device(A::CuArray) = CUDAdrv.device(CUDAdrv.CuCurrentContext())
GPUArrays.is_gpu(dev::CUDAdrv.CuDevice) = true
GPUArrays.name(dev::CUDAdrv.CuDevice) = string("CU ", CUDAdrv.name(dev))
GPUArrays.threads(dev::CUDAdrv.CuDevice) = CUDAdrv.attribute(dev, CUDAdrv.MAX_THREADS_PER_BLOCK)

GPUArrays.blocks(dev::CUDAdrv.CuDevice) =
    (CUDAdrv.attribute(dev, CUDAdrv.MAX_BLOCK_DIM_X),
     CUDAdrv.attribute(dev, CUDAdrv.MAX_BLOCK_DIM_Y),
     CUDAdrv.attribute(dev, CUDAdrv.MAX_BLOCK_DIM_Z))

GPUArrays.free_global_memory(dev::CUDAdrv.CuDevice) = CUDAdrv.Mem.info()[1]
GPUArrays.global_memory(dev::CUDAdrv.CuDevice) = CUDAdrv.totalmem(dev)
GPUArrays.local_memory(dev::CUDAdrv.CuDevice) = CUDAdrv.attribute(dev, CUDAdrv.TOTAL_CONSTANT_MEMORY)

function GPUArrays._gpu_call(::CuArrayBackend, f, A, args::Tuple,
                             blocks_threads::Tuple{T, T}) where {N, T <: NTuple{N, Integer}}
    blk, thr = blocks_threads
    @cuda blocks=blk threads=thr f(CuKernelState(), args...)
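    # the launch is asynchronous; callers synchronize explicitly when they need
    # the results (cf. GPUArrays.synchronize above)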
end

# GPUArrays' reinterpret and reshape implementations use this
GPUArrays.unsafe_reinterpret(::Type{T}, A::CuArray, size::NTuple{N, Integer}) where {T, N} =
    CuArray{T, N}(A.buf, size)
--------------------------------------------------------------------------------
/src/fft/error.jl:
--------------------------------------------------------------------------------
export CUFFTError

struct CUFFTError <: Exception
    code::cufftStatus_t
    msg::AbstractString
end
Base.show(io::IO, err::CUFFTError) = print(io, "CUFFTError(code $(err.code), $(err.msg))")

function CUFFTError(code::cufftStatus_t)
    msg = status_message(code)
    return CUFFTError(code, msg)
end

function status_message(status)
    if status == CUFFT_STATUS_SUCCESS
        return "the operation completed successfully"
    elseif status == CUFFT_STATUS_INVALID_PLAN
        return "cuFFT was passed an invalid plan handle"
    elseif status == CUFFT_STATUS_ALLOC_FAILED
        return "cuFFT failed to allocate GPU or CPU memory"
    elseif status == CUFFT_STATUS_INVALID_TYPE
        return "cuFFT invalid type" # No longer used
    elseif status == CUFFT_STATUS_INVALID_VALUE
        return "User specified an invalid pointer or parameter"
    elseif status == CUFFT_STATUS_INTERNAL_ERROR
        return "Driver or internal cuFFT library error"
    elseif status == CUFFT_STATUS_EXEC_FAILED
        return "Failed to execute an FFT on the GPU"
    elseif status == CUFFT_STATUS_SETUP_FAILED
        return "The cuFFT library failed to initialize"
    elseif status == CUFFT_STATUS_INVALID_SIZE
        return "User specified an invalid transform size"
    elseif status == CUFFT_STATUS_UNALIGNED_DATA
        return "cuFFT unaligned data" # No longer used
    elseif status == CUFFT_STATUS_INCOMPLETE_PARAMETER_LIST
        return "Missing parameters in call"
    elseif status == CUFFT_STATUS_INVALID_DEVICE
        return "Execution of a plan was on different GPU than plan creation"
    elseif status == CUFFT_STATUS_PARSE_ERROR
        return "Internal plan database error"
    elseif status == CUFFT_STATUS_NO_WORKSPACE
        return "No workspace has been provided prior to plan execution"
    elseif status == CUFFT_STATUS_NOT_IMPLEMENTED
        return "Function does not implement functionality for parameters given."
    elseif status == CUFFT_STATUS_LICENSE_ERROR
        return "cuFFT license error" # Used in previous versions.
    elseif status == CUFFT_STATUS_NOT_SUPPORTED
        return "Operation is not supported for parameters given."
49 | else 50 | return "unknown status" 51 | end 52 | end 53 | 54 | macro check(fft_func) 55 | quote 56 | local err::cufftStatus_t 57 | err = $(esc(fft_func::Expr)) 58 | if err != CUFFT_STATUS_SUCCESS 59 | throw(CUFFTError(err)) 60 | end 61 | err 62 | end 63 | end 64 | -------------------------------------------------------------------------------- /src/rand/libcurand_types.jl: -------------------------------------------------------------------------------- 1 | const curandGenerator_t = Ptr{Cvoid} 2 | 3 | mutable struct RNG <: Random.AbstractRNG 4 | ptr::curandGenerator_t 5 | typ::Int 6 | end 7 | 8 | Base.unsafe_convert(::Type{curandGenerator_t}, rng::RNG) = rng.ptr 9 | 10 | 11 | const curandDiscreteDistribution_t = Ptr{Cvoid} 12 | 13 | mutable struct DiscreteDistribution 14 | ptr::curandDiscreteDistribution_t 15 | end 16 | 17 | Base.unsafe_convert(::Type{curandDiscreteDistribution_t}, dist::DiscreteDistribution) = dist.ptr 18 | 19 | 20 | # CURAND status codes 21 | const curandStatus_t = UInt32 22 | const CURAND_STATUS_SUCCESS = 0 23 | const CURAND_STATUS_VERSION_MISMATCH = 100 24 | const CURAND_STATUS_NOT_INITIALIZED = 101 25 | const CURAND_STATUS_ALLOCATION_FAILED = 102 26 | const CURAND_STATUS_TYPE_ERROR = 103 27 | const CURAND_STATUS_OUT_OF_RANGE = 104 28 | const CURAND_STATUS_LENGTH_NOT_MULTIPLE = 105 29 | const CURAND_STATUS_DOUBLE_PRECISION_REQUIRED = 106 30 | const CURAND_STATUS_LAUNCH_FAILURE = 201 31 | const CURAND_STATUS_PREEXISTING_FAILURE = 202 32 | const CURAND_STATUS_INITIALIZATION_FAILED = 203 33 | const CURAND_STATUS_ARCH_MISMATCH = 204 34 | const CURAND_STATUS_INTERNAL_ERROR = 999 35 | 36 | # CURAND RNG types (curandRngType) 37 | const CURAND_RNG_TEST = 0 38 | const CURAND_RNG_PSEUDO_DEFAULT = 100 39 | const CURAND_RNG_PSEUDO_XORWOW = 101 40 | const CURAND_RNG_PSEUDO_MRG32K3A = 121 41 | const CURAND_RNG_PSEUDO_MTGP32 = 141 42 | const CURAND_RNG_PSEUDO_MT19937 = 142 43 | const CURAND_RNG_PSEUDO_PHILOX4_32_10 = 161 44 | const CURAND_RNG_QUASI_DEFAULT = 200 45 | const CURAND_RNG_QUASI_SOBOL32 = 201 46 | const CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 = 202 47 | const CURAND_RNG_QUASI_SOBOL64 = 203 48 | const CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 = 204 49 | 50 | # CURAND ordering of results in memory 51 | const CURAND_ORDERING_PSEUDO_BEST = 100 52 | const CURAND_ORDERING_PSEUDO_DEFAULT = 101 53 | const CURAND_ORDERING_PSEUDO_SEEDED = 102 54 | const CURAND_ORDERING_QUASI_DEFAULT = 201 55 | 56 | # CURAND choice of direction vector set 57 | const CURAND_DIRECTION_VECTORS_32_JOEKUO6 = 101 58 | const CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 = 102 59 | const CURAND_DIRECTION_VECTORS_64_JOEKUO6 = 103 60 | const CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 = 104 61 | 62 | # CURAND method 63 | const CURAND_CHOOSE_BEST = 0 64 | const CURAND_ITR = 1 65 | const CURAND_KNUTH = 2 66 | const CURAND_HITR = 3 67 | const CURAND_M1 = 4 68 | const CURAND_M2 = 5 69 | const CURAND_BINARY_SEARCH = 6 70 | const CURAND_DISCRETE_GAUSS = 7 71 | const CURAND_REJECTION = 8 72 | const CURAND_DEVICE_API = 9 73 | const CURAND_FAST_REJECTION = 10 74 | const CURAND_3RD = 11 75 | const CURAND_DEFINITION = 12 76 | const CURAND_POISSON = 13 77 | -------------------------------------------------------------------------------- /deps/build.jl: -------------------------------------------------------------------------------- 1 | using CUDAapi 2 | 3 | 4 | ## auxiliary routines 5 | 6 | status = 0 7 | function build_warning(reason) 8 | println("$reason.") 9 | global status 10 | status = 1 11 | # NOTE: it's annoying that we 
have to `exit(1)`, but otherwise messages are hidden 12 | end 13 | 14 | function build_error(reason) 15 | println(reason) 16 | exit(1) 17 | end 18 | 19 | 20 | ## main 21 | 22 | config_path = joinpath(@__DIR__, "ext.jl") 23 | const previous_config_path = config_path * ".bak" 24 | 25 | function write_ext(config) 26 | open(config_path, "w") do io 27 | println(io, "# autogenerated file, do not edit") 28 | for (key,val) in config 29 | println(io, "const $key = $(repr(val))") 30 | end 31 | end 32 | end 33 | 34 | function main() 35 | ispath(config_path) && mv(config_path, previous_config_path; force=true) 36 | config = Dict{Symbol,Any}(:configured => false) 37 | write_ext(config) 38 | 39 | 40 | ## discover stuff 41 | 42 | toolkit = find_toolkit() 43 | 44 | # required libraries that are part of the CUDA toolkit 45 | for name in ("cublas", "cusparse", "cusolver", "cufft", "curand") 46 | lib = Symbol("lib$name") 47 | config[lib] = find_cuda_library(name, toolkit) 48 | if config[lib] == nothing 49 | build_error("Could not find library '$name' (it should be part of the CUDA toolkit)") 50 | end 51 | end 52 | 53 | # optional libraries 54 | for name in ("cudnn", ) 55 | lib = Symbol("lib$name") 56 | config[lib] = find_cuda_library(name, toolkit) 57 | if config[lib] == nothing 58 | build_warning("Could not find optional library '$name'") 59 | end 60 | end 61 | 62 | 63 | ## (re)generate ext.jl 64 | 65 | function globals(mod) 66 | all_names = names(mod, all=true) 67 | filter(name-> !any(name .== [nameof(mod), Symbol("#eval"), :eval]), all_names) 68 | end 69 | 70 | if isfile(previous_config_path) 71 | @eval module Previous; include($previous_config_path); end 72 | previous_config = Dict{Symbol,Any}(name => getfield(Previous, name) 73 | for name in globals(Previous)) 74 | 75 | if config == previous_config 76 | mv(previous_config_path, config_path; force=true) 77 | return 78 | end 79 | end 80 | 81 | config[:configured] = true 82 | write_ext(config) 83 | 84 | if status != 0 85 | # we got here, so the status is non-fatal 86 | build_error(""" 87 | 88 | CuArrays.jl has been built successfully, but there were warnings. 
89 | Some functionality may be unavailable.""") 90 | end 91 | end 92 | 93 | main() 94 | -------------------------------------------------------------------------------- /src/solver/libcusolver_types.jl: -------------------------------------------------------------------------------- 1 | import ..CUBLAS: cublasfill, cublasop, cublasside, cublasFillMode_t, cublasOperation_t, cublasSideMode_t 2 | 3 | #enum cusolverStatus_t 4 | #error messages from CUSOLVER 5 | 6 | const cusolverStatus_t = UInt32 7 | const CUSOLVER_STATUS_SUCCESS = 0 8 | const CUSOLVER_STATUS_NOT_INITIALIZED = 1 9 | const CUSOLVER_STATUS_ALLOC_FAILED = 2 10 | const CUSOLVER_STATUS_INVALID_VALUE = 3 11 | const CUSOLVER_STATUS_ARCH_MISMATCH = 4 12 | const CUSOLVER_STATUS_EXECUTION_FAILED = 5 13 | const CUSOLVER_STATUS_INTERNAL_ERROR = 6 14 | const CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 7 15 | 16 | const csrqrInfo_t = Ptr{Nothing} 17 | const gesvdjInfo_t = Ptr{Cvoid} 18 | const syevjInfo_t = Ptr{Cvoid} 19 | 20 | const cusolverEigMode_t = UInt32 21 | const CUSOLVER_EIG_MODE_NOVECTOR = 0 22 | const CUSOLVER_EIG_MODE_VECTOR = 1 23 | 24 | const cusolverEigType_t = UInt32 25 | const CUSOLVER_EIG_TYPE_1 = 1 26 | const CUSOLVER_EIG_TYPE_2 = 2 27 | const CUSOLVER_EIG_TYPE_3 = 3 28 | 29 | # refactorization types 30 | 31 | const cusolverRfNumericBoostReport_t = UInt32 32 | const CUSOLVER_NUMERIC_BOOST_NOT_USED = 0 33 | const CUSOLVER_NUMERIC_BOOST_USED = 1 34 | 35 | const cusolverRfResetValuesFastMode_t = UInt32 36 | const CUSOLVER_RESET_VALUES_FAST_MODE_OFF = 0 37 | const CUSOLVER_RESET_VALUES_FAST_MODE_ON = 1 38 | 39 | const cusolverRfFactorization_t = UInt32 40 | const CUSOLVER_FACTORIZATION_ALG0 = 0 41 | const CUSOLVER_FACTORIZATION_ALG1 = 1 42 | const CUSOLVER_FACTORIZATION_ALG2 = 2 43 | 44 | const cusolverRfTriangularSolve_t = UInt32 45 | const CUSOLVER_TRIANGULAR_SOLVE_ALG0 = 0 46 | const CUSOLVER_TRIANGULAR_SOLVE_ALG1 = 1 47 | const CUSOLVER_TRIANGULAR_SOLVE_ALG2 = 2 48 | const CUSOLVER_TRIANGULAR_SOLVE_ALG3 = 3 49 | 50 | const cusolverRfUnitDiagonal_t = UInt32 51 | const CUSOLVER_UNIT_DIAGONAL_STORED_L = 0 52 | const CUSOLVER_UNIT_DIAGONAL_STORED_U = 1 53 | const CUSOLVER_UNIT_DIAGONAL_ASSUMED_L = 2 54 | const CUSOLVER_UNIT_DIAGONAL_ASSUMED_U = 3 55 | 56 | const cusolverDnContext = Nothing 57 | const cusolverDnHandle_t = Ptr{cusolverDnContext} 58 | const cusolverSpContext = Nothing 59 | const cusolverSpHandle_t = Ptr{cusolverSpContext} 60 | const cusolverRfContext = Nothing 61 | const cusolverRfHandle_t = Ptr{cusolverRfContext} 62 | 63 | #complex numbers 64 | 65 | const cuComplex = Complex{Float32} 66 | const cuDoubleComplex = Complex{Float64} 67 | 68 | const CusolverFloat = Union{Float64,Float32,ComplexF64,ComplexF32} 69 | const CusolverReal = Union{Float64,Float32} 70 | const CusolverComplex = Union{ComplexF64,ComplexF32} 71 | -------------------------------------------------------------------------------- /src/fft/fft.jl: -------------------------------------------------------------------------------- 1 | # K is a flag for forward/backward 2 | # also used as an alias for r2c/c2r 3 | 4 | abstract type CuFFTPlan{T<:cufftNumber, K, inplace} <: Plan{T} end 5 | 6 | mutable struct cCuFFTPlan{T<:cufftNumber,K,inplace,N} <: CuFFTPlan{T,K,inplace} 7 | plan::cufftHandle_t 8 | sz::NTuple{N,Int} # Julia size of input array 9 | osz::NTuple{N,Int} # Julia size of output array 10 | xtype::Int 11 | region::Any 12 | pinv::ScaledPlan # required by AbstractFFT API 13 | 14 | function cCuFFTPlan{T,K,inplace,N}(plan::cufftHandle_t, 
X::CuArray{T,N}, 15 | sizey::Tuple, region, xtype::Integer 16 | ) where {T<:cufftNumber,K,inplace,N} 17 | # maybe enforce consistency of sizey 18 | p = new(plan, size(X), sizey, xtype, region) 19 | finalizer(destroy_plan, p) 20 | p 21 | end 22 | end 23 | 24 | cCuFFTPlan(plan,X,region,xtype::Integer) = cCuFFTPlan(plan,X,size(X),region,xtype) 25 | 26 | mutable struct rCuFFTPlan{T<:cufftNumber,K,inplace,N} <: CuFFTPlan{T,K,inplace} 27 | plan::cufftHandle_t 28 | sz::NTuple{N,Int} # Julia size of input array 29 | osz::NTuple{N,Int} # Julia size of output array 30 | xtype::Int 31 | region::Any 32 | pinv::ScaledPlan # required by AbstractFFT API 33 | 34 | function rCuFFTPlan{T,K,inplace,N}(plan::cufftHandle_t, X::CuArray{T,N}, 35 | sizey::Tuple, region, xtype::Integer 36 | ) where {T<:cufftNumber,K,inplace,N} 37 | # maybe enforce consistency of sizey 38 | p = new(plan, size(X), sizey, xtype, region) 39 | finalizer(destroy_plan, p) 40 | p 41 | end 42 | end 43 | 44 | rCuFFTPlan(plan,X,region,xtype::Integer) = rCuFFTPlan(plan,X,size(X),region,xtype) 45 | 46 | const xtypenames = Dict{cufftType,String}(CUFFT_R2C => "real-to-complex", 47 | CUFFT_C2R => "complex-to-real", 48 | CUFFT_C2C => "complex", 49 | CUFFT_D2Z => "d.p. real-to-complex", 50 | CUFFT_Z2D => "d.p. complex-to-real", 51 | CUFFT_Z2Z => "d.p. complex") 52 | 53 | function showfftdims(io, sz, T) 54 | if isempty(sz) 55 | print(io,"0-dimensional") 56 | elseif length(sz) == 1 57 | print(io, sz[1], "-element") 58 | else 59 | print(io, join(sz, "×")) 60 | end 61 | print(io, " CuArray of ", T) 62 | end 63 | 64 | function show(io::IO, p::CuFFTPlan{T,K,inplace}) where {T,K,inplace} 65 | print(io, inplace ? "CUFFT in-place " : "CUFFT ", 66 | xtypenames[p.xtype], 67 | K == CUFFT_FORWARD ? " forward" : " backward", 68 | " plan for ") 69 | showfftdims(io, p.sz, T) 70 | end 71 | -------------------------------------------------------------------------------- /src/fft/libcufft_types.jl: -------------------------------------------------------------------------------- 1 | # CUFFT API function return values 2 | const cufftStatus_t = UInt32 3 | const CUFFT_STATUS_SUCCESS = 0 # The cuFFT operation was successful 4 | const CUFFT_STATUS_INVALID_PLAN = 1 # cuFFT was passed an invalid plan handle 5 | const CUFFT_STATUS_ALLOC_FAILED = 2 # cuFFT failed to allocate GPU or CPU memory 6 | const CUFFT_STATUS_INVALID_TYPE = 3 # No longer used 7 | const CUFFT_STATUS_INVALID_VALUE = 4 # User specified an invalid pointer or parameter 8 | const CUFFT_STATUS_INTERNAL_ERROR = 5 # Driver or internal cuFFT library error 9 | const CUFFT_STATUS_EXEC_FAILED = 6 # Failed to execute an FFT on the GPU 10 | const CUFFT_STATUS_SETUP_FAILED = 7 # The cuFFT library failed to initialize 11 | const CUFFT_STATUS_INVALID_SIZE = 8 # User specified an invalid transform size 12 | const CUFFT_STATUS_UNALIGNED_DATA = 9 # No longer used 13 | const CUFFT_STATUS_INCOMPLETE_PARAMETER_LIST = 10 # Missing parameters in call 14 | const CUFFT_STATUS_INVALID_DEVICE = 11 # Execution of a plan was on different GPU than plan creation 15 | const CUFFT_STATUS_PARSE_ERROR = 12 # Internal plan database error 16 | const CUFFT_STATUS_NO_WORKSPACE = 13 # No workspace has been provided prior to plan execution 17 | const CUFFT_STATUS_NOT_IMPLEMENTED = 14 # Function does not implement functionality for parameters given. 18 | const CUFFT_STATUS_LICENSE_ERROR = 15 # Used in previous versions. 19 | const CUFFT_STATUS_NOT_SUPPORTED = 16 # Operation is not supported for parameters given. 
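# How these status codes surface to users: every libcufft ccall in this package
# is wrapped in the `@check` macro from fft/error.jl, which turns any
# non-success status into a thrown `CUFFTError`. A minimal sketch of that flow,
# using a hypothetical failing status purely for illustration:
#
#   st = CUFFT_STATUS_ALLOC_FAILED
#   st == CUFFT_STATUS_SUCCESS || throw(CUFFTError(st))
#   # ERROR: CUFFTError(code 2, cuFFT failed to allocate GPU or CPU memory)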
20 |
21 |
22 | const cufftReal = Float32
23 | const cufftDoubleReal = Float64
24 |
25 | const cufftComplex = ComplexF32
26 | const cufftDoubleComplex = ComplexF64
27 |
28 | # CUFFT transform directions
29 | const CUFFT_FORWARD = -1 # Forward FFT
30 | const CUFFT_INVERSE = 1 # Inverse FFT
31 |
32 | # CUFFT supports the following transform types
33 | const cufftType = Cint
34 | const CUFFT_R2C = 0x2a # Real to Complex
35 | const CUFFT_C2R = 0x2c # Complex to Real
36 | const CUFFT_C2C = 0x29 # Complex to Complex
37 | const CUFFT_D2Z = 0x6a # Double to Double-Complex
38 | const CUFFT_Z2D = 0x6c # Double-Complex to Double
39 | const CUFFT_Z2Z = 0x69 # Double-Complex to Double-Complex
40 |
41 | const cufftCompatibility = Cint
42 | const CUFFT_COMPATIBILITY_NATIVE = 0x00
43 | const CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01
44 | const CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC = 0x02
45 | const CUFFT_COMPATIBILITY_FFTW_ALL = 0x03
46 |
47 | const cufftHandle_t = Cint
48 |
49 | const cufftNumber = Union{cufftDoubleReal,cufftReal,cufftDoubleComplex,cufftComplex}
50 | # note trailing s to deconflict w/ header file
51 | const cufftReals = Union{cufftDoubleReal,cufftReal}
52 | const cufftComplexes = Union{cufftDoubleComplex,cufftComplex}
53 | const cufftDouble = Union{cufftDoubleReal,cufftDoubleComplex}
54 | const cufftSingle = Union{cufftReal,cufftComplex}
55 | const cufftTypeDouble = Union{Type{cufftDoubleReal},Type{cufftDoubleComplex}}
56 | const cufftTypeSingle = Union{Type{cufftReal},Type{cufftComplex}}
57 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CuArrays
2 |
3 | [![][docs-latest-img]][docs-latest-url] [![][codecov-img]][codecov-url]
4 |
5 | [codecov-img]: https://codecov.io/gh/JuliaGPU/CuArrays.jl/branch/master/graph/badge.svg
6 | [codecov-url]: https://codecov.io/gh/JuliaGPU/CuArrays.jl
7 |
8 | [docs-latest-img]: https://img.shields.io/badge/docs-latest-blue.svg
9 | [docs-latest-url]: https://juliagpu.gitlab.io/CuArrays.jl/
10 |
11 | CuArrays provides a fully-functional GPU array, which can give significant speedups over
12 | normal arrays without code changes. The arrays are implemented fully in Julia, making the
13 | implementation [elegant and extremely
14 | generic](http://mikeinnes.github.io/2017/08/24/cudanative.html).
15 |
16 | Documentation for this package is sparse, and for many of the array operations you should
17 | refer to the official Julia documentation. The following resources can be useful to get a
18 | better understanding of the characteristics and performance trade-offs that come with GPU
19 | arrays:
20 |
21 | - Introductory tutorial on [GPU programming with Julia](https://juliagpu.gitlab.io/CuArrays.jl/tutorials/generated/intro/)
22 | - Slide deck on [effectively using GPUs with Julia](https://docs.google.com/presentation/d/1l-BuAtyKgoVYakJSijaSqaTL3friESDyTOnU2OLqGoA/)
23 |
24 | ## Installation
25 |
26 | CuArrays should work **out-of-the-box** on Julia 1.0. You only need to have a
27 | proper set-up of CUDA, meaning the rest of the Julia CUDA stack should work
28 | (notably CUDAapi.jl, CUDAdrv.jl and CUDAnative.jl). If you encounter any issues
29 | with CuArrays.jl, please make sure those other packages are working as expected.
30 |
31 | Some parts of CuArrays.jl depend on **optional libraries**, such as
32 | [cuDNN](https://developer.nvidia.com/cudnn). The build process will notify you
33 | about missing dependencies; if it does, inspect the output of `Pkg.build("CuArrays")`
34 | to see whether your installation is complete.
35 |
36 |
37 | ## Features
38 |
39 | ```julia
40 | xs = cu(rand(5, 5))
41 | ys = cu[1, 2, 3]
42 | xs_cpu = collect(xs)
43 | ```
44 |
45 | Because `CuArray` is an `AbstractArray`, it doesn't have much of a learning curve; just use your favourite array ops as usual. The following are supported (on arbitrary numbers of arguments, dimensions, etc.):
46 |
47 | * Conversions and `copyto!` with CPU arrays
48 | * General indexing (`xs[1:2, 5, :]`)
49 | * `permutedims`
50 | * Concatenation (`vcat(x, y)`, `cat(xs, ys, zs; dims=3)`)
51 | * `map`, fused broadcast (`zs .= xs.^2 .+ ys .* 2`)
52 | * `fill!(xs, 0)`
53 | * Reduction over dimensions (`reduce(+, xs, dims=3)`, `sum(x -> x^2, xs, dims=1)`, etc.)
54 | * Reduction to scalar (`reduce(*, xs)`, `sum(xs)`, etc.)
55 | * Various BLAS operations (matrix\*matrix, matrix\*vector)
56 | * FFTs, using the AbstractFFTs API
57 |
58 | We welcome issues or PRs for functionality not on this list.
59 |
60 | Note that some operations not on this list will work, but be slow, due to Base's generic
61 | implementations. This is intentional, to enable a "make it work, then make it fast"
62 | workflow. When you're ready, you can disable the slow fallback methods:
63 |
64 | ```julia
65 | julia> CuArrays.allowscalar(false)
66 | julia> xs[5]
67 | ERROR: getindex is disabled
68 | ```
69 |
--------------------------------------------------------------------------------
/src/matmul.jl:
--------------------------------------------------------------------------------
1 | using LinearAlgebra
2 |
3 |
4 | function generic_matmatmul!(C::AbstractVecOrMat{R}, A::AbstractVecOrMat{T}, B::AbstractVecOrMat{S}) where {T,S,R}
5 |     if size(A,2) != size(B,1)
6 |         throw(DimensionMismatch("matrix A has dimensions $(size(A)), matrix B has dimensions $(size(B))"))
7 |     end
8 |     if size(C,1) != size(A,1) || size(C,2) != size(B,2)
9 |         throw(DimensionMismatch("result C has dimensions $(size(C)), needs $((size(A,1),size(B,2)))"))
10 |     end
11 |     if isempty(A) || isempty(B)
12 |         return fill!(C, zero(R))
13 |     end
14 |
15 |     function kernel(C, A, B)
16 |         i = (blockIdx().x-1) * blockDim().x + threadIdx().x
17 |         j = (blockIdx().y-1) * blockDim().y + threadIdx().y
18 |
19 |         if i <= size(A,1) && j <= size(B,2)
20 |             z2 = zero(A[i, 1]*B[1, j] + A[i, 1]*B[1, j])
21 |             Ctmp = convert(promote_type(R, typeof(z2)), z2)
22 |             for k in 1:size(A,2)
23 |                 Ctmp += A[i, k]*B[k, j]
24 |             end
25 |             C[i,j] = Ctmp
26 |         end
27 |
28 |         return
29 |     end
30 |
31 |     max_threads = 256
32 |     threads_x = min(max_threads, size(C,1))
33 |     threads_y = min(max_threads ÷ threads_x, size(C,2))
34 |     threads = (threads_x, threads_y)
35 |     blocks = ceil.(Int, (size(C,1), size(C,2)) ./ threads)
36 |
37 |     @cuda threads=threads blocks=blocks kernel(C, A, B)
38 |
39 |     C
40 | end
41 |
42 | LinearAlgebra.mul!(C::CuVecOrMat, A::CuVecOrMat, B::CuVecOrMat) = generic_matmatmul!(C, A, B)
43 | LinearAlgebra.mul!(C::CuVecOrMat, A::CuVecOrMat, B::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B)
44 | LinearAlgebra.mul!(C::CuVecOrMat, A::CuVecOrMat, B::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B)
45 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}, B::CuVecOrMat) = generic_matmatmul!(C, A, B)
46 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}, B::CuVecOrMat) = generic_matmatmul!(C, A, B)
47 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat},
B::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 48 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}, B::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 49 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}, B::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 50 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}, B::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 51 | 52 | 53 | function generic_rmul!(X::CuArray, s::Number) 54 | function kernel(X, s) 55 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 56 | @inbounds X[i] *= s 57 | return 58 | end 59 | @cuda blocks=length(X) kernel(X, s) 60 | X 61 | end 62 | 63 | LinearAlgebra.rmul!(A::CuArray, b::Number) = generic_rmul!(A, b) 64 | 65 | 66 | function generic_lmul!(s::Number, X::CuArray) 67 | function kernel(s, X) 68 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 69 | @inbounds X[i] = s*X[i] 70 | return 71 | end 72 | @cuda blocks=length(X) kernel(s, X) 73 | X 74 | end 75 | 76 | LinearAlgebra.lmul!(a::Number, B::CuArray) = generic_lmul!(a, B) 77 | -------------------------------------------------------------------------------- /src/CuArrays.jl: -------------------------------------------------------------------------------- 1 | module CuArrays 2 | 3 | using CUDAdrv, CUDAnative 4 | 5 | using GPUArrays 6 | 7 | export CuArray, CuVector, CuMatrix, CuVecOrMat, cu 8 | 9 | import LinearAlgebra 10 | 11 | using Adapt 12 | 13 | using Requires 14 | 15 | const ext = joinpath(dirname(@__DIR__), "deps", "ext.jl") 16 | isfile(ext) || error("CuArrays.jl has not been built, please run Pkg.build(\"CuArrays\").") 17 | include(ext) 18 | if !configured 19 | # default (non-functional) values for critical variables, 20 | # making it possible to _load_ the package at all times. 21 | const libcublas = nothing 22 | const libcusparse = nothing 23 | const libcusolver = nothing 24 | const libcufft = nothing 25 | const libcurand = nothing 26 | const libcudnn = nothing 27 | end 28 | 29 | include("memory.jl") 30 | include("array.jl") 31 | include("subarray.jl") 32 | include("utils.jl") 33 | include("indexing.jl") 34 | include("broadcast.jl") 35 | include("matmul.jl") 36 | include("mapreduce.jl") 37 | include("accumulate.jl") 38 | 39 | include("gpuarray_interface.jl") 40 | 41 | # many libraries need to be initialized per-device (per-context, really, but we assume users 42 | # of CuArrays and/or CUDAnative only use a single context), so keep track of the active one. 43 | const active_context = Ref{CuContext}() 44 | 45 | include("blas/CUBLAS.jl") 46 | include("sparse/CUSPARSE.jl") 47 | include("solver/CUSOLVER.jl") 48 | include("fft/CUFFT.jl") 49 | include("rand/CURAND.jl") 50 | libcudnn !== nothing && include("dnn/CUDNN.jl") 51 | 52 | include("nnlib.jl") 53 | 54 | include("deprecated.jl") 55 | 56 | function __init__() 57 | if !configured 58 | @warn("CuArrays.jl has not been successfully built, and will not work properly.") 59 | @warn("Please run Pkg.build(\"CuArrays\") and restart Julia.") 60 | return 61 | end 62 | 63 | function check_library(name, path) 64 | path === nothing && return 65 | if !ispath(path) 66 | error("$name library has changed. 
Please run Pkg.build(\"CuArrays\") and restart Julia.")
67 |         end
68 |     end
69 |     check_library("CUBLAS", libcublas)
70 |     check_library("CUSPARSE", libcusparse)
71 |     check_library("CUSOLVER", libcusolver)
72 |     check_library("CUFFT", libcufft)
73 |     check_library("CURAND", libcurand)
74 |     check_library("CUDNN", libcudnn)
75 |
76 |     # package integrations
77 |     @require ForwardDiff="f6369f11-7733-5829-9624-2563aa707210" include("forwarddiff.jl")
78 |
79 |     # update the active context when we switch devices
80 |     callback = (::CuDevice, ctx::CuContext) -> begin
81 |         active_context[] = ctx
82 |
83 |         # wipe the active handles
84 |         CUBLAS._handle[] = C_NULL
85 |         CUBLAS._xt_handle[] = C_NULL
86 |         CUSOLVER._dense_handle[] = C_NULL
87 |         CUSOLVER._sparse_handle[] = C_NULL
88 |         CUSPARSE._handle[] = C_NULL
89 |         CURAND._generator[] = nothing
90 |         isdefined(CuArrays, :CUDNN) && (CUDNN._handle[] = C_NULL)
91 |     end
92 |     push!(CUDAnative.device!_listeners, callback)
93 |
94 |     # a device might be active already
95 |     existing_ctx = CUDAdrv.CuCurrentContext()
96 |     if existing_ctx !== nothing
97 |         active_context[] = existing_ctx
98 |     end
99 |
100 |     __init_memory__()
101 |     __init_pool__()
102 | end
103 |
104 | end # module
105 |
--------------------------------------------------------------------------------
/src/fft/libcufft.jl:
--------------------------------------------------------------------------------
1 | # low-level wrappers of the CUFFT library
2 |
3 | import CUDAdrv: CuPtr, PtrOrCuPtr, CuStream_t
4 |
5 | cufftGetVersion() = ccall((:cufftGetVersion,libcufft), Cint, ())
6 |
7 | function cufftGetProperty(property::CUDAapi.libraryPropertyType)
8 |     value_ref = Ref{Cint}()
9 |     @check ccall((:cufftGetProperty, libcufft), cufftStatus_t,
10 |                  (Cint, Ptr{Cint}),
11 |                  property, value_ref)
12 |     value_ref[]
13 | end
14 |
15 | cufftDestroy(plan) = ccall((:cufftDestroy,libcufft), Nothing, (cufftHandle_t,), plan)
16 |
17 | function cufftPlan1d(plan, nx, type, batch)
18 |     @check ccall((:cufftPlan1d,libcufft),cufftStatus_t,
19 |                  (Ptr{cufftHandle_t}, Cint, cufftType, Cint),
20 |                  plan, nx, type, batch)
21 | end
22 |
23 | function cufftPlan2d(plan, nx, ny, type)
24 |     @check ccall((:cufftPlan2d,libcufft),cufftStatus_t,
25 |                  (Ptr{cufftHandle_t}, Cint, Cint, cufftType),
26 |                  plan, nx, ny, type)
27 | end
28 |
29 | function cufftPlan3d(plan, nx, ny, nz, type)
30 |     @check ccall((:cufftPlan3d,libcufft),cufftStatus_t,
31 |                  (Ptr{cufftHandle_t}, Cint, Cint, Cint, cufftType),
32 |                  plan, nx, ny, nz, type)
33 | end
34 |
35 | function cufftPlanMany(plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch)
36 |     @check ccall((:cufftPlanMany,libcufft),cufftStatus_t,
37 |                  (Ptr{cufftHandle_t}, Cint, Ptr{Cint},
38 |                   Ptr{Cint}, Cint, Cint,
39 |                   Ptr{Cint}, Cint, Cint,
40 |                   cufftType, Cint),
41 |                  plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch)
42 | end
43 |
44 | function cufftExecC2C(plan, idata, odata, direction)
45 |     @check ccall((:cufftExecC2C,libcufft), cufftStatus_t,
46 |                  (cufftHandle_t, CuPtr{cufftComplex}, CuPtr{cufftComplex}, Cint),
47 |                  plan, idata, odata, direction)
48 | end
49 |
50 | function cufftExecC2R(plan, idata, odata)
51 |     @check ccall((:cufftExecC2R,libcufft), cufftStatus_t,
52 |                  (cufftHandle_t, CuPtr{cufftComplex}, CuPtr{cufftReal}),
53 |                  plan, idata, odata)
54 | end
55 |
56 | function cufftExecR2C(plan, idata, odata)
57 |     @check ccall((:cufftExecR2C,libcufft), cufftStatus_t,
58 |                  (cufftHandle_t, CuPtr{cufftReal}, CuPtr{cufftComplex}),
59 |                  plan, idata, odata)
60 | end
61 |
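# The exec wrappers above and below pair a plan handle with device buffers. As
# a hedged sketch of driving them by hand (normal use goes through the
# AbstractFFTs-based interface in fft.jl; the buffer length is illustrative):
#
#   handle_ref = Ref{cufftHandle_t}()
#   cufftPlan1d(handle_ref, 1024, CUFFT_C2C, 1)      # 1-D single-precision plan, batch 1
#   x = CuArray(rand(ComplexF32, 1024))
#   y = similar(x)
#   cufftExecC2C(handle_ref[], x, y, CUFFT_FORWARD)  # forward transform into y
#   cufftDestroy(handle_ref[])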
62 | function cufftExecZ2Z(plan, idata, odata, direction)
63 |     @check ccall((:cufftExecZ2Z,libcufft), cufftStatus_t,
64 |                  (cufftHandle_t, CuPtr{cufftDoubleComplex}, CuPtr{cufftDoubleComplex},
65 |                   Cint),
66 |                  plan, idata, odata, direction)
67 | end
68 |
69 | function cufftExecZ2D(plan, idata, odata)
70 |     @check ccall((:cufftExecZ2D,libcufft), cufftStatus_t,
71 |                  (cufftHandle_t, CuPtr{cufftDoubleComplex}, CuPtr{cufftDoubleReal}),
72 |                  plan, idata, odata)
73 | end
74 |
75 | function cufftExecD2Z(plan, idata, odata)
76 |     @check ccall((:cufftExecD2Z,libcufft), cufftStatus_t,
77 |                  (cufftHandle_t, CuPtr{cufftDoubleReal}, CuPtr{cufftDoubleComplex}),
78 |                  plan, idata, odata)
79 | end
80 |
81 | function cufftSetStream(plan, stream)
82 |     @check ccall((:cufftSetStream,libcufft), cufftStatus_t,
83 |                  (cufftHandle_t, CuStream_t),
84 |                  plan, stream)
85 | end
86 |
--------------------------------------------------------------------------------
/test/dnn.jl:
--------------------------------------------------------------------------------
1 | @testset "CUDNN" begin
2 |
3 | if !isdefined(CuArrays, :CUDNN)
4 | @warn "Not testing CUDNN"
5 | else
6 | using CuArrays.CUDNN
7 | @info "Testing CUDNN $(CUDNN.version())"
8 |
9 | @testset "NNlib" begin
10 |   using NNlib
11 |   using NNlib: ∇conv_data, ∇conv_filter,
12 |                maxpool, meanpool, ∇maxpool, ∇meanpool,
13 |                softmax, ∇softmax, logsoftmax, ∇logsoftmax
14 |   a, b, c = rand(Float64, 10, 10, 3, 1), rand(Float64, 2, 2, 3, 4), rand(Float64, 9, 9, 4, 1)
15 |   da, db, dc = CuArray(a), CuArray(b), CuArray(c)
16 |   cdims = DenseConvDims(a, b)
17 |   @test NNlib.conv(a, b, cdims) ≈ collect(NNlib.conv(da, db, cdims))
18 |   @test ∇conv_data(c, b, cdims) ≈ collect(∇conv_data(dc, db, cdims))
19 |   @test ∇conv_filter(a, c, cdims) ≈ collect(∇conv_filter(da, dc, cdims))
20 |
21 |   # Test for agreement between CPU NNlib and CuDNN versions, across a variety of kwargs
22 |   for num_spatial_dims in (2, 3)
23 |     # Initialize data we'll run our tests over
24 |     C_in = 3
25 |     C_out = 4
26 |     batch_size = 1
27 |     x = rand(Float64, repeat([8], num_spatial_dims)..., C_in, batch_size)
28 |     w = rand(Float64, repeat([2], num_spatial_dims)..., C_in, C_out)
29 |     b = rand(Float64, repeat([1], num_spatial_dims)..., C_in, C_out)
30 |     options = (Dict(), Dict(:dilation => 2), Dict(:flipkernel => true), Dict(:stride => 2),)
31 |     algos = (1, 0, 1, 1,)
32 |
33 |     for (opts, algo) in zip(options, algos)
34 |       cdims = DenseConvDims(x, w; opts...)
35 |       y = NNlib.conv(x, w, cdims)
36 |
37 |       # Test that basic convolution is equivalent across GPU/CPU
38 |       @test testf((x, w) -> NNlib.conv(x, w, cdims), x, w)
39 |       @test testf((y, w) -> ∇conv_data(y, w, cdims), y, w)
40 |       @test testf((x, y) -> ∇conv_filter(x, y, cdims), x, y)
41 |       # Test that we can use an alternative algorithm without dying
42 |       @test_nowarn NNlib.conv!(cu(y), cu(x), cu(w), cdims; algo=algo)
43 |       @test_nowarn NNlib.∇conv_data!(cu(x), cu(y), cu(w), cdims; algo=algo)
44 |       @test_nowarn NNlib.∇conv_filter!(cu(w), cu(x), cu(y), cdims; algo=algo)
45 |     end
46 |
47 |     # Test that pooling is equivalent across GPU/CPU
48 |     pdims = PoolDims(x, 2)
49 |     y = maxpool(x, pdims)
50 |     dy = ones(size(y))
51 |     @test testf(x -> maxpool(x, pdims), x)
52 |     @test testf((dy, y, x) -> ∇maxpool(dy, y, x, pdims), dy, y, x)
53 |     @test testf(x -> meanpool(x, pdims), x)
54 |     @test testf((dy, y, x) -> ∇meanpool(dy, y, x, pdims), dy, y, x)
55 |
56 |     # CPU implementation of ∇conv_bias!
57 | db = zeros(Float64, 1, 1, 3, 1) 58 | function CuArrays.CUDNN.∇conv_bias!(db, y) 59 | db .= sum(y, dims=(1:(ndims(y)-2))) 60 | return db 61 | end 62 | #@test testf(CuArrays.CUDNN.∇conv_bias!, db, y) 63 | end 64 | 65 | for dims in [(5,5), (5,)] 66 | @test testf(softmax, rand(Float64, dims)) 67 | @test testf(∇softmax, rand(Float64, dims), rand(Float64, dims)) 68 | @test testf(logsoftmax, rand(Float64, dims)) 69 | @test testf(∇logsoftmax, rand(Float64, dims), rand(Float64, dims)) 70 | end 71 | end 72 | 73 | @testset "Activations and Other Ops" begin 74 | @test testf(CuArrays.CUDNN.cudnnAddTensor, cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1))) 75 | @test testf(CuArrays.CUDNN.cudnnActivationForward, cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1))) 76 | @test testf(CuArrays.CUDNN.cudnnActivationBackward, cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1))) 77 | end 78 | 79 | end 80 | 81 | end 82 | -------------------------------------------------------------------------------- /src/mapreduce.jl: -------------------------------------------------------------------------------- 1 | using CuArrays: @cuindex, cudims 2 | 3 | function mapreducedim_kernel_serial(f, op, R, A, range) 4 | I = @cuindex R 5 | newrange = map((r, i) -> r === nothing ? i : r, range, I) 6 | for I′ in CartesianIndices(newrange) 7 | @inbounds R[I...] = op(R[I...], f(A[I′])) 8 | end 9 | return 10 | end 11 | 12 | @inline function reduce_block(arr::CuDeviceArray, op) 13 | sync_threads() 14 | len = blockDim().x 15 | while len != 1 16 | sync_threads() 17 | skip = (len + 1) >> 1 18 | reduce_to = threadIdx().x - skip 19 | if 0 < reduce_to <= (len >> 1) 20 | arr[reduce_to] = op(arr[reduce_to], arr[threadIdx().x]) 21 | end 22 | len = skip 23 | end 24 | sync_threads() 25 | end 26 | 27 | function mapreducedim_kernel_parallel(f, op, R::CuDeviceArray{T}, A::CuDeviceArray{T}, 28 | CIS, Rlength, Slength) where {T} 29 | for Ri_base in 0:(gridDim().x * blockDim().y):(Rlength-1) 30 | Ri = Ri_base + (blockIdx().x - 1) * blockDim().y + threadIdx().y 31 | Ri > Rlength && return 32 | RI = Tuple(CartesianIndices(R)[Ri]) 33 | S = @cuStaticSharedMem(T, 512) 34 | Si_folded_base = (threadIdx().y - 1) * blockDim().x 35 | Si_folded = Si_folded_base + threadIdx().x 36 | # serial reduction of A into S by Slength ÷ xthreads 37 | for Si_base in 0:blockDim().x:(Slength-1) 38 | Si = Si_base + threadIdx().x 39 | Si > Slength && break 40 | SI = Tuple(CIS[Si]) 41 | AI = ifelse.(size(R) .== 1, SI, RI) 42 | if Si_base == 0 43 | S[Si_folded] = f(A[AI...]) 44 | else 45 | S[Si_folded] = op(S[Si_folded], f(A[AI...])) 46 | end 47 | end 48 | # block-parallel reduction of S to S[1] by xthreads 49 | reduce_block(view(S, (Si_folded_base + 1):512), op) 50 | # reduce S[1] into R 51 | threadIdx().x == 1 && (R[Ri] = op(R[Ri], S[Si_folded])) 52 | end 53 | return 54 | end 55 | 56 | function Base._mapreducedim!(f, op, R::CuArray{T}, A::CuArray{T}) where {T} 57 | # the kernel as generated from `f` and `op` can require lots of registers (eg. #160), 58 | # so we need to be careful about how many threads we launch not to run out of them. 
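    # Strategy: `R` holds one slot per output element, and `A` collapses `Slength`
    # inputs into each of them. When there is enough work per output element we
    # use the block-parallel kernel above (a shared-memory tree reduction);
    # otherwise the launch below falls back to the serial kernel, which needs no
    # shared memory and trivially fits any device.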
59 | Rlength = length(R) 60 | Ssize = ifelse.(size(R) .== 1, size(A), 1) 61 | Slength = prod(Ssize) 62 | CIS = CartesianIndices(Ssize) 63 | 64 | parallel_args = (f, op, R, A, CIS, Rlength, Slength) 65 | GC.@preserve parallel_args begin 66 | parallel_kargs = cudaconvert.(parallel_args) 67 | parallel_tt = Tuple{Core.Typeof.(parallel_kargs)...} 68 | parallel_kernel = cufunction(mapreducedim_kernel_parallel, parallel_tt) 69 | 70 | # we are limited in how many threads we can launch... 71 | ## by the kernel 72 | kernel_threads = CUDAnative.maxthreads(parallel_kernel) 73 | ## by the device 74 | dev = CUDAdrv.device() 75 | block_threads = (x=attribute(dev, CUDAdrv.MAX_BLOCK_DIM_X), 76 | y=attribute(dev, CUDAdrv.MAX_BLOCK_DIM_Y), 77 | total=attribute(dev, CUDAdrv.MAX_THREADS_PER_BLOCK)) 78 | 79 | # figure out a legal launch configuration 80 | y_thr = min(nextpow(2, Rlength ÷ 512 + 1), 512, block_threads.y, kernel_threads) 81 | x_thr = min(512 ÷ y_thr, Slength, block_threads.x, 82 | ceil(Int, block_threads.total/y_thr), 83 | ceil(Int, kernel_threads/y_thr)) 84 | 85 | if x_thr >= 8 86 | blk, thr = (Rlength - 1) ÷ y_thr + 1, (x_thr, y_thr, 1) 87 | parallel_kernel(parallel_kargs...; threads=thr, blocks=blk) 88 | else 89 | # not enough work, fall back to serial reduction 90 | range = ifelse.(length.(axes(R)) .== 1, axes(A), nothing) 91 | blk, thr = cudims(R) 92 | @cuda(blocks=blk, threads=thr, mapreducedim_kernel_serial(f, op, R, A, range)) 93 | end 94 | end 95 | 96 | return R 97 | end 98 | -------------------------------------------------------------------------------- /src/dnn/nnlib.jl: -------------------------------------------------------------------------------- 1 | import NNlib: conv!, ∇conv_filter!, ∇conv_data!, 2 | maxpool!, meanpool!, ∇maxpool!, ∇meanpool!, 3 | softmax, softmax!, ∇softmax!, logsoftmax, logsoftmax!, ∇logsoftmax 4 | import ..CuArrays: CuVecOrMat, CuVector 5 | using CUDAnative 6 | 7 | 8 | # Softmax 9 | 10 | const CUDNNFloat = Union{Float16,Float32,Float64} 11 | 12 | reshape4D(x::AbstractVector) = reshape(x, 1, 1, length(x), 1) 13 | reshape4D(x::AbstractMatrix) = reshape(x, 1, 1, size(x)...) 
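# cuDNN kernels operate on 4-D tensors, so the 1-D and 2-D inputs accepted by
# the NNlib API are first padded with singleton dimensions. Sizes only, for
# illustration:
#
#   size(reshape4D(cu(rand(Float32, 10))))     == (1, 1, 10, 1)
#   size(reshape4D(cu(rand(Float32, 10, 32)))) == (1, 1, 10, 32)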
14 | 15 | function softmax!(out::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 16 | cudnnSoftmaxForward(reshape4D(xs), reshape4D(out)) 17 | return out 18 | end 19 | 20 | function ∇softmax!(out::CuVecOrMat{T}, Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 21 | cudnnSoftmaxBackward(reshape4D(softmax(xs)), reshape4D(Δ), reshape4D(out)) 22 | return out 23 | end 24 | 25 | function logsoftmax!(out::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 26 | cudnnSoftmaxForward(reshape4D(xs), reshape4D(out), algorithm=CUDNN_SOFTMAX_LOG) 27 | return out 28 | end 29 | 30 | function ∇logsoftmax!(out::CuVecOrMat{T}, Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 31 | cudnnSoftmaxBackward(reshape4D(logsoftmax(xs)), reshape4D(Δ), reshape4D(out); 32 | algorithm=CUDNN_SOFTMAX_LOG) 33 | return out 34 | end 35 | 36 | ∇logsoftmax(Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat = 37 | ∇logsoftmax!(similar(xs), Δ, xs) 38 | 39 | 40 | # Convolution 41 | 42 | function conv!(y::CuArray{T}, x::CuArray{T}, w::CuArray{T}, cdims::DenseConvDims; 43 | alpha=1, algo=0) where T<:CUDNNFloat 44 | if version() < v"6" 45 | all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6") 46 | end 47 | 48 | workspace_size = cudnnGetConvolutionForwardWorkspaceSize(y, x, w, cdims, algo=algo) 49 | 50 | CuVector{UInt8}(undef, workspace_size) do workspace 51 | cudnnConvolutionForward(y, x, w, cdims, alpha=alpha, algo=algo, 52 | workspace=workspace, workspace_size=workspace_size) 53 | end 54 | end 55 | 56 | function ∇conv_filter!(dw::CuArray{T}, x::CuArray{T}, dy::CuArray{T}, 57 | cdims::DenseConvDims; alpha=1, algo=0) where T<:CUDNNFloat 58 | if version() < v"6" 59 | all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6") 60 | end 61 | 62 | workspace_size = cudnnGetConvolutionBackwardFilterWorkspaceSize(dw, x, dy, cdims, algo=algo) 63 | 64 | CuVector{UInt8}(undef, workspace_size) do workspace 65 | cudnnConvolutionBackwardFilter(dw, x, dy, cdims, alpha=alpha, algo=algo, 66 | workspace=workspace, workspace_size=workspace_size) 67 | end 68 | end 69 | 70 | function ∇conv_data!(dx::CuArray{T}, dy::CuArray{T}, w::CuArray{T}, 71 | cdims::DenseConvDims; alpha=1, algo=0) where T<:CUDNNFloat 72 | if version() < v"6" 73 | all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6") 74 | end 75 | 76 | workspace_size = 77 | cudnnGetConvolutionBackwardDataWorkspaceSize(dx, w, dy, cdims; algo=algo) 78 | CuVector{UInt8}(undef, workspace_size) do workspace 79 | cudnnConvolutionBackwardData(dx, w, dy, cdims, alpha=alpha, algo=algo, 80 | workspace=workspace, workspace_size=workspace_size) 81 | end 82 | end 83 | 84 | ∇conv_bias!(db::CuArray{T}, dy::CuArray{T}; alpha=1, beta=0) where T<:CUDNNFloat = 85 | cudnnConvolutionBackwardBias(db, dy, alpha=alpha, beta=beta) 86 | 87 | maxpool!(y::CuArray{T}, x::CuArray{T}, pdims::PoolDims) where T<:CUDNNFloat = 88 | cudnnPoolingForward(y, x, pdims; mode=0) 89 | 90 | ∇maxpool!(dx::CuArray{T}, dy::CuArray{T}, y::CuArray{T}, x::CuArray{T}, 91 | pdims::PoolDims) where T<:CUDNNFloat = 92 | cudnnPoolingBackward(dx, dy, x, y, pdims, mode=0) 93 | 94 | meanpool!(y::CuArray{T}, x::CuArray{T}, pdims::PoolDims) where T<:CUDNNFloat = 95 | cudnnPoolingForward(y, x, pdims, mode=1) 96 | 97 | ∇meanpool!(dx::CuArray{T}, dy::CuArray{T}, y::CuArray{T}, x::CuArray{T}, 98 | pdims::PoolDims) where T<:CUDNNFloat = 99 | cudnnPoolingBackward(dx, dy, x, y, pdims, mode=1) 100 | 
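# End-to-end, the methods in this file let generic NNlib code run on the GPU
# unchanged. A hedged usage sketch (assumes cuDNN is available; the shapes are
# illustrative WHCN arrays):
#
#   using NNlib
#   x = cu(rand(Float32, 28, 28, 3, 1))   # input: 28×28, 3 channels, batch of 1
#   w = cu(rand(Float32, 5, 5, 3, 8))     # 5×5 kernels, 3 -> 8 channels
#   cdims = DenseConvDims(x, w)
#   y = NNlib.conv(x, w, cdims)           # dispatches to conv! above
#   p = maxpool(y, PoolDims(y, 2))        # dispatches to maxpool! above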
-------------------------------------------------------------------------------- /src/blas/libcublas_types.jl: -------------------------------------------------------------------------------- 1 | # libcublas_types.jl 2 | # 3 | # Initially generated with wrap_c from Clang.jl. Modified to remove anonymous 4 | # enums and add cublasContext. 5 | # 6 | # Author: Nick Henderson 7 | # Created: 2014-08-27 8 | # License: MIT 9 | # 10 | 11 | # begin enum cublasStatus_t 12 | const cublasStatus_t = UInt32 13 | const CUBLAS_STATUS_SUCCESS = 0 14 | const CUBLAS_STATUS_NOT_INITIALIZED = 1 15 | const CUBLAS_STATUS_ALLOC_FAILED = 3 16 | const CUBLAS_STATUS_INVALID_VALUE = 7 17 | const CUBLAS_STATUS_ARCH_MISMATCH = 8 18 | const CUBLAS_STATUS_MAPPING_ERROR = 11 19 | const CUBLAS_STATUS_EXECUTION_FAILED = 13 20 | const CUBLAS_STATUS_INTERNAL_ERROR = 14 21 | const CUBLAS_STATUS_NOT_SUPPORTED = 15 22 | const CUBLAS_STATUS_LICENSE_ERROR = 16 23 | # end enum cublasStatus_t 24 | # begin enum cublasFillMode_t 25 | const cublasFillMode_t = UInt32 26 | const CUBLAS_FILL_MODE_LOWER = 0 27 | const CUBLAS_FILL_MODE_UPPER = 1 28 | # end enum cublasFillMode_t 29 | # begin enum cublasDiagType_t 30 | const cublasDiagType_t = UInt32 31 | const CUBLAS_DIAG_NON_UNIT = 0 32 | const CUBLAS_DIAG_UNIT = 1 33 | # end enum cublasDiagType_t 34 | # begin enum cublasSideMode_t 35 | const cublasSideMode_t = UInt32 36 | const CUBLAS_SIDE_LEFT = 0 37 | const CUBLAS_SIDE_RIGHT = 1 38 | # end enum cublasSideMode_t 39 | # begin enum cublasOperation_t 40 | const cublasOperation_t = UInt32 41 | const CUBLAS_OP_N = 0 42 | const CUBLAS_OP_T = 1 43 | const CUBLAS_OP_C = 2 44 | # end enum cublasOperation_t 45 | # begin enum cublasPointerMode_t 46 | const cublasPointerMode_t = UInt32 47 | const CUBLAS_POINTER_MODE_HOST = 0 48 | const CUBLAS_POINTER_MODE_DEVICE = 1 49 | # end enum cublasPointerMode_t 50 | # begin enum cublasAtomicsMode_t 51 | const cublasAtomicsMode_t = UInt32 52 | const CUBLAS_ATOMICS_NOT_ALLOWED = 0 53 | const CUBLAS_ATOMICS_ALLOWED = 1 54 | # end enum cublasAtomicsMode_t 55 | const cublasContext = Nothing 56 | const cublasHandle_t = Ptr{cublasContext} 57 | const cublasXtHandle_t = Ptr{cublasContext} 58 | # complex numbers in cuda 59 | const cuComplex = Complex{Float32} 60 | const cuDoubleComplex = Complex{Float64} 61 | # complex types from Base/linalg.jl 62 | const CublasFloat = Union{Float64,Float32,ComplexF64,ComplexF32} 63 | const CublasReal = Union{Float64,Float32} 64 | const CublasComplex = Union{ComplexF64,ComplexF32} 65 | # FP16 (cuda_fp16.h) in cuda 66 | const __half = Float16 67 | struct __half2 68 | x1::__half 69 | x2::__half 70 | end 71 | 72 | const cublasXtOpType_t = UInt32 73 | const CUBLASXT_FLOAT = 0 74 | const CUBLASXT_DOUBLE = 1 75 | const CUBLASXT_COMPLEX = 2 76 | const CUBLASXT_DOUBLECOMPLEX = 3 77 | 78 | const cublasXtBlasOp_t = UInt32 79 | const CUBLASXT_GEMM = 0 80 | const CUBLASXT_SYRK = 1 81 | const CUBLASXT_HERK = 2 82 | const CUBLASXT_SYMM= 3 83 | const CUBLASXT_HEMM= 4 84 | const CUBLASXT_TRSM= 5 85 | const CUBLASXT_SYR2K= 6 86 | const CUBLASXT_HER2K= 7 87 | const CUBLASXT_SPMM= 8 88 | const CUBLASXT_SYRKX= 9 89 | const CUBLASXT_HERKX= 10 90 | 91 | const cublasXtPinningMemMode_t = UInt32 92 | const CUBLASXT_PINNING_DISABLED = 0 93 | const CUBLASXT_PINNING_ENABLED = 1 94 | 95 | if CUDAdrv.version() >= v"0.7.5" 96 | # specify which GEMM algorithm to use in cublasGemmEx() (CUDA 7.5+) 97 | const cublasGemmAlgo_t = Int32 98 | const CUBLAS_GEMM_DFALT = -1 99 | const CUBLAS_GEMM_ALGO0 = 0 100 | const 
CUBLAS_GEMM_ALGO1 = 1
101 | const CUBLAS_GEMM_ALGO2 = 2
102 | const CUBLAS_GEMM_ALGO3 = 3
103 | const CUBLAS_GEMM_ALGO4 = 4
104 | const CUBLAS_GEMM_ALGO5 = 5
105 | const CUBLAS_GEMM_ALGO6 = 6
106 | const CUBLAS_GEMM_ALGO7 = 7
107 | # specify which DataType to use with cublasgemmEx() and cublasGemmEx() (CUDA 7.5+) functions
108 | const cudaDataType_t = UInt32
109 | const CUDA_R_16F = UInt32(2)
110 | const CUDA_C_16F = UInt32(6)
111 | const CUDA_R_32F = UInt32(0)
112 | const CUDA_C_32F = UInt32(4)
113 | const CUDA_R_64F = UInt32(1)
114 | const CUDA_C_64F = UInt32(5)
115 | const CUDA_R_8I = UInt32(3)
116 | const CUDA_C_8I = UInt32(7)
117 | const CUDA_R_8U = UInt32(8)
118 | const CUDA_C_8U = UInt32(9)
119 | const CUDA_R_32I = UInt32(10)
120 | const CUDA_C_32I = UInt32(11)
121 | const CUDA_R_32U = UInt32(12)
122 | const CUDA_C_32U = UInt32(13)
123 | end
124 |
125 | @enum CUBLASMathMode::Cint begin
126 |     CUBLAS_DEFAULT_MATH = 0
127 |     CUBLAS_TENSOR_OP_MATH = 1
128 | end
129 |
--------------------------------------------------------------------------------
/src/sparse/highlevel.jl:
--------------------------------------------------------------------------------
1 | import LinearAlgebra: BlasFloat, mul!
2 |
3 | Base.:(\)(A::Union{UpperTriangular{T, S},LowerTriangular{T, S}}, B::CuMatrix{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sm('N',A,B,'O')
4 | Base.:(\)(transA::Transpose{T, UpperTriangular{T, S}}, B::CuMatrix{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sm('T',parent(transA),B,'O')
5 | Base.:(\)(transA::Transpose{T, LowerTriangular{T, S}}, B::CuMatrix{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sm('T',parent(transA),B,'O')
6 | Base.:(\)(adjA::Adjoint{T, UpperTriangular{T, S}},B::CuMatrix{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sm('C',parent(adjA),B,'O')
7 | Base.:(\)(adjA::Adjoint{T, LowerTriangular{T, S}},B::CuMatrix{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sm('C',parent(adjA),B,'O')
8 |
9 | mul!(C::CuVector{T},A::CuSparseMatrix,B::CuVector) where {T} = mv!('N',one(T),A,B,zero(T),C,'O')
10 | mul!(C::CuVector{T},transA::Transpose{<:Any,<:CuSparseMatrix},B::CuVector) where {T} = mv!('T',one(T),parent(transA),B,zero(T),C,'O')
11 | mul!(C::CuVector{T},adjA::Adjoint{<:Any,<:CuSparseMatrix},B::CuVector) where {T} = mv!('C',one(T),parent(adjA),B,zero(T),C,'O')
12 | mul!(C::CuVector{T},A::HermOrSym{T,<:CuSparseMatrix{T}},B::CuVector{T}) where T = mv!('N',one(T),A,B,zero(T),C,'O')
13 | mul!(C::CuVector{T},transA::Transpose{<:Any, <:HermOrSym{T,<:CuSparseMatrix{T}}},B::CuVector{T}) where {T} = mv!('T',one(T),parent(transA),B,zero(T),C,'O')
14 | mul!(C::CuVector{T},adjA::Adjoint{<:Any, <:HermOrSym{T,<:CuSparseMatrix{T}}},B::CuVector{T}) where {T} = mv!('C',one(T),parent(adjA),B,zero(T),C,'O')
15 |
16 | mul!(C::CuMatrix{T},A::CuSparseMatrix{T},B::CuMatrix{T}) where {T} = mm2!('N','N',one(T),A,B,zero(T),C,'O')
17 | mul!(C::CuMatrix{T},A::CuSparseMatrix{T},transB::Transpose{<:Any, CuMatrix{T}}) where {T} = mm2!('N','T',one(T),A,parent(transB),zero(T),C,'O')
18 | mul!(C::CuMatrix{T},transA::Transpose{<:Any, <:CuSparseMatrix{T}},B::CuMatrix{T}) where {T} = mm2!('T','N',one(T),parent(transA),B,zero(T),C,'O')
19 | mul!(C::CuMatrix{T},transA::Transpose{<:Any, <:CuSparseMatrix{T}},transB::Transpose{<:Any, CuMatrix{T}}) where {T} = mm2!('T','T',one(T),parent(transA),parent(transB),zero(T),C,'O')
20 | mul!(C::CuMatrix{T},adjA::Adjoint{<:Any, <:CuSparseMatrix{T}},B::CuMatrix{T}) where {T} = mm2!('C','N',one(T),parent(adjA),B,zero(T),C,'O')
21 |
22 | mul!(C::CuMatrix{T},A::HermOrSym{<:Number, <:CuSparseMatrix},B::CuMatrix) where {T} = mm!('N',one(T),A,B,zero(T),C,'O')
23 | mul!(C::CuMatrix{T},transA::Transpose{<:Any, <:HermOrSym{<:Number, <:CuSparseMatrix}},B::CuMatrix) where {T} = mm!('T',one(T),parent(transA),B,zero(T),C,'O')
24 | mul!(C::CuMatrix{T},adjA::Adjoint{<:Any, <:HermOrSym{<:Number, <:CuSparseMatrix}},B::CuMatrix) where {T} = mm!('C',one(T),parent(adjA),B,zero(T),C,'O')
25 |
26 | Base.:(\)(A::Union{UpperTriangular{T, S},LowerTriangular{T, S}}, B::CuVector{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sv2('N',A,B,'O')
27 | Base.:(\)(transA::Transpose{T, UpperTriangular{T, S}},B::CuVector{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sv2('T',parent(transA),B,'O')
28 | Base.:(\)(transA::Transpose{T, LowerTriangular{T, S}},B::CuVector{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sv2('T',parent(transA),B,'O')
29 | Base.:(\)(adjA::Adjoint{T, UpperTriangular{T, S}},B::CuVector{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sv2('C',parent(adjA),B,'O')
30 | Base.:(\)(adjA::Adjoint{T, LowerTriangular{T, S}},B::CuVector{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sv2('C',parent(adjA),B,'O')
31 | Base.:(\)(A::AbstractTriangular{T,CuSparseMatrixHYB{T}},B::CuVector{T}) where T = sv('N',A,B,'O')
32 | Base.:(\)(transA::Transpose{T, UpperTriangular{T, CuSparseMatrixHYB{T}}},B::CuVector{T}) where {T<:BlasFloat} = sv('T',parent(transA),B,'O')
33 | Base.:(\)(transA::Transpose{T, LowerTriangular{T, CuSparseMatrixHYB{T}}},B::CuVector{T}) where {T<:BlasFloat} = sv('T',parent(transA),B,'O')
34 | Base.:(\)(adjA::Adjoint{T, UpperTriangular{T, CuSparseMatrixHYB{T}}},B::CuVector{T}) where {T<:BlasFloat} = sv('C',parent(adjA),B,'O')
35 | Base.:(\)(adjA::Adjoint{T, LowerTriangular{T, CuSparseMatrixHYB{T}}},B::CuVector{T}) where {T<:BlasFloat} = sv('C',parent(adjA),B,'O')
36 |
37 | Base.:(+)(A::Union{CuSparseMatrixCSR,CuSparseMatrixCSC},B::Union{CuSparseMatrixCSR,CuSparseMatrixCSC}) = geam(A,B,'O','O','O')
38 | Base.:(-)(A::Union{CuSparseMatrixCSR,CuSparseMatrixCSC},B::Union{CuSparseMatrixCSR,CuSparseMatrixCSC}) = geam(A,-one(eltype(A)),B,'O','O','O')
--------------------------------------------------------------------------------
/src/solver/highlevel.jl:
--------------------------------------------------------------------------------
1 | # QR factorization
2 |
3 | struct CuQR{T,S<:AbstractMatrix} <: LinearAlgebra.Factorization{T}
4 |     factors::S
5 |     τ::CuVector{T}
6 |     CuQR{T,S}(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T,S<:AbstractMatrix} = new(factors, τ)
7 | end
8 |
9 | struct CuQRPackedQ{T,S<:AbstractMatrix} <: LinearAlgebra.AbstractQ{T}
10 |     factors::CuMatrix{T}
11 |     τ::CuVector{T}
12 |     CuQRPackedQ{T,S}(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T,S<:AbstractMatrix} = new(factors, τ)
13 | end
14 |
15 | CuQR(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T} = CuQR{T,typeof(factors)}(factors, τ)
16 | CuQRPackedQ(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T} = CuQRPackedQ{T,typeof(factors)}(factors, τ)
17 |
18 | LinearAlgebra.qr!(A::CuMatrix{T}) where T = CuQR(geqrf!(A::CuMatrix{T})...)
19 | Base.size(A::CuQR) = size(A.factors)
20 | Base.size(A::CuQRPackedQ, dim::Integer) = 0 < dim ? (dim <= 2 ?
size(A.factors, 1) : 1) : throw(BoundsError()) 21 | CuArrays.CuMatrix(A::CuQRPackedQ) = orgqr!(copy(A.factors), A.τ) 22 | CuArrays.CuArray(A::CuQRPackedQ) = convert(CuMatrix, A) 23 | Base.Matrix(A::CuQRPackedQ) = Matrix(CuMatrix(A)) 24 | 25 | function Base.getproperty(A::CuQR, d::Symbol) 26 | m, n = size(getfield(A, :factors)) 27 | if d == :R 28 | return triu!(A.factors[1:min(m, n), 1:n]) 29 | elseif d == :Q 30 | return CuQRPackedQ(A.factors, A.τ) 31 | else 32 | getfield(A, d) 33 | end 34 | end 35 | 36 | # iteration for destructuring into components 37 | Base.iterate(S::CuQR) = (S.Q, Val(:R)) 38 | Base.iterate(S::CuQR, ::Val{:R}) = (S.R, Val(:done)) 39 | Base.iterate(S::CuQR, ::Val{:done}) = nothing 40 | 41 | # Apply changes Q from the left 42 | LinearAlgebra.lmul!(A::CuQRPackedQ{T,S}, B::CuVecOrMat{T}) where {T<:Number, S<:CuMatrix} = 43 | ormqr!('L', 'N', A.factors, A.τ, B) 44 | LinearAlgebra.lmul!(adjA::Adjoint{T,<:CuQRPackedQ{T,S}}, B::CuVecOrMat{T}) where {T<:Real, S<:CuMatrix} = 45 | ormqr!('L', 'T', parent(adjA).factors, parent(adjA).τ, B) 46 | LinearAlgebra.lmul!(adjA::Adjoint{T,<:CuQRPackedQ{T,S}}, B::CuVecOrMat{T}) where {T<:Complex, S<:CuMatrix} = 47 | ormqr!('L', 'C', parent(adjA).factors, parent(adjA).τ, B) 48 | LinearAlgebra.lmul!(trA::Transpose{T,<:CuQRPackedQ{T,S}}, B::CuVecOrMat{T}) where {T<:Number, S<:CuMatrix} = 49 | ormqr!('L', 'T', parent(trA).factors, parent(trA).τ, B) 50 | 51 | function Base.getindex(A::CuQRPackedQ{T, S}, i::Integer, j::Integer) where {T, S} 52 | x = CuArray{T}(undef, size(A, 2)) .= 0 53 | x[j] = 1 54 | lmul!(A, x) 55 | return _getindex(x, i) 56 | end 57 | 58 | function Base.show(io::IO, F::CuQR) 59 | println(io, "$(typeof(F)) with factors Q and R:") 60 | show(io, F.Q) 61 | println(io) 62 | show(io, F.R) 63 | end 64 | 65 | # Singular Value Decomposition 66 | 67 | struct CuSVD{T,Tr,A<:AbstractMatrix{T}} <: LinearAlgebra.Factorization{T} 68 | U::CuMatrix{T} 69 | S::CuVector{Tr} 70 | V::A 71 | end 72 | 73 | # iteration for destructuring into components 74 | Base.iterate(S::CuSVD) = (S.U, Val(:S)) 75 | Base.iterate(S::CuSVD, ::Val{:S}) = (S.S, Val(:V)) 76 | Base.iterate(S::CuSVD, ::Val{:V}) = (S.V, Val(:done)) 77 | Base.iterate(S::CuSVD, ::Val{:done}) = nothing 78 | 79 | @inline function Base.getproperty(S::CuSVD, s::Symbol) 80 | if s === :Vt 81 | return getfield(S, :V)' 82 | else 83 | return getfield(S, s) 84 | end 85 | end 86 | 87 | @enum SVDAlgorithm QRAlgorithm JacobiAlgorithm 88 | function LinearAlgebra.svd!(A::CuMatrix{T}, method::SVDAlgorithm=JacobiAlgorithm; full::Bool=false) where T 89 | if method === QRAlgorithm 90 | U, s, Vt = gesvd!(full ? 'A' : 'S', full ? 'A' : 'S', A::CuMatrix{T}) 91 | return CuSVD(U, s, Vt') 92 | elseif method === JacobiAlgorithm 93 | return CuSVD(gesvdj!('V', Int(!full), A::CuMatrix{T})...) 
94 | end 95 | end 96 | # Once LinearAlgebra.svd(::AbstractMatrix) accepts kwargs this method can be deleted 97 | LinearAlgebra.svd(A::CuMatrix, method::SVDAlgorithm=JacobiAlgorithm; full=false) = svd!(copy(A), method, full=full) 98 | 99 | function LinearAlgebra.svdvals!(A::CuMatrix{T}, method::SVDAlgorithm=JacobiAlgorithm) where T 100 | if method === QRAlgorithm 101 | return gesvd!('N', 'N', A::CuMatrix{T})[2] 102 | elseif method === JacobiAlgorithm 103 | return gesvdj!('N', 1, A::CuMatrix{T})[2] 104 | end 105 | end 106 | # Once LinearAlgebra.svdvals(::AbstractMatrix) accepts kwargs this method can be deleted 107 | LinearAlgebra.svdvals(A::CuMatrix, method::SVDAlgorithm=JacobiAlgorithm) = svdvals!(copy(A), method) 108 | -------------------------------------------------------------------------------- /src/rand/highlevel.jl: -------------------------------------------------------------------------------- 1 | # high-level interface for CURAND 2 | # 3 | # the interface is split in two levels: 4 | # - functions that extend the Random standard library, and take an RNG as first argument, 5 | # will only ever dispatch to CURAND and as a result are limited in the types they support. 6 | # - functions that take an array will dispatch to either CURAND or GPUArrays 7 | # - `cu`-prefixed functions are provided for constructing GPU arrays from only an eltype 8 | 9 | 10 | ## seeding 11 | 12 | seed!(rng::RNG=generator()) = generate_seeds(rng) 13 | 14 | 15 | ## in-place 16 | 17 | # uniform 18 | Random.rand!(rng::RNG, A::CuArray{Float32}) = generate_uniform(rng, A) 19 | Random.rand!(rng::RNG, A::CuArray{Float64}) = generate_uniform_double(rng, A) 20 | 21 | # normal 22 | Random.randn!(rng::RNG, A::CuArray{Float32}; mean=0, stddev=1) = generate_normal(rng, A, mean, stddev) 23 | Random.randn!(rng::RNG, A::CuArray{Float64}; mean=0, stddev=1) = generate_normal_double(rng, A, mean, stddev) 24 | 25 | # log-normal 26 | rand_logn!(rng::RNG, A::CuArray{Float32}; mean=0, stddev=1) = generate_log_normal(rng, A, mean, stddev) 27 | rand_logn!(rng::RNG, A::CuArray{Float64}; mean=0, stddev=1) = generate_log_normal_double(rng, A, mean, stddev) 28 | 29 | # log-normal 30 | rand_poisson!(rng::RNG, A::CuArray{Cuint}; lambda=1) = generate_poisson(rng, A, lambda) 31 | 32 | 33 | ## out of place 34 | 35 | Random.rand(rng::RNG, ::Type{X}, dims::Dims) where {X} = rand!(rng, CuArray{X}(undef, dims)) 36 | Random.randn(rng::RNG, ::Type{X}, dims::Dims; kwargs...) where {X} = randn!(rng, CuArray{X}(undef, dims); kwargs...) 37 | rand_logn(rng::RNG, ::Type{X}, dims::Dims; kwargs...) where {X} = rand_logn!(rng, CuArray{X}(undef, dims); kwargs...) 38 | rand_poisson(rng::RNG, ::Type{X}, dims::Dims; kwargs...) where {X} = rand_poisson!(rng, CuArray{X}(undef, dims); kwargs...) 39 | 40 | # specify default types 41 | Random.rand(rng::RNG, dims::Integer...; kwargs...) = rand(rng, Float32, dims...; kwargs...) 42 | Random.randn(rng::RNG, dims::Integer...; kwargs...) = randn(rng, Float32, dims...; kwargs...) 43 | rand_logn(rng::RNG, dims::Integer...; kwargs...) = rand_logn(rng, Float32, dims...; kwargs...) 44 | rand_poisson(rng::RNG, dims::Integer...; kwargs...) = rand_poisson(rng, Cuint, dims...; kwargs...) 45 | 46 | # convenience 47 | Random.randn(rng::RNG, ::Type{X}, dim1::Integer, dims::Integer...; kwargs...) where {X} = 48 | randn(rng, X, Dims((dim1, dims...)); kwargs...) 49 | rand_logn(rng::RNG, ::Type{X}, dim1::Integer, dims::Integer...; kwargs...) where {X} = 50 | rand_logn(rng, X, Dims((dim1, dims...)); kwargs...) 
51 | rand_poisson(rng::RNG, ::Type{X}, dim1::Integer, dims::Integer...; kwargs...) where {X} =
52 |     rand_poisson(rng, X, Dims((dim1, dims...)); kwargs...)
53 |
54 |
55 | ## functions that dispatch to either CURAND or GPUArrays
56 |
57 | uniform_rng(::CuArray{<:Union{Float32,Float64}}) = generator()
58 | uniform_rng(A::CuArray) = GPUArrays.global_rng(A)
59 |
60 | normal_rng(::CuArray{<:Union{Float32,Float64}}) = generator()
61 | normal_rng(::CuArray{T}) where {T} =
62 |     error("CuArrays does not support generating normally distributed numbers of type $T")
63 |
64 | logn_rng(::CuArray{<:Union{Float32,Float64}}) = generator()
65 | logn_rng(::CuArray{T}) where {T} =
66 |     error("CuArrays does not support generating lognormally distributed numbers of type $T")
67 |
68 | poisson_rng(::CuArray{Cuint}) = generator()
69 | poisson_rng(::CuArray{T}) where {T} =
70 |     error("CuArrays does not support generating Poisson distributed numbers of type $T")
71 |
72 |
73 | Random.rand!(A::CuArray; kwargs...) = rand!(uniform_rng(A), A; kwargs...)
74 | Random.randn!(A::CuArray; kwargs...) = randn!(normal_rng(A), A; kwargs...)
75 | rand_logn!(A::CuArray; kwargs...) = rand_logn!(logn_rng(A), A; kwargs...)
76 | rand_poisson!(A::CuArray; kwargs...) = rand_poisson!(poisson_rng(A), A; kwargs...)
77 | rand_logn(A::CuArray; kwargs...) = rand_logn!(logn_rng(A), A; kwargs...)
78 | rand_poisson(A::CuArray; kwargs...) = rand_poisson!(poisson_rng(A), A; kwargs...)
79 |
80 | rand(::Type{X}, args...; kwargs...) where {X} = rand!(CuArray{X}(undef, args...); kwargs...)
81 | randn(::Type{X}, args...; kwargs...) where {X} = randn!(CuArray{X}(undef, args...); kwargs...)
82 | rand_logn(::Type{X}, args...; kwargs...) where {X} = rand_logn!(CuArray{X}(undef, args...); kwargs...)
83 | rand_poisson(::Type{X}, args...; kwargs...) where {X} = rand_poisson!(CuArray{X}(undef, args...); kwargs...)
84 |
85 | # specify default types
86 | rand(args...; kwargs...) = rand(Float32, args...; kwargs...)
87 | randn(args...; kwargs...) = randn(Float32, args...; kwargs...)
88 | rand_logn(args...; kwargs...) = rand_logn(Float32, args...; kwargs...)
89 | rand_poisson(args...; kwargs...) = rand_poisson(Cuint, args...; kwargs...)
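# Usage sketch of the two levels described at the top of this file
# (illustrative sizes; the eltypes shown are the ones CURAND supports):
#
#   A = CuArray{Float32}(undef, 1024)
#   rand!(A)                      # uniform, via CURAND
#   randn!(A; mean=0, stddev=2)   # normal, via CURAND
#   B = CuArray{Cuint}(undef, 1024)
#   rand_poisson!(B; lambda=4)    # Poisson, via CURAND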
90 | -------------------------------------------------------------------------------- /src/sparse/libcusparse.jl: -------------------------------------------------------------------------------- 1 | # low-level wrappers of the CUSPARSE library 2 | 3 | #helper functions 4 | function cusparseCreate() 5 | handle = Ref{cusparseHandle_t}() 6 | @check ccall( (:cusparseCreate, libcusparse), cusparseStatus_t, (Ptr{cusparseHandle_t},), handle) 7 | handle[] 8 | end 9 | 10 | function cusparseDestroy(handle) 11 | @check ccall( (:cusparseDestroy, libcusparse), cusparseStatus_t, (cusparseHandle_t,), handle) 12 | end 13 | 14 | function cusparseGetVersion(handle, version) 15 | @check ccall( (:cusparseGetVersion, libcusparse), cusparseStatus_t, (cusparseHandle_t, Ptr{Cint}), handle, version) 16 | end 17 | 18 | function cusparseSetStream(handle, streamId) 19 | @check ccall( (:cusparseSetStream, libcusparse), cusparseStatus_t, (cusparseHandle_t, CuStream_t), handle, streamId) 20 | end 21 | 22 | function cusparseGetStream(handle, streamId) 23 | @check ccall( (:cusparseGetStream, libcusparse), cusparseStatus_t, (cusparseHandle_t, Ptr{CuStream_t}), handle, streamId) 24 | end 25 | 26 | function cusparseGetPointerMode(handle, mode) 27 | @check ccall( (:cusparseGetPointerMode, libcusparse), cusparseStatus_t, (cusparseHandle_t, Ptr{cusparsePointerMode_t}), handle, mode) 28 | end 29 | 30 | function cusparseSetPointerMode(handle, mode) 31 | @check ccall( (:cusparseSetPointerMode, libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparsePointerMode_t), handle, mode) 32 | end 33 | 34 | function cusparseCreateHybMat(hybA) 35 | @check ccall( (:cusparseCreateHybMat, libcusparse), cusparseStatus_t, (Ptr{cusparseHybMat_t},), hybA) 36 | end 37 | 38 | function cusparseDestroyHybMat(hybA) 39 | @check ccall( (:cusparseDestroyHybMat, libcusparse), cusparseStatus_t, (cusparseHybMat_t,), hybA) 40 | end 41 | 42 | function cusparseCreateSolveAnalysisInfo(info) 43 | @check ccall( (:cusparseCreateSolveAnalysisInfo, libcusparse), cusparseStatus_t, (Ptr{cusparseSolveAnalysisInfo_t},), info) 44 | end 45 | 46 | function cusparseDestroySolveAnalysisInfo(info) 47 | @check ccall( (:cusparseDestroySolveAnalysisInfo, libcusparse), cusparseStatus_t, (cusparseSolveAnalysisInfo_t,), info) 48 | end 49 | 50 | function cusparseCreateBsrsm2Info(info) 51 | @check ccall( (:cusparseCreateBsrsm2Info, libcusparse), cusparseStatus_t, (Ptr{bsrsm2Info_t},), info) 52 | end 53 | 54 | function cusparseDestroyBsrsm2Info(info) 55 | @check ccall( (:cusparseDestroyBsrsm2Info, libcusparse), cusparseStatus_t, (bsrsm2Info_t,), info) 56 | end 57 | 58 | function cusparseCreateBsrsv2Info(info) 59 | @check ccall( (:cusparseCreateBsrsv2Info, libcusparse), cusparseStatus_t, (Ptr{bsrsv2Info_t},), info) 60 | end 61 | 62 | function cusparseDestroyBsrsv2Info(info) 63 | @check ccall( (:cusparseDestroyBsrsv2Info, libcusparse), cusparseStatus_t, (bsrsv2Info_t,), info) 64 | end 65 | 66 | function cusparseCreateCsrsv2Info(info) 67 | @check ccall( (:cusparseCreateCsrsv2Info, libcusparse), cusparseStatus_t, (Ptr{csrsv2Info_t},), info) 68 | end 69 | 70 | function cusparseDestroyCsrsv2Info(info) 71 | @check ccall( (:cusparseDestroyCsrsv2Info, libcusparse), cusparseStatus_t, (csrsv2Info_t,), info) 72 | end 73 | 74 | function cusparseCreateCsric02Info(info) 75 | @check ccall( (:cusparseCreateCsric02Info, libcusparse), cusparseStatus_t, (Ptr{csric02Info_t},), info) 76 | end 77 | 78 | function cusparseDestroyCsric02Info(info) 79 | @check ccall( (:cusparseDestroyCsric02Info, libcusparse), 
cusparseStatus_t, (csric02Info_t,), info) 80 | end 81 | 82 | function cusparseCreateCsrilu02Info(info) 83 | @check ccall( (:cusparseCreateCsrilu02Info, libcusparse), cusparseStatus_t, (Ptr{csrilu02Info_t},), info) 84 | end 85 | 86 | function cusparseDestroyCsrilu02Info(info) 87 | @check ccall( (:cusparseDestroyCsrilu02Info, libcusparse), cusparseStatus_t, (csrilu02Info_t,), info) 88 | end 89 | 90 | function cusparseCreateBsric02Info(info) 91 | @check ccall( (:cusparseCreateBsric02Info, libcusparse), cusparseStatus_t, (Ptr{bsric02Info_t},), info) 92 | end 93 | 94 | function cusparseDestroyBsric02Info(info) 95 | @check ccall( (:cusparseDestroyBsric02Info, libcusparse), cusparseStatus_t, (bsric02Info_t,), info) 96 | end 97 | 98 | function cusparseCreateBsrilu02Info(info) 99 | @check ccall( (:cusparseCreateBsrilu02Info, libcusparse), cusparseStatus_t, (Ptr{bsrilu02Info_t},), info) 100 | end 101 | 102 | function cusparseDestroyBsrilu02Info(info) 103 | @check ccall( (:cusparseDestroyBsrilu02Info, libcusparse), cusparseStatus_t, (bsrilu02Info_t,), info) 104 | end 105 | 106 | function cusparseGetProperty(property::CUDAapi.libraryPropertyType) 107 | value_ref = Ref{Cint}() 108 | @check ccall((:cusparseGetProperty, libcusparse), 109 | cusparseStatus_t, 110 | (Cint, Ptr{Cint}), 111 | property, value_ref) 112 | value_ref[] 113 | end 114 | -------------------------------------------------------------------------------- /test/sparse_solver.jl: -------------------------------------------------------------------------------- 1 | @testset "CUSPARSE + CUSOLVER" begin 2 | 3 | using CuArrays.CUSPARSE 4 | using CuArrays.CUSOLVER 5 | 6 | using LinearAlgebra 7 | using SparseArrays 8 | 9 | m = 15 10 | n = 10 11 | l = 13 12 | k = 1 13 | 14 | @testset for elty in [Float32, Float64, ComplexF32, ComplexF64] 15 | @testset "csrlsvlu!" begin 16 | A = sparse(rand(elty,n,n)) 17 | b = rand(elty,n) 18 | x = zeros(elty,n) 19 | tol = convert(real(elty),1e-6) 20 | x = CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 21 | @test x ≈ Array(A)\b 22 | A = sparse(rand(elty,m,n)) 23 | @test_throws DimensionMismatch CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 24 | A = sparse(rand(elty,n,n)) 25 | b = rand(elty,m) 26 | x = zeros(elty,n) 27 | @test_throws DimensionMismatch CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 28 | b = rand(elty,n) 29 | x = zeros(elty,m) 30 | @test_throws DimensionMismatch CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 31 | end 32 | 33 | @testset "csrlsvqr!" begin 34 | A = sparse(rand(elty,n,n)) 35 | d_A = CuSparseMatrixCSR(A) 36 | b = rand(elty,n) 37 | d_b = CuArray(b) 38 | x = zeros(elty,n) 39 | d_x = CuArray(x) 40 | tol = convert(real(elty),1e-4) 41 | d_x = CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 42 | h_x = collect(d_x) 43 | @test h_x ≈ Array(A)\b 44 | A = sparse(rand(elty,m,n)) 45 | d_A = CuSparseMatrixCSR(A) 46 | @test_throws DimensionMismatch CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 47 | A = sparse(rand(elty,n,n)) 48 | b = rand(elty,m) 49 | x = zeros(elty,n) 50 | @test_throws DimensionMismatch CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 51 | b = rand(elty,n) 52 | x = zeros(elty,m) 53 | @test_throws DimensionMismatch CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 54 | end 55 | 56 | @testset "csrlsvchol!" 
begin 57 | A = rand(elty,n,n) 58 | A = sparse(A*A') #posdef 59 | d_A = CuSparseMatrixCSR(A) 60 | b = rand(elty,n) 61 | d_b = CuArray(b) 62 | x = zeros(elty,n) 63 | d_x = CuArray(x) 64 | tol = 10^2*eps(real(elty)) 65 | d_x = CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 66 | h_x = collect(d_x) 67 | @test h_x ≈ Array(A)\b 68 | b = rand(elty,m) 69 | d_b = CuArray(b) 70 | @test_throws DimensionMismatch CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 71 | b = rand(elty,n) 72 | d_b = CuArray(b) 73 | x = rand(elty,m) 74 | d_x = CuArray(x) 75 | @test_throws DimensionMismatch CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 76 | A = sparse(rand(elty,m,n)) 77 | d_A = CuSparseMatrixCSR(A) 78 | @test_throws DimensionMismatch CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 79 | end 80 | 81 | @testset "csreigvsi" begin 82 | A = sparse(rand(elty,n,n)) 83 | A = A + A' 84 | d_A = CuSparseMatrixCSR(A) 85 | evs = eigvals(Array(A)) 86 | x_0 = CuArray(rand(elty,n)) 87 | μ,x = CUSOLVER.csreigvsi(d_A,convert(elty,evs[1]),x_0,convert(real(elty),1e-6),convert(Cint,1000),'O') 88 | @test μ ≈ evs[1] 89 | A = sparse(rand(elty,m,n)) 90 | d_A = CuSparseMatrixCSR(A) 91 | @test_throws DimensionMismatch CUSOLVER.csreigvsi(d_A,convert(elty,evs[1]),x_0,convert(real(elty),1e-6),convert(Cint,1000),'O') 92 | A = sparse(rand(elty,n,n)) 93 | d_A = CuSparseMatrixCSR(A) 94 | x_0 = CuArray(rand(elty,m)) 95 | @test_throws DimensionMismatch CUSOLVER.csreigvsi(d_A,convert(elty,evs[1]),x_0,convert(real(elty),1e-6),convert(Cint,1000),'O') 96 | end 97 | @testset "csreigs" begin 98 | celty = complex(elty) 99 | A = rand(real(elty),n,n) 100 | A = sparse(A + A') 101 | num = CUSOLVER.csreigs(A,convert(celty,complex(-100,-100)),convert(celty,complex(100,100)),'O') 102 | @test num <= n 103 | A = sparse(rand(celty,m,n)) 104 | d_A = CuSparseMatrixCSR(A) 105 | @test_throws DimensionMismatch CUSOLVER.csreigs(A,convert(celty,complex(-100,-100)),convert(celty,complex(100,100)),'O') 106 | end 107 | @testset "csrlsqvqr!" begin 108 | A = sparse(rand(elty,n,n)) 109 | b = rand(elty,n) 110 | x = zeros(elty,n) 111 | tol = convert(real(elty),1e-4) 112 | x = CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 113 | @test x[1] ≈ Array(A)\b 114 | A = sparse(rand(elty,n,m)) 115 | x = zeros(elty,n) 116 | @test_throws DimensionMismatch CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 117 | A = sparse(rand(elty,n,n)) 118 | b = rand(elty,m) 119 | x = zeros(elty,n) 120 | @test_throws DimensionMismatch CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 121 | b = rand(elty,n) 122 | x = zeros(elty,m) 123 | @test_throws DimensionMismatch CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 124 | end 125 | end 126 | 127 | end 128 | -------------------------------------------------------------------------------- /src/fft/highlevel.jl: -------------------------------------------------------------------------------- 1 | # region is an iterable subset of dimensions 2 | # spec. an integer, range, tuple, or array 3 | 4 | # inplace complex 5 | function plan_fft!(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 6 | K = CUFFT_FORWARD 7 | inplace = true 8 | xtype = (T == cufftComplex) ? CUFFT_C2C : CUFFT_Z2Z 9 | 10 | pp = _mkplan(xtype, size(X), region) 11 | 12 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 13 | end 14 | 15 | function plan_bfft!(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 16 | K = CUFFT_INVERSE 17 | inplace = true 18 | xtype = (T == cufftComplex) ? 
CUFFT_C2C : CUFFT_Z2Z 19 | 20 | pp = _mkplan(xtype, size(X), region) 21 | 22 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 23 | end 24 | 25 | # out-of-place complex 26 | function plan_fft(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 27 | K = CUFFT_FORWARD 28 | xtype = (T == cufftComplex) ? CUFFT_C2C : CUFFT_Z2Z 29 | inplace = false 30 | 31 | pp = _mkplan(xtype, size(X), region) 32 | 33 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 34 | end 35 | 36 | function plan_bfft(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 37 | K = CUFFT_INVERSE 38 | inplace = false 39 | xtype = (T == cufftComplex) ? CUFFT_C2C : CUFFT_Z2Z 40 | 41 | pp = _mkplan(xtype, size(X), region) 42 | 43 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 44 | end 45 | 46 | # out-of-place real-to-complex 47 | function plan_rfft(X::CuArray{T,N}, region) where {T<:cufftReals,N} 48 | K = CUFFT_FORWARD 49 | inplace = false 50 | xtype = (T == cufftReal) ? CUFFT_R2C : CUFFT_D2Z 51 | 52 | pp = _mkplan(xtype, size(X), region) 53 | 54 | ydims = collect(size(X)) 55 | ydims[region[1]] = div(ydims[region[1]],2)+1 56 | 57 | rCuFFTPlan{T,K,inplace,N}(pp, X, (ydims...,), region, xtype) 58 | end 59 | 60 | function plan_brfft(X::CuArray{T,N}, d::Integer, region::Any) where {T<:cufftComplexes,N} 61 | K = CUFFT_INVERSE 62 | inplace = false 63 | xtype = (T == cufftComplex) ? CUFFT_C2R : CUFFT_Z2D 64 | ydims = collect(size(X)) 65 | ydims[region[1]] = d 66 | 67 | pp = _mkplan(xtype, (ydims...,), region) 68 | 69 | rCuFFTPlan{T,K,inplace,N}(pp, X, (ydims...,), region, xtype) 70 | end 71 | 72 | # FIXME: plan_inv methods allocate needlessly (to provide type parameters) 73 | # Perhaps use FakeArray types to avoid this. 74 | 75 | function plan_inv(p::cCuFFTPlan{T,CUFFT_FORWARD,inplace,N}) where {T,N,inplace} 76 | X = CuArray{T}(undef, p.sz) 77 | pp = _mkplan(p.xtype, p.sz, p.region) 78 | ScaledPlan(cCuFFTPlan{T,CUFFT_INVERSE,inplace,N}(pp, X, p.sz, p.region, 79 | p.xtype), 80 | normalization(X, p.region)) 81 | end 82 | 83 | function plan_inv(p::cCuFFTPlan{T,CUFFT_INVERSE,inplace,N}) where {T,N,inplace} 84 | X = CuArray{T}(undef, p.sz) 85 | pp = _mkplan(p.xtype, p.sz, p.region) 86 | ScaledPlan(cCuFFTPlan{T,CUFFT_FORWARD,inplace,N}(pp, X, p.sz, p.region, 87 | p.xtype), 88 | normalization(X, p.region)) 89 | end 90 | 91 | function plan_inv(p::rCuFFTPlan{T,CUFFT_INVERSE,inplace,N} 92 | ) where {T<:cufftComplexes,N,inplace} 93 | X = CuArray{real(T)}(undef, p.osz) 94 | Y = CuArray{T}(undef, p.sz) 95 | xtype = p.xtype == CUFFT_C2R ? CUFFT_R2C : CUFFT_D2Z 96 | pp = _mkplan(xtype, p.osz, p.region) 97 | ScaledPlan(rCuFFTPlan{real(T),CUFFT_FORWARD,inplace,N}(pp, X, p.sz, p.region, 98 | xtype), 99 | normalization(X, p.region)) 100 | end 101 | 102 | function plan_inv(p::rCuFFTPlan{T,CUFFT_FORWARD,inplace,N} 103 | ) where {T<:cufftReals,N,inplace} 104 | X = CuArray{complex(T)}(undef, p.osz) 105 | Y = CuArray{T}(undef, p.sz) 106 | xtype = p.xtype == CUFFT_R2C ? 
CUFFT_C2R : CUFFT_Z2D 107 | pp = _mkplan(xtype, p.sz, p.region) 108 | ScaledPlan(rCuFFTPlan{complex(T),CUFFT_INVERSE,inplace,N}(pp, X, p.sz, 109 | p.region, xtype), 110 | normalization(Y, p.region)) 111 | end 112 | 113 | 114 | # The rest of the standard API 115 | 116 | size(p::CuFFTPlan) = p.sz 117 | 118 | function mul!(y::CuArray{Ty}, p::CuFFTPlan{T,K,false}, x::CuArray{T} 119 | ) where {Ty,T,K} 120 | assert_applicable(p,x,y) 121 | unsafe_execute!(p,x,y) 122 | return y 123 | end 124 | 125 | function *(p::cCuFFTPlan{T,K,true,N}, x::CuArray{T,N}) where {T,K,N} 126 | assert_applicable(p,x) 127 | unsafe_execute!(p,x) 128 | x 129 | end 130 | 131 | function *(p::rCuFFTPlan{T,CUFFT_FORWARD,false,N}, x::CuArray{T,N} 132 | ) where {T<:cufftReals,N} 133 | @assert p.xtype ∈ [CUFFT_R2C,CUFFT_D2Z] 134 | y = CuArray{complex(T),N}(undef, p.osz) 135 | mul!(y,p,x) 136 | y 137 | end 138 | 139 | function *(p::rCuFFTPlan{T,CUFFT_INVERSE,false,N}, x::CuArray{T,N} 140 | ) where {T<:cufftComplexes,N} 141 | @assert p.xtype ∈ [CUFFT_C2R,CUFFT_Z2D] 142 | y = CuArray{real(T),N}(undef, p.osz) 143 | mul!(y,p,x) 144 | y 145 | end 146 | 147 | function *(p::cCuFFTPlan{T,K,false,N}, x::CuArray{T,N}) where {T,K,N} 148 | y = CuArray{T,N}(undef, p.osz) 149 | mul!(y,p,x) 150 | y 151 | end -------------------------------------------------------------------------------- /src/solver/libcusolver.jl: -------------------------------------------------------------------------------- 1 | # low-level wrappers of the CUSOLVER library 2 | 3 | #helper functions 4 | function cusolverDnCreate() 5 | handle = Ref{cusolverDnHandle_t}() 6 | @check ccall((:cusolverDnCreate, libcusolver), 7 | cusolverStatus_t, 8 | (Ptr{cusolverDnHandle_t},), 9 | handle) 10 | return handle[] 11 | end 12 | 13 | function cusolverDnDestroy(handle) 14 | @check ccall((:cusolverDnDestroy, libcusolver), 15 | cusolverStatus_t, 16 | (cusolverDnHandle_t,), 17 | handle) 18 | end 19 | 20 | function cusolverDnSetStream(handle, streamId) 21 | @check ccall((:cusolverDnSetStream, libcusolver), 22 | cusolverStatus_t, 23 | (cusolverDnHandle_t, CuStream_t), 24 | handle, streamId) 25 | end 26 | 27 | function cusolverDnGetStream(handle, streamId) 28 | @check ccall((:cusolverDnGetStream, libcusolver), 29 | cusolverStatus_t, 30 | (cusolverDnHandle_t, Ptr{CuStream_t}), 31 | handle, streamId) 32 | end 33 | 34 | function cusolverSpCreate() 35 | handle = Ref{cusolverSpHandle_t}() 36 | @check ccall((:cusolverSpCreate, libcusolver), 37 | cusolverStatus_t, 38 | (Ptr{cusolverSpHandle_t},), 39 | handle) 40 | return handle[] 41 | end 42 | 43 | function cusolverSpDestroy(handle) 44 | @check ccall((:cusolverSpDestroy, libcusolver), 45 | cusolverStatus_t, 46 | (cusolverSpHandle_t,), 47 | handle) 48 | end 49 | 50 | function cusolverSpSetStream(handle, streamId) 51 | @check ccall((:cusolverSpSetStream, libcusolver), 52 | cusolverStatus_t, 53 | (cusolverSpHandle_t, CuStream_t), 54 | handle, streamId) 55 | end 56 | 57 | function cusolverSpGetStream(handle, streamId) 58 | @check ccall((:cusolverSpGetStream, libcusolver), 59 | cusolverStatus_t, 60 | (cusolverSpHandle_t, Ptr{CuStream_t}), 61 | handle, streamId) 62 | end 63 | 64 | function cusolverSpCreateCsrqrInfo(info) 65 | @check ccall((:cusolverSpCreateCsrqrInfo, libcusolver), 66 | cusolverStatus_t, 67 | (Ptr{csrqrInfo_t},), 68 | info) 69 | end 70 | 71 | function cusolverSpDestroyCsrqrInfo(info) 72 | @check ccall((:cusolverSpDestroyCsrqrInfo, libcusolver), 73 | cusolverStatus_t, 74 | (csrqrInfo_t,), 75 | info) 76 | end 77 | 78 | function
cusolverDnCreateGesvdjInfo(info) 79 | @check ccall((:cusolverDnCreateGesvdjInfo, libcusolver), 80 | cusolverStatus_t, 81 | (Ptr{gesvdjInfo_t},), 82 | info) 83 | end 84 | 85 | function cusolverDnDestroyGesvdjInfo(info) 86 | @check ccall((:cusolverDnDestroyGesvdjInfo, libcusolver), 87 | cusolverStatus_t, 88 | (gesvdjInfo_t,), 89 | info) 90 | end 91 | 92 | function cusolverDnXgesvdjSetTolerance(info, tolerance) 93 | @check ccall((:cusolverDnXgesvdjSetTolerance, libcusolver), 94 | cusolverStatus_t, 95 | (gesvdjInfo_t, Float64), 96 | info, Float64(tolerance)) 97 | end 98 | 99 | function cusolverDnXgesvdjSetMaxSweeps(info, max_sweeps) 100 | @check ccall((:cusolverDnXgesvdjSetMaxSweeps, libcusolver), 101 | cusolverStatus_t, 102 | (gesvdjInfo_t, Cint), 103 | info, Cint(max_sweeps)) 104 | end 105 | 106 | function cusolverDnCreateSyevjInfo(info) 107 | @check ccall((:cusolverDnCreateSyevjInfo, libcusolver), 108 | cusolverStatus_t, 109 | (Ptr{syevjInfo_t},), 110 | info) 111 | end 112 | 113 | function cusolverDnDestroySyevjInfo(info) 114 | @check ccall((:cusolverDnDestroySyevjInfo, libcusolver), 115 | cusolverStatus_t, 116 | (syevjInfo_t,), 117 | info) 118 | end 119 | 120 | function cusolverDnXsyevjSetTolerance(info, tolerance) 121 | @check ccall((:cusolverDnXsyevjSetTolerance, libcusolver), 122 | cusolverStatus_t, 123 | (syevjInfo_t, Float64), 124 | info, Float64(tolerance)) 125 | end 126 | 127 | function cusolverDnXsyevjSetMaxSweeps(info, max_sweeps) 128 | @check ccall((:cusolverDnXsyevjSetMaxSweeps, libcusolver), 129 | cusolverStatus_t, 130 | (syevjInfo_t, Cint), 131 | info, Cint(max_sweeps)) 132 | end 133 | 134 | function cusolverRfCreate(handle) 135 | @check ccall((:cusolverRfCreate, libcusolver), 136 | cusolverStatus_t, 137 | (Ptr{cusolverRfHandle_t},), 138 | handle) 139 | end 140 | 141 | function cusolverRfDestroy(handle) 142 | @check ccall((:cusolverRfDestroy, libcusolver), 143 | cusolverStatus_t, 144 | (cusolverRfHandle_t,), 145 | handle) 146 | end 147 | 148 | function cusolverRfSetStream(handle, streamId) 149 | @check ccall((:cusolverRfSetStream, libcusolver), 150 | cusolverStatus_t, 151 | (cusolverRfHandle_t, CuStream_t), 152 | handle, streamId) 153 | end 154 | 155 | function cusolverRfGetStream(handle, streamId) 156 | @check ccall((:cusolverRfGetStream, libcusolver), 157 | cusolverStatus_t, 158 | (cusolverRfHandle_t, Ptr{CuStream_t}), 159 | handle, streamId) 160 | end 161 | 162 | function cusolverGetProperty(property::CUDAapi.libraryPropertyType) 163 | value_ref = Ref{Cint}() 164 | @check ccall((:cusolverGetProperty, libcusolver), 165 | cusolverStatus_t, 166 | (Cint, Ptr{Cint}), 167 | property, value_ref) 168 | value_ref[] 169 | end 170 | -------------------------------------------------------------------------------- /src/dnn/helpers.jl: -------------------------------------------------------------------------------- 1 | # For low level cudnn functions that require a pointer to a number 2 | cptr(x,a::CuArray{Float64})=Float64[x] 3 | cptr(x,a::CuArray{Float32})=Float32[x] 4 | cptr(x,a::CuArray{Float16})=Float32[x] 5 | 6 | # Conversion between Julia and CUDNN datatypes 7 | cudnnDataType(::Type{Float16})=CUDNN_DATA_HALF 8 | cudnnDataType(::Type{Float32})=CUDNN_DATA_FLOAT 9 | cudnnDataType(::Type{Float64})=CUDNN_DATA_DOUBLE 10 | juliaDataType(a)=(a==CUDNN_DATA_HALF ? Float16 : 11 | a==CUDNN_DATA_FLOAT ? Float32 : 12 | a==CUDNN_DATA_DOUBLE ? 
Float64 : error()) 13 | 14 | tuple_strides(A::Tuple) = _strides((1,), A) 15 | _strides(out::Tuple{Int}, A::Tuple{}) = () 16 | _strides(out::NTuple{N,Int}, A::NTuple{N}) where {N} = out 17 | function _strides(out::NTuple{M,Int}, A::Tuple) where M 18 | Base.@_inline_meta 19 | _strides((out..., out[M]*A[M]), A) 20 | end 21 | 22 | # Descriptors 23 | 24 | mutable struct TensorDesc; ptr; end 25 | free(td::TensorDesc) = cudnnDestroyTensorDescriptor(td.ptr) 26 | Base.unsafe_convert(::Type{cudnnTensorDescriptor_t}, td::TensorDesc) = td.ptr 27 | Base.unsafe_convert(::Type{Ptr{Nothing}}, td::TensorDesc) = convert(Ptr{Nothing}, td.ptr) 28 | 29 | function TensorDesc(T::Type, size::NTuple{N,Integer}, strides::NTuple{N,Integer} = tuple_strides(size)) where N 30 | sz = Cint.(size) |> reverse |> collect 31 | st = Cint.(strides) |> reverse |> collect 32 | d = Ref{cudnnTensorDescriptor_t}() 33 | cudnnCreateTensorDescriptor(d) 34 | cudnnSetTensorNdDescriptor(d[], cudnnDataType(T), length(sz), sz, st) 35 | this = TensorDesc(d[]) 36 | finalizer(free, this) 37 | return this 38 | end 39 | 40 | TensorDesc(a::CuArray) = TensorDesc(eltype(a), size(a), strides(a)) 41 | 42 | mutable struct FilterDesc 43 | ptr 44 | end 45 | free(fd::FilterDesc)=cudnnDestroyFilterDescriptor(fd.ptr) 46 | Base.unsafe_convert(::Type{cudnnFilterDescriptor_t}, fd::FilterDesc)=fd.ptr 47 | Base.unsafe_convert(::Type{Ptr{Nothing}}, fd::FilterDesc)=fd.ptr 48 | 49 | function createFilterDesc() 50 | d = Ref{cudnnFilterDescriptor_t}() 51 | @check cudnnCreateFilterDescriptor(d) 52 | return d[] 53 | end 54 | 55 | function FilterDesc(T::Type, size::Tuple; format = CUDNN_TENSOR_NCHW) 56 | # The only difference of a FilterDescriptor is no strides. 57 | sz = Cint.(size) |> reverse |> collect 58 | d = createFilterDesc() 59 | version() >= v"5" ? 60 | cudnnSetFilterNdDescriptor(d, cudnnDataType(T), format, length(sz), sz) : 61 | version() >= v"4" ? 62 | cudnnSetFilterNdDescriptor_v4(d, cudnnDataType(T), format, length(sz), sz) : 63 | cudnnSetFilterNdDescriptor(d, cudnnDataType(T), length(sz), sz) 64 | this = FilterDesc(d) 65 | finalizer(free, this) 66 | return this 67 | end 68 | 69 | FilterDesc(a::CuArray; format = CUDNN_TENSOR_NCHW) = FilterDesc(eltype(a), size(a), format = format) 70 | 71 | function Base.size(f::FilterDesc) 72 | typ = Ref{Cuint}() 73 | format = Ref{Cuint}() 74 | ndims = Ref{Cint}() 75 | dims = Vector{Cint}(undef, 8) 76 | cudnnGetFilterNdDescriptor(f, 8, typ, format, ndims, dims) 77 | @assert ndims[] ≤ 8 78 | return (dims[1:ndims[]]...,) |> reverse 79 | end 80 | 81 | mutable struct ConvDesc; ptr; end 82 | free(cd::ConvDesc) = cudnnDestroyConvolutionDescriptor(cd.ptr) 83 | Base.unsafe_convert(::Type{cudnnConvolutionDescriptor_t}, cd::ConvDesc)=cd.ptr 84 | 85 | function cdsize(w, nd) 86 | isa(w, Integer) && return Cint[fill(w,nd)...] 87 | length(w) == nd && return Cint[reverse(w)...] 88 | length(w) == 2*nd && return Cint[reverse(w[nd+1:end])...] 89 | throw(DimensionMismatch()) 90 | end 91 | 92 | pdsize(w, nd)=Cint[reverse(psize(w,nd))...] 93 | function psize(w, nd) 94 | isa(w, Integer) && return Cint[fill(w,nd)...] 95 | length(w) == nd && return w 96 | length(w) == 2*nd && return w[1:nd] 97 | throw(DimensionMismatch()) 98 | end 99 | 100 | function ConvDesc(T, N, padding, stride, dilation, mode) 101 | cd = Ref{cudnnConvolutionDescriptor_t}() 102 | cudnnCreateConvolutionDescriptor(cd) 103 | version() >= v"4" ? 
cudnnSetConvolutionNdDescriptor(cd[],N,cdsize(padding,N),cdsize(stride,N),cdsize(dilation,N),mode,cudnnDataType(T)) : 104 | version() >= v"3" ? cudnnSetConvolutionNdDescriptor_v3(cd[],N,cdsize(padding,N),cdsize(stride,N),cdsize(dilation,N),mode,cudnnDataType(T)) : 105 | cudnnSetConvolutionNdDescriptor(cd[],N,cdsize(padding,N),cdsize(stride,N),cdsize(dilation,N),mode) 106 | this = ConvDesc(cd[]) 107 | finalizer(free, this) 108 | return this 109 | end 110 | 111 | function ConvDesc(T, cdims::DenseConvDims) 112 | pd = NNlib.padding(cdims) 113 | if !all(pd[1:2:end] .== pd[2:2:end]) 114 | @warn("CuDNN does not support asymmetric padding; defaulting to symmetric choice") 115 | end 116 | return ConvDesc(T, NNlib.spatial_dims(cdims), pd[1:2:end], NNlib.stride(cdims), 117 | NNlib.dilation(cdims), NNlib.flipkernel(cdims)) 118 | end 119 | 120 | mutable struct PoolDesc; ptr; end 121 | free(pd::PoolDesc)=cudnnDestroyPoolingDescriptor(pd.ptr) 122 | Base.unsafe_convert(::Type{cudnnPoolingDescriptor_t}, pd::PoolDesc)=pd.ptr 123 | 124 | function PoolDesc(nd, window, padding, stride, mode, maxpoolingNanOpt=CUDNN_NOT_PROPAGATE_NAN) 125 | pd = Ref{cudnnPoolingDescriptor_t}() 126 | cudnnCreatePoolingDescriptor(pd) 127 | cudnnSetPoolingNdDescriptor(pd[],mode,maxpoolingNanOpt,nd,pdsize(window,nd),pdsize(padding,nd),pdsize(stride,nd)) 128 | this = PoolDesc(pd[]) 129 | finalizer(free, this) 130 | return this 131 | end 132 | 133 | function PoolDesc(pdims::PoolDims, mode, maxpoolingNanOpt=CUDNN_NOT_PROPAGATE_NAN) 134 | pd = NNlib.padding(pdims) 135 | if !all(pd[1:2:end] .== pd[2:2:end]) 136 | @warn("CuDNN does not support asymmetric padding; defaulting to symmetric choice") 137 | end 138 | return PoolDesc(NNlib.spatial_dims(pdims), NNlib.kernel_size(pdims), pd[1:2:end], 139 | NNlib.stride(pdims), mode, maxpoolingNanOpt) 140 | end 141 | 142 | mutable struct ActivationDesc; ptr; end 143 | free(ad::ActivationDesc)=cudnnDestroyActivationDescriptor(ad.ptr) 144 | Base.unsafe_convert(::Type{cudnnActivationDescriptor_t}, ad::ActivationDesc)=ad.ptr 145 | 146 | function ActivationDesc(mode, coeff, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN) 147 | ad = Ref{cudnnActivationDescriptor_t}() 148 | cudnnCreateActivationDescriptor(ad) 149 | cudnnSetActivationDescriptor(ad[],mode,reluNanOpt,coeff) 150 | this = ActivationDesc(ad[]) 151 | finalizer(free, this) 152 | return this 153 | end 154 | -------------------------------------------------------------------------------- /src/sparse/libcusparse_types.jl: -------------------------------------------------------------------------------- 1 | #enum cusparseStatus_t 2 | #error messages from CUSPARSE 3 | 4 | """ 5 | Status messages from CUSPARSE's C API. 6 | """ 7 | const cusparseStatus_t = UInt32 8 | const CUSPARSE_STATUS_SUCCESS = 0 9 | const CUSPARSE_STATUS_NOT_INITIALIZED = 1 10 | const CUSPARSE_STATUS_ALLOC_FAILED = 2 11 | const CUSPARSE_STATUS_INVALID_VALUE = 3 12 | const CUSPARSE_STATUS_ARCH_MISMATCH = 4 13 | const CUSPARSE_STATUS_MAPPING_ERROR = 5 14 | const CUSPARSE_STATUS_EXECUTION_FAILED = 6 15 | const CUSPARSE_STATUS_INTERNAL_ERROR = 7 16 | const CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8 17 | 18 | #enum cusparseAction_t 19 | """ 20 | Perform operation on indices only (`CUSPARSE_ACTION_SYMBOLIC`) or 21 | on both data and indices (`CUSPARSE_ACTION_NUMERIC`). Used in 22 | conversion routines. 
23 | """ 24 | const cusparseAction_t = UInt32 25 | const CUSPARSE_ACTION_SYMBOLIC = 0 26 | const CUSPARSE_ACTION_NUMERIC = 1 27 | 28 | #enum cusparseDirection_t 29 | """ 30 | Parse dense matrix by rows (`CUSPARSE_DIRECTION_ROW`) or columns 31 | (`CUSPARSE_DIRECTION_COL`) to compute its number of non-zeros. 32 | """ 33 | const cusparseDirection_t = UInt32 34 | const CUSPARSE_DIRECTION_ROW = 0 35 | const CUSPARSE_DIRECTION_COL = 1 36 | 37 | #enum cusparseHybPartition_t 38 | """ 39 | How to partition the HYB matrix in a [`CudaSparseMatrixHYB`](@ref). 40 | There are three choices: 41 | * `CUSPARSE_HYB_PARTITION_AUTO` - let CUSPARSE decide internally for best performance. 42 | * `CUSPARSE_HYB_PARTITION_USER` - set the partition manually in the conversion function. 43 | * `CUSPARSE_HYB_PARTITION_MAX` - use the maximum partition, putting the matrix in ELL format. 44 | """ 45 | const cusparseHybPartition_t = UInt32 46 | const CUSPARSE_HYB_PARTITION_AUTO = 0 47 | const CUSPARSE_HYB_PARTITION_USER = 1 48 | const CUSPARSE_HYB_PARTITION_MAX = 2 49 | 50 | #enum cusparseFillMode_t 51 | """ 52 | Determines if a symmetric/Hermitian/triangular matrix has its upper 53 | (`CUSPARSE_FILL_MODE_UPPER`) or lower (`CUSPARSE_FILL_MODE_LOWER`) 54 | triangle filled. 55 | """ 56 | const cusparseFillMode_t = UInt32 57 | const CUSPARSE_FILL_MODE_LOWER = 0 58 | const CUSPARSE_FILL_MODE_UPPER = 1 59 | 60 | #enum cusparseDiagType_t 61 | """ 62 | Determines if the diagonal of a matrix is all ones (`CUSPARSE_DIAG_TYPE_UNIT`) 63 | or not all ones (`CUSPARSE_DIAG_TYPE_NON_UNIT`). 64 | """ 65 | const cusparseDiagType_t = UInt32 66 | const CUSPARSE_DIAG_TYPE_NON_UNIT = 0 67 | const CUSPARSE_DIAG_TYPE_UNIT = 1 68 | 69 | #enum cusparsePointerMode_t 70 | """ 71 | Determines if scalar arguments to a function are present on the host CPU 72 | (`CUSPARSE_POINTER_MODE_HOST`) or on the GPU (`CUSPARSE_POINTER_MODE_DEVICE`). 73 | """ 74 | const cusparsePointerMode_t = UInt32 75 | const CUSPARSE_POINTER_MODE_HOST = 0 76 | const CUSPARSE_POINTER_MODE_DEVICE = 1 77 | 78 | #enum cusparseOperation_t 79 | """ 80 | Determines whether to perform an operation, such as a matrix multiplication 81 | or solve, on the matrix as-is (`CUSPARSE_OPERATION_NON_TRANSPOSE`), on the 82 | matrix's transpose (`CUSPARSE_OPERATION_TRANSPOSE`), or on its conjugate 83 | transpose (`CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE`). 84 | """ 85 | const cusparseOperation_t = UInt32 86 | const CUSPARSE_OPERATION_NON_TRANSPOSE = 0 87 | const CUSPARSE_OPERATION_TRANSPOSE = 1 88 | const CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 89 | 90 | #enum cusparseMatrixType_t 91 | """ 92 | Indicates whether a matrix is a general matrix (`CUSPARSE_MATRIX_TYPE_GENERAL`), 93 | symmetric (`CUSPARSE_MATRIX_TYPE_SYMMETRIC`), Hermitian 94 | (`CUSPARSE_MATRIX_TYPE_HERMITIAN`), or triangular 95 | (`CUSPARSE_MATRIX_TYPE_TRIANGULAR`). Note that for some matrix types 96 | (those in [`CompressedSparse`](@ref)), this can be inferred for some function 97 | calls. 98 | """ 99 | const cusparseMatrixType_t = UInt32 100 | const CUSPARSE_MATRIX_TYPE_GENERAL = 0 101 | const CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1 102 | const CUSPARSE_MATRIX_TYPE_HERMITIAN = 2 103 | const CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3 104 | 105 | #enum cusparseSolvePolicy_t 106 | """ 107 | Indicates whether to keep level info in solvers (`CUSPARSE_SOLVE_POLICY_USE_LEVEL`) 108 | or whether to not use it (`CUSPARSE_SOLVE_POLICY_NO_LEVEL`). 
109 | """ 110 | const cusparseSolvePolicy_t = UInt32 111 | const CUSPARSE_SOLVE_POLICY_NO_LEVEL = 0 112 | const CUSPARSE_SOLVE_POLICY_USE_LEVEL = 1 113 | 114 | #enum cusparseIndexBase_t 115 | """ 116 | Indicates whether a sparse object is zero-indexed (`CUSPARSE_INDEX_BASE_ZERO`) 117 | or one-indexed (`CUSPARSE_INDEX_BASE_ONE`). CUSPARSE.jl supports both. Julia 118 | sparse matrices are one-indexed, but you may wish to pass matrices from other 119 | libraries which use zero-indexing (e.g. C language ODE solvers). 120 | """ 121 | const cusparseIndexBase_t = UInt32 122 | const CUSPARSE_INDEX_BASE_ZERO = 0 123 | const CUSPARSE_INDEX_BASE_ONE = 1 124 | 125 | #struct cusparseMatDescr_t 126 | """ 127 | Describes shape and properties of a CUSPARSE matrix. A convenience wrapper. 128 | 129 | Contains: 130 | * `MatrixType` - a [`cusparseMatrixType_t`](@ref) 131 | * `FillMode` - a [`cusparseFillMode_t`](@ref) 132 | * `DiagType` - a [`cusparseDiagType_t`](@ref) 133 | * `IndexBase` - a [`cusparseIndexBase_t`](@ref) 134 | """ 135 | struct cusparseMatDescr_t 136 | MatrixType::cusparseMatrixType_t 137 | FillMode::cusparseFillMode_t 138 | DiagType::cusparseDiagType_t 139 | IndexBase::cusparseIndexBase_t 140 | function cusparseMatDescr_t(MatrixType,FillMode,DiagType,IndexBase) 141 | new(MatrixType,FillMode,DiagType,IndexBase) 142 | end 143 | end 144 | 145 | """ 146 | An opaque struct containing information about the solution approach 147 | CUSPARSE will take. Generated by [`sv_analysis`](@ref) or 148 | [`sm_analysis`](@ref) and passed to [`sv_solve!`](@ref), [`sm_solve`](@ref), 149 | [`ic0!`](@ref), or [`ilu0!`](@ref). 150 | """ 151 | const cusparseSolveAnalysisInfo_t = Ptr{Cvoid} 152 | const bsrsm2Info_t = Ptr{Cvoid} 153 | const bsrsv2Info_t = Ptr{Cvoid} 154 | const csrsv2Info_t = Ptr{Cvoid} 155 | const csric02Info_t = Ptr{Cvoid} 156 | const csrilu02Info_t = Ptr{Cvoid} 157 | const bsric02Info_t = Ptr{Cvoid} 158 | const bsrilu02Info_t = Ptr{Cvoid} 159 | 160 | const cusparseContext = Cvoid 161 | const cusparseHandle_t = Ptr{cusparseContext} 162 | 163 | #complex numbers 164 | 165 | const cuComplex = Complex{Float32} 166 | const cuDoubleComplex = Complex{Float64} 167 | 168 | const CusparseFloat = Union{Float64,Float32,ComplexF64,ComplexF32} 169 | const CusparseReal = Union{Float64,Float32} 170 | const CusparseComplex = Union{ComplexF64,ComplexF32} 171 | 172 | const cusparseHybMat_t = Ptr{Cvoid} 173 | -------------------------------------------------------------------------------- /src/rand/libcurand.jl: -------------------------------------------------------------------------------- 1 | function create_generator(typ::Int=CURAND_RNG_PSEUDO_DEFAULT) 2 | ptr = Ref{curandGenerator_t}() 3 | @check ccall((:curandCreateGenerator, libcurand), 4 | curandStatus_t, 5 | (Ptr{curandGenerator_t}, Cint), ptr, typ) 6 | r = RNG(ptr[], typ) 7 | finalizer(destroy_generator, r) 8 | return r 9 | end 10 | 11 | function destroy_generator(rng::RNG) 12 | @check ccall((:curandDestroyGenerator, libcurand), 13 | curandStatus_t, 14 | (curandGenerator_t,), rng) 15 | end 16 | 17 | function get_version() 18 | ver = Ref{Cint}() 19 | @check ccall((:curandGetVersion, libcurand), 20 | curandStatus_t, 21 | (Ref{Cint},), ver) 22 | return ver[] 23 | end 24 | 25 | # TODO: curandSetStream 26 | 27 | function set_pseudo_random_generator_seed(rng::RNG, seed::Int64) 28 | @check ccall((:curandSetPseudoRandomGeneratorSeed, libcurand), 29 | curandStatus_t, 30 | (curandGenerator_t, Clonglong), rng, seed) 31 | end 32 | 33 | function 
set_generator_offset(rng::RNG, offset::Int64) 34 | @check ccall((:curandSetGeneratorOffset, libcurand), 35 | curandStatus_t, 36 | (curandGenerator_t, Clonglong), rng, offset) 37 | end 38 | 39 | function set_generator_ordering(rng::RNG, order::Int) 40 | @check ccall((:curandSetGeneratorOrdering, libcurand), 41 | curandStatus_t, 42 | (curandGenerator_t, Cint), rng, order) 43 | end 44 | 45 | function set_quasi_random_generator_dimensions(rng::RNG, num_dimensions::UInt) 46 | @check ccall((:curandSetQuasiRandomGeneratorDimensions, libcurand), 47 | curandStatus_t, 48 | (curandGenerator_t, Cuint), 49 | rng, num_dimensions) 50 | end 51 | 52 | 53 | """ 54 | Generate 32-bit pseudo- or quasirandom unsigned integers. 55 | """ 56 | function generate(rng::RNG, arr::CuArray, n::UInt) 57 | @check ccall((:curandGenerate, libcurand), 58 | curandStatus_t, 59 | (curandGenerator_t, CuPtr{UInt32}, Csize_t), 60 | rng, arr, length(arr)) 61 | return arr 62 | end 63 | 64 | 65 | """ 66 | Generate 64-bit quasirandom numbers. 67 | 68 | Valid RNG types are: 69 | - CURAND_RNG_QUASI_SOBOL64 70 | - CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 71 | """ 72 | function generate_long_long(rng::RNG, arr::CuArray) 73 | @check ccall((:curandGenerateLongLong, libcurand), 74 | curandStatus_t, 75 | (curandGenerator_t, CuPtr{Culonglong}, Csize_t), 76 | rng, arr, length(arr)) 77 | return arr 78 | end 79 | 80 | # uniform 81 | function generate_uniform(rng::RNG, arr::CuArray) 82 | @check ccall((:curandGenerateUniform, libcurand), 83 | curandStatus_t, 84 | (curandGenerator_t, CuPtr{Float32}, Csize_t), 85 | rng, arr, length(arr)) 86 | return arr 87 | end 88 | 89 | function generate_uniform_double(rng::RNG, arr::CuArray) 90 | @check ccall((:curandGenerateUniformDouble, libcurand), 91 | curandStatus_t, 92 | (curandGenerator_t, CuPtr{Float64}, Csize_t), 93 | rng, arr, length(arr)) 94 | return arr 95 | end 96 | 97 | # normal 98 | function generate_normal(rng::RNG, arr::CuArray, mean, stddev) 99 | @check ccall((:curandGenerateNormal, libcurand), 100 | curandStatus_t, 101 | (curandGenerator_t, CuPtr{Cfloat}, Csize_t, Cfloat, Cfloat), 102 | rng, arr, length(arr), mean, stddev) 103 | return arr 104 | end 105 | 106 | function generate_normal_double(rng::RNG, arr::CuArray, mean, stddev) 107 | @check ccall((:curandGenerateNormalDouble, libcurand), 108 | curandStatus_t, 109 | (curandGenerator_t, CuPtr{Cdouble}, Csize_t, Cdouble, Cdouble), 110 | rng, arr, length(arr), mean, stddev) 111 | return arr 112 | end 113 | 114 | 115 | # lognormal 116 | function generate_log_normal(rng::RNG, arr::CuArray, mean, stddev) 117 | @check ccall((:curandGenerateLogNormal, libcurand), 118 | curandStatus_t, 119 | (curandGenerator_t, CuPtr{Cfloat}, Csize_t, Cfloat, Cfloat), 120 | rng, arr, length(arr), mean, stddev) 121 | return arr 122 | end 123 | 124 | function generate_log_normal_double(rng::RNG, arr::CuArray, mean, stddev) 125 | @check ccall((:curandGenerateLogNormalDouble, libcurand), 126 | curandStatus_t, 127 | (curandGenerator_t, CuPtr{Cdouble}, Csize_t, Cdouble, Cdouble), 128 | rng, arr, length(arr), mean, stddev) 129 | return arr 130 | end 131 | 132 | # Poisson 133 | """Construct the histogram array for a Poisson distribution.""" 134 | function create_poisson_distribution(lambda) 135 | ptr = Ref{curandDiscreteDistribution_t}() 136 | @check ccall((:curandCreatePoissonDistribution, libcurand), 137 | curandStatus_t, 138 | (Cdouble, Ptr{curandDiscreteDistribution_t}), 139 | lambda, ptr) 140 | dist = DiscreteDistribution(ptr[]) 141 | finalizer(destroy_distribution, dist) 142 | return dist 143 |
end 144 | 145 | """Destroy the histogram array for a discrete distribution (e.g. Poisson).""" 146 | function destroy_distribution(dist::DiscreteDistribution) 147 | @check ccall((:curandDestroyDistribution, libcurand), 148 | curandStatus_t, 149 | (curandDiscreteDistribution_t,), 150 | dist) 151 | end 152 | 153 | """Generate Poisson-distributed unsigned ints.""" 154 | function generate_poisson(rng::RNG, arr::CuArray, lambda) 155 | @check ccall((:curandGeneratePoisson, libcurand), 156 | curandStatus_t, 157 | (curandGenerator_t, CuPtr{Cuint}, Csize_t, Cdouble), 158 | rng, arr, length(arr), lambda) 159 | return arr 160 | end 161 | 162 | # seeds 163 | """Generate the starting state of the generator. """ 164 | function generate_seeds(rng::RNG) 165 | @check ccall((:curandGenerateSeeds, libcurand), 166 | curandStatus_t, 167 | (curandGenerator_t,), rng) 168 | end 169 | 170 | # TODO: curandGetDirectionVectors32 171 | # TODO: curandGetScrambleConstants32 172 | # TODO: curandGetDirectionVectors64 173 | # TODO: curandGetScrambleConstants64 174 | 175 | function curandGetProperty(property::CUDAapi.libraryPropertyType) 176 | value_ref = Ref{Cint}() 177 | @check ccall((:curandGetProperty, libcurand), 178 | curandStatus_t, 179 | (Cint, Ptr{Cint}), 180 | property, value_ref) 181 | value_ref[] 182 | end 183 | -------------------------------------------------------------------------------- /test/fft.jl: -------------------------------------------------------------------------------- 1 | @testset "CUFFT" begin 2 | 3 | using CuArrays.CUFFT 4 | 5 | using FFTW 6 | 7 | # notes: 8 | # plan_bfft does not need separate testing since it is used by plan_ifft 9 | 10 | N1 = 8 11 | N2 = 32 12 | N3 = 64 13 | N4 = 8 14 | 15 | MYRTOL = 1e-5 16 | MYATOL = 1e-8 17 | 18 | # out-of-place 19 | function dotest1(X::AbstractArray{T,N}) where {T <: Complex,N} 20 | fftw_X = fft(X) 21 | d_X = CuArray(X) 22 | p = plan_fft(d_X) 23 | d_Y = p * d_X 24 | Y = collect(d_Y) 25 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 26 | 27 | pinv = plan_ifft(d_Y) 28 | d_Z = pinv * d_Y 29 | Z = collect(d_Z) 30 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 31 | 32 | pinv2 = inv(p) 33 | d_Z = pinv2 * d_Y 34 | Z = collect(d_Z) 35 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 36 | end 37 | 38 | function dotest1(X::AbstractArray{T,N}) where {T <: Real,N} 39 | fftw_X = rfft(X) 40 | d_X = CuArray(X) 41 | p = plan_rfft(d_X) 42 | d_Y = p * d_X 43 | Y = collect(d_Y) 44 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 45 | 46 | pinv = plan_irfft(d_Y,size(X,1)) 47 | d_Z = pinv * d_Y 48 | Z = collect(d_Z) 49 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 50 | 51 | pinv2 = inv(p) 52 | d_Z = pinv2 * d_Y 53 | Z = collect(d_Z) 54 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 55 | 56 | pinv3 = inv(pinv) 57 | d_W = pinv3 * d_X 58 | W = collect(d_W) 59 | @test isapprox(W, Y, rtol = MYRTOL, atol = MYATOL) 60 | end 61 | 62 | # in-place 63 | function dotest2(X::AbstractArray{T,N}) where {T <: Complex,N} 64 | fftw_X = fft(X) 65 | d_X = CuArray(X) 66 | p = plan_fft!(d_X) 67 | p * d_X 68 | Y = collect(d_X) 69 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 70 | 71 | pinv = plan_ifft!(d_X) 72 | pinv * d_X 73 | Z = collect(d_X) 74 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 75 | end 76 | 77 | # no inplace rfft for now 78 | 79 | # batch transforms 80 | function dotest3(X::AbstractArray{T,N},region) where {T <: Complex,N} 81 | fftw_X = fft(X,region) 82 | d_X = CuArray(X) 83 | p = plan_fft(d_X,region) 84 | d_Y = p * d_X 
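# (note: applying a plan with `*` allocates the output; `LinearAlgebra.mul!` can
# reuse a preallocated destination instead — see src/fft/highlevel.jl)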
85 | Y = collect(d_Y) 86 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 87 | 88 | pinv = plan_ifft(d_Y,region) 89 | d_Z = pinv * d_Y 90 | Z = collect(d_Z) 91 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 92 | end 93 | 94 | function dotest3(X::AbstractArray{T,N},region) where {T <: Real,N} 95 | fftw_X = rfft(X,region) 96 | d_X = CuArray(X) 97 | p = plan_rfft(d_X,region) 98 | d_Y = p * d_X 99 | Y = collect(d_Y) 100 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 101 | 102 | pinv = plan_irfft(d_Y,size(X,region[1]),region) 103 | d_Z = pinv * d_Y 104 | Z = collect(d_Z) 105 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 106 | end 107 | 108 | 109 | @testset "FFT" for (rtype,ctype) in [(Float32,ComplexF32), (Float64,ComplexF64)] 110 | 111 | @testset "1D FFT" begin 112 | dims = (N1,) 113 | X = rand(ctype, dims) 114 | dotest1(X) 115 | end 116 | @testset "1D inplace FFT" begin 117 | dims = (N1,) 118 | X = rand(ctype, dims) 119 | dotest2(X) 120 | end 121 | 122 | @testset "2D FFT" begin 123 | dims = (N1,N2) 124 | X = rand(ctype, dims) 125 | dotest1(X) 126 | end 127 | @testset "2D inplace FFT" begin 128 | dims = (N1,N2) 129 | X = rand(ctype, dims) 130 | dotest2(X) 131 | end 132 | 133 | @testset "Batch 1D FFT" begin 134 | dims = (N1,N2) 135 | X = rand(ctype, dims) 136 | dotest3(X,1) 137 | 138 | dims = (N1,N2) 139 | X = rand(ctype, dims) 140 | dotest3(X,2) 141 | 142 | dims = (N1,N2) 143 | X = rand(ctype, dims) 144 | dotest3(X,(1,2)) 145 | end 146 | 147 | @testset "3D FFT" begin 148 | dims = (N1,N2,N3) 149 | X = rand(ctype, dims) 150 | dotest1(X) 151 | end 152 | @testset "3D inplace FFT" begin 153 | dims = (N1,N2,N3) 154 | X = rand(ctype, dims) 155 | dotest2(X) 156 | end 157 | 158 | @testset "Batch 2D FFT (in 3D)" begin 159 | dims = (N1,N2,N3) 160 | for region in [(1,2),(2,3),(1,3)] 161 | X = rand(ctype, dims) 162 | dotest3(X,region) 163 | end 164 | 165 | X = rand(ctype, dims) 166 | @test_throws ArgumentError dotest3(X,(3,1)) 167 | end 168 | 169 | @testset "Batch 2D FFT (in 4D)" begin 170 | dims = (N1,N2,N3,N4) 171 | for region in [(1,2),(1,4),(3,4)] 172 | X = rand(ctype, dims) 173 | dotest3(X,region) 174 | end 175 | for region in [(1,3),(2,3),(2,4)] 176 | X = rand(ctype, dims) 177 | @test_throws ArgumentError dotest3(X,region) 178 | end 179 | 180 | end 181 | 182 | @testset "1D real FFT" begin 183 | X = rand(rtype, N1) 184 | dotest1(X) 185 | end 186 | 187 | @testset "Batch 1D real FFT" begin 188 | dims = (N1,N2) 189 | X = rand(rtype, dims) 190 | dotest3(X,1) 191 | 192 | dims = (N1,N2) 193 | X = rand(rtype, dims) 194 | dotest3(X,2) 195 | 196 | dims = (N1,N2) 197 | X = rand(rtype, dims) 198 | dotest3(X,(1,2)) 199 | end 200 | 201 | @testset "2D real FFT" begin 202 | X = rand(rtype, N1,N2) 203 | dotest1(X) 204 | end 205 | 206 | @testset "Batch 2D real FFT (in 3D)" begin 207 | dims = (N1,N2,N3) 208 | for region in [(1,2),(2,3),(1,3)] 209 | X = rand(rtype, dims) 210 | dotest3(X,region) 211 | end 212 | 213 | X = rand(rtype, dims) 214 | @test_throws ArgumentError dotest3(X,(3,1)) 215 | end 216 | 217 | @testset "Batch 2D real FFT (in 4D)" begin 218 | dims = (N1,N2,N3,N4) 219 | for region in [(1,2),(1,4),(3,4)] 220 | X = rand(rtype, dims) 221 | dotest3(X,region) 222 | end 223 | for region in [(1,3),(2,3),(2,4)] 224 | X = rand(rtype, dims) 225 | @test_throws ArgumentError dotest3(X,region) 226 | end 227 | end 228 | 229 | @testset "3D real FFT" begin 230 | X = rand(rtype, N1, N2, N3) 231 | dotest1(X) 232 | end 233 | 234 | end # testset FFT 235 | 236 | # integer array arguments 237 | 
function dotest5(X::AbstractArray{T,N}) where {T <: Complex,N} 238 | fftw_X = fft(X) 239 | d_X = CuArray(X) 240 | p = plan_fft(d_X) 241 | d_Y = p * d_X 242 | Y = collect(d_Y) 243 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 244 | d_Y = fft(d_X) 245 | Y = collect(d_Y) 246 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 247 | end 248 | 249 | function dotest5(X::AbstractArray{T,N}) where {T <: Real,N} 250 | fftw_X = rfft(X) 251 | d_X = CuArray(X) 252 | p = plan_rfft(d_X) 253 | d_Y = p * d_X 254 | Y = collect(d_Y) 255 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 256 | d_Y = rfft(d_X) 257 | Y = collect(d_Y) 258 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 259 | end 260 | 261 | @testset "Int FFT" for (rtype,ctype) in [(Int32,Complex{Int32}), (Int64,Complex{Int64})] 262 | 263 | @testset "1D FFT" begin 264 | dims = (N1,) 265 | X = rand(ctype, dims) 266 | dotest5(X) 267 | end 268 | 269 | @testset "1D real FFT" begin 270 | X = rand(rtype, N1) 271 | dotest5(X) 272 | end 273 | 274 | 275 | end # testset int FFT 276 | 277 | @testset "streams" begin 278 | X = rand(N1) 279 | d_X = CuArray(X) 280 | p = plan_fft(d_X) 281 | CUFFT.set_stream(p, CUDAdrv.CuDefaultStream()) 282 | end 283 | 284 | end 285 | -------------------------------------------------------------------------------- /src/dnn/libcudnn_types.jl: -------------------------------------------------------------------------------- 1 | const CUDNN_DIM_MAX = 8 2 | const CUDNN_LRN_MIN_N = 1 3 | const CUDNN_LRN_MAX_N = 16 4 | const CUDNN_LRN_MIN_K = 1.0e-5 5 | const CUDNN_LRN_MIN_BETA = 0.01 6 | const CUDNN_BN_MIN_EPSILON = 1.0e-5 7 | 8 | mutable struct cudnnContext 9 | end 10 | 11 | const cudnnHandle_t = Ptr{cudnnContext} 12 | 13 | # begin enum cudnnStatus_t 14 | const cudnnStatus_t = UInt32 15 | const CUDNN_STATUS_SUCCESS = (UInt32)(0) 16 | const CUDNN_STATUS_NOT_INITIALIZED = (UInt32)(1) 17 | const CUDNN_STATUS_ALLOC_FAILED = (UInt32)(2) 18 | const CUDNN_STATUS_BAD_PARAM = (UInt32)(3) 19 | const CUDNN_STATUS_INTERNAL_ERROR = (UInt32)(4) 20 | const CUDNN_STATUS_INVALID_VALUE = (UInt32)(5) 21 | const CUDNN_STATUS_ARCH_MISMATCH = (UInt32)(6) 22 | const CUDNN_STATUS_MAPPING_ERROR = (UInt32)(7) 23 | const CUDNN_STATUS_EXECUTION_FAILED = (UInt32)(8) 24 | const CUDNN_STATUS_NOT_SUPPORTED = (UInt32)(9) 25 | const CUDNN_STATUS_LICENSE_ERROR = (UInt32)(10) 26 | # end enum cudnnStatus_t 27 | 28 | mutable struct cudnnTensorStruct 29 | end 30 | 31 | const cudnnTensorDescriptor_t = Ptr{cudnnTensorStruct} 32 | 33 | mutable struct cudnnConvolutionStruct 34 | end 35 | 36 | const cudnnConvolutionDescriptor_t = Ptr{cudnnConvolutionStruct} 37 | 38 | mutable struct cudnnPoolingStruct 39 | end 40 | 41 | const cudnnPoolingDescriptor_t = Ptr{cudnnPoolingStruct} 42 | 43 | mutable struct cudnnFilterStruct 44 | end 45 | 46 | const cudnnFilterDescriptor_t = Ptr{cudnnFilterStruct} 47 | 48 | mutable struct cudnnLRNStruct 49 | end 50 | 51 | const cudnnLRNDescriptor_t = Ptr{cudnnLRNStruct} 52 | 53 | mutable struct cudnnActivationStruct 54 | end 55 | 56 | const cudnnActivationDescriptor_t = Ptr{cudnnActivationStruct} 57 | 58 | # begin enum cudnnDataType_t 59 | const cudnnDataType_t = UInt32 60 | const CUDNN_DATA_FLOAT = (UInt32)(0) 61 | const CUDNN_DATA_DOUBLE = (UInt32)(1) 62 | const CUDNN_DATA_HALF = (UInt32)(2) 63 | # end enum cudnnDataType_t 64 | 65 | # begin enum cudnnNanPropagation_t 66 | const cudnnNanPropagation_t = UInt32 67 | const CUDNN_NOT_PROPAGATE_NAN = (UInt32)(0) 68 | const CUDNN_PROPAGATE_NAN = (UInt32)(1) 69 | # end 
enum cudnnNanPropagation_t 70 | 71 | # begin enum cudnnTensorFormat_t 72 | const cudnnTensorFormat_t = UInt32 73 | const CUDNN_TENSOR_NCHW = (UInt32)(0) 74 | const CUDNN_TENSOR_NHWC = (UInt32)(1) 75 | # end enum cudnnTensorFormat_t 76 | 77 | # begin enum cudnnAddMode_t 78 | const cudnnAddMode_t = UInt32 79 | const CUDNN_ADD_IMAGE = (UInt32)(0) 80 | const CUDNN_ADD_SAME_HW = (UInt32)(0) 81 | const CUDNN_ADD_FEATURE_MAP = (UInt32)(1) 82 | const CUDNN_ADD_SAME_CHW = (UInt32)(1) 83 | const CUDNN_ADD_SAME_C = (UInt32)(2) 84 | const CUDNN_ADD_FULL_TENSOR = (UInt32)(3) 85 | # end enum cudnnAddMode_t 86 | 87 | # begin enum cudnnConvolutionMode_t 88 | const cudnnConvolutionMode_t = UInt32 89 | const CUDNN_CONVOLUTION = (UInt32)(0) 90 | const CUDNN_CROSS_CORRELATION = (UInt32)(1) 91 | # end enum cudnnConvolutionMode_t 92 | 93 | # begin enum cudnnConvolutionFwdPreference_t 94 | const cudnnConvolutionFwdPreference_t = UInt32 95 | const CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = (UInt32)(0) 96 | const CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = (UInt32)(1) 97 | const CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = (UInt32)(2) 98 | # end enum cudnnConvolutionFwdPreference_t 99 | 100 | # begin enum cudnnConvolutionFwdAlgo_t 101 | const cudnnConvolutionFwdAlgo_t = UInt32 102 | const CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = (UInt32)(0) 103 | const CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = (UInt32)(1) 104 | const CUDNN_CONVOLUTION_FWD_ALGO_GEMM = (UInt32)(2) 105 | const CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = (UInt32)(3) 106 | const CUDNN_CONVOLUTION_FWD_ALGO_FFT = (UInt32)(4) 107 | const CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = (UInt32)(5) 108 | # end enum cudnnConvolutionFwdAlgo_t 109 | 110 | mutable struct cudnnConvolutionFwdAlgoPerf_t 111 | algo::cudnnConvolutionFwdAlgo_t 112 | status::cudnnStatus_t 113 | time::Cfloat 114 | memory::Cint 115 | end 116 | 117 | # begin enum cudnnConvolutionBwdFilterPreference_t 118 | const cudnnConvolutionBwdFilterPreference_t = UInt32 119 | const CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = (UInt32)(0) 120 | const CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = (UInt32)(1) 121 | const CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = (UInt32)(2) 122 | # end enum cudnnConvolutionBwdFilterPreference_t 123 | 124 | # begin enum cudnnConvolutionBwdFilterAlgo_t 125 | const cudnnConvolutionBwdFilterAlgo_t = UInt32 126 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = (UInt32)(0) 127 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = (UInt32)(1) 128 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = (UInt32)(2) 129 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = (UInt32)(3) 130 | # end enum cudnnConvolutionBwdFilterAlgo_t 131 | 132 | mutable struct cudnnConvolutionBwdFilterAlgoPerf_t 133 | algo::cudnnConvolutionBwdFilterAlgo_t 134 | status::cudnnStatus_t 135 | time::Cfloat 136 | memory::Cint 137 | end 138 | 139 | # begin enum cudnnConvolutionBwdDataPreference_t 140 | const cudnnConvolutionBwdDataPreference_t = UInt32 141 | const CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = (UInt32)(0) 142 | const CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = (UInt32)(1) 143 | const CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = (UInt32)(2) 144 | # end enum cudnnConvolutionBwdDataPreference_t 145 | 146 | # begin enum cudnnConvolutionBwdDataAlgo_t 147 | const cudnnConvolutionBwdDataAlgo_t = UInt32 148 | const CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = (UInt32)(0) 149 | const CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = (UInt32)(1) 150 | const CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = (UInt32)(2) 151 | const 
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = (UInt32)(3) 152 | # end enum cudnnConvolutionBwdDataAlgo_t 153 | 154 | mutable struct cudnnConvolutionBwdDataAlgoPerf_t 155 | algo::cudnnConvolutionBwdDataAlgo_t 156 | status::cudnnStatus_t 157 | time::Cfloat 158 | memory::Cint 159 | end 160 | 161 | # begin enum cudnnSoftmaxAlgorithm_t 162 | const cudnnSoftmaxAlgorithm_t = UInt32 163 | const CUDNN_SOFTMAX_FAST = (UInt32)(0) 164 | const CUDNN_SOFTMAX_ACCURATE = (UInt32)(1) 165 | const CUDNN_SOFTMAX_LOG = (UInt32)(2) 166 | # end enum cudnnSoftmaxAlgorithm_t 167 | 168 | # begin enum cudnnSoftmaxMode_t 169 | const cudnnSoftmaxMode_t = UInt32 170 | const CUDNN_SOFTMAX_MODE_INSTANCE = (UInt32)(0) 171 | const CUDNN_SOFTMAX_MODE_CHANNEL = (UInt32)(1) 172 | # end enum cudnnSoftmaxMode_t 173 | 174 | # begin enum cudnnPoolingMode_t 175 | const cudnnPoolingMode_t = UInt32 176 | const CUDNN_POOLING_MAX = (UInt32)(0) 177 | const CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = (UInt32)(1) 178 | const CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = (UInt32)(2) 179 | # end enum cudnnPoolingMode_t 180 | 181 | # begin enum cudnnActivationMode_t 182 | const cudnnActivationMode_t = UInt32 183 | const CUDNN_ACTIVATION_SIGMOID = (UInt32)(0) 184 | const CUDNN_ACTIVATION_RELU = (UInt32)(1) 185 | const CUDNN_ACTIVATION_TANH = (UInt32)(2) 186 | const CUDNN_ACTIVATION_CLIPPED_RELU = (UInt32)(3) 187 | const CUDNN_ACTIVATION_ELU = (UInt32)(4) 188 | const CUDNN_ACTIVATION_IDENTITY = (UInt32)(5) 189 | # end enum cudnnActivationMode_t 190 | 191 | # begin enum cudnnLRNMode_t 192 | const cudnnLRNMode_t = UInt32 193 | const CUDNN_LRN_CROSS_CHANNEL_DIM1 = (UInt32)(0) 194 | # end enum cudnnLRNMode_t 195 | 196 | # begin enum cudnnDivNormMode_t 197 | const cudnnDivNormMode_t = UInt32 198 | const CUDNN_DIVNORM_PRECOMPUTED_MEANS = (UInt32)(0) 199 | # end enum cudnnDivNormMode_t 200 | 201 | # begin enum cudnnBatchNormMode_t 202 | const cudnnBatchNormMode_t = UInt32 203 | const CUDNN_BATCHNORM_PER_ACTIVATION = (UInt32)(0) 204 | const CUDNN_BATCHNORM_SPATIAL = (UInt32)(1) 205 | # end enum cudnnBatchNormMode_t 206 | 207 | # begin enum cudnnMathType_t 208 | const cudnnMathType_t = UInt32 209 | const CUDNN_DEFAULT_MATH = 0 210 | const CUDNN_TENSOR_OP_MATH = 1 211 | const CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2 212 | # end enum cudnnMathType_t 213 | -------------------------------------------------------------------------------- /src/fft/wrappers.jl: -------------------------------------------------------------------------------- 1 | # wrappers of the low-level CUFFT functionality 2 | 3 | import CUDAdrv: CuStream 4 | 5 | # Note: we don't implement padded storage dimensions 6 | function _mkplan(xtype, xdims, region) 7 | nrank = length(region) 8 | sz = [xdims[i] for i in region] 9 | csz = copy(sz) 10 | csz[1] = div(sz[1],2) + 1 11 | batch = prod(xdims) ÷ prod(sz) 12 | 13 | pp = Ref{cufftHandle_t}() 14 | if (nrank == 1) && (batch == 1) 15 | cufftPlan1d(pp, sz[1], xtype, 1) 16 | elseif (nrank == 2) && (batch == 1) 17 | cufftPlan2d(pp, sz[2], sz[1], xtype) 18 | elseif (nrank == 3) && (batch == 1) 19 | cufftPlan3d(pp, sz[3], sz[2], sz[1], xtype) 20 | else 21 | rsz = (length(sz) > 1) ? reverse(sz) : sz 22 | if ((region...,) == ((1:nrank)...,)) 23 | # handle simple case ... simply!
(for robustness) 24 | cufftPlanMany(pp, nrank, Cint[rsz...], C_NULL, 1, 1, C_NULL, 1, 1, 25 | xtype, batch) 26 | else 27 | if nrank==1 || all(diff(collect(region)) .== 1) 28 | # _stride: successive elements in innermost dimension 29 | # _dist: distance between first elements of batches 30 | if region[1] == 1 31 | istride = 1 32 | idist = prod(sz) 33 | cdist = prod(csz) 34 | else 35 | if region[end] != length(xdims) 36 | throw(ArgumentError("batching dims must be sequential")) 37 | end 38 | istride = prod(xdims[1:region[1]-1]) 39 | idist = 1 40 | cdist = 1 41 | end 42 | inembed = Cint[rsz...] 43 | cnembed = (length(csz) > 1) ? Cint[reverse(csz)...] : Cint[csz[1]] 44 | ostride = istride 45 | if xtype == CUFFT_R2C || xtype == CUFFT_D2Z 46 | odist = cdist 47 | onembed = cnembed 48 | else 49 | odist = idist 50 | onembed = inembed 51 | end 52 | if xtype == CUFFT_C2R || xtype == CUFFT_Z2D 53 | idist = cdist 54 | inembed = cnembed 55 | end 56 | else 57 | if any(diff(collect(region)) .< 1) 58 | throw(ArgumentError("region must be an increasing sequence")) 59 | end 60 | cdims = collect(xdims) 61 | cdims[region[1]] = div(cdims[region[1]],2)+1 62 | 63 | if region[1] == 1 64 | istride = 1 65 | ii=1 66 | while (ii < nrank) && (region[ii] == region[ii+1]-1) 67 | ii += 1 68 | end 69 | idist = prod(xdims[1:ii]) 70 | cdist = prod(cdims[1:ii]) 71 | ngaps = 0 72 | else 73 | istride = prod(xdims[1:region[1]-1]) 74 | idist = 1 75 | cdist = 1 76 | ngaps = 1 77 | end 78 | nem = ones(Int,nrank) 79 | cem = ones(Int,nrank) 80 | id = 1 81 | for ii=1:nrank-1 82 | if region[ii+1] > region[ii]+1 83 | ngaps += 1 84 | end 85 | while id < region[ii+1] 86 | nem[ii] *= xdims[id] 87 | cem[ii] *= cdims[id] 88 | id += 1 89 | end 90 | @assert nem[ii] >= sz[ii] 91 | end 92 | if region[end] < length(xdims) 93 | ngaps += 1 94 | end 95 | # CUFFT represents batches by a single stride (_dist) 96 | # so we must verify that region is consistent with this: 97 | if ngaps > 1 98 | throw(ArgumentError("batch regions must be sequential")) 99 | end 100 | 101 | inembed = Cint[reverse(nem)...] 102 | cnembed = Cint[reverse(cem)...] 
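# At this point inembed/istride/idist (and the c*-prefixed complex-side analogues)
# describe cuFFT's "advanced data layout": element (x_1, ..., x_r) of batch b is
# addressed as b*idist + ((x_1*nembed_2 + x_2)*nembed_3 + ... + x_r)*istride,
# per the cuFFT documentation; the nembed arrays are reversed above because
# CUFFT is row-major while Julia arrays are column-major.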
103 | ostride = istride 104 | if xtype == CUFFT_R2C || xtype == CUFFT_D2Z 105 | odist = cdist 106 | onembed = cnembed 107 | else 108 | odist = idist 109 | onembed = inembed 110 | end 111 | if xtype == CUFFT_C2R || xtype == CUFFT_Z2D 112 | idist = cdist 113 | inembed = cnembed 114 | end 115 | end 116 | cufftPlanMany(pp, nrank, Cint[rsz...], 117 | inembed, istride, idist, onembed, ostride, odist, 118 | xtype, batch) 119 | end 120 | end 121 | pp[] 122 | end 123 | 124 | # this is used implicitly in the unsafe_execute methods below: 125 | unsafe_convert(::Type{cufftHandle_t}, p::CuFFTPlan) = p.plan 126 | 127 | convert(::Type{cufftHandle_t}, p::CuFFTPlan) = p.plan 128 | 129 | destroy_plan(plan::CuFFTPlan) = cufftDestroy(plan) 130 | 131 | set_stream(plan::CuFFTPlan, stream::CuStream) = cufftSetStream(plan, stream) 132 | 133 | function assert_applicable(p::CuFFTPlan{T,K}, X::CuArray{T}) where {T,K} 134 | (size(X) == p.sz) || 135 | throw(ArgumentError("CuFFT plan applied to wrong-size input")) 136 | end 137 | 138 | function assert_applicable(p::CuFFTPlan{T,K}, X::CuArray{T}, Y::CuArray{Ty}) where {T,K,Ty} 139 | assert_applicable(p, X) 140 | (size(Y) == p.osz) || 141 | throw(ArgumentError("CuFFT plan applied to wrong-size output")) 142 | # type errors should be impossible by dispatch, but just in case: 143 | if p.xtype ∈ [CUFFT_C2R, CUFFT_Z2D] 144 | (Ty == real(T)) || 145 | throw(ArgumentError("Type mismatch for argument Y")) 146 | elseif p.xtype ∈ [CUFFT_R2C, CUFFT_D2Z] 147 | (Ty == complex(T)) || 148 | throw(ArgumentError("Type mismatch for argument Y")) 149 | else 150 | (Ty == T) || 151 | throw(ArgumentError("Type mismatch for argument Y")) 152 | end 153 | end 154 | 155 | function unsafe_execute!(plan::cCuFFTPlan{cufftComplex,K,true,N}, 156 | x::CuArray{cufftComplex,N}) where {K,N} 157 | @assert plan.xtype == CUFFT_C2C 158 | cufftExecC2C(plan, x, x, K) 159 | end 160 | function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,true,N}, 161 | x::CuArray{cufftComplex,N}) where {K,N} 162 | @assert plan.xtype == CUFFT_C2R 163 | cufftExecC2R(plan, x, x) 164 | end 165 | 166 | function unsafe_execute!(plan::cCuFFTPlan{cufftComplex,K,false,N}, 167 | x::CuArray{cufftComplex,N}, y::CuArray{cufftComplex} 168 | ) where {K,N} 169 | @assert plan.xtype == CUFFT_C2C 170 | cufftExecC2C(plan, x, y, K) 171 | end 172 | function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N}, 173 | x::CuArray{cufftComplex,N}, y::CuArray{cufftReal} 174 | ) where {K,N} 175 | @assert plan.xtype == CUFFT_C2R 176 | cufftExecC2R(plan, x, y) 177 | end 178 | 179 | function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,false,N}, 180 | x::CuArray{cufftReal,N}, y::CuArray{cufftComplex,N} 181 | ) where {K,N} 182 | @assert plan.xtype == CUFFT_R2C 183 | cufftExecR2C(plan, x, y) 184 | end 185 | 186 | # double prec. 
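# (these mirror the single-precision methods above, pairing the CUFFT_Z2Z/Z2D/D2Z
# plan types with cufftExecZ2Z/Z2D/D2Z)
#
# A minimal end-to-end sketch through the AbstractFFTs-style entry points that
# eventually land in these methods; variable names are illustrative:
#
#   x = CuArray(rand(ComplexF64, 64))   # double precision, so a Z2Z plan
#   p = plan_fft(x)                     # plan construction goes through _mkplan
#   y = p * x                           # forward transform via unsafe_execute!
#   x2 = inv(p) * y                     # round trip: x2 ≈ x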
187 | function unsafe_execute!(plan::cCuFFTPlan{cufftDoubleComplex,K,true,N}, 188 | x::CuArray{cufftDoubleComplex,N}) where {K,N} 189 | @assert plan.xtype == CUFFT_Z2Z 190 | cufftExecZ2Z(plan, x, x, K) 191 | end 192 | function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleComplex,K,true,N}, 193 | x::CuArray{cufftDoubleComplex,N}) where {K,N} 194 | @assert plan.xtype == CUFFT_Z2D 195 | cufftExecZ2D(plan, x, x) 196 | end 197 | 198 | function unsafe_execute!(plan::cCuFFTPlan{cufftDoubleComplex,K,false,N}, 199 | x::CuArray{cufftDoubleComplex,N}, y::CuArray{cufftDoubleComplex} 200 | ) where {K,N} 201 | @assert plan.xtype == CUFFT_Z2Z 202 | cufftExecZ2Z(plan, x, y, K) 203 | end 204 | function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleComplex,K,false,N}, 205 | x::CuArray{cufftDoubleComplex,N}, y::CuArray{cufftDoubleReal} 206 | ) where {K,N} 207 | @assert plan.xtype == CUFFT_Z2D 208 | cufftExecZ2D(plan, x, y) 209 | end 210 | 211 | function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleReal,K,false,N}, 212 | x::CuArray{cufftDoubleReal,N}, y::CuArray{cufftDoubleComplex,N} 213 | ) where {K,N} 214 | @assert plan.xtype == CUFFT_D2Z 215 | cufftExecD2Z(plan, x, y) 216 | end 217 | -------------------------------------------------------------------------------- /test/base.jl: -------------------------------------------------------------------------------- 1 | using ForwardDiff: Dual 2 | using LinearAlgebra 3 | using Adapt: adapt 4 | 5 | import CUDAdrv 6 | import CUDAdrv: CuPtr, CU_NULL 7 | 8 | @testset "GPUArrays test suite" begin 9 | GPUArrays.test(CuArray) 10 | end 11 | 12 | @testset "Memory" begin 13 | CuArrays.alloc(0) 14 | 15 | @test (CuArrays.@allocated CuArray{Int32}(undef,1)) == 4 16 | 17 | ret, out = @grab_output CuArrays.@time CuArray{Int32}(undef, 1) 18 | @test isa(ret, CuArray{Int32}) 19 | @test occursin("1 GPU allocation: 4 bytes", out) 20 | 21 | ret, out = @grab_output CuArrays.@time Base.unsafe_wrap(CuArray, CuPtr{Int32}(12345678), (2, 3)) 22 | @test isa(ret, CuArray{Int32}) 23 | @test !occursin("GPU allocation", out) 24 | end 25 | 26 | @testset "Array" begin 27 | xs = CuArray{Int}(undef, 2, 3) 28 | @test collect(CuArray([1 2; 3 4])) == [1 2; 3 4] 29 | @test collect(cu[1, 2, 3]) == [1, 2, 3] 30 | @test collect(cu([1, 2, 3])) == [1, 2, 3] 31 | @test testf(vec, rand(5,3)) 32 | @test cu(1:3) === 1:3 33 | @test Base.elsize(xs) == sizeof(Int) 34 | @test CuArray{Int, 2}(xs) === xs 35 | 36 | @test_throws ArgumentError Base.cconvert(Ptr, xs) 37 | 38 | # Check that allowscalar works 39 | @test_throws ErrorException xs[1] 40 | @test_throws ErrorException xs[1] = 1 41 | 42 | # unsafe_wrap 43 | buf = CUDAdrv.Mem.DeviceBuffer(CU_NULL, 2, CUDAdrv.CuCurrentContext()) 44 | @test Base.unsafe_wrap(CuArray, CU_NULL, 1; own=false).own == false 45 | @test Base.unsafe_wrap(CuArray, CU_NULL, 1; ctx=CUDAdrv.CuCurrentContext()).buf.ctx == CUDAdrv.CuCurrentContext() 46 | @test Base.unsafe_wrap(CuArray, CU_NULL, 2) == CuArray{Nothing,1}(buf, (2,)) 47 | @test Base.unsafe_wrap(CuArray{Nothing}, CU_NULL, 2) == CuArray{Nothing,1}(buf, (2,)) 48 | @test Base.unsafe_wrap(CuArray{Nothing,1}, CU_NULL, 2) == CuArray{Nothing,1}(buf, (2,)) 49 | @test Base.unsafe_wrap(CuArray, CU_NULL, (1,2)) == CuArray{Nothing,2}(buf, (1,2)) 50 | @test Base.unsafe_wrap(CuArray{Nothing}, CU_NULL, (1,2)) == CuArray{Nothing,2}(buf, (1,2)) 51 | @test Base.unsafe_wrap(CuArray{Nothing,2}, CU_NULL, (1,2)) == CuArray{Nothing,2}(buf, (1,2)) 52 | 53 | @test collect(CuArrays.zeros(2, 2)) == zeros(Float32, 2, 2) 54 | @test collect(CuArrays.ones(2, 2)) == 
ones(Float32, 2, 2) 55 | 56 | @test collect(CuArrays.fill(0, 2, 2)) == zeros(Float32, 2, 2) 57 | @test collect(CuArrays.fill(1, 2, 2)) == ones(Float32, 2, 2) 58 | end 59 | 60 | @testset "Adapt" begin 61 | A = rand(Float32, 3, 3) 62 | dA = CuArray(A) 63 | @test adapt(Array, dA) ≈ A 64 | @test adapt(CuArray, A) ≈ dA 65 | end 66 | 67 | @testset "Broadcast" begin 68 | @test testf((x) -> fill!(x, 1), rand(3,3)) 69 | @test testf((x, y) -> map(+, x, y), rand(2, 3), rand(2, 3)) 70 | @test testf((x) -> sin.(x), rand(2, 3)) 71 | @test testf((x) -> log.(x) .+ 1, rand(2, 3)) 72 | @test testf((x) -> 2x, rand(2, 3)) 73 | @test testf((x, y) -> x .+ y, rand(2, 3), rand(1, 3)) 74 | @test testf((z, x, y) -> z .= x .+ y, rand(2, 3), rand(2, 3), rand(2)) 75 | @test (CuArray{Ptr{Cvoid}}(undef, 1) .= C_NULL) == CuArray([C_NULL]) 76 | @test CuArray([1,2,3]) .+ CuArray([1.0,2.0,3.0]) == CuArray([2,4,6]) 77 | 78 | @eval struct Whatever{T} 79 | x::Int 80 | end 81 | @test Array(Whatever{Int}.(CuArray([1]))) == Whatever{Int}.([1]) 82 | end 83 | 84 | @testset "Cufunc" begin 85 | gelu(x) = oftype(x, 0.5) * x * (1 + tanh(oftype(x, √(2/π))*(x + oftype(x, 0.044715) * x^3))) 86 | sig(x) = one(x) / (one(x) + exp(-x)) 87 | f(x) = gelu(log(x)) * sig(x) * tanh(x) 88 | 89 | CuArrays.@cufunc gelu(x) = oftype(x, 0.5) * x * (1 + tanh(oftype(x, √(2/π))*(x + oftype(x, 0.044715) * x^3))) 90 | CuArrays.@cufunc sig(x) = one(x) / (one(x) + exp(-x)) 91 | CuArrays.@cufunc f(x) = gelu(log(x)) * sig(x) * tanh(x) 92 | 93 | @test :gelu ∈ CuArrays.cufuncs() 94 | @test :sig ∈ CuArrays.cufuncs() 95 | @test :f ∈ CuArrays.cufuncs() 96 | @test testf((x) -> gelu.(x), rand(3,3)) 97 | @test testf((x) -> sig.(x), rand(3,3)) 98 | @test testf((x) -> f.(x), rand(3,3)) 99 | end 100 | 101 | # https://github.com/JuliaGPU/CUDAnative.jl/issues/223 102 | @testset "Ref Broadcast" begin 103 | foobar(idx, A) = A[idx] 104 | @test CuArray([42]) == foobar.(CuArray([1]), Base.RefValue(CuArray([42]))) 105 | end 106 | 107 | @testset "Broadcast Fix" begin 108 | @test testf(x -> log.(x), rand(3,3)) 109 | @test testf((x,xs) -> log.(x.+xs), Ref(1), rand(3,3)) 110 | 111 | if isdefined(CuArrays, :CUDNN) 112 | using NNlib 113 | 114 | @test testf(x -> logσ.(x), rand(5)) 115 | 116 | f(x) = logσ.(x) 117 | ds = Dual.(rand(5),1) 118 | @test f(ds) ≈ collect(f(CuArray(ds))) 119 | end 120 | end 121 | 122 | @testset "Reduce" begin 123 | @test testf(x -> sum(x, dims=1), rand(2, 3)) 124 | @test testf(x -> sum(x, dims=2), rand(2, 3)) 125 | @test testf(x -> sum(x -> x^2, x, dims=1), rand(2, 3)) 126 | @test testf(x -> prod(x, dims=2), rand(2, 3)) 127 | 128 | @test testf(x -> sum(x), rand(2, 3)) 129 | @test testf(x -> prod(x), rand(2, 3)) 130 | end 131 | 132 | @testset "0D" begin 133 | x = CuArray{Float64}(undef) 134 | x .= 1 135 | @test collect(x)[] == 1 136 | x /= 2 137 | @test collect(x)[] == 0.5 138 | end 139 | 140 | @testset "Slices" begin 141 | @test testf(rand(5)) do x 142 | y = x[2:4] 143 | y .= 1 144 | x 145 | end 146 | @test testf(rand(5)) do x 147 | y = view(x, 2:4) 148 | y .= 1 149 | x 150 | end 151 | @test testf(x->view(x, :, 1:4, 3), rand(Float32, 5, 4, 3)) 152 | @allowscalar let x = cu(rand(Float32, 5, 4, 3)) 153 | @test_throws BoundsError view(x, :, :, 1:10) 154 | 155 | # Contiguous views should return new CuArray 156 | @test typeof(view(x, :, 1, 2)) == CuVector{Float32} 157 | @test typeof(view(x, 1:4, 1, 2)) == CuVector{Float32} 158 | @test typeof(view(x, :, 1:4, 3)) == CuMatrix{Float32} 159 | @test typeof(view(x, :, :, 1)) == CuMatrix{Float32} 160 | @test typeof(view(x, :, 
:, :)) == CuArray{Float32,3} 161 | @test typeof(view(x, :)) == CuVector{Float32} 162 | @test typeof(view(x, 1:3)) == CuVector{Float32} 163 | 164 | # Non-contiguous views should fall back to base's SubArray 165 | @test typeof(view(x, 1:3, 1:3, 3)) <: SubArray 166 | @test typeof(view(x, 1, :, 3)) <: SubArray 167 | @test typeof(view(x, 1, 1:4, 3)) <: SubArray 168 | @test typeof(view(x, :, 1, 1:3)) <: SubArray 169 | @test typeof(view(x, :, 1:2:4, 1)) <: SubArray 170 | @test typeof(view(x, 1:2:5, 1, 1)) <: SubArray 171 | end 172 | end 173 | 174 | @testset "Reshape" begin 175 | A = [1 2 3 4 176 | 5 6 7 8] 177 | gA = reshape(CuArray(A),1,8) 178 | _A = reshape(A,1,8) 179 | _gA = Array(gA) 180 | @test all(_A .== _gA) 181 | A = [1,2,3,4] 182 | gA = reshape(CuArray(A),4) 183 | end 184 | 185 | @testset "$f! with diagonal $d" for (f, f!) in ((triu, triu!), (tril, tril!)), 186 | d in -2:2 187 | A = randn(10, 10) 188 | @test f(A, d) == Array(f!(CuArray(A), d)) 189 | end 190 | 191 | @testset "Utilities" begin 192 | t = @elapsed ret = CuArrays.@sync begin 193 | # TODO: do something that takes a while on the GPU 194 | # (need to wrap clock64 in CUDAnative for that) 195 | 42 196 | end 197 | @test t >= 0 198 | @test ret == 42 199 | end 200 | 201 | @testset "accumulate" begin 202 | @test accumulate(+, CuArray{Int}(undef, 2)) isa CuVector 203 | @test cumsum(CuArray{Int}(undef, 2)) isa CuVector 204 | @test cumprod(CuArray{Int}(undef, 2)) isa CuVector 205 | 206 | @test testf(x->accumulate(+, x), rand(2)) 207 | @test testf(x->accumulate(+, x; dims=2), rand(2)) 208 | @test testf(x->(accumulate!(+, x, copy(x)); x), rand(2)) 209 | @test testf(cumsum, rand(2)) 210 | @test testf(cumprod, rand(2)) 211 | end 212 | 213 | @testset "logical indexing" begin 214 | @test CuArray{Int}(undef, 2)[CuArray{Bool}(undef, 2)] isa CuArray 215 | @test CuArray{Int}(undef, 2, 2)[CuArray{Bool}(undef, 2, 2)] isa CuArray 216 | @test CuArray{Int}(undef, 2, 2, 2)[CuArray{Bool}(undef, 2, 2, 2)] isa CuArray 217 | 218 | @test CuArray{Int}(undef, 2)[Array{Bool}(undef, 2)] isa CuArray 219 | @test CuArray{Int}(undef, 2, 2)[Array{Bool}(undef, 2, 2)] isa CuArray 220 | @test CuArray{Int}(undef, 2, 2, 2)[Array{Bool}(undef, 2, 2, 2)] isa CuArray 221 | 222 | @test testf((x,y)->x[y], rand(2), rand(Bool, 2)) 223 | @test testf((x,y)->x[y], rand(2, 2), rand(Bool, 2, 2)) 224 | @test testf((x,y)->x[y], rand(2, 2, 2), rand(Bool, 2, 2, 2)) 225 | 226 | @test testf(x -> x[x .> 0.5], rand(2)) 227 | @test testf(x -> x[x .> 0.5], rand(2,2)) 228 | @test testf(x -> x[x .> 0.5], rand(2,2,2)) 229 | 230 | @test testf(x -> filter(y->y .> 0.5, x), rand(2)) 231 | @test testf(x -> filter(y->y .> 0.5, x), rand(2,2)) 232 | @test testf(x -> filter(y->y .> 0.5, x), rand(2,2,2)) 233 | end 234 | 235 | @testset "generic fallbacks" begin 236 | a = rand(Int8, 3, 3) 237 | b = rand(Int8, 3, 3) 238 | d_a = CuArray{Int8}(a) 239 | d_b = CuArray{Int8}(b) 240 | d_c = d_a*d_b 241 | @test collect(d_c) == a*b 242 | a = rand(Complex{Int8}, 3, 3) 243 | b = rand(Complex{Int8}, 3, 3) 244 | d_a = CuArray{Complex{Int8}}(a) 245 | d_b = CuArray{Complex{Int8}}(b) 246 | d_c = d_a'*d_b 247 | @test collect(d_c) == a'*b 248 | d_c = d_a*d_b' 249 | @test collect(d_c) == a*b' 250 | d_c = d_a'*d_b' 251 | @test collect(d_c) == a'*b' 252 | d_c = transpose(d_a)*d_b' 253 | @test collect(d_c) == transpose(a)*b' 254 | d_c = d_a'*transpose(d_b) 255 | @test collect(d_c) == a'*transpose(b) 256 | d_c = transpose(d_a)*d_b 257 | @test collect(d_c) == transpose(a)*b 258 | d_c = d_a*transpose(d_b) 259 | @test collect(d_c) == 
a*transpose(b) 260 | d_c = transpose(d_a)*transpose(d_b) 261 | @test collect(d_c) == transpose(a)*transpose(b) 262 | d_c = rmul!(copy(d_a), Complex{Int8}(2, 2)) 263 | @test collect(d_c) == a*Complex{Int8}(2, 2) 264 | d_c = lmul!(Complex{Int8}(2, 2), copy(d_a)) 265 | @test collect(d_c) == Complex{Int8}(2, 2)*a 266 | end 267 | 268 | @testset "reverse" begin 269 | @test testf(x->reverse(x), rand(1000)) 270 | @test testf(x->reverse(x, 10), rand(1000)) 271 | @test testf(x->reverse(x, 10, 90), rand(1000)) 272 | 273 | @test testf(x->reverse!(x), rand(1000)) 274 | @test testf(x->reverse!(x, 10), rand(1000)) 275 | @test testf(x->reverse!(x, 10, 90), rand(1000)) 276 | end 277 | -------------------------------------------------------------------------------- /src/blas/highlevel.jl: -------------------------------------------------------------------------------- 1 | # LinearAlgebra-style wrappers of the CUBLAS functionality 2 | 3 | 4 | cublas_size(t::Char, M::CuVecOrMat) = (size(M, t=='N' ? 1 : 2), size(M, t=='N' ? 2 : 1)) 5 | 6 | CublasArray{T<:CublasFloat} = CuArray{T} 7 | 8 | 9 | # 10 | # BLAS 1 11 | # 12 | 13 | LinearAlgebra.rmul!(x::CuArray{<:CublasFloat}, k::Number) = 14 | scal!(length(x), convert(eltype(x), k), x, 1) 15 | 16 | # Work around ambiguity with GPUArrays wrapper 17 | LinearAlgebra.rmul!(x::CuArray{<:CublasFloat}, k::Real) = 18 | invoke(rmul!, Tuple{typeof(x), Number}, x, k) 19 | 20 | function LinearAlgebra.BLAS.dot(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{Float32,Float64} 21 | n = length(DX) 22 | n==length(DY) || throw(DimensionMismatch("dot product arguments have lengths $(length(DX)) and $(length(DY))")) 23 | dot(n, DX, 1, DY, 1) 24 | end 25 | 26 | function LinearAlgebra.BLAS.dotc(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64} 27 | n = length(DX) 28 | n==length(DY) || throw(DimensionMismatch("dot product arguments have lengths $(length(DX)) and $(length(DY))")) 29 | dotc(n, DX, 1, DY, 1) 30 | end 31 | 32 | function LinearAlgebra.BLAS.dot(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64} 33 | dotc(DX, DY) 34 | end 35 | 36 | function LinearAlgebra.BLAS.dotu(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64} 37 | n = length(DX) 38 | n==length(DY) || throw(DimensionMismatch("dot product arguments have lengths $(length(DX)) and $(length(DY))")) 39 | dotu(n, DX, 1, DY, 1) 40 | end 41 | 42 | LinearAlgebra.norm(x::CublasArray) = nrm2(x) 43 | LinearAlgebra.BLAS.asum(x::CublasArray) = asum(length(x), x, 1) 44 | 45 | function LinearAlgebra.axpy!(alpha::Number, x::CuArray{T}, y::CuArray{T}) where T<:CublasFloat 46 | length(x)==length(y) || throw(DimensionMismatch("")) 47 | axpy!(length(x), convert(T,alpha), x, 1, y, 1) 48 | end 49 | 50 | Base.argmin(xs::CublasArray{<:CublasReal}) = iamin(xs) 51 | Base.argmax(xs::CublasArray{<:CublasReal}) = iamax(xs) 52 | 53 | 54 | 55 | # 56 | # BLAS 2 57 | # 58 | 59 | # GEMV 60 | 61 | function gemv_wrapper!(y::CuVector{T}, tA::Char, A::CuMatrix{T}, x::CuVector{T}, 62 | alpha = one(T), beta = zero(T)) where T<:CublasFloat 63 | mA, nA = cublas_size(tA, A) 64 | if nA != length(x) 65 | throw(DimensionMismatch("second dimension of A, $nA, does not match length of x, $(length(x))")) 66 | end 67 | if mA != length(y) 68 | throw(DimensionMismatch("first dimension of A, $mA, does not match length of y, $(length(y))")) 69 | end 70 | if mA == 0 71 | return y 72 | end 73 | if nA == 0 74 | return rmul!(y, 0) 75 | end 76 | gemv!(tA, alpha, A, x, beta, y) 77 | end 78 | 79 | LinearAlgebra.mul!(Y::CuVector{T}, 
A::CuMatrix{T}, B::CuVector{T}) where T<:CublasFloat = gemv_wrapper!(Y, 'N', A, B)
80 | LinearAlgebra.mul!(Y::CuVector{T}, A::LinearAlgebra.Transpose{<:Any, CuMatrix{T}}, B::CuVector{T}) where T<:CublasFloat = gemv_wrapper!(Y, 'T', A.parent, B)
81 | LinearAlgebra.mul!(Y::CuVector{T}, A::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, B::CuVector{T}) where T<:CublasFloat = gemv_wrapper!(Y, 'T', A.parent, B)
82 | LinearAlgebra.mul!(Y::CuVector{T}, A::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, B::CuVector{T}) where T<:CublasComplex = gemv_wrapper!(Y, 'C', A.parent, B)
83 | 
84 | 
85 | 
86 | #
87 | # BLAS 3
88 | #
89 | 
90 | # GEMM
91 | 
92 | function gemm_wrapper!(C::CuVecOrMat{T}, tA::Char, tB::Char,
93 |                        A::CuVecOrMat{T},
94 |                        B::CuVecOrMat{T},
95 |                        alpha = one(T),
96 |                        beta = zero(T)) where T <: CublasFloat
97 |     mA, nA = cublas_size(tA, A)
98 |     mB, nB = cublas_size(tB, B)
99 | 
100 |     if nA != mB
101 |         throw(DimensionMismatch("A has dimensions ($mA,$nA) but B has dimensions ($mB,$nB)"))
102 |     end
103 | 
104 |     if C === A || B === C
105 |         throw(ArgumentError("output matrix must not be aliased with input matrix"))
106 |     end
107 | 
108 |     if mA == 0 || nA == 0 || nB == 0
109 |         if size(C) != (mA, nB)
110 |             throw(DimensionMismatch("C has dimensions $(size(C)), should have ($mA,$nB)"))
111 |         end
112 |         return LinearAlgebra.rmul!(C, 0)
113 |     end
114 | 
115 |     gemm!(tA, tB, alpha, A, B, beta, C)
116 | end
117 | 
118 | # Mutating
119 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuVecOrMat{T}, B::CuVecOrMat{T}) where T<:CublasFloat = gemm_wrapper!(C, 'N', 'N', A, B)
120 | LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat =
121 |     gemm_wrapper!(C, 'T', 'N', parent(trA), B)
122 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuMatrix{T}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T<:CublasFloat =
123 |     gemm_wrapper!(C, 'N', 'T', A, parent(trB))
124 | LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T<:CublasFloat =
125 |     gemm_wrapper!(C, 'T', 'T', parent(trA), parent(trB))
126 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasReal =
127 |     gemm_wrapper!(C, 'T', 'N', parent(adjA), B)
128 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat =
129 |     gemm_wrapper!(C, 'C', 'N', parent(adjA), B)
130 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuMatrix{T}, adjB::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}) where T<:CublasReal =
131 |     gemm_wrapper!(C, 'N', 'T', A, parent(adjB))
132 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuMatrix{T}, adjB::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}) where T<:CublasFloat =
133 |     gemm_wrapper!(C, 'N', 'C', A, parent(adjB))
134 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}) where T<:CublasReal =
135 |     gemm_wrapper!(C, 'T', 'T', parent(adjA), parent(adjB))
136 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}) where T<:CublasFloat =
137 |     gemm_wrapper!(C, 'C', 'C', parent(adjA), parent(adjB))
138 | LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{T, <:CuMatrix{T}}) where T<:CublasReal =
139 |     gemm_wrapper!(C, 'T', 'T', parent(trA), parent(adjB))
140 | 
LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}) where T<:CublasFloat = 141 | gemm_wrapper!(C, 'T', 'C', parent(trA), parent(adjB)) 142 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{T, <:CuMatrix{T}}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T<:CublasReal = 143 | gemm_wrapper!(C, 'T', 'T', parent(adjA), parent(trB)) 144 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T <: CublasFloat = 145 | gemm_wrapper!(C, 'C', 'T', parent(adjA), parent(trB)) 146 | 147 | 148 | # TRSM 149 | 150 | # ldiv! 151 | ## No transpose/adjoint 152 | LinearAlgebra.ldiv!(A::UpperTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 153 | CUBLAS.trsm!('L', 'U', 'N', 'N', one(T), parent(A), B) 154 | LinearAlgebra.ldiv!(A::UnitUpperTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 155 | CUBLAS.trsm!('L', 'U', 'N', 'U', one(T), parent(A), B) 156 | LinearAlgebra.ldiv!(A::LowerTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 157 | CUBLAS.trsm!('L', 'L', 'N', 'N', one(T), parent(A), B) 158 | LinearAlgebra.ldiv!(A::UnitLowerTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 159 | CUBLAS.trsm!('L', 'L', 'N', 'U', one(T), parent(A), B) 160 | ## Adjoint 161 | LinearAlgebra.ldiv!(A::Adjoint{T,UpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 162 | CUBLAS.trsm!('L', 'U', 'C', 'N', one(T), parent(parent(A)), B) 163 | LinearAlgebra.ldiv!(A::Adjoint{T,UnitUpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 164 | CUBLAS.trsm!('L', 'U', 'C', 'U', one(T), parent(parent(A)), B) 165 | LinearAlgebra.ldiv!(A::Adjoint{T,LowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 166 | CUBLAS.trsm!('L', 'L', 'C', 'N', one(T), parent(parent(A)), B) 167 | LinearAlgebra.ldiv!(A::Adjoint{T,UnitLowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 168 | CUBLAS.trsm!('L', 'L', 'C', 'U', one(T), parent(parent(A)), B) 169 | ## Transpose 170 | LinearAlgebra.ldiv!(A::Transpose{T,UpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 171 | CUBLAS.trsm!('L', 'U', 'T', 'N', one(T), parent(parent(A)), B) 172 | LinearAlgebra.ldiv!(A::Transpose{T,UnitUpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 173 | CUBLAS.trsm!('L', 'U', 'T', 'U', one(T), parent(parent(A)), B) 174 | LinearAlgebra.ldiv!(A::Transpose{T,LowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 175 | CUBLAS.trsm!('L', 'L', 'T', 'N', one(T), parent(parent(A)), B) 176 | LinearAlgebra.ldiv!(A::Transpose{T,UnitLowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 177 | CUBLAS.trsm!('L', 'L', 'T', 'U', one(T), parent(parent(A)), B) 178 | 179 | # rdiv! 
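# (rdiv! mirrors ldiv! with side 'R': the triangular factor divides from the
# right, and A is the operand that trsm! overwrites)
#
# A minimal usage sketch, assuming LinearAlgebra is loaded; names are illustrative:
#
#   A = CuArray(rand(Float32, 4, 4)); B = CuArray(rand(Float32, 4, 4))
#   rdiv!(A, UpperTriangular(B))   # A ← A / UpperTriangular(B), via CUBLAS.trsm!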
180 | ## No transpose/adjoint 181 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::UpperTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 182 | CUBLAS.trsm!('R', 'U', 'N', 'N', one(T), parent(B), A) 183 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::UnitUpperTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 184 | CUBLAS.trsm!('R', 'U', 'N', 'U', one(T), parent(B), A) 185 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::LowerTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 186 | CUBLAS.trsm!('R', 'L', 'N', 'N', one(T), parent(B), A) 187 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::UnitLowerTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 188 | CUBLAS.trsm!('R', 'L', 'N', 'U', one(T), parent(B), A) 189 | ## Adjoint 190 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,UpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 191 | CUBLAS.trsm!('R', 'U', 'C', 'N', one(T), parent(parent(B)), A) 192 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,UnitUpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 193 | CUBLAS.trsm!('R', 'U', 'C', 'U', one(T), parent(parent(B)), A) 194 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,LowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 195 | CUBLAS.trsm!('R', 'L', 'C', 'N', one(T), parent(parent(B)), A) 196 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,UnitLowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 197 | CUBLAS.trsm!('R', 'L', 'C', 'U', one(T), parent(parent(B)), A) 198 | ## Transpose 199 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,UpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 200 | CUBLAS.trsm!('R', 'U', 'T', 'N', one(T), parent(parent(B)), A) 201 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,UnitUpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 202 | CUBLAS.trsm!('R', 'U', 'T', 'U', one(T), parent(parent(B)), A) 203 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,LowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 204 | CUBLAS.trsm!('R', 'L', 'T', 'N', one(T), parent(parent(B)), A) 205 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,UnitLowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 206 | CUBLAS.trsm!('R', 'L', 'T', 'U', one(T), parent(parent(B)), A) 207 | -------------------------------------------------------------------------------- /src/sparse/array.jl: -------------------------------------------------------------------------------- 1 | # custom extension of CuArray in CUDArt for sparse vectors/matrices 2 | # using CSC format for interop with Julia's native sparse functionality 3 | 4 | import Base: length, size, ndims, eltype, similar, pointer, stride, 5 | copy, convert, reinterpret, show, summary, copyto!, get!, fill!, collect 6 | import LinearAlgebra: BlasFloat, Hermitian, HermOrSym, issymmetric, Transpose, Adjoint, 7 | ishermitian, istriu, istril, Symmetric, UpperTriangular, LowerTriangular 8 | import SparseArrays: sparse, SparseMatrixCSC 9 | 10 | abstract type AbstractCuSparseArray{Tv, N} <: AbstractSparseArray{Tv, Cint, N} end 11 | const AbstractCuSparseVector{Tv} = AbstractCuSparseArray{Tv,1} 12 | const AbstractCuSparseMatrix{Tv} = AbstractCuSparseArray{Tv,2} 13 | 14 | mutable struct CuSparseVector{Tv} <: AbstractCuSparseVector{Tv} 15 | iPtr::CuVector{Cint} 16 | nzVal::CuVector{Tv} 17 | dims::NTuple{2,Int} 18 | nnz::Cint 19 | 20 | function CuSparseVector{Tv}(iPtr::CuVector{Cint}, nzVal::CuVector{Tv}, dims::Int, nnz::Cint) where Tv 21 | new(iPtr,nzVal,(dims,1),nnz) 22 | end 23 | end 24 | 25 | function CuArrays.unsafe_free!(xs::CuSparseVector) 26 | unsafe_free!(xs.iPtr) 27 | 
unsafe_free!(xs.nzVal)
28 |     return
29 | end
30 | 
31 | mutable struct CuSparseMatrixCSC{Tv} <: AbstractCuSparseMatrix{Tv}
32 |     colPtr::CuVector{Cint}
33 |     rowVal::CuVector{Cint}
34 |     nzVal::CuVector{Tv}
35 |     dims::NTuple{2,Int}
36 |     nnz::Cint
37 | 
38 |     function CuSparseMatrixCSC{Tv}(colPtr::CuVector{Cint}, rowVal::CuVector{Cint}, nzVal::CuVector{Tv}, dims::NTuple{2,Int}, nnz::Cint) where Tv
39 |         new(colPtr,rowVal,nzVal,dims,nnz)
40 |     end
41 | end
42 | 
43 | function CuArrays.unsafe_free!(xs::CuSparseMatrixCSC)
44 |     unsafe_free!(xs.colPtr)
45 |     unsafe_free!(xs.rowVal)
46 |     unsafe_free!(xs.nzVal)
47 |     return
48 | end
49 | 
50 | """
51 | Container to hold sparse matrices in compressed sparse row (CSR) format on the
52 | GPU.
53 | 
54 | **Note**: Most CUSPARSE operations work with CSR formatted matrices, rather
55 | than CSC.
56 | """
57 | mutable struct CuSparseMatrixCSR{Tv} <: AbstractCuSparseMatrix{Tv}
58 |     rowPtr::CuVector{Cint}
59 |     colVal::CuVector{Cint}
60 |     nzVal::CuVector{Tv}
61 |     dims::NTuple{2,Int}
62 |     nnz::Cint
63 | 
64 |     function CuSparseMatrixCSR{Tv}(rowPtr::CuVector{Cint}, colVal::CuVector{Cint}, nzVal::CuVector{Tv}, dims::NTuple{2,Int}, nnz::Cint) where Tv
65 |         new(rowPtr,colVal,nzVal,dims,nnz)
66 |     end
67 | end
68 | 
69 | function CuArrays.unsafe_free!(xs::CuSparseMatrixCSR)
70 |     unsafe_free!(xs.rowPtr)
71 |     unsafe_free!(xs.colVal)
72 |     unsafe_free!(xs.nzVal)
73 |     return
74 | end
75 | 
76 | """
77 | Container to hold sparse matrices in block compressed sparse row (BSR) format on
78 | the GPU. BSR format is also used in Intel MKL, and is suited to matrices that are
79 | "block" sparse: mostly sparse overall, with the nonzeros gathered into a few dense blocks.
80 | """
81 | mutable struct CuSparseMatrixBSR{Tv} <: AbstractCuSparseMatrix{Tv}
82 |     rowPtr::CuVector{Cint}
83 |     colVal::CuVector{Cint}
84 |     nzVal::CuVector{Tv}
85 |     dims::NTuple{2,Int}
86 |     blockDim::Cint
87 |     dir::SparseChar
88 |     nnz::Cint
89 | 
90 |     function CuSparseMatrixBSR{Tv}(rowPtr::CuVector{Cint}, colVal::CuVector{Cint}, nzVal::CuVector{Tv}, dims::NTuple{2,Int}, blockDim::Cint, dir::SparseChar, nnz::Cint) where Tv
91 |         new(rowPtr,colVal,nzVal,dims,blockDim,dir,nnz)
92 |     end
93 | end
94 | 
95 | function CuArrays.unsafe_free!(xs::CuSparseMatrixBSR)
96 |     unsafe_free!(xs.rowPtr)
97 |     unsafe_free!(xs.colVal)
98 |     unsafe_free!(xs.nzVal)
99 |     return
100 | end
101 | 
102 | """
103 | Container to hold sparse matrices in NVIDIA's hybrid (HYB) format on the GPU.
104 | HYB format is an opaque struct, which can be converted to/from using
105 | CUSPARSE routines.
106 | """
107 | mutable struct CuSparseMatrixHYB{Tv} <: AbstractCuSparseMatrix{Tv}
108 |     Mat::cusparseHybMat_t
109 |     dims::NTuple{2,Int}
110 |     nnz::Cint
111 | 
112 |     function CuSparseMatrixHYB{Tv}(Mat::cusparseHybMat_t, dims::NTuple{2,Int}, nnz::Cint) where Tv
113 |         new(Mat,dims,nnz)
114 |     end
115 | end
116 | 
117 | """
118 | Utility union type of [`CuSparseMatrixCSC`](@ref), [`CuSparseMatrixCSR`](@ref),
119 | and `Hermitian` and `Symmetric` versions of these two containers. A function accepting
120 | this type can make use of performance improvements by only indexing one triangle of the
121 | matrix if it is guaranteed to be hermitian/symmetric.
122 | """
123 | const CompressedSparse{T} = Union{CuSparseMatrixCSC{T},CuSparseMatrixCSR{T},HermOrSym{T,CuSparseMatrixCSC{T}},HermOrSym{T,CuSparseMatrixCSR{T}}}
124 | 
125 | """
126 | Utility union type of [`CuSparseMatrixCSC`](@ref), [`CuSparseMatrixCSR`](@ref),
127 | [`CuSparseMatrixBSR`](@ref), and [`CuSparseMatrixHYB`](@ref).
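
A minimal usage sketch (assuming `SparseArrays` is loaded; the names are
illustrative):

    using SparseArrays
    A = sprand(Float64, 10, 10, 0.1)    # host CSC matrix
    d_csc = CuSparseMatrixCSC(A)        # upload in CSC form
    d_csr = CuSparseMatrixCSR(A)        # or in CSR form (converted via switch2csr)
    d_csc isa CuSparseMatrix{Float64}   # true for any member of this union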
128 | """ 129 | const CuSparseMatrix{T} = Union{CuSparseMatrixCSC{T},CuSparseMatrixCSR{T}, CuSparseMatrixBSR{T}, CuSparseMatrixHYB{T}} 130 | 131 | Hermitian{T}(Mat::CuSparseMatrix{T}) where T = Hermitian{T,typeof(Mat)}(Mat,'U') 132 | 133 | length(g::CuSparseVector) = prod(g.dims) 134 | size(g::CuSparseVector) = g.dims 135 | ndims(g::CuSparseVector) = 1 136 | length(g::CuSparseMatrix) = prod(g.dims) 137 | size(g::CuSparseMatrix) = g.dims 138 | ndims(g::CuSparseMatrix) = 2 139 | 140 | function size(g::CuSparseVector, d::Integer) 141 | if d == 1 142 | return g.dims[d] 143 | elseif d > 1 144 | return 1 145 | else 146 | throw(ArgumentError("dimension must be ≥ 1, got $d")) 147 | end 148 | end 149 | 150 | function size(g::CuSparseMatrix, d::Integer) 151 | if d in [1, 2] 152 | return g.dims[d] 153 | elseif d > 1 154 | return 1 155 | else 156 | throw(ArgumentError("dimension must be ≥ 1, got $d")) 157 | end 158 | end 159 | 160 | issymmetric(M::Union{CuSparseMatrixCSC,CuSparseMatrixCSR})= false 161 | ishermitian(M::Union{CuSparseMatrixCSC,CuSparseMatrixCSR}) where T = false 162 | issymmetric(M::Symmetric{CuSparseMatrixCSC})= true 163 | ishermitian(M::Hermitian{CuSparseMatrixCSC}) = true 164 | 165 | istriu(M::UpperTriangular{T,S}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix} = true 166 | istril(M::UpperTriangular{T,S}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix} = false 167 | istriu(M::LowerTriangular{T,S}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix} = false 168 | istril(M::LowerTriangular{T,S}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix} = true 169 | eltype(g::CuSparseMatrix{T}) where T = T 170 | 171 | function collect(Vec::CuSparseVector) 172 | SparseVector(Vec.dims[1], collect(Vec.iPtr), collect(Vec.nzVal)) 173 | end 174 | 175 | function collect(Mat::CuSparseMatrixCSC) 176 | SparseMatrixCSC(Mat.dims[1], Mat.dims[2], collect(Mat.colPtr), collect(Mat.rowVal), collect(Mat.nzVal)) 177 | end 178 | function collect(Mat::CuSparseMatrixCSR) 179 | rowPtr = collect(Mat.rowPtr) 180 | colVal = collect(Mat.colVal) 181 | nzVal = collect(Mat.nzVal) 182 | #construct Is 183 | I = similar(colVal) 184 | counter = 1 185 | for row = 1 : size(Mat)[1], k = rowPtr[row] : (rowPtr[row+1]-1) 186 | I[counter] = row 187 | counter += 1 188 | end 189 | return sparse(I,colVal,nzVal,Mat.dims[1],Mat.dims[2]) 190 | end 191 | 192 | summary(g::CuSparseMatrix) = string(g) 193 | summary(g::CuSparseVector) = string(g) 194 | 195 | CuSparseVector(iPtr::Vector{Ti}, nzVal::Vector{T}, dims::Int) where {T<:BlasFloat, Ti<:Integer} = CuSparseVector{T}(CuArray(convert(Vector{Cint},iPtr)), CuArray(nzVal), dims, convert(Cint,length(nzVal))) 196 | CuSparseVector(iPtr::CuArray{Ti}, nzVal::CuArray{T}, dims::Int) where {T<:BlasFloat, Ti<:Integer} = CuSparseVector{T}(iPtr, nzVal, dims, convert(Cint,length(nzVal))) 197 | 198 | CuSparseMatrixCSC(colPtr::Vector{Ti}, rowVal::Vector{Ti}, nzVal::Vector{T}, dims::NTuple{2,Int}) where {T<:BlasFloat,Ti<:Integer} = CuSparseMatrixCSC{T}(CuArray(convert(Vector{Cint},colPtr)), CuArray(convert(Vector{Cint},rowVal)), CuArray(nzVal), dims, convert(Cint,length(nzVal))) 199 | CuSparseMatrixCSC(colPtr::CuArray{Ti}, rowVal::CuArray{Ti}, nzVal::CuArray{T}, dims::NTuple{2,Int}) where {T<:BlasFloat,Ti<:Integer} = CuSparseMatrixCSC{T}(colPtr, rowVal, nzVal, dims, convert(Cint,length(nzVal))) 200 | CuSparseMatrixCSC(colPtr::CuArray{Ti}, rowVal::CuArray{Ti}, nzVal::CuArray{T}, nnz, dims::NTuple{2,Int}) where {T<:BlasFloat,Ti<:Integer} = CuSparseMatrixCSC{T}(colPtr, rowVal, nzVal, dims, nnz) 201 | 202 | 
202 | CuSparseMatrixCSR(rowPtr::CuArray, colVal::CuArray, nzVal::CuArray{T}, dims::NTuple{2,Int}) where T = CuSparseMatrixCSR{T}(rowPtr, colVal, nzVal, dims, convert(Cint,length(nzVal)))
203 | CuSparseMatrixCSR(rowPtr::CuArray, colVal::CuArray, nzVal::CuArray{T}, nnz, dims::NTuple{2,Int}) where T = CuSparseMatrixCSR{T}(rowPtr, colVal, nzVal, dims, nnz)
204 | 
205 | CuSparseMatrixBSR(rowPtr::CuArray, colVal::CuArray, nzVal::CuArray{T}, blockDim, dir, nnz, dims::NTuple{2,Int}) where T = CuSparseMatrixBSR{T}(rowPtr, colVal, nzVal, dims, blockDim, dir, nnz)
206 | 
207 | CuSparseVector(Vec::SparseVector) = CuSparseVector(Vec.nzind, Vec.nzval, size(Vec)[1])
208 | CuSparseMatrixCSC(Vec::SparseVector) = CuSparseMatrixCSC([1, length(Vec.nzind)+1], Vec.nzind, Vec.nzval, (size(Vec)[1], 1))
209 | CuSparseVector(Mat::SparseMatrixCSC) = size(Mat,2) == 1 ? CuSparseVector(Mat.rowval, Mat.nzval, size(Mat)[1]) : throw(ArgumentError("matrix must have a single column to be converted to a sparse vector"))
210 | CuSparseMatrixCSC(Mat::SparseMatrixCSC) = CuSparseMatrixCSC(Mat.colptr, Mat.rowval, Mat.nzval, size(Mat))
211 | CuSparseMatrixCSR(Mat::SparseMatrixCSC) = switch2csr(CuSparseMatrixCSC(Mat))
212 | 
213 | similar(Vec::CuSparseVector) = CuSparseVector(copy(Vec.iPtr), similar(Vec.nzVal), Vec.dims[1])
214 | similar(Mat::CuSparseMatrixCSC) = CuSparseMatrixCSC(copy(Mat.colPtr), copy(Mat.rowVal), similar(Mat.nzVal), Mat.nnz, Mat.dims)
215 | similar(Mat::CuSparseMatrixCSR) = CuSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(Mat.nzVal), Mat.nnz, Mat.dims)
216 | similar(Mat::CuSparseMatrixBSR) = CuSparseMatrixBSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(Mat.nzVal), Mat.blockDim, Mat.dir, Mat.nnz, Mat.dims)
217 | 
218 | function copyto!(dst::CuSparseVector, src::CuSparseVector)
219 |     if dst.dims != src.dims
220 |         throw(ArgumentError("Inconsistent Sparse Vector size"))
221 |     end
222 |     copyto!(dst.iPtr, src.iPtr)
223 |     copyto!(dst.nzVal, src.nzVal)
224 |     dst.nnz = src.nnz
225 |     dst
226 | end
227 | 
228 | function copyto!(dst::CuSparseMatrixCSC, src::CuSparseMatrixCSC)
229 |     if dst.dims != src.dims
230 |         throw(ArgumentError("Inconsistent Sparse Matrix size"))
231 |     end
232 |     copyto!(dst.colPtr, src.colPtr)
233 |     copyto!(dst.rowVal, src.rowVal)
234 |     copyto!(dst.nzVal, src.nzVal)
235 |     dst.nnz = src.nnz
236 |     dst
237 | end
238 | 
239 | function copyto!(dst::CuSparseMatrixCSR, src::CuSparseMatrixCSR)
240 |     if dst.dims != src.dims
241 |         throw(ArgumentError("Inconsistent Sparse Matrix size"))
242 |     end
243 |     copyto!(dst.rowPtr, src.rowPtr)
244 |     copyto!(dst.colVal, src.colVal)
245 |     copyto!(dst.nzVal, src.nzVal)
246 |     dst.nnz = src.nnz
247 |     dst
248 | end
249 | 
250 | function copyto!(dst::CuSparseMatrixBSR, src::CuSparseMatrixBSR)
251 |     if dst.dims != src.dims
252 |         throw(ArgumentError("Inconsistent Sparse Matrix size"))
253 |     end
254 |     copyto!(dst.rowPtr, src.rowPtr)
255 |     copyto!(dst.colVal, src.colVal)
256 |     copyto!(dst.nzVal, src.nzVal)
257 |     dst.dir = src.dir
258 |     dst.nnz = src.nnz
259 |     dst
260 | end
261 | 
262 | function copyto!(dst::CuSparseMatrixHYB, src::CuSparseMatrixHYB)
263 |     if dst.dims != src.dims
264 |         throw(ArgumentError("Inconsistent Sparse Matrix size"))
265 |     end
266 |     dst.Mat = src.Mat
267 |     dst.nnz = src.nnz
268 |     dst
269 | end
270 | 
271 | copy(Vec::CuSparseVector) = copyto!(similar(Vec),Vec)
272 | copy(Mat::CuSparseMatrixCSC) = copyto!(similar(Mat),Mat)
273 | copy(Mat::CuSparseMatrixCSR) = copyto!(similar(Mat),Mat)
274 | copy(Mat::CuSparseMatrixBSR) = copyto!(similar(Mat),Mat)
275 | 
--------------------------------------------------------------------------------
/src/array.jl:
--------------------------------------------------------------------------------
1 | import CUDAnative: DevicePtr
2 | 
3 | mutable struct CuArray{T,N} <: GPUArray{T,N}
4 |     buf::Mem.Buffer
5 |     own::Bool
6 | 
7 |     dims::Dims{N}
8 |     offset::Int
9 | 
10 |     function CuArray{T,N}(buf::Mem.Buffer, dims::Dims{N}; offset::Integer=0, own::Bool=true) where {T,N}
11 |         xs = new{T,N}(buf, own, dims, offset)
12 |         if own
13 |             Mem.retain(buf)
14 |             finalizer(unsafe_free!, xs)
15 |         end
16 |         return xs
17 |     end
18 | end
19 | 
20 | CuVector{T} = CuArray{T,1}
21 | CuMatrix{T} = CuArray{T,2}
22 | CuVecOrMat{T} = Union{CuVector{T},CuMatrix{T}}
23 | 
24 | const INVALID = Mem.alloc(Mem.Device, 0)
25 | 
26 | function unsafe_free!(xs::CuArray{<:Any,N}) where {N}
27 |     xs.buf === INVALID && return
28 |     Mem.release(xs.buf) && dealloc(xs.buf, prod(xs.dims)*sizeof(eltype(xs)))
29 |     xs.dims = Tuple(0 for _ in 1:N)
30 |     xs.buf = INVALID
31 |     return
32 | end
33 | 
34 | 
35 | ## construction
36 | 
37 | # type and dimensionality specified, accepting dims as tuples of Ints
38 | CuArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N} =
39 |     CuArray{T,N}(alloc(prod(dims)*sizeof(T)), dims)
40 | 
41 | # type and dimensionality specified, accepting dims as series of Ints
42 | CuArray{T,N}(::UndefInitializer, dims::Integer...) where {T,N} = CuArray{T,N}(undef, dims)
43 | 
44 | # type but not dimensionality specified
45 | CuArray{T}(::UndefInitializer, dims::Dims{N}) where {T,N} = CuArray{T,N}(undef, dims)
46 | CuArray{T}(::UndefInitializer, dims::Integer...) where {T} =
47 |     CuArray{T}(undef, convert(Tuple{Vararg{Int}}, dims))
48 | 
49 | # empty vector constructor
50 | CuArray{T,1}() where {T} = CuArray{T,1}(undef, 0)
51 | 
52 | # do-block constructors
53 | for (ctor, tvars) in (:CuArray => (), :(CuArray{T}) => (:T,), :(CuArray{T,N}) => (:T, :N))
54 |     @eval begin
55 |         function $ctor(f::Function, args...) where {$(tvars...)}
56 |             xs = $ctor(args...)
57 |             try
58 |                 f(xs)
59 |             finally
60 |                 unsafe_free!(xs)
61 |             end
62 |         end
63 |     end
64 | end
65 | 
66 | 
67 | Base.similar(a::CuArray{T,N}) where {T,N} = CuArray{T,N}(undef, size(a))
68 | Base.similar(a::CuArray{T}, dims::Base.Dims{N}) where {T,N} = CuArray{T,N}(undef, dims)
69 | Base.similar(a::CuArray, ::Type{T}, dims::Base.Dims{N}) where {T,N} = CuArray{T,N}(undef, dims)
70 | 
71 | 
72 | """
73 |     unsafe_wrap(::CuArray, ptr::CuPtr{T}, dims; own=false, ctx=CuCurrentContext())
74 | 
75 | Wrap a `CuArray` object around the data at the address given by `ptr`. The pointer
76 | element type `T` determines the array element type. `dims` is either an integer (for a 1d
77 | array) or a tuple of the array dimensions. `own` optionally specifies whether Julia should
78 | take ownership of the memory, calling `free` when the array is no longer referenced. The
79 | `ctx` argument determines the CUDA context in which the data is allocated.
80 | """
81 | function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,N}}},
82 |                           p::CuPtr{T}, dims::NTuple{N,Int};
83 |                           own::Bool=false, ctx::CuContext=CuCurrentContext()) where {T,N}
84 |     buf = Mem.DeviceBuffer(convert(CuPtr{Cvoid}, p), prod(dims) * sizeof(T), ctx)
85 |     return CuArray{T, length(dims)}(buf, dims; own=own)
86 | end
87 | function Base.unsafe_wrap(Atype::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,1}}},
88 |                           p::CuPtr{T}, dim::Integer;
89 |                           own::Bool=false, ctx::CuContext=CuCurrentContext()) where {T}
90 |     unsafe_wrap(Atype, p, (dim,); own=own, ctx=ctx)
91 | end
92 | Base.unsafe_wrap(T::Type{<:CuArray}, ::Ptr, dims::NTuple{N,Int}; kwargs...)
where {N} = 93 | throw(ArgumentError("cannot wrap a CPU pointer with a $T")) 94 | 95 | 96 | ## array interface 97 | 98 | Base.elsize(::Type{<:CuArray{T}}) where {T} = sizeof(T) 99 | 100 | Base.size(x::CuArray) = x.dims 101 | Base.sizeof(x::CuArray) = Base.elsize(x) * length(x) 102 | 103 | 104 | ## interop with other arrays 105 | 106 | CuArray{T,N}(xs::AbstractArray{T,N}) where {T,N} = 107 | isbits(xs) ? 108 | (CuArray{T,N}(undef, size(xs)) .= xs) : 109 | copyto!(CuArray{T,N}(undef, size(xs)), collect(xs)) 110 | 111 | CuArray{T,N}(xs::AbstractArray{S,N}) where {T,N,S} = CuArray{T,N}((x -> T(x)).(xs)) 112 | 113 | # underspecified constructors 114 | CuArray{T}(xs::AbstractArray{S,N}) where {T,N,S} = CuArray{T,N}(xs) 115 | (::Type{CuArray{T,N} where T})(x::AbstractArray{S,N}) where {S,N} = CuArray{S,N}(x) 116 | CuArray(A::AbstractArray{T,N}) where {T,N} = CuArray{T,N}(A) 117 | 118 | # idempotency 119 | CuArray{T,N}(xs::CuArray{T,N}) where {T,N} = xs 120 | 121 | 122 | ## conversions 123 | 124 | Base.convert(::Type{T}, x::T) where T <: CuArray = x 125 | 126 | function Base._reshape(parent::CuArray, dims::Dims) 127 | n = length(parent) 128 | prod(dims) == n || throw(DimensionMismatch("parent has $n elements, which is incompatible with size $dims")) 129 | return CuArray{eltype(parent),length(dims)}(parent.buf, dims; 130 | offset=parent.offset, own=parent.own) 131 | end 132 | function Base._reshape(parent::CuArray{T,1}, dims::Tuple{Int}) where T 133 | n = length(parent) 134 | prod(dims) == n || throw(DimensionMismatch("parent has $n elements, which is incompatible with size $dims")) 135 | return parent 136 | end 137 | 138 | 139 | ## interop with C libraries 140 | 141 | """ 142 | buffer(array::CuArray [, index]) 143 | 144 | Get the native address of a CuArray, optionally at a given location `index`. 145 | Equivalent of `Base.pointer` on `Array`s. 146 | """ 147 | function buffer(xs::CuArray, index::Integer=1) 148 | extra_offset = (index-1) * Base.elsize(xs) 149 | view(xs.buf, xs.offset + extra_offset) 150 | end 151 | 152 | Base.cconvert(::Type{<:Ptr}, x::CuArray) = throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) 153 | Base.cconvert(::Type{<:CuPtr}, x::CuArray) = buffer(x) 154 | 155 | 156 | ## interop with CUDAnative 157 | 158 | function Base.convert(::Type{CuDeviceArray{T,N,AS.Global}}, a::CuArray{T,N}) where {T,N} 159 | ptr = convert(CuPtr{T}, buffer(a)) 160 | CuDeviceArray{T,N,AS.Global}(a.dims, DevicePtr{T,AS.Global}(ptr)) 161 | end 162 | 163 | Adapt.adapt_storage(::CUDAnative.Adaptor, xs::CuArray{T,N}) where {T,N} = 164 | convert(CuDeviceArray{T,N,AS.Global}, xs) 165 | 166 | 167 | ## interop with CPU arrays 168 | 169 | # We don't convert isbits types in `adapt`, since they are already 170 | # considered GPU-compatible. 171 | 172 | Adapt.adapt_storage(::Type{<:CuArray}, xs::AbstractArray) = 173 | isbits(xs) ? xs : convert(CuArray, xs) 174 | 175 | Adapt.adapt_storage(::Type{<:CuArray{T}}, xs::AbstractArray{<:Real}) where T <: AbstractFloat = 176 | isbits(xs) ? 
xs : convert(CuArray{T}, xs) 177 | 178 | Adapt.adapt_storage(::Type{<:Array}, xs::CuArray) = convert(Array, xs) 179 | 180 | Base.collect(x::CuArray{T,N}) where {T,N} = copyto!(Array{T,N}(undef, size(x)), x) 181 | 182 | function Base.copyto!(dest::CuArray{T}, doffs::Integer, src::Array{T}, soffs::Integer, 183 | n::Integer) where T 184 | @boundscheck checkbounds(dest, doffs+n-1) 185 | @boundscheck checkbounds(src, soffs+n-1) 186 | Mem.copy!(buffer(dest, doffs), pointer(src, soffs), n*sizeof(T)) 187 | return dest 188 | end 189 | 190 | function Base.copyto!(dest::Array{T}, doffs::Integer, src::CuArray{T}, soffs::Integer, 191 | n::Integer) where T 192 | @boundscheck checkbounds(dest, doffs+n-1) 193 | @boundscheck checkbounds(src, soffs+n-1) 194 | Mem.copy!(pointer(dest, doffs), buffer(src, soffs), n*sizeof(T)) 195 | return dest 196 | end 197 | 198 | function Base.copyto!(dest::CuArray{T}, doffs::Integer, src::CuArray{T}, soffs::Integer, 199 | n::Integer) where T 200 | @boundscheck checkbounds(dest, doffs+n-1) 201 | @boundscheck checkbounds(src, soffs+n-1) 202 | Mem.copy!(buffer(dest, doffs), buffer(src, soffs), n*sizeof(T)) 203 | return dest 204 | end 205 | 206 | function Base.deepcopy_internal(x::CuArray, dict::IdDict) 207 | haskey(dict, x) && return dict[x]::typeof(x) 208 | return dict[x] = copy(x) 209 | end 210 | 211 | 212 | ## utilities 213 | 214 | cu(xs) = adapt(CuArray{Float32}, xs) 215 | Base.getindex(::typeof(cu), xs...) = CuArray([xs...]) 216 | 217 | zeros(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 0) 218 | ones(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 1) 219 | zeros(dims...) = CuArrays.zeros(Float32, dims...) 220 | ones(dims...) = CuArrays.ones(Float32, dims...) 221 | fill(v, dims...) = fill!(CuArray{typeof(v)}(undef, dims...), v) 222 | fill(v, dims::Dims) = fill!(CuArray{typeof(v)}(undef, dims...), v) 223 | 224 | # optimized implementation of `fill!` for types that are directly supported by memset 225 | const MemsetTypes = Dict(1=>UInt8, 2=>UInt16, 4=>UInt32) 226 | const MemsetCompatTypes = Union{UInt8, Int8, 227 | UInt16, Int16, Float16, 228 | UInt32, Int32, Float32} 229 | function Base.fill!(A::CuArray{T}, x) where T <: MemsetCompatTypes 230 | y = reinterpret(MemsetTypes[sizeof(T)], convert(T, x)) 231 | Mem.set!(buffer(A), y, length(A)) 232 | A 233 | end 234 | 235 | 236 | ## generic linear algebra routines 237 | 238 | function LinearAlgebra.tril!(A::CuMatrix{T}, d::Integer = 0) where T 239 | function kernel!(_A, _d) 240 | li = (blockIdx().x - 1) * blockDim().x + threadIdx().x 241 | m, n = size(_A) 242 | if 0 < li <= m*n 243 | i, j = Tuple(CartesianIndices(_A)[li]) 244 | if i < j - _d 245 | _A[i, j] = 0 246 | end 247 | end 248 | return nothing 249 | end 250 | 251 | blk, thr = cudims(A) 252 | @cuda blocks=blk threads=thr kernel!(A, d) 253 | return A 254 | end 255 | 256 | function LinearAlgebra.triu!(A::CuMatrix{T}, d::Integer = 0) where T 257 | function kernel!(_A, _d) 258 | li = (blockIdx().x - 1) * blockDim().x + threadIdx().x 259 | m, n = size(_A) 260 | if 0 < li <= m*n 261 | i, j = Tuple(CartesianIndices(_A)[li]) 262 | if j < i + _d 263 | _A[i, j] = 0 264 | end 265 | end 266 | return nothing 267 | end 268 | 269 | blk, thr = cudims(A) 270 | @cuda blocks=blk threads=thr kernel!(A, d) 271 | return A 272 | end 273 | 274 | 275 | ## reversing 276 | 277 | function _reverse(input::CuVector{T}, output::CuVector{T}) where {T} 278 | @assert length(input) == length(output) 279 | 280 | nthreads = 256 281 | nblocks = ceil(Int, length(input) / nthreads) 282 | shmem = 
nthreads * sizeof(T)
283 | 
284 |     function kernel(input::CuDeviceVector{T}, output::CuDeviceVector{T}) where {T}
285 |         shared = @cuDynamicSharedMem(T, blockDim().x)
286 | 
287 |         # load one element per thread from device memory and buffer it in reversed order
288 | 
289 |         offset_in = blockDim().x * (blockIdx().x - 1)
290 |         index_in = offset_in + threadIdx().x
291 | 
292 |         if index_in <= length(input)
293 |             index_shared = blockDim().x - threadIdx().x + 1
294 |             @inbounds shared[index_shared] = input[index_in]
295 |         end
296 | 
297 |         sync_threads()
298 | 
299 |         # write back in forward order, but to the reversed block offset as before
300 | 
301 |         offset_out = length(output) - blockDim().x * blockIdx().x
302 |         index_out = offset_out + threadIdx().x
303 | 
304 |         if 1 <= index_out <= length(output)
305 |             index_shared = threadIdx().x
306 |             @inbounds output[index_out] = shared[index_shared]
307 |         end
308 | 
309 |         return
310 |     end
311 | 
312 |     @cuda threads=nthreads blocks=nblocks shmem=shmem kernel(input, output)
313 | 
314 |     return
315 | end
316 | 
317 | function Base.reverse!(v::CuVector, start=1, stop=length(v))
318 |     v′ = view(v, start:stop)
319 |     _reverse(v′, v′)
320 |     return v
321 | end
322 | 
323 | function Base.reverse(v::CuVector, start=1, stop=length(v))
324 |     v′ = similar(v)
325 |     start > 1 && copyto!(v′, 1, v, 1, start-1)
326 |     _reverse(view(v, start:stop), view(v′, start:stop))
327 |     stop < length(v) && copyto!(v′, stop+1, v, stop+1, length(v)-stop)
328 |     return v′
329 | end
330 | 
--------------------------------------------------------------------------------