├── docs
│   ├── src
│   │   ├── index.md
│   │   └── tutorials
│   │       ├── intro1.png
│   │       └── common.jl
│   ├── .gitignore
│   ├── Project.toml
│   └── make.jl
├── deps
│   ├── .gitignore
│   └── build.jl
├── bors.toml
├── .gitignore
├── src
│   ├── deprecated.jl
│   ├── forwarddiff.jl
│   ├── dnn
│   │   ├── error.jl
│   │   ├── compat.jl
│   │   ├── CUDNN.jl
│   │   ├── nnlib.jl
│   │   ├── helpers.jl
│   │   └── libcudnn_types.jl
│   ├── blas
│   │   ├── util.jl
│   │   ├── error.jl
│   │   ├── README.md
│   │   ├── CUBLAS.jl
│   │   ├── libcublas_types.jl
│   │   └── highlevel.jl
│   ├── fft
│   │   ├── CUFFT.jl
│   │   ├── genericfft.jl
│   │   ├── error.jl
│   │   ├── fft.jl
│   │   ├── libcufft_types.jl
│   │   ├── libcufft.jl
│   │   ├── highlevel.jl
│   │   └── wrappers.jl
│   ├── indexing.jl
│   ├── nnlib.jl
│   ├── rand
│   │   ├── CURAND.jl
│   │   ├── error.jl
│   │   ├── libcurand_types.jl
│   │   ├── highlevel.jl
│   │   └── libcurand.jl
│   ├── sparse
│   │   ├── CUSPARSE.jl
│   │   ├── error.jl
│   │   ├── highlevel.jl
│   │   ├── libcusparse.jl
│   │   ├── libcusparse_types.jl
│   │   └── array.jl
│   ├── subarray.jl
│   ├── solver
│   │   ├── error.jl
│   │   ├── CUSOLVER.jl
│   │   ├── libcusolver_types.jl
│   │   ├── highlevel.jl
│   │   └── libcusolver.jl
│   ├── accumulate.jl
│   ├── broadcast.jl
│   ├── utils.jl
│   ├── gpuarray_interface.jl
│   ├── matmul.jl
│   ├── CuArrays.jl
│   ├── mapreduce.jl
│   └── array.jl
├── test
│   ├── util.jl
│   ├── runtests.jl
│   ├── rand.jl
│   ├── dnn.jl
│   ├── sparse_solver.jl
│   ├── fft.jl
│   └── base.jl
├── .github
│   └── ISSUE_TEMPLATE
│       ├── feature_request.md
│       └── bug_report.md
├── LICENSE.md
├── Project.toml
├── .gitlab-ci.yml
└── README.md

/docs/src/index.md:
--------------------------------------------------------------------------------
# CuArrays.jl
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
src/**/generated/
--------------------------------------------------------------------------------
/deps/.gitignore:
--------------------------------------------------------------------------------
ext.jl.bak
build.log
--------------------------------------------------------------------------------
/bors.toml:
--------------------------------------------------------------------------------
status = [
  "ci/gitlab/%"
]
delete_merged_branches = true
--------------------------------------------------------------------------------
/docs/src/tutorials/intro1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/CuArrays.jl/master/docs/src/tutorials/intro1.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.jl.cov
*.jl.*.cov
*.jl.mem
deps/ext.jl
Manifest.toml
tutorials/build/
docs/build/
--------------------------------------------------------------------------------
/docs/Project.toml:
--------------------------------------------------------------------------------
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
--------------------------------------------------------------------------------
/src/deprecated.jl:
--------------------------------------------------------------------------------
# Deprecated functionality

import Base: @deprecate_binding

@deprecate_binding BLAS CUBLAS
@deprecate_binding FFT CUFFT

@deprecate cuzeros CuArrays.zeros
@deprecate cuones CuArrays.ones
@deprecate cufill CuArrays.fill
--------------------------------------------------------------------------------
/test/util.jl:
--------------------------------------------------------------------------------
macro grab_output(ex)
    quote
        mktemp() do fname, fout
            ret = nothing
            open(fname, "w") do fout
                redirect_stdout(fout) do
                    ret = $(esc(ex))
                end
            end
            ret, read(fname, String)
        end
    end
end
--------------------------------------------------------------------------------
/docs/make.jl:
--------------------------------------------------------------------------------
using Documenter
using Literate

using CuArrays

# generate tutorials
OUTPUT = joinpath(@__DIR__, "src/tutorials/generated")
Literate.markdown(joinpath(@__DIR__, "src/tutorials/intro.jl"), OUTPUT)

makedocs(
    modules = [CuArrays],
    format = Documenter.HTML(prettyurls = get(ENV, "CI", nothing) == "true"),
    sitename = "CuArrays.jl",
    pages = [
        "Home" => "index.md",
        "Tutorials" => [
            "tutorials/generated/intro.md"
        ],
    ],
    doctest = true
)
--------------------------------------------------------------------------------
/docs/src/tutorials/common.jl:
--------------------------------------------------------------------------------
# function to run a Julia script outside of the current environment
function script(code; wrapper=``, args=``)
    if Base.JLOptions().project != C_NULL
        args = `$args --project=$(unsafe_string(Base.JLOptions().project))`
    end
    mktemp() do path, io
        write(io, code)
        flush(io)
        cmd = `$wrapper $(Base.julia_cmd()) $args $path`
        # redirect stderr to stdout to have it picked up by Weave.jl
        run(pipeline(ignorestatus(cmd), stderr=stdout))
    end
    nothing
end
--------------------------------------------------------------------------------
/src/forwarddiff.jl:
--------------------------------------------------------------------------------
# ForwardDiff integration

for f in libdevice
    if haskey(ForwardDiff.DiffRules.DEFINED_DIFFRULES, (:Base,f,1))
        f == :tanh && continue
        diffrule = ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:Base,f,1)]
        ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative,f,1)] =
            (args...) -> replace_device(diffrule(args...))
        eval(ForwardDiff.unary_dual_definition(:CUDAnative, f))
    end
end

ForwardDiff.DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :tanh, 1)] = x ->
    replace_device(:(1-tanh(x)^2))
eval(ForwardDiff.unary_dual_definition(:CUDAnative, :tanh))
--------------------------------------------------------------------------------
/test/runtests.jl:
--------------------------------------------------------------------------------
using Test

include("util.jl")

using Random
Random.seed!(1)

using CuArrays

using GPUArrays
import GPUArrays: allowscalar, @allowscalar

testf(f, xs...; kwargs...) = GPUArrays.TestSuite.compare(f, CuArray, xs...; kwargs...)
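# Illustrative use: testf(x -> sum(x .^ 2), rand(Float32, 10)) runs the closure
# on both the host Array and its CuArray copy and checks that the results agree.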

allowscalar(false)

@testset "CuArrays" begin

include("base.jl")
include("blas.jl")
include("rand.jl")
include("fft.jl")
include("sparse.jl")
include("solver.jl")
include("sparse_solver.jl")
include("dnn.jl")

CuArrays.pool_status()
CuArrays.pool_timings()

end
--------------------------------------------------------------------------------
/src/dnn/error.jl:
--------------------------------------------------------------------------------
export CUDNNError

struct CUDNNError <: Exception
    code::cudnnStatus_t
    msg::AbstractString
end
Base.show(io::IO, err::CUDNNError) = print(io, "CUDNNError(code $(err.code), $(err.msg))")

function CUDNNError(status::cudnnStatus_t)
    msg = unsafe_string(cudnnGetErrorString(status))
    return CUDNNError(status, msg)
end

macro check(dnn_func)
    quote
        local err::cudnnStatus_t
        err = $(esc(dnn_func))
        if err != CUDNN_STATUS_SUCCESS
            throw(CUDNNError(err))
        end
        err
    end
end
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: enhancement
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
--------------------------------------------------------------------------------
/src/blas/util.jl:
--------------------------------------------------------------------------------
# convert matrix to band storage
function band(A::AbstractMatrix,kl,ku)
    m, n = size(A)
    AB = zeros(eltype(A),kl+ku+1,n)
    for j = 1:n
        for i = max(1,j-ku):min(m,j+kl)
            AB[ku+1-j+i,j] = A[i,j]
        end
    end
    return AB
end

# convert band storage to general matrix
function unband(AB::AbstractMatrix,m,kl,ku)
    bm, n = size(AB)
    A = zeros(eltype(AB),m,n)
    for j = 1:n
        for i = max(1,j-ku):min(m,j+kl)
            A[i,j] = AB[ku+1-j+i,j]
        end
    end
    return A
end

# zero out elements not on matrix bands
function bandex(A::AbstractMatrix,kl,ku)
    m, n = size(A)
    AB = band(A,kl,ku)
    B = unband(AB,m,kl,ku)
    return B
end
--------------------------------------------------------------------------------
/src/fft/CUFFT.jl:
--------------------------------------------------------------------------------
module CUFFT

import CUDAapi

using ..CuArrays
using ..CuArrays: libcufft, configured

import AbstractFFTs: plan_fft, plan_fft!, plan_bfft, plan_bfft!,
    plan_rfft, plan_brfft, plan_inv, normalization, fft, bfft, ifft, rfft,
    Plan, ScaledPlan
import Base: show, *, convert, unsafe_convert, size, strides, ndims
import Base.Sys: WORD_SIZE

using LinearAlgebra
import LinearAlgebra: mul!
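# Usage sketch (illustrative): for a CuArray `x`, `fft(x)` or `plan_fft(x) * x`
# runs the transform on the GPU through the AbstractFFTs interface imported above.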

include("libcufft_types.jl")
include("error.jl")

include("libcufft.jl")
include("genericfft.jl")
include("fft.jl")
include("wrappers.jl")
include("highlevel.jl")

version() = VersionNumber(cufftGetProperty(CUDAapi.MAJOR_VERSION),
                          cufftGetProperty(CUDAapi.MINOR_VERSION),
                          cufftGetProperty(CUDAapi.PATCH_LEVEL))

end
--------------------------------------------------------------------------------
/src/dnn/compat.jl:
--------------------------------------------------------------------------------
# Compatibility shims until users upgrade to new NNlib format
function conv!(y::CuArray{T}, x::CuArray{T}, w::CuArray{T}; pad=0, stride=1, flipkernel=0, dilation=1, kwargs...) where {T<:CUDNNFloat}
    cdims = DenseConvDims(x, w; padding=pad, stride=stride, flipkernel=flipkernel, dilation=dilation)
    return conv!(y, x, w, cdims; kwargs...)
end

function ∇conv_filter!(dw::CuArray{T}, dy::CuArray{T}, x::CuArray{T}; pad=0, stride=1, flipkernel=0, dilation=1, kwargs...) where {T<:CUDNNFloat}
    cdims = DenseConvDims(x, dw; padding=pad, stride=stride, flipkernel=flipkernel, dilation=dilation)
    # NOTE!!! This compat shim re-arranges the argument order!
    return ∇conv_filter!(dw, x, dy, cdims; kwargs...)
end

function maxpool!(y::CuArray{T}, x::CuArray{T}, k; pad=map(_->0,k), stride=k) where {T<:CUDNNFloat}
    pdims = PoolDims(x, k; padding=pad, stride=stride)
    return maxpool!(y, x, pdims)
end

function meanpool!(y::CuArray{T}, x::CuArray{T}, k; pad=map(_->0,k), stride=k) where {T<:CUDNNFloat}
    pdims = PoolDims(x, k; padding=pad, stride=stride)
    return meanpool!(y, x, pdims)
end
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
The CuArrays.jl package is licensed under the MIT "Expat" License:

> Copyright (c) 2017: Mike J Innes.
>
> Permission is hereby granted, free of charge, to any person obtaining a copy
> of this software and associated documentation files (the "Software"), to deal
> in the Software without restriction, including without limitation the rights
> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> copies of the Software, and to permit persons to whom the Software is
> furnished to do so, subject to the following conditions:
>
> The above copyright notice and this permission notice shall be included in all
> copies or substantial portions of the Software.
>
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> SOFTWARE.
>
--------------------------------------------------------------------------------
/Project.toml:
--------------------------------------------------------------------------------
name = "CuArrays"
uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
version = "2.0.0"

[deps]
AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"

[extras]
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "FFTW", "ForwardDiff"]

[compat]
julia = "1.0"
CUDAnative = "2.0"
CUDAdrv = "3.0"
CUDAapi = "0.5.3, 0.6, 1.0"
NNlib = "0.6"
GPUArrays = "0.7"
Adapt = "0.4"
--------------------------------------------------------------------------------
/src/indexing.jl:
--------------------------------------------------------------------------------
import GPUArrays: allowscalar, @allowscalar

function _getindex(xs::CuArray{T}, i::Integer) where T
    buf = Array{T}(undef)
    copyto!(buf, 1, xs, i, 1)
    buf[]
end

function _setindex!(xs::CuArray{T}, v::T, i::Integer) where T
    copyto!(xs, i, T[v], 1, 1)
end


## logical indexing

Base.getindex(xs::CuArray, bools::AbstractArray{Bool}) = getindex(xs, CuArray(bools))

function Base.getindex(xs::CuArray{T}, bools::CuArray{Bool}) where {T}
    bools = reshape(bools, prod(size(bools)))
    indices = cumsum(bools)  # unique indices for elements that are true

    n = _getindex(indices, length(indices)) # number that are true
    ys = CuArray{T}(undef, n)

    if n > 0
        num_threads = min(n, 256)
        num_blocks = ceil(Int, length(indices) / num_threads)

        function kernel(ys::CuDeviceArray{T}, xs::CuDeviceArray{T}, bools, indices)
            i = threadIdx().x + (blockIdx().x - 1) * blockDim().x

            if i <= length(xs) && bools[i]
                b = indices[i]  # new position
                ys[b] = xs[i]
            end

            return
        end

        @cuda blocks=num_blocks threads=num_threads kernel(ys, xs, bools, indices)
    end

    return ys
end
--------------------------------------------------------------------------------
/src/nnlib.jl:
--------------------------------------------------------------------------------
using NNlib
import NNlib: conv!, ∇conv_filter!, ∇conv_data!,
    maxpool!, meanpool!, ∇maxpool!, ∇meanpool!,
    softmax, softmax!, ∇softmax!, logsoftmax, logsoftmax!, ∇logsoftmax
using CUDAnative

# Activation functions
@cufunc σ(x) = ifelse(x < -80, zero(x), one(x) / (one(x) + exp(-x)))

@cufunc function logσ(x)
    max_v = max(zero(x), -x)
    z = exp(-max_v) + exp(-x-max_v)
    -(max_v + log(z))
end
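# (The @cufunc macro from src/broadcast.jl defines a CUDAnative-compatible version
# of each of these activations, e.g. cuσ, and registers it so that broadcasting
# over a CuArray, like σ.(xs), picks up the GPU version. The x < -80 cutoff in σ
# guards against Float32 overflow of exp(-x) for very negative inputs.)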

@cufunc elu(x, α = one(x)) =
    ifelse(x ≥ 0, x/1, α * (exp(x) - one(x)))

@cufunc swish(x) = x * σ(x)

@cufunc function gelu(x)
    λ = oftype(x/1, √(2/π))
    α = oftype(x/1, 0.044715)
    h = oftype(x/1, 0.5)
    h * x * (one(x) + tanh(λ * (x + α * x^3)))
end

@cufunc function selu(x)
    λ = oftype(x/1, 1.0507009873554804934193349852946)
    α = oftype(x/1, 1.6732632423543772848170429916717)
    λ * ifelse(x > 0, x/1, α * (exp(x) - 1))
end

@cufunc softplus(x) = log1p(exp(x))

if !@isdefined CUDNN
    function conv!(y::CuArray, x::CuArray, w::CuArray; kw...)
        error("CUDNN is not installed.")
    end
    function softmax!(out::CuVecOrMat, xs::CuVecOrMat)
        error("CUDNN is not installed.")
    end
    function logsoftmax!(out::CuVecOrMat, xs::CuVecOrMat)
        error("CUDNN is not installed.")
    end
end
--------------------------------------------------------------------------------
/src/dnn/CUDNN.jl:
--------------------------------------------------------------------------------
module CUDNN

import CUDAapi

import CUDAdrv: CUDAdrv, CuContext, CuPtr, CU_NULL

using ..CuArrays
using ..CuArrays: libcudnn, active_context, configured, unsafe_free!
using ..CuArrays: CuVecOrMat, CuVector
using NNlib
import NNlib: conv!, ∇conv_filter!, ∇conv_data!, stride, dilation, flipkernel,
    maxpool!, meanpool!, ∇maxpool!, ∇meanpool!, spatial_dims, padding, kernel_size,
    softmax, softmax!, ∇softmax!, logsoftmax, logsoftmax!, ∇logsoftmax
using CUDAnative
include("libcudnn_types.jl")
include("error.jl")

const _handles = Dict{CuContext,cudnnHandle_t}()
const _handle = Ref{cudnnHandle_t}(C_NULL)

function handle()
    if _handle[] == C_NULL
        @assert isassigned(active_context) # some other call should have initialized CUDA
        _handle[] = get!(_handles, active_context[]) do
            context = active_context[]
            handle = cudnnCreate()
            atexit(()->CUDAdrv.isvalid(context) && cudnnDestroy(handle))
            handle
        end
    end

    return _handle[]
end

include("libcudnn.jl")
include("helpers.jl")
include("nnlib.jl")
include("compat.jl")

version() = VersionNumber(cudnnGetProperty(CUDAapi.MAJOR_VERSION),
                          cudnnGetProperty(CUDAapi.MINOR_VERSION),
                          cudnnGetProperty(CUDAapi.PATCH_LEVEL))

end
--------------------------------------------------------------------------------
/src/rand/CURAND.jl:
--------------------------------------------------------------------------------
module CURAND

import CUDAdrv: CUDAdrv, CuContext, CuPtr
import CUDAapi

using ..CuArrays
using ..CuArrays: libcurand, active_context

using GPUArrays

using Random

export rand_logn!, rand_poisson!
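# These mirror Random.rand!/randn!: they fill an existing CuArray in place,
# e.g. (illustrative) rand_logn!(CuArray{Float32}(undef, 100)).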

include("libcurand_types.jl")
include("error.jl")

const _generators = Dict{CuContext,RNG}()
const _generator = Ref{Union{Nothing,RNG}}(nothing)

function generator()
    if _generator[] == nothing
        @assert isassigned(active_context) # some other call should have initialized CUDA
        _generator[] = get!(_generators, active_context[]) do
            context = active_context[]
            generator = create_generator()
            # FIXME: crashes
            #atexit(()->CUDAdrv.isvalid(context) && destroy_generator(generator))
            generator
        end
    end

    return _generator[]::RNG
end

include("libcurand.jl")
include("highlevel.jl")

version() = VersionNumber(curandGetProperty(CUDAapi.MAJOR_VERSION),
                          curandGetProperty(CUDAapi.MINOR_VERSION),
                          curandGetProperty(CUDAapi.PATCH_LEVEL))

end

const rand = CURAND.rand
const randn = CURAND.randn
const rand_logn = CURAND.rand_logn
const rand_poisson = CURAND.rand_poisson

@deprecate curand CuArrays.rand
@deprecate curandn CuArrays.randn
--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
variables:
  CI_IMAGE_TAG: 'cuda'
  JULIA_NUM_THREADS: '4'

include:
  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/common.yml'
  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/test_v1.0.yml'
  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/test_v1.1.yml'
  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/test_dev.yml'
  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/coverage_v1.1.yml'
  - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v4/documentation_v1.1.yml'

test:dev:
  allow_failure: true

coverage:
  allow_failure: true
  only:
    - master

pages:
  stage: deploy
  script:
    - mv docs/build public
  artifacts:
    paths:
      - public
  only:
    - master

flux:
  stage: test
  image: "juliagpu/julia:v1.1-cuda"
  script:
    - mkdir $JULIA_DEPOT_PATH # Pkg.jl#325
    - julia -e 'using Pkg;
                Pkg.develop([PackageSpec(path=pwd());
                             [PackageSpec(name=pkg)
                              for pkg in split(get(ENV,"CI_DEV_PKGS",""))]]);
                Pkg.build("CuArrays")'
    - julia -e 'using Pkg;
                Pkg.add("Flux");
                Pkg.test("Flux")'
  allow_failure: true
--------------------------------------------------------------------------------
/src/sparse/CUSPARSE.jl:
--------------------------------------------------------------------------------
module CUSPARSE

import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL
import CUDAapi

using ..CuArrays
using ..CuArrays: libcusparse, active_context, unsafe_free!
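# This module wraps cuSPARSE's storage formats (CSR, CSC, BSR, HYB) as GPU array
# types; see array.jl (included below) for the constructors, which can be used
# to upload host data such as a SparseArrays.SparseMatrixCSC.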

using SparseArrays
using LinearAlgebra

import Base.one
import Base.zero

const SparseChar = Char

export CuSparseMatrixCSC, CuSparseMatrixCSR,
       CuSparseMatrixHYB, CuSparseMatrixBSR,
       CuSparseMatrix, AbstractCuSparseMatrix,
       CuSparseVector

include("libcusparse_types.jl")
include("error.jl")

const _handles = Dict{CuContext,cusparseHandle_t}()
const _handle = Ref{cusparseHandle_t}(C_NULL)

function handle()
    if _handle[] == C_NULL
        @assert isassigned(active_context) # some other call should have initialized CUDA
        _handle[] = get!(_handles, active_context[]) do
            context = active_context[]
            handle = cusparseCreate()
            atexit(()->CUDAdrv.isvalid(context) && cusparseDestroy(handle))
            handle
        end
    end

    return _handle[]
end

include("libcusparse.jl")
include("array.jl")
include("util.jl")
include("wrappers.jl")
include("highlevel.jl")

version() = VersionNumber(cusparseGetProperty(CUDAapi.MAJOR_VERSION),
                          cusparseGetProperty(CUDAapi.MINOR_VERSION),
                          cusparseGetProperty(CUDAapi.PATCH_LEVEL))

end
--------------------------------------------------------------------------------
/src/subarray.jl:
--------------------------------------------------------------------------------
import Base: view

using Base: ScalarIndex, ViewIndex, Slice, @_inline_meta, @boundscheck,
    to_indices, compute_offset1, unsafe_length, _maybe_reshape_parent, index_ndims

struct Contiguous end
struct NonContiguous end

# Detect whether the view is contiguous or not
CuIndexStyle() = Contiguous()
CuIndexStyle(I...) = NonContiguous()
CuIndexStyle(i1::Colon, ::ScalarIndex...) = Contiguous()
CuIndexStyle(i1::AbstractUnitRange, ::ScalarIndex...) = Contiguous()
CuIndexStyle(i1::Colon, I...) = CuIndexStyle(I...)

cuviewlength() = ()
cuviewlength(::Real, I...) = (@_inline_meta; cuviewlength(I...)) # skip scalars
cuviewlength(i1::AbstractUnitRange, I...) = (@_inline_meta; (unsafe_length(i1), cuviewlength(I...)...))
cuviewlength(i1::AbstractUnitRange, ::ScalarIndex...) = (@_inline_meta; (unsafe_length(i1),))

view(A::CuArray, I::Vararg{Any,N}) where {N} = (@_inline_meta; _cuview(A, I, CuIndexStyle(I...)))

function _cuview(A, I, ::Contiguous)
    @_inline_meta
    J = to_indices(A, I)
    @boundscheck checkbounds(A, J...)
    _cuview(_maybe_reshape_parent(A, index_ndims(J...)), J, cuviewlength(J...))
end

# for contiguous views just return a new CuArray
_cuview(A::CuArray{T}, I::NTuple{N,ViewIndex}, dims::NTuple{M,Integer}) where {T,N,M} =
    CuArray{T,M}(A.buf, dims; offset=A.offset + compute_offset1(A, 1, I) * sizeof(T), own=A.own)

# fallback to SubArray when the view is not contiguous
_cuview(A, I, ::NonContiguous) =
    invoke(view, Tuple{AbstractArray, typeof(I).parameters...}, A, I...)
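# Illustrative consequence of the rules above: for A = CuArray(rand(4, 4)),
# view(A, :, 2:3) is contiguous and yields a plain CuArray sharing A's buffer,
# while view(A, 2:3, :) is non-contiguous and falls back to a SubArray wrapper.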
--------------------------------------------------------------------------------
/src/solver/error.jl:
--------------------------------------------------------------------------------
export CUSOLVERError

struct CUSOLVERError <: Exception
    code::cusolverStatus_t
    msg::AbstractString
end
Base.show(io::IO, err::CUSOLVERError) = print(io, "CUSOLVERError(code $(err.code), $(err.msg))")

function CUSOLVERError(code::cusolverStatus_t)
    msg = status_message(code)
    return CUSOLVERError(code, msg)
end

function status_message(status)
    if status == CUSOLVER_STATUS_SUCCESS
        return "the operation completed successfully"
    elseif status == CUSOLVER_STATUS_NOT_INITIALIZED
        return "the library was not initialized"
    elseif status == CUSOLVER_STATUS_ALLOC_FAILED
        return "the resource allocation failed"
    elseif status == CUSOLVER_STATUS_INVALID_VALUE
        return "an invalid value was used as an argument"
    elseif status == CUSOLVER_STATUS_ARCH_MISMATCH
        return "an absent device architectural feature is required"
    elseif status == CUSOLVER_STATUS_EXECUTION_FAILED
        return "the GPU program failed to execute"
    elseif status == CUSOLVER_STATUS_INTERNAL_ERROR
        return "an internal operation failed"
    elseif status == CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED
        return "the matrix type is not supported"
    else
        return "unknown status"
    end
end

macro check(solver_func)
    quote
        local err::cusolverStatus_t
        err = $(esc(solver_func::Expr))
        if err != CUSOLVER_STATUS_SUCCESS
            throw(CUSOLVERError(err))
        end
        err
    end
end
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: bug
assignees: ''

---

**Sanity checks (read this first, then remove this section)**
Make sure you're reporting *a bug*; for general questions, please use Discourse.

If you're dealing with a performance issue, make sure you **disable scalar iteration** (`CuArrays.allowscalar(false)`). Only file an issue if that shows scalar iteration happening within Base or CuArrays, as opposed to your own code.

If you're seeing an error message, **follow the error message instructions**, if any (eg. `inspect code with @device_code_warntype`). If you can't solve the problem using that information, make sure to post it as part of the issue.

If your bug is still valid, please go ahead and fill out the template below.

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
The Minimal Working Example (MWE) for this bug:
```julia
# some code here
```

**Expected behavior**
A clear and concise description of what you expected to happen.

**Build log**
```
# post the output of Pkg.build()
# make sure the error still reproduces after that.
```

**Environment details (please complete this section)**
Details on Julia:
```
# please post the output of:
versioninfo()
```

Julia packages:
 - CuArrays.jl:
 - CUDAnative.jl:
 - ...

CUDA: toolkit and driver version


**Additional context**
Add any other context about the problem here.
--------------------------------------------------------------------------------
/src/sparse/error.jl:
--------------------------------------------------------------------------------
export CUSPARSEError

struct CUSPARSEError <: Exception
    code::cusparseStatus_t
    msg::AbstractString
end
Base.show(io::IO, err::CUSPARSEError) = print(io, "CUSPARSEError(code $(err.code), $(err.msg))")

function CUSPARSEError(code::cusparseStatus_t)
    msg = status_message(code)
    return CUSPARSEError(code, msg)
end


function status_message(status)
    if status == CUSPARSE_STATUS_SUCCESS
        return "cusparse success"
    end
    if status == CUSPARSE_STATUS_NOT_INITIALIZED
        return "cusparse not initialized"
    end
    if status == CUSPARSE_STATUS_ALLOC_FAILED
        return "cusparse allocation failed"
    end
    if status == CUSPARSE_STATUS_INVALID_VALUE
        return "cusparse invalid value"
    end
    if status == CUSPARSE_STATUS_ARCH_MISMATCH
        return "cusparse architecture mismatch"
    end
    if status == CUSPARSE_STATUS_MAPPING_ERROR
        return "cusparse mapping error"
    end
    if status == CUSPARSE_STATUS_EXECUTION_FAILED
        return "cusparse execution failed"
    end
    if status == CUSPARSE_STATUS_INTERNAL_ERROR
        return "cusparse internal error"
    end
    if status == CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED
        return "cusparse matrix type not supported"
    end
    return "unknown status"
end

macro check(sparse_func)
    quote
        local err = $(esc(sparse_func::Expr))
        if err != CUSPARSE_STATUS_SUCCESS
            throw(CUSPARSEError(cusparseStatus_t(err)))
        end
        err
    end
end
--------------------------------------------------------------------------------
/test/rand.jl:
--------------------------------------------------------------------------------
@testset "CURAND" begin

using CuArrays.CURAND

CURAND.seed!()

# in-place
for (f,T) in ((rand!,Float32),
              (randn!,Float32),
              (rand_logn!,Float32),
              (rand_poisson!,Cuint)),
    d in (2, (2,2), (2,2,2))
    A = CuArray{T}(undef, d)
    f(A)
end

# out-of-place, with implicit type
for (f,T) in ((CuArrays.rand,Float32), (CuArrays.randn,Float32),
              (CuArrays.rand_logn,Float32), (CuArrays.rand_poisson,Cuint),
              (rand,Float64), (randn,Float64)),
    args in ((2,), (2, 2))
    A = f(args...)
    @test eltype(A) == T
end

# out-of-place, with type specified
for (f,T) in ((CuArrays.rand,Float32), (CuArrays.randn,Float32), (CuArrays.rand_logn,Float32),
              (CuArrays.rand,Float64), (CuArrays.randn,Float64), (CuArrays.rand_logn,Float64),
              (CuArrays.rand_poisson,Cuint),
              (rand,Float32), (randn,Float32),
              (rand,Float64), (randn,Float64)),
    args in ((T, 2), (T, 2, 2), (T, (2, 2)))
    A = f(args...)
    @test eltype(A) == T
end

# unsupported types that fall back to GPUArrays
for (f,T) in ((CuArrays.rand,Int64),),
    args in ((T, 2), (T, 2, 2), (T, (2, 2)))
    A = f(args...)
    @test eltype(A) == T
end
for (f,T) in ((rand!,Int64),),
    d in (2, (2,2), (2,2,2))
    A = CuArray{T}(undef, d)
    f(A)
end

@test_throws ErrorException randn!(CuArray{Cuint}(undef, 10))
@test_throws ErrorException rand_logn!(CuArray{Cuint}(undef, 10))
@test_throws ErrorException rand_poisson!(CuArray{Float64}(undef, 10))

end
--------------------------------------------------------------------------------
/src/accumulate.jl:
--------------------------------------------------------------------------------
# Implements the Hillis--Steele algorithm using global memory
# See algorithm 1 at https://en.wikipedia.org/wiki/Prefix_sum#Parallel_algorithm

# TODO: features
# - init::Some
# - CuMatrix
# - pairwise

# TODO: performance
# - shared memory / shuffle (see CUDAnative.jl/examples/scan)

function Base._accumulate!(op::Function, vout::CuVector{T}, v::CuVector, dims::Int,
                           init::Nothing) where {T}
    if dims != 1
        return copyto!(vout, v)
    end

    return Base._accumulate!(op, vout, v, nothing, nothing)
end

function Base._accumulate!(op::Function, vout::CuVector{T}, v::CuVector, dims::Nothing,
                           init::Nothing) where {T}
    vin = T.(v)  # convert to vector with eltype T

    Δ = 1        # Δ = 2^d
    n = ceil(Int, log2(length(v)))

    num_threads = 256
    num_blocks = ceil(Int, length(v) / num_threads)

    for d in 0:n # passes through data
        @cuda blocks=num_blocks threads=num_threads _partial_accumulate!(op, vout, vin, Δ)

        vin, vout = vout, vin
        Δ *= 2
    end

    return vin
end

function _partial_accumulate!(op, vout, vin, Δ)
    @inbounds begin
        k = threadIdx().x + (blockIdx().x - 1) * blockDim().x

        if k <= length(vin)
            if k > Δ
                vout[k] = op(vin[k - Δ], vin[k])
            else
                vout[k] = vin[k]
            end
        end
    end

    return
end

Base.accumulate_pairwise!(op, result::CuVector, v::CuVector) = accumulate!(op, result, v)
--------------------------------------------------------------------------------
/src/fft/genericfft.jl:
--------------------------------------------------------------------------------
cufftfloat(x) = _cufftfloat(float(x))
_cufftfloat(::Type{T}) where {T<:cufftReals} = T
_cufftfloat(::Type{Float16}) = Float32
_cufftfloat(::Type{Complex{T}}) where {T} = Complex{_cufftfloat(T)}
_cufftfloat(::Type{T}) where {T} = error("type $T not supported")
_cufftfloat(x::T) where {T} = _cufftfloat(T)(x)

complexfloat(x::CuArray{Complex{<:cufftReals}}) = x
realfloat(x::CuArray{<:cufftReals}) = x

complexfloat(x::CuArray{T}) where {T<:Complex} = copy1(typeof(cufftfloat(zero(T))), x)
complexfloat(x::CuArray{T}) where {T<:Real} = copy1(typeof(complex(cufftfloat(zero(T)))), x)

realfloat(x::CuArray{T}) where {T<:Real} = copy1(typeof(cufftfloat(zero(T))), x)

function copy1(::Type{T}, x) where T
    y = CuArray{T}(undef, map(length, axes(x)))
    #copy!(y, x)
    y .= broadcast(xi->convert(T,xi),x)
end

# promote to a complex floating-point type (out-of-place only),
# so implementations only need Complex{Float} methods
for f in (:fft, :bfft, :ifft)
    pf = Symbol("plan_", f)
    @eval begin
        $f(x::CuArray{<:Real}, region=1:ndims(x)) = $f(complexfloat(x), region)
        $pf(x::CuArray{<:Real}, region) = $pf(complexfloat(x), region)
        $f(x::CuArray{<:Complex{<:Union{Integer,Rational}}}, region=1:ndims(x)) = $f(complexfloat(x), region)
        $pf(x::CuArray{<:Complex{<:Union{Integer,Rational}}}, region) = $pf(complexfloat(x), region)
    end
end
rfft(x::CuArray{<:Union{Integer,Rational}}, region=1:ndims(x)) = rfft(realfloat(x), region)
plan_rfft(x::CuArray{<:Real}, region) = plan_rfft(realfloat(x), region)

*(p::Plan{T}, x::CuArray) where {T} = p * copy1(T, x)
*(p::ScaledPlan, x::CuArray) = rmul!(p.p * x, p.scale)
--------------------------------------------------------------------------------
/src/blas/error.jl:
--------------------------------------------------------------------------------
export CUBLASError

struct CUBLASError <: Exception
    code::cublasStatus_t
    msg::AbstractString
end
Base.show(io::IO, err::CUBLASError) = print(io, "CUBLASError(code $(err.code), $(err.msg))")

function CUBLASError(code::cublasStatus_t)
    msg = status_message(code)
    return CUBLASError(code, msg)
end

function status_message(status)
    if status == CUBLAS_STATUS_SUCCESS
        return "the operation completed successfully"
    elseif status == CUBLAS_STATUS_NOT_INITIALIZED
        return "the library was not initialized"
    elseif status == CUBLAS_STATUS_ALLOC_FAILED
        return "the resource allocation failed"
    elseif status == CUBLAS_STATUS_INVALID_VALUE
        return "an invalid value was used as an argument"
    elseif status == CUBLAS_STATUS_ARCH_MISMATCH
        return "an absent device architectural feature is required"
    elseif status == CUBLAS_STATUS_MAPPING_ERROR
        return "an access to GPU memory space failed"
    elseif status == CUBLAS_STATUS_EXECUTION_FAILED
        return "the GPU program failed to execute"
    elseif status == CUBLAS_STATUS_INTERNAL_ERROR
        return "an internal operation failed"
    elseif status == CUBLAS_STATUS_NOT_SUPPORTED
        return "the requested feature is not supported"
    elseif status == CUBLAS_STATUS_LICENSE_ERROR
        return "error detected trying to check the license"
    else
        return "unknown status"
    end
end

macro check(blas_func)
    quote
        local err::cublasStatus_t
        err = $(esc(blas_func::Expr))
        if err != CUBLAS_STATUS_SUCCESS
            throw(CUBLASError(err))
        end
        err
    end
end
--------------------------------------------------------------------------------
/src/broadcast.jl:
--------------------------------------------------------------------------------
import Base.Broadcast: Broadcasted, Extruded, BroadcastStyle, ArrayStyle

BroadcastStyle(::Type{<:CuArray}) = ArrayStyle{CuArray}()

function Base.similar(bc::Broadcasted{ArrayStyle{CuArray}}, ::Type{T}) where T
    similar(CuArray{T}, axes(bc))
end


# replace base functions with libdevice alternatives
# TODO: do this with Cassette.jl

cufunc(f) = f
cufunc(::Type{T}) where T = (x...) -> T(x...) # broadcasting type ctors isn't GPU compatible

Broadcast.broadcasted(::ArrayStyle{CuArray}, f, args...) =
    Broadcasted{ArrayStyle{CuArray}}(cufunc(f), args, nothing)

libdevice = :[
  cos, cospi, sin, sinpi, tan, acos, asin, atan,
  cosh, sinh, tanh, acosh, asinh, atanh,
  log, log10, log1p, log2, logb, ilogb,
  exp, exp2, exp10, expm1, ldexp,
  erf, erfinv, erfc, erfcinv, erfcx,
  brev, clz, ffs, byte_perm, popc,
  isfinite, isinf, isnan, nearbyint,
  nextafter, signbit, copysign, abs,
  sqrt, rsqrt, cbrt, rcbrt, pow,
  ceil, floor, saturate,
  lgamma, tgamma,
  j0, j1, jn, y0, y1, yn,
  normcdf, normcdfinv, hypot,
  fma, sad, dim, mul24, mul64hi, hadd, rhadd, scalbn].args

for f in libdevice
    isdefined(Base, f) || continue
    @eval cufunc(::typeof(Base.$f)) = CUDAnative.$f
end

using MacroTools

const _cufuncs = copy(libdevice)
cufuncs() = (global _cufuncs; _cufuncs)

function replace_device(ex)
    global _cufuncs
    MacroTools.postwalk(ex) do x
        x in _cufuncs ? :(CuArrays.cufunc($x)) : x
    end
end

macro cufunc(ex)
    global _cufuncs
    def = MacroTools.splitdef(ex)
    f = def[:name]
    def[:name] = Symbol(:cu, f)
    def[:body] = replace_device(def[:body])
    push!(_cufuncs, f)
    quote
        $(esc(MacroTools.combinedef(def)))
        CuArrays.cufunc(::typeof($(esc(f)))) = $(esc(def[:name]))
    end
end
--------------------------------------------------------------------------------
/src/solver/CUSOLVER.jl:
--------------------------------------------------------------------------------
module CUSOLVER

import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL
import CUDAapi

using ..CuArrays
using ..CuArrays: libcusolver, active_context, _getindex, unsafe_free!
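# cuSOLVER exposes separate dense and sparse sub-APIs; one handle of each kind
# is cached per CUDA context (see dense_handle() and sparse_handle() below).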

using LinearAlgebra
using SparseArrays

import Base.one
import Base.zero
import CuArrays.CUSPARSE.CuSparseMatrixCSR
import CuArrays.CUSPARSE.CuSparseMatrixCSC
import CuArrays.CUSPARSE.cusparseMatDescr_t

include("libcusolver_types.jl")
include("error.jl")

const _dense_handles = Dict{CuContext,cusolverDnHandle_t}()
const _dense_handle = Ref{cusolverDnHandle_t}(C_NULL)
const _sparse_handles = Dict{CuContext,cusolverSpHandle_t}()
const _sparse_handle = Ref{cusolverSpHandle_t}(C_NULL)

function dense_handle()
    if _dense_handle[] == C_NULL
        @assert isassigned(active_context) # some other call should have initialized CUDA
        _dense_handle[] = get!(_dense_handles, active_context[]) do
            context = active_context[]
            handle = cusolverDnCreate()
            atexit(()->CUDAdrv.isvalid(context) && cusolverDnDestroy(handle))
            handle
        end
    end
    return _dense_handle[]
end

function sparse_handle()
    if _sparse_handle[] == C_NULL
        @assert isassigned(active_context) # some other call should have initialized CUDA
        _sparse_handle[] = get!(_sparse_handles, active_context[]) do
            context = active_context[]
            handle = cusolverSpCreate()
            atexit(()->CUDAdrv.isvalid(context) && cusolverSpDestroy(handle))
            handle
        end
    end
    return _sparse_handle[]
end

include("libcusolver.jl")
include("sparse.jl")
include("dense.jl")
include("highlevel.jl")

version() = VersionNumber(cusolverGetProperty(CUDAapi.MAJOR_VERSION),
                          cusolverGetProperty(CUDAapi.MINOR_VERSION),
                          cusolverGetProperty(CUDAapi.PATCH_LEVEL))

end
--------------------------------------------------------------------------------
/src/rand/error.jl:
--------------------------------------------------------------------------------
export CURANDError

struct CURANDError <: Exception
    code::curandStatus_t
    msg::AbstractString
end
Base.show(io::IO, err::CURANDError) = print(io, "CURANDError(code $(err.code), $(err.msg))")

function CURANDError(code::curandStatus_t)
    msg = status_message(code)
    return CURANDError(code, msg)
end

function status_message(status)
    if status == CURAND_STATUS_SUCCESS
        return "generator was created successfully"
    elseif status == CURAND_STATUS_VERSION_MISMATCH
        return "Header file and linked library version do not match"
    elseif status == CURAND_STATUS_NOT_INITIALIZED
        return "Generator not initialized"
    elseif status == CURAND_STATUS_ALLOCATION_FAILED
        return "Memory allocation failed"
    elseif status == CURAND_STATUS_TYPE_ERROR
        return "Generator is wrong type"
    elseif status == CURAND_STATUS_OUT_OF_RANGE
        return "Argument out of range"
    elseif status == CURAND_STATUS_LENGTH_NOT_MULTIPLE
        return "Length requested is not a multiple of dimension"
    elseif status == CURAND_STATUS_DOUBLE_PRECISION_REQUIRED
        return "GPU does not have double precision required by MRG32k3a"
    elseif status == CURAND_STATUS_LAUNCH_FAILURE
        return "Kernel launch failure"
    elseif status == CURAND_STATUS_PREEXISTING_FAILURE
        return "Preexisting failure on library entry"
    elseif status == CURAND_STATUS_INITIALIZATION_FAILED
        return "Initialization of CUDA failed"
    elseif status == CURAND_STATUS_ARCH_MISMATCH
        return "Architecture mismatch, GPU does not support requested feature"
    elseif status == CURAND_STATUS_INTERNAL_ERROR
        return "Internal library error"
    else
        return "unknown status"
    end
end

macro check(func)
    quote
        local err::curandStatus_t
        err = $(esc(func::Expr))
        if err != CURAND_STATUS_SUCCESS
            throw(CURANDError(err))
        end
        err
    end
end
--------------------------------------------------------------------------------
/src/blas/README.md:
--------------------------------------------------------------------------------
# CUBLAS implementation progress

The following sections list the CUBLAS functions shown on the CUBLAS
documentation page:

http://docs.nvidia.com/cuda/cublas/index.html

## Level 1 (13 functions)

CUBLAS functions:

* [x] amax
* [x] amin
* [x] asum
* [x] axpy
* [x] copy
* [x] dot, dotc, dotu
* [x] nrm2
* [ ] rot (not implemented in julia blas.jl)
* [ ] rotg (not implemented in julia blas.jl)
* [ ] rotm (not implemented in julia blas.jl)
* [ ] rotmg (not implemented in julia blas.jl)
* [x] scal
* [ ] swap (not implemented in julia blas.jl)

## Level 2

Key:
* `ge`: general
* `gb`: general banded
* `sy`: symmetric
* `sb`: symmetric banded
* `sp`: symmetric packed
* `tr`: triangular
* `tb`: triangular banded
* `tp`: triangular packed
* `he`: hermitian
* `hb`: hermitian banded
* `hp`: hermitian packed

CUBLAS functions:

* [x] gbmv (in julia/blas.jl)
* [x] gemv (in julia/blas.jl)
* [x] ger (in julia/blas.jl)
* [x] sbmv (in julia/blas.jl)
* [ ] spmv
* [ ] spr
* [ ] spr2
* [x] symv (in julia/blas.jl)
* [x] syr (in julia/blas.jl)
* [ ] syr2
* [x] tbmv
* [x] tbsv
* [ ] tpmv
* [ ] tpsv
* [x] trmv (in julia/blas.jl)
* [x] trsv (in julia/blas.jl)
* [x] hemv (in julia/blas.jl)
* [x] hbmv
* [ ] hpmv
* [x] her (in julia/blas.jl)
* [x] her2
* [ ] hpr
* [ ] hpr2

## Level 3

CUBLAS functions:

* [x] gemm (in julia/blas.jl)
* [x] gemmBatched
* [x] symm (in julia/blas.jl)
* [x] syrk (in julia/blas.jl)
* [x] syr2k (in julia/blas.jl)
* [ ] syrkx
* [x] trmm (in julia/blas.jl)
* [x] trsm (in julia/blas.jl)
* [x] trsmBatched
* [x] hemm
* [x] herk (in julia/blas.jl)
* [x] her2k (in julia/blas.jl)
* [ ] herkx

## BLAS-like extensions

* [x] geam
* [x] dgmm
* [x] getrfBatched
* [x] getriBatched
* [x] geqrfBatched
* [x] gelsBatched
* [ ] tpttr
* [ ] trttp
--------------------------------------------------------------------------------
/src/blas/CUBLAS.jl:
--------------------------------------------------------------------------------
module CUBLAS

import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL, devices
import CUDAapi

using ..CuArrays
using ..CuArrays: libcublas, active_context, unsafe_free!
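# Wrapped routines (see README.md above for coverage) follow the Base BLAS
# calling convention, e.g. (illustrative) CUBLAS.gemm!('N', 'N', alpha, A, B,
# beta, C), and are also reached through LinearAlgebra methods like mul!.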
using LinearAlgebra

include("libcublas_types.jl")
include("error.jl")

const _handles = Dict{CuContext,cublasHandle_t}()
const _xt_handles = Dict{CuContext,cublasXtHandle_t}()
const _handle = Ref{cublasHandle_t}(C_NULL)
const _xt_handle = Ref{cublasXtHandle_t}(C_NULL)

function handle()
    if _handle[] == C_NULL
        @assert isassigned(active_context) # some other call should have initialized CUDA
        _handle[] = get!(_handles, active_context[]) do
            context = active_context[]
            handle = cublasCreate_v2()

            # enable tensor math mode if our device supports it, and fast math is enabled
            dev = CUDAdrv.device(context)
            if Base.JLOptions().fast_math == 1 && CUDAdrv.capability(dev) >= v"7.0"
                cublasSetMathMode(CUBLAS_TENSOR_OP_MATH, handle)
            end

            atexit(()->CUDAdrv.isvalid(context) && cublasDestroy_v2(handle))
            handle
        end
    end

    return _handle[]
end

function xt_handle()
    if _xt_handle[] == C_NULL
        @assert isassigned(active_context) # some other call should have initialized CUDA
        _xt_handle[] = get!(_xt_handles, active_context[]) do
            context = active_context[]
            handle = cublasXtCreate()
            devs = convert.(Cint, CUDAdrv.devices())
            cublasXtDeviceSelect(handle, length(devs), devs)
            atexit(()->CUDAdrv.isvalid(context) && cublasXtDestroy(handle))
            handle
        end
    end
    return _xt_handle[]
end

include("libcublas.jl")
include("util.jl")
include("wrappers.jl")
include("highlevel.jl")

version() = VersionNumber(cublasGetProperty(CUDAapi.MAJOR_VERSION),
                          cublasGetProperty(CUDAapi.MINOR_VERSION),
                          cublasGetProperty(CUDAapi.PATCH_LEVEL))

end
--------------------------------------------------------------------------------
/src/utils.jl:
--------------------------------------------------------------------------------
using Base.Cartesian

function cudims(n::Integer)
    threads = min(n, 256)
    ceil(Int, n / threads), threads
end

cudims(a::AbstractArray) = cudims(length(a))

@inline ind2sub_(a::AbstractArray{T,0}, i) where T = ()
@inline ind2sub_(a, i) = Tuple(CartesianIndices(a)[i])

macro cuindex(A)
    quote
        A = $(esc(A))
        i = (blockIdx().x-1) * blockDim().x + threadIdx().x
        i > length(A) && return
        ind2sub_(A, i)
    end
end


@generated function nindex(i::T, ls::NTuple{N,T}) where {N,T}
    na = one(i)
    quote
        Base.@_inline_meta
        $(foldr((n, els) -> :(i ≤ ls[$n] ? ($n, i) : (i -= ls[$n]; $els)), one(i):i(N); init=:($na, $na)))
    end
end

@inline function catindex(dim, I::NTuple{N}, shapes) where N
    @inbounds x, i = nindex(I[dim], getindex.(shapes, dim))
    x, ntuple(n -> n == dim ? i : I[n], Val(N))
end

function growdims(dim, x)
    if ndims(x) >= dim
        x
    else
        reshape(x, size.((x,), 1:dim)...)
    end
end

function _cat(dim, dest, xs...)
    function kernel(dim, dest, xs)
        I = @cuindex dest
        @inbounds n, I′ = catindex(dim, Int.(I), size.(xs))
        @inbounds dest[I...] = xs[n][I′...]
        return
    end
    xs = growdims.(dim, xs)
    blk, thr = cudims(dest)
    @cuda blocks=blk threads=thr kernel(dim, dest, xs)
    return dest
end

function Base.cat_t(dims::Integer, T::Type, x::CuArray, xs::CuArray...)
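    # compute the concatenated shape, allocate the result on the GPU, and fill
    # it with the _cat kernel defined above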
    catdims = Base.dims2cat(dims)
    shape = Base.cat_shape(catdims, (), size.((x, xs...))...)
    dest = Base.cat_similar(x, T, shape)
    _cat(dims, dest, x, xs...)
end

Base.vcat(xs::CuArray...) = cat(xs..., dims=1)
Base.hcat(xs::CuArray...) = cat(xs..., dims=2)


"""
    @sync ex

Run expression `ex` and synchronize the GPU afterwards. This is a CPU-friendly
synchronization, i.e. it performs a blocking synchronization without increasing CPU load. As
such, this operation is preferred over implicit synchronization (e.g. when performing a
memory copy) for high-performance applications.

It is also useful for timing code that executes asynchronously.
"""
macro sync(ex)
    quote
        local e = CuEvent(CUDAdrv.EVENT_BLOCKING_SYNC | CUDAdrv.EVENT_DISABLE_TIMING)
        local ret = $(esc(ex))
        CUDAdrv.record(e)
        CUDAdrv.synchronize(e)
        ret
    end
end
--------------------------------------------------------------------------------
/src/gpuarray_interface.jl:
--------------------------------------------------------------------------------
import GPUArrays

struct CuArrayBackend <: GPUArrays.GPUBackend end
GPUArrays.backend(::Type{<:CuArray}) = CuArrayBackend()


# Abstract GPU interface
struct CuKernelState end

@inline function GPUArrays.LocalMemory(::CuKernelState, ::Type{T}, ::Val{N}, ::Val{id}) where {T, N, id}
    ptr = CUDAnative._shmem(Val(id), T, Val(N))
    CuDeviceArray(N, DevicePtr{T, CUDAnative.AS.Shared}(ptr))
end

GPUArrays.AbstractDeviceArray(A::CUDAnative.CuDeviceArray, shape) = CUDAnative.CuDeviceArray(shape, pointer(A))

@inline GPUArrays.synchronize_threads(::CuKernelState) = CUDAnative.sync_threads()

GPUArrays.blas_module(::CuArray) = CuArrays.CUBLAS
GPUArrays.blasbuffer(x::CuArray) = x

"""
Blocks until all operations are finished on `A`
"""
GPUArrays.synchronize(A::CuArray) =
    CUDAdrv.synchronize()

for (i, sym) in enumerate((:x, :y, :z))
    for (f, fcu) in (
            (:blockidx, :blockIdx),
            (:blockdim, :blockDim),
            (:threadidx, :threadIdx),
            (:griddim, :gridDim)
        )
        fname = Symbol(string(f, '_', sym))
        cufun = Symbol(string(fcu, '_', sym))
        @eval GPUArrays.$fname(::CuKernelState) = CUDAnative.$cufun()
    end
end

# devices() = CUDAdrv.devices()
GPUArrays.device(A::CuArray) = CUDAdrv.device(CUDAdrv.CuCurrentContext())
GPUArrays.is_gpu(dev::CUDAdrv.CuDevice) = true
GPUArrays.name(dev::CUDAdrv.CuDevice) = string("CU ", CUDAdrv.name(dev))
GPUArrays.threads(dev::CUDAdrv.CuDevice) = CUDAdrv.attribute(dev, CUDAdrv.MAX_THREADS_PER_BLOCK)

GPUArrays.blocks(dev::CUDAdrv.CuDevice) =
    (CUDAdrv.attribute(dev, CUDAdrv.MAX_BLOCK_DIM_X),
     CUDAdrv.attribute(dev, CUDAdrv.MAX_BLOCK_DIM_Y),
     CUDAdrv.attribute(dev, CUDAdrv.MAX_BLOCK_DIM_Z))

GPUArrays.free_global_memory(dev::CUDAdrv.CuDevice) = CUDAdrv.Mem.info()[1]
GPUArrays.global_memory(dev::CUDAdrv.CuDevice) = CUDAdrv.totalmem(dev)
GPUArrays.local_memory(dev::CUDAdrv.CuDevice) = CUDAdrv.attribute(dev, CUDAdrv.TOTAL_CONSTANT_MEMORY)

function GPUArrays._gpu_call(::CuArrayBackend, f, A, args::Tuple,
                             blocks_threads::Tuple{T, T}) where {N, T <: NTuple{N, Integer}}
    blk, thr = blocks_threads
    @cuda blocks=blk threads=thr f(CuKernelState(), args...)
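    # the launch is asynchronous; callers synchronize explicitly when they need
    # the results (cf. GPUArrays.synchronize above)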
end

# GPUArrays' reinterpret and reshape implementations use this
GPUArrays.unsafe_reinterpret(::Type{T}, A::CuArray, size::NTuple{N, Integer}) where {T, N} =
    CuArray{T, N}(A.buf, size)
--------------------------------------------------------------------------------
/src/fft/error.jl:
--------------------------------------------------------------------------------
export CUFFTError

struct CUFFTError <: Exception
    code::cufftStatus_t
    msg::AbstractString
end
Base.show(io::IO, err::CUFFTError) = print(io, "CUFFTError(code $(err.code), $(err.msg))")

function CUFFTError(code::cufftStatus_t)
    msg = status_message(code)
    return CUFFTError(code, msg)
end

function status_message(status)
    if status == CUFFT_STATUS_SUCCESS
        return "the operation completed successfully"
    elseif status == CUFFT_STATUS_INVALID_PLAN
        return "cuFFT was passed an invalid plan handle"
    elseif status == CUFFT_STATUS_ALLOC_FAILED
        return "cuFFT failed to allocate GPU or CPU memory"
    elseif status == CUFFT_STATUS_INVALID_TYPE
        return "cuFFT invalid type" # No longer used
    elseif status == CUFFT_STATUS_INVALID_VALUE
        return "User specified an invalid pointer or parameter"
    elseif status == CUFFT_STATUS_INTERNAL_ERROR
        return "Driver or internal cuFFT library error"
    elseif status == CUFFT_STATUS_EXEC_FAILED
        return "Failed to execute an FFT on the GPU"
    elseif status == CUFFT_STATUS_SETUP_FAILED
        return "The cuFFT library failed to initialize"
    elseif status == CUFFT_STATUS_INVALID_SIZE
        return "User specified an invalid transform size"
    elseif status == CUFFT_STATUS_UNALIGNED_DATA
        return "cuFFT unaligned data" # No longer used
    elseif status == CUFFT_STATUS_INCOMPLETE_PARAMETER_LIST
        return "Missing parameters in call"
    elseif status == CUFFT_STATUS_INVALID_DEVICE
        return "Execution of a plan was on different GPU than plan creation"
    elseif status == CUFFT_STATUS_PARSE_ERROR
        return "Internal plan database error"
    elseif status == CUFFT_STATUS_NO_WORKSPACE
        return "No workspace has been provided prior to plan execution"
    elseif status == CUFFT_STATUS_NOT_IMPLEMENTED
        return "Function does not implement functionality for parameters given."
    elseif status == CUFFT_STATUS_LICENSE_ERROR
        return "cuFFT license error" # Used in previous versions.
    elseif status == CUFFT_STATUS_NOT_SUPPORTED
        return "Operation is not supported for parameters given."
49 | else 50 | return "unknown status" 51 | end 52 | end 53 | 54 | macro check(fft_func) 55 | quote 56 | local err::cufftStatus_t 57 | err = $(esc(fft_func::Expr)) 58 | if err != CUFFT_STATUS_SUCCESS 59 | throw(CUFFTError(err)) 60 | end 61 | err 62 | end 63 | end 64 | -------------------------------------------------------------------------------- /src/rand/libcurand_types.jl: -------------------------------------------------------------------------------- 1 | const curandGenerator_t = Ptr{Cvoid} 2 | 3 | mutable struct RNG <: Random.AbstractRNG 4 | ptr::curandGenerator_t 5 | typ::Int 6 | end 7 | 8 | Base.unsafe_convert(::Type{curandGenerator_t}, rng::RNG) = rng.ptr 9 | 10 | 11 | const curandDiscreteDistribution_t = Ptr{Cvoid} 12 | 13 | mutable struct DiscreteDistribution 14 | ptr::curandDiscreteDistribution_t 15 | end 16 | 17 | Base.unsafe_convert(::Type{curandDiscreteDistribution_t}, dist::DiscreteDistribution) = dist.ptr 18 | 19 | 20 | # CURAND status codes 21 | const curandStatus_t = UInt32 22 | const CURAND_STATUS_SUCCESS = 0 23 | const CURAND_STATUS_VERSION_MISMATCH = 100 24 | const CURAND_STATUS_NOT_INITIALIZED = 101 25 | const CURAND_STATUS_ALLOCATION_FAILED = 102 26 | const CURAND_STATUS_TYPE_ERROR = 103 27 | const CURAND_STATUS_OUT_OF_RANGE = 104 28 | const CURAND_STATUS_LENGTH_NOT_MULTIPLE = 105 29 | const CURAND_STATUS_DOUBLE_PRECISION_REQUIRED = 106 30 | const CURAND_STATUS_LAUNCH_FAILURE = 201 31 | const CURAND_STATUS_PREEXISTING_FAILURE = 202 32 | const CURAND_STATUS_INITIALIZATION_FAILED = 203 33 | const CURAND_STATUS_ARCH_MISMATCH = 204 34 | const CURAND_STATUS_INTERNAL_ERROR = 999 35 | 36 | # CURAND RNG types (curandRngType) 37 | const CURAND_RNG_TEST = 0 38 | const CURAND_RNG_PSEUDO_DEFAULT = 100 39 | const CURAND_RNG_PSEUDO_XORWOW = 101 40 | const CURAND_RNG_PSEUDO_MRG32K3A = 121 41 | const CURAND_RNG_PSEUDO_MTGP32 = 141 42 | const CURAND_RNG_PSEUDO_MT19937 = 142 43 | const CURAND_RNG_PSEUDO_PHILOX4_32_10 = 161 44 | const CURAND_RNG_QUASI_DEFAULT = 200 45 | const CURAND_RNG_QUASI_SOBOL32 = 201 46 | const CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 = 202 47 | const CURAND_RNG_QUASI_SOBOL64 = 203 48 | const CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 = 204 49 | 50 | # CURAND ordering of results in memory 51 | const CURAND_ORDERING_PSEUDO_BEST = 100 52 | const CURAND_ORDERING_PSEUDO_DEFAULT = 101 53 | const CURAND_ORDERING_PSEUDO_SEEDED = 102 54 | const CURAND_ORDERING_QUASI_DEFAULT = 201 55 | 56 | # CURAND choice of direction vector set 57 | const CURAND_DIRECTION_VECTORS_32_JOEKUO6 = 101 58 | const CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 = 102 59 | const CURAND_DIRECTION_VECTORS_64_JOEKUO6 = 103 60 | const CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 = 104 61 | 62 | # CURAND method 63 | const CURAND_CHOOSE_BEST = 0 64 | const CURAND_ITR = 1 65 | const CURAND_KNUTH = 2 66 | const CURAND_HITR = 3 67 | const CURAND_M1 = 4 68 | const CURAND_M2 = 5 69 | const CURAND_BINARY_SEARCH = 6 70 | const CURAND_DISCRETE_GAUSS = 7 71 | const CURAND_REJECTION = 8 72 | const CURAND_DEVICE_API = 9 73 | const CURAND_FAST_REJECTION = 10 74 | const CURAND_3RD = 11 75 | const CURAND_DEFINITION = 12 76 | const CURAND_POISSON = 13 77 | -------------------------------------------------------------------------------- /deps/build.jl: -------------------------------------------------------------------------------- 1 | using CUDAapi 2 | 3 | 4 | ## auxiliary routines 5 | 6 | status = 0 7 | function build_warning(reason) 8 | println("$reason.") 9 | global status 10 | status = 1 11 | # NOTE: it's annoying that we 
have to `exit(1)`, but otherwise messages are hidden 12 | end 13 | 14 | function build_error(reason) 15 | println(reason) 16 | exit(1) 17 | end 18 | 19 | 20 | ## main 21 | 22 | config_path = joinpath(@__DIR__, "ext.jl") 23 | const previous_config_path = config_path * ".bak" 24 | 25 | function write_ext(config) 26 | open(config_path, "w") do io 27 | println(io, "# autogenerated file, do not edit") 28 | for (key,val) in config 29 | println(io, "const $key = $(repr(val))") 30 | end 31 | end 32 | end 33 | 34 | function main() 35 | ispath(config_path) && mv(config_path, previous_config_path; force=true) 36 | config = Dict{Symbol,Any}(:configured => false) 37 | write_ext(config) 38 | 39 | 40 | ## discover stuff 41 | 42 | toolkit = find_toolkit() 43 | 44 | # required libraries that are part of the CUDA toolkit 45 | for name in ("cublas", "cusparse", "cusolver", "cufft", "curand") 46 | lib = Symbol("lib$name") 47 | config[lib] = find_cuda_library(name, toolkit) 48 | if config[lib] == nothing 49 | build_error("Could not find library '$name' (it should be part of the CUDA toolkit)") 50 | end 51 | end 52 | 53 | # optional libraries 54 | for name in ("cudnn", ) 55 | lib = Symbol("lib$name") 56 | config[lib] = find_cuda_library(name, toolkit) 57 | if config[lib] == nothing 58 | build_warning("Could not find optional library '$name'") 59 | end 60 | end 61 | 62 | 63 | ## (re)generate ext.jl 64 | 65 | function globals(mod) 66 | all_names = names(mod, all=true) 67 | filter(name-> !any(name .== [nameof(mod), Symbol("#eval"), :eval]), all_names) 68 | end 69 | 70 | if isfile(previous_config_path) 71 | @eval module Previous; include($previous_config_path); end 72 | previous_config = Dict{Symbol,Any}(name => getfield(Previous, name) 73 | for name in globals(Previous)) 74 | 75 | if config == previous_config 76 | mv(previous_config_path, config_path; force=true) 77 | return 78 | end 79 | end 80 | 81 | config[:configured] = true 82 | write_ext(config) 83 | 84 | if status != 0 85 | # we got here, so the status is non-fatal 86 | build_error(""" 87 | 88 | CuArrays.jl has been built successfully, but there were warnings. 
89 | Some functionality may be unavailable.""") 90 | end 91 | end 92 | 93 | main() 94 | -------------------------------------------------------------------------------- /src/solver/libcusolver_types.jl: -------------------------------------------------------------------------------- 1 | import ..CUBLAS: cublasfill, cublasop, cublasside, cublasFillMode_t, cublasOperation_t, cublasSideMode_t 2 | 3 | #enum cusolverStatus_t 4 | #error messages from CUSOLVER 5 | 6 | const cusolverStatus_t = UInt32 7 | const CUSOLVER_STATUS_SUCCESS = 0 8 | const CUSOLVER_STATUS_NOT_INITIALIZED = 1 9 | const CUSOLVER_STATUS_ALLOC_FAILED = 2 10 | const CUSOLVER_STATUS_INVALID_VALUE = 3 11 | const CUSOLVER_STATUS_ARCH_MISMATCH = 4 12 | const CUSOLVER_STATUS_EXECUTION_FAILED = 5 13 | const CUSOLVER_STATUS_INTERNAL_ERROR = 6 14 | const CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 7 15 | 16 | const csrqrInfo_t = Ptr{Nothing} 17 | const gesvdjInfo_t = Ptr{Cvoid} 18 | const syevjInfo_t = Ptr{Cvoid} 19 | 20 | const cusolverEigMode_t = UInt32 21 | const CUSOLVER_EIG_MODE_NOVECTOR = 0 22 | const CUSOLVER_EIG_MODE_VECTOR = 1 23 | 24 | const cusolverEigType_t = UInt32 25 | const CUSOLVER_EIG_TYPE_1 = 1 26 | const CUSOLVER_EIG_TYPE_2 = 2 27 | const CUSOLVER_EIG_TYPE_3 = 3 28 | 29 | # refactorization types 30 | 31 | const cusolverRfNumericBoostReport_t = UInt32 32 | const CUSOLVER_NUMERIC_BOOST_NOT_USED = 0 33 | const CUSOLVER_NUMERIC_BOOST_USED = 1 34 | 35 | const cusolverRfResetValuesFastMode_t = UInt32 36 | const CUSOLVER_RESET_VALUES_FAST_MODE_OFF = 0 37 | const CUSOLVER_RESET_VALUES_FAST_MODE_ON = 1 38 | 39 | const cusolverRfFactorization_t = UInt32 40 | const CUSOLVER_FACTORIZATION_ALG0 = 0 41 | const CUSOLVER_FACTORIZATION_ALG1 = 1 42 | const CUSOLVER_FACTORIZATION_ALG2 = 2 43 | 44 | const cusolverRfTriangularSolve_t = UInt32 45 | const CUSOLVER_TRIANGULAR_SOLVE_ALG0 = 0 46 | const CUSOLVER_TRIANGULAR_SOLVE_ALG1 = 1 47 | const CUSOLVER_TRIANGULAR_SOLVE_ALG2 = 2 48 | const CUSOLVER_TRIANGULAR_SOLVE_ALG3 = 3 49 | 50 | const cusolverRfUnitDiagonal_t = UInt32 51 | const CUSOLVER_UNIT_DIAGONAL_STORED_L = 0 52 | const CUSOLVER_UNIT_DIAGONAL_STORED_U = 1 53 | const CUSOLVER_UNIT_DIAGONAL_ASSUMED_L = 2 54 | const CUSOLVER_UNIT_DIAGONAL_ASSUMED_U = 3 55 | 56 | const cusolverDnContext = Nothing 57 | const cusolverDnHandle_t = Ptr{cusolverDnContext} 58 | const cusolverSpContext = Nothing 59 | const cusolverSpHandle_t = Ptr{cusolverSpContext} 60 | const cusolverRfContext = Nothing 61 | const cusolverRfHandle_t = Ptr{cusolverRfContext} 62 | 63 | #complex numbers 64 | 65 | const cuComplex = Complex{Float32} 66 | const cuDoubleComplex = Complex{Float64} 67 | 68 | const CusolverFloat = Union{Float64,Float32,ComplexF64,ComplexF32} 69 | const CusolverReal = Union{Float64,Float32} 70 | const CusolverComplex = Union{ComplexF64,ComplexF32} 71 | -------------------------------------------------------------------------------- /src/fft/fft.jl: -------------------------------------------------------------------------------- 1 | # K is a flag for forward/backward 2 | # also used as an alias for r2c/c2r 3 | 4 | abstract type CuFFTPlan{T<:cufftNumber, K, inplace} <: Plan{T} end 5 | 6 | mutable struct cCuFFTPlan{T<:cufftNumber,K,inplace,N} <: CuFFTPlan{T,K,inplace} 7 | plan::cufftHandle_t 8 | sz::NTuple{N,Int} # Julia size of input array 9 | osz::NTuple{N,Int} # Julia size of output array 10 | xtype::Int 11 | region::Any 12 | pinv::ScaledPlan # required by AbstractFFT API 13 | 14 | function cCuFFTPlan{T,K,inplace,N}(plan::cufftHandle_t, 
X::CuArray{T,N}, 15 | sizey::Tuple, region, xtype::Integer 16 | ) where {T<:cufftNumber,K,inplace,N} 17 | # maybe enforce consistency of sizey 18 | p = new(plan, size(X), sizey, xtype, region) 19 | finalizer(destroy_plan, p) 20 | p 21 | end 22 | end 23 | 24 | cCuFFTPlan(plan,X,region,xtype::Integer) = cCuFFTPlan(plan,X,size(X),region,xtype) 25 | 26 | mutable struct rCuFFTPlan{T<:cufftNumber,K,inplace,N} <: CuFFTPlan{T,K,inplace} 27 | plan::cufftHandle_t 28 | sz::NTuple{N,Int} # Julia size of input array 29 | osz::NTuple{N,Int} # Julia size of output array 30 | xtype::Int 31 | region::Any 32 | pinv::ScaledPlan # required by AbstractFFT API 33 | 34 | function rCuFFTPlan{T,K,inplace,N}(plan::cufftHandle_t, X::CuArray{T,N}, 35 | sizey::Tuple, region, xtype::Integer 36 | ) where {T<:cufftNumber,K,inplace,N} 37 | # maybe enforce consistency of sizey 38 | p = new(plan, size(X), sizey, xtype, region) 39 | finalizer(destroy_plan, p) 40 | p 41 | end 42 | end 43 | 44 | rCuFFTPlan(plan,X,region,xtype::Integer) = rCuFFTPlan(plan,X,size(X),region,xtype) 45 | 46 | const xtypenames = Dict{cufftType,String}(CUFFT_R2C => "real-to-complex", 47 | CUFFT_C2R => "complex-to-real", 48 | CUFFT_C2C => "complex", 49 | CUFFT_D2Z => "d.p. real-to-complex", 50 | CUFFT_Z2D => "d.p. complex-to-real", 51 | CUFFT_Z2Z => "d.p. complex") 52 | 53 | function showfftdims(io, sz, T) 54 | if isempty(sz) 55 | print(io,"0-dimensional") 56 | elseif length(sz) == 1 57 | print(io, sz[1], "-element") 58 | else 59 | print(io, join(sz, "×")) 60 | end 61 | print(io, " CuArray of ", T) 62 | end 63 | 64 | function show(io::IO, p::CuFFTPlan{T,K,inplace}) where {T,K,inplace} 65 | print(io, inplace ? "CUFFT in-place " : "CUFFT ", 66 | xtypenames[p.xtype], 67 | K == CUFFT_FORWARD ? " forward" : " backward", 68 | " plan for ") 69 | showfftdims(io, p.sz, T) 70 | end 71 | -------------------------------------------------------------------------------- /src/fft/libcufft_types.jl: -------------------------------------------------------------------------------- 1 | # CUFFT API function return values 2 | const cufftStatus_t = UInt32 3 | const CUFFT_STATUS_SUCCESS = 0 # The cuFFT operation was successful 4 | const CUFFT_STATUS_INVALID_PLAN = 1 # cuFFT was passed an invalid plan handle 5 | const CUFFT_STATUS_ALLOC_FAILED = 2 # cuFFT failed to allocate GPU or CPU memory 6 | const CUFFT_STATUS_INVALID_TYPE = 3 # No longer used 7 | const CUFFT_STATUS_INVALID_VALUE = 4 # User specified an invalid pointer or parameter 8 | const CUFFT_STATUS_INTERNAL_ERROR = 5 # Driver or internal cuFFT library error 9 | const CUFFT_STATUS_EXEC_FAILED = 6 # Failed to execute an FFT on the GPU 10 | const CUFFT_STATUS_SETUP_FAILED = 7 # The cuFFT library failed to initialize 11 | const CUFFT_STATUS_INVALID_SIZE = 8 # User specified an invalid transform size 12 | const CUFFT_STATUS_UNALIGNED_DATA = 9 # No longer used 13 | const CUFFT_STATUS_INCOMPLETE_PARAMETER_LIST = 10 # Missing parameters in call 14 | const CUFFT_STATUS_INVALID_DEVICE = 11 # Execution of a plan was on different GPU than plan creation 15 | const CUFFT_STATUS_PARSE_ERROR = 12 # Internal plan database error 16 | const CUFFT_STATUS_NO_WORKSPACE = 13 # No workspace has been provided prior to plan execution 17 | const CUFFT_STATUS_NOT_IMPLEMENTED = 14 # Function does not implement functionality for parameters given. 18 | const CUFFT_STATUS_LICENSE_ERROR = 15 # Used in previous versions. 19 | const CUFFT_STATUS_NOT_SUPPORTED = 16 # Operation is not supported for parameters given. 
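# How these status codes surface to users: every libcufft ccall in this package
# is wrapped in the `@check` macro from fft/error.jl, which turns any
# non-success status into a thrown `CUFFTError`. A minimal sketch of that flow,
# using a hypothetical failing status purely for illustration:
#
#   st = CUFFT_STATUS_ALLOC_FAILED
#   st == CUFFT_STATUS_SUCCESS || throw(CUFFTError(st))
#   # ERROR: CUFFTError(code 2, cuFFT failed to allocate GPU or CPU memory)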
20 |
21 |
22 | const cufftReal = Float32
23 | const cufftDoubleReal = Float64
24 |
25 | const cufftComplex = ComplexF32
26 | const cufftDoubleComplex = ComplexF64
27 |
28 | # CUFFT transform directions
29 | const CUFFT_FORWARD = -1 # Forward FFT
30 | const CUFFT_INVERSE = 1 # Inverse FFT
31 |
32 | # CUFFT supports the following transform types
33 | const cufftType = Cint
34 | const CUFFT_R2C = 0x2a # Real to Complex
35 | const CUFFT_C2R = 0x2c # Complex to Real
36 | const CUFFT_C2C = 0x29 # Complex to Complex
37 | const CUFFT_D2Z = 0x6a # Double to Double-Complex
38 | const CUFFT_Z2D = 0x6c # Double-Complex to Double
39 | const CUFFT_Z2Z = 0x69 # Double-Complex to Double-Complex
40 |
41 | const cufftCompatibility = Cint
42 | const CUFFT_COMPATIBILITY_NATIVE = 0x00
43 | const CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01
44 | const CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC = 0x02
45 | const CUFFT_COMPATIBILITY_FFTW_ALL = 0x03
46 |
47 | const cufftHandle_t = Cint
48 |
49 | const cufftNumber = Union{cufftDoubleReal,cufftReal,cufftDoubleComplex,cufftComplex}
50 | # note trailing s to deconflict w/ header file
51 | const cufftReals = Union{cufftDoubleReal,cufftReal}
52 | const cufftComplexes = Union{cufftDoubleComplex,cufftComplex}
53 | const cufftDouble = Union{cufftDoubleReal,cufftDoubleComplex}
54 | const cufftSingle = Union{cufftReal,cufftComplex}
55 | const cufftTypeDouble = Union{Type{cufftDoubleReal},Type{cufftDoubleComplex}}
56 | const cufftTypeSingle = Union{Type{cufftReal},Type{cufftComplex}}
57 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CuArrays
2 |
3 | [![][docs-latest-img]][docs-latest-url] [![][codecov-img]][codecov-url]
4 |
5 | [codecov-img]: https://codecov.io/gh/JuliaGPU/CuArrays.jl/branch/master/graph/badge.svg
6 | [codecov-url]: https://codecov.io/gh/JuliaGPU/CuArrays.jl
7 |
8 | [docs-latest-img]: https://img.shields.io/badge/docs-latest-blue.svg
9 | [docs-latest-url]: https://juliagpu.gitlab.io/CuArrays.jl/
10 |
11 | CuArrays provides a fully-functional GPU array, which can give significant speedups over
12 | normal arrays without code changes. The arrays are implemented fully in Julia, making the
13 | implementation [elegant and extremely
14 | generic](http://mikeinnes.github.io/2017/08/24/cudanative.html).
15 |
16 | Documentation for this package is sparse, and for many of the array operations you should
17 | refer to the official Julia documentation. The following resources can be useful to get a
18 | better understanding of the characteristics and performance trade-offs that come with GPU
19 | arrays:
20 |
21 | - Introductory tutorial on [GPU programming with Julia](https://juliagpu.gitlab.io/CuArrays.jl/tutorials/generated/intro/)
22 | - Slide deck on [effectively using GPUs with Julia](https://docs.google.com/presentation/d/1l-BuAtyKgoVYakJSijaSqaTL3friESDyTOnU2OLqGoA/)
23 |
24 | ## Installation
25 |
26 | CuArrays should work **out-of-the-box** on Julia 1.0. You only need to have a
27 | proper set-up of CUDA, meaning the rest of the Julia CUDA stack should work
28 | (notably CUDAapi.jl, CUDAdrv.jl and CUDAnative.jl). If you encounter any issues
29 | with CuArrays.jl, please make sure those other packages are working as expected.
30 |
31 | Some parts of CuArrays.jl depend on **optional libraries**, such as
32 | [cuDNN](https://developer.nvidia.com/cudnn). The build process will notify you
33 | about missing dependencies; if it does, inspect the output of `Pkg.build("CuArrays")`
34 | to see whether your installation is complete.
35 |
36 |
37 | ## Features
38 |
39 | ```julia
40 | xs = cu(rand(5, 5))
41 | ys = cu[1, 2, 3]
42 | xs_cpu = collect(xs)
43 | ```
44 |
45 | Because `CuArray` is an `AbstractArray`, it doesn't have much of a learning curve; just use your favourite array ops as usual. The following are supported (on arbitrary numbers of arguments, dimensions, etc.):
46 |
47 | * Conversions and `copyto!` with CPU arrays
48 | * General indexing (`xs[1:2, 5, :]`)
49 | * `permutedims`
50 | * Concatenation (`vcat(x, y)`, `cat(xs, ys, zs; dims=3)`)
51 | * `map`, fused broadcast (`zs .= xs.^2 .+ ys .* 2`)
52 | * `fill!(xs, 0)`
53 | * Reduction over dimensions (`reduce(+, xs, dims=3)`, `sum(x -> x^2, xs, dims=1)`, etc.)
54 | * Reduction to scalar (`reduce(*, xs)`, `sum(xs)`, etc.)
55 | * Various BLAS operations (matrix\*matrix, matrix\*vector)
56 | * FFTs, using the AbstractFFTs API
57 |
58 | We welcome issues or PRs for functionality not on this list.
59 |
60 | Note that some operations not on this list will work, but be slow, due to Base's generic
61 | implementations. This is intentional, to enable a "make it work, then make it fast"
62 | workflow. When you're ready, you can disable the slow fallback methods:
63 |
64 | ```julia
65 | julia> CuArrays.allowscalar(false)
66 | julia> xs[5]
67 | ERROR: getindex is disabled
68 | ```
69 |
--------------------------------------------------------------------------------
/src/matmul.jl:
--------------------------------------------------------------------------------
1 | using LinearAlgebra
2 |
3 |
4 | function generic_matmatmul!(C::AbstractVecOrMat{R}, A::AbstractVecOrMat{T}, B::AbstractVecOrMat{S}) where {T,S,R}
5 |     if size(A,2) != size(B,1)
6 |         throw(DimensionMismatch("matrix A has dimensions $(size(A)), matrix B has dimensions $(size(B))"))
7 |     end
8 |     if size(C,1) != size(A,1) || size(C,2) != size(B,2)
9 |         throw(DimensionMismatch("result C has dimensions $(size(C)), needs $((size(A,1),size(B,2)))"))
10 |     end
11 |     if isempty(A) || isempty(B)
12 |         return fill!(C, zero(R))
13 |     end
14 |
15 |     function kernel(C, A, B)
16 |         i = (blockIdx().x-1) * blockDim().x + threadIdx().x
17 |         j = (blockIdx().y-1) * blockDim().y + threadIdx().y
18 |
19 |         if i <= size(A,1) && j <= size(B,2)
20 |             z2 = zero(A[i, 1]*B[1, j] + A[i, 1]*B[1, j])
21 |             Ctmp = convert(promote_type(R, typeof(z2)), z2)
22 |             for k in 1:size(A,2)
23 |                 Ctmp += A[i, k]*B[k, j]
24 |             end
25 |             C[i,j] = Ctmp
26 |         end
27 |
28 |         return
29 |     end
30 |
31 |     max_threads = 256
32 |     threads_x = min(max_threads, size(C,1))
33 |     threads_y = min(max_threads ÷ threads_x, size(C,2))
34 |     threads = (threads_x, threads_y)
35 |     blocks = ceil.(Int, (size(C,1), size(C,2)) ./ threads)
36 |
37 |     @cuda threads=threads blocks=blocks kernel(C, A, B)
38 |
39 |     C
40 | end
41 |
42 | LinearAlgebra.mul!(C::CuVecOrMat, A::CuVecOrMat, B::CuVecOrMat) = generic_matmatmul!(C, A, B)
43 | LinearAlgebra.mul!(C::CuVecOrMat, A::CuVecOrMat, B::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B)
44 | LinearAlgebra.mul!(C::CuVecOrMat, A::CuVecOrMat, B::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B)
45 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}, B::CuVecOrMat) = generic_matmatmul!(C, A, B)
46 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}, B::CuVecOrMat) = generic_matmatmul!(C, A, B)
47 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat},
B::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 48 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}, B::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 49 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}, B::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 50 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}, B::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 51 | 52 | 53 | function generic_rmul!(X::CuArray, s::Number) 54 | function kernel(X, s) 55 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 56 | @inbounds X[i] *= s 57 | return 58 | end 59 | @cuda blocks=length(X) kernel(X, s) 60 | X 61 | end 62 | 63 | LinearAlgebra.rmul!(A::CuArray, b::Number) = generic_rmul!(A, b) 64 | 65 | 66 | function generic_lmul!(s::Number, X::CuArray) 67 | function kernel(s, X) 68 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 69 | @inbounds X[i] = s*X[i] 70 | return 71 | end 72 | @cuda blocks=length(X) kernel(s, X) 73 | X 74 | end 75 | 76 | LinearAlgebra.lmul!(a::Number, B::CuArray) = generic_lmul!(a, B) 77 | -------------------------------------------------------------------------------- /src/CuArrays.jl: -------------------------------------------------------------------------------- 1 | module CuArrays 2 | 3 | using CUDAdrv, CUDAnative 4 | 5 | using GPUArrays 6 | 7 | export CuArray, CuVector, CuMatrix, CuVecOrMat, cu 8 | 9 | import LinearAlgebra 10 | 11 | using Adapt 12 | 13 | using Requires 14 | 15 | const ext = joinpath(dirname(@__DIR__), "deps", "ext.jl") 16 | isfile(ext) || error("CuArrays.jl has not been built, please run Pkg.build(\"CuArrays\").") 17 | include(ext) 18 | if !configured 19 | # default (non-functional) values for critical variables, 20 | # making it possible to _load_ the package at all times. 21 | const libcublas = nothing 22 | const libcusparse = nothing 23 | const libcusolver = nothing 24 | const libcufft = nothing 25 | const libcurand = nothing 26 | const libcudnn = nothing 27 | end 28 | 29 | include("memory.jl") 30 | include("array.jl") 31 | include("subarray.jl") 32 | include("utils.jl") 33 | include("indexing.jl") 34 | include("broadcast.jl") 35 | include("matmul.jl") 36 | include("mapreduce.jl") 37 | include("accumulate.jl") 38 | 39 | include("gpuarray_interface.jl") 40 | 41 | # many libraries need to be initialized per-device (per-context, really, but we assume users 42 | # of CuArrays and/or CUDAnative only use a single context), so keep track of the active one. 43 | const active_context = Ref{CuContext}() 44 | 45 | include("blas/CUBLAS.jl") 46 | include("sparse/CUSPARSE.jl") 47 | include("solver/CUSOLVER.jl") 48 | include("fft/CUFFT.jl") 49 | include("rand/CURAND.jl") 50 | libcudnn !== nothing && include("dnn/CUDNN.jl") 51 | 52 | include("nnlib.jl") 53 | 54 | include("deprecated.jl") 55 | 56 | function __init__() 57 | if !configured 58 | @warn("CuArrays.jl has not been successfully built, and will not work properly.") 59 | @warn("Please run Pkg.build(\"CuArrays\") and restart Julia.") 60 | return 61 | end 62 | 63 | function check_library(name, path) 64 | path === nothing && return 65 | if !ispath(path) 66 | error("$name library has changed. 
Please run Pkg.build(\"CuArrays\") and restart Julia.")
67 |         end
68 |     end
69 |     check_library("CUBLAS", libcublas)
70 |     check_library("CUSPARSE", libcusparse)
71 |     check_library("CUSOLVER", libcusolver)
72 |     check_library("CUFFT", libcufft)
73 |     check_library("CURAND", libcurand)
74 |     check_library("CUDNN", libcudnn)
75 |
76 |     # package integrations
77 |     @require ForwardDiff="f6369f11-7733-5829-9624-2563aa707210" include("forwarddiff.jl")
78 |
79 |     # update the active context when we switch devices
80 |     callback = (::CuDevice, ctx::CuContext) -> begin
81 |         active_context[] = ctx
82 |
83 |         # wipe the active handles
84 |         CUBLAS._handle[] = C_NULL
85 |         CUBLAS._xt_handle[] = C_NULL
86 |         CUSOLVER._dense_handle[] = C_NULL
87 |         CUSOLVER._sparse_handle[] = C_NULL
88 |         CUSPARSE._handle[] = C_NULL
89 |         CURAND._generator[] = nothing
90 |         isdefined(CuArrays, :CUDNN) && (CUDNN._handle[] = C_NULL)
91 |     end
92 |     push!(CUDAnative.device!_listeners, callback)
93 |
94 |     # a device might be active already
95 |     existing_ctx = CUDAdrv.CuCurrentContext()
96 |     if existing_ctx !== nothing
97 |         active_context[] = existing_ctx
98 |     end
99 |
100 |     __init_memory__()
101 |     __init_pool__()
102 | end
103 |
104 | end # module
105 |
--------------------------------------------------------------------------------
/src/fft/libcufft.jl:
--------------------------------------------------------------------------------
1 | # low-level wrappers of the CUFFT library
2 |
3 | import CUDAdrv: CuPtr, PtrOrCuPtr, CuStream_t
4 |
5 | cufftGetVersion() = ccall((:cufftGetVersion,libcufft), Cint, ())
6 |
7 | function cufftGetProperty(property::CUDAapi.libraryPropertyType)
8 |     value_ref = Ref{Cint}()
9 |     @check ccall((:cufftGetProperty, libcufft), cufftStatus_t,
10 |                  (Cint, Ptr{Cint}),
11 |                  property, value_ref)
12 |     value_ref[]
13 | end
14 |
15 | cufftDestroy(plan) = ccall((:cufftDestroy,libcufft), Nothing, (cufftHandle_t,), plan)
16 |
17 | function cufftPlan1d(plan, nx, type, batch)
18 |     @check ccall((:cufftPlan1d,libcufft),cufftStatus_t,
19 |                  (Ptr{cufftHandle_t}, Cint, cufftType, Cint),
20 |                  plan, nx, type, batch)
21 | end
22 |
23 | function cufftPlan2d(plan, nx, ny, type)
24 |     @check ccall((:cufftPlan2d,libcufft),cufftStatus_t,
25 |                  (Ptr{cufftHandle_t}, Cint, Cint, cufftType),
26 |                  plan, nx, ny, type)
27 | end
28 |
29 | function cufftPlan3d(plan, nx, ny, nz, type)
30 |     @check ccall((:cufftPlan3d,libcufft),cufftStatus_t,
31 |                  (Ptr{cufftHandle_t}, Cint, Cint, Cint, cufftType),
32 |                  plan, nx, ny, nz, type)
33 | end
34 |
35 | function cufftPlanMany(plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch)
36 |     @check ccall((:cufftPlanMany,libcufft),cufftStatus_t,
37 |                  (Ptr{cufftHandle_t}, Cint, Ptr{Cint},
38 |                   Ptr{Cint}, Cint, Cint,
39 |                   Ptr{Cint}, Cint, Cint,
40 |                   cufftType, Cint),
41 |                  plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch)
42 | end
43 |
44 | function cufftExecC2C(plan, idata, odata, direction)
45 |     @check ccall((:cufftExecC2C,libcufft), cufftStatus_t,
46 |                  (cufftHandle_t, CuPtr{cufftComplex}, CuPtr{cufftComplex}, Cint),
47 |                  plan, idata, odata, direction)
48 | end
49 |
50 | function cufftExecC2R(plan, idata, odata)
51 |     @check ccall((:cufftExecC2R,libcufft), cufftStatus_t,
52 |                  (cufftHandle_t, CuPtr{cufftComplex}, CuPtr{cufftReal}),
53 |                  plan, idata, odata)
54 | end
55 |
56 | function cufftExecR2C(plan, idata, odata)
57 |     @check ccall((:cufftExecR2C,libcufft), cufftStatus_t,
58 |                  (cufftHandle_t, CuPtr{cufftReal}, CuPtr{cufftComplex}),
59 |                  plan, idata, odata)
60 | end
61 |
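# The exec wrappers above and below pair a plan handle with device buffers. As
# a hedged sketch of driving them by hand (normal use goes through the
# AbstractFFTs-based interface in fft.jl; the buffer length is illustrative):
#
#   handle_ref = Ref{cufftHandle_t}()
#   cufftPlan1d(handle_ref, 1024, CUFFT_C2C, 1)      # 1-D single-precision plan, batch 1
#   x = CuArray(rand(ComplexF32, 1024))
#   y = similar(x)
#   cufftExecC2C(handle_ref[], x, y, CUFFT_FORWARD)  # forward transform into y
#   cufftDestroy(handle_ref[])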
62 | function cufftExecZ2Z(plan, idata, odata, direction)
63 |     @check ccall((:cufftExecZ2Z,libcufft), cufftStatus_t,
64 |                  (cufftHandle_t, CuPtr{cufftDoubleComplex}, CuPtr{cufftDoubleComplex},
65 |                   Cint),
66 |                  plan, idata, odata, direction)
67 | end
68 |
69 | function cufftExecZ2D(plan, idata, odata)
70 |     @check ccall((:cufftExecZ2D,libcufft), cufftStatus_t,
71 |                  (cufftHandle_t, CuPtr{cufftDoubleComplex}, CuPtr{cufftDoubleReal}),
72 |                  plan, idata, odata)
73 | end
74 |
75 | function cufftExecD2Z(plan, idata, odata)
76 |     @check ccall((:cufftExecD2Z,libcufft), cufftStatus_t,
77 |                  (cufftHandle_t, CuPtr{cufftDoubleReal}, CuPtr{cufftDoubleComplex}),
78 |                  plan, idata, odata)
79 | end
80 |
81 | function cufftSetStream(plan, stream)
82 |     @check ccall((:cufftSetStream,libcufft), cufftStatus_t,
83 |                  (cufftHandle_t, CuStream_t),
84 |                  plan, stream)
85 | end
86 |
--------------------------------------------------------------------------------
/test/dnn.jl:
--------------------------------------------------------------------------------
1 | @testset "CUDNN" begin
2 |
3 | if !isdefined(CuArrays, :CUDNN)
4 | @warn "Not testing CUDNN"
5 | else
6 | using CuArrays.CUDNN
7 | @info "Testing CUDNN $(CUDNN.version())"
8 |
9 | @testset "NNlib" begin
10 |   using NNlib
11 |   using NNlib: ∇conv_data, ∇conv_filter,
12 |                maxpool, meanpool, ∇maxpool, ∇meanpool,
13 |                softmax, ∇softmax, logsoftmax, ∇logsoftmax
14 |   a, b, c = rand(Float64, 10, 10, 3, 1), rand(Float64, 2, 2, 3, 4), rand(Float64, 9, 9, 4, 1)
15 |   da, db, dc = CuArray(a), CuArray(b), CuArray(c)
16 |   cdims = DenseConvDims(a, b)
17 |   @test NNlib.conv(a, b, cdims) ≈ collect(NNlib.conv(da, db, cdims))
18 |   @test ∇conv_data(c, b, cdims) ≈ collect(∇conv_data(dc, db, cdims))
19 |   @test ∇conv_filter(a, c, cdims) ≈ collect(∇conv_filter(da, dc, cdims))
20 |
21 |   # Test for agreement between CPU NNlib and CuDNN versions, across a variety of kwargs
22 |   for num_spatial_dims in (2, 3)
23 |     # Initialize data we'll run our tests over
24 |     C_in = 3
25 |     C_out = 4
26 |     batch_size = 1
27 |     x = rand(Float64, repeat([8], num_spatial_dims)..., C_in, batch_size)
28 |     w = rand(Float64, repeat([2], num_spatial_dims)..., C_in, C_out)
29 |     b = rand(Float64, repeat([1], num_spatial_dims)..., C_in, C_out)
30 |     options = (Dict(), Dict(:dilation => 2), Dict(:flipkernel => true), Dict(:stride => 2),)
31 |     algos = (1, 0, 1, 1,)
32 |
33 |     for (opts, algo) in zip(options, algos)
34 |       cdims = DenseConvDims(x, w; opts...)
35 |       y = NNlib.conv(x, w, cdims)
36 |
37 |       # Test that basic convolution is equivalent across GPU/CPU
38 |       @test testf((x, w) -> NNlib.conv(x, w, cdims), x, w)
39 |       @test testf((y, w) -> ∇conv_data(y, w, cdims), y, w)
40 |       @test testf((x, y) -> ∇conv_filter(x, y, cdims), x, y)
41 |       # Test that we can use an alternative algorithm without dying
42 |       @test_nowarn NNlib.conv!(cu(y), cu(x), cu(w), cdims; algo=algo)
43 |       @test_nowarn NNlib.∇conv_data!(cu(x), cu(y), cu(w), cdims; algo=algo)
44 |       @test_nowarn NNlib.∇conv_filter!(cu(w), cu(x), cu(y), cdims; algo=algo)
45 |     end
46 |
47 |     # Test that pooling is equivalent across GPU/CPU
48 |     pdims = PoolDims(x, 2)
49 |     y = maxpool(x, pdims)
50 |     dy = ones(size(y))
51 |     @test testf(x -> maxpool(x, pdims), x)
52 |     @test testf((dy, y, x) -> ∇maxpool(dy, y, x, pdims), dy, y, x)
53 |     @test testf(x -> meanpool(x, pdims), x)
54 |     @test testf((dy, y, x) -> ∇meanpool(dy, y, x, pdims), dy, y, x)
55 |
56 |     # CPU implementation of ∇conv_bias!
57 | db = zeros(Float64, 1, 1, 3, 1) 58 | function CuArrays.CUDNN.∇conv_bias!(db, y) 59 | db .= sum(y, dims=(1:(ndims(y)-2))) 60 | return db 61 | end 62 | #@test testf(CuArrays.CUDNN.∇conv_bias!, db, y) 63 | end 64 | 65 | for dims in [(5,5), (5,)] 66 | @test testf(softmax, rand(Float64, dims)) 67 | @test testf(∇softmax, rand(Float64, dims), rand(Float64, dims)) 68 | @test testf(logsoftmax, rand(Float64, dims)) 69 | @test testf(∇logsoftmax, rand(Float64, dims), rand(Float64, dims)) 70 | end 71 | end 72 | 73 | @testset "Activations and Other Ops" begin 74 | @test testf(CuArrays.CUDNN.cudnnAddTensor, cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1))) 75 | @test testf(CuArrays.CUDNN.cudnnActivationForward, cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1))) 76 | @test testf(CuArrays.CUDNN.cudnnActivationBackward, cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1))) 77 | end 78 | 79 | end 80 | 81 | end 82 | -------------------------------------------------------------------------------- /src/mapreduce.jl: -------------------------------------------------------------------------------- 1 | using CuArrays: @cuindex, cudims 2 | 3 | function mapreducedim_kernel_serial(f, op, R, A, range) 4 | I = @cuindex R 5 | newrange = map((r, i) -> r === nothing ? i : r, range, I) 6 | for I′ in CartesianIndices(newrange) 7 | @inbounds R[I...] = op(R[I...], f(A[I′])) 8 | end 9 | return 10 | end 11 | 12 | @inline function reduce_block(arr::CuDeviceArray, op) 13 | sync_threads() 14 | len = blockDim().x 15 | while len != 1 16 | sync_threads() 17 | skip = (len + 1) >> 1 18 | reduce_to = threadIdx().x - skip 19 | if 0 < reduce_to <= (len >> 1) 20 | arr[reduce_to] = op(arr[reduce_to], arr[threadIdx().x]) 21 | end 22 | len = skip 23 | end 24 | sync_threads() 25 | end 26 | 27 | function mapreducedim_kernel_parallel(f, op, R::CuDeviceArray{T}, A::CuDeviceArray{T}, 28 | CIS, Rlength, Slength) where {T} 29 | for Ri_base in 0:(gridDim().x * blockDim().y):(Rlength-1) 30 | Ri = Ri_base + (blockIdx().x - 1) * blockDim().y + threadIdx().y 31 | Ri > Rlength && return 32 | RI = Tuple(CartesianIndices(R)[Ri]) 33 | S = @cuStaticSharedMem(T, 512) 34 | Si_folded_base = (threadIdx().y - 1) * blockDim().x 35 | Si_folded = Si_folded_base + threadIdx().x 36 | # serial reduction of A into S by Slength ÷ xthreads 37 | for Si_base in 0:blockDim().x:(Slength-1) 38 | Si = Si_base + threadIdx().x 39 | Si > Slength && break 40 | SI = Tuple(CIS[Si]) 41 | AI = ifelse.(size(R) .== 1, SI, RI) 42 | if Si_base == 0 43 | S[Si_folded] = f(A[AI...]) 44 | else 45 | S[Si_folded] = op(S[Si_folded], f(A[AI...])) 46 | end 47 | end 48 | # block-parallel reduction of S to S[1] by xthreads 49 | reduce_block(view(S, (Si_folded_base + 1):512), op) 50 | # reduce S[1] into R 51 | threadIdx().x == 1 && (R[Ri] = op(R[Ri], S[Si_folded])) 52 | end 53 | return 54 | end 55 | 56 | function Base._mapreducedim!(f, op, R::CuArray{T}, A::CuArray{T}) where {T} 57 | # the kernel as generated from `f` and `op` can require lots of registers (eg. #160), 58 | # so we need to be careful about how many threads we launch not to run out of them. 
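    # Strategy: `R` holds one slot per output element, and `A` collapses `Slength`
    # inputs into each of them. When there is enough work per output element we
    # use the block-parallel kernel above (a shared-memory tree reduction);
    # otherwise the launch below falls back to the serial kernel, which needs no
    # shared memory and trivially fits any device.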
59 | Rlength = length(R) 60 | Ssize = ifelse.(size(R) .== 1, size(A), 1) 61 | Slength = prod(Ssize) 62 | CIS = CartesianIndices(Ssize) 63 | 64 | parallel_args = (f, op, R, A, CIS, Rlength, Slength) 65 | GC.@preserve parallel_args begin 66 | parallel_kargs = cudaconvert.(parallel_args) 67 | parallel_tt = Tuple{Core.Typeof.(parallel_kargs)...} 68 | parallel_kernel = cufunction(mapreducedim_kernel_parallel, parallel_tt) 69 | 70 | # we are limited in how many threads we can launch... 71 | ## by the kernel 72 | kernel_threads = CUDAnative.maxthreads(parallel_kernel) 73 | ## by the device 74 | dev = CUDAdrv.device() 75 | block_threads = (x=attribute(dev, CUDAdrv.MAX_BLOCK_DIM_X), 76 | y=attribute(dev, CUDAdrv.MAX_BLOCK_DIM_Y), 77 | total=attribute(dev, CUDAdrv.MAX_THREADS_PER_BLOCK)) 78 | 79 | # figure out a legal launch configuration 80 | y_thr = min(nextpow(2, Rlength ÷ 512 + 1), 512, block_threads.y, kernel_threads) 81 | x_thr = min(512 ÷ y_thr, Slength, block_threads.x, 82 | ceil(Int, block_threads.total/y_thr), 83 | ceil(Int, kernel_threads/y_thr)) 84 | 85 | if x_thr >= 8 86 | blk, thr = (Rlength - 1) ÷ y_thr + 1, (x_thr, y_thr, 1) 87 | parallel_kernel(parallel_kargs...; threads=thr, blocks=blk) 88 | else 89 | # not enough work, fall back to serial reduction 90 | range = ifelse.(length.(axes(R)) .== 1, axes(A), nothing) 91 | blk, thr = cudims(R) 92 | @cuda(blocks=blk, threads=thr, mapreducedim_kernel_serial(f, op, R, A, range)) 93 | end 94 | end 95 | 96 | return R 97 | end 98 | -------------------------------------------------------------------------------- /src/dnn/nnlib.jl: -------------------------------------------------------------------------------- 1 | import NNlib: conv!, ∇conv_filter!, ∇conv_data!, 2 | maxpool!, meanpool!, ∇maxpool!, ∇meanpool!, 3 | softmax, softmax!, ∇softmax!, logsoftmax, logsoftmax!, ∇logsoftmax 4 | import ..CuArrays: CuVecOrMat, CuVector 5 | using CUDAnative 6 | 7 | 8 | # Softmax 9 | 10 | const CUDNNFloat = Union{Float16,Float32,Float64} 11 | 12 | reshape4D(x::AbstractVector) = reshape(x, 1, 1, length(x), 1) 13 | reshape4D(x::AbstractMatrix) = reshape(x, 1, 1, size(x)...) 
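# cuDNN kernels operate on 4-D tensors, so the 1-D and 2-D inputs accepted by
# the NNlib API are first padded with singleton dimensions. Sizes only, for
# illustration:
#
#   size(reshape4D(cu(rand(Float32, 10))))     == (1, 1, 10, 1)
#   size(reshape4D(cu(rand(Float32, 10, 32)))) == (1, 1, 10, 32)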
14 | 15 | function softmax!(out::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 16 | cudnnSoftmaxForward(reshape4D(xs), reshape4D(out)) 17 | return out 18 | end 19 | 20 | function ∇softmax!(out::CuVecOrMat{T}, Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 21 | cudnnSoftmaxBackward(reshape4D(softmax(xs)), reshape4D(Δ), reshape4D(out)) 22 | return out 23 | end 24 | 25 | function logsoftmax!(out::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 26 | cudnnSoftmaxForward(reshape4D(xs), reshape4D(out), algorithm=CUDNN_SOFTMAX_LOG) 27 | return out 28 | end 29 | 30 | function ∇logsoftmax!(out::CuVecOrMat{T}, Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 31 | cudnnSoftmaxBackward(reshape4D(logsoftmax(xs)), reshape4D(Δ), reshape4D(out); 32 | algorithm=CUDNN_SOFTMAX_LOG) 33 | return out 34 | end 35 | 36 | ∇logsoftmax(Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat = 37 | ∇logsoftmax!(similar(xs), Δ, xs) 38 | 39 | 40 | # Convolution 41 | 42 | function conv!(y::CuArray{T}, x::CuArray{T}, w::CuArray{T}, cdims::DenseConvDims; 43 | alpha=1, algo=0) where T<:CUDNNFloat 44 | if version() < v"6" 45 | all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6") 46 | end 47 | 48 | workspace_size = cudnnGetConvolutionForwardWorkspaceSize(y, x, w, cdims, algo=algo) 49 | 50 | CuVector{UInt8}(undef, workspace_size) do workspace 51 | cudnnConvolutionForward(y, x, w, cdims, alpha=alpha, algo=algo, 52 | workspace=workspace, workspace_size=workspace_size) 53 | end 54 | end 55 | 56 | function ∇conv_filter!(dw::CuArray{T}, x::CuArray{T}, dy::CuArray{T}, 57 | cdims::DenseConvDims; alpha=1, algo=0) where T<:CUDNNFloat 58 | if version() < v"6" 59 | all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6") 60 | end 61 | 62 | workspace_size = cudnnGetConvolutionBackwardFilterWorkspaceSize(dw, x, dy, cdims, algo=algo) 63 | 64 | CuVector{UInt8}(undef, workspace_size) do workspace 65 | cudnnConvolutionBackwardFilter(dw, x, dy, cdims, alpha=alpha, algo=algo, 66 | workspace=workspace, workspace_size=workspace_size) 67 | end 68 | end 69 | 70 | function ∇conv_data!(dx::CuArray{T}, dy::CuArray{T}, w::CuArray{T}, 71 | cdims::DenseConvDims; alpha=1, algo=0) where T<:CUDNNFloat 72 | if version() < v"6" 73 | all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6") 74 | end 75 | 76 | workspace_size = 77 | cudnnGetConvolutionBackwardDataWorkspaceSize(dx, w, dy, cdims; algo=algo) 78 | CuVector{UInt8}(undef, workspace_size) do workspace 79 | cudnnConvolutionBackwardData(dx, w, dy, cdims, alpha=alpha, algo=algo, 80 | workspace=workspace, workspace_size=workspace_size) 81 | end 82 | end 83 | 84 | ∇conv_bias!(db::CuArray{T}, dy::CuArray{T}; alpha=1, beta=0) where T<:CUDNNFloat = 85 | cudnnConvolutionBackwardBias(db, dy, alpha=alpha, beta=beta) 86 | 87 | maxpool!(y::CuArray{T}, x::CuArray{T}, pdims::PoolDims) where T<:CUDNNFloat = 88 | cudnnPoolingForward(y, x, pdims; mode=0) 89 | 90 | ∇maxpool!(dx::CuArray{T}, dy::CuArray{T}, y::CuArray{T}, x::CuArray{T}, 91 | pdims::PoolDims) where T<:CUDNNFloat = 92 | cudnnPoolingBackward(dx, dy, x, y, pdims, mode=0) 93 | 94 | meanpool!(y::CuArray{T}, x::CuArray{T}, pdims::PoolDims) where T<:CUDNNFloat = 95 | cudnnPoolingForward(y, x, pdims, mode=1) 96 | 97 | ∇meanpool!(dx::CuArray{T}, dy::CuArray{T}, y::CuArray{T}, x::CuArray{T}, 98 | pdims::PoolDims) where T<:CUDNNFloat = 99 | cudnnPoolingBackward(dx, dy, x, y, pdims, mode=1) 100 | 
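# End-to-end, the methods in this file let generic NNlib code run on the GPU
# unchanged. A hedged usage sketch (assumes cuDNN is available; the shapes are
# illustrative WHCN arrays):
#
#   using NNlib
#   x = cu(rand(Float32, 28, 28, 3, 1))   # input: 28×28, 3 channels, batch of 1
#   w = cu(rand(Float32, 5, 5, 3, 8))     # 5×5 kernels, 3 -> 8 channels
#   cdims = DenseConvDims(x, w)
#   y = NNlib.conv(x, w, cdims)           # dispatches to conv! above
#   p = maxpool(y, PoolDims(y, 2))        # dispatches to maxpool! above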
-------------------------------------------------------------------------------- /src/blas/libcublas_types.jl: -------------------------------------------------------------------------------- 1 | # libcublas_types.jl 2 | # 3 | # Initially generated with wrap_c from Clang.jl. Modified to remove anonymous 4 | # enums and add cublasContext. 5 | # 6 | # Author: Nick Henderson 7 | # Created: 2014-08-27 8 | # License: MIT 9 | # 10 | 11 | # begin enum cublasStatus_t 12 | const cublasStatus_t = UInt32 13 | const CUBLAS_STATUS_SUCCESS = 0 14 | const CUBLAS_STATUS_NOT_INITIALIZED = 1 15 | const CUBLAS_STATUS_ALLOC_FAILED = 3 16 | const CUBLAS_STATUS_INVALID_VALUE = 7 17 | const CUBLAS_STATUS_ARCH_MISMATCH = 8 18 | const CUBLAS_STATUS_MAPPING_ERROR = 11 19 | const CUBLAS_STATUS_EXECUTION_FAILED = 13 20 | const CUBLAS_STATUS_INTERNAL_ERROR = 14 21 | const CUBLAS_STATUS_NOT_SUPPORTED = 15 22 | const CUBLAS_STATUS_LICENSE_ERROR = 16 23 | # end enum cublasStatus_t 24 | # begin enum cublasFillMode_t 25 | const cublasFillMode_t = UInt32 26 | const CUBLAS_FILL_MODE_LOWER = 0 27 | const CUBLAS_FILL_MODE_UPPER = 1 28 | # end enum cublasFillMode_t 29 | # begin enum cublasDiagType_t 30 | const cublasDiagType_t = UInt32 31 | const CUBLAS_DIAG_NON_UNIT = 0 32 | const CUBLAS_DIAG_UNIT = 1 33 | # end enum cublasDiagType_t 34 | # begin enum cublasSideMode_t 35 | const cublasSideMode_t = UInt32 36 | const CUBLAS_SIDE_LEFT = 0 37 | const CUBLAS_SIDE_RIGHT = 1 38 | # end enum cublasSideMode_t 39 | # begin enum cublasOperation_t 40 | const cublasOperation_t = UInt32 41 | const CUBLAS_OP_N = 0 42 | const CUBLAS_OP_T = 1 43 | const CUBLAS_OP_C = 2 44 | # end enum cublasOperation_t 45 | # begin enum cublasPointerMode_t 46 | const cublasPointerMode_t = UInt32 47 | const CUBLAS_POINTER_MODE_HOST = 0 48 | const CUBLAS_POINTER_MODE_DEVICE = 1 49 | # end enum cublasPointerMode_t 50 | # begin enum cublasAtomicsMode_t 51 | const cublasAtomicsMode_t = UInt32 52 | const CUBLAS_ATOMICS_NOT_ALLOWED = 0 53 | const CUBLAS_ATOMICS_ALLOWED = 1 54 | # end enum cublasAtomicsMode_t 55 | const cublasContext = Nothing 56 | const cublasHandle_t = Ptr{cublasContext} 57 | const cublasXtHandle_t = Ptr{cublasContext} 58 | # complex numbers in cuda 59 | const cuComplex = Complex{Float32} 60 | const cuDoubleComplex = Complex{Float64} 61 | # complex types from Base/linalg.jl 62 | const CublasFloat = Union{Float64,Float32,ComplexF64,ComplexF32} 63 | const CublasReal = Union{Float64,Float32} 64 | const CublasComplex = Union{ComplexF64,ComplexF32} 65 | # FP16 (cuda_fp16.h) in cuda 66 | const __half = Float16 67 | struct __half2 68 | x1::__half 69 | x2::__half 70 | end 71 | 72 | const cublasXtOpType_t = UInt32 73 | const CUBLASXT_FLOAT = 0 74 | const CUBLASXT_DOUBLE = 1 75 | const CUBLASXT_COMPLEX = 2 76 | const CUBLASXT_DOUBLECOMPLEX = 3 77 | 78 | const cublasXtBlasOp_t = UInt32 79 | const CUBLASXT_GEMM = 0 80 | const CUBLASXT_SYRK = 1 81 | const CUBLASXT_HERK = 2 82 | const CUBLASXT_SYMM= 3 83 | const CUBLASXT_HEMM= 4 84 | const CUBLASXT_TRSM= 5 85 | const CUBLASXT_SYR2K= 6 86 | const CUBLASXT_HER2K= 7 87 | const CUBLASXT_SPMM= 8 88 | const CUBLASXT_SYRKX= 9 89 | const CUBLASXT_HERKX= 10 90 | 91 | const cublasXtPinningMemMode_t = UInt32 92 | const CUBLASXT_PINNING_DISABLED = 0 93 | const CUBLASXT_PINNING_ENABLED = 1 94 | 95 | if CUDAdrv.version() >= v"0.7.5" 96 | # specify which GEMM algorithm to use in cublasGemmEx() (CUDA 7.5+) 97 | const cublasGemmAlgo_t = Int32 98 | const CUBLAS_GEMM_DFALT = -1 99 | const CUBLAS_GEMM_ALGO0 = 0 100 | const 
CUBLAS_GEMM_ALGO1 = 1
101 | const CUBLAS_GEMM_ALGO2 = 2
102 | const CUBLAS_GEMM_ALGO3 = 3
103 | const CUBLAS_GEMM_ALGO4 = 4
104 | const CUBLAS_GEMM_ALGO5 = 5
105 | const CUBLAS_GEMM_ALGO6 = 6
106 | const CUBLAS_GEMM_ALGO7 = 7
107 | # specify which DataType to use with cublasgemmEx() and cublasGemmEx() (CUDA 7.5+) functions
108 | const cudaDataType_t = UInt32
109 | const CUDA_R_16F = UInt32(2)
110 | const CUDA_C_16F = UInt32(6)
111 | const CUDA_R_32F = UInt32(0)
112 | const CUDA_C_32F = UInt32(4)
113 | const CUDA_R_64F = UInt32(1)
114 | const CUDA_C_64F = UInt32(5)
115 | const CUDA_R_8I = UInt32(3)
116 | const CUDA_C_8I = UInt32(7)
117 | const CUDA_R_8U = UInt32(8)
118 | const CUDA_C_8U = UInt32(9)
119 | const CUDA_R_32I = UInt32(10)
120 | const CUDA_C_32I = UInt32(11)
121 | const CUDA_R_32U = UInt32(12)
122 | const CUDA_C_32U = UInt32(13)
123 | end
124 |
125 | @enum CUBLASMathMode::Cint begin
126 |     CUBLAS_DEFAULT_MATH = 0
127 |     CUBLAS_TENSOR_OP_MATH = 1
128 | end
129 |
--------------------------------------------------------------------------------
/src/sparse/highlevel.jl:
--------------------------------------------------------------------------------
1 | import LinearAlgebra: BlasFloat, mul!
2 |
3 | Base.:(\)(A::Union{UpperTriangular{T, S},LowerTriangular{T, S}}, B::CuMatrix{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sm('N',A,B,'O')
4 | Base.:(\)(transA::Transpose{T, UpperTriangular{T, S}}, B::CuMatrix{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sm('T',parent(transA),B,'O')
5 | Base.:(\)(transA::Transpose{T, LowerTriangular{T, S}}, B::CuMatrix{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sm('T',parent(transA),B,'O')
6 | Base.:(\)(adjA::Adjoint{T, UpperTriangular{T, S}},B::CuMatrix{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sm('C',parent(adjA),B,'O')
7 | Base.:(\)(adjA::Adjoint{T, LowerTriangular{T, S}},B::CuMatrix{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sm('C',parent(adjA),B,'O')
8 |
9 | mul!(C::CuVector{T},A::CuSparseMatrix,B::CuVector) where {T} = mv!('N',one(T),A,B,zero(T),C,'O')
10 | mul!(C::CuVector{T},transA::Transpose{<:Any,<:CuSparseMatrix},B::CuVector) where {T} = mv!('T',one(T),parent(transA),B,zero(T),C,'O')
11 | mul!(C::CuVector{T},adjA::Adjoint{<:Any,<:CuSparseMatrix},B::CuVector) where {T} = mv!('C',one(T),parent(adjA),B,zero(T),C,'O')
12 | mul!(C::CuVector{T},A::HermOrSym{T,<:CuSparseMatrix{T}},B::CuVector{T}) where T = mv!('N',one(T),A,B,zero(T),C,'O')
13 | mul!(C::CuVector{T},transA::Transpose{<:Any, <:HermOrSym{T,<:CuSparseMatrix{T}}},B::CuVector{T}) where {T} = mv!('T',one(T),parent(transA),B,zero(T),C,'O')
14 | mul!(C::CuVector{T},adjA::Adjoint{<:Any, <:HermOrSym{T,<:CuSparseMatrix{T}}},B::CuVector{T}) where {T} = mv!('C',one(T),parent(adjA),B,zero(T),C,'O')
15 |
16 | mul!(C::CuMatrix{T},A::CuSparseMatrix{T},B::CuMatrix{T}) where {T} = mm2!('N','N',one(T),A,B,zero(T),C,'O')
17 | mul!(C::CuMatrix{T},A::CuSparseMatrix{T},transB::Transpose{<:Any, CuMatrix{T}}) where {T} = mm2!('N','T',one(T),A,parent(transB),zero(T),C,'O')
18 | mul!(C::CuMatrix{T},transA::Transpose{<:Any, <:CuSparseMatrix{T}},B::CuMatrix{T}) where {T} = mm2!('T','N',one(T),parent(transA),B,zero(T),C,'O')
19 | mul!(C::CuMatrix{T},transA::Transpose{<:Any, <:CuSparseMatrix{T}},transB::Transpose{<:Any, CuMatrix{T}}) where {T} = mm2!('T','T',one(T),parent(transA),parent(transB),zero(T),C,'O')
20 | mul!(C::CuMatrix{T},adjA::Adjoint{<:Any, <:CuSparseMatrix{T}},B::CuMatrix{T}) where {T} = mm2!('C','N',one(T),parent(adjA),B,zero(T),C,'O')
21 |
22 | mul!(C::CuMatrix{T},A::HermOrSym{<:Number, <:CuSparseMatrix},B::CuMatrix) where {T} = mm!('N',one(T),A,B,zero(T),C,'O')
23 | mul!(C::CuMatrix{T},transA::Transpose{<:Any, <:HermOrSym{<:Number, <:CuSparseMatrix}},B::CuMatrix) where {T} = mm!('T',one(T),parent(transA),B,zero(T),C,'O')
24 | mul!(C::CuMatrix{T},adjA::Adjoint{<:Any, <:HermOrSym{<:Number, <:CuSparseMatrix}},B::CuMatrix) where {T} = mm!('C',one(T),parent(adjA),B,zero(T),C,'O')
25 |
26 | Base.:(\)(A::Union{UpperTriangular{T, S},LowerTriangular{T, S}}, B::CuVector{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sv2('N',A,B,'O')
27 | Base.:(\)(transA::Transpose{T, UpperTriangular{T, S}},B::CuVector{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sv2('T',parent(transA),B,'O')
28 | Base.:(\)(transA::Transpose{T, LowerTriangular{T, S}},B::CuVector{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sv2('T',parent(transA),B,'O')
29 | Base.:(\)(adjA::Adjoint{T, UpperTriangular{T, S}},B::CuVector{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sv2('C',parent(adjA),B,'O')
30 | Base.:(\)(adjA::Adjoint{T, LowerTriangular{T, S}},B::CuVector{T}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix{T}} = sv2('C',parent(adjA),B,'O')
31 | Base.:(\)(A::AbstractTriangular{T,CuSparseMatrixHYB{T}},B::CuVector{T}) where T = sv('N',A,B,'O')
32 | Base.:(\)(transA::Transpose{T, UpperTriangular{T, CuSparseMatrixHYB{T}}},B::CuVector{T}) where {T<:BlasFloat} = sv('T',parent(transA),B,'O')
33 | Base.:(\)(transA::Transpose{T, LowerTriangular{T, CuSparseMatrixHYB{T}}},B::CuVector{T}) where {T<:BlasFloat} = sv('T',parent(transA),B,'O')
34 | Base.:(\)(adjA::Adjoint{T, UpperTriangular{T, CuSparseMatrixHYB{T}}},B::CuVector{T}) where {T<:BlasFloat} = sv('C',parent(adjA),B,'O')
35 | Base.:(\)(adjA::Adjoint{T, LowerTriangular{T, CuSparseMatrixHYB{T}}},B::CuVector{T}) where {T<:BlasFloat} = sv('C',parent(adjA),B,'O')
36 |
37 | Base.:(+)(A::Union{CuSparseMatrixCSR,CuSparseMatrixCSC},B::Union{CuSparseMatrixCSR,CuSparseMatrixCSC}) = geam(A,B,'O','O','O')
38 | Base.:(-)(A::Union{CuSparseMatrixCSR,CuSparseMatrixCSC},B::Union{CuSparseMatrixCSR,CuSparseMatrixCSC}) = geam(A,-one(eltype(A)),B,'O','O','O')
--------------------------------------------------------------------------------
/src/solver/highlevel.jl:
--------------------------------------------------------------------------------
1 | # QR factorization
2 |
3 | struct CuQR{T,S<:AbstractMatrix} <: LinearAlgebra.Factorization{T}
4 |     factors::S
5 |     τ::CuVector{T}
6 |     CuQR{T,S}(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T,S<:AbstractMatrix} = new(factors, τ)
7 | end
8 |
9 | struct CuQRPackedQ{T,S<:AbstractMatrix} <: LinearAlgebra.AbstractQ{T}
10 |     factors::CuMatrix{T}
11 |     τ::CuVector{T}
12 |     CuQRPackedQ{T,S}(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T,S<:AbstractMatrix} = new(factors, τ)
13 | end
14 |
15 | CuQR(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T} = CuQR{T,typeof(factors)}(factors, τ)
16 | CuQRPackedQ(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T} = CuQRPackedQ{T,typeof(factors)}(factors, τ)
17 |
18 | LinearAlgebra.qr!(A::CuMatrix{T}) where T = CuQR(geqrf!(A::CuMatrix{T})...)
19 | Base.size(A::CuQR) = size(A.factors)
20 | Base.size(A::CuQRPackedQ, dim::Integer) = 0 < dim ? (dim <= 2 ?
size(A.factors, 1) : 1) : throw(BoundsError()) 21 | CuArrays.CuMatrix(A::CuQRPackedQ) = orgqr!(copy(A.factors), A.τ) 22 | CuArrays.CuArray(A::CuQRPackedQ) = convert(CuMatrix, A) 23 | Base.Matrix(A::CuQRPackedQ) = Matrix(CuMatrix(A)) 24 | 25 | function Base.getproperty(A::CuQR, d::Symbol) 26 | m, n = size(getfield(A, :factors)) 27 | if d == :R 28 | return triu!(A.factors[1:min(m, n), 1:n]) 29 | elseif d == :Q 30 | return CuQRPackedQ(A.factors, A.τ) 31 | else 32 | getfield(A, d) 33 | end 34 | end 35 | 36 | # iteration for destructuring into components 37 | Base.iterate(S::CuQR) = (S.Q, Val(:R)) 38 | Base.iterate(S::CuQR, ::Val{:R}) = (S.R, Val(:done)) 39 | Base.iterate(S::CuQR, ::Val{:done}) = nothing 40 | 41 | # Apply changes Q from the left 42 | LinearAlgebra.lmul!(A::CuQRPackedQ{T,S}, B::CuVecOrMat{T}) where {T<:Number, S<:CuMatrix} = 43 | ormqr!('L', 'N', A.factors, A.τ, B) 44 | LinearAlgebra.lmul!(adjA::Adjoint{T,<:CuQRPackedQ{T,S}}, B::CuVecOrMat{T}) where {T<:Real, S<:CuMatrix} = 45 | ormqr!('L', 'T', parent(adjA).factors, parent(adjA).τ, B) 46 | LinearAlgebra.lmul!(adjA::Adjoint{T,<:CuQRPackedQ{T,S}}, B::CuVecOrMat{T}) where {T<:Complex, S<:CuMatrix} = 47 | ormqr!('L', 'C', parent(adjA).factors, parent(adjA).τ, B) 48 | LinearAlgebra.lmul!(trA::Transpose{T,<:CuQRPackedQ{T,S}}, B::CuVecOrMat{T}) where {T<:Number, S<:CuMatrix} = 49 | ormqr!('L', 'T', parent(trA).factors, parent(trA).τ, B) 50 | 51 | function Base.getindex(A::CuQRPackedQ{T, S}, i::Integer, j::Integer) where {T, S} 52 | x = CuArray{T}(undef, size(A, 2)) .= 0 53 | x[j] = 1 54 | lmul!(A, x) 55 | return _getindex(x, i) 56 | end 57 | 58 | function Base.show(io::IO, F::CuQR) 59 | println(io, "$(typeof(F)) with factors Q and R:") 60 | show(io, F.Q) 61 | println(io) 62 | show(io, F.R) 63 | end 64 | 65 | # Singular Value Decomposition 66 | 67 | struct CuSVD{T,Tr,A<:AbstractMatrix{T}} <: LinearAlgebra.Factorization{T} 68 | U::CuMatrix{T} 69 | S::CuVector{Tr} 70 | V::A 71 | end 72 | 73 | # iteration for destructuring into components 74 | Base.iterate(S::CuSVD) = (S.U, Val(:S)) 75 | Base.iterate(S::CuSVD, ::Val{:S}) = (S.S, Val(:V)) 76 | Base.iterate(S::CuSVD, ::Val{:V}) = (S.V, Val(:done)) 77 | Base.iterate(S::CuSVD, ::Val{:done}) = nothing 78 | 79 | @inline function Base.getproperty(S::CuSVD, s::Symbol) 80 | if s === :Vt 81 | return getfield(S, :V)' 82 | else 83 | return getfield(S, s) 84 | end 85 | end 86 | 87 | @enum SVDAlgorithm QRAlgorithm JacobiAlgorithm 88 | function LinearAlgebra.svd!(A::CuMatrix{T}, method::SVDAlgorithm=JacobiAlgorithm; full::Bool=false) where T 89 | if method === QRAlgorithm 90 | U, s, Vt = gesvd!(full ? 'A' : 'S', full ? 'A' : 'S', A::CuMatrix{T}) 91 | return CuSVD(U, s, Vt') 92 | elseif method === JacobiAlgorithm 93 | return CuSVD(gesvdj!('V', Int(!full), A::CuMatrix{T})...) 
94 | end 95 | end 96 | # Once LinearAlgebra.svd(::AbstractMatrix) accepts kwargs this method can be deleted 97 | LinearAlgebra.svd(A::CuMatrix, method::SVDAlgorithm=JacobiAlgorithm; full=false) = svd!(copy(A), method, full=full) 98 | 99 | function LinearAlgebra.svdvals!(A::CuMatrix{T}, method::SVDAlgorithm=JacobiAlgorithm) where T 100 | if method === QRAlgorithm 101 | return gesvd!('N', 'N', A::CuMatrix{T})[2] 102 | elseif method === JacobiAlgorithm 103 | return gesvdj!('N', 1, A::CuMatrix{T})[2] 104 | end 105 | end 106 | # Once LinearAlgebra.svdvals(::AbstractMatrix) accepts kwargs this method can be deleted 107 | LinearAlgebra.svdvals(A::CuMatrix, method::SVDAlgorithm=JacobiAlgorithm) = svdvals!(copy(A), method) 108 | -------------------------------------------------------------------------------- /src/rand/highlevel.jl: -------------------------------------------------------------------------------- 1 | # high-level interface for CURAND 2 | # 3 | # the interface is split in two levels: 4 | # - functions that extend the Random standard library, and take an RNG as first argument, 5 | # will only ever dispatch to CURAND and as a result are limited in the types they support. 6 | # - functions that take an array will dispatch to either CURAND or GPUArrays 7 | # - `cu`-prefixed functions are provided for constructing GPU arrays from only an eltype 8 | 9 | 10 | ## seeding 11 | 12 | seed!(rng::RNG=generator()) = generate_seeds(rng) 13 | 14 | 15 | ## in-place 16 | 17 | # uniform 18 | Random.rand!(rng::RNG, A::CuArray{Float32}) = generate_uniform(rng, A) 19 | Random.rand!(rng::RNG, A::CuArray{Float64}) = generate_uniform_double(rng, A) 20 | 21 | # normal 22 | Random.randn!(rng::RNG, A::CuArray{Float32}; mean=0, stddev=1) = generate_normal(rng, A, mean, stddev) 23 | Random.randn!(rng::RNG, A::CuArray{Float64}; mean=0, stddev=1) = generate_normal_double(rng, A, mean, stddev) 24 | 25 | # log-normal 26 | rand_logn!(rng::RNG, A::CuArray{Float32}; mean=0, stddev=1) = generate_log_normal(rng, A, mean, stddev) 27 | rand_logn!(rng::RNG, A::CuArray{Float64}; mean=0, stddev=1) = generate_log_normal_double(rng, A, mean, stddev) 28 | 29 | # log-normal 30 | rand_poisson!(rng::RNG, A::CuArray{Cuint}; lambda=1) = generate_poisson(rng, A, lambda) 31 | 32 | 33 | ## out of place 34 | 35 | Random.rand(rng::RNG, ::Type{X}, dims::Dims) where {X} = rand!(rng, CuArray{X}(undef, dims)) 36 | Random.randn(rng::RNG, ::Type{X}, dims::Dims; kwargs...) where {X} = randn!(rng, CuArray{X}(undef, dims); kwargs...) 37 | rand_logn(rng::RNG, ::Type{X}, dims::Dims; kwargs...) where {X} = rand_logn!(rng, CuArray{X}(undef, dims); kwargs...) 38 | rand_poisson(rng::RNG, ::Type{X}, dims::Dims; kwargs...) where {X} = rand_poisson!(rng, CuArray{X}(undef, dims); kwargs...) 39 | 40 | # specify default types 41 | Random.rand(rng::RNG, dims::Integer...; kwargs...) = rand(rng, Float32, dims...; kwargs...) 42 | Random.randn(rng::RNG, dims::Integer...; kwargs...) = randn(rng, Float32, dims...; kwargs...) 43 | rand_logn(rng::RNG, dims::Integer...; kwargs...) = rand_logn(rng, Float32, dims...; kwargs...) 44 | rand_poisson(rng::RNG, dims::Integer...; kwargs...) = rand_poisson(rng, Cuint, dims...; kwargs...) 45 | 46 | # convenience 47 | Random.randn(rng::RNG, ::Type{X}, dim1::Integer, dims::Integer...; kwargs...) where {X} = 48 | randn(rng, X, Dims((dim1, dims...)); kwargs...) 49 | rand_logn(rng::RNG, ::Type{X}, dim1::Integer, dims::Integer...; kwargs...) where {X} = 50 | rand_logn(rng, X, Dims((dim1, dims...)); kwargs...) 
51 | rand_poisson(rng::RNG, ::Type{X}, dim1::Integer, dims::Integer...; kwargs...) where {X} =
52 |     rand_poisson(rng, X, Dims((dim1, dims...)); kwargs...)
53 |
54 |
55 | ## functions that dispatch to either CURAND or GPUArrays
56 |
57 | uniform_rng(::CuArray{<:Union{Float32,Float64}}) = generator()
58 | uniform_rng(A::CuArray) = GPUArrays.global_rng(A)
59 |
60 | normal_rng(::CuArray{<:Union{Float32,Float64}}) = generator()
61 | normal_rng(::CuArray{T}) where {T} =
62 |     error("CuArrays does not support generating normally distributed numbers of type $T")
63 |
64 | logn_rng(::CuArray{<:Union{Float32,Float64}}) = generator()
65 | logn_rng(::CuArray{T}) where {T} =
66 |     error("CuArrays does not support generating lognormally distributed numbers of type $T")
67 |
68 | poisson_rng(::CuArray{Cuint}) = generator()
69 | poisson_rng(::CuArray{T}) where {T} =
70 |     error("CuArrays does not support generating Poisson distributed numbers of type $T")
71 |
72 |
73 | Random.rand!(A::CuArray; kwargs...) = rand!(uniform_rng(A), A; kwargs...)
74 | Random.randn!(A::CuArray; kwargs...) = randn!(normal_rng(A), A; kwargs...)
75 | rand_logn!(A::CuArray; kwargs...) = rand_logn!(logn_rng(A), A; kwargs...)
76 | rand_poisson!(A::CuArray; kwargs...) = rand_poisson!(poisson_rng(A), A; kwargs...)
77 | rand_logn(A::CuArray; kwargs...) = rand_logn!(logn_rng(A), A; kwargs...)
78 | rand_poisson(A::CuArray; kwargs...) = rand_poisson!(poisson_rng(A), A; kwargs...)
79 |
80 | rand(::Type{X}, args...; kwargs...) where {X} = rand!(CuArray{X}(undef, args...); kwargs...)
81 | randn(::Type{X}, args...; kwargs...) where {X} = randn!(CuArray{X}(undef, args...); kwargs...)
82 | rand_logn(::Type{X}, args...; kwargs...) where {X} = rand_logn!(CuArray{X}(undef, args...); kwargs...)
83 | rand_poisson(::Type{X}, args...; kwargs...) where {X} = rand_poisson!(CuArray{X}(undef, args...); kwargs...)
84 |
85 | # specify default types
86 | rand(args...; kwargs...) = rand(Float32, args...; kwargs...)
87 | randn(args...; kwargs...) = randn(Float32, args...; kwargs...)
88 | rand_logn(args...; kwargs...) = rand_logn(Float32, args...; kwargs...)
89 | rand_poisson(args...; kwargs...) = rand_poisson(Cuint, args...; kwargs...)
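# Usage sketch of the two levels described at the top of this file
# (illustrative sizes; the eltypes shown are the ones CURAND supports):
#
#   A = CuArray{Float32}(undef, 1024)
#   rand!(A)                      # uniform, via CURAND
#   randn!(A; mean=0, stddev=2)   # normal, via CURAND
#   B = CuArray{Cuint}(undef, 1024)
#   rand_poisson!(B; lambda=4)    # Poisson, via CURAND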
90 | -------------------------------------------------------------------------------- /src/sparse/libcusparse.jl: -------------------------------------------------------------------------------- 1 | # low-level wrappers of the CUSPARSE library 2 | 3 | #helper functions 4 | function cusparseCreate() 5 | handle = Ref{cusparseHandle_t}() 6 | @check ccall( (:cusparseCreate, libcusparse), cusparseStatus_t, (Ptr{cusparseHandle_t},), handle) 7 | handle[] 8 | end 9 | 10 | function cusparseDestroy(handle) 11 | @check ccall( (:cusparseDestroy, libcusparse), cusparseStatus_t, (cusparseHandle_t,), handle) 12 | end 13 | 14 | function cusparseGetVersion(handle, version) 15 | @check ccall( (:cusparseGetVersion, libcusparse), cusparseStatus_t, (cusparseHandle_t, Ptr{Cint}), handle, version) 16 | end 17 | 18 | function cusparseSetStream(handle, streamId) 19 | @check ccall( (:cusparseSetStream, libcusparse), cusparseStatus_t, (cusparseHandle_t, CuStream_t), handle, streamId) 20 | end 21 | 22 | function cusparseGetStream(handle, streamId) 23 | @check ccall( (:cusparseGetStream, libcusparse), cusparseStatus_t, (cusparseHandle_t, Ptr{CuStream_t}), handle, streamId) 24 | end 25 | 26 | function cusparseGetPointerMode(handle, mode) 27 | @check ccall( (:cusparseGetPointerMode, libcusparse), cusparseStatus_t, (cusparseHandle_t, Ptr{cusparsePointerMode_t}), handle, mode) 28 | end 29 | 30 | function cusparseSetPointerMode(handle, mode) 31 | @check ccall( (:cusparseSetPointerMode, libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparsePointerMode_t), handle, mode) 32 | end 33 | 34 | function cusparseCreateHybMat(hybA) 35 | @check ccall( (:cusparseCreateHybMat, libcusparse), cusparseStatus_t, (Ptr{cusparseHybMat_t},), hybA) 36 | end 37 | 38 | function cusparseDestroyHybMat(hybA) 39 | @check ccall( (:cusparseDestroyHybMat, libcusparse), cusparseStatus_t, (cusparseHybMat_t,), hybA) 40 | end 41 | 42 | function cusparseCreateSolveAnalysisInfo(info) 43 | @check ccall( (:cusparseCreateSolveAnalysisInfo, libcusparse), cusparseStatus_t, (Ptr{cusparseSolveAnalysisInfo_t},), info) 44 | end 45 | 46 | function cusparseDestroySolveAnalysisInfo(info) 47 | @check ccall( (:cusparseDestroySolveAnalysisInfo, libcusparse), cusparseStatus_t, (cusparseSolveAnalysisInfo_t,), info) 48 | end 49 | 50 | function cusparseCreateBsrsm2Info(info) 51 | @check ccall( (:cusparseCreateBsrsm2Info, libcusparse), cusparseStatus_t, (Ptr{bsrsm2Info_t},), info) 52 | end 53 | 54 | function cusparseDestroyBsrsm2Info(info) 55 | @check ccall( (:cusparseDestroyBsrsm2Info, libcusparse), cusparseStatus_t, (bsrsm2Info_t,), info) 56 | end 57 | 58 | function cusparseCreateBsrsv2Info(info) 59 | @check ccall( (:cusparseCreateBsrsv2Info, libcusparse), cusparseStatus_t, (Ptr{bsrsv2Info_t},), info) 60 | end 61 | 62 | function cusparseDestroyBsrsv2Info(info) 63 | @check ccall( (:cusparseDestroyBsrsv2Info, libcusparse), cusparseStatus_t, (bsrsv2Info_t,), info) 64 | end 65 | 66 | function cusparseCreateCsrsv2Info(info) 67 | @check ccall( (:cusparseCreateCsrsv2Info, libcusparse), cusparseStatus_t, (Ptr{csrsv2Info_t},), info) 68 | end 69 | 70 | function cusparseDestroyCsrsv2Info(info) 71 | @check ccall( (:cusparseDestroyCsrsv2Info, libcusparse), cusparseStatus_t, (csrsv2Info_t,), info) 72 | end 73 | 74 | function cusparseCreateCsric02Info(info) 75 | @check ccall( (:cusparseCreateCsric02Info, libcusparse), cusparseStatus_t, (Ptr{csric02Info_t},), info) 76 | end 77 | 78 | function cusparseDestroyCsric02Info(info) 79 | @check ccall( (:cusparseDestroyCsric02Info, libcusparse), 
cusparseStatus_t, (csric02Info_t,), info) 80 | end 81 | 82 | function cusparseCreateCsrilu02Info(info) 83 | @check ccall( (:cusparseCreateCsrilu02Info, libcusparse), cusparseStatus_t, (Ptr{csrilu02Info_t},), info) 84 | end 85 | 86 | function cusparseDestroyCsrilu02Info(info) 87 | @check ccall( (:cusparseDestroyCsrilu02Info, libcusparse), cusparseStatus_t, (csrilu02Info_t,), info) 88 | end 89 | 90 | function cusparseCreateBsric02Info(info) 91 | @check ccall( (:cusparseCreateBsric02Info, libcusparse), cusparseStatus_t, (Ptr{bsric02Info_t},), info) 92 | end 93 | 94 | function cusparseDestroyBsric02Info(info) 95 | @check ccall( (:cusparseDestroyBsric02Info, libcusparse), cusparseStatus_t, (bsric02Info_t,), info) 96 | end 97 | 98 | function cusparseCreateBsrilu02Info(info) 99 | @check ccall( (:cusparseCreateBsrilu02Info, libcusparse), cusparseStatus_t, (Ptr{bsrilu02Info_t},), info) 100 | end 101 | 102 | function cusparseDestroyBsrilu02Info(info) 103 | @check ccall( (:cusparseDestroyBsrilu02Info, libcusparse), cusparseStatus_t, (bsrilu02Info_t,), info) 104 | end 105 | 106 | function cusparseGetProperty(property::CUDAapi.libraryPropertyType) 107 | value_ref = Ref{Cint}() 108 | @check ccall((:cusparseGetProperty, libcusparse), 109 | cusparseStatus_t, 110 | (Cint, Ptr{Cint}), 111 | property, value_ref) 112 | value_ref[] 113 | end 114 | -------------------------------------------------------------------------------- /test/sparse_solver.jl: -------------------------------------------------------------------------------- 1 | @testset "CUSPARSE + CUSOLVER" begin 2 | 3 | using CuArrays.CUSPARSE 4 | using CuArrays.CUSOLVER 5 | 6 | using LinearAlgebra 7 | using SparseArrays 8 | 9 | m = 15 10 | n = 10 11 | l = 13 12 | k = 1 13 | 14 | @testset for elty in [Float32, Float64, ComplexF32, ComplexF64] 15 | @testset "csrlsvlu!" begin 16 | A = sparse(rand(elty,n,n)) 17 | b = rand(elty,n) 18 | x = zeros(elty,n) 19 | tol = convert(real(elty),1e-6) 20 | x = CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 21 | @test x ≈ Array(A)\b 22 | A = sparse(rand(elty,m,n)) 23 | @test_throws DimensionMismatch CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 24 | A = sparse(rand(elty,n,n)) 25 | b = rand(elty,m) 26 | x = zeros(elty,n) 27 | @test_throws DimensionMismatch CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 28 | b = rand(elty,n) 29 | x = zeros(elty,m) 30 | @test_throws DimensionMismatch CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 31 | end 32 | 33 | @testset "csrlsvqr!" begin 34 | A = sparse(rand(elty,n,n)) 35 | d_A = CuSparseMatrixCSR(A) 36 | b = rand(elty,n) 37 | d_b = CuArray(b) 38 | x = zeros(elty,n) 39 | d_x = CuArray(x) 40 | tol = convert(real(elty),1e-4) 41 | d_x = CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 42 | h_x = collect(d_x) 43 | @test h_x ≈ Array(A)\b 44 | A = sparse(rand(elty,m,n)) 45 | d_A = CuSparseMatrixCSR(A) 46 | @test_throws DimensionMismatch CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 47 | A = sparse(rand(elty,n,n)) 48 | b = rand(elty,m) 49 | x = zeros(elty,n) 50 | @test_throws DimensionMismatch CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 51 | b = rand(elty,n) 52 | x = zeros(elty,m) 53 | @test_throws DimensionMismatch CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 54 | end 55 | 56 | @testset "csrlsvchol!" 
begin 57 | A = rand(elty,n,n) 58 | A = sparse(A*A') #posdef 59 | d_A = CuSparseMatrixCSR(A) 60 | b = rand(elty,n) 61 | d_b = CuArray(b) 62 | x = zeros(elty,n) 63 | d_x = CuArray(x) 64 | tol = 10^2*eps(real(elty)) 65 | d_x = CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 66 | h_x = collect(d_x) 67 | @test h_x ≈ Array(A)\b 68 | b = rand(elty,m) 69 | d_b = CuArray(b) 70 | @test_throws DimensionMismatch CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 71 | b = rand(elty,n) 72 | d_b = CuArray(b) 73 | x = rand(elty,m) 74 | d_x = CuArray(x) 75 | @test_throws DimensionMismatch CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 76 | A = sparse(rand(elty,m,n)) 77 | d_A = CuSparseMatrixCSR(A) 78 | @test_throws DimensionMismatch CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 79 | end 80 | 81 | @testset "csreigvsi" begin 82 | A = sparse(rand(elty,n,n)) 83 | A = A + A' 84 | d_A = CuSparseMatrixCSR(A) 85 | evs = eigvals(Array(A)) 86 | x_0 = CuArray(rand(elty,n)) 87 | μ,x = CUSOLVER.csreigvsi(d_A,convert(elty,evs[1]),x_0,convert(real(elty),1e-6),convert(Cint,1000),'O') 88 | @test μ ≈ evs[1] 89 | A = sparse(rand(elty,m,n)) 90 | d_A = CuSparseMatrixCSR(A) 91 | @test_throws DimensionMismatch CUSOLVER.csreigvsi(d_A,convert(elty,evs[1]),x_0,convert(real(elty),1e-6),convert(Cint,1000),'O') 92 | A = sparse(rand(elty,n,n)) 93 | d_A = CuSparseMatrixCSR(A) 94 | x_0 = CuArray(rand(elty,m)) 95 | @test_throws DimensionMismatch CUSOLVER.csreigvsi(d_A,convert(elty,evs[1]),x_0,convert(real(elty),1e-6),convert(Cint,1000),'O') 96 | end 97 | @testset "csreigs" begin 98 | celty = complex(elty) 99 | A = rand(real(elty),n,n) 100 | A = sparse(A + A') 101 | num = CUSOLVER.csreigs(A,convert(celty,complex(-100,-100)),convert(celty,complex(100,100)),'O') 102 | @test num <= n 103 | A = sparse(rand(celty,m,n)) 104 | d_A = CuSparseMatrixCSR(A) 105 | @test_throws DimensionMismatch CUSOLVER.csreigs(A,convert(celty,complex(-100,-100)),convert(celty,complex(100,100)),'O') 106 | end 107 | @testset "csrlsqvqr!" begin 108 | A = sparse(rand(elty,n,n)) 109 | b = rand(elty,n) 110 | x = zeros(elty,n) 111 | tol = convert(real(elty),1e-4) 112 | x = CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 113 | @test x[1] ≈ Array(A)\b 114 | A = sparse(rand(elty,n,m)) 115 | x = zeros(elty,n) 116 | @test_throws DimensionMismatch CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 117 | A = sparse(rand(elty,n,n)) 118 | b = rand(elty,m) 119 | x = zeros(elty,n) 120 | @test_throws DimensionMismatch CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 121 | b = rand(elty,n) 122 | x = zeros(elty,m) 123 | @test_throws DimensionMismatch CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 124 | end 125 | end 126 | 127 | end 128 | -------------------------------------------------------------------------------- /src/fft/highlevel.jl: -------------------------------------------------------------------------------- 1 | # region is an iterable subset of dimensions 2 | # spec. an integer, range, tuple, or array 3 | 4 | # inplace complex 5 | function plan_fft!(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 6 | K = CUFFT_FORWARD 7 | inplace = true 8 | xtype = (T == cufftComplex) ? CUFFT_C2C : CUFFT_Z2Z 9 | 10 | pp = _mkplan(xtype, size(X), region) 11 | 12 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 13 | end 14 | 15 | function plan_bfft!(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 16 | K = CUFFT_INVERSE 17 | inplace = true 18 | xtype = (T == cufftComplex) ? 
CUFFT_C2C : CUFFT_Z2Z 19 | 20 | pp = _mkplan(xtype, size(X), region) 21 | 22 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 23 | end 24 | 25 | # out-of-place complex 26 | function plan_fft(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 27 | K = CUFFT_FORWARD 28 | xtype = (T == cufftComplex) ? CUFFT_C2C : CUFFT_Z2Z 29 | inplace = false 30 | 31 | pp = _mkplan(xtype, size(X), region) 32 | 33 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 34 | end 35 | 36 | function plan_bfft(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 37 | K = CUFFT_INVERSE 38 | inplace = false 39 | xtype = (T == cufftComplex) ? CUFFT_C2C : CUFFT_Z2Z 40 | 41 | pp = _mkplan(xtype, size(X), region) 42 | 43 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 44 | end 45 | 46 | # out-of-place real-to-complex 47 | function plan_rfft(X::CuArray{T,N}, region) where {T<:cufftReals,N} 48 | K = CUFFT_FORWARD 49 | inplace = false 50 | xtype = (T == cufftReal) ? CUFFT_R2C : CUFFT_D2Z 51 | 52 | pp = _mkplan(xtype, size(X), region) 53 | 54 | ydims = collect(size(X)) 55 | ydims[region[1]] = div(ydims[region[1]],2)+1 56 | 57 | rCuFFTPlan{T,K,inplace,N}(pp, X, (ydims...,), region, xtype) 58 | end 59 | 60 | function plan_brfft(X::CuArray{T,N}, d::Integer, region::Any) where {T<:cufftComplexes,N} 61 | K = CUFFT_INVERSE 62 | inplace = false 63 | xtype = (T == cufftComplex) ? CUFFT_C2R : CUFFT_Z2D 64 | ydims = collect(size(X)) 65 | ydims[region[1]] = d 66 | 67 | pp = _mkplan(xtype, (ydims...,), region) 68 | 69 | rCuFFTPlan{T,K,inplace,N}(pp, X, (ydims...,), region, xtype) 70 | end 71 | 72 | # FIXME: plan_inv methods allocate needlessly (to provide type parameters) 73 | # Perhaps use FakeArray types to avoid this. 74 | 75 | function plan_inv(p::cCuFFTPlan{T,CUFFT_FORWARD,inplace,N}) where {T,N,inplace} 76 | X = CuArray{T}(undef, p.sz) 77 | pp = _mkplan(p.xtype, p.sz, p.region) 78 | ScaledPlan(cCuFFTPlan{T,CUFFT_INVERSE,inplace,N}(pp, X, p.sz, p.region, 79 | p.xtype), 80 | normalization(X, p.region)) 81 | end 82 | 83 | function plan_inv(p::cCuFFTPlan{T,CUFFT_INVERSE,inplace,N}) where {T,N,inplace} 84 | X = CuArray{T}(undef, p.sz) 85 | pp = _mkplan(p.xtype, p.sz, p.region) 86 | ScaledPlan(cCuFFTPlan{T,CUFFT_FORWARD,inplace,N}(pp, X, p.sz, p.region, 87 | p.xtype), 88 | normalization(X, p.region)) 89 | end 90 | 91 | function plan_inv(p::rCuFFTPlan{T,CUFFT_INVERSE,inplace,N} 92 | ) where {T<:cufftComplexes,N,inplace} 93 | X = CuArray{real(T)}(undef, p.osz) 94 | Y = CuArray{T}(undef, p.sz) 95 | xtype = p.xtype == CUFFT_C2R ? CUFFT_R2C : CUFFT_D2Z 96 | pp = _mkplan(xtype, p.osz, p.region) 97 | ScaledPlan(rCuFFTPlan{real(T),CUFFT_FORWARD,inplace,N}(pp, X, p.sz, p.region, 98 | xtype), 99 | normalization(X, p.region)) 100 | end 101 | 102 | function plan_inv(p::rCuFFTPlan{T,CUFFT_FORWARD,inplace,N} 103 | ) where {T<:cufftReals,N,inplace} 104 | X = CuArray{complex(T)}(undef, p.osz) 105 | Y = CuArray{T}(undef, p.sz) 106 | xtype = p.xtype == CUFFT_R2C ? 
CUFFT_C2R : CUFFT_Z2D 107 | pp = _mkplan(xtype, p.sz, p.region) 108 | ScaledPlan(rCuFFTPlan{complex(T),CUFFT_INVERSE,inplace,N}(pp, X, p.sz, 109 | p.region, xtype), 110 | normalization(Y, p.region)) 111 | end 112 | 113 | 114 | # The rest of the standard API 115 | 116 | size(p::CuFFTPlan) = p.sz 117 | 118 | function mul!(y::CuArray{Ty}, p::CuFFTPlan{T,K,false}, x::CuArray{T} 119 | ) where {Ty,T,K} 120 | assert_applicable(p,x,y) 121 | unsafe_execute!(p,x,y) 122 | return y 123 | end 124 | 125 | function *(p::cCuFFTPlan{T,K,true,N}, x::CuArray{T,N}) where {T,K,N} 126 | assert_applicable(p,x) 127 | unsafe_execute!(p,x) 128 | x 129 | end 130 | 131 | function *(p::rCuFFTPlan{T,CUFFT_FORWARD,false,N}, x::CuArray{T,N} 132 | ) where {T<:cufftReals,N} 133 | @assert p.xtype ∈ [CUFFT_R2C,CUFFT_D2Z] 134 | y = CuArray{complex(T),N}(undef, p.osz) 135 | mul!(y,p,x) 136 | y 137 | end 138 | 139 | function *(p::rCuFFTPlan{T,CUFFT_INVERSE,false,N}, x::CuArray{T,N} 140 | ) where {T<:cufftComplexes,N} 141 | @assert p.xtype ∈ [CUFFT_C2R,CUFFT_Z2D] 142 | y = CuArray{real(T),N}(undef, p.osz) 143 | mul!(y,p,x) 144 | y 145 | end 146 | 147 | function *(p::cCuFFTPlan{T,K,false,N}, x::CuArray{T,N}) where {T,K,N} 148 | y = CuArray{T,N}(undef, p.osz) 149 | mul!(y,p,x) 150 | y 151 | end -------------------------------------------------------------------------------- /src/solver/libcusolver.jl: -------------------------------------------------------------------------------- 1 | # low-level wrappers of the CUSOLVER library 2 | 3 | #helper functions 4 | function cusolverDnCreate() 5 | handle = Ref{cusolverDnHandle_t}() 6 | @check ccall((:cusolverDnCreate, libcusolver), 7 | cusolverStatus_t, 8 | (Ptr{cusolverDnHandle_t},), 9 | handle) 10 | return handle[] 11 | end 12 | 13 | function cusolverDnDestroy(handle) 14 | @check ccall((:cusolverDnDestroy, libcusolver), 15 | cusolverStatus_t, 16 | (cusolverDnHandle_t,), 17 | handle) 18 | end 19 | 20 | function cusolverDnSetStream(handle, streamId) 21 | @check ccall((:cusolverDnSetStream, libcusolver), 22 | cusolverStatus_t, 23 | (cusolverDnHandle_t, CuStream_t), 24 | handle, streamId) 25 | end 26 | 27 | function cusolverDnGetStream(handle, streamId) 28 | @check ccall((:cusolverDnGetStream, libcusolver), 29 | cusolverStatus_t, 30 | (cusolverDnHandle_t, Ptr{CuStream_t}), 31 | handle, streamId) 32 | end 33 | 34 | function cusolverSpCreate() 35 | handle = Ref{cusolverSpHandle_t}() 36 | @check ccall((:cusolverSpCreate, libcusolver), 37 | cusolverStatus_t, 38 | (Ptr{cusolverSpHandle_t},), 39 | handle) 40 | return handle[] 41 | end 42 | 43 | function cusolverSpDestroy(handle) 44 | @check ccall((:cusolverSpDestroy, libcusolver), 45 | cusolverStatus_t, 46 | (cusolverSpHandle_t,), 47 | handle) 48 | end 49 | 50 | function cusolverSpSetStream(handle, streamId) 51 | @check ccall((:cusolverSpSetStream, libcusolver), 52 | cusolverStatus_t, 53 | (cusolverSpHandle_t, CuStream_t), 54 | handle, streamId) 55 | end 56 | 57 | function cusolverSpGetStream(handle, streamId) 58 | @check ccall((:cusolverSpGetStream, libcusolver), 59 | cusolverStatus_t, 60 | (cusolverSpHandle_t, Ptr{CuStream_t}), 61 | handle, streamId) 62 | end 63 | 64 | function cusolverSpCreateCsrqrInfo(info) 65 | @check ccall((:cusolverSpCreateCsrqrInfo, libcusolver), 66 | cusolverStatus_t, 67 | (Ptr{csrqrInfo_t},), 68 | info) 69 | end 70 | 71 | function cusolverSpDestroyCsrqrInfo(info) 72 | @check ccall((:cusolverSpDestroyCsrqrInfo, libcusolver), 73 | cusolverStatus_t, 74 | (csrqrInfo_t,), 75 | info) 76 | end 77 | 78 | function
cusolverDnCreateGesvdjInfo(info) 79 | @check ccall((:cusolverDnCreateGesvdjInfo, libcusolver), 80 | cusolverStatus_t, 81 | (Ptr{gesvdjInfo_t},), 82 | info) 83 | end 84 | 85 | function cusolverDnDestroyGesvdjInfo(info) 86 | @check ccall((:cusolverDnDestroyGesvdjInfo, libcusolver), 87 | cusolverStatus_t, 88 | (gesvdjInfo_t,), 89 | info) 90 | end 91 | 92 | function cusolverDnXgesvdjSetTolerance(info, tolerance) 93 | @check ccall((:cusolverDnXgesvdjSetTolerance, libcusolver), 94 | cusolverStatus_t, 95 | (gesvdjInfo_t, Float64), 96 | info, Float64(tolerance)) 97 | end 98 | 99 | function cusolverDnXgesvdjSetMaxSweeps(info, max_sweeps) 100 | @check ccall((:cusolverDnXgesvdjSetMaxSweeps, libcusolver), 101 | cusolverStatus_t, 102 | (gesvdjInfo_t, Cint), 103 | info, Cint(max_sweeps)) 104 | end 105 | 106 | function cusolverDnCreateSyevjInfo(info) 107 | @check ccall((:cusolverDnCreateSyevjInfo, libcusolver), 108 | cusolverStatus_t, 109 | (Ptr{syevjInfo_t},), 110 | info) 111 | end 112 | 113 | function cusolverDnDestroySyevjInfo(info) 114 | @check ccall((:cusolverDnDestroySyevjInfo, libcusolver), 115 | cusolverStatus_t, 116 | (syevjInfo_t,), 117 | info) 118 | end 119 | 120 | function cusolverDnXsyevjSetTolerance(info, tolerance) 121 | @check ccall((:cusolverDnXsyevjSetTolerance, libcusolver), 122 | cusolverStatus_t, 123 | (syevjInfo_t, Float64), 124 | info, Float64(tolerance)) 125 | end 126 | 127 | function cusolverDnXsyevjSetMaxSweeps(info, max_sweeps) 128 | @check ccall((:cusolverDnXsyevjSetMaxSweeps, libcusolver), 129 | cusolverStatus_t, 130 | (syevjInfo_t, Cint), 131 | info, Cint(max_sweeps)) 132 | end 133 | 134 | function cusolverRfCreate(handle) 135 | @check ccall((:cusolverRfCreate, libcusolver), 136 | cusolverStatus_t, 137 | (Ptr{cusolverRfHandle_t},), 138 | handle) 139 | end 140 | 141 | function cusolverRfDestroy(handle) 142 | @check ccall((:cusolverRfDestroy, libcusolver), 143 | cusolverStatus_t, 144 | (cusolverRfHandle_t,), 145 | handle) 146 | end 147 | 148 | function cusolverRfSetStream(handle, streamId) 149 | @check ccall((:cusolverRfSetStream, libcusolver), 150 | cusolverStatus_t, 151 | (cusolverRfHandle_t, CuStream_t), 152 | handle, streamId) 153 | end 154 | 155 | function cusolverRfGetStream(handle, streamId) 156 | @check ccall((:cusolverRfGetStream, libcusolver), 157 | cusolverStatus_t, 158 | (cusolverRfHandle_t, Ptr{CuStream_t}), 159 | handle, streamId) 160 | end 161 | 162 | function cusolverGetProperty(property::CUDAapi.libraryPropertyType) 163 | value_ref = Ref{Cint}() 164 | @check ccall((:cusolverGetProperty, libcusolver), 165 | cusolverStatus_t, 166 | (Cint, Ptr{Cint}), 167 | property, value_ref) 168 | value_ref[] 169 | end 170 | -------------------------------------------------------------------------------- /src/dnn/helpers.jl: -------------------------------------------------------------------------------- 1 | # For low level cudnn functions that require a pointer to a number 2 | cptr(x,a::CuArray{Float64})=Float64[x] 3 | cptr(x,a::CuArray{Float32})=Float32[x] 4 | cptr(x,a::CuArray{Float16})=Float32[x] 5 | 6 | # Conversion between Julia and CUDNN datatypes 7 | cudnnDataType(::Type{Float16})=CUDNN_DATA_HALF 8 | cudnnDataType(::Type{Float32})=CUDNN_DATA_FLOAT 9 | cudnnDataType(::Type{Float64})=CUDNN_DATA_DOUBLE 10 | juliaDataType(a)=(a==CUDNN_DATA_HALF ? Float16 : 11 | a==CUDNN_DATA_FLOAT ? Float32 : 12 | a==CUDNN_DATA_DOUBLE ? 
Float64 : error()) 13 | 14 | tuple_strides(A::Tuple) = _strides((1,), A) 15 | _strides(out::Tuple{Int}, A::Tuple{}) = () 16 | _strides(out::NTuple{N,Int}, A::NTuple{N}) where {N} = out 17 | function _strides(out::NTuple{M,Int}, A::Tuple) where M 18 | Base.@_inline_meta 19 | _strides((out..., out[M]*A[M]), A) 20 | end 21 | 22 | # Descriptors 23 | 24 | mutable struct TensorDesc; ptr; end 25 | free(td::TensorDesc) = cudnnDestroyTensorDescriptor(td.ptr) 26 | Base.unsafe_convert(::Type{cudnnTensorDescriptor_t}, td::TensorDesc) = td.ptr 27 | Base.unsafe_convert(::Type{Ptr{Nothing}}, td::TensorDesc) = convert(Ptr{Nothing}, td.ptr) 28 | 29 | function TensorDesc(T::Type, size::NTuple{N,Integer}, strides::NTuple{N,Integer} = tuple_strides(size)) where N 30 | sz = Cint.(size) |> reverse |> collect 31 | st = Cint.(strides) |> reverse |> collect 32 | d = Ref{cudnnTensorDescriptor_t}() 33 | cudnnCreateTensorDescriptor(d) 34 | cudnnSetTensorNdDescriptor(d[], cudnnDataType(T), length(sz), sz, st) 35 | this = TensorDesc(d[]) 36 | finalizer(free, this) 37 | return this 38 | end 39 | 40 | TensorDesc(a::CuArray) = TensorDesc(eltype(a), size(a), strides(a)) 41 | 42 | mutable struct FilterDesc 43 | ptr 44 | end 45 | free(fd::FilterDesc)=cudnnDestroyFilterDescriptor(fd.ptr) 46 | Base.unsafe_convert(::Type{cudnnFilterDescriptor_t}, fd::FilterDesc)=fd.ptr 47 | Base.unsafe_convert(::Type{Ptr{Nothing}}, fd::FilterDesc)=fd.ptr 48 | 49 | function createFilterDesc() 50 | d = Ref{cudnnFilterDescriptor_t}() 51 | @check cudnnCreateFilterDescriptor(d) 52 | return d[] 53 | end 54 | 55 | function FilterDesc(T::Type, size::Tuple; format = CUDNN_TENSOR_NCHW) 56 | # The only difference of a FilterDescriptor is no strides. 57 | sz = Cint.(size) |> reverse |> collect 58 | d = createFilterDesc() 59 | version() >= v"5" ? 60 | cudnnSetFilterNdDescriptor(d, cudnnDataType(T), format, length(sz), sz) : 61 | version() >= v"4" ? 62 | cudnnSetFilterNdDescriptor_v4(d, cudnnDataType(T), format, length(sz), sz) : 63 | cudnnSetFilterNdDescriptor(d, cudnnDataType(T), length(sz), sz) 64 | this = FilterDesc(d) 65 | finalizer(free, this) 66 | return this 67 | end 68 | 69 | FilterDesc(a::CuArray; format = CUDNN_TENSOR_NCHW) = FilterDesc(eltype(a), size(a), format = format) 70 | 71 | function Base.size(f::FilterDesc) 72 | typ = Ref{Cuint}() 73 | format = Ref{Cuint}() 74 | ndims = Ref{Cint}() 75 | dims = Vector{Cint}(undef, 8) 76 | cudnnGetFilterNdDescriptor(f, 8, typ, format, ndims, dims) 77 | @assert ndims[] ≤ 8 78 | return (dims[1:ndims[]]...,) |> reverse 79 | end 80 | 81 | mutable struct ConvDesc; ptr; end 82 | free(cd::ConvDesc) = cudnnDestroyConvolutionDescriptor(cd.ptr) 83 | Base.unsafe_convert(::Type{cudnnConvolutionDescriptor_t}, cd::ConvDesc)=cd.ptr 84 | 85 | function cdsize(w, nd) 86 | isa(w, Integer) && return Cint[fill(w,nd)...] 87 | length(w) == nd && return Cint[reverse(w)...] 88 | length(w) == 2*nd && return Cint[reverse(w[nd+1:end])...] 89 | throw(DimensionMismatch()) 90 | end 91 | 92 | pdsize(w, nd)=Cint[reverse(psize(w,nd))...] 93 | function psize(w, nd) 94 | isa(w, Integer) && return Cint[fill(w,nd)...] 95 | length(w) == nd && return w 96 | length(w) == 2*nd && return w[1:nd] 97 | throw(DimensionMismatch()) 98 | end 99 | 100 | function ConvDesc(T, N, padding, stride, dilation, mode) 101 | cd = Ref{cudnnConvolutionDescriptor_t}() 102 | cudnnCreateConvolutionDescriptor(cd) 103 | version() >= v"4" ? 
cudnnSetConvolutionNdDescriptor(cd[],N,cdsize(padding,N),cdsize(stride,N),cdsize(dilation,N),mode,cudnnDataType(T)) : 104 | version() >= v"3" ? cudnnSetConvolutionNdDescriptor_v3(cd[],N,cdsize(padding,N),cdsize(stride,N),cdsize(dilation,N),mode,cudnnDataType(T)) : 105 | cudnnSetConvolutionNdDescriptor(cd[],N,cdsize(padding,N),cdsize(stride,N),cdsize(dilation,N),mode) 106 | this = ConvDesc(cd[]) 107 | finalizer(free, this) 108 | return this 109 | end 110 | 111 | function ConvDesc(T, cdims::DenseConvDims) 112 | pd = NNlib.padding(cdims) 113 | if !all(pd[1:2:end] .== pd[2:2:end]) 114 | @warn("CuDNN does not support asymmetric padding; defaulting to symmetric choice") 115 | end 116 | return ConvDesc(T, NNlib.spatial_dims(cdims), pd[1:2:end], NNlib.stride(cdims), 117 | NNlib.dilation(cdims), NNlib.flipkernel(cdims)) 118 | end 119 | 120 | mutable struct PoolDesc; ptr; end 121 | free(pd::PoolDesc)=cudnnDestroyPoolingDescriptor(pd.ptr) 122 | Base.unsafe_convert(::Type{cudnnPoolingDescriptor_t}, pd::PoolDesc)=pd.ptr 123 | 124 | function PoolDesc(nd, window, padding, stride, mode, maxpoolingNanOpt=CUDNN_NOT_PROPAGATE_NAN) 125 | pd = Ref{cudnnPoolingDescriptor_t}() 126 | cudnnCreatePoolingDescriptor(pd) 127 | cudnnSetPoolingNdDescriptor(pd[],mode,maxpoolingNanOpt,nd,pdsize(window,nd),pdsize(padding,nd),pdsize(stride,nd)) 128 | this = PoolDesc(pd[]) 129 | finalizer(free, this) 130 | return this 131 | end 132 | 133 | function PoolDesc(pdims::PoolDims, mode, maxpoolingNanOpt=CUDNN_NOT_PROPAGATE_NAN) 134 | pd = NNlib.padding(pdims) 135 | if !all(pd[1:2:end] .== pd[2:2:end]) 136 | @warn("CuDNN does not support asymmetric padding; defaulting to symmetric choice") 137 | end 138 | return PoolDesc(NNlib.spatial_dims(pdims), NNlib.kernel_size(pdims), pd[1:2:end], 139 | NNlib.stride(pdims), mode, maxpoolingNanOpt) 140 | end 141 | 142 | mutable struct ActivationDesc; ptr; end 143 | free(ad::ActivationDesc)=cudnnDestroyActivationDescriptor(ad.ptr) 144 | Base.unsafe_convert(::Type{cudnnActivationDescriptor_t}, ad::ActivationDesc)=ad.ptr 145 | 146 | function ActivationDesc(mode, coeff, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN) 147 | ad = Ref{cudnnActivationDescriptor_t}() 148 | cudnnCreateActivationDescriptor(ad) 149 | cudnnSetActivationDescriptor(ad[],mode,reluNanOpt,coeff) 150 | this = ActivationDesc(ad[]) 151 | finalizer(free, this) 152 | return this 153 | end 154 | -------------------------------------------------------------------------------- /src/sparse/libcusparse_types.jl: -------------------------------------------------------------------------------- 1 | #enum cusparseStatus_t 2 | #error messages from CUSPARSE 3 | 4 | """ 5 | Status messages from CUSPARSE's C API. 6 | """ 7 | const cusparseStatus_t = UInt32 8 | const CUSPARSE_STATUS_SUCCESS = 0 9 | const CUSPARSE_STATUS_NOT_INITIALIZED = 1 10 | const CUSPARSE_STATUS_ALLOC_FAILED = 2 11 | const CUSPARSE_STATUS_INVALID_VALUE = 3 12 | const CUSPARSE_STATUS_ARCH_MISMATCH = 4 13 | const CUSPARSE_STATUS_MAPPING_ERROR = 5 14 | const CUSPARSE_STATUS_EXECUTION_FAILED = 6 15 | const CUSPARSE_STATUS_INTERNAL_ERROR = 7 16 | const CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8 17 | 18 | #enum cusparseAction_t 19 | """ 20 | Perform operation on indices only (`CUSPARSE_ACTION_SYMBOLIC`) or 21 | on both data and indices (`CUSPARSE_ACTION_NUMERIC`). Used in 22 | conversion routines. 
23 | """ 24 | const cusparseAction_t = UInt32 25 | const CUSPARSE_ACTION_SYMBOLIC = 0 26 | const CUSPARSE_ACTION_NUMERIC = 1 27 | 28 | #enum cusparseDirection_t 29 | """ 30 | Parse dense matrix by rows (`CUSPARSE_DIRECTION_ROW`) or columns 31 | (`CUSPARSE_DIRECTION_COL`) to compute its number of non-zeros. 32 | """ 33 | const cusparseDirection_t = UInt32 34 | const CUSPARSE_DIRECTION_ROW = 0 35 | const CUSPARSE_DIRECTION_COL = 1 36 | 37 | #enum cusparseHybPartition_t 38 | """ 39 | How to partition the HYB matrix in a [`CudaSparseMatrixHYB`](@ref). 40 | There are three choices: 41 | * `CUSPARSE_HYB_PARTITION_AUTO` - let CUSPARSE decide internally for best performance. 42 | * `CUSPARSE_HYB_PARTITION_USER` - set the partition manually in the conversion function. 43 | * `CUSPARSE_HYB_PARTITION_MAX` - use the maximum partition, putting the matrix in ELL format. 44 | """ 45 | const cusparseHybPartition_t = UInt32 46 | const CUSPARSE_HYB_PARTITION_AUTO = 0 47 | const CUSPARSE_HYB_PARTITION_USER = 1 48 | const CUSPARSE_HYB_PARTITION_MAX = 2 49 | 50 | #enum cusparseFillMode_t 51 | """ 52 | Determines if a symmetric/Hermitian/triangular matrix has its upper 53 | (`CUSPARSE_FILL_MODE_UPPER`) or lower (`CUSPARSE_FILL_MODE_LOWER`) 54 | triangle filled. 55 | """ 56 | const cusparseFillMode_t = UInt32 57 | const CUSPARSE_FILL_MODE_LOWER = 0 58 | const CUSPARSE_FILL_MODE_UPPER = 1 59 | 60 | #enum cusparseDiagType_t 61 | """ 62 | Determines if the diagonal of a matrix is all ones (`CUSPARSE_DIAG_TYPE_UNIT`) 63 | or not all ones (`CUSPARSE_DIAG_TYPE_NON_UNIT`). 64 | """ 65 | const cusparseDiagType_t = UInt32 66 | const CUSPARSE_DIAG_TYPE_NON_UNIT = 0 67 | const CUSPARSE_DIAG_TYPE_UNIT = 1 68 | 69 | #enum cusparsePointerMode_t 70 | """ 71 | Determines if scalar arguments to a function are present on the host CPU 72 | (`CUSPARSE_POINTER_MODE_HOST`) or on the GPU (`CUSPARSE_POINTER_MODE_DEVICE`). 73 | """ 74 | const cusparsePointerMode_t = UInt32 75 | const CUSPARSE_POINTER_MODE_HOST = 0 76 | const CUSPARSE_POINTER_MODE_DEVICE = 1 77 | 78 | #enum cusparseOperation_t 79 | """ 80 | Determines whether to perform an operation, such as a matrix multiplication 81 | or solve, on the matrix as-is (`CUSPARSE_OPERATION_NON_TRANSPOSE`), on the 82 | matrix's transpose (`CUSPARSE_OPERATION_TRANSPOSE`), or on its conjugate 83 | transpose (`CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE`). 84 | """ 85 | const cusparseOperation_t = UInt32 86 | const CUSPARSE_OPERATION_NON_TRANSPOSE = 0 87 | const CUSPARSE_OPERATION_TRANSPOSE = 1 88 | const CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 89 | 90 | #enum cusparseMatrixType_t 91 | """ 92 | Indicates whether a matrix is a general matrix (`CUSPARSE_MATRIX_TYPE_GENERAL`), 93 | symmetric (`CUSPARSE_MATRIX_TYPE_SYMMETRIC`), Hermitian 94 | (`CUSPARSE_MATRIX_TYPE_HERMITIAN`), or triangular 95 | (`CUSPARSE_MATRIX_TYPE_TRIANGULAR`). Note that for some matrix types 96 | (those in [`CompressedSparse`](@ref)), this can be inferred for some function 97 | calls. 98 | """ 99 | const cusparseMatrixType_t = UInt32 100 | const CUSPARSE_MATRIX_TYPE_GENERAL = 0 101 | const CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1 102 | const CUSPARSE_MATRIX_TYPE_HERMITIAN = 2 103 | const CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3 104 | 105 | #enum cusparseSolvePolicy_t 106 | """ 107 | Indicates whether to keep level info in solvers (`CUSPARSE_SOLVE_POLICY_USE_LEVEL`) 108 | or whether to not use it (`CUSPARSE_SOLVE_POLICY_NO_LEVEL`). 
109 | """ 110 | const cusparseSolvePolicy_t = UInt32 111 | const CUSPARSE_SOLVE_POLICY_NO_LEVEL = 0 112 | const CUSPARSE_SOLVE_POLICY_USE_LEVEL = 1 113 | 114 | #enum cusparseIndexBase_t 115 | """ 116 | Indicates whether a sparse object is zero-indexed (`CUSPARSE_INDEX_BASE_ZERO`) 117 | or one-indexed (`CUSPARSE_INDEX_BASE_ONE`). CUSPARSE.jl supports both. Julia 118 | sparse matrices are one-indexed, but you may wish to pass matrices from other 119 | libraries which use zero-indexing (e.g. C language ODE solvers). 120 | """ 121 | const cusparseIndexBase_t = UInt32 122 | const CUSPARSE_INDEX_BASE_ZERO = 0 123 | const CUSPARSE_INDEX_BASE_ONE = 1 124 | 125 | #struct cusparseMatDescr_t 126 | """ 127 | Describes shape and properties of a CUSPARSE matrix. A convenience wrapper. 128 | 129 | Contains: 130 | * `MatrixType` - a [`cusparseMatrixType_t`](@ref) 131 | * `FillMode` - a [`cusparseFillMode_t`](@ref) 132 | * `DiagType` - a [`cusparseDiagType_t`](@ref) 133 | * `IndexBase` - a [`cusparseIndexBase_t`](@ref) 134 | """ 135 | struct cusparseMatDescr_t 136 | MatrixType::cusparseMatrixType_t 137 | FillMode::cusparseFillMode_t 138 | DiagType::cusparseDiagType_t 139 | IndexBase::cusparseIndexBase_t 140 | function cusparseMatDescr_t(MatrixType,FillMode,DiagType,IndexBase) 141 | new(MatrixType,FillMode,DiagType,IndexBase) 142 | end 143 | end 144 | 145 | """ 146 | An opaque struct containing information about the solution approach 147 | CUSPARSE will take. Generated by [`sv_analysis`](@ref) or 148 | [`sm_analysis`](@ref) and passed to [`sv_solve!`](@ref), [`sm_solve`](@ref), 149 | [`ic0!`](@ref), or [`ilu0!`](@ref). 150 | """ 151 | const cusparseSolveAnalysisInfo_t = Ptr{Cvoid} 152 | const bsrsm2Info_t = Ptr{Cvoid} 153 | const bsrsv2Info_t = Ptr{Cvoid} 154 | const csrsv2Info_t = Ptr{Cvoid} 155 | const csric02Info_t = Ptr{Cvoid} 156 | const csrilu02Info_t = Ptr{Cvoid} 157 | const bsric02Info_t = Ptr{Cvoid} 158 | const bsrilu02Info_t = Ptr{Cvoid} 159 | 160 | const cusparseContext = Cvoid 161 | const cusparseHandle_t = Ptr{cusparseContext} 162 | 163 | #complex numbers 164 | 165 | const cuComplex = Complex{Float32} 166 | const cuDoubleComplex = Complex{Float64} 167 | 168 | const CusparseFloat = Union{Float64,Float32,ComplexF64,ComplexF32} 169 | const CusparseReal = Union{Float64,Float32} 170 | const CusparseComplex = Union{ComplexF64,ComplexF32} 171 | 172 | const cusparseHybMat_t = Ptr{Cvoid} 173 | -------------------------------------------------------------------------------- /src/rand/libcurand.jl: -------------------------------------------------------------------------------- 1 | function create_generator(typ::Int=CURAND_RNG_PSEUDO_DEFAULT) 2 | ptr = Ref{curandGenerator_t}() 3 | @check ccall((:curandCreateGenerator, libcurand), 4 | curandStatus_t, 5 | (Ptr{curandGenerator_t}, Cint), ptr, typ) 6 | r = RNG(ptr[], typ) 7 | finalizer(destroy_generator, r) 8 | return r 9 | end 10 | 11 | function destroy_generator(rng::RNG) 12 | @check ccall((:curandDestroyGenerator, libcurand), 13 | curandStatus_t, 14 | (curandGenerator_t,), rng) 15 | end 16 | 17 | function get_version() 18 | ver = Ref{Cint}() 19 | @check ccall((:curandGetVersion, libcurand), 20 | curandStatus_t, 21 | (Ref{Cint},), ver) 22 | return ver[] 23 | end 24 | 25 | # TODO: curandSetStream 26 | 27 | function set_pseudo_random_generator_seed(rng::RNG, seed::Int64) 28 | @check ccall((:curandSetPseudoRandomGeneratorSeed, libcurand), 29 | curandStatus_t, 30 | (curandGenerator_t, Clonglong), rng, seed) 31 | end 32 | 33 | function 
set_generator_offset(rng::RNG, offset::Int64) 34 | @check ccall((:curandSetGeneratorOffset, libcurand), 35 | curandStatus_t, 36 | (curandGenerator_t, Clonglong), rng, offset) 37 | end 38 | 39 | function set_generator_ordering(rng::RNG, order::Int) 40 | @check ccall((:curandSetGeneratorOrdering, libcurand), 41 | curandStatus_t, 42 | (curandGenerator_t, Cint), rng, order) 43 | end 44 | 45 | function set_quasi_random_generator_dimensions(rng::RNG, num_dimensions::UInt) 46 | @check ccall((:curandSetQuasiRandomGeneratorDimensions, libcurand), 47 | curandStatus_t, 48 | (curandGenerator_t, Cuint), 49 | rng, num_dimensions) 50 | end 51 | 52 | 53 | """ 54 | Generate 32-bit pseudo- or quasirandom unsigned integers. 55 | """ 56 | function generate(rng::RNG, arr::CuArray, n::UInt) 57 | @check ccall((:curandGenerate, libcurand), 58 | curandStatus_t, 59 | (curandGenerator_t, CuPtr{UInt32}, Csize_t), 60 | rng, arr, length(arr)) 61 | return arr 62 | end 63 | 64 | 65 | """ 66 | Generate 64-bit quasirandom numbers. 67 | 68 | Valid RNG types are: 69 | - CURAND_RNG_QUASI_SOBOL64 70 | - CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 71 | """ 72 | function generate_long_long(rng::RNG, arr::CuArray) 73 | @check ccall((:curandGenerateLongLong, libcurand), 74 | curandStatus_t, 75 | (curandGenerator_t, CuPtr{Culonglong}, Csize_t), 76 | rng, arr, length(arr)) 77 | return arr 78 | end 79 | 80 | # uniform 81 | function generate_uniform(rng::RNG, arr::CuArray) 82 | @check ccall((:curandGenerateUniform, libcurand), 83 | curandStatus_t, 84 | (curandGenerator_t, CuPtr{Float32}, Csize_t), 85 | rng, arr, length(arr)) 86 | return arr 87 | end 88 | 89 | function generate_uniform_double(rng::RNG, arr::CuArray) 90 | @check ccall((:curandGenerateUniformDouble, libcurand), 91 | curandStatus_t, 92 | (curandGenerator_t, CuPtr{Float64}, Csize_t), 93 | rng, arr, length(arr)) 94 | return arr 95 | end 96 | 97 | # normal 98 | function generate_normal(rng::RNG, arr::CuArray, mean, stddev) 99 | @check ccall((:curandGenerateNormal, libcurand), 100 | curandStatus_t, 101 | (curandGenerator_t, CuPtr{Cfloat}, Csize_t, Cfloat, Cfloat), 102 | rng, arr, length(arr), mean, stddev) 103 | return arr 104 | end 105 | 106 | function generate_normal_double(rng::RNG, arr::CuArray, mean, stddev) 107 | @check ccall((:curandGenerateNormalDouble, libcurand), 108 | curandStatus_t, 109 | (curandGenerator_t, CuPtr{Cdouble}, Csize_t, Cdouble, Cdouble), 110 | rng, arr, length(arr), mean, stddev) 111 | return arr 112 | end 113 | 114 | 115 | # lognormal 116 | function generate_log_normal(rng::RNG, arr::CuArray, mean, stddev) 117 | @check ccall((:curandGenerateLogNormal, libcurand), 118 | curandStatus_t, 119 | (curandGenerator_t, CuPtr{Cfloat}, Csize_t, Cfloat, Cfloat), 120 | rng, arr, length(arr), mean, stddev) 121 | return arr 122 | end 123 | 124 | function generate_log_normal_double(rng::RNG, arr::CuArray, mean, stddev) 125 | @check ccall((:curandGenerateLogNormalDouble, libcurand), 126 | curandStatus_t, 127 | (curandGenerator_t, CuPtr{Cdouble}, Csize_t, Cdouble, Cdouble), 128 | rng, arr, length(arr), mean, stddev) 129 | return arr 130 | end 131 | 132 | # Poisson 133 | """Construct the histogram array for a Poisson distribution.""" 134 | function create_poisson_distribution(lambda) 135 | ptr = Ref{curandDiscreteDistribution_t}() 136 | @check ccall((:curandCreatePoissonDistribution, libcurand), 137 | curandStatus_t, 138 | (Cdouble, Ptr{curandDiscreteDistribution_t}), 139 | lambda, ptr) 140 | dist = DiscreteDistribution(ptr[]) 141 | finalizer(destroy_distribution, dist) 142 | return dist 143 |
end 144 | 145 | """Destroy the histogram array for a discrete distribution (e.g. Poisson).""" 146 | function destroy_distribution(dist::DiscreteDistribution) 147 | @check ccall((:curandDestroyDistribution, libcurand), 148 | curandStatus_t, 149 | (curandDiscreteDistribution_t,), 150 | dist) 151 | end 152 | 153 | """Generate Poisson-distributed unsigned ints.""" 154 | function generate_poisson(rng::RNG, arr::CuArray, lambda) 155 | @check ccall((:curandGeneratePoisson, libcurand), 156 | curandStatus_t, 157 | (curandGenerator_t, CuPtr{Cuint}, Csize_t, Cdouble), 158 | rng, arr, length(arr), lambda) 159 | return arr 160 | end 161 | 162 | # seeds 163 | """Generate the starting state of the generator. """ 164 | function generate_seeds(rng::RNG) 165 | @check ccall((:curandGenerateSeeds, libcurand), 166 | curandStatus_t, 167 | (curandGenerator_t,), rng) 168 | end 169 | 170 | # TODO: curandGetDirectionVectors32 171 | # TODO: curandGetScrambleConstants32 172 | # TODO: curandGetDirectionVectors64 173 | # TODO: curandGetScrambleConstants64 174 | 175 | function curandGetProperty(property::CUDAapi.libraryPropertyType) 176 | value_ref = Ref{Cint}() 177 | @check ccall((:curandGetProperty, libcurand), 178 | curandStatus_t, 179 | (Cint, Ptr{Cint}), 180 | property, value_ref) 181 | value_ref[] 182 | end 183 | -------------------------------------------------------------------------------- /test/fft.jl: -------------------------------------------------------------------------------- 1 | @testset "CUFFT" begin 2 | 3 | using CuArrays.CUFFT 4 | 5 | using FFTW 6 | 7 | # notes: 8 | # plan_bfft does not need separate testing since it is used by plan_ifft 9 | 10 | N1 = 8 11 | N2 = 32 12 | N3 = 64 13 | N4 = 8 14 | 15 | MYRTOL = 1e-5 16 | MYATOL = 1e-8 17 | 18 | # out-of-place 19 | function dotest1(X::AbstractArray{T,N}) where {T <: Complex,N} 20 | fftw_X = fft(X) 21 | d_X = CuArray(X) 22 | p = plan_fft(d_X) 23 | d_Y = p * d_X 24 | Y = collect(d_Y) 25 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 26 | 27 | pinv = plan_ifft(d_Y) 28 | d_Z = pinv * d_Y 29 | Z = collect(d_Z) 30 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 31 | 32 | pinv2 = inv(p) 33 | d_Z = pinv2 * d_Y 34 | Z = collect(d_Z) 35 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 36 | end 37 | 38 | function dotest1(X::AbstractArray{T,N}) where {T <: Real,N} 39 | fftw_X = rfft(X) 40 | d_X = CuArray(X) 41 | p = plan_rfft(d_X) 42 | d_Y = p * d_X 43 | Y = collect(d_Y) 44 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 45 | 46 | pinv = plan_irfft(d_Y,size(X,1)) 47 | d_Z = pinv * d_Y 48 | Z = collect(d_Z) 49 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 50 | 51 | pinv2 = inv(p) 52 | d_Z = pinv2 * d_Y 53 | Z = collect(d_Z) 54 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 55 | 56 | pinv3 = inv(pinv) 57 | d_W = pinv3 * d_X 58 | W = collect(d_W) 59 | @test isapprox(W, Y, rtol = MYRTOL, atol = MYATOL) 60 | end 61 | 62 | # in-place 63 | function dotest2(X::AbstractArray{T,N}) where {T <: Complex,N} 64 | fftw_X = fft(X) 65 | d_X = CuArray(X) 66 | p = plan_fft!(d_X) 67 | p * d_X 68 | Y = collect(d_X) 69 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 70 | 71 | pinv = plan_ifft!(d_X) 72 | pinv * d_X 73 | Z = collect(d_X) 74 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 75 | end 76 | 77 | # no inplace rfft for now 78 | 79 | # batch transforms 80 | function dotest3(X::AbstractArray{T,N},region) where {T <: Complex,N} 81 | fftw_X = fft(X,region) 82 | d_X = CuArray(X) 83 | p = plan_fft(d_X,region) 84 | d_Y = p * d_X 
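# (note: applying a plan with `*` allocates the output; `LinearAlgebra.mul!` can
# reuse a preallocated destination instead — see src/fft/highlevel.jl)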
85 | Y = collect(d_Y) 86 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 87 | 88 | pinv = plan_ifft(d_Y,region) 89 | d_Z = pinv * d_Y 90 | Z = collect(d_Z) 91 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 92 | end 93 | 94 | function dotest3(X::AbstractArray{T,N},region) where {T <: Real,N} 95 | fftw_X = rfft(X,region) 96 | d_X = CuArray(X) 97 | p = plan_rfft(d_X,region) 98 | d_Y = p * d_X 99 | Y = collect(d_Y) 100 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 101 | 102 | pinv = plan_irfft(d_Y,size(X,region[1]),region) 103 | d_Z = pinv * d_Y 104 | Z = collect(d_Z) 105 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 106 | end 107 | 108 | 109 | @testset "FFT" for (rtype,ctype) in [(Float32,ComplexF32), (Float64,ComplexF64)] 110 | 111 | @testset "1D FFT" begin 112 | dims = (N1,) 113 | X = rand(ctype, dims) 114 | dotest1(X) 115 | end 116 | @testset "1D inplace FFT" begin 117 | dims = (N1,) 118 | X = rand(ctype, dims) 119 | dotest2(X) 120 | end 121 | 122 | @testset "2D FFT" begin 123 | dims = (N1,N2) 124 | X = rand(ctype, dims) 125 | dotest1(X) 126 | end 127 | @testset "2D inplace FFT" begin 128 | dims = (N1,N2) 129 | X = rand(ctype, dims) 130 | dotest2(X) 131 | end 132 | 133 | @testset "Batch 1D FFT" begin 134 | dims = (N1,N2) 135 | X = rand(ctype, dims) 136 | dotest3(X,1) 137 | 138 | dims = (N1,N2) 139 | X = rand(ctype, dims) 140 | dotest3(X,2) 141 | 142 | dims = (N1,N2) 143 | X = rand(ctype, dims) 144 | dotest3(X,(1,2)) 145 | end 146 | 147 | @testset "3D FFT" begin 148 | dims = (N1,N2,N3) 149 | X = rand(ctype, dims) 150 | dotest1(X) 151 | end 152 | @testset "3D inplace FFT" begin 153 | dims = (N1,N2,N3) 154 | X = rand(ctype, dims) 155 | dotest2(X) 156 | end 157 | 158 | @testset "Batch 2D FFT (in 3D)" begin 159 | dims = (N1,N2,N3) 160 | for region in [(1,2),(2,3),(1,3)] 161 | X = rand(ctype, dims) 162 | dotest3(X,region) 163 | end 164 | 165 | X = rand(ctype, dims) 166 | @test_throws ArgumentError dotest3(X,(3,1)) 167 | end 168 | 169 | @testset "Batch 2D FFT (in 4D)" begin 170 | dims = (N1,N2,N3,N4) 171 | for region in [(1,2),(1,4),(3,4)] 172 | X = rand(ctype, dims) 173 | dotest3(X,region) 174 | end 175 | for region in [(1,3),(2,3),(2,4)] 176 | X = rand(ctype, dims) 177 | @test_throws ArgumentError dotest3(X,region) 178 | end 179 | 180 | end 181 | 182 | @testset "1D real FFT" begin 183 | X = rand(rtype, N1) 184 | dotest1(X) 185 | end 186 | 187 | @testset "Batch 1D real FFT" begin 188 | dims = (N1,N2) 189 | X = rand(rtype, dims) 190 | dotest3(X,1) 191 | 192 | dims = (N1,N2) 193 | X = rand(rtype, dims) 194 | dotest3(X,2) 195 | 196 | dims = (N1,N2) 197 | X = rand(rtype, dims) 198 | dotest3(X,(1,2)) 199 | end 200 | 201 | @testset "2D real FFT" begin 202 | X = rand(rtype, N1,N2) 203 | dotest1(X) 204 | end 205 | 206 | @testset "Batch 2D real FFT (in 3D)" begin 207 | dims = (N1,N2,N3) 208 | for region in [(1,2),(2,3),(1,3)] 209 | X = rand(rtype, dims) 210 | dotest3(X,region) 211 | end 212 | 213 | X = rand(rtype, dims) 214 | @test_throws ArgumentError dotest3(X,(3,1)) 215 | end 216 | 217 | @testset "Batch 2D real FFT (in 4D)" begin 218 | dims = (N1,N2,N3,N4) 219 | for region in [(1,2),(1,4),(3,4)] 220 | X = rand(rtype, dims) 221 | dotest3(X,region) 222 | end 223 | for region in [(1,3),(2,3),(2,4)] 224 | X = rand(rtype, dims) 225 | @test_throws ArgumentError dotest3(X,region) 226 | end 227 | end 228 | 229 | @testset "3D real FFT" begin 230 | X = rand(rtype, N1, N2, N3) 231 | dotest1(X) 232 | end 233 | 234 | end # testset FFT 235 | 236 | # integer array arguments 237 | 
function dotest5(X::AbstractArray{T,N}) where {T <: Complex,N} 238 | fftw_X = fft(X) 239 | d_X = CuArray(X) 240 | p = plan_fft(d_X) 241 | d_Y = p * d_X 242 | Y = collect(d_Y) 243 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 244 | d_Y = fft(d_X) 245 | Y = collect(d_Y) 246 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 247 | end 248 | 249 | function dotest5(X::AbstractArray{T,N}) where {T <: Real,N} 250 | fftw_X = rfft(X) 251 | d_X = CuArray(X) 252 | p = plan_rfft(d_X) 253 | d_Y = p * d_X 254 | Y = collect(d_Y) 255 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 256 | d_Y = rfft(d_X) 257 | Y = collect(d_Y) 258 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 259 | end 260 | 261 | @testset "Int FFT" for (rtype,ctype) in [(Int32,Complex{Int32}), (Int64,Complex{Int64})] 262 | 263 | @testset "1D FFT" begin 264 | dims = (N1,) 265 | X = rand(ctype, dims) 266 | dotest5(X) 267 | end 268 | 269 | @testset "1D real FFT" begin 270 | X = rand(rtype, N1) 271 | dotest5(X) 272 | end 273 | 274 | 275 | end # testset int FFT 276 | 277 | @testset "streams" begin 278 | X = rand(N1) 279 | d_X = CuArray(X) 280 | p = plan_fft(d_X) 281 | CUFFT.set_stream(p, CUDAdrv.CuDefaultStream()) 282 | end 283 | 284 | end 285 | -------------------------------------------------------------------------------- /src/dnn/libcudnn_types.jl: -------------------------------------------------------------------------------- 1 | const CUDNN_DIM_MAX = 8 2 | const CUDNN_LRN_MIN_N = 1 3 | const CUDNN_LRN_MAX_N = 16 4 | const CUDNN_LRN_MIN_K = 1.0e-5 5 | const CUDNN_LRN_MIN_BETA = 0.01 6 | const CUDNN_BN_MIN_EPSILON = 1.0e-5 7 | 8 | mutable struct cudnnContext 9 | end 10 | 11 | const cudnnHandle_t = Ptr{cudnnContext} 12 | 13 | # begin enum cudnnStatus_t 14 | const cudnnStatus_t = UInt32 15 | const CUDNN_STATUS_SUCCESS = (UInt32)(0) 16 | const CUDNN_STATUS_NOT_INITIALIZED = (UInt32)(1) 17 | const CUDNN_STATUS_ALLOC_FAILED = (UInt32)(2) 18 | const CUDNN_STATUS_BAD_PARAM = (UInt32)(3) 19 | const CUDNN_STATUS_INTERNAL_ERROR = (UInt32)(4) 20 | const CUDNN_STATUS_INVALID_VALUE = (UInt32)(5) 21 | const CUDNN_STATUS_ARCH_MISMATCH = (UInt32)(6) 22 | const CUDNN_STATUS_MAPPING_ERROR = (UInt32)(7) 23 | const CUDNN_STATUS_EXECUTION_FAILED = (UInt32)(8) 24 | const CUDNN_STATUS_NOT_SUPPORTED = (UInt32)(9) 25 | const CUDNN_STATUS_LICENSE_ERROR = (UInt32)(10) 26 | # end enum cudnnStatus_t 27 | 28 | mutable struct cudnnTensorStruct 29 | end 30 | 31 | const cudnnTensorDescriptor_t = Ptr{cudnnTensorStruct} 32 | 33 | mutable struct cudnnConvolutionStruct 34 | end 35 | 36 | const cudnnConvolutionDescriptor_t = Ptr{cudnnConvolutionStruct} 37 | 38 | mutable struct cudnnPoolingStruct 39 | end 40 | 41 | const cudnnPoolingDescriptor_t = Ptr{cudnnPoolingStruct} 42 | 43 | mutable struct cudnnFilterStruct 44 | end 45 | 46 | const cudnnFilterDescriptor_t = Ptr{cudnnFilterStruct} 47 | 48 | mutable struct cudnnLRNStruct 49 | end 50 | 51 | const cudnnLRNDescriptor_t = Ptr{cudnnLRNStruct} 52 | 53 | mutable struct cudnnActivationStruct 54 | end 55 | 56 | const cudnnActivationDescriptor_t = Ptr{cudnnActivationStruct} 57 | 58 | # begin enum cudnnDataType_t 59 | const cudnnDataType_t = UInt32 60 | const CUDNN_DATA_FLOAT = (UInt32)(0) 61 | const CUDNN_DATA_DOUBLE = (UInt32)(1) 62 | const CUDNN_DATA_HALF = (UInt32)(2) 63 | # end enum cudnnDataType_t 64 | 65 | # begin enum cudnnNanPropagation_t 66 | const cudnnNanPropagation_t = UInt32 67 | const CUDNN_NOT_PROPAGATE_NAN = (UInt32)(0) 68 | const CUDNN_PROPAGATE_NAN = (UInt32)(1) 69 | # end 
enum cudnnNanPropagation_t 70 | 71 | # begin enum cudnnTensorFormat_t 72 | const cudnnTensorFormat_t = UInt32 73 | const CUDNN_TENSOR_NCHW = (UInt32)(0) 74 | const CUDNN_TENSOR_NHWC = (UInt32)(1) 75 | # end enum cudnnTensorFormat_t 76 | 77 | # begin enum cudnnAddMode_t 78 | const cudnnAddMode_t = UInt32 79 | const CUDNN_ADD_IMAGE = (UInt32)(0) 80 | const CUDNN_ADD_SAME_HW = (UInt32)(0) 81 | const CUDNN_ADD_FEATURE_MAP = (UInt32)(1) 82 | const CUDNN_ADD_SAME_CHW = (UInt32)(1) 83 | const CUDNN_ADD_SAME_C = (UInt32)(2) 84 | const CUDNN_ADD_FULL_TENSOR = (UInt32)(3) 85 | # end enum cudnnAddMode_t 86 | 87 | # begin enum cudnnConvolutionMode_t 88 | const cudnnConvolutionMode_t = UInt32 89 | const CUDNN_CONVOLUTION = (UInt32)(0) 90 | const CUDNN_CROSS_CORRELATION = (UInt32)(1) 91 | # end enum cudnnConvolutionMode_t 92 | 93 | # begin enum cudnnConvolutionFwdPreference_t 94 | const cudnnConvolutionFwdPreference_t = UInt32 95 | const CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = (UInt32)(0) 96 | const CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = (UInt32)(1) 97 | const CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = (UInt32)(2) 98 | # end enum cudnnConvolutionFwdPreference_t 99 | 100 | # begin enum cudnnConvolutionFwdAlgo_t 101 | const cudnnConvolutionFwdAlgo_t = UInt32 102 | const CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = (UInt32)(0) 103 | const CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = (UInt32)(1) 104 | const CUDNN_CONVOLUTION_FWD_ALGO_GEMM = (UInt32)(2) 105 | const CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = (UInt32)(3) 106 | const CUDNN_CONVOLUTION_FWD_ALGO_FFT = (UInt32)(4) 107 | const CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = (UInt32)(5) 108 | # end enum cudnnConvolutionFwdAlgo_t 109 | 110 | mutable struct cudnnConvolutionFwdAlgoPerf_t 111 | algo::cudnnConvolutionFwdAlgo_t 112 | status::cudnnStatus_t 113 | time::Cfloat 114 | memory::Cint 115 | end 116 | 117 | # begin enum cudnnConvolutionBwdFilterPreference_t 118 | const cudnnConvolutionBwdFilterPreference_t = UInt32 119 | const CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = (UInt32)(0) 120 | const CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = (UInt32)(1) 121 | const CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = (UInt32)(2) 122 | # end enum cudnnConvolutionBwdFilterPreference_t 123 | 124 | # begin enum cudnnConvolutionBwdFilterAlgo_t 125 | const cudnnConvolutionBwdFilterAlgo_t = UInt32 126 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = (UInt32)(0) 127 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = (UInt32)(1) 128 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = (UInt32)(2) 129 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = (UInt32)(3) 130 | # end enum cudnnConvolutionBwdFilterAlgo_t 131 | 132 | mutable struct cudnnConvolutionBwdFilterAlgoPerf_t 133 | algo::cudnnConvolutionBwdFilterAlgo_t 134 | status::cudnnStatus_t 135 | time::Cfloat 136 | memory::Cint 137 | end 138 | 139 | # begin enum cudnnConvolutionBwdDataPreference_t 140 | const cudnnConvolutionBwdDataPreference_t = UInt32 141 | const CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = (UInt32)(0) 142 | const CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = (UInt32)(1) 143 | const CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = (UInt32)(2) 144 | # end enum cudnnConvolutionBwdDataPreference_t 145 | 146 | # begin enum cudnnConvolutionBwdDataAlgo_t 147 | const cudnnConvolutionBwdDataAlgo_t = UInt32 148 | const CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = (UInt32)(0) 149 | const CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = (UInt32)(1) 150 | const CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = (UInt32)(2) 151 | const 
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = (UInt32)(3) 152 | # end enum cudnnConvolutionBwdDataAlgo_t 153 | 154 | mutable struct cudnnConvolutionBwdDataAlgoPerf_t 155 | algo::cudnnConvolutionBwdDataAlgo_t 156 | status::cudnnStatus_t 157 | time::Cfloat 158 | memory::Cint 159 | end 160 | 161 | # begin enum cudnnSoftmaxAlgorithm_t 162 | const cudnnSoftmaxAlgorithm_t = UInt32 163 | const CUDNN_SOFTMAX_FAST = (UInt32)(0) 164 | const CUDNN_SOFTMAX_ACCURATE = (UInt32)(1) 165 | const CUDNN_SOFTMAX_LOG = (UInt32)(2) 166 | # end enum cudnnSoftmaxAlgorithm_t 167 | 168 | # begin enum cudnnSoftmaxMode_t 169 | const cudnnSoftmaxMode_t = UInt32 170 | const CUDNN_SOFTMAX_MODE_INSTANCE = (UInt32)(0) 171 | const CUDNN_SOFTMAX_MODE_CHANNEL = (UInt32)(1) 172 | # end enum cudnnSoftmaxMode_t 173 | 174 | # begin enum cudnnPoolingMode_t 175 | const cudnnPoolingMode_t = UInt32 176 | const CUDNN_POOLING_MAX = (UInt32)(0) 177 | const CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = (UInt32)(1) 178 | const CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = (UInt32)(2) 179 | # end enum cudnnPoolingMode_t 180 | 181 | # begin enum cudnnActivationMode_t 182 | const cudnnActivationMode_t = UInt32 183 | const CUDNN_ACTIVATION_SIGMOID = (UInt32)(0) 184 | const CUDNN_ACTIVATION_RELU = (UInt32)(1) 185 | const CUDNN_ACTIVATION_TANH = (UInt32)(2) 186 | const CUDNN_ACTIVATION_CLIPPED_RELU = (UInt32)(3) 187 | const CUDNN_ACTIVATION_ELU = (UInt32)(4) 188 | const CUDNN_ACTIVATION_IDENTITY = (UInt32)(5) 189 | # end enum cudnnActivationMode_t 190 | 191 | # begin enum cudnnLRNMode_t 192 | const cudnnLRNMode_t = UInt32 193 | const CUDNN_LRN_CROSS_CHANNEL_DIM1 = (UInt32)(0) 194 | # end enum cudnnLRNMode_t 195 | 196 | # begin enum cudnnDivNormMode_t 197 | const cudnnDivNormMode_t = UInt32 198 | const CUDNN_DIVNORM_PRECOMPUTED_MEANS = (UInt32)(0) 199 | # end enum cudnnDivNormMode_t 200 | 201 | # begin enum cudnnBatchNormMode_t 202 | const cudnnBatchNormMode_t = UInt32 203 | const CUDNN_BATCHNORM_PER_ACTIVATION = (UInt32)(0) 204 | const CUDNN_BATCHNORM_SPATIAL = (UInt32)(1) 205 | # end enum cudnnBatchNormMode_t 206 | 207 | # begin enum cudnnMathType_t 208 | const cudnnMathType_t = UInt32 209 | const CUDNN_DEFAULT_MATH = 0 210 | const CUDNN_TENSOR_OP_MATH = 1 211 | const CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2 212 | # end enum cudnnMathType_t 213 | -------------------------------------------------------------------------------- /src/fft/wrappers.jl: -------------------------------------------------------------------------------- 1 | # wrappers of the low-level CUFFT functionality 2 | 3 | import CUDAdrv: CuStream 4 | 5 | # Note: we don't implement padded storage dimensions 6 | function _mkplan(xtype, xdims, region) 7 | nrank = length(region) 8 | sz = [xdims[i] for i in region] 9 | csz = copy(sz) 10 | csz[1] = div(sz[1],2) + 1 11 | batch = prod(xdims) ÷ prod(sz) 12 | 13 | pp = Ref{cufftHandle_t}() 14 | if (nrank == 1) && (batch == 1) 15 | cufftPlan1d(pp, sz[1], xtype, 1) 16 | elseif (nrank == 2) && (batch == 1) 17 | cufftPlan2d(pp, sz[2], sz[1], xtype) 18 | elseif (nrank == 3) && (batch == 1) 19 | cufftPlan3d(pp, sz[3], sz[2], sz[1], xtype) 20 | else 21 | rsz = (length(sz) > 1) ? reverse(sz) : sz 22 | if ((region...,) == ((1:nrank)...,)) 23 | # handle simple case ... simply!
(for robustness) 24 | cufftPlanMany(pp, nrank, Cint[rsz...], C_NULL, 1, 1, C_NULL, 1, 1, 25 | xtype, batch) 26 | else 27 | if nrank==1 || all(diff(collect(region)) .== 1) 28 | # _stride: successive elements in innermost dimension 29 | # _dist: distance between first elements of batches 30 | if region[1] == 1 31 | istride = 1 32 | idist = prod(sz) 33 | cdist = prod(csz) 34 | else 35 | if region[end] != length(xdims) 36 | throw(ArgumentError("batching dims must be sequential")) 37 | end 38 | istride = prod(xdims[1:region[1]-1]) 39 | idist = 1 40 | cdist = 1 41 | end 42 | inembed = Cint[rsz...] 43 | cnembed = (length(csz) > 1) ? Cint[reverse(csz)...] : Cint[csz[1]] 44 | ostride = istride 45 | if xtype == CUFFT_R2C || xtype == CUFFT_D2Z 46 | odist = cdist 47 | onembed = cnembed 48 | else 49 | odist = idist 50 | onembed = inembed 51 | end 52 | if xtype == CUFFT_C2R || xtype == CUFFT_Z2D 53 | idist = cdist 54 | inembed = cnembed 55 | end 56 | else 57 | if any(diff(collect(region)) .< 1) 58 | throw(ArgumentError("region must be an increasing sequence")) 59 | end 60 | cdims = collect(xdims) 61 | cdims[region[1]] = div(cdims[region[1]],2)+1 62 | 63 | if region[1] == 1 64 | istride = 1 65 | ii=1 66 | while (ii < nrank) && (region[ii] == region[ii+1]-1) 67 | ii += 1 68 | end 69 | idist = prod(xdims[1:ii]) 70 | cdist = prod(cdims[1:ii]) 71 | ngaps = 0 72 | else 73 | istride = prod(xdims[1:region[1]-1]) 74 | idist = 1 75 | cdist = 1 76 | ngaps = 1 77 | end 78 | nem = ones(Int,nrank) 79 | cem = ones(Int,nrank) 80 | id = 1 81 | for ii=1:nrank-1 82 | if region[ii+1] > region[ii]+1 83 | ngaps += 1 84 | end 85 | while id < region[ii+1] 86 | nem[ii] *= xdims[id] 87 | cem[ii] *= cdims[id] 88 | id += 1 89 | end 90 | @assert nem[ii] >= sz[ii] 91 | end 92 | if region[end] < length(xdims) 93 | ngaps += 1 94 | end 95 | # CUFFT represents batches by a single stride (_dist) 96 | # so we must verify that region is consistent with this: 97 | if ngaps > 1 98 | throw(ArgumentError("batch regions must be sequential")) 99 | end 100 | 101 | inembed = Cint[reverse(nem)...] 102 | cnembed = Cint[reverse(cem)...] 
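# At this point inembed/istride/idist (and the c*-prefixed complex-side analogues)
# describe cuFFT's "advanced data layout": element (x_1, ..., x_r) of batch b is
# addressed as b*idist + ((x_1*nembed_2 + x_2)*nembed_3 + ... + x_r)*istride,
# per the cuFFT documentation; the nembed arrays are reversed above because
# CUFFT is row-major while Julia arrays are column-major.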
103 | ostride = istride 104 | if xtype == CUFFT_R2C || xtype == CUFFT_D2Z 105 | odist = cdist 106 | onembed = cnembed 107 | else 108 | odist = idist 109 | onembed = inembed 110 | end 111 | if xtype == CUFFT_C2R || xtype == CUFFT_Z2D 112 | idist = cdist 113 | inembed = cnembed 114 | end 115 | end 116 | cufftPlanMany(pp, nrank, Cint[rsz...], 117 | inembed, istride, idist, onembed, ostride, odist, 118 | xtype, batch) 119 | end 120 | end 121 | pp[] 122 | end 123 | 124 | # this is used implicitly in the unsafe_execute methods below: 125 | unsafe_convert(::Type{cufftHandle_t}, p::CuFFTPlan) = p.plan 126 | 127 | convert(::Type{cufftHandle_t}, p::CuFFTPlan) = p.plan 128 | 129 | destroy_plan(plan::CuFFTPlan) = cufftDestroy(plan) 130 | 131 | set_stream(plan::CuFFTPlan, stream::CuStream) = cufftSetStream(plan, stream) 132 | 133 | function assert_applicable(p::CuFFTPlan{T,K}, X::CuArray{T}) where {T,K} 134 | (size(X) == p.sz) || 135 | throw(ArgumentError("CuFFT plan applied to wrong-size input")) 136 | end 137 | 138 | function assert_applicable(p::CuFFTPlan{T,K}, X::CuArray{T}, Y::CuArray{Ty}) where {T,K,Ty} 139 | assert_applicable(p, X) 140 | (size(Y) == p.osz) || 141 | throw(ArgumentError("CuFFT plan applied to wrong-size output")) 142 | # type errors should be impossible by dispatch, but just in case: 143 | if p.xtype ∈ [CUFFT_C2R, CUFFT_Z2D] 144 | (Ty == real(T)) || 145 | throw(ArgumentError("Type mismatch for argument Y")) 146 | elseif p.xtype ∈ [CUFFT_R2C, CUFFT_D2Z] 147 | (Ty == complex(T)) || 148 | throw(ArgumentError("Type mismatch for argument Y")) 149 | else 150 | (Ty == T) || 151 | throw(ArgumentError("Type mismatch for argument Y")) 152 | end 153 | end 154 | 155 | function unsafe_execute!(plan::cCuFFTPlan{cufftComplex,K,true,N}, 156 | x::CuArray{cufftComplex,N}) where {K,N} 157 | @assert plan.xtype == CUFFT_C2C 158 | cufftExecC2C(plan, x, x, K) 159 | end 160 | function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,true,N}, 161 | x::CuArray{cufftComplex,N}) where {K,N} 162 | @assert plan.xtype == CUFFT_C2R 163 | cufftExecC2R(plan, x, x) 164 | end 165 | 166 | function unsafe_execute!(plan::cCuFFTPlan{cufftComplex,K,false,N}, 167 | x::CuArray{cufftComplex,N}, y::CuArray{cufftComplex} 168 | ) where {K,N} 169 | @assert plan.xtype == CUFFT_C2C 170 | cufftExecC2C(plan, x, y, K) 171 | end 172 | function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N}, 173 | x::CuArray{cufftComplex,N}, y::CuArray{cufftReal} 174 | ) where {K,N} 175 | @assert plan.xtype == CUFFT_C2R 176 | cufftExecC2R(plan, x, y) 177 | end 178 | 179 | function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,false,N}, 180 | x::CuArray{cufftReal,N}, y::CuArray{cufftComplex,N} 181 | ) where {K,N} 182 | @assert plan.xtype == CUFFT_R2C 183 | cufftExecR2C(plan, x, y) 184 | end 185 | 186 | # double prec. 
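# (these mirror the single-precision methods above, pairing the CUFFT_Z2Z/Z2D/D2Z
# plan types with cufftExecZ2Z/Z2D/D2Z)
#
# A minimal end-to-end sketch through the AbstractFFTs-style entry points that
# eventually land in these methods; variable names are illustrative:
#
#   x = CuArray(rand(ComplexF64, 64))   # double precision, so a Z2Z plan
#   p = plan_fft(x)                     # plan construction goes through _mkplan
#   y = p * x                           # forward transform via unsafe_execute!
#   x2 = inv(p) * y                     # round trip: x2 ≈ x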
187 | function unsafe_execute!(plan::cCuFFTPlan{cufftDoubleComplex,K,true,N}, 188 | x::CuArray{cufftDoubleComplex,N}) where {K,N} 189 | @assert plan.xtype == CUFFT_Z2Z 190 | cufftExecZ2Z(plan, x, x, K) 191 | end 192 | function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleComplex,K,true,N}, 193 | x::CuArray{cufftDoubleComplex,N}) where {K,N} 194 | @assert plan.xtype == CUFFT_Z2D 195 | cufftExecZ2D(plan, x, x) 196 | end 197 | 198 | function unsafe_execute!(plan::cCuFFTPlan{cufftDoubleComplex,K,false,N}, 199 | x::CuArray{cufftDoubleComplex,N}, y::CuArray{cufftDoubleComplex} 200 | ) where {K,N} 201 | @assert plan.xtype == CUFFT_Z2Z 202 | cufftExecZ2Z(plan, x, y, K) 203 | end 204 | function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleComplex,K,false,N}, 205 | x::CuArray{cufftDoubleComplex,N}, y::CuArray{cufftDoubleReal} 206 | ) where {K,N} 207 | @assert plan.xtype == CUFFT_Z2D 208 | cufftExecZ2D(plan, x, y) 209 | end 210 | 211 | function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleReal,K,false,N}, 212 | x::CuArray{cufftDoubleReal,N}, y::CuArray{cufftDoubleComplex,N} 213 | ) where {K,N} 214 | @assert plan.xtype == CUFFT_D2Z 215 | cufftExecD2Z(plan, x, y) 216 | end 217 | -------------------------------------------------------------------------------- /test/base.jl: -------------------------------------------------------------------------------- 1 | using ForwardDiff: Dual 2 | using LinearAlgebra 3 | using Adapt: adapt 4 | 5 | import CUDAdrv 6 | import CUDAdrv: CuPtr, CU_NULL 7 | 8 | @testset "GPUArrays test suite" begin 9 | GPUArrays.test(CuArray) 10 | end 11 | 12 | @testset "Memory" begin 13 | CuArrays.alloc(0) 14 | 15 | @test (CuArrays.@allocated CuArray{Int32}(undef,1)) == 4 16 | 17 | ret, out = @grab_output CuArrays.@time CuArray{Int32}(undef, 1) 18 | @test isa(ret, CuArray{Int32}) 19 | @test occursin("1 GPU allocation: 4 bytes", out) 20 | 21 | ret, out = @grab_output CuArrays.@time Base.unsafe_wrap(CuArray, CuPtr{Int32}(12345678), (2, 3)) 22 | @test isa(ret, CuArray{Int32}) 23 | @test !occursin("GPU allocation", out) 24 | end 25 | 26 | @testset "Array" begin 27 | xs = CuArray{Int}(undef, 2, 3) 28 | @test collect(CuArray([1 2; 3 4])) == [1 2; 3 4] 29 | @test collect(cu[1, 2, 3]) == [1, 2, 3] 30 | @test collect(cu([1, 2, 3])) == [1, 2, 3] 31 | @test testf(vec, rand(5,3)) 32 | @test cu(1:3) === 1:3 33 | @test Base.elsize(xs) == sizeof(Int) 34 | @test CuArray{Int, 2}(xs) === xs 35 | 36 | @test_throws ArgumentError Base.cconvert(Ptr, xs) 37 | 38 | # Check that allowscalar works 39 | @test_throws ErrorException xs[1] 40 | @test_throws ErrorException xs[1] = 1 41 | 42 | # unsafe_wrap 43 | buf = CUDAdrv.Mem.DeviceBuffer(CU_NULL, 2, CUDAdrv.CuCurrentContext()) 44 | @test Base.unsafe_wrap(CuArray, CU_NULL, 1; own=false).own == false 45 | @test Base.unsafe_wrap(CuArray, CU_NULL, 1; ctx=CUDAdrv.CuCurrentContext()).buf.ctx == CUDAdrv.CuCurrentContext() 46 | @test Base.unsafe_wrap(CuArray, CU_NULL, 2) == CuArray{Nothing,1}(buf, (2,)) 47 | @test Base.unsafe_wrap(CuArray{Nothing}, CU_NULL, 2) == CuArray{Nothing,1}(buf, (2,)) 48 | @test Base.unsafe_wrap(CuArray{Nothing,1}, CU_NULL, 2) == CuArray{Nothing,1}(buf, (2,)) 49 | @test Base.unsafe_wrap(CuArray, CU_NULL, (1,2)) == CuArray{Nothing,2}(buf, (1,2)) 50 | @test Base.unsafe_wrap(CuArray{Nothing}, CU_NULL, (1,2)) == CuArray{Nothing,2}(buf, (1,2)) 51 | @test Base.unsafe_wrap(CuArray{Nothing,2}, CU_NULL, (1,2)) == CuArray{Nothing,2}(buf, (1,2)) 52 | 53 | @test collect(CuArrays.zeros(2, 2)) == zeros(Float32, 2, 2) 54 | @test collect(CuArrays.ones(2, 2)) == 
ones(Float32, 2, 2) 55 | 56 | @test collect(CuArrays.fill(0, 2, 2)) == zeros(Float32, 2, 2) 57 | @test collect(CuArrays.fill(1, 2, 2)) == ones(Float32, 2, 2) 58 | end 59 | 60 | @testset "Adapt" begin 61 | A = rand(Float32, 3, 3) 62 | dA = CuArray(A) 63 | @test adapt(Array, dA) ≈ A 64 | @test adapt(CuArray, A) ≈ dA 65 | end 66 | 67 | @testset "Broadcast" begin 68 | @test testf((x) -> fill!(x, 1), rand(3,3)) 69 | @test testf((x, y) -> map(+, x, y), rand(2, 3), rand(2, 3)) 70 | @test testf((x) -> sin.(x), rand(2, 3)) 71 | @test testf((x) -> log.(x) .+ 1, rand(2, 3)) 72 | @test testf((x) -> 2x, rand(2, 3)) 73 | @test testf((x, y) -> x .+ y, rand(2, 3), rand(1, 3)) 74 | @test testf((z, x, y) -> z .= x .+ y, rand(2, 3), rand(2, 3), rand(2)) 75 | @test (CuArray{Ptr{Cvoid}}(undef, 1) .= C_NULL) == CuArray([C_NULL]) 76 | @test CuArray([1,2,3]) .+ CuArray([1.0,2.0,3.0]) == CuArray([2,4,6]) 77 | 78 | @eval struct Whatever{T} 79 | x::Int 80 | end 81 | @test Array(Whatever{Int}.(CuArray([1]))) == Whatever{Int}.([1]) 82 | end 83 | 84 | @testset "Cufunc" begin 85 | gelu(x) = oftype(x, 0.5) * x * (1 + tanh(oftype(x, √(2/π))*(x + oftype(x, 0.044715) * x^3))) 86 | sig(x) = one(x) / (one(x) + exp(-x)) 87 | f(x) = gelu(log(x)) * sig(x) * tanh(x) 88 | 89 | CuArrays.@cufunc gelu(x) = oftype(x, 0.5) * x * (1 + tanh(oftype(x, √(2/π))*(x + oftype(x, 0.044715) * x^3))) 90 | CuArrays.@cufunc sig(x) = one(x) / (one(x) + exp(-x)) 91 | CuArrays.@cufunc f(x) = gelu(log(x)) * sig(x) * tanh(x) 92 | 93 | @test :gelu ∈ CuArrays.cufuncs() 94 | @test :sig ∈ CuArrays.cufuncs() 95 | @test :f ∈ CuArrays.cufuncs() 96 | @test testf((x) -> gelu.(x), rand(3,3)) 97 | @test testf((x) -> sig.(x), rand(3,3)) 98 | @test testf((x) -> f.(x), rand(3,3)) 99 | end 100 | 101 | # https://github.com/JuliaGPU/CUDAnative.jl/issues/223 102 | @testset "Ref Broadcast" begin 103 | foobar(idx, A) = A[idx] 104 | @test CuArray([42]) == foobar.(CuArray([1]), Base.RefValue(CuArray([42]))) 105 | end 106 | 107 | @testset "Broadcast Fix" begin 108 | @test testf(x -> log.(x), rand(3,3)) 109 | @test testf((x,xs) -> log.(x.+xs), Ref(1), rand(3,3)) 110 | 111 | if isdefined(CuArrays, :CUDNN) 112 | using NNlib 113 | 114 | @test testf(x -> logσ.(x), rand(5)) 115 | 116 | f(x) = logσ.(x) 117 | ds = Dual.(rand(5),1) 118 | @test f(ds) ≈ collect(f(CuArray(ds))) 119 | end 120 | end 121 | 122 | @testset "Reduce" begin 123 | @test testf(x -> sum(x, dims=1), rand(2, 3)) 124 | @test testf(x -> sum(x, dims=2), rand(2, 3)) 125 | @test testf(x -> sum(x -> x^2, x, dims=1), rand(2, 3)) 126 | @test testf(x -> prod(x, dims=2), rand(2, 3)) 127 | 128 | @test testf(x -> sum(x), rand(2, 3)) 129 | @test testf(x -> prod(x), rand(2, 3)) 130 | end 131 | 132 | @testset "0D" begin 133 | x = CuArray{Float64}(undef) 134 | x .= 1 135 | @test collect(x)[] == 1 136 | x /= 2 137 | @test collect(x)[] == 0.5 138 | end 139 | 140 | @testset "Slices" begin 141 | @test testf(rand(5)) do x 142 | y = x[2:4] 143 | y .= 1 144 | x 145 | end 146 | @test testf(rand(5)) do x 147 | y = view(x, 2:4) 148 | y .= 1 149 | x 150 | end 151 | @test testf(x->view(x, :, 1:4, 3), rand(Float32, 5, 4, 3)) 152 | @allowscalar let x = cu(rand(Float32, 5, 4, 3)) 153 | @test_throws BoundsError view(x, :, :, 1:10) 154 | 155 | # Contiguous views should return new CuArray 156 | @test typeof(view(x, :, 1, 2)) == CuVector{Float32} 157 | @test typeof(view(x, 1:4, 1, 2)) == CuVector{Float32} 158 | @test typeof(view(x, :, 1:4, 3)) == CuMatrix{Float32} 159 | @test typeof(view(x, :, :, 1)) == CuMatrix{Float32} 160 | @test typeof(view(x, :, 
:, :)) == CuArray{Float32,3} 161 | @test typeof(view(x, :)) == CuVector{Float32} 162 | @test typeof(view(x, 1:3)) == CuVector{Float32} 163 | 164 | # Non-contiguous views should fall back to base's SubArray 165 | @test typeof(view(x, 1:3, 1:3, 3)) <: SubArray 166 | @test typeof(view(x, 1, :, 3)) <: SubArray 167 | @test typeof(view(x, 1, 1:4, 3)) <: SubArray 168 | @test typeof(view(x, :, 1, 1:3)) <: SubArray 169 | @test typeof(view(x, :, 1:2:4, 1)) <: SubArray 170 | @test typeof(view(x, 1:2:5, 1, 1)) <: SubArray 171 | end 172 | end 173 | 174 | @testset "Reshape" begin 175 | A = [1 2 3 4 176 | 5 6 7 8] 177 | gA = reshape(CuArray(A),1,8) 178 | _A = reshape(A,1,8) 179 | _gA = Array(gA) 180 | @test all(_A .== _gA) 181 | A = [1,2,3,4] 182 | gA = reshape(CuArray(A),4) 183 | end 184 | 185 | @testset "$f! with diagonal $d" for (f, f!) in ((triu, triu!), (tril, tril!)), 186 | d in -2:2 187 | A = randn(10, 10) 188 | @test f(A, d) == Array(f!(CuArray(A), d)) 189 | end 190 | 191 | @testset "Utilities" begin 192 | t = @elapsed ret = CuArrays.@sync begin 193 | # TODO: do something that takes a while on the GPU 194 | # (need to wrap clock64 in CUDAnative for that) 195 | 42 196 | end 197 | @test t >= 0 198 | @test ret == 42 199 | end 200 | 201 | @testset "accumulate" begin 202 | @test accumulate(+, CuArray{Int}(undef, 2)) isa CuVector 203 | @test cumsum(CuArray{Int}(undef, 2)) isa CuVector 204 | @test cumprod(CuArray{Int}(undef, 2)) isa CuVector 205 | 206 | @test testf(x->accumulate(+, x), rand(2)) 207 | @test testf(x->accumulate(+, x; dims=2), rand(2)) 208 | @test testf(x->(accumulate!(+, x, copy(x)); x), rand(2)) 209 | @test testf(cumsum, rand(2)) 210 | @test testf(cumprod, rand(2)) 211 | end 212 | 213 | @testset "logical indexing" begin 214 | @test CuArray{Int}(undef, 2)[CuArray{Bool}(undef, 2)] isa CuArray 215 | @test CuArray{Int}(undef, 2, 2)[CuArray{Bool}(undef, 2, 2)] isa CuArray 216 | @test CuArray{Int}(undef, 2, 2, 2)[CuArray{Bool}(undef, 2, 2, 2)] isa CuArray 217 | 218 | @test CuArray{Int}(undef, 2)[Array{Bool}(undef, 2)] isa CuArray 219 | @test CuArray{Int}(undef, 2, 2)[Array{Bool}(undef, 2, 2)] isa CuArray 220 | @test CuArray{Int}(undef, 2, 2, 2)[Array{Bool}(undef, 2, 2, 2)] isa CuArray 221 | 222 | @test testf((x,y)->x[y], rand(2), rand(Bool, 2)) 223 | @test testf((x,y)->x[y], rand(2, 2), rand(Bool, 2, 2)) 224 | @test testf((x,y)->x[y], rand(2, 2, 2), rand(Bool, 2, 2, 2)) 225 | 226 | @test testf(x -> x[x .> 0.5], rand(2)) 227 | @test testf(x -> x[x .> 0.5], rand(2,2)) 228 | @test testf(x -> x[x .> 0.5], rand(2,2,2)) 229 | 230 | @test testf(x -> filter(y->y .> 0.5, x), rand(2)) 231 | @test testf(x -> filter(y->y .> 0.5, x), rand(2,2)) 232 | @test testf(x -> filter(y->y .> 0.5, x), rand(2,2,2)) 233 | end 234 | 235 | @testset "generic fallbacks" begin 236 | a = rand(Int8, 3, 3) 237 | b = rand(Int8, 3, 3) 238 | d_a = CuArray{Int8}(a) 239 | d_b = CuArray{Int8}(b) 240 | d_c = d_a*d_b 241 | @test collect(d_c) == a*b 242 | a = rand(Complex{Int8}, 3, 3) 243 | b = rand(Complex{Int8}, 3, 3) 244 | d_a = CuArray{Complex{Int8}}(a) 245 | d_b = CuArray{Complex{Int8}}(b) 246 | d_c = d_a'*d_b 247 | @test collect(d_c) == a'*b 248 | d_c = d_a*d_b' 249 | @test collect(d_c) == a*b' 250 | d_c = d_a'*d_b' 251 | @test collect(d_c) == a'*b' 252 | d_c = transpose(d_a)*d_b' 253 | @test collect(d_c) == transpose(a)*b' 254 | d_c = d_a'*transpose(d_b) 255 | @test collect(d_c) == a'*transpose(b) 256 | d_c = transpose(d_a)*d_b 257 | @test collect(d_c) == transpose(a)*b 258 | d_c = d_a*transpose(d_b) 259 | @test collect(d_c) == 
a*transpose(b) 260 | d_c = transpose(d_a)*transpose(d_b) 261 | @test collect(d_c) == transpose(a)*transpose(b) 262 | d_c = rmul!(copy(d_a), Complex{Int8}(2, 2)) 263 | @test collect(d_c) == a*Complex{Int8}(2, 2) 264 | d_c = lmul!(Complex{Int8}(2, 2), copy(d_a)) 265 | @test collect(d_c) == Complex{Int8}(2, 2)*a 266 | end 267 | 268 | @testset "reverse" begin 269 | @test testf(x->reverse(x), rand(1000)) 270 | @test testf(x->reverse(x, 10), rand(1000)) 271 | @test testf(x->reverse(x, 10, 90), rand(1000)) 272 | 273 | @test testf(x->reverse!(x), rand(1000)) 274 | @test testf(x->reverse!(x, 10), rand(1000)) 275 | @test testf(x->reverse!(x, 10, 90), rand(1000)) 276 | end 277 | -------------------------------------------------------------------------------- /src/blas/highlevel.jl: -------------------------------------------------------------------------------- 1 | # LinearAlgebra-style wrappers of the CUBLAS functionality 2 | 3 | 4 | cublas_size(t::Char, M::CuVecOrMat) = (size(M, t=='N' ? 1 : 2), size(M, t=='N' ? 2 : 1)) 5 | 6 | CublasArray{T<:CublasFloat} = CuArray{T} 7 | 8 | 9 | # 10 | # BLAS 1 11 | # 12 | 13 | LinearAlgebra.rmul!(x::CuArray{<:CublasFloat}, k::Number) = 14 | scal!(length(x), convert(eltype(x), k), x, 1) 15 | 16 | # Work around ambiguity with GPUArrays wrapper 17 | LinearAlgebra.rmul!(x::CuArray{<:CublasFloat}, k::Real) = 18 | invoke(rmul!, Tuple{typeof(x), Number}, x, k) 19 | 20 | function LinearAlgebra.BLAS.dot(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{Float32,Float64} 21 | n = length(DX) 22 | n==length(DY) || throw(DimensionMismatch("dot product arguments have lengths $(length(DX)) and $(length(DY))")) 23 | dot(n, DX, 1, DY, 1) 24 | end 25 | 26 | function LinearAlgebra.BLAS.dotc(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64} 27 | n = length(DX) 28 | n==length(DY) || throw(DimensionMismatch("dot product arguments have lengths $(length(DX)) and $(length(DY))")) 29 | dotc(n, DX, 1, DY, 1) 30 | end 31 | 32 | function LinearAlgebra.BLAS.dot(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64} 33 | dotc(DX, DY) 34 | end 35 | 36 | function LinearAlgebra.BLAS.dotu(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64} 37 | n = length(DX) 38 | n==length(DY) || throw(DimensionMismatch("dot product arguments have lengths $(length(DX)) and $(length(DY))")) 39 | dotu(n, DX, 1, DY, 1) 40 | end 41 | 42 | LinearAlgebra.norm(x::CublasArray) = nrm2(x) 43 | LinearAlgebra.BLAS.asum(x::CublasArray) = asum(length(x), x, 1) 44 | 45 | function LinearAlgebra.axpy!(alpha::Number, x::CuArray{T}, y::CuArray{T}) where T<:CublasFloat 46 | length(x)==length(y) || throw(DimensionMismatch("")) 47 | axpy!(length(x), convert(T,alpha), x, 1, y, 1) 48 | end 49 | 50 | Base.argmin(xs::CublasArray{<:CublasReal}) = iamin(xs) 51 | Base.argmax(xs::CublasArray{<:CublasReal}) = iamax(xs) 52 | 53 | 54 | 55 | # 56 | # BLAS 2 57 | # 58 | 59 | # GEMV 60 | 61 | function gemv_wrapper!(y::CuVector{T}, tA::Char, A::CuMatrix{T}, x::CuVector{T}, 62 | alpha = one(T), beta = zero(T)) where T<:CublasFloat 63 | mA, nA = cublas_size(tA, A) 64 | if nA != length(x) 65 | throw(DimensionMismatch("second dimension of A, $nA, does not match length of x, $(length(x))")) 66 | end 67 | if mA != length(y) 68 | throw(DimensionMismatch("first dimension of A, $mA, does not match length of y, $(length(y))")) 69 | end 70 | if mA == 0 71 | return y 72 | end 73 | if nA == 0 74 | return rmul!(y, 0) 75 | end 76 | gemv!(tA, alpha, A, x, beta, y) 77 | end 78 | 79 | LinearAlgebra.mul!(Y::CuVector{T}, 
A::CuMatrix{T}, B::CuVector{T}) where T<:CublasFloat = gemv_wrapper!(Y, 'N', A, B)
80 | LinearAlgebra.mul!(Y::CuVector{T}, A::LinearAlgebra.Transpose{<:Any, CuMatrix{T}}, B::CuVector{T}) where T<:CublasFloat = gemv_wrapper!(Y, 'T', A.parent, B)
81 | LinearAlgebra.mul!(Y::CuVector{T}, A::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, B::CuVector{T}) where T<:CublasFloat = gemv_wrapper!(Y, 'T', A.parent, B)
82 | LinearAlgebra.mul!(Y::CuVector{T}, A::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, B::CuVector{T}) where T<:CublasComplex = gemv_wrapper!(Y, 'C', A.parent, B)
83 | 
84 | 
85 | 
86 | #
87 | # BLAS 3
88 | #
89 | 
90 | # GEMM
91 | 
92 | function gemm_wrapper!(C::CuVecOrMat{T}, tA::Char, tB::Char,
93 |                        A::CuVecOrMat{T},
94 |                        B::CuVecOrMat{T},
95 |                        alpha = one(T),
96 |                        beta = zero(T)) where T <: CublasFloat
97 |     mA, nA = cublas_size(tA, A)
98 |     mB, nB = cublas_size(tB, B)
99 | 
100 |     if nA != mB
101 |         throw(DimensionMismatch("A has dimensions ($mA,$nA) but B has dimensions ($mB,$nB)"))
102 |     end
103 | 
104 |     if C === A || B === C
105 |         throw(ArgumentError("output matrix must not be aliased with input matrix"))
106 |     end
107 | 
108 |     if mA == 0 || nA == 0 || nB == 0
109 |         if size(C) != (mA, nB)
110 |             throw(DimensionMismatch("C has dimensions $(size(C)), should have ($mA,$nB)"))
111 |         end
112 |         return LinearAlgebra.rmul!(C, 0)
113 |     end
114 | 
115 |     gemm!(tA, tB, alpha, A, B, beta, C)
116 | end
117 | 
118 | # Mutating
119 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuVecOrMat{T}, B::CuVecOrMat{T}) where T<:CublasFloat = gemm_wrapper!(C, 'N', 'N', A, B)
120 | LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat =
121 |     gemm_wrapper!(C, 'T', 'N', parent(trA), B)
122 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuMatrix{T}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T<:CublasFloat =
123 |     gemm_wrapper!(C, 'N', 'T', A, parent(trB))
124 | LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T<:CublasFloat =
125 |     gemm_wrapper!(C, 'T', 'T', parent(trA), parent(trB))
126 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasReal =
127 |     gemm_wrapper!(C, 'T', 'N', parent(adjA), B)
128 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat =
129 |     gemm_wrapper!(C, 'C', 'N', parent(adjA), B)
130 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuMatrix{T}, adjB::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}) where T<:CublasReal =
131 |     gemm_wrapper!(C, 'N', 'T', A, parent(adjB))
132 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuMatrix{T}, adjB::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}) where T<:CublasFloat =
133 |     gemm_wrapper!(C, 'N', 'C', A, parent(adjB))
134 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}) where T<:CublasReal =
135 |     gemm_wrapper!(C, 'T', 'T', parent(adjA), parent(adjB))
136 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}) where T<:CublasFloat =
137 |     gemm_wrapper!(C, 'C', 'C', parent(adjA), parent(adjB))
138 | LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{T, <:CuMatrix{T}}) where T<:CublasReal =
139 |     gemm_wrapper!(C, 'T', 'T', parent(trA), parent(adjB))
140 | 
LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}) where T<:CublasFloat = 141 | gemm_wrapper!(C, 'T', 'C', parent(trA), parent(adjB)) 142 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{T, <:CuMatrix{T}}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T<:CublasReal = 143 | gemm_wrapper!(C, 'T', 'T', parent(adjA), parent(trB)) 144 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T <: CublasFloat = 145 | gemm_wrapper!(C, 'C', 'T', parent(adjA), parent(trB)) 146 | 147 | 148 | # TRSM 149 | 150 | # ldiv! 151 | ## No transpose/adjoint 152 | LinearAlgebra.ldiv!(A::UpperTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 153 | CUBLAS.trsm!('L', 'U', 'N', 'N', one(T), parent(A), B) 154 | LinearAlgebra.ldiv!(A::UnitUpperTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 155 | CUBLAS.trsm!('L', 'U', 'N', 'U', one(T), parent(A), B) 156 | LinearAlgebra.ldiv!(A::LowerTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 157 | CUBLAS.trsm!('L', 'L', 'N', 'N', one(T), parent(A), B) 158 | LinearAlgebra.ldiv!(A::UnitLowerTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 159 | CUBLAS.trsm!('L', 'L', 'N', 'U', one(T), parent(A), B) 160 | ## Adjoint 161 | LinearAlgebra.ldiv!(A::Adjoint{T,UpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 162 | CUBLAS.trsm!('L', 'U', 'C', 'N', one(T), parent(parent(A)), B) 163 | LinearAlgebra.ldiv!(A::Adjoint{T,UnitUpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 164 | CUBLAS.trsm!('L', 'U', 'C', 'U', one(T), parent(parent(A)), B) 165 | LinearAlgebra.ldiv!(A::Adjoint{T,LowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 166 | CUBLAS.trsm!('L', 'L', 'C', 'N', one(T), parent(parent(A)), B) 167 | LinearAlgebra.ldiv!(A::Adjoint{T,UnitLowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 168 | CUBLAS.trsm!('L', 'L', 'C', 'U', one(T), parent(parent(A)), B) 169 | ## Transpose 170 | LinearAlgebra.ldiv!(A::Transpose{T,UpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 171 | CUBLAS.trsm!('L', 'U', 'T', 'N', one(T), parent(parent(A)), B) 172 | LinearAlgebra.ldiv!(A::Transpose{T,UnitUpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 173 | CUBLAS.trsm!('L', 'U', 'T', 'U', one(T), parent(parent(A)), B) 174 | LinearAlgebra.ldiv!(A::Transpose{T,LowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 175 | CUBLAS.trsm!('L', 'L', 'T', 'N', one(T), parent(parent(A)), B) 176 | LinearAlgebra.ldiv!(A::Transpose{T,UnitLowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 177 | CUBLAS.trsm!('L', 'L', 'T', 'U', one(T), parent(parent(A)), B) 178 | 179 | # rdiv! 
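# (rdiv! mirrors ldiv! with side 'R': the triangular factor divides from the
# right, and A is the operand that trsm! overwrites)
#
# A minimal usage sketch, assuming LinearAlgebra is loaded; names are illustrative:
#
#   A = CuArray(rand(Float32, 4, 4)); B = CuArray(rand(Float32, 4, 4))
#   rdiv!(A, UpperTriangular(B))   # A ← A / UpperTriangular(B), via CUBLAS.trsm!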
180 | ## No transpose/adjoint 181 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::UpperTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 182 | CUBLAS.trsm!('R', 'U', 'N', 'N', one(T), parent(B), A) 183 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::UnitUpperTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 184 | CUBLAS.trsm!('R', 'U', 'N', 'U', one(T), parent(B), A) 185 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::LowerTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 186 | CUBLAS.trsm!('R', 'L', 'N', 'N', one(T), parent(B), A) 187 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::UnitLowerTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 188 | CUBLAS.trsm!('R', 'L', 'N', 'U', one(T), parent(B), A) 189 | ## Adjoint 190 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,UpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 191 | CUBLAS.trsm!('R', 'U', 'C', 'N', one(T), parent(parent(B)), A) 192 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,UnitUpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 193 | CUBLAS.trsm!('R', 'U', 'C', 'U', one(T), parent(parent(B)), A) 194 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,LowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 195 | CUBLAS.trsm!('R', 'L', 'C', 'N', one(T), parent(parent(B)), A) 196 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,UnitLowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 197 | CUBLAS.trsm!('R', 'L', 'C', 'U', one(T), parent(parent(B)), A) 198 | ## Transpose 199 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,UpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 200 | CUBLAS.trsm!('R', 'U', 'T', 'N', one(T), parent(parent(B)), A) 201 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,UnitUpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 202 | CUBLAS.trsm!('R', 'U', 'T', 'U', one(T), parent(parent(B)), A) 203 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,LowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 204 | CUBLAS.trsm!('R', 'L', 'T', 'N', one(T), parent(parent(B)), A) 205 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,UnitLowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 206 | CUBLAS.trsm!('R', 'L', 'T', 'U', one(T), parent(parent(B)), A) 207 | -------------------------------------------------------------------------------- /src/sparse/array.jl: -------------------------------------------------------------------------------- 1 | # custom extension of CuArray in CUDArt for sparse vectors/matrices 2 | # using CSC format for interop with Julia's native sparse functionality 3 | 4 | import Base: length, size, ndims, eltype, similar, pointer, stride, 5 | copy, convert, reinterpret, show, summary, copyto!, get!, fill!, collect 6 | import LinearAlgebra: BlasFloat, Hermitian, HermOrSym, issymmetric, Transpose, Adjoint, 7 | ishermitian, istriu, istril, Symmetric, UpperTriangular, LowerTriangular 8 | import SparseArrays: sparse, SparseMatrixCSC 9 | 10 | abstract type AbstractCuSparseArray{Tv, N} <: AbstractSparseArray{Tv, Cint, N} end 11 | const AbstractCuSparseVector{Tv} = AbstractCuSparseArray{Tv,1} 12 | const AbstractCuSparseMatrix{Tv} = AbstractCuSparseArray{Tv,2} 13 | 14 | mutable struct CuSparseVector{Tv} <: AbstractCuSparseVector{Tv} 15 | iPtr::CuVector{Cint} 16 | nzVal::CuVector{Tv} 17 | dims::NTuple{2,Int} 18 | nnz::Cint 19 | 20 | function CuSparseVector{Tv}(iPtr::CuVector{Cint}, nzVal::CuVector{Tv}, dims::Int, nnz::Cint) where Tv 21 | new(iPtr,nzVal,(dims,1),nnz) 22 | end 23 | end 24 | 25 | function CuArrays.unsafe_free!(xs::CuSparseVector) 26 | unsafe_free!(xs.iPtr) 27 | 
unsafe_free!(xs.nzVal)
28 |     return
29 | end
30 | 
31 | mutable struct CuSparseMatrixCSC{Tv} <: AbstractCuSparseMatrix{Tv}
32 |     colPtr::CuVector{Cint}
33 |     rowVal::CuVector{Cint}
34 |     nzVal::CuVector{Tv}
35 |     dims::NTuple{2,Int}
36 |     nnz::Cint
37 | 
38 |     function CuSparseMatrixCSC{Tv}(colPtr::CuVector{Cint}, rowVal::CuVector{Cint}, nzVal::CuVector{Tv}, dims::NTuple{2,Int}, nnz::Cint) where Tv
39 |         new(colPtr,rowVal,nzVal,dims,nnz)
40 |     end
41 | end
42 | 
43 | function CuArrays.unsafe_free!(xs::CuSparseMatrixCSC)
44 |     unsafe_free!(xs.colPtr)
45 |     unsafe_free!(xs.rowVal)
46 |     unsafe_free!(xs.nzVal)
47 |     return
48 | end
49 | 
50 | """
51 | Container to hold sparse matrices in compressed sparse row (CSR) format on the
52 | GPU.
53 | 
54 | **Note**: Most CUSPARSE operations work with CSR formatted matrices, rather
55 | than CSC.
56 | """
57 | mutable struct CuSparseMatrixCSR{Tv} <: AbstractCuSparseMatrix{Tv}
58 |     rowPtr::CuVector{Cint}
59 |     colVal::CuVector{Cint}
60 |     nzVal::CuVector{Tv}
61 |     dims::NTuple{2,Int}
62 |     nnz::Cint
63 | 
64 |     function CuSparseMatrixCSR{Tv}(rowPtr::CuVector{Cint}, colVal::CuVector{Cint}, nzVal::CuVector{Tv}, dims::NTuple{2,Int}, nnz::Cint) where Tv
65 |         new(rowPtr,colVal,nzVal,dims,nnz)
66 |     end
67 | end
68 | 
69 | function CuArrays.unsafe_free!(xs::CuSparseMatrixCSR)
70 |     unsafe_free!(xs.rowPtr)
71 |     unsafe_free!(xs.colVal)
72 |     unsafe_free!(xs.nzVal)
73 |     return
74 | end
75 | 
76 | """
77 | Container to hold sparse matrices in block compressed sparse row (BSR) format on
78 | the GPU. BSR format is also used in Intel MKL, and is suited to matrices that are
79 | "block" sparse: mostly sparse overall, with the nonzeros gathered into a few dense blocks.
80 | """
81 | mutable struct CuSparseMatrixBSR{Tv} <: AbstractCuSparseMatrix{Tv}
82 |     rowPtr::CuVector{Cint}
83 |     colVal::CuVector{Cint}
84 |     nzVal::CuVector{Tv}
85 |     dims::NTuple{2,Int}
86 |     blockDim::Cint
87 |     dir::SparseChar
88 |     nnz::Cint
89 | 
90 |     function CuSparseMatrixBSR{Tv}(rowPtr::CuVector{Cint}, colVal::CuVector{Cint}, nzVal::CuVector{Tv}, dims::NTuple{2,Int}, blockDim::Cint, dir::SparseChar, nnz::Cint) where Tv
91 |         new(rowPtr,colVal,nzVal,dims,blockDim,dir,nnz)
92 |     end
93 | end
94 | 
95 | function CuArrays.unsafe_free!(xs::CuSparseMatrixBSR)
96 |     unsafe_free!(xs.rowPtr)
97 |     unsafe_free!(xs.colVal)
98 |     unsafe_free!(xs.nzVal)
99 |     return
100 | end
101 | 
102 | """
103 | Container to hold sparse matrices in NVIDIA's hybrid (HYB) format on the GPU.
104 | HYB format is an opaque struct, which can be converted to/from using
105 | CUSPARSE routines.
106 | """
107 | mutable struct CuSparseMatrixHYB{Tv} <: AbstractCuSparseMatrix{Tv}
108 |     Mat::cusparseHybMat_t
109 |     dims::NTuple{2,Int}
110 |     nnz::Cint
111 | 
112 |     function CuSparseMatrixHYB{Tv}(Mat::cusparseHybMat_t, dims::NTuple{2,Int}, nnz::Cint) where Tv
113 |         new(Mat,dims,nnz)
114 |     end
115 | end
116 | 
117 | """
118 | Utility union type of [`CuSparseMatrixCSC`](@ref), [`CuSparseMatrixCSR`](@ref),
119 | and `Hermitian` and `Symmetric` versions of these two containers. A function accepting
120 | this type can make use of performance improvements by only indexing one triangle of the
121 | matrix if it is guaranteed to be hermitian/symmetric.
122 | """
123 | const CompressedSparse{T} = Union{CuSparseMatrixCSC{T},CuSparseMatrixCSR{T},HermOrSym{T,CuSparseMatrixCSC{T}},HermOrSym{T,CuSparseMatrixCSR{T}}}
124 | 
125 | """
126 | Utility union type of [`CuSparseMatrixCSC`](@ref), [`CuSparseMatrixCSR`](@ref),
127 | [`CuSparseMatrixBSR`](@ref), and [`CuSparseMatrixHYB`](@ref).
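
A minimal usage sketch (assuming `SparseArrays` is loaded; the names are
illustrative):

    using SparseArrays
    A = sprand(Float64, 10, 10, 0.1)    # host CSC matrix
    d_csc = CuSparseMatrixCSC(A)        # upload in CSC form
    d_csr = CuSparseMatrixCSR(A)        # or in CSR form (converted via switch2csr)
    d_csc isa CuSparseMatrix{Float64}   # true for any member of this union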
128 | """ 129 | const CuSparseMatrix{T} = Union{CuSparseMatrixCSC{T},CuSparseMatrixCSR{T}, CuSparseMatrixBSR{T}, CuSparseMatrixHYB{T}} 130 | 131 | Hermitian{T}(Mat::CuSparseMatrix{T}) where T = Hermitian{T,typeof(Mat)}(Mat,'U') 132 | 133 | length(g::CuSparseVector) = prod(g.dims) 134 | size(g::CuSparseVector) = g.dims 135 | ndims(g::CuSparseVector) = 1 136 | length(g::CuSparseMatrix) = prod(g.dims) 137 | size(g::CuSparseMatrix) = g.dims 138 | ndims(g::CuSparseMatrix) = 2 139 | 140 | function size(g::CuSparseVector, d::Integer) 141 | if d == 1 142 | return g.dims[d] 143 | elseif d > 1 144 | return 1 145 | else 146 | throw(ArgumentError("dimension must be ≥ 1, got $d")) 147 | end 148 | end 149 | 150 | function size(g::CuSparseMatrix, d::Integer) 151 | if d in [1, 2] 152 | return g.dims[d] 153 | elseif d > 1 154 | return 1 155 | else 156 | throw(ArgumentError("dimension must be ≥ 1, got $d")) 157 | end 158 | end 159 | 160 | issymmetric(M::Union{CuSparseMatrixCSC,CuSparseMatrixCSR})= false 161 | ishermitian(M::Union{CuSparseMatrixCSC,CuSparseMatrixCSR}) where T = false 162 | issymmetric(M::Symmetric{CuSparseMatrixCSC})= true 163 | ishermitian(M::Hermitian{CuSparseMatrixCSC}) = true 164 | 165 | istriu(M::UpperTriangular{T,S}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix} = true 166 | istril(M::UpperTriangular{T,S}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix} = false 167 | istriu(M::LowerTriangular{T,S}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix} = false 168 | istril(M::LowerTriangular{T,S}) where {T<:BlasFloat, S<:AbstractCuSparseMatrix} = true 169 | eltype(g::CuSparseMatrix{T}) where T = T 170 | 171 | function collect(Vec::CuSparseVector) 172 | SparseVector(Vec.dims[1], collect(Vec.iPtr), collect(Vec.nzVal)) 173 | end 174 | 175 | function collect(Mat::CuSparseMatrixCSC) 176 | SparseMatrixCSC(Mat.dims[1], Mat.dims[2], collect(Mat.colPtr), collect(Mat.rowVal), collect(Mat.nzVal)) 177 | end 178 | function collect(Mat::CuSparseMatrixCSR) 179 | rowPtr = collect(Mat.rowPtr) 180 | colVal = collect(Mat.colVal) 181 | nzVal = collect(Mat.nzVal) 182 | #construct Is 183 | I = similar(colVal) 184 | counter = 1 185 | for row = 1 : size(Mat)[1], k = rowPtr[row] : (rowPtr[row+1]-1) 186 | I[counter] = row 187 | counter += 1 188 | end 189 | return sparse(I,colVal,nzVal,Mat.dims[1],Mat.dims[2]) 190 | end 191 | 192 | summary(g::CuSparseMatrix) = string(g) 193 | summary(g::CuSparseVector) = string(g) 194 | 195 | CuSparseVector(iPtr::Vector{Ti}, nzVal::Vector{T}, dims::Int) where {T<:BlasFloat, Ti<:Integer} = CuSparseVector{T}(CuArray(convert(Vector{Cint},iPtr)), CuArray(nzVal), dims, convert(Cint,length(nzVal))) 196 | CuSparseVector(iPtr::CuArray{Ti}, nzVal::CuArray{T}, dims::Int) where {T<:BlasFloat, Ti<:Integer} = CuSparseVector{T}(iPtr, nzVal, dims, convert(Cint,length(nzVal))) 197 | 198 | CuSparseMatrixCSC(colPtr::Vector{Ti}, rowVal::Vector{Ti}, nzVal::Vector{T}, dims::NTuple{2,Int}) where {T<:BlasFloat,Ti<:Integer} = CuSparseMatrixCSC{T}(CuArray(convert(Vector{Cint},colPtr)), CuArray(convert(Vector{Cint},rowVal)), CuArray(nzVal), dims, convert(Cint,length(nzVal))) 199 | CuSparseMatrixCSC(colPtr::CuArray{Ti}, rowVal::CuArray{Ti}, nzVal::CuArray{T}, dims::NTuple{2,Int}) where {T<:BlasFloat,Ti<:Integer} = CuSparseMatrixCSC{T}(colPtr, rowVal, nzVal, dims, convert(Cint,length(nzVal))) 200 | CuSparseMatrixCSC(colPtr::CuArray{Ti}, rowVal::CuArray{Ti}, nzVal::CuArray{T}, nnz, dims::NTuple{2,Int}) where {T<:BlasFloat,Ti<:Integer} = CuSparseMatrixCSC{T}(colPtr, rowVal, nzVal, dims, nnz) 201 | 202 | 
202 | CuSparseMatrixCSR(rowPtr::CuArray, colVal::CuArray, nzVal::CuArray{T}, dims::NTuple{2,Int}) where T = CuSparseMatrixCSR{T}(rowPtr, colVal, nzVal, dims, convert(Cint,length(nzVal)))
203 | CuSparseMatrixCSR(rowPtr::CuArray, colVal::CuArray, nzVal::CuArray{T}, nnz, dims::NTuple{2,Int}) where T = CuSparseMatrixCSR{T}(rowPtr, colVal, nzVal, dims, nnz)
204 | 
205 | CuSparseMatrixBSR(rowPtr::CuArray, colVal::CuArray, nzVal::CuArray{T}, blockDim, dir, nnz, dims::NTuple{2,Int}) where T = CuSparseMatrixBSR{T}(rowPtr, colVal, nzVal, dims, blockDim, dir, nnz)
206 | 
207 | CuSparseVector(Vec::SparseVector) = CuSparseVector(Vec.nzind, Vec.nzval, size(Vec)[1])
208 | CuSparseMatrixCSC(Vec::SparseVector) = CuSparseMatrixCSC([1, length(Vec.nzind)+1], Vec.nzind, Vec.nzval, (size(Vec)[1], 1))
209 | CuSparseVector(Mat::SparseMatrixCSC) = size(Mat,2) == 1 ? CuSparseVector(Mat.rowval, Mat.nzval, size(Mat)[1]) : throw(ArgumentError("matrix must have a single column to be converted to a sparse vector"))
210 | CuSparseMatrixCSC(Mat::SparseMatrixCSC) = CuSparseMatrixCSC(Mat.colptr, Mat.rowval, Mat.nzval, size(Mat))
211 | CuSparseMatrixCSR(Mat::SparseMatrixCSC) = switch2csr(CuSparseMatrixCSC(Mat))
212 | 
213 | similar(Vec::CuSparseVector) = CuSparseVector(copy(Vec.iPtr), similar(Vec.nzVal), Vec.dims[1])
214 | similar(Mat::CuSparseMatrixCSC) = CuSparseMatrixCSC(copy(Mat.colPtr), copy(Mat.rowVal), similar(Mat.nzVal), Mat.nnz, Mat.dims)
215 | similar(Mat::CuSparseMatrixCSR) = CuSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(Mat.nzVal), Mat.nnz, Mat.dims)
216 | similar(Mat::CuSparseMatrixBSR) = CuSparseMatrixBSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(Mat.nzVal), Mat.blockDim, Mat.dir, Mat.nnz, Mat.dims)
217 | 
218 | function copyto!(dst::CuSparseVector, src::CuSparseVector)
219 |     if dst.dims != src.dims
220 |         throw(ArgumentError("Inconsistent Sparse Vector size"))
221 |     end
222 |     copyto!(dst.iPtr, src.iPtr)
223 |     copyto!(dst.nzVal, src.nzVal)
224 |     dst.nnz = src.nnz
225 |     dst
226 | end
227 | 
228 | function copyto!(dst::CuSparseMatrixCSC, src::CuSparseMatrixCSC)
229 |     if dst.dims != src.dims
230 |         throw(ArgumentError("Inconsistent Sparse Matrix size"))
231 |     end
232 |     copyto!(dst.colPtr, src.colPtr)
233 |     copyto!(dst.rowVal, src.rowVal)
234 |     copyto!(dst.nzVal, src.nzVal)
235 |     dst.nnz = src.nnz
236 |     dst
237 | end
238 | 
239 | function copyto!(dst::CuSparseMatrixCSR, src::CuSparseMatrixCSR)
240 |     if dst.dims != src.dims
241 |         throw(ArgumentError("Inconsistent Sparse Matrix size"))
242 |     end
243 |     copyto!(dst.rowPtr, src.rowPtr)
244 |     copyto!(dst.colVal, src.colVal)
245 |     copyto!(dst.nzVal, src.nzVal)
246 |     dst.nnz = src.nnz
247 |     dst
248 | end
249 | 
250 | function copyto!(dst::CuSparseMatrixBSR, src::CuSparseMatrixBSR)
251 |     if dst.dims != src.dims
252 |         throw(ArgumentError("Inconsistent Sparse Matrix size"))
253 |     end
254 |     copyto!(dst.rowPtr, src.rowPtr)
255 |     copyto!(dst.colVal, src.colVal)
256 |     copyto!(dst.nzVal, src.nzVal)
257 |     dst.dir = src.dir
258 |     dst.nnz = src.nnz
259 |     dst
260 | end
261 | 
262 | function copyto!(dst::CuSparseMatrixHYB, src::CuSparseMatrixHYB)
263 |     if dst.dims != src.dims
264 |         throw(ArgumentError("Inconsistent Sparse Matrix size"))
265 |     end
266 |     dst.Mat = src.Mat
267 |     dst.nnz = src.nnz
268 |     dst
269 | end
270 | 
271 | copy(Vec::CuSparseVector) = copyto!(similar(Vec),Vec)
272 | copy(Mat::CuSparseMatrixCSC) = copyto!(similar(Mat),Mat)
273 | copy(Mat::CuSparseMatrixCSR) = copyto!(similar(Mat),Mat)
274 | copy(Mat::CuSparseMatrixBSR) = copyto!(similar(Mat),Mat)
275 | 
--------------------------------------------------------------------------------
/src/array.jl:
--------------------------------------------------------------------------------
1 | import CUDAnative: DevicePtr
2 | 
3 | mutable struct CuArray{T,N} <: GPUArray{T,N}
4 |     buf::Mem.Buffer
5 |     own::Bool
6 | 
7 |     dims::Dims{N}
8 |     offset::Int
9 | 
10 |     function CuArray{T,N}(buf::Mem.Buffer, dims::Dims{N}; offset::Integer=0, own::Bool=true) where {T,N}
11 |         xs = new{T,N}(buf, own, dims, offset)
12 |         if own
13 |             Mem.retain(buf)
14 |             finalizer(unsafe_free!, xs)
15 |         end
16 |         return xs
17 |     end
18 | end
19 | 
20 | CuVector{T} = CuArray{T,1}
21 | CuMatrix{T} = CuArray{T,2}
22 | CuVecOrMat{T} = Union{CuVector{T},CuMatrix{T}}
23 | 
24 | const INVALID = Mem.alloc(Mem.Device, 0)
25 | 
26 | function unsafe_free!(xs::CuArray{<:Any,N}) where {N}
27 |     xs.buf === INVALID && return
28 |     Mem.release(xs.buf) && dealloc(xs.buf, prod(xs.dims)*sizeof(eltype(xs)))
29 |     xs.dims = Tuple(0 for _ in 1:N)
30 |     xs.buf = INVALID
31 |     return
32 | end
33 | 
34 | 
35 | ## construction
36 | 
37 | # type and dimensionality specified, accepting dims as tuples of Ints
38 | CuArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N} =
39 |     CuArray{T,N}(alloc(prod(dims)*sizeof(T)), dims)
40 | 
41 | # type and dimensionality specified, accepting dims as series of Ints
42 | CuArray{T,N}(::UndefInitializer, dims::Integer...) where {T,N} = CuArray{T,N}(undef, dims)
43 | 
44 | # type but not dimensionality specified
45 | CuArray{T}(::UndefInitializer, dims::Dims{N}) where {T,N} = CuArray{T,N}(undef, dims)
46 | CuArray{T}(::UndefInitializer, dims::Integer...) where {T} =
47 |     CuArray{T}(undef, convert(Tuple{Vararg{Int}}, dims))
48 | 
49 | # empty vector constructor
50 | CuArray{T,1}() where {T} = CuArray{T,1}(undef, 0)
51 | 
52 | # do-block constructors
53 | for (ctor, tvars) in (:CuArray => (), :(CuArray{T}) => (:T,), :(CuArray{T,N}) => (:T, :N))
54 |     @eval begin
55 |         function $ctor(f::Function, args...) where {$(tvars...)}
56 |             xs = $ctor(args...)
57 |             try
58 |                 f(xs)
59 |             finally
60 |                 unsafe_free!(xs)
61 |             end
62 |         end
63 |     end
64 | end
65 | 
66 | 
67 | Base.similar(a::CuArray{T,N}) where {T,N} = CuArray{T,N}(undef, size(a))
68 | Base.similar(a::CuArray{T}, dims::Base.Dims{N}) where {T,N} = CuArray{T,N}(undef, dims)
69 | Base.similar(a::CuArray, ::Type{T}, dims::Base.Dims{N}) where {T,N} = CuArray{T,N}(undef, dims)
70 | 
71 | 
72 | """
73 |     unsafe_wrap(::CuArray, ptr::CuPtr{T}, dims; own=false, ctx=CuCurrentContext())
74 | 
75 | Wrap a `CuArray` object around the data at the address given by `ptr`. The pointer
76 | element type `T` determines the array element type. `dims` is either an integer (for a 1d
77 | array) or a tuple of the array dimensions. `own` optionally specifies whether Julia should
78 | take ownership of the memory, calling `free` when the array is no longer referenced. The
79 | `ctx` argument determines the CUDA context in which the data is allocated.
80 | """
81 | function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,N}}},
82 |                           p::CuPtr{T}, dims::NTuple{N,Int};
83 |                           own::Bool=false, ctx::CuContext=CuCurrentContext()) where {T,N}
84 |     buf = Mem.DeviceBuffer(convert(CuPtr{Cvoid}, p), prod(dims) * sizeof(T), ctx)
85 |     return CuArray{T, length(dims)}(buf, dims; own=own)
86 | end
87 | function Base.unsafe_wrap(Atype::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,1}}},
88 |                           p::CuPtr{T}, dim::Integer;
89 |                           own::Bool=false, ctx::CuContext=CuCurrentContext()) where {T}
90 |     unsafe_wrap(Atype, p, (dim,); own=own, ctx=ctx)
91 | end
92 | Base.unsafe_wrap(T::Type{<:CuArray}, ::Ptr, dims::NTuple{N,Int}; kwargs...)
where {N} = 93 | throw(ArgumentError("cannot wrap a CPU pointer with a $T")) 94 | 95 | 96 | ## array interface 97 | 98 | Base.elsize(::Type{<:CuArray{T}}) where {T} = sizeof(T) 99 | 100 | Base.size(x::CuArray) = x.dims 101 | Base.sizeof(x::CuArray) = Base.elsize(x) * length(x) 102 | 103 | 104 | ## interop with other arrays 105 | 106 | CuArray{T,N}(xs::AbstractArray{T,N}) where {T,N} = 107 | isbits(xs) ? 108 | (CuArray{T,N}(undef, size(xs)) .= xs) : 109 | copyto!(CuArray{T,N}(undef, size(xs)), collect(xs)) 110 | 111 | CuArray{T,N}(xs::AbstractArray{S,N}) where {T,N,S} = CuArray{T,N}((x -> T(x)).(xs)) 112 | 113 | # underspecified constructors 114 | CuArray{T}(xs::AbstractArray{S,N}) where {T,N,S} = CuArray{T,N}(xs) 115 | (::Type{CuArray{T,N} where T})(x::AbstractArray{S,N}) where {S,N} = CuArray{S,N}(x) 116 | CuArray(A::AbstractArray{T,N}) where {T,N} = CuArray{T,N}(A) 117 | 118 | # idempotency 119 | CuArray{T,N}(xs::CuArray{T,N}) where {T,N} = xs 120 | 121 | 122 | ## conversions 123 | 124 | Base.convert(::Type{T}, x::T) where T <: CuArray = x 125 | 126 | function Base._reshape(parent::CuArray, dims::Dims) 127 | n = length(parent) 128 | prod(dims) == n || throw(DimensionMismatch("parent has $n elements, which is incompatible with size $dims")) 129 | return CuArray{eltype(parent),length(dims)}(parent.buf, dims; 130 | offset=parent.offset, own=parent.own) 131 | end 132 | function Base._reshape(parent::CuArray{T,1}, dims::Tuple{Int}) where T 133 | n = length(parent) 134 | prod(dims) == n || throw(DimensionMismatch("parent has $n elements, which is incompatible with size $dims")) 135 | return parent 136 | end 137 | 138 | 139 | ## interop with C libraries 140 | 141 | """ 142 | buffer(array::CuArray [, index]) 143 | 144 | Get the native address of a CuArray, optionally at a given location `index`. 145 | Equivalent of `Base.pointer` on `Array`s. 146 | """ 147 | function buffer(xs::CuArray, index::Integer=1) 148 | extra_offset = (index-1) * Base.elsize(xs) 149 | view(xs.buf, xs.offset + extra_offset) 150 | end 151 | 152 | Base.cconvert(::Type{<:Ptr}, x::CuArray) = throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) 153 | Base.cconvert(::Type{<:CuPtr}, x::CuArray) = buffer(x) 154 | 155 | 156 | ## interop with CUDAnative 157 | 158 | function Base.convert(::Type{CuDeviceArray{T,N,AS.Global}}, a::CuArray{T,N}) where {T,N} 159 | ptr = convert(CuPtr{T}, buffer(a)) 160 | CuDeviceArray{T,N,AS.Global}(a.dims, DevicePtr{T,AS.Global}(ptr)) 161 | end 162 | 163 | Adapt.adapt_storage(::CUDAnative.Adaptor, xs::CuArray{T,N}) where {T,N} = 164 | convert(CuDeviceArray{T,N,AS.Global}, xs) 165 | 166 | 167 | ## interop with CPU arrays 168 | 169 | # We don't convert isbits types in `adapt`, since they are already 170 | # considered GPU-compatible. 171 | 172 | Adapt.adapt_storage(::Type{<:CuArray}, xs::AbstractArray) = 173 | isbits(xs) ? xs : convert(CuArray, xs) 174 | 175 | Adapt.adapt_storage(::Type{<:CuArray{T}}, xs::AbstractArray{<:Real}) where T <: AbstractFloat = 176 | isbits(xs) ? 
xs : convert(CuArray{T}, xs) 177 | 178 | Adapt.adapt_storage(::Type{<:Array}, xs::CuArray) = convert(Array, xs) 179 | 180 | Base.collect(x::CuArray{T,N}) where {T,N} = copyto!(Array{T,N}(undef, size(x)), x) 181 | 182 | function Base.copyto!(dest::CuArray{T}, doffs::Integer, src::Array{T}, soffs::Integer, 183 | n::Integer) where T 184 | @boundscheck checkbounds(dest, doffs+n-1) 185 | @boundscheck checkbounds(src, soffs+n-1) 186 | Mem.copy!(buffer(dest, doffs), pointer(src, soffs), n*sizeof(T)) 187 | return dest 188 | end 189 | 190 | function Base.copyto!(dest::Array{T}, doffs::Integer, src::CuArray{T}, soffs::Integer, 191 | n::Integer) where T 192 | @boundscheck checkbounds(dest, doffs+n-1) 193 | @boundscheck checkbounds(src, soffs+n-1) 194 | Mem.copy!(pointer(dest, doffs), buffer(src, soffs), n*sizeof(T)) 195 | return dest 196 | end 197 | 198 | function Base.copyto!(dest::CuArray{T}, doffs::Integer, src::CuArray{T}, soffs::Integer, 199 | n::Integer) where T 200 | @boundscheck checkbounds(dest, doffs+n-1) 201 | @boundscheck checkbounds(src, soffs+n-1) 202 | Mem.copy!(buffer(dest, doffs), buffer(src, soffs), n*sizeof(T)) 203 | return dest 204 | end 205 | 206 | function Base.deepcopy_internal(x::CuArray, dict::IdDict) 207 | haskey(dict, x) && return dict[x]::typeof(x) 208 | return dict[x] = copy(x) 209 | end 210 | 211 | 212 | ## utilities 213 | 214 | cu(xs) = adapt(CuArray{Float32}, xs) 215 | Base.getindex(::typeof(cu), xs...) = CuArray([xs...]) 216 | 217 | zeros(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 0) 218 | ones(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 1) 219 | zeros(dims...) = CuArrays.zeros(Float32, dims...) 220 | ones(dims...) = CuArrays.ones(Float32, dims...) 221 | fill(v, dims...) = fill!(CuArray{typeof(v)}(undef, dims...), v) 222 | fill(v, dims::Dims) = fill!(CuArray{typeof(v)}(undef, dims...), v) 223 | 224 | # optimized implementation of `fill!` for types that are directly supported by memset 225 | const MemsetTypes = Dict(1=>UInt8, 2=>UInt16, 4=>UInt32) 226 | const MemsetCompatTypes = Union{UInt8, Int8, 227 | UInt16, Int16, Float16, 228 | UInt32, Int32, Float32} 229 | function Base.fill!(A::CuArray{T}, x) where T <: MemsetCompatTypes 230 | y = reinterpret(MemsetTypes[sizeof(T)], convert(T, x)) 231 | Mem.set!(buffer(A), y, length(A)) 232 | A 233 | end 234 | 235 | 236 | ## generic linear algebra routines 237 | 238 | function LinearAlgebra.tril!(A::CuMatrix{T}, d::Integer = 0) where T 239 | function kernel!(_A, _d) 240 | li = (blockIdx().x - 1) * blockDim().x + threadIdx().x 241 | m, n = size(_A) 242 | if 0 < li <= m*n 243 | i, j = Tuple(CartesianIndices(_A)[li]) 244 | if i < j - _d 245 | _A[i, j] = 0 246 | end 247 | end 248 | return nothing 249 | end 250 | 251 | blk, thr = cudims(A) 252 | @cuda blocks=blk threads=thr kernel!(A, d) 253 | return A 254 | end 255 | 256 | function LinearAlgebra.triu!(A::CuMatrix{T}, d::Integer = 0) where T 257 | function kernel!(_A, _d) 258 | li = (blockIdx().x - 1) * blockDim().x + threadIdx().x 259 | m, n = size(_A) 260 | if 0 < li <= m*n 261 | i, j = Tuple(CartesianIndices(_A)[li]) 262 | if j < i + _d 263 | _A[i, j] = 0 264 | end 265 | end 266 | return nothing 267 | end 268 | 269 | blk, thr = cudims(A) 270 | @cuda blocks=blk threads=thr kernel!(A, d) 271 | return A 272 | end 273 | 274 | 275 | ## reversing 276 | 277 | function _reverse(input::CuVector{T}, output::CuVector{T}) where {T} 278 | @assert length(input) == length(output) 279 | 280 | nthreads = 256 281 | nblocks = ceil(Int, length(input) / nthreads) 282 | shmem = 
nthreads * sizeof(T)
283 | 
284 |     function kernel(input::CuDeviceVector{T}, output::CuDeviceVector{T}) where {T}
285 |         shared = @cuDynamicSharedMem(T, blockDim().x)
286 | 
287 |         # load one element per thread from device memory and buffer it in reversed order
288 | 
289 |         offset_in = blockDim().x * (blockIdx().x - 1)
290 |         index_in = offset_in + threadIdx().x
291 | 
292 |         if index_in <= length(input)
293 |             index_shared = blockDim().x - threadIdx().x + 1
294 |             @inbounds shared[index_shared] = input[index_in]
295 |         end
296 | 
297 |         sync_threads()
298 | 
299 |         # write back in forward order, but to the reversed block offset as before
300 | 
301 |         offset_out = length(output) - blockDim().x * blockIdx().x
302 |         index_out = offset_out + threadIdx().x
303 | 
304 |         if 1 <= index_out <= length(output)
305 |             index_shared = threadIdx().x
306 |             @inbounds output[index_out] = shared[index_shared]
307 |         end
308 | 
309 |         return
310 |     end
311 | 
312 |     @cuda threads=nthreads blocks=nblocks shmem=shmem kernel(input, output)
313 | 
314 |     return
315 | end
316 | 
317 | function Base.reverse!(v::CuVector, start=1, stop=length(v))
318 |     v′ = view(v, start:stop)
319 |     _reverse(v′, v′)
320 |     return v
321 | end
322 | 
323 | function Base.reverse(v::CuVector, start=1, stop=length(v))
324 |     v′ = similar(v)
325 |     start > 1 && copyto!(v′, 1, v, 1, start-1)
326 |     _reverse(view(v, start:stop), view(v′, start:stop))
327 |     stop < length(v) && copyto!(v′, stop+1, v, stop+1, length(v)-stop)
328 |     return v′
329 | end
330 | 
--------------------------------------------------------------------------------