├── test
│   ├── REQUIRE
│   ├── util.jl
│   ├── runtests.jl
│   ├── rand.jl
│   ├── dnn.jl
│   ├── sparse_solver.jl
│   ├── base.jl
│   ├── fft.jl
│   └── solver.jl
├── docs
│   ├── src
│   │   ├── index.md
│   │   └── tutorials
│   │       ├── intro1.png
│   │       └── common.jl
│   ├── .gitignore
│   ├── Project.toml
│   └── make.jl
├── deps
│   ├── .gitignore
│   └── build.jl
├── bors.toml
├── .gitignore
├── src
│   ├── deprecated.jl
│   ├── indexing.jl
│   ├── dnn
│   │   ├── error.jl
│   │   ├── CUDNN.jl
│   │   ├── nnlib.jl
│   │   ├── helpers.jl
│   │   └── libcudnn_types.jl
│   ├── blas
│   │   ├── util.jl
│   │   ├── CUBLAS.jl
│   │   ├── error.jl
│   │   ├── README.md
│   │   ├── libcublas_types.jl
│   │   └── highlevel.jl
│   ├── fft
│   │   ├── CUFFT.jl
│   │   ├── genericfft.jl
│   │   ├── error.jl
│   │   ├── fft.jl
│   │   ├── libcufft_types.jl
│   │   ├── libcufft.jl
│   │   ├── highlevel.jl
│   │   └── wrappers.jl
│   ├── nnlib.jl
│   ├── rand
│   │   ├── CURAND.jl
│   │   ├── error.jl
│   │   ├── libcurand_types.jl
│   │   ├── highlevel.jl
│   │   └── libcurand.jl
│   ├── sparse
│   │   ├── CUSPARSE.jl
│   │   ├── error.jl
│   │   ├── highlevel.jl
│   │   ├── libcusparse.jl
│   │   ├── libcusparse_types.jl
│   │   └── array.jl
│   ├── subarray.jl
│   ├── solver
│   │   ├── error.jl
│   │   ├── CUSOLVER.jl
│   │   ├── libcusolver_types.jl
│   │   ├── highlevel.jl
│   │   └── libcusolver.jl
│   ├── utils.jl
│   ├── broadcast.jl
│   ├── gpuarray_interface.jl
│   ├── CuArrays.jl
│   ├── matmul.jl
│   ├── mapreduce.jl
│   └── array.jl
├── REQUIRE
├── .github
│   └── ISSUE_TEMPLATE
│       ├── feature_request.md
│       └── bug_report.md
├── Project.toml
├── LICENSE.md
├── .gitlab-ci.yml
└── README.md

/test/REQUIRE:
--------------------------------------------------------------------------------
1 | FFTW
2 | 
--------------------------------------------------------------------------------
/docs/src/index.md:
--------------------------------------------------------------------------------
1 | # CuArrays.jl
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | src/**/generated/
2 | 
--------------------------------------------------------------------------------
/deps/.gitignore:
--------------------------------------------------------------------------------
1 | ext.jl.bak
2 | build.log
3 | 
4 | 
--------------------------------------------------------------------------------
/bors.toml:
--------------------------------------------------------------------------------
1 | status = [
2 |   "ci/gitlab/%"
3 | ]
4 | delete_merged_branches = true
5 | 
--------------------------------------------------------------------------------
/docs/src/tutorials/intro1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dpsanders/CuArrays.jl/master/docs/src/tutorials/intro1.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.jl.cov
2 | *.jl.*.cov
3 | *.jl.mem
4 | deps/ext.jl
5 | Manifest.toml
6 | tutorials/build/
7 | docs/build/
8 | 
--------------------------------------------------------------------------------
/src/deprecated.jl:
--------------------------------------------------------------------------------
1 | # Deprecated functionality
2 | 
3 | import Base: @deprecate_binding
4 | 
5 | @deprecate_binding BLAS CUBLAS
6 | @deprecate_binding FFT CUFFT
7 | 
--------------------------------------------------------------------------------
/REQUIRE:
--------------------------------------------------------------------------------
1 | julia 1.0
2 | CUDAnative 1.1
3 | CUDAdrv 1.1
4 | CUDAapi 0.5.3
5 | NNlib 0.5.0
6 | GPUArrays 0.5
7 | Adapt 0.4
8 | 
AbstractFFTs 9 | MacroTools 10 | ForwardDiff 11 | DiffRules 12 | -------------------------------------------------------------------------------- /docs/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" 3 | Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" 4 | Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" 5 | Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" 6 | -------------------------------------------------------------------------------- /test/util.jl: -------------------------------------------------------------------------------- 1 | macro grab_output(ex) 2 | quote 3 | mktemp() do fname, fout 4 | ret = nothing 5 | open(fname, "w") do fout 6 | redirect_stdout(fout) do 7 | ret = $(esc(ex)) 8 | end 9 | end 10 | ret, read(fname, String) 11 | end 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /src/indexing.jl: -------------------------------------------------------------------------------- 1 | import GPUArrays: allowscalar, @allowscalar 2 | 3 | function _getindex(xs::CuArray{T}, i::Integer) where T 4 | buf = Mem.view(buffer(xs), (i-1)*sizeof(T)) 5 | return Mem.download(T, buf)[1] 6 | end 7 | 8 | function _setindex!(xs::CuArray{T}, v::T, i::Integer) where T 9 | buf = Mem.view(buffer(xs), (i-1)*sizeof(T)) 10 | Mem.upload!(buf, T[v]) 11 | end 12 | -------------------------------------------------------------------------------- /docs/src/tutorials/common.jl: -------------------------------------------------------------------------------- 1 | # function to run a Julia script outside of the current environment 2 | function script(code; wrapper=``, args=``) 3 | if Base.JLOptions().project != C_NULL 4 | args = `$args --project=$(unsafe_string(Base.JLOptions().project))` 5 | end 6 | mktemp() do path, io 7 | write(io, code) 8 | flush(io) 9 | cmd = `$wrapper $(Base.julia_cmd()) $args $path` 10 | # redirect stderr to stdout to have it picked up by Weave.jl 11 | run(pipeline(ignorestatus(cmd), stderr=stdout)) 12 | end 13 | nothing 14 | end 15 | -------------------------------------------------------------------------------- /src/dnn/error.jl: -------------------------------------------------------------------------------- 1 | export CUDNNError 2 | 3 | struct CUDNNError <: Exception 4 | code::cudnnStatus_t 5 | msg::AbstractString 6 | end 7 | Base.show(io::IO, err::CUDNNError) = print(io, "CUDNNError(code $(err.code), $(err.msg))") 8 | 9 | function CUDNNError(status::cudnnStatus_t) 10 | msg = unsafe_string(cudnnGetErrorString(status)) 11 | return CUDNNError(status, msg) 12 | end 13 | 14 | macro check(dnn_func) 15 | quote 16 | local err::cudnnStatus_t 17 | err = $(esc(dnn_func)) 18 | if err != CUDNN_STATUS_SUCCESS 19 | throw(CUDNNError(err)) 20 | end 21 | err 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 
14 | 
15 | **Describe alternatives you've considered**
16 | A clear and concise description of any alternative solutions or features you've considered.
17 | 
18 | **Additional context**
19 | Add any other context or screenshots about the feature request here.
20 | 
--------------------------------------------------------------------------------
/docs/make.jl:
--------------------------------------------------------------------------------
1 | using Documenter
2 | using Literate
3 | 
4 | using Pkg
5 | if haskey(ENV, "GITLAB_CI")
6 |     Pkg.add([PackageSpec(name = x; rev = "master")
7 |              for x in ["CUDAapi", "GPUArrays", "CUDAnative", "NNlib", "CUDAdrv"]])
8 | end
9 | 
10 | using CuArrays
11 | 
12 | # generate tutorials
13 | OUTPUT = joinpath(@__DIR__, "src/tutorials/generated")
14 | Literate.markdown(joinpath(@__DIR__, "src/tutorials/intro.jl"), OUTPUT)
15 | 
16 | makedocs(
17 |     modules = [CuArrays],
18 |     format = Documenter.HTML(prettyurls = get(ENV, "CI", nothing) == "true"),
19 |     sitename = "CuArrays.jl",
20 |     pages = [
21 |         "Home" => "index.md",
22 |         "Tutorials" => [
23 |             "tutorials/generated/intro.md"
24 |         ],
25 |     ],
26 |     doctest = true
27 | )
28 | 
--------------------------------------------------------------------------------
/src/blas/util.jl:
--------------------------------------------------------------------------------
1 | # convert matrix to band storage
2 | function band(A::AbstractMatrix,kl,ku)
3 |     m, n = size(A)
4 |     AB = zeros(eltype(A),kl+ku+1,n)
5 |     for j = 1:n
6 |         for i = max(1,j-ku):min(m,j+kl)
7 |             AB[ku+1-j+i,j] = A[i,j]
8 |         end
9 |     end
10 |     return AB
11 | end
12 | 
13 | # convert band storage to general matrix
14 | function unband(AB::AbstractMatrix,m,kl,ku)
15 |     bm, n = size(AB)
16 |     A = zeros(eltype(AB),m,n)
17 |     for j = 1:n
18 |         for i = max(1,j-ku):min(m,j+kl)
19 |             A[i,j] = AB[ku+1-j+i,j]
20 |         end
21 |     end
22 |     return A
23 | end
24 | 
25 | # zero out elements not on matrix bands
26 | function bandex(A::AbstractMatrix,kl,ku)
27 |     m, n = size(A)
28 |     AB = band(A,kl,ku)
29 |     B = unband(AB,m,kl,ku)
30 |     return B
31 | end
32 | 
--------------------------------------------------------------------------------
/src/fft/CUFFT.jl:
--------------------------------------------------------------------------------
1 | module CUFFT
2 | 
3 | import CUDAapi
4 | 
5 | import CUDAdrv: CuPtr, PtrOrCuPtr
6 | 
7 | using ..CuArrays
8 | using ..CuArrays: libcufft, configured
9 | 
10 | import AbstractFFTs: plan_fft, plan_fft!, plan_bfft, plan_bfft!,
11 |     plan_rfft, plan_brfft, plan_inv, normalization, fft, bfft, ifft, rfft,
12 |     Plan, ScaledPlan
13 | import Base: show, *, convert, unsafe_convert, size, strides, ndims
14 | import Base.Sys: WORD_SIZE
15 | 
16 | using LinearAlgebra
17 | import LinearAlgebra: mul!
18 | 
19 | include("libcufft_types.jl")
20 | include("error.jl")
21 | 
22 | include("libcufft.jl")
23 | include("genericfft.jl")
24 | include("fft.jl")
25 | include("wrappers.jl")
26 | include("highlevel.jl")
27 | 
28 | version() = VersionNumber(cufftGetProperty(CUDAapi.MAJOR_VERSION),
29 |                           cufftGetProperty(CUDAapi.MINOR_VERSION),
30 |                           cufftGetProperty(CUDAapi.PATCH_LEVEL))
31 | 
32 | end
33 | 
--------------------------------------------------------------------------------
/test/runtests.jl:
--------------------------------------------------------------------------------
1 | using Test
2 | 
3 | # development often happens in lockstep with other packages,
4 | # so check out the master branch of those packages.
5 | using Pkg 6 | if haskey(ENV, "GITLAB_CI") 7 | Pkg.add([PackageSpec(name = x; rev = "master") 8 | for x in ["CUDAapi", "GPUArrays", "CUDAnative", "NNlib", "CUDAdrv"]]) 9 | end 10 | 11 | include("util.jl") 12 | 13 | using Random 14 | Random.seed!(1) 15 | 16 | using CuArrays 17 | 18 | using GPUArrays 19 | import GPUArrays: allowscalar, @allowscalar 20 | 21 | testf(f, xs...; kwargs...) = GPUArrays.TestSuite.compare(f, CuArray, xs...; kwargs...) 22 | 23 | allowscalar(false) 24 | 25 | @testset "CuArrays" begin 26 | 27 | include("base.jl") 28 | include("dnn.jl") 29 | include("blas.jl") 30 | include("sparse.jl") 31 | include("solver.jl") 32 | include("fft.jl") 33 | include("rand.jl") 34 | include("sparse_solver.jl") 35 | 36 | CuArrays.pool_status() 37 | CuArrays.pool_timings() 38 | 39 | end 40 | -------------------------------------------------------------------------------- /src/dnn/CUDNN.jl: -------------------------------------------------------------------------------- 1 | module CUDNN 2 | 3 | import CUDAapi 4 | 5 | import CUDAdrv: CUDAdrv, CuContext, CuPtr, CU_NULL 6 | 7 | using ..CuArrays 8 | using ..CuArrays: libcudnn, active_context, configured, unsafe_free! 9 | 10 | include("libcudnn_types.jl") 11 | include("error.jl") 12 | 13 | const _handles = Dict{CuContext,cudnnHandle_t}() 14 | const _handle = Ref{cudnnHandle_t}(C_NULL) 15 | 16 | function handle() 17 | if _handle[] == C_NULL 18 | @assert isassigned(active_context) # some other call should have initialized CUDA 19 | _handle[] = get!(_handles, active_context[]) do 20 | context = active_context[] 21 | handle = cudnnCreate() 22 | atexit(()->CUDAdrv.isvalid(context) && cudnnDestroy(handle)) 23 | handle 24 | end 25 | end 26 | 27 | return _handle[] 28 | end 29 | 30 | include("libcudnn.jl") 31 | include("helpers.jl") 32 | include("nnlib.jl") 33 | 34 | version() = VersionNumber(cudnnGetProperty(CUDAapi.MAJOR_VERSION), 35 | cudnnGetProperty(CUDAapi.MINOR_VERSION), 36 | cudnnGetProperty(CUDAapi.PATCH_LEVEL)) 37 | 38 | end 39 | -------------------------------------------------------------------------------- /Project.toml: -------------------------------------------------------------------------------- 1 | name = "CuArrays" 2 | uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" 3 | version = "0.10.0" 4 | 5 | [deps] 6 | AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" 7 | Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" 8 | CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" 9 | CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" 10 | CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" 11 | DiffRules = "b552c78f-8df3-52c6-915a-8e097449b14b" 12 | ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" 13 | GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" 14 | LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" 15 | MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" 16 | NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" 17 | Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" 18 | Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" 19 | SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" 20 | TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" 21 | 22 | [extras] 23 | FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" 24 | Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" 25 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 26 | 27 | [targets] 28 | test = ["Test", "FFTW", "Pkg"] 29 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The 
CuArrays.jl package is licensed under the MIT "Expat" License: 2 | 3 | > Copyright (c) 2017: Mike J Innes. 4 | > 5 | > Permission is hereby granted, free of charge, to any person obtaining a copy 6 | > of this software and associated documentation files (the "Software"), to deal 7 | > in the Software without restriction, including without limitation the rights 8 | > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | > copies of the Software, and to permit persons to whom the Software is 10 | > furnished to do so, subject to the following conditions: 11 | > 12 | > The above copyright notice and this permission notice shall be included in all 13 | > copies or substantial portions of the Software. 14 | > 15 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | > AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | > SOFTWARE. 22 | > 23 | -------------------------------------------------------------------------------- /src/nnlib.jl: -------------------------------------------------------------------------------- 1 | using NNlib 2 | import NNlib: conv!, ∇conv_filter!, ∇conv_data!, 3 | maxpool!, meanpool!, ∇maxpool!, ∇meanpool!, 4 | softmax, softmax!, ∇softmax!, logsoftmax, logsoftmax!, ∇logsoftmax 5 | using CUDAnative 6 | 7 | # Activation functions 8 | @cufunc σ(x) = ifelse(x < -80, zero(x), one(x) / (one(x) + exp(-x))) 9 | 10 | @cufunc function logσ(x) 11 | max_v = max(zero(x), -x) 12 | z = exp(-max_v) + exp(-x-max_v) 13 | -(max_v + log(z)) 14 | end 15 | 16 | @cufunc elu(x, α = one(x)) = 17 | ifelse(x ≥ 0, x/1, α * (exp(x) - one(x))) 18 | 19 | # TODO: make @cufunc recognise its own definitions 20 | cufunc(::typeof(swish)) = x -> x * cufunc(σ)(x) 21 | 22 | @cufunc function selu(x) 23 | λ = oftype(x/1, 1.0507009873554804934193349852946) 24 | α = oftype(x/1, 1.6732632423543772848170429916717) 25 | λ * ifelse(x > 0, x/1, α * (exp(x) - 1)) 26 | end 27 | 28 | @cufunc softplus(x) = log1p(exp(x)) 29 | 30 | if !@isdefined CUDNN 31 | function conv!(y::CuArray, x::CuArray, w::CuArray; kw...) 32 | error("CUDNN is not installed.") 33 | end 34 | function softmax!(out::CuVecOrMat, xs::CuVecOrMat) 35 | error("CUDNN is not installed.") 36 | end 37 | function logsoftmax!(out::CuVecOrMat, xs::CuVecOrMat) 38 | error("CUDNN is not installed.") 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /src/rand/CURAND.jl: -------------------------------------------------------------------------------- 1 | module CURAND 2 | 3 | import CUDAdrv: CUDAdrv, CuContext, CuPtr 4 | import CUDAapi 5 | 6 | using ..CuArrays 7 | using ..CuArrays: libcurand, active_context 8 | 9 | using GPUArrays 10 | 11 | using Random 12 | 13 | export curand, 14 | curandn, 15 | curand_logn, rand_logn!, 16 | curand_poisson, rand_poisson! 
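# Illustrative usage of the exports above -- a sketch drawn from this package's
# own test suite (test/rand.jl), assuming a functional CUDA setup; it is not
# executed as part of this module:
#
#   CURAND.seed!()                # reseed the default generator
#   A = curand(Float32, 2, 2)     # out-of-place uniform samples on the GPU
#   B = CuArray{Float32}(undef, 4)
#   rand_logn!(B)                 # in-place log-normal samples
#   rand_poisson!(CuArray{Cuint}(undef, 4))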
17 | 18 | include("libcurand_types.jl") 19 | include("error.jl") 20 | 21 | const _generators = Dict{CuContext,RNG}() 22 | const _generator = Ref{Union{Nothing,RNG}}(nothing) 23 | 24 | function generator() 25 | if _generator[] == nothing 26 | @assert isassigned(active_context) # some other call should have initialized CUDA 27 | _generator[] = get!(_generators, active_context[]) do 28 | context = active_context[] 29 | generator = create_generator() 30 | # FIXME: crashes 31 | #atexit(()->CUDAdrv.isvalid(context) && destroy_generator(generator)) 32 | generator 33 | end 34 | end 35 | 36 | return _generator[]::RNG 37 | end 38 | 39 | include("libcurand.jl") 40 | include("highlevel.jl") 41 | 42 | version() = VersionNumber(curandGetProperty(CUDAapi.MAJOR_VERSION), 43 | curandGetProperty(CUDAapi.MINOR_VERSION), 44 | curandGetProperty(CUDAapi.PATCH_LEVEL)) 45 | 46 | end 47 | -------------------------------------------------------------------------------- /test/rand.jl: -------------------------------------------------------------------------------- 1 | @testset "CURAND" begin 2 | 3 | if !isdefined(CuArrays, :CURAND) 4 | @warn "Not testing CURAND" 5 | else 6 | using CuArrays.CURAND 7 | @info "Testing CURAND $(CURAND.version())" 8 | 9 | CURAND.seed!() 10 | 11 | # in-place 12 | for (f,T) in ((rand!,Float32), 13 | (randn!,Float32), 14 | (rand_logn!,Float32), 15 | (rand_poisson!,Cuint)), 16 | d in (2, (2,2), (2,2,2)) 17 | A = CuArray{T}(undef, d) 18 | f(A) 19 | end 20 | 21 | # out-of-place, with implicit type 22 | for (f,T) in ((curand,Float32), (curandn,Float32), (curand_logn,Float32), 23 | (curand_poisson,Cuint)), 24 | args in ((2,), (2, 2)) 25 | A = f(args...) 26 | @test eltype(A) == T 27 | end 28 | 29 | # out-of-place, with type specified 30 | for (f,T) in ((curand,Float32), (curandn,Float32), (curand_logn,Float32), 31 | (curand,Float64), (curandn,Float64), (curand_logn,Float64), 32 | (curand_poisson,Cuint)), 33 | args in ((T, 2), (T, 2, 2), (T, (2, 2))) 34 | A = f(args...) 35 | @test eltype(A) == T 36 | end 37 | 38 | # unsupported types that fall back to GPUArrays 39 | for (f,T) in ((curand,Int64),), 40 | args in ((T, 2), (T, 2, 2), (T, (2, 2))) 41 | A = f(args...) 42 | @test eltype(A) == T 43 | end 44 | for (f,T) in ((rand!,Int64),), 45 | d in (2, (2,2), (2,2,2)) 46 | A = CuArray{T}(undef, d) 47 | f(A) 48 | end 49 | 50 | end 51 | 52 | end 53 | -------------------------------------------------------------------------------- /src/blas/CUBLAS.jl: -------------------------------------------------------------------------------- 1 | module CUBLAS 2 | 3 | import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL 4 | import CUDAapi 5 | 6 | using ..CuArrays 7 | using ..CuArrays: libcublas, active_context, unsafe_free! 
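# NOTE: as with the other wrapper modules, a single CUBLAS handle is created
# lazily per CUDA context (see `handle()` below), cached in `_handles`, and
# destroyed at exit while its owning context is still valid.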
8 | 
9 | using LinearAlgebra
10 | 
11 | include("libcublas_types.jl")
12 | include("error.jl")
13 | 
14 | const _handles = Dict{CuContext,cublasHandle_t}()
15 | const _handle = Ref{cublasHandle_t}(C_NULL)
16 | 
17 | function handle()
18 |     if _handle[] == C_NULL
19 |         @assert isassigned(active_context) # some other call should have initialized CUDA
20 |         _handle[] = get!(_handles, active_context[]) do
21 |             context = active_context[]
22 |             handle = cublasCreate_v2()
23 | 
24 |             # enable tensor math mode if our device supports it, and fast math is enabled
25 |             dev = CUDAdrv.device(context)
26 |             if Base.JLOptions().fast_math == 1 && CUDAdrv.capability(dev) >= v"7.0"
27 |                 cublasSetMathMode(CUBLAS_TENSOR_OP_MATH, handle)
28 |             end
29 | 
30 |             atexit(()->CUDAdrv.isvalid(context) && cublasDestroy_v2(handle))
31 |             handle
32 |         end
33 |     end
34 | 
35 |     return _handle[]
36 | end
37 | 
38 | include("libcublas.jl")
39 | include("util.jl")
40 | include("wrappers.jl")
41 | include("highlevel.jl")
42 | 
43 | version() = VersionNumber(cublasGetProperty(CUDAapi.MAJOR_VERSION),
44 |                           cublasGetProperty(CUDAapi.MINOR_VERSION),
45 |                           cublasGetProperty(CUDAapi.PATCH_LEVEL))
46 | 
47 | end
48 | 
--------------------------------------------------------------------------------
/src/sparse/CUSPARSE.jl:
--------------------------------------------------------------------------------
1 | module CUSPARSE
2 | 
3 | import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL
4 | import CUDAapi
5 | 
6 | using ..CuArrays
7 | using ..CuArrays: libcusparse, active_context, unsafe_free!
8 | 
9 | using SparseArrays
10 | using LinearAlgebra
11 | 
12 | import Base.one
13 | import Base.zero
14 | 
15 | const SparseChar = Char
16 | 
17 | export CuSparseMatrixCSC, CuSparseMatrixCSR,
18 |        CuSparseMatrixHYB, CuSparseMatrixBSR,
19 |        CuSparseMatrix, AbstractCuSparseMatrix,
20 |        CuSparseVector
21 | 
22 | include("libcusparse_types.jl")
23 | include("error.jl")
24 | 
25 | const _handles = Dict{CuContext,cusparseHandle_t}()
26 | const _handle = Ref{cusparseHandle_t}(C_NULL)
27 | 
28 | function handle()
29 |     if _handle[] == C_NULL
30 |         @assert isassigned(active_context) # some other call should have initialized CUDA
31 |         _handle[] = get!(_handles, active_context[]) do
32 |             context = active_context[]
33 |             handle = cusparseCreate()
34 |             atexit(()->CUDAdrv.isvalid(context) && cusparseDestroy(handle))
35 |             handle
36 |         end
37 |     end
38 | 
39 |     return _handle[]
40 | end
41 | 
42 | include("libcusparse.jl")
43 | include("array.jl")
44 | include("util.jl")
45 | include("wrappers.jl")
46 | include("highlevel.jl")
47 | 
48 | version() = VersionNumber(cusparseGetProperty(CUDAapi.MAJOR_VERSION),
49 |                           cusparseGetProperty(CUDAapi.MINOR_VERSION),
50 |                           cusparseGetProperty(CUDAapi.PATCH_LEVEL))
51 | 
52 | end
53 | 
--------------------------------------------------------------------------------
/src/subarray.jl:
--------------------------------------------------------------------------------
1 | import Base: view
2 | 
3 | using Base: ScalarIndex, ViewIndex, Slice, @_inline_meta, @boundscheck,
4 |             to_indices, compute_offset1, unsafe_length, _maybe_reshape_parent, index_ndims
5 | 
6 | struct Contiguous end
7 | struct NonContiguous end
8 | 
9 | # Detect whether the view is contiguous or not
10 | CuIndexStyle() = Contiguous()
11 | CuIndexStyle(I...) = NonContiguous()
12 | CuIndexStyle(i1::Colon, ::ScalarIndex...) = Contiguous()
13 | CuIndexStyle(i1::AbstractUnitRange, ::ScalarIndex...) = Contiguous()
14 | CuIndexStyle(i1::Colon, I...) = CuIndexStyle(I...)
15 | 
16 | cuviewlength() = ()
17 | cuviewlength(::Real, I...) = (@_inline_meta; cuviewlength(I...)) # skip scalars
18 | cuviewlength(i1::AbstractUnitRange, I...) = (@_inline_meta; (unsafe_length(i1), cuviewlength(I...)...))
19 | cuviewlength(i1::AbstractUnitRange, ::ScalarIndex...) = (@_inline_meta; (unsafe_length(i1),))
20 | 
21 | view(A::CuArray, I::Vararg{Any,N}) where {N} = (@_inline_meta; _cuview(A, I, CuIndexStyle(I...)))
22 | 
23 | function _cuview(A, I, ::Contiguous)
24 |     @_inline_meta
25 |     J = to_indices(A, I)
26 |     @boundscheck checkbounds(A, J...)
27 |     _cuview(_maybe_reshape_parent(A, index_ndims(J...)), J, cuviewlength(J...))
28 | end
29 | 
30 | # for contiguous views just return a new CuArray
31 | _cuview(A::CuArray{T}, I::NTuple{N,ViewIndex}, dims::NTuple{M,Integer}) where {T,N,M} =
32 |     CuArray{T,M}(A.buf, dims; offset=A.offset + compute_offset1(A, 1, I) * sizeof(T), own=A.own)
33 | 
34 | # fallback to SubArray when the view is not contiguous
35 | _cuview(A, I, ::NonContiguous) = invoke(view, Tuple{AbstractArray, typeof(I).parameters...}, A, I...)
36 | 
--------------------------------------------------------------------------------
/src/solver/error.jl:
--------------------------------------------------------------------------------
1 | export CUSOLVERError
2 | 
3 | struct CUSOLVERError <: Exception
4 |     code::cusolverStatus_t
5 |     msg::AbstractString
6 | end
7 | Base.show(io::IO, err::CUSOLVERError) = print(io, "CUSOLVERError(code $(err.code), $(err.msg))")
8 | 
9 | function CUSOLVERError(code::cusolverStatus_t)
10 |     msg = status_message(code)
11 |     return CUSOLVERError(code, msg)
12 | end
13 | 
14 | function status_message(status)
15 |     if status == CUSOLVER_STATUS_SUCCESS
16 |         return "the operation completed successfully"
17 |     elseif status == CUSOLVER_STATUS_NOT_INITIALIZED
18 |         return "the library was not initialized"
19 |     elseif status == CUSOLVER_STATUS_ALLOC_FAILED
20 |         return "the resource allocation failed"
21 |     elseif status == CUSOLVER_STATUS_INVALID_VALUE
22 |         return "an invalid value was used as an argument"
23 |     elseif status == CUSOLVER_STATUS_ARCH_MISMATCH
24 |         return "an absent device architectural feature is required"
25 |     elseif status == CUSOLVER_STATUS_EXECUTION_FAILED
26 |         return "the GPU program failed to execute"
27 |     elseif status == CUSOLVER_STATUS_INTERNAL_ERROR
28 |         return "an internal operation failed"
29 |     elseif status == CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED
30 |         return "the matrix type is not supported"
31 |     else
32 |         return "unknown status"
33 |     end
34 | end
35 | 
36 | macro check(solver_func)
37 |     quote
38 |         local err::cusolverStatus_t
39 |         err = $(esc(solver_func::Expr))
40 |         if err != CUSOLVER_STATUS_SUCCESS
41 |             throw(CUSOLVERError(err))
42 |         end
43 |         err
44 |     end
45 | end
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | **Sanity checks (read this first, then remove this section)**
11 | Make sure you're reporting *a bug*; for general questions, please use Discourse.
12 | 
13 | If you're dealing with a performance issue, make sure you **disable scalar iteration** (`CuArrays.allowscalar(false)`). Only file an issue if that shows scalar iteration happening within Base or CuArrays, as opposed to your own code.
14 | 
15 | If you're seeing an error message, **follow the error message instructions**, if any (e.g. `inspect code with @device_code_warntype`). If you can't solve the problem using that information, make sure to post it as part of the issue.
16 | 
17 | If your bug is still valid, please go ahead and fill out the template below.
18 | 
19 | **Describe the bug**
20 | A clear and concise description of what the bug is.
21 | 
22 | **To Reproduce**
23 | The Minimal Working Example (MWE) for this bug:
24 | ```julia
25 | # some code here
26 | ```
27 | 
28 | **Expected behavior**
29 | A clear and concise description of what you expected to happen.
30 | 
31 | **Build log**
32 | ```
33 | # post the output of Pkg.build()
34 | # make sure the error still reproduces after that.
35 | ```
36 | 
37 | **Environment details (please complete this section)**
38 | Details on Julia:
39 | ```
40 | # please post the output of:
41 | versioninfo()
42 | ```
43 | 
44 | Julia packages:
45 | - CuArrays.jl:
46 | - CUDAnative.jl:
47 | - ...
48 | 
49 | CUDA: toolkit and driver version
50 | 
51 | 
52 | **Additional context**
53 | Add any other context about the problem here.
54 | 
--------------------------------------------------------------------------------
/src/sparse/error.jl:
--------------------------------------------------------------------------------
1 | export CUSPARSEError
2 | 
3 | struct CUSPARSEError <: Exception
4 |     code::cusparseStatus_t
5 |     msg::AbstractString
6 | end
7 | Base.show(io::IO, err::CUSPARSEError) = print(io, "CUSPARSEError(code $(err.code), $(err.msg))")
8 | 
9 | function CUSPARSEError(code::cusparseStatus_t)
10 |     msg = status_message(code)
11 |     return CUSPARSEError(code, msg)
12 | end
13 | 
14 | 
15 | function status_message(status)
16 |     if status == CUSPARSE_STATUS_SUCCESS
17 |         return "cusparse success"
18 |     end
19 |     if status == CUSPARSE_STATUS_NOT_INITIALIZED
20 |         return "cusparse not initialized"
21 |     end
22 |     if status == CUSPARSE_STATUS_ALLOC_FAILED
23 |         return "cusparse allocation failed"
24 |     end
25 |     if status == CUSPARSE_STATUS_INVALID_VALUE
26 |         return "cusparse invalid value"
27 |     end
28 |     if status == CUSPARSE_STATUS_ARCH_MISMATCH
29 |         return "cusparse architecture mismatch"
30 |     end
31 |     if status == CUSPARSE_STATUS_MAPPING_ERROR
32 |         return "cusparse mapping error"
33 |     end
34 |     if status == CUSPARSE_STATUS_EXECUTION_FAILED
35 |         return "cusparse execution failed"
36 |     end
37 |     if status == CUSPARSE_STATUS_INTERNAL_ERROR
38 |         return "cusparse internal error"
39 |     end
40 |     if status == CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED
41 |         return "cusparse matrix type not supported"
42 |     end
43 |     return "cusparse unknown status"
44 | end
45 | 
46 | macro check(sparse_func)
47 |     quote
48 |         local err = $(esc(sparse_func::Expr))
49 |         if err != CUSPARSE_STATUS_SUCCESS
50 |             throw(CUSPARSEError(cusparseStatus_t(err)))
51 |         end
52 |         err
53 |     end
54 | end
55 | 
56 | 
--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
1 | variables:
2 |   CI_IMAGE_TAG: 'cuda'
3 | 
4 | include:
5 |   - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/common.yml'
6 |   - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/test_v1.0.yml'
7 |   - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/test_v1.1.yml'
8 |   - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/test_dev.yml'
9 |   - 
'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/coverage_v1.1.yml' 10 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/documentation_v1.1.yml' 11 | 12 | test:v1.0: 13 | only: 14 | - master 15 | - staging 16 | - trying 17 | 18 | test:v1.1: 19 | only: 20 | - master 21 | - staging 22 | - trying 23 | 24 | test:dev: 25 | allow_failure: true 26 | only: 27 | - master 28 | - staging 29 | - trying 30 | 31 | coverage: 32 | allow_failure: true 33 | only: 34 | - master 35 | - staging 36 | - trying 37 | 38 | documentation: 39 | only: 40 | - master 41 | - staging 42 | - trying 43 | 44 | pages: 45 | stage: deploy 46 | script: 47 | - mv docs/build public 48 | artifacts: 49 | paths: 50 | - public 51 | only: 52 | - master 53 | 54 | flux: 55 | stage: test 56 | image: "juliagpu/julia:v1.1-cuda" 57 | script: 58 | - mkdir $JULIA_DEPOT_PATH # Pkg.jl#325 59 | - julia -e 'using Pkg; 60 | Pkg.develop(PackageSpec(path=pwd())); 61 | Pkg.build(); 62 | Pkg.add(PackageSpec(name="Flux", rev="master")); 63 | Pkg.test("Flux");' 64 | allow_failure: true 65 | only: 66 | - master 67 | - staging 68 | - trying 69 | -------------------------------------------------------------------------------- /src/fft/genericfft.jl: -------------------------------------------------------------------------------- 1 | cufftfloat(x) = _cufftfloat(float(x)) 2 | _cufftfloat(::Type{T}) where {T<:cufftReals} = T 3 | _cufftfloat(::Type{Float16}) = Float32 4 | _cufftfloat(::Type{Complex{T}}) where {T} = Complex{_cufftfloat(T)} 5 | _cufftfloat(::Type{T}) where {T} = error("type $T not supported") 6 | _cufftfloat(x::T) where {T} = _cufftfloat(T)(x) 7 | 8 | complexfloat(x::CuArray{Complex{<:cufftReals}}) = x 9 | realfloat(x::CuArray{<:cufftReals}) = x 10 | 11 | complexfloat(x::CuArray{T}) where {T<:Complex} = copy1(typeof(cufftfloat(zero(T))), x) 12 | complexfloat(x::CuArray{T}) where {T<:Real} = copy1(typeof(complex(cufftfloat(zero(T)))), x) 13 | 14 | realfloat(x::CuArray{T}) where {T<:Real} = copy1(typeof(cufftfloat(zero(T))), x) 15 | 16 | function copy1(::Type{T}, x) where T 17 | y = CuArray{T}(undef, map(length, axes(x))) 18 | #copy!(y, x) 19 | y .= broadcast(xi->convert(T,xi),x) 20 | end 21 | 22 | # promote to a complex floating-point type (out-of-place only), 23 | # so implementations only need Complex{Float} methods 24 | for f in (:fft, :bfft, :ifft) 25 | pf = Symbol("plan_", f) 26 | @eval begin 27 | $f(x::CuArray{<:Real}, region=1:ndims(x)) = $f(complexfloat(x), region) 28 | $pf(x::CuArray{<:Real}, region) = $pf(complexfloat(x), region) 29 | $f(x::CuArray{<:Complex{<:Union{Integer,Rational}}}, region=1:ndims(x)) = $f(complexfloat(x), region) 30 | $pf(x::CuArray{<:Complex{<:Union{Integer,Rational}}}, region) = $pf(complexfloat(x), region) 31 | end 32 | end 33 | rfft(x::CuArray{<:Union{Integer,Rational}}, region=1:ndims(x)) = rfft(realfloat(x), region) 34 | plan_rfft(x::CuArray{<:Real}, region) = plan_rfft(realfloat(x), region) 35 | 36 | *(p::Plan{T}, x::CuArray) where {T} = p * copy1(T, x) 37 | *(p::ScaledPlan, x::CuArray) = rmul!(p.p * x, p.scale) 38 | -------------------------------------------------------------------------------- /src/blas/error.jl: -------------------------------------------------------------------------------- 1 | export CUBLASError 2 | 3 | struct CUBLASError <: Exception 4 | code::cublasStatus_t 5 | msg::AbstractString 6 | end 7 | Base.show(io::IO, err::CUBLASError) = print(io, "CUBLASError(code $(err.code), $(err.msg))") 8 | 9 | function 
CUBLASError(code::cublasStatus_t) 10 | msg = status_message(code) 11 | return CUBLASError(code, msg) 12 | end 13 | 14 | function status_message(status) 15 | if status == CUBLAS_STATUS_SUCCESS 16 | return "the operation completed successfully" 17 | elseif status == CUBLAS_STATUS_NOT_INITIALIZED 18 | return "the library was not initialized" 19 | elseif status == CUBLAS_STATUS_ALLOC_FAILED 20 | return "the resource allocation failed" 21 | elseif status == CUBLAS_STATUS_INVALID_VALUE 22 | return "an invalid value was used as an argument" 23 | elseif status == CUBLAS_STATUS_ARCH_MISMATCH 24 | return "an absent device architectural feature is required" 25 | elseif status == CUBLAS_STATUS_MAPPING_ERROR 26 | return "an access to GPU memory space failed" 27 | elseif status == CUBLAS_STATUS_EXECUTION_FAILED 28 | return "the GPU program failed to execute" 29 | elseif status == CUBLAS_STATUS_INTERNAL_ERROR 30 | return "an internal operation failed" 31 | elseif status == CUBLAS_STATUS_NOT_SUPPORTED 32 | return "the requested feature is not supported" 33 | elseif status == CUBLAS_STATUS_LICENSE_ERROR 34 | return "error detected trying to check the license" 35 | else 36 | return "unknown status" 37 | end 38 | end 39 | 40 | macro check(blas_func) 41 | quote 42 | local err::cublasStatus_t 43 | err = $(esc(blas_func::Expr)) 44 | if err != CUBLAS_STATUS_SUCCESS 45 | throw(CUBLASError(err)) 46 | end 47 | err 48 | end 49 | end -------------------------------------------------------------------------------- /src/solver/CUSOLVER.jl: -------------------------------------------------------------------------------- 1 | module CUSOLVER 2 | 3 | import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL 4 | import CUDAapi 5 | 6 | using ..CuArrays 7 | using ..CuArrays: libcusolver, active_context, _getindex, unsafe_free! 
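# NOTE: CUSOLVER wraps two library APIs with separate handles: cusolverDn for
# dense problems and cusolverSp for sparse ones; `dense_handle()` and
# `sparse_handle()` below create and cache them lazily, one per CUDA context.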
8 | 
9 | using LinearAlgebra
10 | using SparseArrays
11 | 
12 | import Base.one
13 | import Base.zero
14 | import CuArrays.CUSPARSE.CuSparseMatrixCSR
15 | import CuArrays.CUSPARSE.CuSparseMatrixCSC
16 | import CuArrays.CUSPARSE.cusparseMatDescr_t
17 | 
18 | include("libcusolver_types.jl")
19 | include("error.jl")
20 | 
21 | const _dense_handles = Dict{CuContext,cusolverDnHandle_t}()
22 | const _dense_handle = Ref{cusolverDnHandle_t}(C_NULL)
23 | const _sparse_handles = Dict{CuContext,cusolverSpHandle_t}()
24 | const _sparse_handle = Ref{cusolverSpHandle_t}(C_NULL)
25 | 
26 | function dense_handle()
27 |     if _dense_handle[] == C_NULL
28 |         @assert isassigned(active_context) # some other call should have initialized CUDA
29 |         _dense_handle[] = get!(_dense_handles, active_context[]) do
30 |             context = active_context[]
31 |             handle = cusolverDnCreate()
32 |             atexit(()->CUDAdrv.isvalid(context) && cusolverDnDestroy(handle))
33 |             handle
34 |         end
35 |     end
36 |     return _dense_handle[]
37 | end
38 | 
39 | function sparse_handle()
40 |     if _sparse_handle[] == C_NULL
41 |         @assert isassigned(active_context) # some other call should have initialized CUDA
42 |         _sparse_handle[] = get!(_sparse_handles, active_context[]) do
43 |             context = active_context[]
44 |             handle = cusolverSpCreate()
45 |             atexit(()->CUDAdrv.isvalid(context) && cusolverSpDestroy(handle))
46 |             handle
47 |         end
48 |     end
49 |     return _sparse_handle[]
50 | end
51 | 
52 | include("libcusolver.jl")
53 | include("sparse.jl")
54 | include("dense.jl")
55 | include("highlevel.jl")
56 | 
57 | version() = VersionNumber(cusolverGetProperty(CUDAapi.MAJOR_VERSION),
58 |                           cusolverGetProperty(CUDAapi.MINOR_VERSION),
59 |                           cusolverGetProperty(CUDAapi.PATCH_LEVEL))
60 | 
61 | end
62 | 
--------------------------------------------------------------------------------
/src/rand/error.jl:
--------------------------------------------------------------------------------
1 | export CURANDError
2 | 
3 | struct CURANDError <: Exception
4 |     code::curandStatus_t
5 |     msg::AbstractString
6 | end
7 | Base.show(io::IO, err::CURANDError) = print(io, "CURANDError(code $(err.code), $(err.msg))")
8 | 
9 | function CURANDError(code::curandStatus_t)
10 |     msg = status_message(code)
11 |     return CURANDError(code, msg)
12 | end
13 | 
14 | function status_message(status)
15 |     if status == CURAND_STATUS_SUCCESS
16 |         return "generator was created successfully"
17 |     elseif status == CURAND_STATUS_VERSION_MISMATCH
18 |         return "Header file and linked library version do not match"
19 |     elseif status == CURAND_STATUS_NOT_INITIALIZED
20 |         return "Generator not initialized"
21 |     elseif status == CURAND_STATUS_ALLOCATION_FAILED
22 |         return "Memory allocation failed"
23 |     elseif status == CURAND_STATUS_TYPE_ERROR
24 |         return "Generator is wrong type"
25 |     elseif status == CURAND_STATUS_OUT_OF_RANGE
26 |         return "Argument out of range"
27 |     elseif status == CURAND_STATUS_LENGTH_NOT_MULTIPLE
28 |         return "Length requested is not a multiple of dimension"
29 |     elseif status == CURAND_STATUS_DOUBLE_PRECISION_REQUIRED
30 |         return "GPU does not have double precision required by MRG32k3a"
31 |     elseif status == CURAND_STATUS_LAUNCH_FAILURE
32 |         return "Kernel launch failure"
33 |     elseif status == CURAND_STATUS_PREEXISTING_FAILURE
34 |         return "Preexisting failure on library entry"
35 |     elseif status == CURAND_STATUS_INITIALIZATION_FAILED
36 |         return "Initialization of CUDA failed"
37 |     elseif status == CURAND_STATUS_ARCH_MISMATCH
38 |         return "Architecture mismatch, GPU does not support requested feature"
39 |     elseif status == 
CURAND_STATUS_INTERNAL_ERROR 40 | return "Internal library error" 41 | else 42 | return "unknown status" 43 | end 44 | end 45 | 46 | macro check(func) 47 | quote 48 | local err::curandStatus_t 49 | err = $(esc(func::Expr)) 50 | if err != CURAND_STATUS_SUCCESS 51 | throw(CURANDError(err)) 52 | end 53 | err 54 | end 55 | end 56 | -------------------------------------------------------------------------------- /src/blas/README.md: -------------------------------------------------------------------------------- 1 | # CUBLAS implementation progress 2 | 3 | The following sections list the CUBLAS functions shown on the CUBLAS 4 | documentation page: 5 | 6 | http://docs.nvidia.com/cuda/cublas/index.html 7 | 8 | ## Level 1 (13 functions) 9 | 10 | CUBLAS functions: 11 | 12 | * [x] amax 13 | * [x] amin 14 | * [x] asum 15 | * [x] axpy 16 | * [x] copy 17 | * [x] dot, dotc, dotu 18 | * [x] nrm2 19 | * [ ] rot (not implemented in julia blas.jl) 20 | * [ ] rotg (not implemented in julia blas.jl) 21 | * [ ] rotm (not implemented in julia blas.jl) 22 | * [ ] rotmg (not implemented in julia blas.jl) 23 | * [x] scal 24 | * [ ] swap (not implemented in julia blas.jl) 25 | 26 | ## Level 2 27 | 28 | Key: 29 | * `ge`: general 30 | * `gb`: general banded 31 | * `sy`: symmetric 32 | * `sb`: symmetric banded 33 | * `sp`: symmetric packed 34 | * `tr`: triangular 35 | * `tb`: triangular banded 36 | * `tp`: triangular packed 37 | * `he`: hermitian 38 | * `hb`: hermitian banded 39 | * `hp`: hermitian packed 40 | 41 | CUBLAS functions: 42 | 43 | * [x] gbmv (in julia/blas.jl) 44 | * [x] gemv (in julia/blas.jl) 45 | * [x] ger (in julia/blas.jl) 46 | * [x] sbmv (in julia/blas.jl) 47 | * [ ] spmv 48 | * [ ] spr 49 | * [ ] spr2 50 | * [x] symv (in julia/blas.jl) 51 | * [x] syr (in julia/blas.jl) 52 | * [ ] syr2 53 | * [x] tbmv 54 | * [x] tbsv 55 | * [ ] tpmv 56 | * [ ] tpsv 57 | * [x] trmv (in julia/blas.jl) 58 | * [x] trsv (in julia/blas.jl) 59 | * [x] hemv (in julia/blas.jl) 60 | * [x] hbmv 61 | * [ ] hpmv 62 | * [x] her (in julia/blas.jl) 63 | * [x] her2 64 | * [ ] hpr 65 | * [ ] hpr2 66 | 67 | ## Level 3 68 | 69 | CUBLAS functions: 70 | 71 | * [x] gemm (in julia/blas.jl) 72 | * [x] gemmBatched 73 | * [x] symm (in julia/blas.jl) 74 | * [x] syrk (in julia/blas.jl) 75 | * [x] syr2k (in julia/blas.jl) 76 | * [ ] syrkx 77 | * [x] trmm (in julia/blas.jl) 78 | * [x] trsm (in julia/blas.jl) 79 | * [x] trsmBatched 80 | * [x] hemm 81 | * [x] herk (in julia/blas.jl) 82 | * [x] her2k (in julia/blas.jl) 83 | * [ ] herkx 84 | 85 | ## BLAS-like extensions 86 | 87 | * [x] geam 88 | * [x] dgmm 89 | * [x] getrfBatched 90 | * [x] getriBatched 91 | * [x] geqrfBatched 92 | * [x] gelsBatched 93 | * [ ] tpttr 94 | * [ ] trttp 95 | -------------------------------------------------------------------------------- /src/utils.jl: -------------------------------------------------------------------------------- 1 | using Base.Cartesian 2 | 3 | function cudims(n::Integer) 4 | threads = min(n, 256) 5 | ceil(Int, n / threads), threads 6 | end 7 | 8 | cudims(a::AbstractArray) = cudims(length(a)) 9 | 10 | @inline ind2sub_(a::AbstractArray{T,0}, i) where T = () 11 | @inline ind2sub_(a, i) = Tuple(CartesianIndices(a)[i]) 12 | 13 | macro cuindex(A) 14 | quote 15 | A = $(esc(A)) 16 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 17 | i > length(A) && return 18 | ind2sub_(A, i) 19 | end 20 | end 21 | 22 | 23 | @generated function nindex(i::T, ls::NTuple{N,T}) where {N,T} 24 | na = one(i) 25 | quote 26 | Base.@_inline_meta 27 | $(foldr((n, els) -> :(i ≤ 
ls[$n] ? ($n, i) : (i -= ls[$n]; $els)), one(i):i(N); init=:($na, $na)))
28 |   end
29 | end
30 | 
31 | @inline function catindex(dim, I::NTuple{N}, shapes) where N
32 |     @inbounds x, i = nindex(I[dim], getindex.(shapes, dim))
33 |     x, ntuple(n -> n == dim ? i : I[n], Val(N))
34 | end
35 | 
36 | function growdims(dim, x)
37 |     if ndims(x) >= dim
38 |         x
39 |     else
40 |         reshape(x, size.((x,), 1:dim)...)
41 |     end
42 | end
43 | 
44 | function _cat(dim, dest, xs...)
45 |     function kernel(dim, dest, xs)
46 |         I = @cuindex dest
47 |         @inbounds n, I′ = catindex(dim, Int.(I), size.(xs))
48 |         @inbounds dest[I...] = xs[n][I′...]
49 |         return
50 |     end
51 |     xs = growdims.(dim, xs)
52 |     blk, thr = cudims(dest)
53 |     @cuda blocks=blk threads=thr kernel(dim, dest, xs)
54 |     return dest
55 | end
56 | 
57 | function Base.cat_t(dims::Integer, T::Type, x::CuArray, xs::CuArray...)
58 |     catdims = Base.dims2cat(dims)
59 |     shape = Base.cat_shape(catdims, (), size.((x, xs...))...)
60 |     dest = Base.cat_similar(x, T, shape)
61 |     _cat(dims, dest, x, xs...)
62 | end
63 | 
64 | Base.vcat(xs::CuArray...) = cat(xs..., dims=1)
65 | Base.hcat(xs::CuArray...) = cat(xs..., dims=2)
66 | 
67 | 
68 | """
69 |     @sync ex
70 | 
71 | Run expression `ex` and synchronize the GPU afterwards. This is a CPU-friendly
72 | synchronization, i.e. it performs a blocking synchronization without increasing CPU load. As
73 | such, this operation is preferred over implicit synchronization (e.g. when performing a
74 | memory copy) for high-performance applications.
75 | 
76 | It is also useful for timing code that executes asynchronously.
77 | """
78 | macro sync(ex)
79 |     quote
80 |         local e = CuEvent(CUDAdrv.EVENT_BLOCKING_SYNC | CUDAdrv.EVENT_DISABLE_TIMING)
81 |         local ret = $(esc(ex))
82 |         CUDAdrv.record(e)
83 |         CUDAdrv.synchronize(e)
84 |         ret
85 |     end
86 | end
87 | 
--------------------------------------------------------------------------------
/src/broadcast.jl:
--------------------------------------------------------------------------------
1 | import Base.Broadcast: Broadcasted, Extruded, BroadcastStyle, ArrayStyle
2 | 
3 | BroadcastStyle(::Type{<:CuArray}) = ArrayStyle{CuArray}()
4 | 
5 | function Base.similar(bc::Broadcasted{ArrayStyle{CuArray}}, ::Type{T}) where T
6 |     similar(CuArray{T}, axes(bc))
7 | end
8 | 
9 | 
10 | # replace base functions with libdevice alternatives
11 | # TODO: do this with Cassette.jl
12 | 
13 | cufunc(f) = f
14 | cufunc(::Type{T}) where T = (x...) -> T(x...) # broadcasting type ctors isn't GPU compatible
15 | 
16 | Broadcast.broadcasted(::ArrayStyle{CuArray}, f, args...) =
17 |     Broadcasted{ArrayStyle{CuArray}}(cufunc(f), args, nothing)
18 | 
19 | libdevice = :[
20 |     cos, cospi, sin, sinpi, tan, acos, asin, atan,
21 |     cosh, sinh, tanh, acosh, asinh, atanh,
22 |     log, log10, log1p, log2, logb, ilogb,
23 |     exp, exp2, exp10, expm1, ldexp,
24 |     erf, erfinv, erfc, erfcinv, erfcx,
25 |     brev, clz, ffs, byte_perm, popc,
26 |     isfinite, isinf, isnan, nearbyint,
27 |     nextafter, signbit, copysign, abs,
28 |     sqrt, rsqrt, cbrt, rcbrt, pow,
29 |     ceil, floor, saturate,
30 |     lgamma, tgamma,
31 |     j0, j1, jn, y0, y1, yn,
32 |     normcdf, normcdfinv, hypot,
33 |     fma, sad, dim, mul24, mul64hi, hadd, rhadd, scalbn].args
34 | 
35 | for f in libdevice
36 |     isdefined(Base, f) || continue
37 |     @eval cufunc(::typeof(Base.$f)) = CUDAnative.$f
38 | end
39 | 
40 | using MacroTools
41 | 
42 | const _cufuncs = copy(libdevice)
43 | cufuncs() = (global _cufuncs; _cufuncs)
44 | 
45 | function replace_device(ex)
46 |     global _cufuncs
47 |     MacroTools.postwalk(ex) do x
48 |         x in _cufuncs ? 
:(CuArrays.cufunc($x)) : x 49 | end 50 | end 51 | 52 | macro cufunc(ex) 53 | global _cufuncs 54 | def = MacroTools.splitdef(ex) 55 | f = def[:name] 56 | def[:name] = Symbol(:cu, f) 57 | def[:body] = replace_device(def[:body]) 58 | push!(_cufuncs, f) 59 | quote 60 | $(esc(MacroTools.combinedef(def))) 61 | CuArrays.cufunc(::typeof($(esc(f)))) = $(esc(def[:name])) 62 | end 63 | end 64 | 65 | # ForwardDiff Integration 66 | using ForwardDiff: Dual, value, partials, unary_dual_definition 67 | using DiffRules 68 | 69 | for f in libdevice 70 | if haskey(DiffRules.DEFINED_DIFFRULES, (:Base,f,1)) 71 | f == :tanh && continue 72 | diffrule = DiffRules.DEFINED_DIFFRULES[(:Base,f,1)] 73 | DiffRules.DEFINED_DIFFRULES[(:CUDAnative,f,1)] = 74 | (args...) -> replace_device(diffrule(args...)) 75 | eval(unary_dual_definition(:CUDAnative, f)) 76 | end 77 | end 78 | 79 | DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :tanh, 1)] = x -> 80 | replace_device(:(1-tanh(x)^2)) 81 | eval(unary_dual_definition(:CUDAnative, :tanh)) 82 | -------------------------------------------------------------------------------- /deps/build.jl: -------------------------------------------------------------------------------- 1 | using CUDAapi 2 | using CUDAdrv 3 | using CUDAnative 4 | 5 | 6 | ## auxiliary routines 7 | 8 | status = 0 9 | function build_warning(reason) 10 | println("$reason.") 11 | global status 12 | status = 1 13 | # NOTE: it's annoying that we have to `exit(1)`, but otherwise messages are hidden 14 | end 15 | 16 | function build_error(reason) 17 | println(reason) 18 | exit(1) 19 | end 20 | 21 | 22 | ## main 23 | 24 | config_path = joinpath(@__DIR__, "ext.jl") 25 | const previous_config_path = config_path * ".bak" 26 | 27 | function write_ext(config) 28 | open(config_path, "w") do io 29 | println(io, "# autogenerated file, do not edit") 30 | for (key,val) in config 31 | println(io, "const $key = $(repr(val))") 32 | end 33 | end 34 | end 35 | 36 | function main() 37 | ispath(config_path) && mv(config_path, previous_config_path; force=true) 38 | config = Dict{Symbol,Any}(:configured => false) 39 | write_ext(config) 40 | 41 | 42 | ## discover stuff 43 | 44 | CUDAdrv.configured || build_error("Dependent package CUDAdrv.jl has not been built successfully") 45 | CUDAnative.configured || build_error("Dependent package CUDAnative.jl has not been built successfully") 46 | 47 | toolkit = find_toolkit() 48 | 49 | for name in ("cublas", "cusparse", "cusolver", "cufft", "curand", "cudnn") 50 | lib = Symbol("lib$name") 51 | config[lib] = find_cuda_library(name, toolkit) 52 | if config[lib] == nothing 53 | build_warning("Could not find library '$name'") 54 | end 55 | end 56 | 57 | 58 | ## (re)generate ext.jl 59 | 60 | function globals(mod) 61 | all_names = names(mod, all=true) 62 | filter(name-> !any(name .== [nameof(mod), Symbol("#eval"), :eval]), all_names) 63 | end 64 | 65 | if isfile(previous_config_path) 66 | @eval module Previous; include($previous_config_path); end 67 | previous_config = Dict{Symbol,Any}(name => getfield(Previous, name) 68 | for name in globals(Previous)) 69 | 70 | if config == previous_config 71 | mv(previous_config_path, config_path; force=true) 72 | return 73 | end 74 | end 75 | 76 | config[:configured] = true 77 | write_ext(config) 78 | 79 | if status != 0 80 | # we got here, so the status is non-fatal 81 | build_error(""" 82 | 83 | CuArrays.jl has been built successfully, but there were warnings. 
84 | Some functionality may be unavailable.""") 85 | end 86 | end 87 | 88 | main() 89 | -------------------------------------------------------------------------------- /src/gpuarray_interface.jl: -------------------------------------------------------------------------------- 1 | import GPUArrays 2 | 3 | struct CuArrayBackend <: GPUArrays.GPUBackend end 4 | GPUArrays.backend(::Type{<:CuArray}) = CuArrayBackend() 5 | 6 | 7 | #Abstract GPU interface 8 | struct CuKernelState end 9 | 10 | @inline function GPUArrays.LocalMemory(::CuKernelState, ::Type{T}, ::Val{N}, ::Val{id}) where {T, N, id} 11 | ptr = CUDAnative._shmem(Val(id), T, Val(N)) 12 | CuDeviceArray(N, DevicePtr{T, CUDAnative.AS.Shared}(ptr)) 13 | end 14 | 15 | GPUArrays.AbstractDeviceArray(A::CUDAnative.CuDeviceArray, shape) = CUDAnative.CuDeviceArray(shape, pointer(A)) 16 | 17 | @inline GPUArrays.synchronize_threads(::CuKernelState) = CUDAnative.sync_threads() 18 | 19 | GPUArrays.blas_module(::CuArray) = CuArrays.CUBLAS 20 | GPUArrays.blasbuffer(x::CuArray) = x 21 | 22 | """ 23 | Blocks until all operations are finished on `A` 24 | """ 25 | GPUArrays.synchronize(A::CuArray) = 26 | CUDAdrv.synchronize() 27 | 28 | for (i, sym) in enumerate((:x, :y, :z)) 29 | for (f, fcu) in ( 30 | (:blockidx, :blockIdx), 31 | (:blockdim, :blockDim), 32 | (:threadidx, :threadIdx), 33 | (:griddim, :gridDim) 34 | ) 35 | fname = Symbol(string(f, '_', sym)) 36 | cufun = Symbol(string(fcu, '_', sym)) 37 | @eval GPUArrays.$fname(::CuKernelState) = CUDAnative.$cufun() 38 | end 39 | end 40 | 41 | # devices() = CUDAdrv.devices() 42 | GPUArrays.device(A::CuArray) = CUDAdrv.device(CUDAdrv.CuCurrentContext()) 43 | GPUArrays.is_gpu(dev::CUDAdrv.CuDevice) = true 44 | GPUArrays.name(dev::CUDAdrv.CuDevice) = string("CU ", CUDAdrv.name(dev)) 45 | GPUArrays.threads(dev::CUDAdrv.CuDevice) = CUDAdrv.attribute(dev, CUDAdrv.MAX_THREADS_PER_BLOCK) 46 | 47 | GPUArrays.blocks(dev::CUDAdrv.CuDevice) = 48 | (CUDAdrv.attribute(dev, CUDAdrv.MAX_BLOCK_DIM_X), 49 | CUDAdrv.attribute(dev, CUDAdrv.MAX_BLOCK_DIM_Y), 50 | CUDAdrv.attribute(dev, CUDAdrv.MAX_BLOCK_DIM_Z)) 51 | 52 | GPUArrays.free_global_memory(dev::CUDAdrv.CuDevice) = CUDAdrv.Mem.info()[1] 53 | GPUArrays.global_memory(dev::CUDAdrv.CuDevice) = CUDAdrv.totalmem(dev) 54 | GPUArrays.local_memory(dev::CUDAdrv.CuDevice) = CUDAdrv.attribute(dev, CUDAdrv.TOTAL_CONSTANT_MEMORY) 55 | 56 | function GPUArrays._gpu_call(::CuArrayBackend, f, A, args::Tuple, 57 | blocks_threads::Tuple{T, T}) where {N, T <: NTuple{N, Integer}} 58 | blk, thr = blocks_threads 59 | @cuda blocks=blk threads=thr f(CuKernelState(), args...) 
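    # the block/thread configuration is supplied by GPUArrays' launch heuristics;
    # passing a CuKernelState lets generic GPUArrays kernels reach the index and
    # shared-memory functions defined above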
60 | end 61 | 62 | # Save reinterpret and reshape implementation use this in GPUArrays 63 | GPUArrays.unsafe_reinterpret(::Type{T}, A::CuArray, size::NTuple{N, Integer}) where {T, N} = 64 | CuArray{T, N}(A.buf, size) 65 | -------------------------------------------------------------------------------- /src/fft/error.jl: -------------------------------------------------------------------------------- 1 | export CUFFTError 2 | 3 | struct CUFFTError <: Exception 4 | code::cufftStatus_t 5 | msg::AbstractString 6 | end 7 | Base.show(io::IO, err::CUFFTError) = print(io, "CUFFTError(code $(err.code), $(err.msg))") 8 | 9 | function CUFFTError(code::cufftStatus_t) 10 | msg = status_message(code) 11 | return CUFFTError(code, msg) 12 | end 13 | 14 | function status_message(status) 15 | if status == CUFFT_STATUS_SUCCESS 16 | return "the operation completed successfully" 17 | elseif status == CUFFT_STATUS_INVALID_PLAN 18 | return "cuFFT was passed an invalid plan handle" 19 | elseif status == CUFFT_STATUS_ALLOC_FAILED 20 | return "cuFFT failed to allocate GPU or CPU memory" 21 | elseif status == CUFFT_STATUS_INVALID_TYPE 22 | return "cuFFT invalid type " # No longer used 23 | elseif status == CUFFT_STATUS_INVALID_VALUE 24 | return "User specified an invalid pointer or parameter" 25 | elseif status == CUFFT_STATUS_INTERNAL_ERROR 26 | return "Driver or internal cuFFT library error" 27 | elseif status == CUFFT_STATUS_EXEC_FAILED 28 | return "Failed to execute an FFT on the GPU" 29 | elseif status == CUFFT_STATUS_SETUP_FAILED 30 | return "The cuFFT library failed to initialize" 31 | elseif status == CUFFT_STATUS_INVALID_SIZE 32 | return "User specified an invalid transform size" 33 | elseif status == CUFFT_STATUS_UNALIGNED_DATA 34 | return "cuFFT unaligned data" # No longer used 35 | elseif status == CUFFT_STATUS_INCOMPLETE_PARAMETER_LIST 36 | return "Missing parameters in call" 37 | elseif status == CUFFT_STATUS_INVALID_DEVICE 38 | return "Execution of a plan was on different GPU than plan creation" 39 | elseif status == CUFFT_STATUS_PARSE_ERROR 40 | return "Internal plan database error" 41 | elseif status == CUFFT_STATUS_NO_WORKSPACE 42 | return "No workspace has been provided prior to plan execution" 43 | elseif status == CUFFT_STATUS_NOT_IMPLEMENTED 44 | return "Function does not implement functionality for parameters given." 45 | elseif status == CUFFT_STATUS_LICENSE_ERROR 46 | return "cuFFT license error" # Used in previous versions. 47 | elseif status == CUFFT_STATUS_NOT_SUPPORTED 48 | return "Operation is not supported for parameters given." 
49 | else 50 | return "unknown status" 51 | end 52 | end 53 | 54 | macro check(fft_func) 55 | quote 56 | local err::cufftStatus_t 57 | err = $(esc(fft_func::Expr)) 58 | if err != CUFFT_STATUS_SUCCESS 59 | throw(CUFFTError(err)) 60 | end 61 | err 62 | end 63 | end 64 | -------------------------------------------------------------------------------- /src/rand/libcurand_types.jl: -------------------------------------------------------------------------------- 1 | const curandGenerator_t = Ptr{Cvoid} 2 | 3 | mutable struct RNG <: Random.AbstractRNG 4 | ptr::curandGenerator_t 5 | typ::Int 6 | end 7 | 8 | Base.unsafe_convert(::Type{curandGenerator_t}, rng::RNG) = rng.ptr 9 | 10 | 11 | const curandDiscreteDistribution_t = Ptr{Cvoid} 12 | 13 | mutable struct DiscreteDistribution 14 | ptr::curandDiscreteDistribution_t 15 | end 16 | 17 | Base.unsafe_convert(::Type{curandDiscreteDistribution_t}, dist::DiscreteDistribution) = dist.ptr 18 | 19 | 20 | # CURAND status codes 21 | const curandStatus_t = UInt32 22 | const CURAND_STATUS_SUCCESS = 0 23 | const CURAND_STATUS_VERSION_MISMATCH = 100 24 | const CURAND_STATUS_NOT_INITIALIZED = 101 25 | const CURAND_STATUS_ALLOCATION_FAILED = 102 26 | const CURAND_STATUS_TYPE_ERROR = 103 27 | const CURAND_STATUS_OUT_OF_RANGE = 104 28 | const CURAND_STATUS_LENGTH_NOT_MULTIPLE = 105 29 | const CURAND_STATUS_DOUBLE_PRECISION_REQUIRED = 106 30 | const CURAND_STATUS_LAUNCH_FAILURE = 201 31 | const CURAND_STATUS_PREEXISTING_FAILURE = 202 32 | const CURAND_STATUS_INITIALIZATION_FAILED = 203 33 | const CURAND_STATUS_ARCH_MISMATCH = 204 34 | const CURAND_STATUS_INTERNAL_ERROR = 999 35 | 36 | # CURAND RNG types (curandRngType) 37 | const CURAND_RNG_TEST = 0 38 | const CURAND_RNG_PSEUDO_DEFAULT = 100 39 | const CURAND_RNG_PSEUDO_XORWOW = 101 40 | const CURAND_RNG_PSEUDO_MRG32K3A = 121 41 | const CURAND_RNG_PSEUDO_MTGP32 = 141 42 | const CURAND_RNG_PSEUDO_MT19937 = 142 43 | const CURAND_RNG_PSEUDO_PHILOX4_32_10 = 161 44 | const CURAND_RNG_QUASI_DEFAULT = 200 45 | const CURAND_RNG_QUASI_SOBOL32 = 201 46 | const CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 = 202 47 | const CURAND_RNG_QUASI_SOBOL64 = 203 48 | const CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 = 204 49 | 50 | # CURAND ordering of results in memory 51 | const CURAND_ORDERING_PSEUDO_BEST = 100 52 | const CURAND_ORDERING_PSEUDO_DEFAULT = 101 53 | const CURAND_ORDERING_PSEUDO_SEEDED = 102 54 | const CURAND_ORDERING_QUASI_DEFAULT = 201 55 | 56 | # CURAND choice of direction vector set 57 | const CURAND_DIRECTION_VECTORS_32_JOEKUO6 = 101 58 | const CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 = 102 59 | const CURAND_DIRECTION_VECTORS_64_JOEKUO6 = 103 60 | const CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 = 104 61 | 62 | # CURAND method 63 | const CURAND_CHOOSE_BEST = 0 64 | const CURAND_ITR = 1 65 | const CURAND_KNUTH = 2 66 | const CURAND_HITR = 3 67 | const CURAND_M1 = 4 68 | const CURAND_M2 = 5 69 | const CURAND_BINARY_SEARCH = 6 70 | const CURAND_DISCRETE_GAUSS = 7 71 | const CURAND_REJECTION = 8 72 | const CURAND_DEVICE_API = 9 73 | const CURAND_FAST_REJECTION = 10 74 | const CURAND_3RD = 11 75 | const CURAND_DEFINITION = 12 76 | const CURAND_POISSON = 13 77 | -------------------------------------------------------------------------------- /src/solver/libcusolver_types.jl: -------------------------------------------------------------------------------- 1 | import ..CUBLAS: cublasfill, cublasop, cublasside, cublasFillMode_t, cublasOperation_t, cublasSideMode_t 2 | 3 | #enum cusolverStatus_t 4 | #error messages from CUSOLVER 5 | 6 | 
const cusolverStatus_t = UInt32 7 | const CUSOLVER_STATUS_SUCCESS = 0 8 | const CUSOLVER_STATUS_NOT_INITIALIZED = 1 9 | const CUSOLVER_STATUS_ALLOC_FAILED = 2 10 | const CUSOLVER_STATUS_INVALID_VALUE = 3 11 | const CUSOLVER_STATUS_ARCH_MISMATCH = 4 12 | const CUSOLVER_STATUS_EXECUTION_FAILED = 5 13 | const CUSOLVER_STATUS_INTERNAL_ERROR = 6 14 | const CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 7 15 | 16 | const csrqrInfo_t = Ptr{Nothing} 17 | const gesvdjInfo_t = Ptr{Cvoid} 18 | const syevjInfo_t = Ptr{Cvoid} 19 | 20 | const cusolverEigMode_t = UInt32 21 | const CUSOLVER_EIG_MODE_NOVECTOR = 0 22 | const CUSOLVER_EIG_MODE_VECTOR = 1 23 | 24 | const cusolverEigType_t = UInt32 25 | const CUSOLVER_EIG_TYPE_1 = 1 26 | const CUSOLVER_EIG_TYPE_2 = 2 27 | const CUSOLVER_EIG_TYPE_3 = 3 28 | 29 | # refactorization types 30 | 31 | const cusolverRfNumericBoostReport_t = UInt32 32 | const CUSOLVER_NUMERIC_BOOST_NOT_USED = 0 33 | const CUSOLVER_NUMERIC_BOOST_USED = 1 34 | 35 | const cusolverRfResetValuesFastMode_t = UInt32 36 | const CUSOLVER_RESET_VALUES_FAST_MODE_OFF = 0 37 | const CUSOLVER_RESET_VALUES_FAST_MODE_ON = 1 38 | 39 | const cusolverRfFactorization_t = UInt32 40 | const CUSOLVER_FACTORIZATION_ALG0 = 0 41 | const CUSOLVER_FACTORIZATION_ALG1 = 1 42 | const CUSOLVER_FACTORIZATION_ALG2 = 2 43 | 44 | const cusolverRfTriangularSolve_t = UInt32 45 | const CUSOLVER_TRIANGULAR_SOLVE_ALG0 = 0 46 | const CUSOLVER_TRIANGULAR_SOLVE_ALG1 = 1 47 | const CUSOLVER_TRIANGULAR_SOLVE_ALG2 = 2 48 | const CUSOLVER_TRIANGULAR_SOLVE_ALG3 = 3 49 | 50 | const cusolverRfUnitDiagonal_t = UInt32 51 | const CUSOLVER_UNIT_DIAGONAL_STORED_L = 0 52 | const CUSOLVER_UNIT_DIAGONAL_STORED_U = 1 53 | const CUSOLVER_UNIT_DIAGONAL_ASSUMED_L = 2 54 | const CUSOLVER_UNIT_DIAGONAL_ASSUMED_U = 3 55 | 56 | const cusolverDnContext = Nothing 57 | const cusolverDnHandle_t = Ptr{cusolverDnContext} 58 | const cusolverSpContext = Nothing 59 | const cusolverSpHandle_t = Ptr{cusolverSpContext} 60 | const cusolverRfContext = Nothing 61 | const cusolverRfHandle_t = Ptr{cusolverRfContext} 62 | 63 | #complex numbers 64 | 65 | const cuComplex = Complex{Float32} 66 | const cuDoubleComplex = Complex{Float64} 67 | 68 | const CusolverFloat = Union{Float64,Float32,ComplexF64,ComplexF32} 69 | const CusolverReal = Union{Float64,Float32} 70 | const CusolverComplex = Union{ComplexF64,ComplexF32} 71 | -------------------------------------------------------------------------------- /src/fft/fft.jl: -------------------------------------------------------------------------------- 1 | # K is a flag for forward/backward 2 | # also used as an alias for r2c/c2r 3 | 4 | abstract type CuFFTPlan{T<:cufftNumber, K, inplace} <: Plan{T} end 5 | 6 | mutable struct cCuFFTPlan{T<:cufftNumber,K,inplace,N} <: CuFFTPlan{T,K,inplace} 7 | plan::cufftHandle_t 8 | sz::NTuple{N,Int} # Julia size of input array 9 | osz::NTuple{N,Int} # Julia size of output array 10 | xtype::Int 11 | region::Any 12 | pinv::ScaledPlan # required by AbstractFFT API 13 | 14 | function cCuFFTPlan{T,K,inplace,N}(plan::cufftHandle_t, X::CuArray{T,N}, 15 | sizey::Tuple, region, xtype::Integer 16 | ) where {T<:cufftNumber,K,inplace,N} 17 | # maybe enforce consistency of sizey 18 | p = new(plan, size(X), sizey, xtype, region) 19 | finalizer(destroy_plan, p) 20 | p 21 | end 22 | end 23 | 24 | cCuFFTPlan(plan,X,region,xtype::Integer) = cCuFFTPlan(plan,X,size(X),region,xtype) 25 | 26 | mutable struct rCuFFTPlan{T<:cufftNumber,K,inplace,N} <: CuFFTPlan{T,K,inplace} 27 | plan::cufftHandle_t 28 | 
sz::NTuple{N,Int} # Julia size of input array 29 | osz::NTuple{N,Int} # Julia size of output array 30 | xtype::Int 31 | region::Any 32 | pinv::ScaledPlan # required by AbstractFFT API 33 | 34 | function rCuFFTPlan{T,K,inplace,N}(plan::cufftHandle_t, X::CuArray{T,N}, 35 | sizey::Tuple, region, xtype::Integer 36 | ) where {T<:cufftNumber,K,inplace,N} 37 | # maybe enforce consistency of sizey 38 | p = new(plan, size(X), sizey, xtype, region) 39 | finalizer(destroy_plan, p) 40 | p 41 | end 42 | end 43 | 44 | rCuFFTPlan(plan,X,region,xtype::Integer) = rCuFFTPlan(plan,X,size(X),region,xtype) 45 | 46 | const xtypenames = Dict{cufftType,String}(CUFFT_R2C => "real-to-complex", 47 | CUFFT_C2R => "complex-to-real", 48 | CUFFT_C2C => "complex", 49 | CUFFT_D2Z => "d.p. real-to-complex", 50 | CUFFT_Z2D => "d.p. complex-to-real", 51 | CUFFT_Z2Z => "d.p. complex") 52 | 53 | function showfftdims(io, sz, T) 54 | if isempty(sz) 55 | print(io,"0-dimensional") 56 | elseif length(sz) == 1 57 | print(io, sz[1], "-element") 58 | else 59 | print(io, join(sz, "×")) 60 | end 61 | print(io, " CuArray of ", T) 62 | end 63 | 64 | function show(io::IO, p::CuFFTPlan{T,K,inplace}) where {T,K,inplace} 65 | print(io, inplace ? "CUFFT in-place " : "CUFFT ", 66 | xtypenames[p.xtype], 67 | K == CUFFT_FORWARD ? " forward" : " backward", 68 | " plan for ") 69 | showfftdims(io, p.sz, T) 70 | end 71 | -------------------------------------------------------------------------------- /src/fft/libcufft_types.jl: -------------------------------------------------------------------------------- 1 | # CUFFT API function return values 2 | const cufftStatus_t = UInt32 3 | const CUFFT_STATUS_SUCCESS = 0 # The cuFFT operation was successful 4 | const CUFFT_STATUS_INVALID_PLAN = 1 # cuFFT was passed an invalid plan handle 5 | const CUFFT_STATUS_ALLOC_FAILED = 2 # cuFFT failed to allocate GPU or CPU memory 6 | const CUFFT_STATUS_INVALID_TYPE = 3 # No longer used 7 | const CUFFT_STATUS_INVALID_VALUE = 4 # User specified an invalid pointer or parameter 8 | const CUFFT_STATUS_INTERNAL_ERROR = 5 # Driver or internal cuFFT library error 9 | const CUFFT_STATUS_EXEC_FAILED = 6 # Failed to execute an FFT on the GPU 10 | const CUFFT_STATUS_SETUP_FAILED = 7 # The cuFFT library failed to initialize 11 | const CUFFT_STATUS_INVALID_SIZE = 8 # User specified an invalid transform size 12 | const CUFFT_STATUS_UNALIGNED_DATA = 9 # No longer used 13 | const CUFFT_STATUS_INCOMPLETE_PARAMETER_LIST = 10 # Missing parameters in call 14 | const CUFFT_STATUS_INVALID_DEVICE = 11 # Execution of a plan was on different GPU than plan creation 15 | const CUFFT_STATUS_PARSE_ERROR = 12 # Internal plan database error 16 | const CUFFT_STATUS_NO_WORKSPACE = 13 # No workspace has been provided prior to plan execution 17 | const CUFFT_STATUS_NOT_IMPLEMENTED = 14 # Function does not implement functionality for parameters given. 18 | const CUFFT_STATUS_LICENSE_ERROR = 15 # Used in previous versions. 19 | const CUFFT_STATUS_NOT_SUPPORTED = 16 # Operation is not supported for parameters given. 
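# Editorial sketch, not part of the original file: the status codes above are
# what `@check` in src/fft/error.jl surfaces through `CUFFTError`; a lookup
# table like this hypothetical one recovers the descriptions recorded in the
# comments above (only a subset is shown).
const _cufft_status_messages = Dict{cufftStatus_t,String}(
    CUFFT_STATUS_SUCCESS        => "the cuFFT operation was successful",
    CUFFT_STATUS_INVALID_PLAN   => "cuFFT was passed an invalid plan handle",
    CUFFT_STATUS_ALLOC_FAILED   => "cuFFT failed to allocate GPU or CPU memory",
    CUFFT_STATUS_INVALID_VALUE  => "user specified an invalid pointer or parameter",
    CUFFT_STATUS_INTERNAL_ERROR => "driver or internal cuFFT library error",
    CUFFT_STATUS_EXEC_FAILED    => "failed to execute an FFT on the GPU",
    CUFFT_STATUS_INVALID_SIZE   => "user specified an invalid transform size")

statusmessage(status::cufftStatus_t) =
    get(_cufft_status_messages, status, "unknown status")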
20 | 21 | 22 | const cufftReal = Float32 23 | const cufftDoubleReal = Float64 24 | 25 | const cufftComplex = ComplexF32 26 | const cufftDoubleComplex = ComplexF64 27 | 28 | # CUFFT transform directions 29 | const CUFFT_FORWARD = -1 # Forward FFT 30 | const CUFFT_INVERSE = 1 # Inverse FFT 31 | 32 | # CUFFT supports the following transform types 33 | const cufftType = Cint 34 | const CUFFT_R2C = 0x2a # Real to Complex 35 | const CUFFT_C2R = 0x2c # Complex to Real 36 | const CUFFT_C2C = 0x29 # Complex to Complex 37 | const CUFFT_D2Z = 0x6a # Double to Double-Complex 38 | const CUFFT_Z2D = 0x6c # Double-Complex to Double 39 | const CUFFT_Z2Z = 0x69 # Double-Complex to Double-Complex 40 | 41 | const cufftCompatibility = Cint 42 | const CUFFT_COMPATIBILITY_NATIVE = 0x00 43 | const CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01 44 | const CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC = 0x02 45 | const CUFFT_COMPATIBILITY_FFTW_ALL = 0x03 46 | 47 | const cufftHandle_t = Cint 48 | 49 | const cufftNumber = Union{cufftDoubleReal,cufftReal,cufftDoubleComplex,cufftComplex} 50 | # note trailing s to deconflict w/ header file 51 | const cufftReals = Union{cufftDoubleReal,cufftReal} 52 | const cufftComplexes = Union{cufftDoubleComplex,cufftComplex} 53 | const cufftDouble = Union{cufftDoubleReal,cufftDoubleComplex} 54 | const cufftSingle = Union{cufftReal,cufftComplex} 55 | const cufftTypeDouble = Union{Type{cufftDoubleReal},Type{cufftDoubleComplex}} 56 | const cufftTypeSingle = Union{Type{cufftReal},Type{cufftComplex}} 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CuArrays 2 | 3 | [![][docs-latest-img]][docs-latest-url] [![][codecov-img]][codecov-url] 4 | 5 | [codecov-img]: https://codecov.io/gh/JuliaGPU/CuArrays.jl/branch/master/graph/badge.svg 6 | [codecov-url]: https://codecov.io/gh/JuliaGPU/CuArrays.jl 7 | 8 | [docs-latest-img]: https://img.shields.io/badge/docs-latest-blue.svg 9 | [docs-latest-url]: https://juliagpu.gitlab.io/CuArrays.jl/ 10 | 11 | CuArrays provides a fully-functional GPU array, which can give significant speedups over 12 | normal arrays without code changes. CuArrays are implemented fully in Julia, making the 13 | implementation [elegant and extremely 14 | generic](http://mikeinnes.github.io/2017/08/24/cudanative.html). 15 | 16 | Documentation for this package is sparse, and for many of the array operations you should 17 | refer to the official Julia documentation. The following resources can be useful to get a 18 | better understanding of the characteristics and performance trade offs that come with GPU 19 | arrays: 20 | 21 | - Introductory tutorial on [GPU programming with Julia](https://juliagpu.gitlab.io/CuArrays.jl/tutorials/generated/intro/) 22 | - Slide deck on [effectively using GPUs with Julia](https://docs.google.com/presentation/d/1l-BuAtyKgoVYakJSijaSqaTL3friESDyTOnU2OLqGoA/) 23 | 24 | ## Installation 25 | 26 | CuArrays should work **out-of-the-box** on Julia 1.0. You only need to have a 27 | proper set-up of CUDA, meaning the rest of the Julia CUDA stack should work 28 | (notably CUDAapi.jl, CUDAdrv.jl and CUDAnative.jl). If you encounter any issues 29 | with CuArrays.jl, please make sure those other packages are working as expected. 30 | 31 | Some parts of CuArrays.jl depend on **optional libraries**, such as 32 | [cuDNN](https://developer.nvidia.com/cudnn). The build process should notify 33 | about missing dependencies, i.e. 
inspect the output of `Pkg.build("CuArrays")` 34 | to see whether your installation is complete. 35 | 36 | 37 | ## Features 38 | 39 | ```julia 40 | xs = cu(rand(5, 5)) 41 | ys = cu[1, 2, 3] 42 | xs_cpu = collect(xs) 43 | ``` 44 | 45 | Because `CuArray` is an `AbstractArray`, it doesn't have much of a learning curve; just use your favourite array ops as usual. The following are supported (on arbitrary numbers of arguments, dimensions etc): 46 | 47 | * Conversions and `copy!` with CPU arrays 48 | * General indexing (`xs[1:2, 5, :]`) 49 | * `permutedims` 50 | * Concatenation (`vcat(x, y)`, `cat(3, xs, ys, zs)`) 51 | * `map`, fused broadcast (`zs .= xs.^2 .+ ys .* 2`) 52 | * `fill!(xs, 0)` 53 | * Reduction over dimensions (`reducedim(+, xs, 3)`, `sum(x -> x^2, xs, 1)` etc) 54 | * Reduction to scalar (`reduce(*, 1, xs)`, `sum(xs)` etc) 55 | * Various BLAS operations (matrix\*matrix, matrix\*vector) 56 | * FFTs, using the AbstractFFTs API 57 | 58 | We welcome issues or PRs for functionality not on this list. 59 | 60 | Note that some operations not on this list will work, but be slow, due to Base's generic 61 | implementations. This is intentional, to enable a "make it work, then make it fast" 62 | workflow. When you're ready you can disable slow fallback methods: 63 | 64 | ```julia 65 | julia> CuArrays.allowscalar(false) 66 | julia> xs[5] 67 | ERROR: getindex is disabled 68 | ``` 69 | -------------------------------------------------------------------------------- /src/fft/libcufft.jl: -------------------------------------------------------------------------------- 1 | # low-level wrappers of the CUFFT library 2 | 3 | cufftGetVersion() = ccall((:cufftGetVersion,libcufft), Cint, ()) 4 | 5 | function cufftGetProperty(property::CUDAapi.libraryPropertyType) 6 | value_ref = Ref{Cint}() 7 | @check ccall((:cufftGetProperty, libcufft), cufftStatus_t, 8 | (Cint, Ptr{Cint}), 9 | property, value_ref) 10 | value_ref[] 11 | end 12 | 13 | cufftDestroy(plan) = ccall((:cufftDestroy,libcufft), Nothing, (cufftHandle_t,), plan) 14 | 15 | function cufftPlan1d(plan, nx, type, batch) 16 | @check ccall((:cufftPlan1d,libcufft),cufftStatus_t, 17 | (Ptr{cufftHandle_t}, Cint, cufftType, Cint), 18 | plan, nx, type, batch) 19 | end 20 | 21 | function cufftPlan2d(plan, nx, ny, type) 22 | @check ccall((:cufftPlan2d,libcufft),cufftStatus_t, 23 | (Ptr{cufftHandle_t}, Cint, Cint, cufftType), 24 | plan, nx, ny, type) 25 | end 26 | 27 | function cufftPlan3d(plan, nx, ny, nz, type) 28 | @check ccall((:cufftPlan3d,libcufft),cufftStatus_t, 29 | (Ptr{cufftHandle_t}, Cint, Cint, Cint, cufftType), 30 | plan, nx, ny, nz, type) 31 | end 32 | 33 | function cufftPlanMany(plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch) 34 | @check ccall((:cufftPlanMany,libcufft),cufftStatus_t, 35 | (Ptr{cufftHandle_t}, Cint, Ptr{Cint}, 36 | Ptr{Cint}, Cint, Cint, 37 | Ptr{Cint}, Cint, Cint, 38 | cufftType, Cint), 39 | plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch) 40 | end 41 | 42 | function cufftExecC2C(plan, idata, odata, direction) 43 | @check ccall((:cufftExecC2C,libcufft), cufftStatus_t, 44 | (cufftHandle_t, CuPtr{cufftComplex}, CuPtr{cufftComplex}, Cint), 45 | plan, idata, odata, direction) 46 | end 47 | 48 | function cufftExecC2R(plan, idata, odata) 49 | @check ccall((:cufftExecC2R,libcufft), cufftStatus_t, 50 | (cufftHandle_t, CuPtr{cufftComplex}, CuPtr{cufftComplex}), 51 | plan, idata, odata) 52 | end 53 | 54 | function cufftExecR2C(plan, idata, odata) 55 | @check 
ccall((:cufftExecR2C,libcufft), cufftStatus_t, 56 | (cufftHandle_t, CuPtr{cufftReal}, CuPtr{cufftComplex}), 57 | plan, idata, odata) 58 | end 59 | 60 | function cufftExecZ2Z(plan, idata, odata, direction) 61 | @check ccall((:cufftExecZ2Z,libcufft), cufftStatus_t, 62 | (cufftHandle_t, CuPtr{cufftDoubleComplex}, CuPtr{cufftDoubleComplex}, 63 | Cint), 64 | plan, idata, odata, direction) 65 | end 66 | 67 | function cufftExecZ2D(plan, idata, odata) 68 | @check ccall((:cufftExecZ2D,libcufft), cufftStatus_t, 69 | (cufftHandle_t, CuPtr{cufftDoubleComplex}, CuPtr{cufftDoubleComplex}), 70 | plan, idata, odata) 71 | end 72 | 73 | function cufftExecD2Z(plan, idata, odata) 74 | @check ccall((:cufftExecD2Z,libcufft), cufftStatus_t, 75 | (cufftHandle_t, CuPtr{cufftDoubleReal}, CuPtr{cufftDoubleComplex}), 76 | plan, idata, odata) 77 | end 78 | -------------------------------------------------------------------------------- /src/sparse/highlevel.jl: -------------------------------------------------------------------------------- 1 | (\)(A::AbstractTriangular{<:CuSparseMatrix},B::CuMatrix) = sm('N',A,B,'O') 2 | (\)(transA::Transpose{<:Any, <:AbstractTriangular{<:CuSparseMatrix}}, B::CuMatrix) = sm('T',parent(transA),B,'O') 3 | (\)(adjA::Adjoint{<:Any, <:AbstractTriangular{<:CuSparseMatrix}},B::CuMatrix) = sm('C',parent(adjA),B,'O') 4 | 5 | mul!(C::CuVector{T},A::CuSparseMatrix,B::CuVector) where {T} = mv!('N',one(T),A,B,zero(T),C,'O') 6 | mul!(C::CuVector{T},transA::Transpose{<:Any,<:CuSparseMatrix},B::CuVector) where {T} = mv!('T',one(T),parent(transA),B,zero(T),C,'O') 7 | mul!(C::CuVector{T},adjA::Adjoint{<:Any,<:CuSparseMatrix},B::CuVector) where {T} = mv!('C',one(T),parent(transA),B,zero(T),C,'O') 8 | mul!(C::CuVector{T},A::HermOrSym{T,<:CuSparseMatrix{T}},B::CuVector{T}) where T = mv!('N',one(T),A,B,zero(T),C,'O') 9 | mul!(C::CuVector{T},transA::Transpose{<:Any, <:HermOrSym{T,<:CuSparseMatrix{T}}},B::CuVector{T}) where {T} = mv!('T',one(T),parent(transA),B,zero(T),C,'O') 10 | mul!(C::CuVector{T},adjA::Adjoint{<:Any, <:HermOrSym{T,<:CuSparseMatrix{T}}},B::CuVector{T}) where {T} = mv!('C',one(T),parent(adjA),B,zero(T),C,'O') 11 | 12 | mul!(C::CuMatrix{T},A::CuSparseMatrix{T},B::CuMatrix{T}) where {T} = mm2!('N','N',one(T),A,B,zero(T),C,'O') 13 | mul!(C::CuMatrix{T},A::CuSparseMatrix{T},transB::Transpose{<:Any, CuMatrix{T}}) where {T} = mm2!('N','T',one(T),A,parent(transB),zero(T),C,'O') 14 | mul!(C::CuMatrix{T},transA::Transpose{<:Any, <:CuSparseMatrix{T}},B::CuMatrix{T}) where {T} = mm2!('T','N',one(T),parent(transA),B,zero(T),C,'O') 15 | mul!(C::CuMatrix{T},transA::Transpose{<:Any, <:CuSparseMatrix{T}},transB::Transpose{<:Any, CuMatrix{T}}) where {T} = mm2!('T','T',one(T),parent(transA),parent(transB),zero(T),C,'O') 16 | mul!(C::CuMatrix{T},adjA::Adjoint{<:Any, <:CuSparseMatrix{T}},B::CuMatrix{T}) where {T} = mm2!('C','N',one(T),parent(adjA),B,zero(T),C,'O') 17 | 18 | mul!(C::CuMatrix{T},A::HermOrSym{<:Number, <:CuSparseMatrix},B::CuMatrix) where {T} = mm!('N',one(T),A,B,zero(T),C,'O') 19 | mul!(C::CuMatrix{T},transA::Transpose{<:Any, <:HermOrSym{<:Number, <:CuSparseMatrix}},B::CuMatrix) where {T} = mm!('T',one(T),parent(transA),B,zero(T),C,'O') 20 | mul!(C::CuMatrix{T},adjA::Adjoint{<:Any, <:HermOrSym{<:Number, <:CuSparseMatrix}},B::CuMatrix) where {T} = mm!('C',one(T),parent(adjA),B,zero(T),C,'O') 21 | 22 | (\)(A::AbstractTriangular{<:CuSparseMatrix},B::CuVector) = sv2('N',A,B,'O') 23 | (\)(transA::Transpose{<:Any, <:AbstractTriangular{<:CuSparseMatrix}},B::CuVector) = sv2('T',parent(transA),B,'O') 
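# Editorial note, not part of the original file: throughout this file the first
# Char argument selects the operation ('N' none, 'T' transpose, 'C' conjugate
# transpose) and the trailing 'O' selects one-based indexing, matching the
# CUSPARSE wrappers. A hedged usage sketch, kept as a comment because this sits
# between method definitions:
#
#   A = CuSparseMatrixCSR(sparse(tril(rand(4, 4) .+ 4)))
#   b = CuArray(rand(4))
#   x = LowerTriangular(A) \ b    # intended to lower to sv2('N', ..., 'O')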
24 | (\)(adjA::Adjoint{<:Any, <:AbstractTriangular{T,<:CuSparseMatrix{T}}},B::CuVector{T}) where T = sv2('C',parent(adjA),B,'O') 25 | (\)(A::AbstractTriangular{T,CuSparseMatrixHYB{T}},B::CuVector{T}) where T = sv('N',A,B,'O') 26 | (\)(transA::Transpose{<:Any, AbstractTriangular{T,CuSparseMatrixHYB{T}}},B::CuVector{T}) where T = sv('T',parent(transA),B,'O') 27 | (\)(adjA::Adjoint{<:Any, AbstractTriangular{T,CuSparseMatrixHYB{T}}},B::CuVector{T}) where T = sv('C',parent(adjA),B,'O') 28 | 29 | (+)(A::Union{CuSparseMatrixCSR,CuSparseMatrixCSC},B::Union{CuSparseMatrixCSR,CuSparseMatrixCSC}) = geam(A,B,'O','O','O') 30 | (-)(A::Union{CuSparseMatrixCSR,CuSparseMatrixCSC},B::Union{CuSparseMatrixCSR,CuSparseMatrixCSC}) = geam(A,-one(eltype(A)),B,'O','O','O') 31 | -------------------------------------------------------------------------------- /src/CuArrays.jl: -------------------------------------------------------------------------------- 1 | __precompile__() 2 | 3 | module CuArrays 4 | 5 | using CUDAdrv, CUDAnative 6 | 7 | using GPUArrays 8 | 9 | export CuArray, CuVector, CuMatrix, CuVecOrMat, cu, cuzeros, cuones, cufill 10 | 11 | import LinearAlgebra 12 | 13 | using Adapt 14 | 15 | const ext = joinpath(dirname(@__DIR__), "deps", "ext.jl") 16 | isfile(ext) || error("CuArrays.jl has not been built, please run Pkg.build(\"CuArrays\").") 17 | include(ext) 18 | if !configured 19 | # default (non-functional) values for critical variables, 20 | # making it possible to _load_ the package at all times. 21 | const libcublas = nothing 22 | const libcusparse = nothing 23 | const libcusolver = nothing 24 | const libcufft = nothing 25 | const libcurand = nothing 26 | const libcudnn = nothing 27 | end 28 | 29 | include("memory.jl") 30 | include("array.jl") 31 | include("subarray.jl") 32 | include("utils.jl") 33 | include("indexing.jl") 34 | include("broadcast.jl") 35 | include("matmul.jl") 36 | include("mapreduce.jl") 37 | 38 | include("gpuarray_interface.jl") 39 | 40 | # many libraries need to be initialized per-device (per-context, really, but we assume users 41 | # of CuArrays and/or CUDAnative only use a single context), so keep track of the active one. 42 | const active_context = Ref{CuContext}() 43 | 44 | libcublas !== nothing && include("blas/CUBLAS.jl") 45 | libcusparse !== nothing && include("sparse/CUSPARSE.jl") 46 | libcusolver !== nothing && include("solver/CUSOLVER.jl") 47 | libcufft !== nothing && include("fft/CUFFT.jl") 48 | libcurand !== nothing && include("rand/CURAND.jl") 49 | libcudnn !== nothing && include("dnn/CUDNN.jl") 50 | 51 | include("nnlib.jl") 52 | 53 | include("deprecated.jl") 54 | 55 | function __init__() 56 | if !configured 57 | @warn("CuArrays.jl has not been successfully built, and will not work properly.") 58 | @warn("Please run Pkg.build(\"CuArrays\") and restart Julia.") 59 | return 60 | end 61 | 62 | function check_library(name, path) 63 | path === nothing && return 64 | if !ispath(path) 65 | error("$name library has changed. 
Please run Pkg.build(\"CuArrays\") and restart Julia.") 66 | end 67 | end 68 | check_library("CUBLAS", libcublas) 69 | check_library("CUSPARSE", libcusparse) 70 | check_library("CUSOLVER", libcusolver) 71 | check_library("CUFFT", libcufft) 72 | check_library("CURAND", libcurand) 73 | check_library("CUDNN", libcudnn) 74 | 75 | # update the active context when we switch devices 76 | callback = (::CuDevice, ctx::CuContext) -> begin 77 | active_context[] = ctx 78 | 79 | # wipe the active handles 80 | isdefined(CuArrays, :CUBLAS) && (CUBLAS._handle[] = C_NULL) 81 | isdefined(CuArrays, :CUSOLVER) && (CUSOLVER._dense_handle[] = C_NULL) 82 | isdefined(CuArrays, :CURAND) && (CURAND._generator[] = nothing) 83 | isdefined(CuArrays, :CUDNN) && (CUDNN._handle[] = C_NULL) 84 | end 85 | push!(CUDAnative.device!_listeners, callback) 86 | 87 | # a device might be active already 88 | existing_ctx = CUDAdrv.CuCurrentContext() 89 | if existing_ctx !== nothing 90 | active_context[] = existing_ctx 91 | end 92 | 93 | __init_memory__() 94 | end 95 | 96 | end # module 97 | -------------------------------------------------------------------------------- /src/matmul.jl: -------------------------------------------------------------------------------- 1 | using LinearAlgebra 2 | 3 | 4 | function generic_matmatmul!(C::AbstractVecOrMat{R}, A::AbstractVecOrMat{T}, B::AbstractVecOrMat{S}) where {T,S,R} 5 | if size(A,2) != size(B,1) 6 | throw(DimensionMismatch("matrix A has dimensions $(size(A)), matrix B has dimensions $(size(B))")) 7 | end 8 | if size(C,1) != size(A,1) || size(C,2) != size(B,2) 9 | throw(DimensionMismatch("result C has dimensions $(size(C)), needs $((size(A,1),size(B,2)))")) 10 | end 11 | if isempty(A) || isempty(B) 12 | return fill!(C, zero(R)) 13 | end 14 | 15 | function kernel(C, A, B) 16 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 17 | j = (blockIdx().y-1) * blockDim().y + threadIdx().y 18 | 19 | if i <= size(A,1) && j <= size(B,2) 20 | z2 = zero(A[i, 1]*B[1, j] + A[i, 1]*B[1, j]) 21 | Ctmp = convert(promote_type(R, typeof(z2)), z2) 22 | for k in 1:size(A,2) 23 | Ctmp += A[i, k]*B[k, j] 24 | end 25 | C[i,j] = Ctmp 26 | end 27 | 28 | return 29 | end 30 | 31 | max_threads = 256 32 | threads_x = min(max_threads, size(C,1)) 33 | threads_y = min(max_threads ÷ threads_x, size(C,2)) 34 | threads = (threads_x, threads_y) 35 | blocks = ceil.(Int, (size(C,1), size(C,2)) ./ threads) 36 | 37 | @cuda threads=threads blocks=blocks kernel(C, A, B) 38 | 39 | C 40 | end 41 | 42 | LinearAlgebra.mul!(C::CuVecOrMat, A::CuVecOrMat, B::CuVecOrMat) = generic_matmatmul!(C, A, B) 43 | LinearAlgebra.mul!(C::CuVecOrMat, A::CuVecOrMat, B::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 44 | LinearAlgebra.mul!(C::CuVecOrMat, A::CuVecOrMat, B::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 45 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}, B::CuVecOrMat) = generic_matmatmul!(C, A, B) 46 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}, B::CuVecOrMat) = generic_matmatmul!(C, A, B) 47 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}, B::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 48 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}, B::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 49 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}, 
B::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 50 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}, B::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 51 | 52 | 53 | function generic_rmul!(X::CuArray, s::Number) 54 | function kernel(X, s) 55 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 56 | @inbounds X[i] *= s 57 | return 58 | end 59 | @cuda blocks=length(X) kernel(X, s) 60 | X 61 | end 62 | 63 | LinearAlgebra.rmul!(A::CuArray, b::Number) = generic_rmul!(A, b) 64 | 65 | 66 | function generic_lmul!(s::Number, X::CuArray) 67 | function kernel(s, X) 68 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 69 | @inbounds X[i] = s*X[i] 70 | return 71 | end 72 | @cuda blocks=length(X) kernel(s, X) 73 | X 74 | end 75 | 76 | LinearAlgebra.lmul!(a::Number, B::CuArray) = generic_lmul!(a, B) 77 | -------------------------------------------------------------------------------- /src/blas/libcublas_types.jl: -------------------------------------------------------------------------------- 1 | # libcublas_types.jl 2 | # 3 | # Initially generated with wrap_c from Clang.jl. Modified to remove anonymous 4 | # enums and add cublasContext. 5 | # 6 | # Author: Nick Henderson 7 | # Created: 2014-08-27 8 | # License: MIT 9 | # 10 | 11 | # begin enum cublasStatus_t 12 | const cublasStatus_t = UInt32 13 | const CUBLAS_STATUS_SUCCESS = 0 14 | const CUBLAS_STATUS_NOT_INITIALIZED = 1 15 | const CUBLAS_STATUS_ALLOC_FAILED = 3 16 | const CUBLAS_STATUS_INVALID_VALUE = 7 17 | const CUBLAS_STATUS_ARCH_MISMATCH = 8 18 | const CUBLAS_STATUS_MAPPING_ERROR = 11 19 | const CUBLAS_STATUS_EXECUTION_FAILED = 13 20 | const CUBLAS_STATUS_INTERNAL_ERROR = 14 21 | const CUBLAS_STATUS_NOT_SUPPORTED = 15 22 | const CUBLAS_STATUS_LICENSE_ERROR = 16 23 | # end enum cublasStatus_t 24 | # begin enum cublasFillMode_t 25 | const cublasFillMode_t = UInt32 26 | const CUBLAS_FILL_MODE_LOWER = 0 27 | const CUBLAS_FILL_MODE_UPPER = 1 28 | # end enum cublasFillMode_t 29 | # begin enum cublasDiagType_t 30 | const cublasDiagType_t = UInt32 31 | const CUBLAS_DIAG_NON_UNIT = 0 32 | const CUBLAS_DIAG_UNIT = 1 33 | # end enum cublasDiagType_t 34 | # begin enum cublasSideMode_t 35 | const cublasSideMode_t = UInt32 36 | const CUBLAS_SIDE_LEFT = 0 37 | const CUBLAS_SIDE_RIGHT = 1 38 | # end enum cublasSideMode_t 39 | # begin enum cublasOperation_t 40 | const cublasOperation_t = UInt32 41 | const CUBLAS_OP_N = 0 42 | const CUBLAS_OP_T = 1 43 | const CUBLAS_OP_C = 2 44 | # end enum cublasOperation_t 45 | # begin enum cublasPointerMode_t 46 | const cublasPointerMode_t = UInt32 47 | const CUBLAS_POINTER_MODE_HOST = 0 48 | const CUBLAS_POINTER_MODE_DEVICE = 1 49 | # end enum cublasPointerMode_t 50 | # begin enum cublasAtomicsMode_t 51 | const cublasAtomicsMode_t = UInt32 52 | const CUBLAS_ATOMICS_NOT_ALLOWED = 0 53 | const CUBLAS_ATOMICS_ALLOWED = 1 54 | # end enum cublasAtomicsMode_t 55 | const cublasContext = Nothing 56 | const cublasHandle_t = Ptr{cublasContext} 57 | # complex numbers in cuda 58 | const cuComplex = Complex{Float32} 59 | const cuDoubleComplex = Complex{Float64} 60 | # complex types from Base/linalg.jl 61 | const CublasFloat = Union{Float64,Float32,ComplexF64,ComplexF32} 62 | const CublasReal = Union{Float64,Float32} 63 | const CublasComplex = Union{ComplexF64,ComplexF32} 64 | # FP16 (cuda_fp16.h) in cuda 65 | const __half = Float16 66 | struct __half2 67 | x1::__half 68 | x2::__half 69 | end 70 | 71 | if CUDAdrv.version() >= v"0.7.5" 72 | # 
specify which GEMM algorithm to use in cublasGemmEx() (CUDA 7.5+) 73 | const cublasGemmAlgo_t = Int32 74 | const CUBLAS_GEMM_DFALT = -1 75 | const CUBLAS_GEMM_ALGO0 = 0 76 | const CUBLAS_GEMM_ALGO1 = 1 77 | const CUBLAS_GEMM_ALGO2 = 2 78 | const CUBLAS_GEMM_ALGO3 = 3 79 | const CUBLAS_GEMM_ALGO4 = 4 80 | const CUBLAS_GEMM_ALGO5 = 5 81 | const CUBLAS_GEMM_ALGO6 = 6 82 | const CUBLAS_GEMM_ALGO7 = 7 83 | # specify which DataType to use with cublasgemmEx() and cublasGemmEx() (CUDA 7.5+) functions 84 | const cudaDataType_t = UInt32 85 | const CUDA_R_16F = UInt32(2) 86 | const CUDA_C_16F = UInt32(6) 87 | const CUDA_R_32F = UInt32(0) 88 | const CUDA_C_32F = UInt32(4) 89 | const CUDA_R_64F = UInt32(1) 90 | const CUDA_C_64F = UInt32(5) 91 | const CUDA_R_8I = UInt32(3) 92 | const CUDA_C_8I = UInt32(7) 93 | const CUDA_R_8U = UInt32(8) 94 | const CUDA_C_8U = UInt32(9) 95 | const CUDA_R_32I = UInt32(10) 96 | const CUDA_C_32I = UInt32(11) 97 | const CUDA_R_32U = UInt32(12) 98 | const CUDA_C_32U = UInt32(13) 99 | end 100 | 101 | @enum CUBLASMathMode::Cint begin 102 | CUBLAS_DEFAULT_MATH = 0 103 | CUBLAS_TENSOR_OP_MATH = 1 104 | end 105 | -------------------------------------------------------------------------------- /src/mapreduce.jl: -------------------------------------------------------------------------------- 1 | using CuArrays: @cuindex, cudims 2 | 3 | function mapreducedim_kernel_serial(f, op, R, A, range) 4 | I = @cuindex R 5 | newrange = map((r, i) -> r === nothing ? i : r, range, I) 6 | for I′ in CartesianIndices(newrange) 7 | @inbounds R[I...] = op(R[I...], f(A[I′])) 8 | end 9 | return 10 | end 11 | 12 | @inline function reduce_block(arr::CuDeviceArray, op) 13 | sync_threads() 14 | len = blockDim().x 15 | while len != 1 16 | sync_threads() 17 | skip = (len + 1) >> 1 18 | reduce_to = threadIdx().x - skip 19 | if 0 < reduce_to <= (len >> 1) 20 | arr[reduce_to] = op(arr[reduce_to], arr[threadIdx().x]) 21 | end 22 | len = skip 23 | end 24 | sync_threads() 25 | end 26 | 27 | function mapreducedim_kernel_parallel(f, op, R::CuDeviceArray{T}, A::CuDeviceArray{T}, 28 | CIS, Rlength, Slength) where {T} 29 | for Ri_base in 0:(gridDim().x * blockDim().y):(Rlength-1) 30 | Ri = Ri_base + (blockIdx().x - 1) * blockDim().y + threadIdx().y 31 | Ri > Rlength && return 32 | RI = Tuple(CartesianIndices(R)[Ri]) 33 | S = @cuStaticSharedMem(T, 512) 34 | Si_folded_base = (threadIdx().y - 1) * blockDim().x 35 | Si_folded = Si_folded_base + threadIdx().x 36 | # serial reduction of A into S by Slength ÷ xthreads 37 | for Si_base in 0:blockDim().x:(Slength-1) 38 | Si = Si_base + threadIdx().x 39 | Si > Slength && break 40 | SI = Tuple(CIS[Si]) 41 | AI = ifelse.(size(R) .== 1, SI, RI) 42 | if Si_base == 0 43 | S[Si_folded] = f(A[AI...]) 44 | else 45 | S[Si_folded] = op(S[Si_folded], f(A[AI...])) 46 | end 47 | end 48 | # block-parallel reduction of S to S[1] by xthreads 49 | reduce_block(view(S, (Si_folded_base + 1):512), op) 50 | # reduce S[1] into R 51 | threadIdx().x == 1 && (R[Ri] = op(R[Ri], S[Si_folded])) 52 | end 53 | return 54 | end 55 | 56 | function Base._mapreducedim!(f, op, R::CuArray{T}, A::CuArray{T}) where {T} 57 | # the kernel as generated from `f` and `op` can require lots of registers (eg. #160), 58 | # so we need to be careful about how many threads we launch not to run out of them. 
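    # Editorial note expanding on the comment above: the arithmetic below keeps
    # x_thr * y_thr <= 512, so the 512-element shared-memory buffer S in
    # mapreducedim_kernel_parallel is never indexed past its end. Worked example,
    # assuming the kernel- and device-imposed limits are not binding: reducing a
    # 1024x1000 CuArray over the second dimension gives Rlength = 1024 and
    # Slength = 1000, hence y_thr = nextpow(2, 1024 ÷ 512 + 1) = 4 rows of R per
    # block, x_thr = 512 ÷ 4 = 128 serial reducers per row, and
    # blk = (1024 - 1) ÷ 4 + 1 = 256 blocks for the parallel kernel.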
59 | Rlength = length(R) 60 | Ssize = ifelse.(size(R) .== 1, size(A), 1) 61 | Slength = prod(Ssize) 62 | CIS = CartesianIndices(Ssize) 63 | 64 | parallel_args = (f, op, R, A, CIS, Rlength, Slength) 65 | GC.@preserve parallel_args begin 66 | parallel_kargs = cudaconvert.(parallel_args) 67 | parallel_tt = Tuple{Core.Typeof.(parallel_kargs)...} 68 | parallel_kernel = cufunction(mapreducedim_kernel_parallel, parallel_tt) 69 | 70 | # we are limited in how many threads we can launch... 71 | ## by the kernel 72 | kernel_threads = CUDAnative.maxthreads(parallel_kernel) 73 | ## by the device 74 | dev = CUDAdrv.device() 75 | block_threads = (x=attribute(dev, CUDAdrv.MAX_BLOCK_DIM_X), 76 | y=attribute(dev, CUDAdrv.MAX_BLOCK_DIM_Y), 77 | total=attribute(dev, CUDAdrv.MAX_THREADS_PER_BLOCK)) 78 | 79 | # figure out a legal launch configuration 80 | y_thr = min(nextpow(2, Rlength ÷ 512 + 1), 512, block_threads.y, kernel_threads) 81 | x_thr = min(512 ÷ y_thr, Slength, block_threads.x, 82 | ceil(Int, block_threads.total/y_thr), 83 | ceil(Int, kernel_threads/y_thr)) 84 | 85 | if x_thr >= 8 86 | blk, thr = (Rlength - 1) ÷ y_thr + 1, (x_thr, y_thr, 1) 87 | parallel_kernel(parallel_kargs...; threads=thr, blocks=blk) 88 | else 89 | # not enough work, fall back to serial reduction 90 | range = ifelse.(length.(axes(R)) .== 1, axes(A), nothing) 91 | blk, thr = cudims(R) 92 | @cuda(blocks=blk, threads=thr, mapreducedim_kernel_serial(f, op, R, A, range)) 93 | end 94 | end 95 | 96 | return R 97 | end 98 | -------------------------------------------------------------------------------- /test/dnn.jl: -------------------------------------------------------------------------------- 1 | @testset "CUDNN" begin 2 | 3 | if !isdefined(CuArrays, :CUDNN) 4 | @warn "Not testing CUDNN" 5 | else 6 | using CuArrays.CUDNN 7 | @info "Testing CUDNN $(CUDNN.version())" 8 | 9 | @testset "NNlib" begin 10 | using NNlib 11 | using NNlib: ∇conv_data, ∇conv_filter, 12 | maxpool, meanpool, ∇maxpool, ∇meanpool, 13 | softmax, ∇softmax, logsoftmax, ∇logsoftmax 14 | 15 | @test testf(NNlib.conv, rand(Float64, 10, 10, 3, 1), rand(Float64, 2, 2, 3, 4)) 16 | @test testf(∇conv_data, rand(Float64, 9, 9, 4, 1), rand(Float64, 2, 2, 3, 4)) 17 | @test testf(∇conv_filter, rand(Float64, 9, 9, 4, 1), rand(Float64, 10, 10, 3, 1)) 18 | @test testf(CuArrays.CUDNN.∇conv_bias!, cu(rand(Float64, 1, 1, 10, 1)), cu(rand(Float64, 10, 10, 10, 1))) 19 | 20 | @test testf(NNlib.conv, rand(Float64, 10, 10, 3, 1), rand(Float64, 2, 2, 3, 4); dilation=2) 21 | @test testf(∇conv_data, rand(Float64, 8, 8, 4, 1), rand(Float64, 2, 2, 3, 4); dilation=2) 22 | @test testf(∇conv_filter, rand(Float64, 8, 8, 4, 1), rand(Float64, 10, 10, 3, 1); dilation=2) 23 | 24 | @test testf(NNlib.crosscor, rand(Float64, 10, 10, 3, 1), rand(Float64, 2, 2, 3, 4)) 25 | @test testf(∇conv_data, rand(Float64, 9, 9, 4, 1), rand(Float64, 2, 2, 3, 4); flipkernel=1) 26 | @test testf(∇conv_filter, rand(Float64, 9, 9, 4, 1), rand(Float64, 10, 10, 3, 1); flipkernel=1) 27 | 28 | @test_nowarn NNlib.conv!(cu(zeros(Float64, 9, 9, 3, 1)), cu(rand(Float64, 10, 10, 1, 1)), cu(rand(Float64, 2, 2, 1, 3)), algo=1) 29 | @test_nowarn NNlib.∇conv_data!(cu(zeros(Float64, 10, 10, 1, 1)), cu(ones(Float64, 9, 9, 3, 1)), cu(rand(Float64, 2, 2, 1, 3)), algo=1) 30 | @test_nowarn NNlib.∇conv_filter!(cu(zeros(Float64, 2, 2, 1, 3)), cu(ones(Float64, 9, 9, 3, 1)), cu(rand(Float64, 10, 10, 1, 1)), algo=1) 31 | 32 | @test testf(NNlib.conv, rand(Float64, 10, 10, 10, 3, 1), rand(Float64, 2, 2, 2, 3, 4)) 33 | @test testf(∇conv_data, 
rand(Float64, 9, 9, 9, 4, 1), rand(Float64, 2, 2, 2, 3, 4)) 34 | @test testf(∇conv_filter, rand(Float64, 9, 9, 9, 4, 1), rand(Float64, 10, 10, 10, 3, 1)) 35 | 36 | @test testf(NNlib.conv, rand(Float64, 10, 10, 10, 3, 1), rand(Float64, 2, 2, 2, 3, 4); dilation=2) 37 | @test testf(∇conv_data, rand(Float64, 8, 8, 8, 4, 1), rand(Float64, 2, 2, 2, 3, 4); dilation=2) 38 | @test testf(∇conv_filter, rand(Float64, 8, 8, 8, 4, 1), rand(Float64, 10, 10, 10, 3, 1); dilation=2) 39 | 40 | @test testf(NNlib.crosscor, rand(Float64, 10, 10, 10, 3, 1), rand(Float64, 2, 2, 2, 3, 4)) 41 | @test testf(∇conv_data, rand(Float64, 9, 9, 9, 4, 1), rand(Float64, 2, 2, 2, 3, 4); flipkernel=1) 42 | @test testf(∇conv_filter, rand(Float64, 9, 9, 9, 4, 1), rand(Float64, 10, 10, 10, 3, 1); flipkernel=1) 43 | 44 | @test testf(x -> maxpool(x, (2,2)), rand(Float64, 10, 10, 3, 1)) 45 | @test testf(x -> meanpool(x, (2,2)), rand(Float64, 10, 10, 3, 1)) 46 | @test testf((x, dy) -> ∇maxpool(dy, maxpool(x, (2,2)), x, (2,2)), rand(Float64, 10, 10, 3, 1), rand(Float64, 5, 5, 3, 1)) 47 | @test testf((x, dy) -> ∇meanpool(dy, meanpool(x, (2,2)), x, (2,2)), rand(Float64, 10, 10, 3, 1), rand(Float64, 5, 5, 3, 1)) 48 | 49 | @test testf(x -> maxpool(x, (2,2,2)), rand(Float64, 10, 10, 10, 3, 1)) 50 | @test testf(x -> meanpool(x, (2,2,2)), rand(Float64, 10, 10, 10, 3, 1)) 51 | @test testf((x, dy) -> ∇maxpool(dy, maxpool(x, (2,2,2)), x, (2,2,2)), rand(Float64, 10, 10, 10, 3, 1), rand(Float64, 5, 5, 5, 3, 1)) 52 | @test testf((x, dy) -> ∇meanpool(dy, meanpool(x, (2,2,2)), x, (2,2,2)), rand(Float64, 10, 10, 10, 3, 1), rand(Float64, 5, 5, 5, 3, 1)) 53 | 54 | for dims in [(5,5), (5,)] 55 | @test testf(softmax, rand(Float64, dims)) 56 | @test testf(∇softmax, rand(Float64, dims), rand(Float64, dims)) 57 | @test testf(logsoftmax, rand(Float64, dims)) 58 | @test testf(∇logsoftmax, rand(Float64, dims), rand(Float64, dims)) 59 | end 60 | end 61 | 62 | @testset "Activations and Other Ops" begin 63 | @test testf(CuArrays.CUDNN.cudnnAddTensor, cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1))) 64 | @test testf(CuArrays.CUDNN.cudnnActivationForward, cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1))) 65 | @test testf(CuArrays.CUDNN.cudnnActivationBackward, cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1))) 66 | end 67 | 68 | end 69 | 70 | end 71 | -------------------------------------------------------------------------------- /src/solver/highlevel.jl: -------------------------------------------------------------------------------- 1 | # QR factorization 2 | 3 | struct CuQR{T,S<:AbstractMatrix} <: LinearAlgebra.Factorization{T} 4 | factors::S 5 | τ::CuVector{T} 6 | CuQR{T,S}(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T,S<:AbstractMatrix} = new(factors, τ) 7 | end 8 | 9 | struct CuQRPackedQ{T,S<:AbstractMatrix} <: LinearAlgebra.AbstractQ{T} 10 | factors::CuMatrix{T} 11 | τ::CuVector{T} 12 | CuQRPackedQ{T,S}(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T,S<:AbstractMatrix} = new(factors, τ) 13 | end 14 | 15 | CuQR(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T} = CuQR{T,typeof(factors)}(factors, τ) 16 | CuQRPackedQ(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T} = CuQRPackedQ{T,typeof(factors)}(factors, τ) 17 | 18 | LinearAlgebra.qr!(A::CuMatrix{T}) where T = CuQR(geqrf!(A::CuMatrix{T})...) 19 | Base.size(A::CuQR) = size(A.factors) 20 | Base.size(A::CuQRPackedQ, dim::Integer) = 0 < dim ? (dim <= 2 ? 
size(A.factors, 1) : 1) : throw(BoundsError()) 21 | CuArrays.CuMatrix(A::CuQRPackedQ) = orgqr!(copy(A.factors), A.τ) 22 | CuArrays.CuArray(A::CuQRPackedQ) = convert(CuMatrix, A) 23 | Base.Matrix(A::CuQRPackedQ) = Matrix(CuMatrix(A)) 24 | 25 | function Base.getproperty(A::CuQR, d::Symbol) 26 | m, n = size(getfield(A, :factors)) 27 | if d == :R 28 | return triu!(A.factors[1:min(m, n), 1:n]) 29 | elseif d == :Q 30 | return CuQRPackedQ(A.factors, A.τ) 31 | else 32 | getfield(A, d) 33 | end 34 | end 35 | 36 | # iteration for destructuring into components 37 | Base.iterate(S::CuQR) = (S.Q, Val(:R)) 38 | Base.iterate(S::CuQR, ::Val{:R}) = (S.R, Val(:done)) 39 | Base.iterate(S::CuQR, ::Val{:done}) = nothing 40 | 41 | # Apply changes Q from the left 42 | LinearAlgebra.lmul!(A::CuQRPackedQ{T,S}, B::CuVecOrMat{T}) where {T<:Number, S<:CuMatrix} = 43 | ormqr!('L', 'N', A.factors, A.τ, B) 44 | LinearAlgebra.lmul!(adjA::Adjoint{T,<:CuQRPackedQ{T,S}}, B::CuVecOrMat{T}) where {T<:Real, S<:CuMatrix} = 45 | ormqr!('L', 'T', parent(adjA).factors, parent(adjA).τ, B) 46 | LinearAlgebra.lmul!(adjA::Adjoint{T,<:CuQRPackedQ{T,S}}, B::CuVecOrMat{T}) where {T<:Complex, S<:CuMatrix} = 47 | ormqr!('L', 'C', parent(adjA).factors, parent(adjA).τ, B) 48 | LinearAlgebra.lmul!(trA::Transpose{T,<:CuQRPackedQ{T,S}}, B::CuVecOrMat{T}) where {T<:Number, S<:CuMatrix} = 49 | ormqr!('L', 'T', parent(trA).factors, parent(trA).τ, B) 50 | 51 | function Base.getindex(A::CuQRPackedQ{T, S}, i::Integer, j::Integer) where {T, S} 52 | x = CuArray{T}(undef, size(A, 2)) .= 0 53 | x[j] = 1 54 | lmul!(A, x) 55 | return _getindex(x, i) 56 | end 57 | 58 | function Base.show(io::IO, F::CuQR) 59 | println(io, "$(typeof(F)) with factors Q and R:") 60 | show(io, F.Q) 61 | println(io) 62 | show(io, F.R) 63 | end 64 | 65 | # Singular Value Decomposition 66 | 67 | struct CuSVD{T,Tr,A<:AbstractMatrix{T}} <: LinearAlgebra.Factorization{T} 68 | U::CuMatrix{T} 69 | S::CuVector{Tr} 70 | V::A 71 | end 72 | 73 | # iteration for destructuring into components 74 | Base.iterate(S::CuSVD) = (S.U, Val(:S)) 75 | Base.iterate(S::CuSVD, ::Val{:S}) = (S.S, Val(:V)) 76 | Base.iterate(S::CuSVD, ::Val{:V}) = (S.V, Val(:done)) 77 | Base.iterate(S::CuSVD, ::Val{:done}) = nothing 78 | 79 | @inline function Base.getproperty(S::CuSVD, s::Symbol) 80 | if s === :Vt 81 | return getfield(S, :V)' 82 | else 83 | return getfield(S, s) 84 | end 85 | end 86 | 87 | @enum SVDAlgorithm QRAlgorithm JacobiAlgorithm 88 | function LinearAlgebra.svd!(A::CuMatrix{T}, method::SVDAlgorithm=JacobiAlgorithm; full::Bool=false) where T 89 | if method === QRAlgorithm 90 | U, s, Vt = gesvd!(full ? 'A' : 'S', full ? 'A' : 'S', A::CuMatrix{T}) 91 | return CuSVD(U, s, Vt') 92 | elseif method === JacobiAlgorithm 93 | return CuSVD(gesvdj!('V', Int(!full), A::CuMatrix{T})...) 
94 | end 95 | end 96 | # Once LinearAlgebra.svd(::AbstractMatrix) accepts kwargs this method can be deleted 97 | LinearAlgebra.svd(A::CuMatrix, method::SVDAlgorithm=JacobiAlgorithm; full=false) = svd!(copy(A), method, full=full) 98 | 99 | function LinearAlgebra.svdvals!(A::CuMatrix{T}, method::SVDAlgorithm=JacobiAlgorithm) where T 100 | if method === QRAlgorithm 101 | return gesvd!('N', 'N', A::CuMatrix{T})[2] 102 | elseif method === JacobiAlgorithm 103 | return gesvdj!('N', 1, A::CuMatrix{T})[2] 104 | end 105 | end 106 | # Once LinearAlgebra.svdvals(::AbstractMatrix) accepts kwargs this method can be deleted 107 | LinearAlgebra.svdvals(A::CuMatrix, method::SVDAlgorithm=JacobiAlgorithm) = svdvals!(copy(A), method) 108 | -------------------------------------------------------------------------------- /src/rand/highlevel.jl: -------------------------------------------------------------------------------- 1 | # high-level interface for CURAND 2 | # 3 | # the interface is split in two levels: 4 | # - functions that extend the Random standard library, and take an RNG as first argument, 5 | # will only ever dispatch to CURAND and as a result are limited in the types they support. 6 | # - functions that take an array will dispatch to either CURAND or GPUArrays 7 | # - `cu`-prefixed functions are provided for constructing GPU arrays from only an eltype 8 | 9 | 10 | ## seeding 11 | 12 | seed!(rng::RNG=generator()) = generate_seeds(rng) 13 | 14 | 15 | ## in-place 16 | 17 | # uniform 18 | Random.rand!(rng::RNG, A::CuArray{Float32}) = generate_uniform(rng, A) 19 | Random.rand!(rng::RNG, A::CuArray{Float64}) = generate_uniform_double(rng, A) 20 | 21 | # normal 22 | Random.randn!(rng::RNG, A::CuArray{Float32}; mean=0, stddev=1) = generate_normal(rng, A, mean, stddev) 23 | Random.randn!(rng::RNG, A::CuArray{Float64}; mean=0, stddev=1) = generate_normal_double(rng, A, mean, stddev) 24 | 25 | # log-normal 26 | rand_logn!(rng::RNG, A::CuArray{Float32}; mean=0, stddev=1) = generate_log_normal(rng, A, mean, stddev) 27 | rand_logn!(rng::RNG, A::CuArray{Float64}; mean=0, stddev=1) = generate_log_normal_double(rng, A, mean, stddev) 28 | 29 | # log-normal 30 | rand_poisson!(rng::RNG, A::CuArray{Cuint}; lambda=1) = generate_poisson(rng, A, lambda) 31 | 32 | 33 | ## out of place 34 | 35 | Random.rand(rng::RNG, ::Type{X}, dims::Dims) where {X} = rand!(rng, CuArray{X}(undef, dims)) 36 | Random.randn(rng::RNG, ::Type{X}, dims::Dims; kwargs...) where {X} = randn!(rng, CuArray{X}(undef, dims); kwargs...) 37 | rand_logn(rng::RNG, ::Type{X}, dims::Dims; kwargs...) where {X} = rand_logn!(rng, CuArray{X}(undef, dims); kwargs...) 38 | rand_poisson(rng::RNG, ::Type{X}, dims::Dims; kwargs...) where {X} = rand_poisson!(rng, CuArray{X}(undef, dims); kwargs...) 39 | 40 | # specify default types 41 | Random.rand(rng::RNG, dims::Integer...; kwargs...) = rand(rng, Float32, dims...; kwargs...) 42 | Random.randn(rng::RNG, dims::Integer...; kwargs...) = randn(rng, Float32, dims...; kwargs...) 43 | rand_logn(rng::RNG, dims...; kwargs...) = rand_logn(rng, Float32, dims...; kwargs...) 44 | rand_poisson(rng::RNG, dims...; kwargs...) = rand_poisson(rng, Cuint, dims...; kwargs...) 45 | 46 | # convenience 47 | Random.randn(rng::RNG, ::Type{X}, dim1::Integer, dims::Integer...; kwargs...) where {X} = 48 | randn(rng, X, Dims((dim1, dims...)); kwargs...) 49 | rand_logn(rng::RNG, ::Type{X}, dim1::Integer, dims::Integer...; kwargs...) where {X} = 50 | rand_logn(rng, X, Dims((dim1, dims...)); kwargs...) 
51 | rand_poisson(rng::RNG, ::Type{X}, dim1::Integer, dims::Integer...; kwargs...) where {X} = 52 | rand_poisson(rng, X, Dims((dim1, dims...)); kwargs...) 53 | 54 | 55 | ## functions that dispatch to either CURAND or GPUArrays 56 | 57 | uniform_rng(::CuArray{<:Union{Float32,Float64}}) = generator() 58 | uniform_rng(A::CuArray) = GPUArrays.global_rng(A) 59 | 60 | normal_rng(::CuArray{<:Union{Float32,Float64}}) = generator() 61 | normal_rng(::CuArray{T}) where {T} = 62 | error("CuArrays does not support generating normally distributed numbers of type $T") 63 | 64 | logn_rng(::CuArray{<:Union{Float32,Float64}}) = generator() 65 | logn_rng(::CuArray{T}) where {T} = 66 | error("CuArrays does not support generating lognormally distributed numbers of type $T") 67 | 68 | poisson_rng(::CuArray{Cuint}) = generator() 69 | poisson_rng(::CuArray{T}) where {T} = 70 | error("CuArrays does not support generating Poisson distributed numbers of type $T") 71 | 72 | 73 | Random.rand!(A::CuArray; kwargs...) = rand!(uniform_rng(A), A; kwargs...) 74 | Random.randn!(A::CuArray; kwargs...) = randn!(normal_rng(A), A; kwargs...) 75 | rand_logn!(A::CuArray; kwargs...) = rand_logn!(logn_rng(A), A; kwargs...) 76 | rand_poisson!(A::CuArray; kwargs...) = rand_poisson!(poisson_rng(A), A; kwargs...) 77 | 78 | 79 | # need to prefix with `cu` to disambiguate from Random functions that return an Array 80 | # TODO: `@gpu rand` with Cassette 81 | curand(::Type{X}, args...; kwargs...) where {X} = rand!(CuArray{X}(undef, args...); kwargs...) 82 | curandn(::Type{X}, args...; kwargs...) where {X} = randn!(CuArray{X}(undef, args...); kwargs...) 83 | curand_logn(::Type{X}, args...; kwargs...) where {X} = rand_logn!(CuArray{X}(undef, args...); kwargs...) 84 | curand_poisson(::Type{X}, args...; kwargs...) where {X} = rand_poisson!(CuArray{X}(undef, args...); kwargs...) 85 | 86 | # specify default types 87 | curand(args...; kwargs...) where {X} = curand(Float32, args...; kwargs...) 88 | curandn(args...; kwargs...) where {X} = curandn(Float32, args...; kwargs...) 89 | curand_logn(args...; kwargs...) where {X} = curand_logn(Float32, args...; kwargs...) 90 | curand_poisson(args...; kwargs...) where {X} = curand_poisson(Cuint, args...; kwargs...) 
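# Editorial sketch, not part of the original file: a hypothetical smoke test
# touching the three API layers described at the top of this file. It assumes a
# working CURAND setup and is only defined, never run, at include time.
function _curand_demo()
    A = CuArray{Float32}(undef, 4, 4)
    Random.rand!(A)                    # array method; dispatches via uniform_rng(A)
    B = curandn(Float32, 4, 4)         # `cu`-prefixed constructor, standard normal
    C = rand_poisson(generator(), Cuint, (4, 4); lambda=5)  # RNG-first method
    return A, B, C
end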
91 | -------------------------------------------------------------------------------- /src/sparse/libcusparse.jl: -------------------------------------------------------------------------------- 1 | # low-level wrappers of the CUSPARSE library 2 | 3 | #helper functions 4 | function cusparseCreate() 5 | handle = Ref{cusparseHandle_t}() 6 | @check ccall( (:cusparseCreate, libcusparse), cusparseStatus_t, (Ptr{cusparseHandle_t},), handle) 7 | handle[] 8 | end 9 | 10 | function cusparseDestroy(handle) 11 | @check ccall( (:cusparseDestroy, libcusparse), cusparseStatus_t, (cusparseHandle_t,), handle) 12 | end 13 | 14 | function cusparseGetVersion(handle, version) 15 | @check ccall( (:cusparseGetVersion, libcusparse), cusparseStatus_t, (cusparseHandle_t, Ptr{Cint}), handle, version) 16 | end 17 | 18 | function cusparseSetStream(handle, streamId) 19 | @check ccall( (:cusparseSetStream, libcusparse), cusparseStatus_t, (cusparseHandle_t, CuStream_t), handle, streamId) 20 | end 21 | 22 | function cusparseGetStream(handle, streamId) 23 | @check ccall( (:cusparseGetStream, libcusparse), cusparseStatus_t, (cusparseHandle_t, Ptr{CuStream_t}), handle, streamId) 24 | end 25 | 26 | function cusparseGetPointerMode(handle, mode) 27 | @check ccall( (:cusparseGetPointerMode, libcusparse), cusparseStatus_t, (cusparseHandle_t, Ptr{cusparsePointerMode_t}), handle, mode) 28 | end 29 | 30 | function cusparseSetPointerMode(handle, mode) 31 | @check ccall( (:cusparseSetPointerMode, libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparsePointerMode_t), handle, mode) 32 | end 33 | 34 | function cusparseCreateHybMat(hybA) 35 | @check ccall( (:cusparseCreateHybMat, libcusparse), cusparseStatus_t, (Ptr{cusparseHybMat_t},), hybA) 36 | end 37 | 38 | function cusparseDestroyHybMat(hybA) 39 | @check ccall( (:cusparseDestroyHybMat, libcusparse), cusparseStatus_t, (cusparseHybMat_t,), hybA) 40 | end 41 | 42 | function cusparseCreateSolveAnalysisInfo(info) 43 | @check ccall( (:cusparseCreateSolveAnalysisInfo, libcusparse), cusparseStatus_t, (Ptr{cusparseSolveAnalysisInfo_t},), info) 44 | end 45 | 46 | function cusparseDestroySolveAnalysisInfo(info) 47 | @check ccall( (:cusparseDestroySolveAnalysisInfo, libcusparse), cusparseStatus_t, (cusparseSolveAnalysisInfo_t,), info) 48 | end 49 | 50 | function cusparseCreateBsrsm2Info(info) 51 | @check ccall( (:cusparseCreateBsrsm2Info, libcusparse), cusparseStatus_t, (Ptr{bsrsm2Info_t},), info) 52 | end 53 | 54 | function cusparseDestroyBsrsm2Info(info) 55 | @check ccall( (:cusparseDestroyBsrsm2Info, libcusparse), cusparseStatus_t, (bsrsm2Info_t,), info) 56 | end 57 | 58 | function cusparseCreateBsrsv2Info(info) 59 | @check ccall( (:cusparseCreateBsrsv2Info, libcusparse), cusparseStatus_t, (Ptr{bsrsv2Info_t},), info) 60 | end 61 | 62 | function cusparseDestroyBsrsv2Info(info) 63 | @check ccall( (:cusparseDestroyBsrsv2Info, libcusparse), cusparseStatus_t, (bsrsv2Info_t,), info) 64 | end 65 | 66 | function cusparseCreateCsrsv2Info(info) 67 | @check ccall( (:cusparseCreateCsrsv2Info, libcusparse), cusparseStatus_t, (Ptr{csrsv2Info_t},), info) 68 | end 69 | 70 | function cusparseDestroyCsrsv2Info(info) 71 | @check ccall( (:cusparseDestroyCsrsv2Info, libcusparse), cusparseStatus_t, (csrsv2Info_t,), info) 72 | end 73 | 74 | function cusparseCreateCsric02Info(info) 75 | @check ccall( (:cusparseCreateCsric02Info, libcusparse), cusparseStatus_t, (Ptr{csric02Info_t},), info) 76 | end 77 | 78 | function cusparseDestroyCsric02Info(info) 79 | @check ccall( (:cusparseDestroyCsric02Info, libcusparse), 
cusparseStatus_t, (csric02Info_t,), info) 80 | end 81 | 82 | function cusparseCreateCsrilu02Info(info) 83 | @check ccall( (:cusparseCreateCsrilu02Info, libcusparse), cusparseStatus_t, (Ptr{csrilu02Info_t},), info) 84 | end 85 | 86 | function cusparseDestroyCsrilu02Info(info) 87 | @check ccall( (:cusparseDestroyCsrilu02Info, libcusparse), cusparseStatus_t, (csrilu02Info_t,), info) 88 | end 89 | 90 | function cusparseCreateBsric02Info(info) 91 | @check ccall( (:cusparseCreateBsric02Info, libcusparse), cusparseStatus_t, (Ptr{bsric02Info_t},), info) 92 | end 93 | 94 | function cusparseDestroyBsric02Info(info) 95 | @check ccall( (:cusparseDestroyBsric02Info, libcusparse), cusparseStatus_t, (bsric02Info_t,), info) 96 | end 97 | 98 | function cusparseCreateBsrilu02Info(info) 99 | @check ccall( (:cusparseCreateBsrilu02Info, libcusparse), cusparseStatus_t, (Ptr{bsrilu02Info_t},), info) 100 | end 101 | 102 | function cusparseDestroyBsrilu02Info(info) 103 | @check ccall( (:cusparseDestroyBsrilu02Info, libcusparse), cusparseStatus_t, (bsrilu02Info_t,), info) 104 | end 105 | 106 | function cusparseGetProperty(property::CUDAapi.libraryPropertyType) 107 | value_ref = Ref{Cint}() 108 | @check ccall((:cusparseGetProperty, libcusparse), 109 | cusparseStatus_t, 110 | (Cint, Ptr{Cint}), 111 | property, value_ref) 112 | value_ref[] 113 | end 114 | -------------------------------------------------------------------------------- /src/dnn/nnlib.jl: -------------------------------------------------------------------------------- 1 | using NNlib 2 | import NNlib: conv!, ∇conv_filter!, ∇conv_data!, 3 | maxpool!, meanpool!, ∇maxpool!, ∇meanpool!, 4 | softmax, softmax!, ∇softmax!, logsoftmax, logsoftmax!, ∇logsoftmax 5 | import ..CuArrays: CuVecOrMat, CuVector 6 | using CUDAnative 7 | 8 | 9 | # Softmax 10 | 11 | const CUDNNFloat = Union{Float16,Float32,Float64} 12 | 13 | reshape4D(x::AbstractVector) = reshape(x, 1, 1, length(x), 1) 14 | reshape4D(x::AbstractMatrix) = reshape(x, 1, 1, size(x)...) 
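# Editorial note, not part of the original file: cuDNN's softmax entry points
# operate on 4-D NCHW tensors, so the helpers above lift vectors and matrices
# into that layout: features land in the C dimension and batch columns in N.
#
#   size(reshape4D(rand(Float32, 3)))     # (1, 1, 3, 1)
#   size(reshape4D(rand(Float32, 3, 5)))  # (1, 1, 3, 5)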
15 | 16 | function softmax!(out::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 17 | cudnnSoftmaxForward(reshape4D(xs), reshape4D(out)) 18 | return out 19 | end 20 | 21 | function ∇softmax!(out::CuVecOrMat{T}, Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 22 | cudnnSoftmaxBackward(reshape4D(softmax(xs)), reshape4D(Δ), reshape4D(out)) 23 | return out 24 | end 25 | 26 | function logsoftmax!(out::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 27 | cudnnSoftmaxForward(reshape4D(xs), reshape4D(out), algorithm=CUDNN_SOFTMAX_LOG) 28 | return out 29 | end 30 | 31 | function ∇logsoftmax!(out::CuVecOrMat{T}, Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 32 | cudnnSoftmaxBackward(reshape4D(logsoftmax(xs)), reshape4D(Δ), reshape4D(out); 33 | algorithm=CUDNN_SOFTMAX_LOG) 34 | return out 35 | end 36 | 37 | ∇logsoftmax(Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat = 38 | ∇logsoftmax!(similar(xs), Δ, xs) 39 | 40 | 41 | # Convolution 42 | 43 | function conv!(y::CuArray{T}, x::CuArray{T}, w::CuArray{T}; 44 | pad=0, stride=1, flipkernel=0, alpha=1, dilation=1, algo=0) where T<:CUDNNFloat 45 | if version() < v"6" 46 | all(x -> x == 1, dilation) || error("Only dilation = 1 is supported in cuDNN version < 6") 47 | end 48 | 49 | workspace_size = 50 | cudnnGetConvolutionForwardWorkspaceSize(y, x, w, padding=pad, stride=stride, dilation=dilation, 51 | algo=algo, mode=flipkernel) 52 | 53 | CuVector{UInt8}(undef, workspace_size) do workspace 54 | cudnnConvolutionForward(y, x, w, padding=pad, stride=stride, dilation=dilation, mode=flipkernel, 55 | alpha=alpha, algo=algo, workspace=workspace, workspace_size=workspace_size) 56 | end 57 | end 58 | 59 | function ∇conv_filter!(dw::CuArray{T}, dy::CuArray{T}, x::CuArray{T}; 60 | pad=0, stride=1, flipkernel=0, alpha=1, dilation=1, algo=0) where T<:CUDNNFloat 61 | if version() < v"6" 62 | all(x -> x == 1, dilation) || error("Only dilation = 1 is supported in cuDNN version < 6") 63 | end 64 | 65 | workspace_size = 66 | cudnnGetConvolutionBackwardFilterWorkspaceSize(dw, x, dy, padding=pad, stride=stride, 67 | dilation=dilation, algo=algo, mode=flipkernel) 68 | 69 | CuVector{UInt8}(undef, workspace_size) do workspace 70 | cudnnConvolutionBackwardFilter(dw, x, dy, padding=pad, stride=stride, dilation=dilation, 71 | mode=flipkernel, alpha=alpha, algo=algo, workspace=workspace, 72 | workspace_size=workspace_size) 73 | end 74 | end 75 | 76 | function ∇conv_data!(dx::CuArray{T}, dy::CuArray{T}, w::CuArray{T}; 77 | pad=0, stride=1, flipkernel=0, alpha=1, dilation=1, algo=0) where T<:CUDNNFloat 78 | if version() < v"6" 79 | all(x -> x == 1, dilation) || error("Only dilation = 1 is supported in cuDNN version < 6") 80 | end 81 | 82 | workspace_size = 83 | cudnnGetConvolutionBackwardDataWorkspaceSize(dx, w, dy, padding=pad, stride=stride, 84 | dilation=dilation, algo=algo, mode=flipkernel) 85 | CuVector{UInt8}(undef, workspace_size) do workspace 86 | cudnnConvolutionBackwardData(dx, w, dy, padding=pad, stride=stride, dilation=dilation, 87 | mode=flipkernel, alpha=alpha, algo=algo, workspace=workspace, 88 | workspace_size=workspace_size) 89 | end 90 | end 91 | 92 | ∇conv_bias!(db::CuArray{T}, dy::CuArray{T}; alpha=1, beta=0) where T<:CUDNNFloat = 93 | cudnnConvolutionBackwardBias(db, dy, alpha=alpha, beta=beta) 94 | 95 | maxpool!(y::CuArray{T}, x::CuArray{T}, k; pad=map(_->0,k), stride=k) where T<:CUDNNFloat = 96 | cudnnPoolingForward(y, x, window=k, padding=pad, stride=stride, mode=0) 97 | 98 | ∇maxpool!(dx::CuArray{T}, dy::CuArray{T}, 
y::CuArray{T}, x::CuArray{T}, k; 99 | pad=map(_->0,k), stride=k) where T<:CUDNNFloat = 100 | cudnnPoolingBackward(dx, dy, x, y, window=k, padding=pad, stride=stride, mode=0) 101 | 102 | meanpool!(y::CuArray{T}, x::CuArray{T}, k; pad=map(_->0,k), stride=k) where T<:CUDNNFloat = 103 | cudnnPoolingForward(y, x, window=k, padding=pad, stride=stride, mode=1) 104 | 105 | ∇meanpool!(dx::CuArray{T}, dy::CuArray{T}, y::CuArray{T}, x::CuArray{T}, k; 106 | pad=map(_->0,k), stride=k) where T<:CUDNNFloat = 107 | cudnnPoolingBackward(dx, dy, x, y, window=k, padding=pad, stride=stride, mode=1) 108 | -------------------------------------------------------------------------------- /test/sparse_solver.jl: -------------------------------------------------------------------------------- 1 | @testset "CUSPARSE + CUSOLVER" begin 2 | 3 | if isdefined(CuArrays, :CUSPARSE) && isdefined(CuArrays, :CUSOLVER) 4 | using CuArrays.CUSOLVER 5 | using CuArrays.CUSPARSE 6 | 7 | using LinearAlgebra 8 | using SparseArrays 9 | 10 | m = 15 11 | n = 10 12 | l = 13 13 | k = 1 14 | 15 | @testset for elty in [Float32, Float64, ComplexF32, ComplexF64] 16 | @testset "csrlsvlu!" begin 17 | A = sparse(rand(elty,n,n)) 18 | b = rand(elty,n) 19 | x = zeros(elty,n) 20 | tol = convert(real(elty),1e-6) 21 | x = CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 22 | @test x ≈ Array(A)\b 23 | A = sparse(rand(elty,m,n)) 24 | @test_throws DimensionMismatch CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 25 | A = sparse(rand(elty,n,n)) 26 | b = rand(elty,m) 27 | x = zeros(elty,n) 28 | @test_throws DimensionMismatch CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 29 | b = rand(elty,n) 30 | x = zeros(elty,m) 31 | @test_throws DimensionMismatch CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 32 | end 33 | 34 | @testset "csrlsvqr!" begin 35 | A = sparse(rand(elty,n,n)) 36 | d_A = CuSparseMatrixCSR(A) 37 | b = rand(elty,n) 38 | d_b = CuArray(b) 39 | x = zeros(elty,n) 40 | d_x = CuArray(x) 41 | tol = convert(real(elty),1e-4) 42 | d_x = CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 43 | h_x = collect(d_x) 44 | @test h_x ≈ Array(A)\b 45 | A = sparse(rand(elty,m,n)) 46 | d_A = CuSparseMatrixCSR(A) 47 | @test_throws DimensionMismatch CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 48 | A = sparse(rand(elty,n,n)) 49 | b = rand(elty,m) 50 | x = zeros(elty,n) 51 | @test_throws DimensionMismatch CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 52 | b = rand(elty,n) 53 | x = zeros(elty,m) 54 | @test_throws DimensionMismatch CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 55 | end 56 | 57 | @testset "csrlsvchol!" 
begin 58 | A = rand(elty,n,n) 59 | A = sparse(A*A') #posdef 60 | d_A = CuSparseMatrixCSR(A) 61 | b = rand(elty,n) 62 | d_b = CuArray(b) 63 | x = zeros(elty,n) 64 | d_x = CuArray(x) 65 | tol = 10^2*eps(real(elty)) 66 | d_x = CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 67 | h_x = collect(d_x) 68 | @test h_x ≈ Array(A)\b 69 | b = rand(elty,m) 70 | d_b = CuArray(b) 71 | @test_throws DimensionMismatch CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 72 | b = rand(elty,n) 73 | d_b = CuArray(b) 74 | x = rand(elty,m) 75 | d_x = CuArray(x) 76 | @test_throws DimensionMismatch CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 77 | A = sparse(rand(elty,m,n)) 78 | d_A = CuSparseMatrixCSR(A) 79 | @test_throws DimensionMismatch CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 80 | end 81 | 82 | @testset "csreigvsi" begin 83 | A = sparse(rand(elty,n,n)) 84 | d_A = CuSparseMatrixCSR(A) 85 | evs = eigvals(Array(A)) 86 | x_0 = CuArray(rand(elty,n)) 87 | μ,x = CUSOLVER.csreigvsi(d_A,convert(elty,evs[1]),x_0,convert(real(elty),1e-6),convert(Cint,1000),'O') 88 | @test μ ≈ evs[1] 89 | A = sparse(rand(elty,m,n)) 90 | d_A = CuSparseMatrixCSR(A) 91 | @test_throws DimensionMismatch CUSOLVER.csreigvsi(d_A,convert(elty,evs[1]),x_0,convert(real(elty),1e-6),convert(Cint,1000),'O') 92 | A = sparse(rand(elty,n,n)) 93 | d_A = CuSparseMatrixCSR(A) 94 | x_0 = CuArray(rand(elty,m)) 95 | @test_throws DimensionMismatch CUSOLVER.csreigvsi(d_A,convert(elty,evs[1]),x_0,convert(real(elty),1e-6),convert(Cint,1000),'O') 96 | end 97 | @testset "csreigs" begin 98 | celty = complex(elty) 99 | A = rand(real(elty),n,n) 100 | A = sparse(A + A') 101 | num = CUSOLVER.csreigs(A,convert(celty,complex(-100,-100)),convert(celty,complex(100,100)),'O') 102 | @test num <= n 103 | A = sparse(rand(celty,m,n)) 104 | d_A = CuSparseMatrixCSR(A) 105 | @test_throws DimensionMismatch CUSOLVER.csreigs(A,convert(celty,complex(-100,-100)),convert(celty,complex(100,100)),'O') 106 | end 107 | @testset "csrlsqvqr!" begin 108 | A = sparse(rand(elty,n,n)) 109 | b = rand(elty,n) 110 | x = zeros(elty,n) 111 | tol = convert(real(elty),1e-4) 112 | x = CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 113 | @test x[1] ≈ Array(A)\b 114 | A = sparse(rand(elty,n,m)) 115 | x = zeros(elty,n) 116 | @test_throws DimensionMismatch CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 117 | A = sparse(rand(elty,n,n)) 118 | b = rand(elty,m) 119 | x = zeros(elty,n) 120 | @test_throws DimensionMismatch CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 121 | b = rand(elty,n) 122 | x = zeros(elty,m) 123 | @test_throws DimensionMismatch CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 124 | end 125 | end 126 | 127 | end 128 | 129 | end 130 | -------------------------------------------------------------------------------- /src/fft/highlevel.jl: -------------------------------------------------------------------------------- 1 | # region is an iterable subset of dimensions 2 | # spec. an integer, range, tuple, or array 3 | 4 | # inplace complex 5 | function plan_fft!(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 6 | K = CUFFT_FORWARD 7 | inplace = true 8 | xtype = (T == cufftComplex) ? CUFFT_C2C : CUFFT_Z2Z 9 | 10 | pp = _mkplan(xtype, size(X), region) 11 | 12 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 13 | end 14 | 15 | function plan_bfft!(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 16 | K = CUFFT_INVERSE 17 | inplace = true 18 | xtype = (T == cufftComplex) ? 
CUFFT_C2C : CUFFT_Z2Z 19 | 20 | pp = _mkplan(xtype, size(X), region) 21 | 22 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 23 | end 24 | 25 | # out-of-place complex 26 | function plan_fft(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 27 | K = CUFFT_FORWARD 28 | xtype = (T == cufftComplex) ? CUFFT_C2C : CUFFT_Z2Z 29 | inplace = false 30 | 31 | pp = _mkplan(xtype, size(X), region) 32 | 33 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 34 | end 35 | 36 | function plan_bfft(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 37 | K = CUFFT_INVERSE 38 | inplace = false 39 | xtype = (T == cufftComplex) ? CUFFT_C2C : CUFFT_Z2Z 40 | 41 | pp = _mkplan(xtype, size(X), region) 42 | 43 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 44 | end 45 | 46 | # out-of-place real-to-complex 47 | function plan_rfft(X::CuArray{T,N}, region) where {T<:cufftReals,N} 48 | K = CUFFT_FORWARD 49 | inplace = false 50 | xtype = (T == cufftReal) ? CUFFT_R2C : CUFFT_D2Z 51 | 52 | pp = _mkplan(xtype, size(X), region) 53 | 54 | ydims = collect(size(X)) 55 | ydims[region[1]] = div(ydims[region[1]],2)+1 56 | 57 | rCuFFTPlan{T,K,inplace,N}(pp, X, (ydims...,), region, xtype) 58 | end 59 | 60 | function plan_brfft(X::CuArray{T,N}, d::Integer, region::Any) where {T<:cufftComplexes,N} 61 | K = CUFFT_INVERSE 62 | inplace = false 63 | xtype = (T == cufftComplex) ? CUFFT_C2R : CUFFT_Z2D 64 | ydims = collect(size(X)) 65 | ydims[region[1]] = d 66 | 67 | pp = _mkplan(xtype, (ydims...,), region) 68 | 69 | rCuFFTPlan{T,K,inplace,N}(pp, X, (ydims...,), region, xtype) 70 | end 71 | 72 | # FIXME: plan_inv methods allocate needlessly (to provide type parameters) 73 | # Perhaps use FakeArray types to avoid this. 74 | 75 | function plan_inv(p::cCuFFTPlan{T,CUFFT_FORWARD,inplace,N}) where {T,N,inplace} 76 | X = CuArray{T}(undef, p.sz) 77 | pp = _mkplan(p.xtype, p.sz, p.region) 78 | ScaledPlan(cCuFFTPlan{T,CUFFT_INVERSE,inplace,N}(pp, X, p.sz, p.region, 79 | p.xtype), 80 | normalization(X, p.region)) 81 | end 82 | 83 | function plan_inv(p::cCuFFTPlan{T,CUFFT_INVERSE,inplace,N}) where {T,N,inplace} 84 | X = CuArray{T}(undef, p.sz) 85 | pp = _mkplan(p.xtype, p.sz, p.region) 86 | ScaledPlan(cCuFFTPlan{T,CUFFT_FORWARD,inplace,N}(pp, X, p.sz, p.region, 87 | p.xtype), 88 | normalization(X, p.region)) 89 | end 90 | 91 | function plan_inv(p::rCuFFTPlan{T,CUFFT_INVERSE,inplace,N} 92 | ) where {T<:cufftComplexes,N,inplace} 93 | X = CuArray{real(T)}(undef, p.osz) 94 | Y = CuArray{T}(undef, p.sz) 95 | xtype = p.xtype == CUFFT_C2R ? CUFFT_R2C : CUFFT_D2Z 96 | pp = _mkplan(xtype, p.osz, p.region) 97 | ScaledPlan(rCuFFTPlan{real(T),CUFFT_FORWARD,inplace,N}(pp, X, p.sz, p.region, 98 | xtype), 99 | normalization(X, p.region)) 100 | end 101 | 102 | function plan_inv(p::rCuFFTPlan{T,CUFFT_FORWARD,inplace,N} 103 | ) where {T<:cufftReals,N,inplace} 104 | X = CuArray{complex(T)}(undef, p.osz) 105 | Y = CuArray{T}(undef, p.sz) 106 | xtype = p.xtype == CUFFT_R2C ? 
CUFFT_C2R : CUFFT_Z2D 107 | pp = _mkplan(xtype, p.sz, p.region) 108 | ScaledPlan(rCuFFTPlan{complex(T),CUFFT_INVERSE,inplace,N}(pp, X, p.sz, 109 | p.region, xtype), 110 | normalization(Y, p.region)) 111 | end 112 | 113 | 114 | # The rest of the standard API 115 | 116 | size(p::CuFFTPlan) = p.sz 117 | 118 | function mul!(y::CuArray{Ty}, p::CuFFTPlan{T,K,false}, x::CuArray{T} 119 | ) where {Ty,T,K} 120 | assert_applicable(p,x,y) 121 | unsafe_execute!(p,x,y) 122 | return y 123 | end 124 | 125 | function *(p::cCuFFTPlan{T,K,true,N}, x::CuArray{T,N}) where {T,K,N} 126 | assert_applicable(p,x) 127 | unsafe_execute!(p,x) 128 | x 129 | end 130 | 131 | function *(p::rCuFFTPlan{T,CUFFT_FORWARD,false,N}, x::CuArray{T,N} 132 | ) where {T<:cufftReals,N} 133 | @assert p.xtype ∈ [CUFFT_R2C,CUFFT_D2Z] 134 | y = CuArray{complex(T),N}(undef, p.osz) 135 | mul!(y,p,x) 136 | y 137 | end 138 | 139 | function *(p::rCuFFTPlan{T,CUFFT_INVERSE,false,N}, x::CuArray{T,N} 140 | ) where {T<:cufftComplexes,N} 141 | @assert p.xtype ∈ [CUFFT_C2R,CUFFT_Z2D] 142 | y = CuArray{real(T),N}(undef, p.osz) 143 | mul!(y,p,x) 144 | y 145 | end 146 | 147 | function *(p::cCuFFTPlan{T,K,false,N}, x::CuArray{T,N}) where {T,K,N} 148 | y = CuArray{T,N}(undef, p.osz) 149 | mul!(y,p,x) 150 | y 151 | end -------------------------------------------------------------------------------- /src/dnn/helpers.jl: -------------------------------------------------------------------------------- 1 | # For low level cudnn functions that require a pointer to a number 2 | cptr(x,a::CuArray{Float64})=Float64[x] 3 | cptr(x,a::CuArray{Float32})=Float32[x] 4 | cptr(x,a::CuArray{Float16})=Float32[x] 5 | 6 | # Conversion between Julia and CUDNN datatypes 7 | cudnnDataType(::Type{Float16})=CUDNN_DATA_HALF 8 | cudnnDataType(::Type{Float32})=CUDNN_DATA_FLOAT 9 | cudnnDataType(::Type{Float64})=CUDNN_DATA_DOUBLE 10 | juliaDataType(a)=(a==CUDNN_DATA_HALF ? Float16 : 11 | a==CUDNN_DATA_FLOAT ? Float32 : 12 | a==CUDNN_DATA_DOUBLE ? 
Float64 : error()) 13 | 14 | tuple_strides(A::Tuple) = _strides((1,), A) 15 | _strides(out::Tuple{Int}, A::Tuple{}) = () 16 | _strides(out::NTuple{N,Int}, A::NTuple{N}) where {N} = out 17 | function _strides(out::NTuple{M,Int}, A::Tuple) where M 18 | Base.@_inline_meta 19 | _strides((out..., out[M]*A[M]), A) 20 | end 21 | 22 | # Descriptors 23 | 24 | mutable struct TensorDesc; ptr; end 25 | free(td::TensorDesc) = cudnnDestroyTensorDescriptor(td.ptr) 26 | Base.unsafe_convert(::Type{cudnnTensorDescriptor_t}, td::TensorDesc) = td.ptr 27 | Base.unsafe_convert(::Type{Ptr{Nothing}}, td::TensorDesc) = convert(Ptr{Nothing}, td.ptr) 28 | 29 | function TensorDesc(T::Type, size::NTuple{N,Integer}, strides::NTuple{N,Integer} = tuple_strides(size)) where N 30 | sz = Cint.(size) |> reverse |> collect 31 | st = Cint.(strides) |> reverse |> collect 32 | d = Ref{cudnnTensorDescriptor_t}() 33 | cudnnCreateTensorDescriptor(d) 34 | cudnnSetTensorNdDescriptor(d[], cudnnDataType(T), length(sz), sz, st) 35 | this = TensorDesc(d[]) 36 | finalizer(free, this) 37 | return this 38 | end 39 | 40 | TensorDesc(a::CuArray) = TensorDesc(eltype(a), size(a), strides(a)) 41 | 42 | mutable struct FilterDesc 43 | ptr 44 | end 45 | free(fd::FilterDesc)=cudnnDestroyFilterDescriptor(fd.ptr) 46 | Base.unsafe_convert(::Type{cudnnFilterDescriptor_t}, fd::FilterDesc)=fd.ptr 47 | Base.unsafe_convert(::Type{Ptr{Nothing}}, fd::FilterDesc)=fd.ptr 48 | 49 | function createFilterDesc() 50 | d = Ref{cudnnFilterDescriptor_t}() 51 | @check cudnnCreateFilterDescriptor(d) 52 | return d[] 53 | end 54 | 55 | function FilterDesc(T::Type, size::Tuple; format = CUDNN_TENSOR_NCHW) 56 | # The only difference of a FilterDescriptor is no strides. 57 | sz = Cint.(size) |> reverse |> collect 58 | d = createFilterDesc() 59 | version() >= v"5" ? 60 | cudnnSetFilterNdDescriptor(d, cudnnDataType(T), format, length(sz), sz) : 61 | version() >= v"4" ? 62 | cudnnSetFilterNdDescriptor_v4(d, cudnnDataType(T), format, length(sz), sz) : 63 | cudnnSetFilterNdDescriptor(d, cudnnDataType(T), length(sz), sz) 64 | this = FilterDesc(d) 65 | finalizer(free, this) 66 | return this 67 | end 68 | 69 | FilterDesc(a::CuArray; format = CUDNN_TENSOR_NCHW) = FilterDesc(eltype(a), size(a), format = format) 70 | 71 | function Base.size(f::FilterDesc) 72 | typ = Ref{Cuint}() 73 | format = Ref{Cuint}() 74 | ndims = Ref{Cint}() 75 | dims = Vector{Cint}(undef, 8) 76 | cudnnGetFilterNdDescriptor(f, 8, typ, format, ndims, dims) 77 | @assert ndims[] ≤ 8 78 | return (dims[1:ndims[]]...,) |> reverse 79 | end 80 | 81 | mutable struct ConvDesc; ptr; end 82 | free(cd::ConvDesc) = cudnnDestroyConvolutionDescriptor(cd.ptr) 83 | Base.unsafe_convert(::Type{cudnnConvolutionDescriptor_t}, cd::ConvDesc)=cd.ptr 84 | 85 | function cdsize(w, nd) 86 | isa(w, Integer) ? Cint[fill(w,nd)...] : 87 | length(w)!=nd ? error("Dimension mismatch") : 88 | Cint[reverse(w)...] 89 | end 90 | 91 | pdsize(w, nd)=Cint[reverse(psize(w,nd))...] 92 | psize(w, nd)=(isa(w,Integer) ? fill(w,nd) : length(w) != nd ? error("Dimension mismatch") : w) 93 | 94 | function ConvDesc(T, N, padding, stride, dilation, mode) 95 | cd = Ref{cudnnConvolutionDescriptor_t}() 96 | cudnnCreateConvolutionDescriptor(cd) 97 | version() >= v"4" ? cudnnSetConvolutionNdDescriptor(cd[],N,cdsize(padding,N),cdsize(stride,N),cdsize(dilation,N),mode,cudnnDataType(T)) : 98 | version() >= v"3" ? 
cudnnSetConvolutionNdDescriptor_v3(cd[],N,cdsize(padding,N),cdsize(stride,N),cdsize(dilation,N),mode,cudnnDataType(T)) : 99 | cudnnSetConvolutionNdDescriptor(cd[],N,cdsize(padding,N),cdsize(stride,N),cdsize(dilation,N),mode) 100 | this = ConvDesc(cd[]) 101 | finalizer(free, this) 102 | return this 103 | end 104 | 105 | mutable struct PoolDesc; ptr; end 106 | free(pd::PoolDesc)=cudnnDestroyPoolingDescriptor(pd.ptr) 107 | Base.unsafe_convert(::Type{cudnnPoolingDescriptor_t}, pd::PoolDesc)=pd.ptr 108 | 109 | function PoolDesc(nd, window, padding, stride, mode, maxpoolingNanOpt=CUDNN_NOT_PROPAGATE_NAN) 110 | pd = Ref{cudnnPoolingDescriptor_t}() 111 | cudnnCreatePoolingDescriptor(pd) 112 | cudnnSetPoolingNdDescriptor(pd[],mode,maxpoolingNanOpt,nd,pdsize(window,nd),pdsize(padding,nd),pdsize(stride,nd)) 113 | this = PoolDesc(pd[]) 114 | finalizer(free, this) 115 | return this 116 | end 117 | 118 | mutable struct ActivationDesc; ptr; end 119 | free(ad::ActivationDesc)=cudnnDestroyActivationDescriptor(ad.ptr) 120 | Base.unsafe_convert(::Type{cudnnActivationDescriptor_t}, ad::ActivationDesc)=ad.ptr 121 | 122 | function ActivationDesc(mode, coeff, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN) 123 | ad = Ref{cudnnActivationDescriptor_t}() 124 | cudnnCreateActivationDescriptor(ad) 125 | cudnnSetActivationDescriptor(ad[],mode,reluNanOpt,coeff) 126 | this = ActivationDesc(ad[]) 127 | finalizer(free, this) 128 | return this 129 | end 130 | -------------------------------------------------------------------------------- /src/solver/libcusolver.jl: -------------------------------------------------------------------------------- 1 | # low-level wrappers of the CUSOLVER library 2 | 3 | #helper functions 4 | function cusolverDnCreate() 5 | handle = Ref{cusolverDnHandle_t}() 6 | @check ccall((:cusolverDnCreate, libcusolver), 7 | cusolverStatus_t, 8 | (Ptr{cusolverDnHandle_t},), 9 | handle) 10 | return handle[] 11 | end 12 | 13 | function cusolverDnDestroy(handle) 14 | @check ccall((:cusolverDnDestroy, libcusolver), 15 | cusolverStatus_t, 16 | (cusolverDnHandle_t,), 17 | handle) 18 | end 19 | 20 | function cusolverDnSetStream(handle, streamId) 21 | @check ccall((:cusolverDnSetStream, libcusolver), 22 | cusolverStatus_t, 23 | (cusolverDnHandle_t, CuStream_t), 24 | handle, streamId) 25 | end 26 | 27 | function cusolverDnGetStream(handle, streamId) 28 | @check ccall((:cusolverDnGetStream, libcusolver), 29 | cusolverStatus_t, 30 | (cusolverDnHandle_t, Ptr{CuStream_t}), 31 | handle, streamId) 32 | end 33 | 34 | function cusolverSpCreate() 35 | handle = Ref{cusolverSpHandle_t}() 36 | @check ccall((:cusolverSpCreate, libcusolver), 37 | cusolverStatus_t, 38 | (Ptr{cusolverSpHandle_t},), 39 | handle) 40 | return handle[] 41 | end 42 | 43 | function cusolverSpDestroy(handle) 44 | @check ccall((:cusolverSpDestroy, libcusolver), 45 | cusolverStatus_t, 46 | (cusolverSpHandle_t,), 47 | handle) 48 | end 49 | 50 | function cusolverSpSetStream(handle, streamId) 51 | @check ccall((:cusolverSpSetStream, libcusolver), 52 | cusolverStatus_t, 53 | (cusolverSpHandle_t, CuStream_t), 54 | handle, streamId) 55 | end 56 | 57 | function cusolverSpGetStream(handle, streamId) 58 | @check ccall((:cusolverSpGetStream, libcusolver), 59 | cusolverStatus_t, 60 | (cusolverSpHandle_t, Ptr{CuStream_t}), 61 | handle, streamId) 62 | end 63 | 64 | function cusolverSpCreateCsrqrInfo(info) 65 | @check ccall((:cusolverSpCreateCsrqrInfo, libcusolver), 66 | cusolverStatus_t, 67 | (Ptr{csrqrInfo_t},), 68 | info) 69 | end 70 | 71 | function 
cusolverSpDestroyCsrqrInfo(info) 72 | @check ccall((:cusolverSpDestroyCsrqrInfo, libcusolver), 73 | cusolverStatus_t, 74 | (csrqrInfo_t,), 75 | info) 76 | end 77 | 78 | function cusolverDnCreateGesvdjInfo(info) 79 | @check ccall((:cusolverDnCreateGesvdjInfo, libcusolver), 80 | cusolverStatus_t, 81 | (Ptr{gesvdjInfo_t},), 82 | info) 83 | end 84 | 85 | function cusolverDnDestroyGesvdjInfo(info) 86 | @check ccall((:cusolverDnDestroyGesvdjInfo, libcusolver), 87 | cusolverStatus_t, 88 | (gesvdjInfo_t,), 89 | info) 90 | end 91 | 92 | function cusolverDnXgesvdjSetTolerance(info, tolerance) 93 | @check ccall((:cusolverDnXgesvdjSetTolerance, libcusolver), 94 | cusolverStatus_t, 95 | (gesvdjInfo_t, Float64), 96 | info, Float64(tolerance)) 97 | end 98 | 99 | function cusolverDnXgesvdjSetMaxSweeps(info, max_sweeps) 100 | @check ccall((:cusolverDnXgesvdjSetMaxSweeps, libcusolver), 101 | cusolverStatus_t, 102 | (gesvdjInfo_t, Cint), 103 | info, Cint(max_sweeps)) 104 | end 105 | 106 | function cusolverDnCreateSyevjInfo(info) 107 | @check ccall((:cusolverDnCreateSyevjInfo, libcusolver), 108 | cusolverStatus_t, 109 | (Ptr{syevjInfo_t},), 110 | info) 111 | end 112 | 113 | function cusolverDnDestroySyevjInfo(info) 114 | @check ccall((:cusolverDnDestroySyevjInfo, libcusolver), 115 | cusolverStatus_t, 116 | (syevjInfo_t,), 117 | info) 118 | end 119 | 120 | function cusolverDnXsyevjSetTolerance(info, tolerance) 121 | @check ccall((:cusolverDnXsyevjSetTolerance, libcusolver), 122 | cusolverStatus_t, 123 | (syevjInfo_t, Float64), 124 | info, Float64(tolerance)) 125 | end 126 | 127 | function cusolverDnXsyevjSetMaxSweeps(info, max_sweeps) 128 | @check ccall((:cusolverDnXsyevjSetMaxSweeps, libcusolver), 129 | cusolverStatus_t, 130 | (syevjInfo_t, Cint), 131 | info, Cint(max_sweeps)) 132 | end 133 | 134 | function cusolverRfCreate(handle) 135 | @check ccall((:cusolverRfCreate, libcusolver), 136 | cusolverStatus_t, 137 | (Ptr{cusolverRfHandle_t},), 138 | handle) 139 | end 140 | 141 | function cusolverRfDestroy(handle) 142 | @check ccall((:cusolverRfDestroy, libcusolver), 143 | cusolverStatus_t, 144 | (cusolverRfHandle_t,), 145 | handle) 146 | end 147 | 148 | function cusolverRfSetStream(handle, streamId) 149 | @check ccall((:cusolverRfSetStream, libcusolver), 150 | cusolverStatus_t, 151 | (cusolverRfHandle_t, CuStream_t), 152 | handle, streamId) 153 | end 154 | 155 | function cusolverRfGetStream(handle, streamId) 156 | @check ccall((:cusolverRfGetStream, libcusolver), 157 | cusolverStatus_t, 158 | (cusolverRfHandle_t, Ptr{CuStream_t}), 159 | handle, streamId) 160 | end 161 | 162 | function cusolverGetProperty(property::CUDAapi.libraryPropertyType) 163 | value_ref = Ref{Cint}() 164 | @check ccall((:cusolverGetProperty, libcusolver), 165 | cusolverStatus_t, 166 | (Cint, Ptr{Cint}), 167 | property, value_ref) 168 | value_ref[] 169 | end 170 | -------------------------------------------------------------------------------- /src/sparse/libcusparse_types.jl: -------------------------------------------------------------------------------- 1 | #enum cusparseStatus_t 2 | #error messages from CUSPARSE 3 | 4 | """ 5 | Status messages from CUSPARSE's C API. 
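A wrapper typically compares a call's return value against `CUSPARSE_STATUS_SUCCESS`; a minimal sketch (this `check_status` helper is illustrative, not an existing function in the package):

    function check_status(status::cusparseStatus_t)
        # hypothetical helper: raise a Julia error for any non-success status code
        status == CUSPARSE_STATUS_SUCCESS || error("CUSPARSE error: status code ", status)
        return status
    end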
6 | """ 7 | const cusparseStatus_t = UInt32 8 | const CUSPARSE_STATUS_SUCCESS = 0 9 | const CUSPARSE_STATUS_NOT_INITIALIZED = 1 10 | const CUSPARSE_STATUS_ALLOC_FAILED = 2 11 | const CUSPARSE_STATUS_INVALID_VALUE = 3 12 | const CUSPARSE_STATUS_ARCH_MISMATCH = 4 13 | const CUSPARSE_STATUS_MAPPING_ERROR = 5 14 | const CUSPARSE_STATUS_EXECUTION_FAILED = 6 15 | const CUSPARSE_STATUS_INTERNAL_ERROR = 7 16 | const CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8 17 | 18 | #enum cusparseAction_t 19 | """ 20 | Perform an operation on indices only (`CUSPARSE_ACTION_SYMBOLIC`) or 21 | on both data and indices (`CUSPARSE_ACTION_NUMERIC`). Used in 22 | conversion routines. 23 | """ 24 | const cusparseAction_t = UInt32 25 | const CUSPARSE_ACTION_SYMBOLIC = 0 26 | const CUSPARSE_ACTION_NUMERIC = 1 27 | 28 | #enum cusparseDirection_t 29 | """ 30 | Parse a dense matrix by rows (`CUSPARSE_DIRECTION_ROW`) or columns 31 | (`CUSPARSE_DIRECTION_COL`) to compute its number of non-zeros. 32 | """ 33 | const cusparseDirection_t = UInt32 34 | const CUSPARSE_DIRECTION_ROW = 0 35 | const CUSPARSE_DIRECTION_COL = 1 36 | 37 | #enum cusparseHybPartition_t 38 | """ 39 | How to partition the HYB matrix in a [`CuSparseMatrixHYB`](@ref). 40 | There are three choices: 41 | * `CUSPARSE_HYB_PARTITION_AUTO` - let CUSPARSE decide internally for best performance. 42 | * `CUSPARSE_HYB_PARTITION_USER` - set the partition manually in the conversion function. 43 | * `CUSPARSE_HYB_PARTITION_MAX` - use the maximum partition, putting the matrix in ELL format. 44 | """ 45 | const cusparseHybPartition_t = UInt32 46 | const CUSPARSE_HYB_PARTITION_AUTO = 0 47 | const CUSPARSE_HYB_PARTITION_USER = 1 48 | const CUSPARSE_HYB_PARTITION_MAX = 2 49 | 50 | #enum cusparseFillMode_t 51 | """ 52 | Determines if a symmetric/Hermitian/triangular matrix has its upper 53 | (`CUSPARSE_FILL_MODE_UPPER`) or lower (`CUSPARSE_FILL_MODE_LOWER`) 54 | triangle filled. 55 | """ 56 | const cusparseFillMode_t = UInt32 57 | const CUSPARSE_FILL_MODE_LOWER = 0 58 | const CUSPARSE_FILL_MODE_UPPER = 1 59 | 60 | #enum cusparseDiagType_t 61 | """ 62 | Determines if the diagonal of a matrix is all ones (`CUSPARSE_DIAG_TYPE_UNIT`) 63 | or not all ones (`CUSPARSE_DIAG_TYPE_NON_UNIT`). 64 | """ 65 | const cusparseDiagType_t = UInt32 66 | const CUSPARSE_DIAG_TYPE_NON_UNIT = 0 67 | const CUSPARSE_DIAG_TYPE_UNIT = 1 68 | 69 | #enum cusparsePointerMode_t 70 | """ 71 | Determines if scalar arguments to a function are present on the host CPU 72 | (`CUSPARSE_POINTER_MODE_HOST`) or on the GPU (`CUSPARSE_POINTER_MODE_DEVICE`). 73 | """ 74 | const cusparsePointerMode_t = UInt32 75 | const CUSPARSE_POINTER_MODE_HOST = 0 76 | const CUSPARSE_POINTER_MODE_DEVICE = 1 77 | 78 | #enum cusparseOperation_t 79 | """ 80 | Determines whether to perform an operation, such as a matrix multiplication 81 | or solve, on the matrix as-is (`CUSPARSE_OPERATION_NON_TRANSPOSE`), on the 82 | matrix's transpose (`CUSPARSE_OPERATION_TRANSPOSE`), or on its conjugate 83 | transpose (`CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE`). 84 | """ 85 | const cusparseOperation_t = UInt32 86 | const CUSPARSE_OPERATION_NON_TRANSPOSE = 0 87 | const CUSPARSE_OPERATION_TRANSPOSE = 1 88 | const CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 89 | 90 | #enum cusparseMatrixType_t 91 | """ 92 | Indicates whether a matrix is a general matrix (`CUSPARSE_MATRIX_TYPE_GENERAL`), 93 | symmetric (`CUSPARSE_MATRIX_TYPE_SYMMETRIC`), Hermitian 94 | (`CUSPARSE_MATRIX_TYPE_HERMITIAN`), or triangular 95 | (`CUSPARSE_MATRIX_TYPE_TRIANGULAR`). 
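For illustration, this flag is combined with the other enums in this file via a [`cusparseMatDescr_t`](@ref); a sketch describing a general, one-indexed matrix (the fill-mode and diag-type values here are chosen only for the example):

    descr = cusparseMatDescr_t(CUSPARSE_MATRIX_TYPE_GENERAL,
                               CUSPARSE_FILL_MODE_LOWER,
                               CUSPARSE_DIAG_TYPE_NON_UNIT,
                               CUSPARSE_INDEX_BASE_ONE)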
Note that for some matrix types 96 | (those in [`CompressedSparse`](@ref)), this can be inferred for some function 97 | calls. 98 | """ 99 | const cusparseMatrixType_t = UInt32 100 | const CUSPARSE_MATRIX_TYPE_GENERAL = 0 101 | const CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1 102 | const CUSPARSE_MATRIX_TYPE_HERMITIAN = 2 103 | const CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3 104 | 105 | #enum cusparseSolvePolicy_t 106 | """ 107 | Indicates whether to keep level info in solvers (`CUSPARSE_SOLVE_POLICY_USE_LEVEL`) 108 | or whether to not use it (`CUSPARSE_SOLVE_POLICY_NO_LEVEL`). 109 | """ 110 | const cusparseSolvePolicy_t = UInt32 111 | const CUSPARSE_SOLVE_POLICY_NO_LEVEL = 0 112 | const CUSPARSE_SOLVE_POLICY_USE_LEVEL = 1 113 | 114 | #enum cusparseIndexBase_t 115 | """ 116 | Indicates whether a sparse object is zero-indexed (`CUSPARSE_INDEX_BASE_ZERO`) 117 | or one-indexed (`CUSPARSE_INDEX_BASE_ONE`). CUSPARSE.jl supports both. Julia 118 | sparse matrices are one-indexed, but you may wish to pass matrices from other 119 | libraries which use zero-indexing (e.g. C language ODE solvers). 120 | """ 121 | const cusparseIndexBase_t = UInt32 122 | const CUSPARSE_INDEX_BASE_ZERO = 0 123 | const CUSPARSE_INDEX_BASE_ONE = 1 124 | 125 | #struct cusparseMatDescr_t 126 | """ 127 | Describes shape and properties of a CUSPARSE matrix. A convenience wrapper. 128 | 129 | Contains: 130 | * `MatrixType` - a [`cusparseMatrixType_t`](@ref) 131 | * `FillMode` - a [`cusparseFillMode_t`](@ref) 132 | * `DiagType` - a [`cusparseDiagType_t`](@ref) 133 | * `IndexBase` - a [`cusparseIndexBase_t`](@ref) 134 | """ 135 | struct cusparseMatDescr_t 136 | MatrixType::cusparseMatrixType_t 137 | FillMode::cusparseFillMode_t 138 | DiagType::cusparseDiagType_t 139 | IndexBase::cusparseIndexBase_t 140 | function cusparseMatDescr_t(MatrixType,FillMode,DiagType,IndexBase) 141 | new(MatrixType,FillMode,DiagType,IndexBase) 142 | end 143 | end 144 | 145 | """ 146 | An opaque struct containing information about the solution approach 147 | CUSPARSE will take. Generated by [`sv_analysis`](@ref) or 148 | [`sm_analysis`](@ref) and passed to [`sv_solve!`](@ref), [`sm_solve`](@ref), 149 | [`ic0!`](@ref), or [`ilu0!`](@ref). 
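The intended pattern is to run the analysis once and reuse it across repeated solves; schematically (the argument lists below are placeholders, not the actual signatures — see the linked functions):

    info = sv_analysis(...)   # one-time structural analysis of the matrix
    sv_solve!(..., info)      # repeated triangular solves reusing the cached analysis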
150 | """ 151 | const cusparseSolveAnalysisInfo_t = Ptr{Cvoid} 152 | const bsrsm2Info_t = Ptr{Cvoid} 153 | const bsrsv2Info_t = Ptr{Cvoid} 154 | const csrsv2Info_t = Ptr{Cvoid} 155 | const csric02Info_t = Ptr{Cvoid} 156 | const csrilu02Info_t = Ptr{Cvoid} 157 | const bsric02Info_t = Ptr{Cvoid} 158 | const bsrilu02Info_t = Ptr{Cvoid} 159 | 160 | const cusparseContext = Cvoid 161 | const cusparseHandle_t = Ptr{cusparseContext} 162 | 163 | #complex numbers 164 | 165 | const cuComplex = Complex{Float32} 166 | const cuDoubleComplex = Complex{Float64} 167 | 168 | const CusparseFloat = Union{Float64,Float32,ComplexF64,ComplexF32} 169 | const CusparseReal = Union{Float64,Float32} 170 | const CusparseComplex = Union{ComplexF64,ComplexF32} 171 | 172 | const cusparseHybMat_t = Ptr{Cvoid} 173 | -------------------------------------------------------------------------------- /src/rand/libcurand.jl: -------------------------------------------------------------------------------- 1 | function create_generator(typ::Int=CURAND_RNG_PSEUDO_DEFAULT) 2 | ptr = Ref{curandGenerator_t}() 3 | @check ccall((:curandCreateGenerator, libcurand), 4 | curandStatus_t, 5 | (Ptr{curandGenerator_t}, Cint), ptr, typ) 6 | r = RNG(ptr[], typ) 7 | finalizer(destroy_generator, r) 8 | return r 9 | end 10 | 11 | function destroy_generator(rng::RNG) 12 | @check ccall((:curandDestroyGenerator, libcurand), 13 | curandStatus_t, 14 | (curandGenerator_t,), rng) 15 | end 16 | 17 | function get_version() 18 | ver = Ref{Cint}() 19 | @check ccall((:curandGetVersion, libcurand), 20 | curandStatus_t, 21 | (Ref{Cint},), ver) 22 | return ver[] 23 | end 24 | 25 | # TODO: curandSetStream 26 | 27 | function set_pseudo_random_generator_seed(rng::RNG, seed::Int64) 28 | @check ccall((:curandSetPseudoRandomGeneratorSeed, libcurand), 29 | curandStatus_t, 30 | (curandGenerator_t, Clonglong), rng, seed) 31 | end 32 | 33 | function set_generator_offset(rng::RNG, offset::Int64) 34 | @check ccall((:curandSetGeneratorOffset, libcurand), 35 | curandStatus_t, 36 | (curandGenerator_t, Clonglong), rng, offset) 37 | end 38 | 39 | function set_generator_ordering(rng::RNG, order::Int) 40 | @check ccall((:curandSetGeneratorOrdering, libcurand), 41 | curandStatus_t, 42 | (curandGenerator_t, Cint), rng, order) 43 | end 44 | 45 | function set_quasi_random_generator_dimensions(rng::RNG, num_dimensions::UInt) 46 | @check ccall((:curandSetQuasiRandomGeneratorDimensions, libcurand), 47 | curandStatus_t, 48 | (curandGenerator_t, Cuint), 49 | rng, num_dimensions) 50 | end 51 | 52 | 53 | """ 54 | Generate 32-bit random unsigned integers. 55 | """ 56 | function generate(rng::RNG, arr::CuArray, n::UInt) 57 | @check ccall((:curandGenerate, libcurand), 58 | curandStatus_t, 59 | (curandGenerator_t, CuPtr{UInt32}, Csize_t), 60 | rng, arr, n) 61 | return arr 62 | end 63 | 64 | 65 | """ 66 | Generate 64-bit quasirandom unsigned integers. 
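For example (a sketch; it assumes the generator was created with one of the 64-bit quasirandom types listed below):

    rng = create_generator(CURAND_RNG_QUASI_SOBOL64)
    arr = CuArray{Culonglong}(undef, 1024)
    generate_long_long(rng, arr)   # fill `arr` with 64-bit quasirandom values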
67 | 68 | Valid RNG types are: 69 | - CURAND_RNG_QUASI_SOBOL64 70 | - CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 71 | """ 72 | function generate_long_long(rng::RNG, arr::CuArray) 73 | @check ccall((:curandGenerateLongLong, libcurand), 74 | curandStatus_t, 75 | (curandGenerator_t, CuPtr{Culonglong}, Csize_t), 76 | rng, arr, length(arr)) 77 | return arr 78 | end 79 | 80 | # uniform 81 | function generate_uniform(rng::RNG, arr::CuArray) 82 | @check ccall((:curandGenerateUniform, libcurand), 83 | curandStatus_t, 84 | (curandGenerator_t, CuPtr{Float32}, Csize_t), 85 | rng, arr, length(arr)) 86 | return arr 87 | end 88 | 89 | function generate_uniform_double(rng::RNG, arr::CuArray) 90 | @check ccall((:curandGenerateUniformDouble, libcurand), 91 | curandStatus_t, 92 | (curandGenerator_t, CuPtr{Float64}, Csize_t), 93 | rng, arr, length(arr)) 94 | return arr 95 | end 96 | 97 | # normal 98 | function generate_normal(rng::RNG, arr::CuArray, mean, stddev) 99 | @check ccall((:curandGenerateNormal, libcurand), 100 | curandStatus_t, 101 | (curandGenerator_t, CuPtr{Cfloat}, Csize_t, Cfloat, Cfloat), 102 | rng, arr, length(arr), mean, stddev) 103 | return arr 104 | end 105 | 106 | function generate_normal_double(rng::RNG, arr::CuArray, mean, stddev) 107 | @check ccall((:curandGenerateNormalDouble, libcurand), 108 | curandStatus_t, 109 | (curandGenerator_t, CuPtr{Cdouble}, Csize_t, Cdouble, Cdouble), 110 | rng, arr, length(arr), mean, stddev) 111 | return arr 112 | end 113 | 114 | 115 | # lognormal 116 | function generate_log_normal(rng::RNG, arr::CuArray, mean, stddev) 117 | @check ccall((:curandGenerateLogNormal, libcurand), 118 | curandStatus_t, 119 | (curandGenerator_t, CuPtr{Cfloat}, Csize_t, Cfloat, Cfloat), 120 | rng, arr, length(arr), mean, stddev) 121 | return arr 122 | end 123 | 124 | function generate_log_normal_double(rng::RNG, arr::CuArray, mean, stddev) 125 | @check ccall((:curandGenerateLogNormalDouble, libcurand), 126 | curandStatus_t, 127 | (curandGenerator_t, CuPtr{Cdouble}, Csize_t, Cdouble, Cdouble), 128 | rng, arr, length(arr), mean, stddev) 129 | return arr 130 | end 131 | 132 | # Poisson 133 | """Construct the histogram array for a Poisson distribution.""" 134 | function create_poisson_distribtion(lambda) 135 | ptr = Ref{curandDiscreteDistribution_t}() 136 | @check ccall((:curandCreatePoissonDistribution, libcurand), 137 | curandStatus_t, 138 | (Cdouble, Ptr{curandDiscreteDistribution_t}), 139 | lambda, ptr) 140 | dist = DiscreteDistribution(ptr[]) 141 | finalizer(destroy_distribution, dist) 142 | return dist 143 | end 144 | 145 | """Destroy the histogram array for a discrete distribution (e.g. Poisson).""" 146 | function destroy_distribution(dist::DiscreteDistribution) 147 | @check ccall((:curandDestroyDistribution, libcurand), 148 | curandStatus_t, 149 | (curandDiscreteDistribution_t,), 150 | dist) 151 | end 152 | 153 | """Generate Poisson-distributed unsigned ints.""" 154 | function generate_poisson(rng::RNG, arr::CuArray, lambda) 155 | @check ccall((:curandGeneratePoisson, libcurand), 156 | curandStatus_t, 157 | (curandGenerator_t, CuPtr{Cuint}, Csize_t, Cdouble), 158 | rng, arr, length(arr), lambda) 159 | return arr 160 | end 161 | 162 | # seeds 163 | """Generate the starting state of the generator. 
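For instance, after seeding a pseudorandom generator its state can be initialized eagerly (a small sketch using only the wrappers defined above):

    rng = create_generator(CURAND_RNG_PSEUDO_DEFAULT)
    set_pseudo_random_generator_seed(rng, Int64(42))
    generate_seeds(rng)   # set up the starting state up front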
""" 164 | function generate_seeds(rng::RNG) 165 | @check ccall((:curandGenerateSeeds, libcurand), 166 | curandStatus_t, 167 | (curandGenerator_t,), rng) 168 | end 169 | 170 | # TODO: curandGetDirectionVectors32 171 | # TODO: curandGetScrambleConstants32 172 | # TODO: curandGetDirectionVectors64 173 | # TODO: curandGetScrambleConstants64 174 | 175 | function curandGetProperty(property::CUDAapi.libraryPropertyType) 176 | value_ref = Ref{Cint}() 177 | @check ccall((:curandGetProperty, libcurand), 178 | curandStatus_t, 179 | (Cint, Ptr{Cint}), 180 | property, value_ref) 181 | value_ref[] 182 | end 183 | -------------------------------------------------------------------------------- /test/base.jl: -------------------------------------------------------------------------------- 1 | using ForwardDiff: Dual 2 | using LinearAlgebra 3 | using Adapt: adapt 4 | 5 | import CUDAdrv 6 | import CUDAdrv: CuPtr, CU_NULL 7 | 8 | @testset "GPUArrays test suite" begin 9 | GPUArrays.test(CuArray) 10 | end 11 | 12 | @testset "Memory" begin 13 | CuArrays.alloc(0) 14 | 15 | @test (CuArrays.@allocated CuArray{Int32}(undef,1)) == 4 16 | 17 | ret, out = @grab_output CuArrays.@time CuArray{Int32}(undef, 1) 18 | @test isa(ret, CuArray{Int32}) 19 | @test occursin("1 GPU allocation: 4 bytes", out) 20 | 21 | ret, out = @grab_output CuArrays.@time Base.unsafe_wrap(CuArray, CuPtr{Int32}(12345678), (2, 3)) 22 | @test isa(ret, CuArray{Int32}) 23 | @test !occursin("GPU allocation", out) 24 | end 25 | 26 | @testset "Array" begin 27 | xs = CuArray{Int}(undef, 2, 3) 28 | @test collect(CuArray([1 2; 3 4])) == [1 2; 3 4] 29 | @test collect(cu[1, 2, 3]) == [1, 2, 3] 30 | @test collect(cu([1, 2, 3])) == [1, 2, 3] 31 | @test testf(vec, rand(5,3)) 32 | @test cu(1:3) === 1:3 33 | 34 | # Check that allowscalar works 35 | @test_throws ErrorException xs[1] 36 | @test_throws ErrorException xs[1] = 1 37 | 38 | # unsafe_wrap 39 | buf = CUDAdrv.Mem.Buffer(CU_NULL, 2, CUDAdrv.CuCurrentContext()) 40 | @test Base.unsafe_wrap(CuArray, CU_NULL, 1; own=false).own == false 41 | @test Base.unsafe_wrap(CuArray, CU_NULL, 1; ctx=CUDAdrv.CuCurrentContext()).buf.ctx == CUDAdrv.CuCurrentContext() 42 | @test Base.unsafe_wrap(CuArray, CU_NULL, 2) == CuArray{Nothing,1}(buf, (2,)) 43 | @test Base.unsafe_wrap(CuArray{Nothing}, CU_NULL, 2) == CuArray{Nothing,1}(buf, (2,)) 44 | @test Base.unsafe_wrap(CuArray{Nothing,1}, CU_NULL, 2) == CuArray{Nothing,1}(buf, (2,)) 45 | @test Base.unsafe_wrap(CuArray, CU_NULL, (1,2)) == CuArray{Nothing,2}(buf, (1,2)) 46 | @test Base.unsafe_wrap(CuArray{Nothing}, CU_NULL, (1,2)) == CuArray{Nothing,2}(buf, (1,2)) 47 | @test Base.unsafe_wrap(CuArray{Nothing,2}, CU_NULL, (1,2)) == CuArray{Nothing,2}(buf, (1,2)) 48 | 49 | @test collect(cuzeros(2, 2)) == zeros(Float32, 2, 2) 50 | @test collect(cuones(2, 2)) == ones(Float32, 2, 2) 51 | 52 | @test collect(cufill(0, 2, 2)) == zeros(Float32, 2, 2) 53 | @test collect(cufill(1, 2, 2)) == ones(Float32, 2, 2) 54 | end 55 | 56 | @testset "Adapt" begin 57 | A = rand(Float32, 3, 3) 58 | dA = CuArray(A) 59 | @test adapt(Array, dA) ≈ A 60 | @test adapt(CuArray, A) ≈ dA 61 | end 62 | 63 | @testset "Broadcast" begin 64 | @test testf((x) -> fill!(x, 1), rand(3,3)) 65 | @test testf((x, y) -> map(+, x, y), rand(2, 3), rand(2, 3)) 66 | @test testf((x) -> sin.(x), rand(2, 3)) 67 | @test testf((x) -> log.(x) .+ 1, rand(2, 3)) 68 | @test testf((x) -> 2x, rand(2, 3)) 69 | @test testf((x, y) -> x .+ y, rand(2, 3), rand(1, 3)) 70 | @test testf((z, x, y) -> z .= x .+ y, rand(2, 3), rand(2, 3), rand(2)) 71 | 
@test (CuArray{Ptr{Cvoid}}(undef, 1) .= C_NULL) == CuArray([C_NULL]) 72 | @test CuArray([1,2,3]) .+ CuArray([1.0,2.0,3.0]) == CuArray([2,4,6]) 73 | 74 | @eval struct Whatever{T} 75 | x::Int 76 | end 77 | @test Array(Whatever{Int}.(CuArray([1]))) == Whatever{Int}.([1]) 78 | end 79 | 80 | @testset "Cufunc" begin 81 | gelu(x) = oftype(x, 0.5) * x * (1 + tanh(oftype(x, √(2/π))*(x + oftype(x, 0.044715) * x^3))) 82 | sig(x) = one(x) / (one(x) + exp(-x)) 83 | f(x) = gelu(log(x)) * sig(x) * tanh(x) 84 | 85 | CuArrays.@cufunc gelu(x) = oftype(x, 0.5) * x * (1 + tanh(oftype(x, √(2/π))*(x + oftype(x, 0.044715) * x^3))) 86 | CuArrays.@cufunc sig(x) = one(x) / (one(x) + exp(-x)) 87 | CuArrays.@cufunc f(x) = gelu(log(x)) * sig(x) * tanh(x) 88 | 89 | @test :gelu ∈ CuArrays.cufuncs() 90 | @test :sig ∈ CuArrays.cufuncs() 91 | @test :f ∈ CuArrays.cufuncs() 92 | @test testf((x) -> gelu.(x), rand(3,3)) 93 | @test testf((x) -> sig.(x), rand(3,3)) 94 | @test testf((x) -> f.(x), rand(3,3)) 95 | end 96 | 97 | # https://github.com/JuliaGPU/CUDAnative.jl/issues/223 98 | @testset "Ref Broadcast" begin 99 | foobar(idx, A) = A[idx] 100 | @test CuArray([42]) == foobar.(CuArray([1]), Base.RefValue(CuArray([42]))) 101 | end 102 | 103 | @testset "Broadcast Fix" begin 104 | @test testf(x -> log.(x), rand(3,3)) 105 | @test testf((x,xs) -> log.(x.+xs), Ref(1), rand(3,3)) 106 | 107 | if isdefined(CuArrays, :CUDNN) 108 | using NNlib 109 | 110 | @test testf(x -> logσ.(x), rand(5)) 111 | 112 | f(x) = logσ.(x) 113 | ds = Dual.(rand(5),1) 114 | @test f(ds) ≈ collect(f(CuArray(ds))) 115 | end 116 | end 117 | 118 | @testset "Reduce" begin 119 | @test testf(x -> sum(x, dims=1), rand(2, 3)) 120 | @test testf(x -> sum(x, dims=2), rand(2, 3)) 121 | @test testf(x -> sum(x -> x^2, x, dims=1), rand(2, 3)) 122 | @test testf(x -> prod(x, dims=2), rand(2, 3)) 123 | 124 | @test testf(x -> sum(x), rand(2, 3)) 125 | @test testf(x -> prod(x), rand(2, 3)) 126 | end 127 | 128 | @testset "0D" begin 129 | x = CuArray{Float64}(undef) 130 | x .= 1 131 | @test collect(x)[] == 1 132 | x /= 2 133 | @test collect(x)[] == 0.5 134 | end 135 | 136 | @testset "Slices" begin 137 | @test testf(rand(5)) do x 138 | y = x[2:4] 139 | y .= 1 140 | x 141 | end 142 | @test testf(rand(5)) do x 143 | y = view(x, 2:4) 144 | y .= 1 145 | x 146 | end 147 | @test testf(x->view(x, :, 1:4, 3), rand(Float32, 5, 4, 3)) 148 | @allowscalar let x = cu(rand(Float32, 5, 4, 3)) 149 | @test_throws BoundsError view(x, :, :, 1:10) 150 | 151 | # Contiguous views should return new CuArray 152 | @test typeof(view(x, :, 1, 2)) == CuVector{Float32} 153 | @test typeof(view(x, 1:4, 1, 2)) == CuVector{Float32} 154 | @test typeof(view(x, :, 1:4, 3)) == CuMatrix{Float32} 155 | @test typeof(view(x, :, :, 1)) == CuMatrix{Float32} 156 | @test typeof(view(x, :, :, :)) == CuArray{Float32,3} 157 | @test typeof(view(x, :)) == CuVector{Float32} 158 | @test typeof(view(x, 1:3)) == CuVector{Float32} 159 | 160 | # Non-contiguous views should fall back to base's SubArray 161 | @test typeof(view(x, 1:3, 1:3, 3)) <: SubArray 162 | @test typeof(view(x, 1, :, 3)) <: SubArray 163 | @test typeof(view(x, 1, 1:4, 3)) <: SubArray 164 | @test typeof(view(x, :, 1, 1:3)) <: SubArray 165 | @test typeof(view(x, :, 1:2:4, 1)) <: SubArray 166 | @test typeof(view(x, 1:2:5, 1, 1)) <: SubArray 167 | end 168 | end 169 | 170 | @testset "Reshape" begin 171 | A = [1 2 3 4 172 | 5 6 7 8] 173 | gA = reshape(CuArray(A),1,8) 174 | _A = reshape(A,1,8) 175 | _gA = Array(gA) 176 | @test all(_A .== _gA) 177 | A = [1,2,3,4] 178 | gA = 
reshape(CuArray(A),4) 179 | end 180 | 181 | @testset "$f! with diagonal $d" for (f, f!) in ((triu, triu!), (tril, tril!)), 182 | d in -2:2 183 | A = randn(10, 10) 184 | @test f(A, d) == Array(f!(CuArray(A), d)) 185 | end 186 | 187 | @testset "Utilities" begin 188 | t = @elapsed ret = CuArrays.@sync begin 189 | # TODO: do something that takes a while on the GPU 190 | # (need to wrap clock64 in CUDAnative for that) 191 | 42 192 | end 193 | @test t >= 0 194 | @test ret == 42 195 | end 196 | -------------------------------------------------------------------------------- /test/fft.jl: -------------------------------------------------------------------------------- 1 | @testset "CUFFT" begin 2 | 3 | if !isdefined(CuArrays, :CUFFT) 4 | @warn "Not testing CUFFT" 5 | else 6 | using CuArrays.CUFFT 7 | @info "Testing CUFFT $(CUFFT.version())" 8 | 9 | # notes: 10 | # plan_bfft does not need separate testing since it is used by plan_ifft 11 | 12 | using FFTW 13 | 14 | N1 = 8 15 | N2 = 32 16 | N3 = 64 17 | N4 = 8 18 | 19 | MYRTOL = 1e-5 20 | MYATOL = 1e-8 21 | 22 | # out-of-place 23 | function dotest1(X::AbstractArray{T,N}) where {T <: Complex,N} 24 | fftw_X = fft(X) 25 | d_X = CuArray(X) 26 | p = plan_fft(d_X) 27 | d_Y = p * d_X 28 | Y = collect(d_Y) 29 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 30 | 31 | pinv = plan_ifft(d_Y) 32 | d_Z = pinv * d_Y 33 | Z = collect(d_Z) 34 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 35 | 36 | pinv2 = inv(p) 37 | d_Z = pinv2 * d_Y 38 | Z = collect(d_Z) 39 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 40 | end 41 | 42 | function dotest1(X::AbstractArray{T,N}) where {T <: Real,N} 43 | fftw_X = rfft(X) 44 | d_X = CuArray(X) 45 | p = plan_rfft(d_X) 46 | d_Y = p * d_X 47 | Y = collect(d_Y) 48 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 49 | 50 | pinv = plan_irfft(d_Y,size(X,1)) 51 | d_Z = pinv * d_Y 52 | Z = collect(d_Z) 53 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 54 | 55 | pinv2 = inv(p) 56 | d_Z = pinv2 * d_Y 57 | Z = collect(d_Z) 58 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 59 | 60 | pinv3 = inv(pinv) 61 | d_W = pinv3 * d_X 62 | W = collect(d_W) 63 | @test isapprox(W, Y, rtol = MYRTOL, atol = MYATOL) 64 | end 65 | 66 | # in-place 67 | function dotest2(X::AbstractArray{T,N}) where {T <: Complex,N} 68 | fftw_X = fft(X) 69 | d_X = CuArray(X) 70 | p = plan_fft!(d_X) 71 | p * d_X 72 | Y = collect(d_X) 73 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 74 | 75 | pinv = plan_ifft!(d_X) 76 | pinv * d_X 77 | Z = collect(d_X) 78 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 79 | end 80 | 81 | # no inplace rfft for now 82 | 83 | # batch transforms 84 | function dotest3(X::AbstractArray{T,N},region) where {T <: Complex,N} 85 | fftw_X = fft(X,region) 86 | d_X = CuArray(X) 87 | p = plan_fft(d_X,region) 88 | d_Y = p * d_X 89 | Y = collect(d_Y) 90 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 91 | 92 | pinv = plan_ifft(d_Y,region) 93 | d_Z = pinv * d_Y 94 | Z = collect(d_Z) 95 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 96 | end 97 | 98 | function dotest3(X::AbstractArray{T,N},region) where {T <: Real,N} 99 | fftw_X = rfft(X,region) 100 | d_X = CuArray(X) 101 | p = plan_rfft(d_X,region) 102 | d_Y = p * d_X 103 | Y = collect(d_Y) 104 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 105 | 106 | pinv = plan_irfft(d_Y,size(X,region[1]),region) 107 | d_Z = pinv * d_Y 108 | Z = collect(d_Z) 109 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 110 | end 111 | 112 | 113 | 
@testset "FFT" for (rtype,ctype) in [(Float32,ComplexF32), (Float64,ComplexF64)] 114 | 115 | @testset "1D FFT" begin 116 | dims = (N1,) 117 | X = rand(ctype, dims) 118 | dotest1(X) 119 | end 120 | @testset "1D inplace FFT" begin 121 | dims = (N1,) 122 | X = rand(ctype, dims) 123 | dotest2(X) 124 | end 125 | 126 | @testset "2D FFT" begin 127 | dims = (N1,N2) 128 | X = rand(ctype, dims) 129 | dotest1(X) 130 | end 131 | @testset "2D inplace FFT" begin 132 | dims = (N1,N2) 133 | X = rand(ctype, dims) 134 | dotest2(X) 135 | end 136 | 137 | @testset "Batch 1D FFT" begin 138 | dims = (N1,N2) 139 | X = rand(ctype, dims) 140 | dotest3(X,1) 141 | 142 | dims = (N1,N2) 143 | X = rand(ctype, dims) 144 | dotest3(X,2) 145 | 146 | dims = (N1,N2) 147 | X = rand(ctype, dims) 148 | dotest3(X,(1,2)) 149 | end 150 | 151 | @testset "3D FFT" begin 152 | dims = (N1,N2,N3) 153 | X = rand(ctype, dims) 154 | dotest1(X) 155 | end 156 | @testset "3D inplace FFT" begin 157 | dims = (N1,N2,N3) 158 | X = rand(ctype, dims) 159 | dotest2(X) 160 | end 161 | 162 | @testset "Batch 2D FFT (in 3D)" begin 163 | dims = (N1,N2,N3) 164 | for region in [(1,2),(2,3),(1,3)] 165 | X = rand(ctype, dims) 166 | dotest3(X,region) 167 | end 168 | 169 | X = rand(ctype, dims) 170 | @test_throws ArgumentError dotest3(X,(3,1)) 171 | end 172 | 173 | @testset "Batch 2D FFT (in 4D)" begin 174 | dims = (N1,N2,N3,N4) 175 | for region in [(1,2),(1,4),(3,4)] 176 | X = rand(ctype, dims) 177 | dotest3(X,region) 178 | end 179 | for region in [(1,3),(2,3),(2,4)] 180 | X = rand(ctype, dims) 181 | @test_throws ArgumentError dotest3(X,region) 182 | end 183 | 184 | end 185 | 186 | @testset "1D real FFT" begin 187 | X = rand(rtype, N1) 188 | dotest1(X) 189 | end 190 | 191 | @testset "Batch 1D real FFT" begin 192 | dims = (N1,N2) 193 | X = rand(rtype, dims) 194 | dotest3(X,1) 195 | 196 | dims = (N1,N2) 197 | X = rand(rtype, dims) 198 | dotest3(X,2) 199 | 200 | dims = (N1,N2) 201 | X = rand(rtype, dims) 202 | dotest3(X,(1,2)) 203 | end 204 | 205 | @testset "2D real FFT" begin 206 | X = rand(rtype, N1,N2) 207 | dotest1(X) 208 | end 209 | 210 | @testset "Batch 2D real FFT (in 3D)" begin 211 | dims = (N1,N2,N3) 212 | for region in [(1,2),(2,3),(1,3)] 213 | X = rand(rtype, dims) 214 | dotest3(X,region) 215 | end 216 | 217 | X = rand(rtype, dims) 218 | @test_throws ArgumentError dotest3(X,(3,1)) 219 | end 220 | 221 | @testset "Batch 2D real FFT (in 4D)" begin 222 | dims = (N1,N2,N3,N4) 223 | for region in [(1,2),(1,4),(3,4)] 224 | X = rand(rtype, dims) 225 | dotest3(X,region) 226 | end 227 | for region in [(1,3),(2,3),(2,4)] 228 | X = rand(rtype, dims) 229 | @test_throws ArgumentError dotest3(X,region) 230 | end 231 | end 232 | 233 | @testset "3D real FFT" begin 234 | X = rand(rtype, N1, N2, N3) 235 | dotest1(X) 236 | end 237 | 238 | end # testset FFT 239 | 240 | # integer array arguments 241 | function dotest5(X::AbstractArray{T,N}) where {T <: Complex,N} 242 | fftw_X = fft(X) 243 | d_X = CuArray(X) 244 | p = plan_fft(d_X) 245 | d_Y = p * d_X 246 | Y = collect(d_Y) 247 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 248 | d_Y = fft(d_X) 249 | Y = collect(d_Y) 250 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 251 | end 252 | 253 | function dotest5(X::AbstractArray{T,N}) where {T <: Real,N} 254 | fftw_X = rfft(X) 255 | d_X = CuArray(X) 256 | p = plan_rfft(d_X) 257 | d_Y = p * d_X 258 | Y = collect(d_Y) 259 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 260 | d_Y = rfft(d_X) 261 | Y = collect(d_Y) 262 | @test isapprox(Y, fftw_X, rtol = 
MYRTOL, atol = MYATOL) 263 | end 264 | 265 | @testset "Int FFT" for (rtype,ctype) in [(Int32,Complex{Int32}), (Int64,Complex{Int64})] 266 | 267 | @testset "1D FFT" begin 268 | dims = (N1,) 269 | X = rand(ctype, dims) 270 | dotest5(X) 271 | end 272 | 273 | @testset "1D real FFT" begin 274 | X = rand(rtype, N1) 275 | dotest5(X) 276 | end 277 | 278 | 279 | end # testset int FFT 280 | 281 | end 282 | 283 | end 284 | -------------------------------------------------------------------------------- /src/dnn/libcudnn_types.jl: -------------------------------------------------------------------------------- 1 | const CUDNN_DIM_MAX = 8 2 | const CUDNN_LRN_MIN_N = 1 3 | const CUDNN_LRN_MAX_N = 16 4 | const CUDNN_LRN_MIN_K = 1.0e-5 5 | const CUDNN_LRN_MIN_BETA = 0.01 6 | const CUDNN_BN_MIN_EPSILON = 1.0e-5 7 | 8 | mutable struct cudnnContext 9 | end 10 | 11 | const cudnnHandle_t = Ptr{cudnnContext} 12 | 13 | # begin enum cudnnStatus_t 14 | const cudnnStatus_t = UInt32 15 | const CUDNN_STATUS_SUCCESS = (UInt32)(0) 16 | const CUDNN_STATUS_NOT_INITIALIZED = (UInt32)(1) 17 | const CUDNN_STATUS_ALLOC_FAILED = (UInt32)(2) 18 | const CUDNN_STATUS_BAD_PARAM = (UInt32)(3) 19 | const CUDNN_STATUS_INTERNAL_ERROR = (UInt32)(4) 20 | const CUDNN_STATUS_INVALID_VALUE = (UInt32)(5) 21 | const CUDNN_STATUS_ARCH_MISMATCH = (UInt32)(6) 22 | const CUDNN_STATUS_MAPPING_ERROR = (UInt32)(7) 23 | const CUDNN_STATUS_EXECUTION_FAILED = (UInt32)(8) 24 | const CUDNN_STATUS_NOT_SUPPORTED = (UInt32)(9) 25 | const CUDNN_STATUS_LICENSE_ERROR = (UInt32)(10) 26 | # end enum cudnnStatus_t 27 | 28 | mutable struct cudnnTensorStruct 29 | end 30 | 31 | const cudnnTensorDescriptor_t = Ptr{cudnnTensorStruct} 32 | 33 | mutable struct cudnnConvolutionStruct 34 | end 35 | 36 | const cudnnConvolutionDescriptor_t = Ptr{cudnnConvolutionStruct} 37 | 38 | mutable struct cudnnPoolingStruct 39 | end 40 | 41 | const cudnnPoolingDescriptor_t = Ptr{cudnnPoolingStruct} 42 | 43 | mutable struct cudnnFilterStruct 44 | end 45 | 46 | const cudnnFilterDescriptor_t = Ptr{cudnnFilterStruct} 47 | 48 | mutable struct cudnnLRNStruct 49 | end 50 | 51 | const cudnnLRNDescriptor_t = Ptr{cudnnLRNStruct} 52 | 53 | mutable struct cudnnActivationStruct 54 | end 55 | 56 | const cudnnActivationDescriptor_t = Ptr{cudnnActivationStruct} 57 | 58 | # begin enum cudnnDataType_t 59 | const cudnnDataType_t = UInt32 60 | const CUDNN_DATA_FLOAT = (UInt32)(0) 61 | const CUDNN_DATA_DOUBLE = (UInt32)(1) 62 | const CUDNN_DATA_HALF = (UInt32)(2) 63 | # end enum cudnnDataType_t 64 | 65 | # begin enum cudnnNanPropagation_t 66 | const cudnnNanPropagation_t = UInt32 67 | const CUDNN_NOT_PROPAGATE_NAN = (UInt32)(0) 68 | const CUDNN_PROPAGATE_NAN = (UInt32)(1) 69 | # end enum cudnnNanPropagation_t 70 | 71 | # begin enum cudnnTensorFormat_t 72 | const cudnnTensorFormat_t = UInt32 73 | const CUDNN_TENSOR_NCHW = (UInt32)(0) 74 | const CUDNN_TENSOR_NHWC = (UInt32)(1) 75 | # end enum cudnnTensorFormat_t 76 | 77 | # begin enum cudnnAddMode_t 78 | const cudnnAddMode_t = UInt32 79 | const CUDNN_ADD_IMAGE = (UInt32)(0) 80 | const CUDNN_ADD_SAME_HW = (UInt32)(0) 81 | const CUDNN_ADD_FEATURE_MAP = (UInt32)(1) 82 | const CUDNN_ADD_SAME_CHW = (UInt32)(1) 83 | const CUDNN_ADD_SAME_C = (UInt32)(2) 84 | const CUDNN_ADD_FULL_TENSOR = (UInt32)(3) 85 | # end enum cudnnAddMode_t 86 | 87 | # begin enum cudnnConvolutionMode_t 88 | const cudnnConvolutionMode_t = UInt32 89 | const CUDNN_CONVOLUTION = (UInt32)(0) 90 | const CUDNN_CROSS_CORRELATION = (UInt32)(1) 91 | # end enum cudnnConvolutionMode_t 92 | 93 | # begin 
enum cudnnConvolutionFwdPreference_t 94 | const cudnnConvolutionFwdPreference_t = UInt32 95 | const CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = (UInt32)(0) 96 | const CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = (UInt32)(1) 97 | const CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = (UInt32)(2) 98 | # end enum cudnnConvolutionFwdPreference_t 99 | 100 | # begin enum cudnnConvolutionFwdAlgo_t 101 | const cudnnConvolutionFwdAlgo_t = UInt32 102 | const CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = (UInt32)(0) 103 | const CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = (UInt32)(1) 104 | const CUDNN_CONVOLUTION_FWD_ALGO_GEMM = (UInt32)(2) 105 | const CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = (UInt32)(3) 106 | const CUDNN_CONVOLUTION_FWD_ALGO_FFT = (UInt32)(4) 107 | const CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = (UInt32)(5) 108 | # end enum cudnnConvolutionFwdAlgo_t 109 | 110 | mutable struct cudnnConvolutionFwdAlgoPerf_t 111 | algo::cudnnConvolutionFwdAlgo_t 112 | status::cudnnStatus_t 113 | time::Cfloat 114 | memory::Cint 115 | end 116 | 117 | # begin enum cudnnConvolutionBwdFilterPreference_t 118 | const cudnnConvolutionBwdFilterPreference_t = UInt32 119 | const CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = (UInt32)(0) 120 | const CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = (UInt32)(1) 121 | const CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = (UInt32)(2) 122 | # end enum cudnnConvolutionBwdFilterPreference_t 123 | 124 | # begin enum cudnnConvolutionBwdFilterAlgo_t 125 | const cudnnConvolutionBwdFilterAlgo_t = UInt32 126 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = (UInt32)(0) 127 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = (UInt32)(1) 128 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = (UInt32)(2) 129 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = (UInt32)(3) 130 | # end enum cudnnConvolutionBwdFilterAlgo_t 131 | 132 | mutable struct cudnnConvolutionBwdFilterAlgoPerf_t 133 | algo::cudnnConvolutionBwdFilterAlgo_t 134 | status::cudnnStatus_t 135 | time::Cfloat 136 | memory::Cint 137 | end 138 | 139 | # begin enum cudnnConvolutionBwdDataPreference_t 140 | const cudnnConvolutionBwdDataPreference_t = UInt32 141 | const CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = (UInt32)(0) 142 | const CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = (UInt32)(1) 143 | const CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = (UInt32)(2) 144 | # end enum cudnnConvolutionBwdDataPreference_t 145 | 146 | # begin enum cudnnConvolutionBwdDataAlgo_t 147 | const cudnnConvolutionBwdDataAlgo_t = UInt32 148 | const CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = (UInt32)(0) 149 | const CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = (UInt32)(1) 150 | const CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = (UInt32)(2) 151 | const CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = (UInt32)(3) 152 | # end enum cudnnConvolutionBwdDataAlgo_t 153 | 154 | mutable struct cudnnConvolutionBwdDataAlgoPerf_t 155 | algo::cudnnConvolutionBwdDataAlgo_t 156 | status::cudnnStatus_t 157 | time::Cfloat 158 | memory::Cint 159 | end 160 | 161 | # begin enum cudnnSoftmaxAlgorithm_t 162 | const cudnnSoftmaxAlgorithm_t = UInt32 163 | const CUDNN_SOFTMAX_FAST = (UInt32)(0) 164 | const CUDNN_SOFTMAX_ACCURATE = (UInt32)(1) 165 | const CUDNN_SOFTMAX_LOG = (UInt32)(2) 166 | # end enum cudnnSoftmaxAlgorithm_t 167 | 168 | # begin enum cudnnSoftmaxMode_t 169 | const cudnnSoftmaxMode_t = UInt32 170 | const CUDNN_SOFTMAX_MODE_INSTANCE = (UInt32)(0) 171 | const CUDNN_SOFTMAX_MODE_CHANNEL = (UInt32)(1) 172 | # end enum cudnnSoftmaxMode_t 173 | 174 | # begin enum cudnnPoolingMode_t 175 | const cudnnPoolingMode_t = 
UInt32 176 | const CUDNN_POOLING_MAX = (UInt32)(0) 177 | const CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = (UInt32)(1) 178 | const CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = (UInt32)(2) 179 | # end enum cudnnPoolingMode_t 180 | 181 | # begin enum cudnnActivationMode_t 182 | const cudnnActivationMode_t = UInt32 183 | const CUDNN_ACTIVATION_SIGMOID = (UInt32)(0) 184 | const CUDNN_ACTIVATION_RELU = (UInt32)(1) 185 | const CUDNN_ACTIVATION_TANH = (UInt32)(2) 186 | const CUDNN_ACTIVATION_CLIPPED_RELU = (UInt32)(3) 187 | const CUDNN_ACTIVATION_ELU = (UInt32)(4) 188 | const CUDNN_ACTIVATION_IDENTITY = (UInt32)(5) 189 | # end enum cudnnActivationMode_t 190 | 191 | # begin enum cudnnLRNMode_t 192 | const cudnnLRNMode_t = UInt32 193 | const CUDNN_LRN_CROSS_CHANNEL_DIM1 = (UInt32)(0) 194 | # end enum cudnnLRNMode_t 195 | 196 | # begin enum cudnnDivNormMode_t 197 | const cudnnDivNormMode_t = UInt32 198 | const CUDNN_DIVNORM_PRECOMPUTED_MEANS = (UInt32)(0) 199 | # end enum cudnnDivNormMode_t 200 | 201 | # begin enum cudnnBatchNormMode_t 202 | const cudnnBatchNormMode_t = UInt32 203 | const CUDNN_BATCHNORM_PER_ACTIVATION = (UInt32)(0) 204 | const CUDNN_BATCHNORM_SPATIAL = (UInt32)(1) 205 | # end enum cudnnBatchNormMode_t 206 | 207 | # begin enum cudnnMathType_t 208 | const cudnnMathType_t = UInt32 209 | const CUDNN_DEFAULT_MATH = 0 210 | const CUDNN_TENSOR_OP_MATH = 1 211 | const CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2 212 | # end enum cudnnMathType_t 213 | -------------------------------------------------------------------------------- /src/fft/wrappers.jl: -------------------------------------------------------------------------------- 1 | # wrappers of the low-level CUFFT functionality 2 | 3 | # Note: we don't implement padded storage dimensions 4 | function _mkplan(xtype, xdims, region) 5 | nrank = length(region) 6 | sz = [xdims[i] for i in region] 7 | csz = copy(sz) 8 | csz[1] = div(sz[1],2) + 1 9 | batch = prod(xdims) ÷ prod(sz) 10 | 11 | pp = Ref{cufftHandle_t}() 12 | if (nrank == 1) && (batch == 1) 13 | cufftPlan1d(pp, sz[1], xtype, 1) 14 | elseif (nrank == 2) && (batch == 1) 15 | cufftPlan2d(pp, sz[2], sz[1], xtype) 16 | elseif (nrank == 3) && (batch == 1) 17 | cufftPlan3d(pp, sz[3], sz[2], sz[1], xtype) 18 | else 19 | rsz = (length(sz) > 1) ? reverse(sz) : sz 20 | if ((region...,) == ((1:nrank)...,)) 21 | # handle simple case ... simply! (for robustness) 22 | cufftPlanMany(pp, nrank, Cint[rsz...], C_NULL, 1, 1, C_NULL, 1, 1, 23 | xtype, batch) 24 | else 25 | if nrank==1 || all(diff(collect(region)) .== 1) 26 | # _stride: successive elements in innermost dimension 27 | # _dist: distance between first elements of batches 28 | if region[1] == 1 29 | istride = 1 30 | idist = prod(sz) 31 | cdist = prod(csz) 32 | else 33 | if region[end] != length(xdims) 34 | throw(ArgumentError("batching dims must be sequential")) 35 | end 36 | istride = prod(xdims[1:region[1]-1]) 37 | idist = 1 38 | cdist = 1 39 | end 40 | inembed = Cint[rsz...] 41 | cnembed = (length(csz) > 1) ? Cint[reverse(csz)...] 
: Cint[csz[1]] 42 | ostride = istride 43 | if xtype == CUFFT_R2C || xtype == CUFFT_D2Z 44 | odist = cdist 45 | onembed = cnembed 46 | else 47 | odist = idist 48 | onembed = inembed 49 | end 50 | if xtype == CUFFT_C2R || xtype == CUFFT_Z2D 51 | idist = cdist 52 | inembed = cnembed 53 | end 54 | else 55 | if any(diff(collect(region)) .< 1) 56 | throw(ArgumentError("region must be an increasing sequence")) 57 | end 58 | cdims = collect(xdims) 59 | cdims[region[1]] = div(cdims[region[1]],2)+1 60 | 61 | if region[1] == 1 62 | istride = 1 63 | ii=1 64 | while (ii < nrank) && (region[ii] == region[ii+1]-1) 65 | ii += 1 66 | end 67 | idist = prod(xdims[1:ii]) 68 | cdist = prod(cdims[1:ii]) 69 | ngaps = 0 70 | else 71 | istride = prod(xdims[1:region[1]-1]) 72 | idist = 1 73 | cdist = 1 74 | ngaps = 1 75 | end 76 | nem = ones(Int,nrank) 77 | cem = ones(Int,nrank) 78 | id = 1 79 | for ii=1:nrank-1 80 | if region[ii+1] > region[ii]+1 81 | ngaps += 1 82 | end 83 | while id < region[ii+1] 84 | nem[ii] *= xdims[id] 85 | cem[ii] *= cdims[id] 86 | id += 1 87 | end 88 | @assert nem[ii] >= sz[ii] 89 | end 90 | if region[end] < length(xdims) 91 | ngaps += 1 92 | end 93 | # CUFFT represents batches by a single stride (_dist) 94 | # so we must verify that region is consistent with this: 95 | if ngaps > 1 96 | throw(ArgumentError("batch regions must be sequential")) 97 | end 98 | 99 | inembed = Cint[reverse(nem)...] 100 | cnembed = Cint[reverse(cem)...] 101 | ostride = istride 102 | if xtype == CUFFT_R2C || xtype == CUFFT_D2Z 103 | odist = cdist 104 | onembed = cnembed 105 | else 106 | odist = idist 107 | onembed = inembed 108 | end 109 | if xtype == CUFFT_C2R || xtype == CUFFT_Z2D 110 | idist = cdist 111 | inembed = cnembed 112 | end 113 | end 114 | cufftPlanMany(pp, nrank, Cint[rsz...], 115 | inembed, istride, idist, onembed, ostride, odist, 116 | xtype, batch) 117 | end 118 | end 119 | pp[] 120 | end 121 | 122 | # this is used implicitly in the unsafe_execute methods below: 123 | unsafe_convert(::Type{cufftHandle_t}, p::CuFFTPlan) = p.plan 124 | 125 | convert(::Type{cufftHandle_t}, p::CuFFTPlan) = p.plan 126 | 127 | destroy_plan(plan::CuFFTPlan) = cufftDestroy(plan) 128 | 129 | function assert_applicable(p::CuFFTPlan{T,K}, X::CuArray{T}) where {T,K} 130 | (size(X) == p.sz) || 131 | throw(ArgumentError("CuFFT plan applied to wrong-size input")) 132 | end 133 | 134 | function assert_applicable(p::CuFFTPlan{T,K}, X::CuArray{T}, Y::CuArray{Ty}) where {T,K,Ty} 135 | assert_applicable(p, X) 136 | (size(Y) == p.osz) || 137 | throw(ArgumentError("CuFFT plan applied to wrong-size output")) 138 | # type errors should be impossible by dispatch, but just in case: 139 | if p.xtype ∈ [CUFFT_C2R, CUFFT_Z2D] 140 | (Ty == real(T)) || 141 | throw(ArgumentError("Type mismatch for argument Y")) 142 | elseif p.xtype ∈ [CUFFT_R2C, CUFFT_D2Z] 143 | (Ty == complex(T)) || 144 | throw(ArgumentError("Type mismatch for argument Y")) 145 | else 146 | (Ty == T) || 147 | throw(ArgumentError("Type mismatch for argument Y")) 148 | end 149 | end 150 | 151 | function unsafe_execute!(plan::cCuFFTPlan{cufftComplex,K,true,N}, 152 | x::CuArray{cufftComplex,N}) where {K,N} 153 | @assert plan.xtype == CUFFT_C2C 154 | cufftExecC2C(plan, x, x, K) 155 | end 156 | function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,true,N}, 157 | x::CuArray{cufftComplex,N}) where {K,N} 158 | @assert plan.xtype == CUFFT_C2R 159 | cufftExecC2R(plan, x, x) 160 | end 161 | 162 | function unsafe_execute!(plan::cCuFFTPlan{cufftComplex,K,false,N}, 163 | 
x::CuArray{cufftComplex,N}, y::CuArray{cufftComplex} 164 | ) where {K,N} 165 | @assert plan.xtype == CUFFT_C2C 166 | cufftExecC2C(plan, x, y, K) 167 | end 168 | function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N}, 169 | x::CuArray{cufftComplex,N}, y::CuArray{cufftReal} 170 | ) where {K,N} 171 | @assert plan.xtype == CUFFT_C2R 172 | cufftExecC2R(plan, x, y) 173 | end 174 | 175 | function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,false,N}, 176 | x::CuArray{cufftReal,N}, y::CuArray{cufftComplex,N} 177 | ) where {K,N} 178 | @assert plan.xtype == CUFFT_R2C 179 | cufftExecR2C(plan, x, y) 180 | end 181 | 182 | # double prec. 183 | function unsafe_execute!(plan::cCuFFTPlan{cufftDoubleComplex,K,true,N}, 184 | x::CuArray{cufftDoubleComplex,N}) where {K,N} 185 | @assert plan.xtype == CUFFT_Z2Z 186 | cufftExecZ2Z(plan, x, x, K) 187 | end 188 | function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleComplex,K,true,N}, 189 | x::CuArray{cufftDoubleComplex,N}) where {K,N} 190 | @assert plan.xtype == CUFFT_Z2D 191 | cufftExecZ2D(plan, x, x) 192 | end 193 | 194 | function unsafe_execute!(plan::cCuFFTPlan{cufftDoubleComplex,K,false,N}, 195 | x::CuArray{cufftDoubleComplex,N}, y::CuArray{cufftDoubleComplex} 196 | ) where {K,N} 197 | @assert plan.xtype == CUFFT_Z2Z 198 | cufftExecZ2Z(plan, x, y, K) 199 | end 200 | function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleComplex,K,false,N}, 201 | x::CuArray{cufftDoubleComplex,N}, y::CuArray{cufftDoubleReal} 202 | ) where {K,N} 203 | @assert plan.xtype == CUFFT_Z2D 204 | cufftExecZ2D(plan, x, y) 205 | end 206 | 207 | function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleReal,K,false,N}, 208 | x::CuArray{cufftDoubleReal,N}, y::CuArray{cufftDoubleComplex,N} 209 | ) where {K,N} 210 | @assert plan.xtype == CUFFT_D2Z 211 | cufftExecD2Z(plan, x, y) 212 | end 213 | -------------------------------------------------------------------------------- /src/array.jl: -------------------------------------------------------------------------------- 1 | import CUDAnative: DevicePtr 2 | 3 | mutable struct CuArray{T,N} <: GPUArray{T,N} 4 | buf::Mem.Buffer 5 | own::Bool 6 | 7 | dims::Dims{N} 8 | offset::Int 9 | 10 | function CuArray{T,N}(buf::Mem.Buffer, dims::Dims{N}; offset::Integer=0, own::Bool=true) where {T,N} 11 | xs = new{T,N}(buf, own, dims, offset) 12 | if own 13 | Mem.retain(buf) 14 | finalizer(unsafe_free!, xs) 15 | end 16 | return xs 17 | end 18 | end 19 | 20 | CuVector{T} = CuArray{T,1} 21 | CuMatrix{T} = CuArray{T,2} 22 | CuVecOrMat{T} = Union{CuVector{T},CuMatrix{T}} 23 | 24 | const INVALID = Mem.alloc(0) 25 | 26 | function unsafe_free!(xs::CuArray{<:Any,N}) where {N} 27 | xs.buf === INVALID && return 28 | Mem.release(xs.buf) && dealloc(xs.buf, prod(xs.dims)*sizeof(eltype(xs))) 29 | xs.dims = Tuple(0 for _ in 1:N) 30 | xs.buf = INVALID 31 | return 32 | end 33 | 34 | 35 | ## construction 36 | 37 | # type and dimensionality specified, accepting dims as tuples of Ints 38 | CuArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N} = 39 | CuArray{T,N}(alloc(prod(dims)*sizeof(T)), dims) 40 | 41 | # type and dimensionality specified, accepting dims as series of Ints 42 | CuArray{T,N}(::UndefInitializer, dims::Integer...) where {T,N} = CuArray{T,N}(undef, dims) 43 | 44 | # type but not dimensionality specified 45 | CuArray{T}(::UndefInitializer, dims::Dims{N}) where {T,N} = CuArray{T,N}(undef, dims) 46 | CuArray{T}(::UndefInitializer, dims::Integer...) 
where {T} = 47 | CuArray{T}(undef, convert(Tuple{Vararg{Int}}, dims)) 48 | 49 | # empty vector constructor 50 | CuArray{T,1}() where {T} = CuArray{T,1}(undef, 0) 51 | 52 | # do-block constructors 53 | for (ctor, tvars) in (:CuArray => (), :(CuArray{T}) => (:T,), :(CuArray{T,N}) => (:T, :N)) 54 | @eval begin 55 | function $ctor(f::Function, args...) where {$(tvars...)} 56 | xs = $ctor(args...) 57 | try 58 | f(xs) 59 | finally 60 | unsafe_free!(xs) 61 | end 62 | end 63 | end 64 | end 65 | 66 | 67 | Base.similar(a::CuArray{T,N}) where {T,N} = CuArray{T,N}(undef, size(a)) 68 | Base.similar(a::CuArray{T}, dims::Base.Dims{N}) where {T,N} = CuArray{T,N}(undef, dims) 69 | Base.similar(a::CuArray, ::Type{T}, dims::Base.Dims{N}) where {T,N} = CuArray{T,N}(undef, dims) 70 | 71 | 72 | """ 73 | unsafe_wrap(::CuArray, ptr::CuPtr{T}, dims; own=false, ctx=CuCurrentContext()) 74 | 75 | Wrap a `CuArray` object around the data at the address given by `ptr`. The pointer 76 | element type `T` determines the array element type. `dims` is either an integer (for a 1d 77 | array) or a tuple of the array dimensions. `own` optionally specifies whether Julia should 78 | take ownership of the memory, calling `free` when the array is no longer referenced. The 79 | `ctx` argument determines the CUDA context in which the data is allocated. 80 | """ 81 | function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,N}}}, 82 | p::CuPtr{T}, dims::NTuple{N,Int}; 83 | own::Bool=false, ctx::CuContext=CuCurrentContext()) where {T,N} 84 | buf = Mem.Buffer(convert(CuPtr{Cvoid}, p), prod(dims) * sizeof(T), ctx) 85 | return CuArray{T, length(dims)}(buf, dims; own=own) 86 | end 87 | function Base.unsafe_wrap(Atype::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,1}}}, 88 | p::CuPtr{T}, dim::Integer; 89 | own::Bool=false, ctx::CuContext=CuCurrentContext()) where {T} 90 | unsafe_wrap(Atype, p, (dim,); own=own, ctx=ctx) 91 | end 92 | Base.unsafe_wrap(T::Type{<:CuArray}, ::Ptr, dims::NTuple{N,Int}; kwargs...) where {N} = 93 | throw(ArgumentError("cannot wrap a CPU pointer with a $T")) 94 | 95 | 96 | ## array interface 97 | 98 | Base.elsize(::Type{<:CuArray{T}}) where {T} = sizeof(T) 99 | 100 | Base.size(x::CuArray) = x.dims 101 | Base.sizeof(x::CuArray) = Base.elsize(x) * length(x) 102 | 103 | 104 | ## interop with other arrays 105 | 106 | CuArray{T,N}(xs::AbstractArray{T,N}) where {T,N} = 107 | isbits(xs) ?
108 | (CuArray{T,N}(undef, size(xs)) .= xs) : 109 | copyto!(CuArray{T,N}(undef, size(xs)), collect(xs)) 110 | 111 | CuArray{T,N}(xs::AbstractArray{S,N}) where {T,N,S} = CuArray{T,N}((x -> T(x)).(xs)) 112 | 113 | # underspecified constructors 114 | CuArray{T}(xs::AbstractArray{S,N}) where {T,N,S} = CuArray{T,N}(xs) 115 | (::Type{CuArray{T,N} where T})(x::AbstractArray{S,N}) where {S,N} = CuArray{S,N}(x) 116 | CuArray(A::AbstractArray{T,N}) where {T,N} = CuArray{T,N}(A) 117 | 118 | # idempotency 119 | CuArray{T,N}(xs::CuArray{T,N}) where {T,N} = xs 120 | 121 | 122 | ## conversions 123 | 124 | Base.convert(::Type{T}, x::T) where T <: CuArray = x 125 | 126 | function Base._reshape(parent::CuArray, dims::Dims) 127 | n = length(parent) 128 | prod(dims) == n || throw(DimensionMismatch("parent has $n elements, which is incompatible with size $dims")) 129 | return CuArray{eltype(parent),length(dims)}(parent.buf, dims; 130 | offset=parent.offset, own=parent.own) 131 | end 132 | function Base._reshape(parent::CuArray{T,1}, dims::Tuple{Int}) where T 133 | n = length(parent) 134 | prod(dims) == n || throw(DimensionMismatch("parent has $n elements, which is incompatible with size $dims")) 135 | return parent 136 | end 137 | 138 | 139 | ## interop with C libraries 140 | 141 | """ 142 | buffer(array::CuArray [, index]) 143 | 144 | Get the native address of a CuArray, optionally at a given location `index`. 145 | Equivalent of `Base.pointer` on `Array`s. 146 | """ 147 | function buffer(xs::CuArray, index=1) 148 | extra_offset = (index-1) * Base.elsize(xs) 149 | Mem.Buffer(xs.buf.ptr + xs.offset + extra_offset, 150 | sizeof(xs) - extra_offset, 151 | xs.buf.ctx) 152 | end 153 | 154 | Base.cconvert(::Type{<:Ptr}, x::CuArray) = throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) 155 | Base.cconvert(::Type{<:CuPtr}, x::CuArray) = buffer(x) 156 | 157 | 158 | ## interop with CUDAnative 159 | 160 | function Base.convert(::Type{CuDeviceArray{T,N,AS.Global}}, a::CuArray{T,N}) where {T,N} 161 | ptr = Base.unsafe_convert(CuPtr{T}, buffer(a)) 162 | CuDeviceArray{T,N,AS.Global}(a.dims, DevicePtr{T,AS.Global}(ptr)) 163 | end 164 | 165 | Adapt.adapt_storage(::CUDAnative.Adaptor, xs::CuArray{T,N}) where {T,N} = 166 | convert(CuDeviceArray{T,N,AS.Global}, xs) 167 | 168 | 169 | 170 | ## interop with CPU array 171 | 172 | # We don't convert isbits types in `adapt`, since they are already 173 | # considered GPU-compatible. 174 | 175 | Adapt.adapt_storage(::Type{<:CuArray}, xs::AbstractArray) = 176 | isbits(xs) ? xs : convert(CuArray, xs) 177 | 178 | Adapt.adapt_storage(::Type{<:CuArray{T}}, xs::AbstractArray{<:Real}) where T <: AbstractFloat = 179 | isbits(xs) ? 
xs : convert(CuArray{T}, xs) 180 | 181 | Adapt.adapt_storage(::Type{<:Array}, xs::CuArray) = convert(Array, xs) 182 | 183 | Base.collect(x::CuArray{T,N}) where {T,N} = copyto!(Array{T,N}(undef, size(x)), x) 184 | 185 | function Base.unsafe_copyto!(dest::CuArray{T}, doffs, src::Array{T}, soffs, n) where T 186 | Mem.upload!(buffer(dest, doffs), pointer(src, soffs), n*sizeof(T)) 187 | return dest 188 | end 189 | 190 | function Base.unsafe_copyto!(dest::Array{T}, doffs, src::CuArray{T}, soffs, n) where T 191 | Mem.download!(pointer(dest, doffs), buffer(src, soffs), n*sizeof(T)) 192 | return dest 193 | end 194 | 195 | function Base.unsafe_copyto!(dest::CuArray{T}, doffs, src::CuArray{T}, soffs, n) where T 196 | Mem.transfer!(buffer(dest, doffs), buffer(src, soffs), n*sizeof(T)) 197 | return dest 198 | end 199 | 200 | function Base.deepcopy_internal(x::CuArray, dict::IdDict) 201 | haskey(dict, x) && return dict[x]::typeof(x) 202 | return dict[x] = copy(x) 203 | end 204 | 205 | 206 | ## utilities 207 | 208 | cu(xs) = adapt(CuArray{Float32}, xs) 209 | Base.getindex(::typeof(cu), xs...) = CuArray([xs...]) 210 | 211 | cuzeros(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 0) 212 | cuones(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 1) 213 | cuzeros(dims...) = cuzeros(Float32, dims...) 214 | cuones(dims...) = cuones(Float32, dims...) 215 | cufill(v, dims...) = fill!(CuArray{typeof(v)}(undef, dims...), v) 216 | cufill(v, dims::Dims) = fill!(CuArray{typeof(v)}(undef, dims...), v) 217 | 218 | # optimized implementation of `fill!` for types that are directly supported by memset 219 | const MemsetTypes = Dict(1=>UInt8, 2=>UInt16, 4=>UInt32) 220 | const MemsetCompatTypes = Union{UInt8, Int8, 221 | UInt16, Int16, Float16, 222 | UInt32, Int32, Float32} 223 | function Base.fill!(A::CuArray{T}, x) where T <: MemsetCompatTypes 224 | y = reinterpret(MemsetTypes[sizeof(T)], convert(T, x)) 225 | Mem.set!(buffer(A), y, length(A)) 226 | A 227 | end 228 | 229 | 230 | ## generic linear algebra routines 231 | 232 | function LinearAlgebra.tril!(A::CuMatrix{T}, d::Integer = 0) where T 233 | function kernel!(_A, _d) 234 | li = (blockIdx().x - 1) * blockDim().x + threadIdx().x 235 | m, n = size(_A) 236 | if 0 < li <= m*n 237 | i, j = Tuple(CartesianIndices(_A)[li]) 238 | if i < j - _d 239 | _A[i, j] = 0 240 | end 241 | end 242 | return nothing 243 | end 244 | 245 | blk, thr = cudims(A) 246 | @cuda blocks=blk threads=thr kernel!(A, d) 247 | return A 248 | end 249 | 250 | function LinearAlgebra.triu!(A::CuMatrix{T}, d::Integer = 0) where T 251 | function kernel!(_A, _d) 252 | li = (blockIdx().x - 1) * blockDim().x + threadIdx().x 253 | m, n = size(_A) 254 | if 0 < li <= m*n 255 | i, j = Tuple(CartesianIndices(_A)[li]) 256 | if j < i + _d 257 | _A[i, j] = 0 258 | end 259 | end 260 | return nothing 261 | end 262 | 263 | blk, thr = cudims(A) 264 | @cuda blocks=blk threads=thr kernel!(A, d) 265 | return A 266 | end 267 | -------------------------------------------------------------------------------- /src/blas/highlevel.jl: -------------------------------------------------------------------------------- 1 | # LinearAlgebra-style wrappers of the CUBLAS functionality 2 | 3 | 4 | cublas_size(t::Char, M::CuVecOrMat) = (size(M, t=='N' ? 1 : 2), size(M, t=='N' ? 
2 : 1)) 5 | 6 | CublasArray{T<:CublasFloat} = CuArray{T} 7 | 8 | 9 | # 10 | # BLAS 1 11 | # 12 | 13 | LinearAlgebra.rmul!(x::CuArray{<:CublasFloat}, k::Number) = 14 | scal!(length(x), convert(eltype(x), k), x, 1) 15 | 16 | # Work around ambiguity with GPUArrays wrapper 17 | LinearAlgebra.rmul!(x::CuArray{<:CublasFloat}, k::Real) = 18 | invoke(rmul!, Tuple{typeof(x), Number}, x, k) 19 | 20 | function LinearAlgebra.BLAS.dot(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{Float32,Float64} 21 | n = length(DX) 22 | n==length(DY) || throw(DimensionMismatch("dot product arguments have lengths $(length(DX)) and $(length(DY))")) 23 | dot(n, DX, 1, DY, 1) 24 | end 25 | 26 | function LinearAlgebra.BLAS.dotc(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64} 27 | n = length(DX) 28 | n==length(DY) || throw(DimensionMismatch("dot product arguments have lengths $(length(DX)) and $(length(DY))")) 29 | dotc(n, DX, 1, DY, 1) 30 | end 31 | 32 | function LinearAlgebra.BLAS.dot(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64} 33 | dotc(DX, DY) 34 | end 35 | 36 | function LinearAlgebra.BLAS.dotu(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64} 37 | n = length(DX) 38 | n==length(DY) || throw(DimensionMismatch("dot product arguments have lengths $(length(DX)) and $(length(DY))")) 39 | dotu(n, DX, 1, DY, 1) 40 | end 41 | 42 | LinearAlgebra.norm(x::CublasArray) = nrm2(x) 43 | LinearAlgebra.BLAS.asum(x::CublasArray) = asum(length(x), x, 1) 44 | 45 | function LinearAlgebra.axpy!(alpha::Number, x::CuArray{T}, y::CuArray{T}) where T<:CublasFloat 46 | length(x)==length(y) || throw(DimensionMismatch("axpy arguments have lengths $(length(x)) and $(length(y))")) 47 | axpy!(length(x), convert(T,alpha), x, 1, y, 1) 48 | end 49 | 50 | Base.argmin(xs::CublasArray{<:CublasReal}) = iamin(xs) 51 | Base.argmax(xs::CublasArray{<:CublasReal}) = iamax(xs) 52 | 53 | 54 | 55 | # 56 | # BLAS 2 57 | # 58 | 59 | # GEMV 60 | 61 | function gemv_wrapper!(y::CuVector{T}, tA::Char, A::CuMatrix{T}, x::CuVector{T}, 62 | alpha = one(T), beta = zero(T)) where T<:CublasFloat 63 | mA, nA = cublas_size(tA, A) 64 | if nA != length(x) 65 | throw(DimensionMismatch("second dimension of A, $nA, does not match length of x, $(length(x))")) 66 | end 67 | if mA != length(y) 68 | throw(DimensionMismatch("first dimension of A, $mA, does not match length of y, $(length(y))")) 69 | end 70 | if mA == 0 71 | return y 72 | end 73 | if nA == 0 74 | return rmul!(y, 0) 75 | end 76 | gemv!(tA, alpha, A, x, beta, y) 77 | end 78 | 79 | LinearAlgebra.mul!(Y::CuVector{T}, A::CuMatrix{T}, B::CuVector{T}) where T<:CublasFloat = gemv_wrapper!(Y, 'N', A, B) 80 | LinearAlgebra.mul!(Y::CuVector{T}, A::LinearAlgebra.Transpose{<:Any, CuMatrix{T}}, B::CuVector{T}) where T<:CublasFloat = gemv_wrapper!(Y, 'T', A.parent, B) 81 | LinearAlgebra.mul!(Y::CuVector{T}, A::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, B::CuVector{T}) where T<:CublasFloat = gemv_wrapper!(Y, 'T', A.parent, B) 82 | LinearAlgebra.mul!(Y::CuVector{T}, A::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, B::CuVector{T}) where T<:CublasComplex = gemv_wrapper!(Y, 'C', A.parent, B) 83 | 84 | 85 | 86 | # 87 | # BLAS 3 88 | # 89 | 90 | # GEMM 91 | 92 | function gemm_wrapper!(C::CuVecOrMat{T}, tA::Char, tB::Char, 93 | A::CuVecOrMat{T}, 94 | B::CuVecOrMat{T}, 95 | alpha = one(T), 96 | beta = zero(T)) where T <: CublasFloat 97 | mA, nA = cublas_size(tA, A) 98 | mB, nB = cublas_size(tB, B) 99 | 100 | if nA != mB 101 | throw(DimensionMismatch("A has dimensions ($mA,$nA) but B has dimensions ($mB,$nB)")) 102 | end 103 | 104 | if
C === A || B === C 105 | throw(ArgumentError("output matrix must not be aliased with input matrix")) 106 | end 107 | 108 | if mA == 0 || nA == 0 || nB == 0 109 | if size(C) != (mA, nB) 110 | throw(DimensionMismatch("C has dimensions $(size(C)), should have ($mA,$nB)")) 111 | end 112 | return LinearAlgebra.rmul!(C, 0) 113 | end 114 | 115 | gemm!(tA, tB, alpha, A, B, beta, C) 116 | end 117 | 118 | # Mutating 119 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuVecOrMat{T}, B::CuVecOrMat{T}) where T<:CublasFloat = gemm_wrapper!(C, 'N', 'N', A, B) 120 | LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 121 | gemm_wrapper!(C, 'T', 'N', parent(trA), B) 122 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuMatrix{T}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T<:CublasFloat = 123 | gemm_wrapper!(C, 'N', 'T', A, parent(trB)) 124 | LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T<:CublasFloat = 125 | gemm_wrapper!(C, 'T', 'T', parent(trA), parent(trB)) 126 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasReal = 127 | gemm_wrapper!(C, 'T', 'N', parent(adjA), B) 128 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 129 | gemm_wrapper!(C, 'C', 'N', parent(adjA), B) 130 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuMatrix{T}, adjB::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}) where T<:CublasReal = 131 | gemm_wrapper!(C, 'N', 'T', A, parent(adjB)) 132 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuMatrix{T}, adjB::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}) where T<:CublasFloat = 133 | gemm_wrapper!(C, 'N', 'C', A, parent(adjB)) 134 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}) where T<:CublasReal = 135 | gemm_wrapper!(C, 'T', 'T', parent(adjA), parent(adjB)) 136 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}) where T<:CublasFloat = 137 | gemm_wrapper!(C, 'C', 'C', parent(adjA), parent(adjB)) 138 | LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{T, <:CuMatrix{T}}) where T<:CublasReal = 139 | gemm_wrapper!(C, 'T', 'T', parent(trA), parent(adjB)) 140 | LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}) where T<:CublasFloat = 141 | gemm_wrapper!(C, 'T', 'C', parent(trA), parent(adjB)) 142 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{T, <:CuMatrix{T}}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T<:CublasReal = 143 | gemm_wrapper!(C, 'T', 'T', parent(adjA), parent(trB)) 144 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T <: CublasFloat = 145 | gemm_wrapper!(C, 'C', 'T', parent(adjA), parent(trB)) 146 | 147 | 148 | # TRSM 149 | 150 | # ldiv! 
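# Each of the `ldiv!` methods below lowers to a single in-place CUBLAS `trsm!`
# call, overwriting `B` with `A \ B` without allocating GPU temporaries. A
# minimal usage sketch (sizes are illustrative, and a CUDA-capable device is
# assumed):
#
#     using LinearAlgebra, CuArrays
#     A = UpperTriangular(CuArray(rand(Float32, 4, 4) + 4I))
#     B = CuArray(rand(Float32, 4, 2))
#     ldiv!(A, B)    # B now holds A \ B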
151 | ## No transpose/adjoint 152 | LinearAlgebra.ldiv!(A::UpperTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 153 | CUBLAS.trsm!('L', 'U', 'N', 'N', one(T), parent(A), B) 154 | LinearAlgebra.ldiv!(A::UnitUpperTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 155 | CUBLAS.trsm!('L', 'U', 'N', 'U', one(T), parent(A), B) 156 | LinearAlgebra.ldiv!(A::LowerTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 157 | CUBLAS.trsm!('L', 'L', 'N', 'N', one(T), parent(A), B) 158 | LinearAlgebra.ldiv!(A::UnitLowerTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 159 | CUBLAS.trsm!('L', 'L', 'N', 'U', one(T), parent(A), B) 160 | ## Adjoint 161 | LinearAlgebra.ldiv!(A::Adjoint{T,UpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 162 | CUBLAS.trsm!('L', 'U', 'C', 'N', one(T), parent(parent(A)), B) 163 | LinearAlgebra.ldiv!(A::Adjoint{T,UnitUpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 164 | CUBLAS.trsm!('L', 'U', 'C', 'U', one(T), parent(parent(A)), B) 165 | LinearAlgebra.ldiv!(A::Adjoint{T,LowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 166 | CUBLAS.trsm!('L', 'L', 'C', 'N', one(T), parent(parent(A)), B) 167 | LinearAlgebra.ldiv!(A::Adjoint{T,UnitLowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 168 | CUBLAS.trsm!('L', 'L', 'C', 'U', one(T), parent(parent(A)), B) 169 | ## Transpose 170 | LinearAlgebra.ldiv!(A::Transpose{T,UpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 171 | CUBLAS.trsm!('L', 'U', 'T', 'N', one(T), parent(parent(A)), B) 172 | LinearAlgebra.ldiv!(A::Transpose{T,UnitUpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 173 | CUBLAS.trsm!('L', 'U', 'T', 'U', one(T), parent(parent(A)), B) 174 | LinearAlgebra.ldiv!(A::Transpose{T,LowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 175 | CUBLAS.trsm!('L', 'L', 'T', 'N', one(T), parent(parent(A)), B) 176 | LinearAlgebra.ldiv!(A::Transpose{T,UnitLowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 177 | CUBLAS.trsm!('L', 'L', 'T', 'U', one(T), parent(parent(A)), B) 178 | 179 | # rdiv! 
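# The `rdiv!` methods below are the mirror image: one `trsm!` call with side
# 'R', overwriting `A` with `A / B`. Sketch (illustrative sizes, CUDA device
# assumed):
#
#     using LinearAlgebra, CuArrays
#     A = CuArray(rand(Float32, 2, 4))
#     B = LowerTriangular(CuArray(rand(Float32, 4, 4) + 4I))
#     rdiv!(A, B)    # A now holds A / B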
180 | ## No transpose/adjoint 181 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::UpperTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 182 | CUBLAS.trsm!('R', 'U', 'N', 'N', one(T), parent(B), A) 183 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::UnitUpperTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 184 | CUBLAS.trsm!('R', 'U', 'N', 'U', one(T), parent(B), A) 185 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::LowerTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 186 | CUBLAS.trsm!('R', 'L', 'N', 'N', one(T), parent(B), A) 187 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::UnitLowerTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 188 | CUBLAS.trsm!('R', 'L', 'N', 'U', one(T), parent(B), A) 189 | ## Adjoint 190 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,UpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 191 | CUBLAS.trsm!('R', 'U', 'C', 'N', one(T), parent(parent(B)), A) 192 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,UnitUpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 193 | CUBLAS.trsm!('R', 'U', 'C', 'U', one(T), parent(parent(B)), A) 194 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,LowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 195 | CUBLAS.trsm!('R', 'L', 'C', 'N', one(T), parent(parent(B)), A) 196 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,UnitLowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 197 | CUBLAS.trsm!('R', 'L', 'C', 'U', one(T), parent(parent(B)), A) 198 | ## Transpose 199 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,UpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 200 | CUBLAS.trsm!('R', 'U', 'T', 'N', one(T), parent(parent(B)), A) 201 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,UnitUpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 202 | CUBLAS.trsm!('R', 'U', 'T', 'U', one(T), parent(parent(B)), A) 203 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,LowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 204 | CUBLAS.trsm!('R', 'L', 'T', 'N', one(T), parent(parent(B)), A) 205 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,UnitLowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 206 | CUBLAS.trsm!('R', 'L', 'T', 'U', one(T), parent(parent(B)), A) 207 | -------------------------------------------------------------------------------- /src/sparse/array.jl: -------------------------------------------------------------------------------- 1 | # custom extension of CuArray in CUDArt for sparse vectors/matrices 2 | # using CSC format for interop with Julia's native sparse functionality 3 | 4 | import Base: length, size, ndims, eltype, similar, pointer, stride, 5 | copy, convert, reinterpret, show, summary, copyto!, get!, fill!, collect 6 | import LinearAlgebra: BlasFloat, Hermitian, HermOrSym, issymmetric, 7 | ishermitian, istriu, istril, Symmetric, UpperTriangular, LowerTriangular 8 | import SparseArrays: sparse, SparseMatrixCSC 9 | 10 | abstract type AbstractCuSparseArray{Tv, N} <: AbstractSparseArray{Tv, Cint, N} end 11 | const AbstractCuSparseVector{Tv} = AbstractCuSparseArray{Tv,1} 12 | const AbstractCuSparseMatrix{Tv} = AbstractCuSparseArray{Tv,2} 13 | 14 | mutable struct CuSparseVector{Tv} <: AbstractCuSparseVector{Tv} 15 | iPtr::CuVector{Cint} 16 | nzVal::CuVector{Tv} 17 | dims::NTuple{2,Int} 18 | nnz::Cint 19 | 20 | function CuSparseVector{Tv}(iPtr::CuVector{Cint}, nzVal::CuVector{Tv}, dims::Int, nnz::Cint) where Tv 21 | new(iPtr,nzVal,(dims,1),nnz) 22 | end 23 | end 24 | 25 | function CuArrays.unsafe_free!(xs::CuSparseVector) 26 | unsafe_free!(xs.iPtr) 27 | unsafe_free!(xs.nzVal) 28 | 
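# both device buffers are released eagerly here (subject to the allocator's
# reference counting); the wrapper struct itself is ordinary host memory and
# is left to the GC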
return 29 | end 30 | 31 | mutable struct CuSparseMatrixCSC{Tv} <: AbstractCuSparseMatrix{Tv} 32 | colPtr::CuVector{Cint} 33 | rowVal::CuVector{Cint} 34 | nzVal::CuVector{Tv} 35 | dims::NTuple{2,Int} 36 | nnz::Cint 37 | 38 | function CuSparseMatrixCSC{Tv}(colPtr::CuVector{Cint}, rowVal::CuVector{Cint}, nzVal::CuVector{Tv}, dims::NTuple{2,Int}, nnz::Cint) where Tv 39 | new(colPtr,rowVal,nzVal,dims,nnz) 40 | end 41 | end 42 | 43 | function CuArrays.unsafe_free!(xs::CuSparseMatrixCSC) 44 | unsafe_free!(xs.colPtr) 45 | unsafe_free!(xs.rowVal) 46 | unsafe_free!(xs.nzVal) 47 | return 48 | end 49 | 50 | """ 51 | Container to hold sparse matrices in compressed sparse row (CSR) format on the 52 | GPU. 53 | 54 | **Note**: Most CUSPARSE operations work with CSR formatted matrices, rather 55 | than CSC. 56 | """ 57 | mutable struct CuSparseMatrixCSR{Tv} <: AbstractCuSparseMatrix{Tv} 58 | rowPtr::CuVector{Cint} 59 | colVal::CuVector{Cint} 60 | nzVal::CuVector{Tv} 61 | dims::NTuple{2,Int} 62 | nnz::Cint 63 | 64 | function CuSparseMatrixCSR{Tv}(rowPtr::CuVector{Cint}, colVal::CuVector{Cint}, nzVal::CuVector{Tv}, dims::NTuple{2,Int}, nnz::Cint) where Tv 65 | new(rowPtr,colVal,nzVal,dims,nnz) 66 | end 67 | end 68 | 69 | function CuArrays.unsafe_free!(xs::CuSparseMatrixCSR) 70 | unsafe_free!(xs.rowPtr) 71 | unsafe_free!(xs.colVal) 72 | unsafe_free!(xs.nzVal) 73 | return 74 | end 75 | 76 | """ 77 | Container to hold sparse matrices in block compressed sparse row (BSR) format on 78 | the GPU. BSR format is also used in Intel MKL, and is suited to matrices that are 79 | "block" sparse - matrices whose nonzero entries are concentrated in a small number of dense blocks. 80 | """ 81 | mutable struct CuSparseMatrixBSR{Tv} <: AbstractCuSparseMatrix{Tv} 82 | rowPtr::CuVector{Cint} 83 | colVal::CuVector{Cint} 84 | nzVal::CuVector{Tv} 85 | dims::NTuple{2,Int} 86 | blockDim::Cint 87 | dir::SparseChar 88 | nnz::Cint 89 | 90 | function CuSparseMatrixBSR{Tv}(rowPtr::CuVector{Cint}, colVal::CuVector{Cint}, nzVal::CuVector{Tv}, dims::NTuple{2,Int},blockDim::Cint, dir::SparseChar, nnz::Cint) where Tv 91 | new(rowPtr,colVal,nzVal,dims,blockDim,dir,nnz) 92 | end 93 | end 94 | 95 | function CuArrays.unsafe_free!(xs::CuSparseMatrixBSR) 96 | unsafe_free!(xs.rowPtr) 97 | unsafe_free!(xs.colVal) 98 | unsafe_free!(xs.nzVal) 99 | return 100 | end 101 | 102 | """ 103 | Container to hold sparse matrices in NVIDIA's hybrid (HYB) format on the GPU. 104 | HYB format is an opaque struct, which can be converted to/from using 105 | CUSPARSE routines. 106 | """ 107 | mutable struct CuSparseMatrixHYB{Tv} <: AbstractCuSparseMatrix{Tv} 108 | Mat::cusparseHybMat_t 109 | dims::NTuple{2,Int} 110 | nnz::Cint 111 | 112 | function CuSparseMatrixHYB{Tv}(Mat::cusparseHybMat_t, dims::NTuple{2,Int}, nnz::Cint) where Tv 113 | new(Mat,dims,nnz) 114 | end 115 | end 116 | 117 | """ 118 | Utility union type of [`CuSparseMatrixCSC`](@ref), [`CuSparseMatrixCSR`](@ref), 119 | and `Hermitian` and `Symmetric` versions of these two containers. A function accepting 120 | this type can make use of performance improvements by only indexing one triangle of the 121 | matrix if it is guaranteed to be hermitian/symmetric. 122 | """ 123 | const CompressedSparse{T} = Union{CuSparseMatrixCSC{T},CuSparseMatrixCSR{T},HermOrSym{T,CuSparseMatrixCSC{T}},HermOrSym{T,CuSparseMatrixCSR{T}}} 124 | 125 | """ 126 | Utility union type of [`CuSparseMatrixCSC`](@ref), [`CuSparseMatrixCSR`](@ref), 127 | [`CuSparseMatrixBSR`](@ref), and [`CuSparseMatrixHYB`](@ref).
128 | """ 129 | const CuSparseMatrix{T} = Union{CuSparseMatrixCSC{T},CuSparseMatrixCSR{T}, CuSparseMatrixBSR{T}, CuSparseMatrixHYB{T}} 130 | 131 | Hermitian{T}(Mat::CuSparseMatrix{T}) where T = Hermitian{T,typeof(Mat)}(Mat,'U') 132 | 133 | length(g::CuSparseVector) = prod(g.dims) 134 | size(g::CuSparseVector) = g.dims 135 | ndims(g::CuSparseVector) = 1 136 | length(g::CuSparseMatrix) = prod(g.dims) 137 | size(g::CuSparseMatrix) = g.dims 138 | ndims(g::CuSparseMatrix) = 2 139 | 140 | function size(g::CuSparseVector, d::Integer) 141 | if d == 1 142 | return g.dims[d] 143 | elseif d > 1 144 | return 1 145 | else 146 | throw(ArgumentError("dimension must be ≥ 1, got $d")) 147 | end 148 | end 149 | 150 | function size(g::CuSparseMatrix, d::Integer) 151 | if d in [1, 2] 152 | return g.dims[d] 153 | elseif d > 1 154 | return 1 155 | else 156 | throw(ArgumentError("dimension must be ≥ 1, got $d")) 157 | end 158 | end 159 | 160 | issymmetric(M::Union{CuSparseMatrixCSC,CuSparseMatrixCSR})= false 161 | ishermitian(M::Union{CuSparseMatrixCSC,CuSparseMatrixCSR}) where T = false 162 | issymmetric(M::Symmetric{CuSparseMatrixCSC})= true 163 | ishermitian(M::Hermitian{CuSparseMatrixCSC}) = true 164 | 165 | for mat_type in [:CuSparseMatrixCSC, :CuSparseMatrixCSR, :CuSparseMatrixBSR, :CuSparseMatrixHYB] 166 | @eval begin 167 | istriu(M::UpperTriangular{$mat_type}) = true 168 | istril(M::UpperTriangular{$mat_type}) = false 169 | istriu(M::LowerTriangular{$mat_type}) = false 170 | istril(M::LowerTriangular{$mat_type}) = true 171 | end 172 | end 173 | eltype(g::CuSparseMatrix{T}) where T = T 174 | 175 | function collect(Vec::CuSparseVector) 176 | SparseVector(Vec.dims[1], collect(Vec.iPtr), collect(Vec.nzVal)) 177 | end 178 | 179 | function collect(Mat::CuSparseMatrixCSC) 180 | SparseMatrixCSC(Mat.dims[1], Mat.dims[2], collect(Mat.colPtr), collect(Mat.rowVal), collect(Mat.nzVal)) 181 | end 182 | function collect(Mat::CuSparseMatrixCSR) 183 | rowPtr = collect(Mat.rowPtr) 184 | colVal = collect(Mat.colVal) 185 | nzVal = collect(Mat.nzVal) 186 | #construct Is 187 | I = similar(colVal) 188 | counter = 1 189 | for row = 1 : size(Mat)[1], k = rowPtr[row] : (rowPtr[row+1]-1) 190 | I[counter] = row 191 | counter += 1 192 | end 193 | return sparse(I,colVal,nzVal,Mat.dims[1],Mat.dims[2]) 194 | end 195 | 196 | summary(g::CuSparseMatrix) = string(g) 197 | summary(g::CuSparseVector) = string(g) 198 | 199 | CuSparseVector(iPtr::Vector{Ti}, nzVal::Vector{T}, dims::Int) where {T<:BlasFloat, Ti<:Integer} = CuSparseVector{T}(CuArray(convert(Vector{Cint},iPtr)), CuArray(nzVal), dims, convert(Cint,length(nzVal))) 200 | CuSparseVector(iPtr::CuArray{Ti}, nzVal::CuArray{T}, dims::Int) where {T<:BlasFloat, Ti<:Integer} = CuSparseVector{T}(iPtr, nzVal, dims, convert(Cint,length(nzVal))) 201 | 202 | CuSparseMatrixCSC(colPtr::Vector{Ti}, rowVal::Vector{Ti}, nzVal::Vector{T}, dims::NTuple{2,Int}) where {T<:BlasFloat,Ti<:Integer} = CuSparseMatrixCSC{T}(CuArray(convert(Vector{Cint},colPtr)), CuArray(convert(Vector{Cint},rowVal)), CuArray(nzVal), dims, convert(Cint,length(nzVal))) 203 | CuSparseMatrixCSC(colPtr::CuArray{Ti}, rowVal::CuArray{Ti}, nzVal::CuArray{T}, dims::NTuple{2,Int}) where {T<:BlasFloat,Ti<:Integer} = CuSparseMatrixCSC{T}(colPtr, rowVal, nzVal, dims, convert(Cint,length(nzVal))) 204 | CuSparseMatrixCSC(colPtr::CuArray{Ti}, rowVal::CuArray{Ti}, nzVal::CuArray{T}, nnz, dims::NTuple{2,Int}) where {T<:BlasFloat,Ti<:Integer} = CuSparseMatrixCSC{T}(colPtr, rowVal, nzVal, dims, nnz) 205 | 206 | CuSparseMatrixCSR(rowPtr::CuArray, 
colVal::CuArray, nzVal::CuArray{T}, dims::NTuple{2,Int}) where T = CuSparseMatrixCSR{T}(rowPtr, colVal, nzVal, dims, convert(Cint,length(nzVal))) 207 | CuSparseMatrixCSR(rowPtr::CuArray, colVal::CuArray, nzVal::CuArray{T}, nnz, dims::NTuple{2,Int}) where T = CuSparseMatrixCSR{T}(rowPtr, colVal, nzVal, dims, nnz) 208 | 209 | CuSparseMatrixBSR(rowPtr::CuArray, colVal::CuArray, nzVal::CuArray{T}, blockDim, dir, nnz, dims::NTuple{2,Int}) where T = CuSparseMatrixBSR{T}(rowPtr, colVal, nzVal, dims, blockDim, dir, nnz) 210 | 211 | CuSparseVector(Vec::SparseVector) = CuSparseVector(Vec.nzind, Vec.nzval, size(Vec)[1]) 212 | CuSparseMatrixCSC(Vec::SparseVector) = CuSparseMatrixCSC([1, length(Vec.nzind)+1], Vec.nzind, Vec.nzval, (size(Vec)[1], 1)) 213 | CuSparseVector(Mat::SparseMatrixCSC) = size(Mat,2) == 1 ? CuSparseVector(Mat.rowval, Mat.nzval, size(Mat)[1]) : throw(ArgumentError("the matrix must have a single column to be converted to a sparse vector")) 214 | CuSparseMatrixCSC(Mat::SparseMatrixCSC) = CuSparseMatrixCSC(Mat.colptr, Mat.rowval, Mat.nzval, size(Mat)) 215 | CuSparseMatrixCSR(Mat::SparseMatrixCSC) = switch2csr(CuSparseMatrixCSC(Mat)) 216 | 217 | similar(Vec::CuSparseVector) = CuSparseVector(copy(Vec.iPtr), similar(Vec.nzVal), Vec.dims[1]) 218 | similar(Mat::CuSparseMatrixCSC) = CuSparseMatrixCSC(copy(Mat.colPtr), copy(Mat.rowVal), similar(Mat.nzVal), Mat.nnz, Mat.dims) 219 | similar(Mat::CuSparseMatrixCSR) = CuSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(Mat.nzVal), Mat.nnz, Mat.dims) 220 | similar(Mat::CuSparseMatrixBSR) = CuSparseMatrixBSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(Mat.nzVal), Mat.blockDim, Mat.dir, Mat.nnz, Mat.dims) 221 | 222 | function copyto!(dst::CuSparseVector, src::CuSparseVector) 223 | if dst.dims != src.dims 224 | throw(ArgumentError("Inconsistent Sparse Vector size")) 225 | end 226 | copyto!(dst.iPtr, src.iPtr) 227 | copyto!(dst.nzVal, src.nzVal) 228 | dst.nnz = src.nnz 229 | dst 230 | end 231 | 232 | function copyto!(dst::CuSparseMatrixCSC, src::CuSparseMatrixCSC) 233 | if dst.dims != src.dims 234 | throw(ArgumentError("Inconsistent Sparse Matrix size")) 235 | end 236 | copyto!(dst.colPtr, src.colPtr) 237 | copyto!(dst.rowVal, src.rowVal) 238 | copyto!(dst.nzVal, src.nzVal) 239 | dst.nnz = src.nnz 240 | dst 241 | end 242 | 243 | function copyto!(dst::CuSparseMatrixCSR, src::CuSparseMatrixCSR) 244 | if dst.dims != src.dims 245 | throw(ArgumentError("Inconsistent Sparse Matrix size")) 246 | end 247 | copyto!(dst.rowPtr, src.rowPtr) 248 | copyto!(dst.colVal, src.colVal) 249 | copyto!(dst.nzVal, src.nzVal) 250 | dst.nnz = src.nnz 251 | dst 252 | end 253 | 254 | function copyto!(dst::CuSparseMatrixBSR, src::CuSparseMatrixBSR) 255 | if dst.dims != src.dims 256 | throw(ArgumentError("Inconsistent Sparse Matrix size")) 257 | end 258 | copyto!(dst.rowPtr, src.rowPtr) 259 | copyto!(dst.colVal, src.colVal) 260 | copyto!(dst.nzVal, src.nzVal) 261 | dst.dir = src.dir 262 | dst.nnz = src.nnz 263 | dst 264 | end 265 | 266 | function copyto!(dst::CuSparseMatrixHYB, src::CuSparseMatrixHYB) 267 | if dst.dims != src.dims 268 | throw(ArgumentError("Inconsistent Sparse Matrix size")) 269 | end 270 | dst.Mat = src.Mat 271 | dst.nnz = src.nnz 272 | dst 273 | end 274 | 275 | copy(Vec::CuSparseVector) = copyto!(similar(Vec),Vec) 276 | copy(Mat::CuSparseMatrixCSC) = copyto!(similar(Mat),Mat) 277 | copy(Mat::CuSparseMatrixCSR) = copyto!(similar(Mat),Mat) 278 | copy(Mat::CuSparseMatrixBSR) = copyto!(similar(Mat),Mat) 279 | -------------------------------------------------------------------------------- /test/solver.jl:
-------------------------------------------------------------------------------- 1 | @testset "CUSOLVER" begin 2 | 3 | if !isdefined(CuArrays, :CUSOLVER) 4 | @warn "Not testing CUSOLVER" 5 | else 6 | using CuArrays.CUSOLVER 7 | @info "Testing CUSOLVER $(CUSOLVER.version())" 8 | 9 | using LinearAlgebra 10 | 11 | m = 15 12 | n = 10 13 | l = 13 14 | k = 1 15 | 16 | @test_throws ArgumentError CUSOLVER.cusolverjob('M') 17 | 18 | @testset "elty = $elty" for elty in [Float32, Float64, ComplexF32, ComplexF64] 19 | @testset "Cholesky (po)" begin 20 | A = rand(elty,n,n) 21 | A = A*A' #posdef 22 | B = rand(elty,n,n) 23 | d_A = CuArray(A) 24 | d_B = CuArray(B) 25 | 26 | d_F = cholesky(d_A, Val(false)) 27 | F = cholesky(A, Val(false)) 28 | @test F.U ≈ collect(d_F.U) 29 | @test F\(A'B) ≈ collect(d_F\(d_A'd_B)) 30 | 31 | d_F = cholesky(Hermitian(d_A, :L), Val(false)) 32 | F = cholesky(Hermitian(A, :L), Val(false)) 33 | @test F.L ≈ collect(d_F.L) 34 | @test F\(A'B) ≈ collect(d_F\(d_A'd_B)) 35 | 36 | @test_throws DimensionMismatch LinearAlgebra.LAPACK.potrs!('U',d_A,CuArray(rand(elty,m,m))) 37 | 38 | A = rand(elty,m,n) 39 | d_A = CuArray(A) 40 | @test_throws DimensionMismatch cholesky(d_A) 41 | @test_throws DimensionMismatch LinearAlgebra.LAPACK.potrs!('U',d_A,d_B) 42 | 43 | A = zeros(elty,n,n) 44 | d_A = CuArray(A) 45 | @test_throws LinearAlgebra.PosDefException cholesky(d_A) 46 | end 47 | 48 | @testset "getrf!" begin 49 | A = rand(elty,m,n) 50 | d_A = CuArray(A) 51 | d_A,d_ipiv = CUSOLVER.getrf!(d_A) 52 | h_A = collect(d_A) 53 | h_ipiv = collect(d_ipiv) 54 | alu = LinearAlgebra.LU(h_A, convert(Vector{Int},h_ipiv), zero(Int)) 55 | @test A ≈ Array(alu) 56 | A = zeros(elty,n,n) 57 | d_A = CuArray(A) 58 | @test_throws LinearAlgebra.SingularException CUSOLVER.getrf!(d_A) 59 | end 60 | 61 | @testset "getrs!" begin 62 | A = rand(elty,n,n) 63 | d_A = CuArray(A) 64 | d_A,d_ipiv = CUSOLVER.getrf!(d_A) 65 | B = rand(elty,n,n) 66 | d_B = CuArray(B) 67 | d_B = CUSOLVER.getrs!('N',d_A,d_ipiv,d_B) 68 | h_B = collect(d_B) 69 | @test h_B ≈ A\B 70 | A = rand(elty,m,n) 71 | d_A = CuArray(A) 72 | @test_throws DimensionMismatch CUSOLVER.getrs!('N',d_A,d_ipiv,d_B) 73 | A = rand(elty,n,n) 74 | d_A = CuArray(A) 75 | B = rand(elty,m,n) 76 | d_B = CuArray(B) 77 | @test_throws DimensionMismatch CUSOLVER.getrs!('N',d_A,d_ipiv,d_B) 78 | end 79 | 80 | @testset "geqrf!" begin 81 | A = rand(elty,m,n) 82 | d_A = CuArray(A) 83 | d_A,d_tau = CUSOLVER.geqrf!(d_A) 84 | h_A = collect(d_A) 85 | h_tau = collect(d_tau) 86 | qra = LinearAlgebra.QR(h_A, h_tau) 87 | @test A ≈ Array(qra) 88 | end 89 | 90 | @testset "ormqr!" 
begin 91 | A = rand(elty, m, n) 92 | d_A = CuArray(A) 93 | d_A, d_tau = CUSOLVER.geqrf!(d_A) 94 | B = rand(elty, n, l) 95 | d_B = CuArray(B) 96 | d_B = CUSOLVER.ormqr!('L', 'N', d_A, d_tau, d_B) 97 | h_B = collect(d_B) 98 | F = qr!(A) 99 | @test h_B ≈ Array(F.Q)*B 100 | A = rand(elty, n, m) 101 | d_A = CuArray(A) 102 | d_A, d_tau = CUSOLVER.geqrf!(d_A) 103 | B = rand(elty, n, l) 104 | d_B = CuArray(B) 105 | d_B = CUSOLVER.ormqr!('L', 'N', d_A, d_tau, d_B) 106 | h_B = collect(d_B) 107 | F = qr!(A) 108 | @test h_B ≈ Array(F.Q)*B 109 | A = rand(elty, m, n) 110 | d_A = CuArray(A) 111 | d_A, d_tau = CUSOLVER.geqrf!(d_A) 112 | B = rand(elty, l, m) 113 | d_B = CuArray(B) 114 | d_B = CUSOLVER.ormqr!('R', 'N', d_A, d_tau, d_B) 115 | h_B = collect(d_B) 116 | F = qr!(A) 117 | @test h_B ≈ B*Array(F.Q) 118 | A = rand(elty, n, m) 119 | d_A = CuArray(A) 120 | d_A, d_tau = CUSOLVER.geqrf!(d_A) 121 | B = rand(elty, l, n) 122 | d_B = CuArray(B) 123 | d_B = CUSOLVER.ormqr!('R', 'N', d_A, d_tau, d_B) 124 | h_B = collect(d_B) 125 | F = qr!(A) 126 | @test h_B ≈ B*Array(F.Q) 127 | end 128 | 129 | @testset "orgqr!" begin 130 | A = rand(elty,n,m) 131 | d_A = CuArray(A) 132 | d_A,d_tau = CUSOLVER.geqrf!(d_A) 133 | d_Q = CUSOLVER.orgqr!(d_A, d_tau) 134 | h_Q = collect(d_Q) 135 | F = qr!(A) 136 | @test h_Q ≈ Array(F.Q) 137 | A = rand(elty,m,n) 138 | d_A = CuArray(A) 139 | d_A,d_tau = CUSOLVER.geqrf!(d_A) 140 | d_Q = CUSOLVER.orgqr!(d_A, d_tau) 141 | h_Q = collect(d_Q) 142 | F = qr!(A) 143 | @test h_Q ≈ Array(F.Q) 144 | end 145 | 146 | @testset "sytrf!" begin 147 | A = rand(elty,n,n) 148 | A = A + A' #symmetric 149 | d_A = CuArray(A) 150 | d_A,d_ipiv = CUSOLVER.sytrf!('U',d_A) 151 | h_A = collect(d_A) 152 | h_ipiv = collect(d_ipiv) 153 | A, ipiv = LAPACK.sytrf!('U',A) 154 | @test ipiv == h_ipiv 155 | @test A ≈ h_A 156 | A = rand(elty,m,n) 157 | d_A = CuArray(A) 158 | @test_throws DimensionMismatch CUSOLVER.sytrf!('U',d_A) 159 | A = zeros(elty,n,n) 160 | d_A = CuArray(A) 161 | @test_throws LinearAlgebra.SingularException CUSOLVER.sytrf!('U',d_A) 162 | end 163 | 164 | @testset "gebrd!" begin 165 | A = rand(elty,m,n) 166 | d_A = CuArray(A) 167 | d_A, d_D, d_E, d_TAUQ, d_TAUP = CUSOLVER.gebrd!(d_A) 168 | h_A = collect(d_A) 169 | h_D = collect(d_D) 170 | h_E = collect(d_E) 171 | h_TAUQ = collect(d_TAUQ) 172 | h_TAUP = collect(d_TAUP) 173 | A,d,e,q,p = LAPACK.gebrd!(A) 174 | #@test A ≈ h_A 175 | @test d ≈ h_D 176 | @test e ≈ h_E 177 | @test q ≈ h_TAUQ 178 | @test p ≈ h_TAUP 179 | end 180 | 181 | @testset "syevd!" begin 182 | A = rand(elty,m,m) 183 | A += A' 184 | d_A = CuArray(A) 185 | local d_W, d_V 186 | if( elty <: Complex ) 187 | d_W, d_V = CUSOLVER.heevd!('V','U', d_A) 188 | else 189 | d_W, d_V = CUSOLVER.syevd!('V','U', d_A) 190 | end 191 | h_W = collect(d_W) 192 | h_V = collect(d_V) 193 | Eig = eigen(A) 194 | @test Eig.values ≈ h_W 195 | @test abs.(Eig.vectors'*h_V) ≈ I 196 | d_A = CuArray(A) 197 | if( elty <: Complex ) 198 | d_W = CUSOLVER.heevd!('N','U', d_A) 199 | else 200 | d_W = CUSOLVER.syevd!('N','U', d_A) 201 | end 202 | h_W = collect(d_W) 203 | @test Eig.values ≈ h_W 204 | end 205 | 206 | @testset "sygvd!" 
begin 207 | A = rand(elty,m,m) 208 | B = rand(elty,m,m) 209 | A *= A' 210 | B *= B' 211 | d_A = CuArray(A) 212 | d_B = CuArray(B) 213 | local d_W, d_VA, d_VB 214 | if( elty <: Complex ) 215 | d_W, d_VA, d_VB = CUSOLVER.hegvd!(1, 'V','U', d_A, d_B) 216 | else 217 | d_W, d_VA, d_VB = CUSOLVER.sygvd!(1, 'V','U', d_A, d_B) 218 | end 219 | h_W = collect(d_W) 220 | h_VA = collect(d_VA) 221 | h_VB = collect(d_VB) 222 | Eig = eigen(Hermitian(A), Hermitian(B)) 223 | @test Eig.values ≈ h_W 224 | @test A*h_VA ≈ B*h_VA*Diagonal(h_W) rtol=1e-4 225 | # test normalization condition for eigtype 1 226 | @test abs.(h_VA'B*h_VA) ≈ Matrix(one(elty)*I, m, m) 227 | d_A = CuArray(A) 228 | d_B = CuArray(B) 229 | if( elty <: Complex ) 230 | d_W = CUSOLVER.hegvd!(1, 'N','U', d_A, d_B) 231 | else 232 | d_W = CUSOLVER.sygvd!(1, 'N','U', d_A, d_B) 233 | end 234 | h_W = collect(d_W) 235 | @test Eig.values ≈ h_W 236 | d_B = CuArray(rand(elty, m+1, m+1)) 237 | if( elty <: Complex ) 238 | @test_throws DimensionMismatch CUSOLVER.hegvd!(1, 'N','U', d_A, d_B) 239 | else 240 | @test_throws DimensionMismatch CUSOLVER.sygvd!(1, 'N','U', d_A, d_B) 241 | end 242 | end 243 | 244 | @testset "syevj!" begin 245 | A = rand(elty,m,m) 246 | B = rand(elty,m,m) 247 | A *= A' 248 | B *= B' 249 | d_A = CuArray(A) 250 | d_B = CuArray(B) 251 | local d_W, d_VA, d_VB 252 | if( elty <: Complex ) 253 | d_W, d_VA, d_VB = CUSOLVER.hegvj!(1, 'V','U', d_A, d_B) 254 | else 255 | d_W, d_VA, d_VB = CUSOLVER.sygvj!(1, 'V','U', d_A, d_B) 256 | end 257 | h_W = collect(d_W) 258 | h_VA = collect(d_VA) 259 | h_VB = collect(d_VB) 260 | Eig = eigen(Hermitian(A), Hermitian(B)) 261 | @test Eig.values ≈ h_W 262 | @test A*h_VA ≈ B*h_VA*Diagonal(h_W) rtol=1e-4 263 | # test normalization condition for eigtype 1 264 | @test abs.(h_VA'B*h_VA) ≈ Matrix(one(elty)*I, m, m) 265 | d_A = CuArray(A) 266 | d_B = CuArray(B) 267 | if( elty <: Complex ) 268 | d_W = CUSOLVER.hegvj!(1, 'N','U', d_A, d_B) 269 | else 270 | d_W = CUSOLVER.sygvj!(1, 'N','U', d_A, d_B) 271 | end 272 | h_W = collect(d_W) 273 | @test Eig.values ≈ h_W 274 | end 275 | 276 | @testset "svd with $method method" for 277 | method in (CUSOLVER.QRAlgorithm, CUSOLVER.JacobiAlgorithm), 278 | (_m, _n) in ((m, n), (n, m)) 279 | 280 | A = rand(elty, _m, _n) 281 | U, S, V = svd(A, full=true) 282 | d_A = CuArray(A) 283 | 284 | if _m > _n || method == CUSOLVER.JacobiAlgorithm 285 | d_U, d_S, d_V = svd(d_A, method, full=true) 286 | h_S = collect(d_S) 287 | h_U = collect(d_U) 288 | h_V = collect(d_V) 289 | @test abs.(h_U'h_U) ≈ I 290 | @test abs.(h_U[:,1:min(_m,_n)]'U[:,1:min(_m,_n)]) ≈ I 291 | @test collect(svdvals(d_A, method)) ≈ svdvals(A) 292 | @test abs.(h_V'h_V) ≈ I 293 | @test abs.(h_V[:,1:min(_m,_n)]'*V[:,1:min(_m,_n)]) ≈ I 294 | @test collect(d_U'*d_A*d_V) ≈ U'*A*V 295 | @test collect(svd(d_A, method).V') == h_V[:,1:min(_m,_n)]' 296 | else 297 | @test_throws ArgumentError svd(d_A, method) 298 | end 299 | end 300 | # Check that constant propagation works 301 | _svd(A) = svd(A, CUSOLVER.QRAlgorithm) 302 | @inferred _svd(CuArrays.CURAND.curand(Float32, 4, 4)) 303 | 304 | 305 | @testset "qr" begin 306 | tol = min(m, n)*eps(real(elty))*(1 + (elty <: Complex)) 307 | 308 | A = rand(elty, m, n) 309 | d_A = CuArray(A) 310 | d_F = qr(d_A) 311 | d_RR = d_F.Q'*d_A 312 | @test d_RR[1:n,:] ≈ d_F.R atol=tol*norm(A) 313 | @test norm(d_RR[n+1:end,:]) < tol*norm(A) 314 | A = rand(elty, n, m) 315 | d_A = CuArray(A) 316 | d_F = qr(d_A) 317 | @test d_F.Q'*d_A ≈ d_F.R atol=tol*norm(A) 318 | A = rand(elty, m, n) 319 | d_A = CuArray(A) 
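# destructuring the factorization object yields explicit (Q, R) factors,
# which are compared against the CPU factorization below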
320 | h_q, h_r = qr(d_A) 321 | q, r = qr(A) 322 | @test Array(h_q) ≈ Array(q) 323 | @test Array(h_r) ≈ Array(r) 324 | A = rand(elty, n, m) 325 | d_A = CuArray(A) 326 | h_q, h_r = qr(d_A) # FixMe! Use iteration protocol when implemented 327 | q, r = qr(A) 328 | @test Array(h_q) ≈ Array(q) 329 | @test Array(h_r) ≈ Array(r) 330 | end 331 | 332 | end 333 | 334 | end 335 | 336 | end 337 | --------------------------------------------------------------------------------