├── test ├── ptx │ ├── empty.cu │ ├── global.cu │ ├── dummy.cu │ ├── empty.ptx │ ├── global.ptx │ ├── vadd_child.cu │ ├── dummy.ptx │ ├── vadd.cu │ ├── vadd_parent.cu │ ├── Makefile │ ├── vadd_child.ptx │ ├── vectorops.cu │ ├── vadd.ptx │ └── vadd_parent.ptx ├── .gitignore ├── curand.jl ├── nvtx.jl ├── iterator.jl ├── linalg.jl ├── cutensor │ ├── base.jl │ └── permutations.jl ├── apiutils.jl ├── Project.toml ├── cudnn │ ├── tensor.jl │ ├── dropout.jl │ ├── inplace.jl │ ├── softmax.jl │ ├── activation.jl │ └── optensor.jl ├── examples.jl ├── pool.jl ├── cusparse │ └── conversions.jl ├── utils.jl ├── device │ ├── sparse.jl │ ├── ldg.jl │ └── intrinsics │ │ └── math.jl ├── broadcast.jl ├── nvml.jl ├── threading.jl └── exceptions.jl ├── .github ├── FUNDING.yml ├── workflows │ ├── TagBot.yml │ ├── CompatHelper.yml │ └── ManifestUpdater.yml └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── res └── wrap │ ├── .gitignore │ ├── patches │ ├── cusolver │ │ └── cppversion.patch │ ├── cudadrv │ │ ├── cudeviceptr.patch │ │ ├── cuarray.patch │ │ └── batched_memop.patch │ ├── cudnn │ │ ├── severity.patch │ │ └── algorithm.patch │ ├── nvtx │ │ ├── unions.patch │ │ └── macro.patch │ ├── cusparse │ │ └── cppversion.patch │ └── cublas │ │ └── computetype.patch │ ├── Project.toml │ └── README.md ├── docs ├── src │ ├── assets │ │ ├── logo.png │ │ └── favicon.ico │ ├── development │ │ ├── nvvp.png │ │ ├── nsight_systems.png │ │ ├── nsight_compute-api.png │ │ ├── nsight_compute-attach.png │ │ ├── nsight_compute-kernel.png │ │ └── nsight_compute-resume.png │ ├── tutorials │ │ ├── intro1.png │ │ ├── common.jl │ │ └── custom_structs.jl │ ├── usage │ │ ├── multitasking │ │ │ ├── tasks.png │ │ │ └── tasks_pinned.png │ │ └── workflow.md │ ├── api │ │ ├── array.md │ │ ├── essentials.md │ │ └── compiler.md │ ├── installation │ │ └── troubleshooting.md │ ├── index.md │ └── faq.md ├── .gitignore ├── Project.toml ├── make.jl └── Manifest.toml ├── .gitignore ├── lib ├── cupti │ ├── wrappers.jl │ └── CUPTI.jl ├── complex.jl ├── utils │ ├── APIUtils.jl │ ├── enum.jl │ ├── threading.jl │ ├── cache.jl │ └── memoization.jl ├── nvtx │ ├── NVTX.jl │ └── highlevel.jl ├── cufft │ ├── wrappers.jl │ ├── CUFFT.jl │ ├── util.jl │ ├── libcufft_common.jl │ └── error.jl ├── cublas │ ├── libcublas_deprecated.jl │ ├── README.md │ ├── util.jl │ └── error.jl ├── cusparse │ ├── management.jl │ ├── util.jl │ ├── helpers.jl │ ├── error.jl │ ├── CUSPARSE.jl │ └── extra.jl ├── curand │ ├── wrappers.jl │ ├── CURAND.jl │ └── error.jl ├── cudnn │ ├── base.jl │ ├── error.jl │ ├── util.jl │ ├── softmax.jl │ ├── inplace.jl │ └── activation.jl ├── cudadrv │ ├── module │ │ ├── function.jl │ │ └── global.jl │ ├── types.jl │ ├── libcuda_deprecated.jl │ ├── version.jl │ ├── error.jl │ └── pool.jl ├── nvml │ ├── system.jl │ ├── error.jl │ ├── NVML.jl │ └── libnvml_deprecated.jl ├── cusolver │ ├── base.jl │ └── error.jl └── cutensor │ ├── tensor.jl │ ├── CUTENSOR.jl │ ├── interfaces.jl │ └── error.jl ├── src ├── debug.jl ├── deprecated.jl ├── device │ ├── intrinsics │ │ ├── cooperative_groups.jl │ │ ├── misc.jl │ │ ├── memory_dynamic.jl │ │ ├── version.jl │ │ └── assertion.jl │ ├── pointer.jl │ ├── intrinsics.jl │ ├── utils.jl │ └── quirks.jl ├── broadcast.jl ├── precompile.jl ├── iterator.jl ├── compiler │ └── exceptions.jl ├── gpuarrays.jl ├── linalg.jl └── CUDA.jl ├── codecov.yml ├── examples ├── driver │ ├── vadd.cu │ ├── Makefile │ ├── vadd.jl │ └── vadd.ptx ├── hello_world.jl ├── vadd.jl ├── wmma │ ├── high-level.jl │ └── low-level.jl └── 
peakflops.jl ├── deps ├── Deps.jl └── utils.jl ├── perf ├── Project.toml ├── cudadevrt.jl ├── kernel.jl ├── latency.jl └── byval.jl ├── CITATION.bib ├── LICENSE.md └── Project.toml /test/ptx/empty.cu: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/.gitignore: -------------------------------------------------------------------------------- 1 | Manifest.toml 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: JuliaLang 2 | -------------------------------------------------------------------------------- /test/ptx/global.cu: -------------------------------------------------------------------------------- 1 | __device__ int foobar; 2 | -------------------------------------------------------------------------------- /res/wrap/.gitignore: -------------------------------------------------------------------------------- 1 | LibTemplate.jl 2 | ctypes.jl 3 | lib*.jl 4 | -------------------------------------------------------------------------------- /test/ptx/dummy.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | 3 | __global__ void dummy() 4 | { 5 | } 6 | 7 | } 8 | -------------------------------------------------------------------------------- /test/ptx/empty.ptx: -------------------------------------------------------------------------------- 1 | 2 | .version 3.1 3 | .target sm_20 4 | .address_size 64 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /docs/src/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/assets/logo.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.jl.*.cov 2 | *.jl.cov 3 | *.jl.mem 4 | /docs/build/ 5 | .vscode 6 | lcov.info 7 | build/ 8 | -------------------------------------------------------------------------------- /docs/src/assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/assets/favicon.ico -------------------------------------------------------------------------------- /docs/src/development/nvvp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/development/nvvp.png -------------------------------------------------------------------------------- /docs/src/tutorials/intro1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/tutorials/intro1.png -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | 3 | # generated files 4 | src/tutorials/introduction.md 5 | src/tutorials/custom_structs.md 6 | 7 | -------------------------------------------------------------------------------- /docs/src/development/nsight_systems.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/development/nsight_systems.png -------------------------------------------------------------------------------- /docs/src/usage/multitasking/tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/usage/multitasking/tasks.png -------------------------------------------------------------------------------- /test/ptx/global.ptx: -------------------------------------------------------------------------------- 1 | 2 | .version 3.1 3 | .target sm_20 4 | .address_size 64 5 | 6 | .global .align 4 .u32 foobar; 7 | 8 | 9 | -------------------------------------------------------------------------------- /test/ptx/vadd_child.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | 3 | __device__ float add(float a, float b) 4 | { 5 | return a+b; 6 | } 7 | 8 | } 9 | -------------------------------------------------------------------------------- /docs/src/development/nsight_compute-api.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/development/nsight_compute-api.png -------------------------------------------------------------------------------- /docs/src/usage/multitasking/tasks_pinned.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/usage/multitasking/tasks_pinned.png -------------------------------------------------------------------------------- /docs/src/development/nsight_compute-attach.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/development/nsight_compute-attach.png -------------------------------------------------------------------------------- /docs/src/development/nsight_compute-kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/development/nsight_compute-kernel.png -------------------------------------------------------------------------------- /docs/src/development/nsight_compute-resume.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/development/nsight_compute-resume.png -------------------------------------------------------------------------------- /lib/cupti/wrappers.jl: -------------------------------------------------------------------------------- 1 | function version() 2 | version_ref = Ref{Cuint}() 3 | cuptiGetVersion(version_ref) 4 | VersionNumber(version_ref[]) 5 | end 6 | -------------------------------------------------------------------------------- /docs/src/api/array.md: -------------------------------------------------------------------------------- 1 | # Array programming 2 | 3 | The CUDA array type, `CuArray`, generally implements the Base array interface and all of its 4 | expected methods. 
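For illustration, a minimal sketch of what that looks like in practice (array sizes and values here are arbitrary):

```julia
using CUDA

a = CuArray{Float32}(undef, 1024)   # uninitialized device array
b = CUDA.rand(Float32, 1024)        # random data generated on the GPU
a .= 2f0 .* b .+ 1f0                # broadcasting compiles to a fused GPU kernel
host = Array(a)                     # copy the result back to the CPU
```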
5 | -------------------------------------------------------------------------------- /src/debug.jl: -------------------------------------------------------------------------------- 1 | # debug functionality 2 | 3 | isdebug(group, mod=CUDA) = 4 | Base.CoreLogging.current_logger_for_env(Base.CoreLogging.Debug, group, mod) !== nothing 5 | -------------------------------------------------------------------------------- /test/ptx/dummy.ptx: -------------------------------------------------------------------------------- 1 | 2 | .version 3.1 3 | .target sm_20 4 | .address_size 64 5 | 6 | 7 | .visible .entry dummy( 8 | 9 | ) 10 | { 11 | 12 | 13 | 14 | ret; 15 | } 16 | 17 | 18 | -------------------------------------------------------------------------------- /test/ptx/vadd.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | 3 | __global__ void vadd(const float *a, const float *b, float *c) 4 | { 5 | int i = blockIdx.x *blockDim.x + threadIdx.x; 6 | c[i] = a[i] + b[i]; 7 | } 8 | 9 | } 10 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | ignore: 3 | - "lib/*/lib*.jl" 4 | - "src/device" 5 | - "res/" 6 | - "doc/" 7 | - "perf/" 8 | status: 9 | patch: false 10 | project: false 11 | changes: false 12 | -------------------------------------------------------------------------------- /examples/driver/vadd.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | 3 | __global__ void kernel_vadd(const float *a, const float *b, float *c) 4 | { 5 | int i = blockIdx.x *blockDim.x + threadIdx.x; 6 | c[i] = a[i] + b[i]; 7 | } 8 | 9 | } 10 | -------------------------------------------------------------------------------- /test/curand.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CURAND 2 | 3 | @test CURAND.version() isa VersionNumber 4 | 5 | rng = CURAND.default_rng() 6 | Random.seed!(rng) 7 | Random.seed!(rng, nothing) 8 | Random.seed!(rng, 1) 9 | Random.seed!(rng, 1, 0) 10 | -------------------------------------------------------------------------------- /examples/driver/Makefile: -------------------------------------------------------------------------------- 1 | OBJS=vadd.ptx 2 | 3 | NVCC=nvcc 4 | NVCCFLAGS= 5 | 6 | 7 | .PHONY: all 8 | all: $(OBJS) 9 | 10 | .PHONY: clean 11 | clean: 12 | $(RM) $(OBJS) 13 | 14 | 15 | %.ptx: %.cu 16 | $(NVCC) $(NVCCFLAGS) -ptx $^ -o $@ 17 | -------------------------------------------------------------------------------- /res/wrap/patches/cusolver/cppversion.patch: -------------------------------------------------------------------------------- 1 | --- a/libcusolver_common.jl 2 | +++ b/libcusolver_common.jl 3 | @@ -8,3 +7,0 @@ 4 | -const CUSOLVER_CPP_VERSION = __cplusplus 5 | -const CUSOLVER_DEPRECATED = new_func 6 | -const CUSOLVER_DEPRECATED_ENUM = new_enum 7 | -------------------------------------------------------------------------------- /test/ptx/vadd_parent.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | 3 | __device__ float add(float a, float b); 4 | 5 | __global__ void vadd(const float *a, const float *b, float *c) 6 | { 7 | int i = blockIdx.x *blockDim.x + threadIdx.x; 8 | c[i] = add(a[i], b[i]); 9 | } 10 | 11 | } 12 | -------------------------------------------------------------------------------- /deps/Deps.jl: 
-------------------------------------------------------------------------------- 1 | module Deps 2 | 3 | Base.Experimental.@compiler_options compile=min optimize=0 infer=false 4 | 5 | import ..CUDA 6 | import ..LLVM 7 | 8 | include("discovery.jl") 9 | include("compatibility.jl") 10 | include("bindeps.jl") 11 | include("utils.jl") 12 | 13 | end 14 | -------------------------------------------------------------------------------- /perf/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" 3 | HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" 4 | JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" 5 | StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" 6 | StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" 7 | -------------------------------------------------------------------------------- /docs/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" 3 | Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" 4 | Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" 5 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 6 | 7 | [compat] 8 | Documenter = "0.27" 9 | Literate = "2.2" 10 | -------------------------------------------------------------------------------- /lib/complex.jl: -------------------------------------------------------------------------------- 1 | # CUDA's complex types are defined in terms of vector types (float2, double2), 2 | # but those seem compatible with Julia's complex numbers, so use those. 3 | const cuFloatComplex = Complex{Float32} 4 | const cuDoubleComplex = Complex{Float64} 5 | 6 | # aliases 7 | const cuComplex = cuFloatComplex 8 | -------------------------------------------------------------------------------- /lib/utils/APIUtils.jl: -------------------------------------------------------------------------------- 1 | module APIUtils 2 | 3 | using ..CUDA 4 | 5 | using LLVM 6 | using LLVM.Interop 7 | 8 | # helpers that facilitate working with CUDA APIs 9 | include("call.jl") 10 | include("enum.jl") 11 | include("threading.jl") 12 | include("cache.jl") 13 | include("memoization.jl") 14 | 15 | end 16 | -------------------------------------------------------------------------------- /docs/src/api/essentials.md: -------------------------------------------------------------------------------- 1 | # Essentials 2 | 3 | 4 | ## Initialization 5 | 6 | ```@docs 7 | CUDA.functional(::Bool) 8 | has_cuda 9 | has_cuda_gpu 10 | ``` 11 | 12 | 13 | ## Global state 14 | 15 | ```@docs 16 | context 17 | context! 18 | device 19 | device! 20 | device_reset! 21 | stream 22 | stream! 
23 | ``` 24 | -------------------------------------------------------------------------------- /test/nvtx.jl: -------------------------------------------------------------------------------- 1 | # markers 2 | 3 | NVTX.mark("test") 4 | 5 | 6 | # ranges 7 | 8 | NVTX.@range "test" begin 9 | end 10 | 11 | NVTX.@range function test() 12 | end 13 | test() 14 | 15 | @eval test2() = nothing 16 | 17 | NVTX.@range function Main.test2(::Int) 18 | end 19 | 20 | NVTX.@range function Main.test2(::T) where T 21 | end 22 | 23 | NVTX.@range test3() = nothing 24 | -------------------------------------------------------------------------------- /test/ptx/Makefile: -------------------------------------------------------------------------------- 1 | SRCS=$(wildcard *.cu) 2 | OBJS=$(SRCS:.cu=.ptx) 3 | 4 | CUDA_ROOT=/usr 5 | NVCC=$(CUDA_ROOT)/bin/nvcc 6 | NVCCFLAGS=-arch=sm_20 7 | 8 | 9 | .PHONY: all 10 | all: $(OBJS) 11 | 12 | .PHONY: clean 13 | clean: 14 | $(RM) $(OBJS) 15 | 16 | 17 | %.ptx: %.cu 18 | $(NVCC) $(NVCCFLAGS) -ptx $^ -o $@ 19 | sed -i -e '/\.file/d' -e '/\.loc/d' -e '/^\/\//d' $@ 20 | -------------------------------------------------------------------------------- /res/wrap/patches/cudadrv/cudeviceptr.patch: -------------------------------------------------------------------------------- 1 | --- a/libcuda_common.jl 2019-10-15 15:11:11.826266035 +0200 2 | +++ b/libcuda_common.jl 2019-10-15 15:31:06.144762261 +0200 3 | @@ -42,7 +42,6 @@ 4 | 5 | const cuuint32_t = UInt32 6 | const cuuint64_t = UInt64 7 | -const CUdeviceptr = Culonglong 8 | const CUdevice = Cint 9 | const CUctx_st = Cvoid 10 | const CUcontext = Ptr{CUctx_st} 11 | -------------------------------------------------------------------------------- /lib/nvtx/NVTX.jl: -------------------------------------------------------------------------------- 1 | module NVTX 2 | 3 | using ..CUDA 4 | using ..CUDA: libnvtx, @checked 5 | using ..CUDA: CUstream, CUdevice, CUcontext, CUevent 6 | 7 | using CEnum: @cenum 8 | 9 | using ExprTools: splitdef, combinedef 10 | 11 | 12 | # core library 13 | initialize_context() = return 14 | include("libnvtx_common.jl") 15 | include("libnvtx.jl") 16 | 17 | include("highlevel.jl") 18 | 19 | end 20 | -------------------------------------------------------------------------------- /res/wrap/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | CSTParser = "00ebfdb7-1f24-5e51-bd34-a7502290713f" 3 | CUDA_full_jll = "4f82f1eb-248c-5f56-a42e-99106d144614" 4 | CUDNN_jll = "62b44479-cb7b-5706-934f-f13b2eb2e645" 5 | CUTENSOR_jll = "35b6c64b-1ee1-5834-92a3-3f624899209a" 6 | Clang = "40e3b903-d033-50b4-a0cc-940c62c95e31" 7 | Tokenize = "0796e94c-ce3b-5d07-9a54-7f471281c624" 8 | 9 | [compat] 10 | julia = "1.4" 11 | -------------------------------------------------------------------------------- /.github/workflows/TagBot.yml: -------------------------------------------------------------------------------- 1 | name: TagBot 2 | 3 | on: 4 | issue_comment: 5 | types: 6 | - created 7 | workflow_dispatch: 8 | 9 | jobs: 10 | TagBot: 11 | if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: JuliaRegistries/TagBot@v1 15 | with: 16 | token: ${{ secrets.GITHUB_TOKEN }} 17 | -------------------------------------------------------------------------------- /lib/cufft/wrappers.jl: -------------------------------------------------------------------------------- 1 | # wrappers of low-level functionality 2 | 3 | 
function cufftGetProperty(property::libraryPropertyType) 4 | value_ref = Ref{Cint}() 5 | cufftGetProperty(property, value_ref) 6 | value_ref[] 7 | end 8 | 9 | version() = VersionNumber(cufftGetProperty(CUDA.MAJOR_VERSION), 10 | cufftGetProperty(CUDA.MINOR_VERSION), 11 | cufftGetProperty(CUDA.PATCH_LEVEL)) 12 | -------------------------------------------------------------------------------- /res/wrap/patches/cudadrv/cuarray.patch: -------------------------------------------------------------------------------- 1 | --- a/libcuda_common.jl 2 | +++ b/libcuda_common.jl 3 | @@ -82,8 +82,6 @@ const CUmod_st = Cvoid 4 | const CUmodule = Ptr{CUmod_st} 5 | const CUfunc_st = Cvoid 6 | const CUfunction = Ptr{CUfunc_st} 7 | -const CUarray_st = Cvoid 8 | -const CUarray = Ptr{CUarray_st} 9 | const CUmipmappedArray_st = Cvoid 10 | const CUmipmappedArray = Ptr{CUmipmappedArray_st} 11 | const CUtexref_st = Cvoid 12 | -------------------------------------------------------------------------------- /test/ptx/vadd_child.ptx: -------------------------------------------------------------------------------- 1 | 2 | .version 3.1 3 | .target sm_20 4 | .address_size 64 5 | 6 | 7 | .visible .func (.param .b32 func_retval0) add( 8 | .param .b32 add_param_0, 9 | .param .b32 add_param_1 10 | ) 11 | { 12 | .reg .f32 %f<4>; 13 | 14 | 15 | ld.param.f32 %f1, [add_param_0]; 16 | ld.param.f32 %f2, [add_param_1]; 17 | add.f32 %f3, %f1, %f2; 18 | st.param.f32 [func_retval0+0], %f3; 19 | ret; 20 | } 21 | 22 | 23 | -------------------------------------------------------------------------------- /deps/utils.jl: -------------------------------------------------------------------------------- 1 | export getenv 2 | 3 | # robustly get and parse an env var 4 | function getenv(var, default::T) where T 5 | if haskey(ENV, var) 6 | result = tryparse(T, ENV[var]) 7 | if result === nothing 8 | @warn "Could not parse $(var)=$(ENV[var]), using default value '$default'" 9 | default 10 | else 11 | result 12 | end 13 | else 14 | default 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /res/wrap/patches/cudnn/severity.patch: -------------------------------------------------------------------------------- 1 | --- a/libcudnn_common.jl 2019-10-23 17:52:17.651150610 +0200 2 | +++ b/libcudnn_common.jl 2019-10-23 17:51:42.383658270 +0200 3 | @@ -26,9 +26,6 @@ 4 | end 5 | 6 | 7 | -const CUDNN_SEV_ERROR_EN = UInt32(1) << CUDNN_SEV_ERROR 8 | -const CUDNN_SEV_WARNING_EN = UInt32(1) << CUDNN_SEV_WARNING 9 | -const CUDNN_SEV_INFO_EN = UInt32(1) << CUDNN_SEV_INFO 10 | const cudnnContext = Cvoid 11 | const cudnnHandle_t = Ptr{cudnnContext} 12 | 13 | -------------------------------------------------------------------------------- /examples/hello_world.jl: -------------------------------------------------------------------------------- 1 | using CUDA 2 | 3 | if Sys.iswindows() 4 | function hello_world() 5 | @cuprintf("Greetings from block %lld, thread %lld!\n", Int64(blockIdx().x), Int64(threadIdx().x)) 6 | return 7 | end 8 | else 9 | function hello_world() 10 | @cuprintf("Greetings from block %ld, thread %ld!\n", Int64(blockIdx().x), Int64(threadIdx().x)) 11 | return 12 | end 13 | end 14 | @cuda blocks=2 threads=2 hello_world() 15 | synchronize() 16 | -------------------------------------------------------------------------------- /lib/cupti/CUPTI.jl: -------------------------------------------------------------------------------- 1 | module CUPTI 2 | 3 | using ..APIUtils 4 | 5 | using ..CUDA 6 | using 
..CUDA: libcupti, @retry_reclaim, initialize_context 7 | using ..CUDA: CUuuid, CUcontext, CUstream, CUdevice, CUdevice_attribute, 8 | CUgraph, CUgraphNode, CUgraphNodeType, CUgraphExec, CUaccessPolicyWindow 9 | 10 | using CEnum: @cenum 11 | 12 | 13 | # core library 14 | include("libcupti_common.jl") 15 | include("error.jl") 16 | include("libcupti.jl") 17 | 18 | include("wrappers.jl") 19 | 20 | end 21 | -------------------------------------------------------------------------------- /examples/driver/vadd.jl: -------------------------------------------------------------------------------- 1 | using Test 2 | 3 | using CUDA 4 | 5 | md = CuModuleFile(joinpath(@__DIR__, "vadd.ptx")) 6 | vadd = CuFunction(md, "kernel_vadd") 7 | 8 | dims = (3,4) 9 | a = round.(rand(Float32, dims) * 100) 10 | b = round.(rand(Float32, dims) * 100) 11 | c = similar(a) 12 | 13 | d_a = CuArray(a) 14 | d_b = CuArray(b) 15 | d_c = CuArray(c) 16 | 17 | len = prod(dims) 18 | cudacall(vadd, Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat}}, d_a, d_b, d_c; threads=len) 19 | 20 | @test a+b ≈ Array(d_c) 21 | -------------------------------------------------------------------------------- /examples/vadd.jl: -------------------------------------------------------------------------------- 1 | using Test 2 | 3 | using CUDA 4 | using CUDA: i32 5 | 6 | function vadd(a, b, c) 7 | i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x 8 | c[i] = a[i] + b[i] 9 | return 10 | end 11 | 12 | dims = (3,4) 13 | a = round.(rand(Float32, dims) * 100) 14 | b = round.(rand(Float32, dims) * 100) 15 | c = similar(a) 16 | 17 | d_a = CuArray(a) 18 | d_b = CuArray(b) 19 | d_c = CuArray(c) 20 | 21 | len = prod(dims) 22 | @cuda threads=len vadd(d_a, d_b, d_c) 23 | c = Array(d_c) 24 | @test a+b ≈ c 25 | -------------------------------------------------------------------------------- /res/wrap/patches/nvtx/unions.patch: -------------------------------------------------------------------------------- 1 | --- a/libnvtx_common.jl 2019-10-25 16:09:36.638690884 +0200 2 | +++ b/libnvtx_common.jl 2019-10-25 16:33:25.506013480 +0200 3 | @@ -72,6 +64,10 @@ 4 | end 5 | 6 | 7 | +struct payload_t 8 | + ullValue::UInt64 9 | +end 10 | + 11 | struct nvtxEventAttributes_v2 12 | version::UInt16 13 | size::UInt16 14 | @@ -96,6 +92,10 @@ 15 | end 16 | 17 | 18 | +struct identifier_t 19 | + ullValue::UInt64 20 | +end 21 | + 22 | struct nvtxResourceAttributes_v0 23 | version::UInt16 24 | size::UInt16 25 | -------------------------------------------------------------------------------- /src/deprecated.jl: -------------------------------------------------------------------------------- 1 | # Deprecated functionality 2 | 3 | @deprecate CuDevice(ctx::CuContext) device(ctx) 4 | @deprecate CuCurrentDevice() current_device() 5 | @deprecate CuCurrentContext() current_context() 6 | @deprecate CuContext(ptr::Union{Ptr,CuPtr}) context(ptr) 7 | @deprecate CuDevice(ptr::Union{Ptr,CuPtr}) device(ptr) 8 | 9 | @deprecate CuDefaultStream() default_stream() 10 | @deprecate CuStreamLegacy() legacy_stream() 11 | @deprecate CuStreamPerThread() per_thread_stream() 12 | @deprecate query(s::CuStream) isdone(s) 13 | @deprecate query(e::CuEvent) isdone(e) 14 | -------------------------------------------------------------------------------- /lib/cufft/CUFFT.jl: -------------------------------------------------------------------------------- 1 | module CUFFT 2 | 3 | using ..APIUtils 4 | 5 | using ..CUDA 6 | using ..CUDA: CUstream, cuComplex, cuDoubleComplex, libraryPropertyType 7 | using ..CUDA: 
libcufft, unsafe_free!, @retry_reclaim, @context!, initialize_context
8 | 
9 | using CEnum: @cenum
10 | 
11 | using Reexport: @reexport
12 | 
13 | 
14 | # core library
15 | include("libcufft_common.jl")
16 | include("error.jl")
17 | include("libcufft.jl")
18 | 
19 | # low-level wrappers
20 | include("util.jl")
21 | include("wrappers.jl")
22 | 
23 | # high-level integrations
24 | include("fft.jl")
25 | 
26 | end
27 | 
--------------------------------------------------------------------------------
/res/wrap/patches/nvtx/macro.patch:
--------------------------------------------------------------------------------
1 | --- a/libnvtx_common.jl	2019-10-25 16:09:36.638690884 +0200
2 | +++ b/libnvtx_common.jl	2019-10-25 16:25:03.653940666 +0200
3 | @@ -5,7 +5,7 @@
4 | 
5 |  # Skipping MacroDefinition: NVTX_INLINE_STATIC inline static
6 | 
7 | -const NVTX_DECLSPEC = NVTX_INLINE_STATIC
8 | +# Skipping MacroDefinition: NVTX_DECLSPEC
9 | 
10 | # Skipping MacroDefinition: NVTX_VERSIONED_IDENTIFIER_L3 ( NAME , VERSION ) NAME ## _v ## VERSION
11 | # Skipping MacroDefinition: NVTX_VERSIONED_IDENTIFIER_L2 ( NAME , VERSION ) NVTX_VERSIONED_IDENTIFIER_L3 ( NAME , VERSION )
12 | 
--------------------------------------------------------------------------------
/docs/src/tutorials/common.jl:
--------------------------------------------------------------------------------
1 | # function to run a Julia script outside of the current environment
2 | function script(code; wrapper=``, args=``)
3 |     if Base.JLOptions().project != C_NULL
4 |         args = `$args --project=$(unsafe_string(Base.JLOptions().project))`
5 |     end
6 |     mktemp() do path, io
7 |         write(io, code)
8 |         flush(io)
9 |         cmd = `$wrapper $(Base.julia_cmd()) $args $path`
10 |         # redirect stderr to stdout to have it picked up by Weave.jl
11 |         run(pipeline(ignorestatus(cmd), stderr=stdout))
12 |     end
13 |     nothing
14 | end
15 | 
--------------------------------------------------------------------------------
/lib/cublas/libcublas_deprecated.jl:
--------------------------------------------------------------------------------
1 | # Removed in CUDA 11.0
2 | 
3 | @checked function cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
4 |                               beta, C, ldc)
5 |     initialize_context()
6 |     ccall((:cublasHgemm, libcublas()), cublasStatus_t,
7 |           (cublasHandle_t, cublasOperation_t, cublasOperation_t, Cint, Cint,
8 |            Cint, RefOrCuRef{Float16}, CuPtr{Float16}, Cint, CuPtr{Float16}, Cint,
9 |            RefOrCuRef{Float16}, CuPtr{Float16}, Cint),
10 |           handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
11 | end
12 | 
--------------------------------------------------------------------------------
/lib/cusparse/management.jl:
--------------------------------------------------------------------------------
1 | # cuSPARSE functions for managing the library
2 | 
3 | function cusparseCreate()
4 |     handle = Ref{cusparseHandle_t}()
5 |     @check unsafe_cusparseCreate(handle) CUSPARSE_STATUS_NOT_INITIALIZED
6 |     handle[]
7 | end
8 | 
9 | function cusparseGetProperty(property::libraryPropertyType)
10 |     value_ref = Ref{Cint}()
11 |     cusparseGetProperty(property, value_ref)
12 |     value_ref[]
13 | end
14 | 
15 | version() = VersionNumber(cusparseGetProperty(CUDA.MAJOR_VERSION),
16 |                           cusparseGetProperty(CUDA.MINOR_VERSION),
17 |                           cusparseGetProperty(CUDA.PATCH_LEVEL))
18 | 
--------------------------------------------------------------------------------
/lib/curand/wrappers.jl:
--------------------------------------------------------------------------------
1 | # wrappers of low-level functionality
2 | 
3 | 
function curandCreateGenerator(typ) 4 | handle_ref = Ref{curandGenerator_t}() 5 | @check unsafe_curandCreateGenerator(handle_ref, typ) CURAND_STATUS_INITIALIZATION_FAILED 6 | handle_ref[] 7 | end 8 | 9 | function curandGetProperty(property::libraryPropertyType) 10 | value_ref = Ref{Cint}() 11 | curandGetProperty(property, value_ref) 12 | value_ref[] 13 | end 14 | 15 | version() = VersionNumber(curandGetProperty(CUDA.MAJOR_VERSION), 16 | curandGetProperty(CUDA.MINOR_VERSION), 17 | curandGetProperty(CUDA.PATCH_LEVEL)) 18 | -------------------------------------------------------------------------------- /res/wrap/patches/cusparse/cppversion.patch: -------------------------------------------------------------------------------- 1 | --- a/libcusparse_common.jl 2 | +++ b/libcusparse_common.jl 3 | @@ -10,7 +10,6 @@ const CUSPARSE_VER_MINOR = 1 4 | const CUSPARSE_VER_PATCH = 0 5 | const CUSPARSE_VER_BUILD = 218 6 | const CUSPARSE_VERSION = CUSPARSE_VER_MAJOR * 1000 + CUSPARSE_VER_MINOR * 100 + CUSPARSE_VER_PATCH 7 | -const CUSPARSE_CPP_VERSION = __cplusplus 8 | 9 | # Skipping MacroDefinition: CUSPARSE_DEPRECATED ( new_func ) __attribute__ ( ( deprecated ( "please use " # new_func " instead" ) ) ) 10 | # Skipping MacroDefinition: CUSPARSE_DEPRECATED_ENUM ( new_enum ) __attribute__ ( ( deprecated ( "please use " # new_enum " instead" ) ) ) 11 | -------------------------------------------------------------------------------- /res/wrap/patches/cudnn/algorithm.patch: -------------------------------------------------------------------------------- 1 | --- a/libcudnn_common.jl 2019-10-23 17:52:17.651150610 +0200 2 | +++ b/libcudnn_common.jl 2019-10-23 17:53:07.195648729 +0200 3 | @@ -403,5 +400,8 @@ 4 | -struct cudnnAlgorithmUnionStruct 5 | - algo::Algorithm 6 | -end 7 | - 8 | -const cudnnAlgorithm_t = cudnnAlgorithmUnionStruct 9 | +# FIXME: can't use such a union as the type in a ccall expression 10 | +#Algorithm = Union{cudnnConvolutionFwdAlgo_t, cudnnConvolutionBwdFilterAlgo_t, cudnnConvolutionBwdDataAlgo_t, cudnnRNNAlgo_t, cudnnCTCLossAlgo_t} 11 | +#struct cudnnAlgorithm_t 12 | +# algo::Algorithm 13 | +#end 14 | +# 15 | +#const cudnnAlgorithm_t = cudnnAlgorithmUnionStruct 16 | +const cudnnAlgorithm_t = Cint 17 | -------------------------------------------------------------------------------- /src/device/intrinsics/cooperative_groups.jl: -------------------------------------------------------------------------------- 1 | # C. Cooperative Groups 2 | 3 | export this_grid, sync_grid 4 | 5 | """ 6 | this_grid() 7 | 8 | Returns a `grid_handle` of the grid group this thread belongs to. Only available if a 9 | cooperative kernel is launched. 10 | """ 11 | this_grid() = cudaCGGetIntrinsicHandle(cudaCGScopeGrid) 12 | 13 | """ 14 | sync_grid(grid_handle::Culonglong) 15 | 16 | Waits until all threads in all blocks in the grid `grid_handle` have reached this point and 17 | all global memory accesses made by these threads prior to `sync_grid()` are visible to all 18 | threads in the grid. A 32-bit integer `cudaError_t` is returned. 
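A minimal usage sketch (illustrative only: the kernel body is hypothetical, and the kernel
must be launched cooperatively, e.g. via `@cuda cooperative=true`, for `this_grid()` to be valid):

```julia
function kernel(data)
    grid = this_grid()
    # phase 1: each thread writes its own part of `data`
    sync_grid(grid)      # wait until every thread in every block gets here
    # phase 2: it is now safe to read values written by other blocks
    return
end
# @cuda cooperative=true threads=256 blocks=4 kernel(data)
```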
19 | """ 20 | sync_grid(handle) = cudaCGSynchronize(handle, 0) 21 | -------------------------------------------------------------------------------- /test/ptx/vectorops.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | 3 | __global__ void vadd(const float *a, const float *b, float *c) 4 | { 5 | int i = blockIdx.x *blockDim.x + threadIdx.x; 6 | c[i] = a[i] + b[i]; 7 | } 8 | 9 | __global__ void vsub(const float *a, const float *b, float *c) 10 | { 11 | int i = blockIdx.x *blockDim.x + threadIdx.x; 12 | c[i] = a[i] - b[i]; 13 | } 14 | 15 | __global__ void vmul(const float *a, const float *b, float *c) 16 | { 17 | int i = blockIdx.x *blockDim.x + threadIdx.x; 18 | c[i] = a[i] * b[i]; 19 | } 20 | 21 | __global__ void vdiv(const float *a, const float *b, float *c) 22 | { 23 | int i = blockIdx.x *blockDim.x + threadIdx.x; 24 | c[i] = a[i] / b[i]; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /test/iterator.jl: -------------------------------------------------------------------------------- 1 | batch_count = 10 2 | max_batch_items = 3 3 | max_ndims = 3 4 | sizes = 20:50 5 | 6 | rand_shape = () -> rand(sizes, rand(1:max_ndims)) 7 | batches = [[rand(Float32, rand_shape()...) for _ in 1:rand(1:max_batch_items)] 8 | for _ in 1:batch_count] 9 | cubatches = CuIterator(batch for batch in batches) # ensure generators are accepted 10 | 11 | previous_cubatch = missing 12 | for (batch, cubatch) in zip(batches, cubatches) 13 | global previous_cubatch 14 | @test ismissing(previous_cubatch) || all(x -> x.storage === nothing, previous_cubatch) 15 | @test batch == Array.(cubatch) 16 | @test all(x -> x isa CuArray, cubatch) 17 | previous_cubatch = cubatch 18 | end 19 | -------------------------------------------------------------------------------- /test/linalg.jl: -------------------------------------------------------------------------------- 1 | using LinearAlgebra 2 | 3 | @testset "qr size mismatch" begin 4 | X = rand(Float32, 2, 1) 5 | Q,R = qr(X) 6 | 7 | @test collect(Q) == Array(collect(Q)) 8 | @test Array(Q) == Array(CuArray(Q)) 9 | @test Array{Float32}(Q) == Array(CuArray{Float32}(Q)) 10 | @test Matrix(Q) == Array(CuMatrix(Q)) 11 | @test Matrix{Float32}(Q) == Array(CuMatrix{Float32}(Q)) 12 | @test convert(Array, Q) == Array(convert(CuArray, Q)) 13 | @test convert(Array{Float32}, Q) == Array(convert(CuArray{Float32}, Q)) 14 | end 15 | 16 | @testset "normalize!" begin 17 | x = rand(ComplexF32, 10) 18 | dx = CuVector{ComplexF32}(x) 19 | @test isreal(norm(dx, 2)) 20 | @test norm(normalize!(dx)) ≈ 1 21 | end 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | 12 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 13 | 14 | 15 | **Describe the solution you'd like** 16 | 17 | A clear and concise description of what you want to happen. 18 | 19 | 20 | **Describe alternatives you've considered** 21 | 22 | A clear and concise description of any alternative solutions or features you've considered. 
23 | 24 | 25 | **Additional context** 26 | 27 | Add any other context or information about the feature request here. 28 | -------------------------------------------------------------------------------- /test/cutensor/base.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CUTENSOR 2 | using CUDA 3 | using LinearAlgebra 4 | 5 | @test has_cutensor() 6 | @test CUTENSOR.version() isa VersionNumber 7 | 8 | @testset "CuTensor type basics" begin 9 | N = 2 10 | dmax = 2^div(18,N) 11 | dims = rand(2:dmax, N) 12 | p = randperm(N) 13 | indsA = collect(('a':'z')[1:N]) 14 | dimsA = dims 15 | A = rand(Float64, dimsA...) 16 | dA = CuArray(A) 17 | p = randperm(N) 18 | indsA = collect(('a':'z')[1:N]) 19 | ctA = CuTensor(dA, indsA) 20 | @test length(ctA) == length(A) 21 | @test size(ctA) == size(A) 22 | @test size(ctA, 1) == size(A, 1) 23 | @test ndims(ctA) == ndims(A) 24 | @test strides(ctA) == strides(A) 25 | @test eltype(ctA) == eltype(A) 26 | end 27 | -------------------------------------------------------------------------------- /lib/cudnn/base.jl: -------------------------------------------------------------------------------- 1 | function cudnnCreate() 2 | handle_ref = Ref{cudnnHandle_t}() 3 | @check unsafe_cudnnCreate(handle_ref) CUDNN_STATUS_NOT_INITIALIZED CUDNN_STATUS_INTERNAL_ERROR 4 | return handle_ref[] 5 | end 6 | 7 | function cudnnGetProperty(property::CUDA.libraryPropertyType) 8 | value_ref = Ref{Cint}() 9 | cudnnGetProperty(property, value_ref) 10 | value_ref[] 11 | end 12 | 13 | function version() 14 | ver = cudnnGetVersion() 15 | major, ver = divrem(ver, 1000) 16 | minor, patch = divrem(ver, 10) 17 | 18 | VersionNumber(major, minor, patch) 19 | end 20 | 21 | function cuda_version() 22 | ver = cudnnGetCudartVersion() 23 | major, ver = divrem(ver, 1000) 24 | minor, patch = divrem(ver, 10) 25 | 26 | VersionNumber(major, minor, patch) 27 | end 28 | -------------------------------------------------------------------------------- /res/wrap/patches/cudadrv/batched_memop.patch: -------------------------------------------------------------------------------- 1 | --- a/libcuda.jl 2019-10-16 09:15:14.213122392 +0200 2 | +++ b/libcuda.jl 2019-10-16 09:15:29.233281015 +0200 3 | @@ -964,5 +963,0 @@ 4 | -@checked function cuStreamBatchMemOp(stream, count, paramArray, flags) 5 | - initialize_context() 6 | - ccall((:cuStreamBatchMemOp, libcuda), CUresult, (CUstream, UInt32, Ptr{CUstreamBatchMemOpParams}, UInt32), stream, count, paramArray, flags) 7 | -end 8 | - 9 | --- a/libcuda_common.jl 2019-10-16 09:15:14.289789877 +0200 10 | +++ b/libcuda_common.jl 2019-10-16 09:16:50.574087901 +0200 11 | @@ -214,6 +213,0 @@ 12 | -struct CUstreamBatchMemOpParams_union 13 | - waitValue::CUstreamMemOpWaitValueParams_st 14 | -end 15 | - 16 | -const CUstreamBatchMemOpParams = CUstreamBatchMemOpParams_union 17 | - 18 | -------------------------------------------------------------------------------- /src/broadcast.jl: -------------------------------------------------------------------------------- 1 | # broadcasting 2 | 3 | using Base.Broadcast: BroadcastStyle, Broadcasted 4 | 5 | struct CuArrayStyle{N} <: AbstractGPUArrayStyle{N} end 6 | CuArrayStyle(::Val{N}) where N = CuArrayStyle{N}() 7 | CuArrayStyle{M}(::Val{N}) where {N,M} = CuArrayStyle{N}() 8 | 9 | BroadcastStyle(::Type{<:CuArray{T,N}}) where {T,N} = CuArrayStyle{N}() 10 | 11 | Base.similar(bc::Broadcasted{CuArrayStyle{N}}, ::Type{T}) where {N,T} = 12 | similar(CuArray{T}, axes(bc)) 13 | 14 | 
Base.similar(bc::Broadcasted{CuArrayStyle{N}}, ::Type{T}, dims) where {N,T} =
15 |     CuArray{T}(undef, dims)
16 | 
17 | # broadcasting type ctors isn't GPU compatible
18 | Broadcast.broadcasted(::CuArrayStyle{N}, f::Type{T}, args...) where {N, T} =
19 |     Broadcasted{CuArrayStyle{N}}((x...) -> T(x...), args, nothing)
20 | 
--------------------------------------------------------------------------------
/lib/cudadrv/module/function.jl:
--------------------------------------------------------------------------------
1 | # Functions in modules
2 | 
3 | export
4 |     CuFunction
5 | 
6 | 
7 | """
8 |     CuFunction(mod::CuModule, name::String)
9 | 
10 | Acquires a function handle from a named function in a module.
11 | """
12 | struct CuFunction
13 |     handle::CUfunction
14 |     mod::CuModule
15 | 
16 |     "Get a handle to a kernel function in a CUDA module."
17 |     function CuFunction(mod::CuModule, name::String)
18 |         handle_ref = Ref{CUfunction}()
19 |         cuModuleGetFunction(handle_ref, mod, name)
20 |         new(handle_ref[], mod)
21 |     end
22 | end
23 | 
24 | Base.unsafe_convert(::Type{CUfunction}, fun::CuFunction) = fun.handle
25 | 
26 | Base.:(==)(a::CuFunction, b::CuFunction) = a.handle == b.handle
27 | Base.hash(fun::CuFunction, h::UInt) = hash(fun.handle, h)
28 | 
--------------------------------------------------------------------------------
/src/device/intrinsics/misc.jl:
--------------------------------------------------------------------------------
1 | export clock, nanosleep
2 | 
3 | """
4 |     clock(UInt32)
5 | 
6 | Returns the value of a per-multiprocessor counter that is incremented every clock cycle.
7 | """
8 | clock(::Type{UInt32}) = ccall("llvm.nvvm.read.ptx.sreg.clock", llvmcall, UInt32, ())
9 | 
10 | """
11 |     clock(UInt64)
12 | 
13 | Returns the value of a per-multiprocessor counter that is incremented every clock cycle.
14 | """
15 | clock(::Type{UInt64}) = ccall("llvm.nvvm.read.ptx.sreg.clock64", llvmcall, UInt64, ())
16 | 
17 | 
18 | """
19 |     nanosleep(t)
20 | 
21 | Puts a thread to sleep for a given amount of time `t` (in nanoseconds).
22 | 
23 | !!! 
note 24 | Requires CUDA >= 10.0 and sm_6.2 25 | """ 26 | @inline function nanosleep(t::Unsigned) 27 | @asmcall("nanosleep.u32 \$0;", "r", true, 28 | Cvoid, Tuple{UInt32}, convert(UInt32, t)) 29 | end 30 | -------------------------------------------------------------------------------- /lib/cusparse/util.jl: -------------------------------------------------------------------------------- 1 | # utility functions for the CUSPARSE wrappers 2 | 3 | """ 4 | check that the dimensions of matrix `X` and vector `Y` make sense for a multiplication 5 | """ 6 | function chkmvdims(X, n, Y, m) 7 | if length(X) != n 8 | throw(DimensionMismatch("X must have length $n, but has length $(length(X))")) 9 | elseif length(Y) != m 10 | throw(DimensionMismatch("Y must have length $m, but has length $(length(Y))")) 11 | end 12 | end 13 | 14 | """ 15 | check that the dimensions of matrices `X` and `Y` make sense for a multiplication 16 | """ 17 | function chkmmdims( B, C, k, l, m, n ) 18 | if size(B) != (k,l) 19 | throw(DimensionMismatch("B has dimensions $(size(B)) but needs ($k,$l)")) 20 | elseif size(C) != (m,n) 21 | throw(DimensionMismatch("C has dimensions $(size(C)) but needs ($m,$n)")) 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /src/precompile.jl: -------------------------------------------------------------------------------- 1 | 2 | # installation management 3 | precompile(__init_toolkit__, ()) 4 | precompile(libcuda, ()) 5 | 6 | # array 7 | precompile(CuArray, (Vector{Int},)) 8 | 9 | # compilation 10 | precompile(CUDACompilerTarget, (CuDevice,)) 11 | precompile(cufunction_compile, (CompilerJob,)) 12 | precompile(cufunction_link, (CompilerJob,NamedTuple{(:image, :entry, :external_gvars), Tuple{Vector{UInt8}, String, Vector{String}}})) 13 | precompile(cufunction_cache, (CuContext,)) 14 | precompile(create_exceptions!, (CuModule,)) 15 | precompile(run_and_collect, (Cmd,)) 16 | 17 | # launch 18 | precompile(cudaconvert, (Function,)) 19 | precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}})) 20 | precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction)) 21 | -------------------------------------------------------------------------------- /test/apiutils.jl: -------------------------------------------------------------------------------- 1 | using CUDA.APIUtils 2 | 3 | @testset "@enum_without_prefix" begin 4 | mod = @eval module $(gensym()) 5 | using CUDA.APIUtils 6 | @enum MY_ENUM MY_ENUM_VALUE 7 | @enum_without_prefix MY_ENUM MY_ 8 | end 9 | 10 | @test mod.ENUM_VALUE == mod.MY_ENUM_VALUE 11 | end 12 | 13 | @testset "@checked" begin 14 | mod = @eval module $(gensym()) 15 | using CUDA.APIUtils 16 | 17 | const checks = Ref(0) 18 | macro check(ex) 19 | esc(quote 20 | $checks[] += 1 21 | $ex 22 | end) 23 | end 24 | 25 | @checked function foo() 26 | ccall(:jl_getpid, Cint, ()) 27 | end 28 | end 29 | 30 | @test mod.checks[] == 0 31 | @test mod.foo() == getpid() 32 | @test mod.checks[] == 1 33 | @test mod.unsafe_foo() == getpid() 34 | @test mod.checks[] == 1 35 | end 36 | -------------------------------------------------------------------------------- /test/ptx/vadd.ptx: -------------------------------------------------------------------------------- 1 | 2 | .version 3.1 3 | .target sm_20 4 | .address_size 64 5 | 6 | 7 | .visible .entry vadd( 8 | .param .u64 vadd_param_0, 9 | .param .u64 vadd_param_1, 10 | .param .u64 
vadd_param_2 11 | ) 12 | { 13 | .reg .s32 %r<8>; 14 | .reg .f32 %f<4>; 15 | .reg .s64 %rd<11>; 16 | 17 | 18 | ld.param.u64 %rd1, [vadd_param_0]; 19 | ld.param.u64 %rd2, [vadd_param_1]; 20 | ld.param.u64 %rd3, [vadd_param_2]; 21 | cvta.to.global.u64 %rd4, %rd3; 22 | mov.u32 %r1, %ntid.x; 23 | mov.u32 %r2, %ctaid.x; 24 | mov.u32 %r3, %tid.x; 25 | mad.lo.s32 %r4, %r1, %r2, %r3; 26 | cvta.to.global.u64 %rd5, %rd1; 27 | mul.wide.s32 %rd6, %r4, 4; 28 | add.s64 %rd7, %rd5, %rd6; 29 | cvta.to.global.u64 %rd8, %rd2; 30 | add.s64 %rd9, %rd8, %rd6; 31 | ld.global.f32 %f1, [%rd9]; 32 | ld.global.f32 %f2, [%rd7]; 33 | add.f32 %f3, %f2, %f1; 34 | add.s64 %rd10, %rd4, %rd6; 35 | st.global.f32 [%rd10], %f3; 36 | ret; 37 | } 38 | 39 | 40 | -------------------------------------------------------------------------------- /lib/nvml/system.jl: -------------------------------------------------------------------------------- 1 | function version() 2 | buf = Vector{Cchar}(undef, NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE) 3 | nvmlSystemGetNVMLVersion(pointer(buf), length(buf)) 4 | 5 | # the version string is too long for Julia to handle, e.g. 11.450.36.06, 6 | # so split off the driver part into the build suffix 7 | ver = unsafe_string(pointer(buf)) 8 | parts = parse.(Int, split(ver, '.')) 9 | return VersionNumber(parts[1], 0, 0, (), Tuple(parts[2:end])) 10 | end 11 | 12 | function driver_version() 13 | buf = Vector{Cchar}(undef, NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE) 14 | nvmlSystemGetDriverVersion(pointer(buf), length(buf)) 15 | return VersionNumber(unsafe_string(pointer(buf))) 16 | end 17 | 18 | function cuda_driver_version() 19 | ref = Ref{Cint}() 20 | nvmlSystemGetCudaDriverVersion_v2(ref) 21 | major, ver = divrem(ref[], 1000) 22 | minor, patch = divrem(ver, 10) 23 | return VersionNumber(major, minor, patch) 24 | end 25 | -------------------------------------------------------------------------------- /lib/cusparse/helpers.jl: -------------------------------------------------------------------------------- 1 | # cuSPARSE helper functions 2 | 3 | 4 | ## matrix descriptor 5 | 6 | mutable struct CuMatrixDescriptor 7 | handle::cusparseMatDescr_t 8 | 9 | function CuMatrixDescriptor() 10 | descr_ref = Ref{cusparseMatDescr_t}() 11 | cusparseCreateMatDescr(descr_ref) 12 | obj = new(descr_ref[]) 13 | finalizer(cusparseDestroyMatDescr, obj) 14 | obj 15 | end 16 | end 17 | 18 | Base.unsafe_convert(::Type{cusparseMatDescr_t}, desc::CuMatrixDescriptor) = desc.handle 19 | 20 | function CuMatrixDescriptor(MatrixType::Char, FillMode::Char, DiagType::Char, IndexBase::Char) 21 | desc = CuMatrixDescriptor() 22 | if MatrixType != 'G' 23 | cusparseSetMatType(desc, MatrixType) 24 | end 25 | cusparseSetMatFillMode(desc, FillMode) 26 | cusparseSetMatDiagType(desc, DiagType) 27 | if IndexBase != 'Z' 28 | cusparseSetMatIndexBase(desc, IndexBase) 29 | end 30 | return desc 31 | end 32 | -------------------------------------------------------------------------------- /lib/utils/enum.jl: -------------------------------------------------------------------------------- 1 | export @enum_without_prefix 2 | 3 | 4 | ## redeclare enum values without a prefix 5 | 6 | # this is useful when enum values from an underlying C library, typically prefixed for the 7 | # lack of namespacing in C, are to be used in Julia where we do have module namespacing. 
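# For example, mirroring the usage exercised in test/apiutils.jl:
#
#     @enum MY_ENUM MY_ENUM_VALUE
#     @enum_without_prefix MY_ENUM MY_
#
# after which `ENUM_VALUE` is defined as a constant equal to `MY_ENUM_VALUE`,
# so callers can drop the C-style prefix.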
8 | macro enum_without_prefix(enum, prefix) 9 | if isa(enum, Symbol) 10 | mod = __module__ 11 | elseif Meta.isexpr(enum, :(.)) 12 | mod = getfield(__module__, enum.args[1]) 13 | enum = enum.args[2].value 14 | else 15 | error("Do not know how to refer to $enum") 16 | end 17 | enum = getfield(mod, enum) 18 | prefix = String(prefix) 19 | 20 | ex = quote end 21 | for instance in instances(enum) 22 | name = String(Symbol(instance)) 23 | @assert startswith(name, prefix) 24 | push!(ex.args, :(const $(Symbol(name[length(prefix)+1:end])) = $(mod).$(Symbol(name)))) 25 | end 26 | 27 | return esc(ex) 28 | end 29 | -------------------------------------------------------------------------------- /test/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" 3 | BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" 4 | DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" 5 | Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" 6 | Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" 7 | FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" 8 | GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" 9 | Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" 10 | LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" 11 | NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" 12 | Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" 13 | REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" 14 | Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" 15 | SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" 16 | SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" 17 | Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" 18 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 19 | TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" 20 | -------------------------------------------------------------------------------- /examples/wmma/high-level.jl: -------------------------------------------------------------------------------- 1 | using CUDA 2 | if capability(device()) < v"7.0" 3 | exit() 4 | end 5 | 6 | ### START 7 | using Test 8 | 9 | using CUDA 10 | 11 | a = rand(Float16, (16, 16)) 12 | b = rand(Float16, (16, 16)) 13 | c = rand(Float32, (16, 16)) 14 | 15 | a_dev = CuArray(a) 16 | b_dev = CuArray(b) 17 | c_dev = CuArray(c) 18 | d_dev = similar(c_dev) 19 | 20 | function kernel(a_dev, b_dev, c_dev, d_dev) 21 | conf = WMMA.Config{16, 16, 16, Float32} 22 | 23 | a_frag = WMMA.load_a(pointer(a_dev), 16, WMMA.ColMajor, conf) 24 | b_frag = WMMA.load_b(pointer(b_dev), 16, WMMA.ColMajor, conf) 25 | c_frag = WMMA.load_c(pointer(c_dev), 16, WMMA.ColMajor, conf) 26 | 27 | c_frag = 0.5f0 .* c_frag 28 | 29 | d_frag = WMMA.mma(a_frag, b_frag, c_frag, conf) 30 | 31 | WMMA.store_d(pointer(d_dev), d_frag, 16, WMMA.ColMajor, conf) 32 | 33 | return 34 | end 35 | 36 | @cuda threads=32 kernel(a_dev, b_dev, c_dev, d_dev) 37 | d = Array(d_dev) 38 | 39 | @test all(isapprox.(a * b + 0.5 * c, d; rtol=0.01)) 40 | ### END 41 | -------------------------------------------------------------------------------- /res/wrap/patches/cublas/computetype.patch: -------------------------------------------------------------------------------- 1 | --- a/libcublas.jl 2 | +++ b/libcublas.jl 3 | @@ -1414,5 +1414,5 @@ end 4 | @checked function cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo) 5 | initialize_api() 6 | - ccall((:cublasGemmEx, libcublas), cublasStatus_t, (cublasHandle_t, cublasOperation_t, cublasOperation_t, Cint, 
Cint, Cint, Ptr{Cvoid}, Ptr{Cvoid}, cudaDataType, Cint, Ptr{Cvoid}, cudaDataType, Cint, Ptr{Cvoid}, Ptr{Cvoid}, cudaDataType, Cint, cublasComputeType_t, cublasGemmAlgo_t), handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo) 7 | + ccall((:cublasGemmEx, libcublas), cublasStatus_t, (cublasHandle_t, cublasOperation_t, cublasOperation_t, Cint, Cint, Cint, Ptr{Cvoid}, Ptr{Cvoid}, cudaDataType, Cint, Ptr{Cvoid}, cudaDataType, Cint, Ptr{Cvoid}, Ptr{Cvoid}, cudaDataType, Cint, UInt32, cublasGemmAlgo_t), handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo) 8 | end 9 | -------------------------------------------------------------------------------- /CITATION.bib: -------------------------------------------------------------------------------- 1 | % primary paper, detailing the GPU compiler and relevant aspects 2 | @article{besard2018juliagpu, 3 | author = {Besard, Tim and Foket, Christophe and De Sutter, Bjorn}, 4 | title = {Effective Extensible Programming: Unleashing {Julia} on {GPUs}}, 5 | journal = {IEEE Transactions on Parallel and Distributed Systems}, 6 | year = {2018}, 7 | doi = {10.1109/TPDS.2018.2872064}, 8 | ISSN = {1045-9219}, 9 | archivePrefix = {arXiv}, 10 | eprint = {1712.03112}, 11 | primaryClass = {cs.PL}, 12 | } 13 | 14 | % specific paper on array programming for heterogeneous systems 15 | @article{besard2019prototyping, 16 | title = {Rapid software prototyping for heterogeneous and distributed platforms}, 17 | author = {Besard, Tim and Churavy, Valentin and Edelman, Alan and De Sutter, Bjorn}, 18 | journal = {Advances in Engineering Software}, 19 | volume = {132}, 20 | pages = {29--46}, 21 | year = {2019}, 22 | publisher = {Elsevier} 23 | } 24 | -------------------------------------------------------------------------------- /lib/cusolver/base.jl: -------------------------------------------------------------------------------- 1 | # wrappers of low-level functionality 2 | 3 | function cusolverGetProperty(property::libraryPropertyType) 4 | value_ref = Ref{Cint}() 5 | cusolverGetProperty(property, value_ref) 6 | value_ref[] 7 | end 8 | 9 | version() = VersionNumber(cusolverGetProperty(CUDA.MAJOR_VERSION), 10 | cusolverGetProperty(CUDA.MINOR_VERSION), 11 | cusolverGetProperty(CUDA.PATCH_LEVEL)) 12 | 13 | function Base.convert(::Type{cusolverEigType_t}, typ::Int) 14 | if typ == 1 15 | CUSOLVER_EIG_TYPE_1 16 | elseif typ == 2 17 | CUSOLVER_EIG_TYPE_2 18 | elseif typ == 3 19 | CUSOLVER_EIG_TYPE_3 20 | else 21 | throw(ArgumentError("Unknown eigenvalue solver type $typ.")) 22 | end 23 | end 24 | 25 | function Base.convert(::Type{cusolverEigMode_t}, jobz::Char) 26 | if jobz == 'N' 27 | CUSOLVER_EIG_MODE_NOVECTOR 28 | elseif jobz == 'V' 29 | CUSOLVER_EIG_MODE_VECTOR 30 | else 31 | throw(ArgumentError("Unknown eigenvalue solver mode $jobz.")) 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /lib/cudnn/error.jl: -------------------------------------------------------------------------------- 1 | export CUDNNError 2 | 3 | struct CUDNNError <: Exception 4 | code::cudnnStatus_t 5 | end 6 | 7 | Base.convert(::Type{cudnnStatus_t}, err::CUDNNError) = err.code 8 | 9 | Base.showerror(io::IO, err::CUDNNError) = 10 | print(io, "CUDNNError: ", name(err), " (code $(reinterpret(Int32, err.code)))") 11 | 12 | name(err::CUDNNError) = unsafe_string(cudnnGetErrorString(err)) 13 | 14 | 15 | ## API call wrapper 16 | 17 | # outlined functionality 
to avoid GC frame allocation 18 | @noinline function throw_api_error(res) 19 | if res == CUDNN_STATUS_ALLOC_FAILED 20 | throw(OutOfGPUMemoryError()) 21 | else 22 | throw(CUDNNError(res)) 23 | end 24 | end 25 | 26 | macro check(ex, errs...) 27 | check = :(isequal(err, CUDNN_STATUS_ALLOC_FAILED)) 28 | for err in errs 29 | check = :($check || isequal(err, $(esc(err)))) 30 | end 31 | 32 | quote 33 | res = @retry_reclaim err->$check $(esc(ex)) 34 | if res != CUDNN_STATUS_SUCCESS 35 | throw_api_error(res) 36 | end 37 | 38 | nothing 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /test/cudnn/tensor.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CUDNN: 2 | cudnnTensorDescriptor, 3 | cudnnCreateTensorDescriptor, 4 | cudnnFilterDescriptor, 5 | cudnnDataType, 6 | cudnnDataType_t, 7 | CUDNN_TENSOR_NCHW, 8 | CUDNN_STATUS_SUCCESS, 9 | @retry_reclaim 10 | 11 | @testset "cudnn/tensor" begin 12 | x = CUDA.rand(1,1,1,2) 13 | 14 | TD = cudnnTensorDescriptor 15 | FD = cudnnFilterDescriptor 16 | DT = cudnnDataType 17 | 18 | @test TD(x) isa TD 19 | @test TD(CUDNN_TENSOR_NCHW, DT(eltype(x)), Cint(ndims(x)), Cint[reverse(size(x))...]) isa TD 20 | td = TD(x) 21 | @test TD(td.ptr) isa TD 22 | @test Base.unsafe_convert(Ptr, TD(td.ptr)) isa Ptr 23 | 24 | @test FD(x) isa FD 25 | @test FD(DT(eltype(x)),CUDNN_TENSOR_NCHW,Cint(ndims(x)),Cint[reverse(size(x))...]) isa FD 26 | fd = FD(x) 27 | @test FD(fd.ptr) isa FD 28 | @test Base.unsafe_convert(Ptr, FD(fd.ptr)) isa Ptr 29 | 30 | @test DT(Float32) isa cudnnDataType_t 31 | 32 | @test (@retry_reclaim(x->(x!==CUDNN_STATUS_SUCCESS),cudnnCreateTensorDescriptor(Ref{Ptr{Cvoid}}(C_NULL)))) isa Nothing 33 | end 34 | -------------------------------------------------------------------------------- /test/examples.jl: -------------------------------------------------------------------------------- 1 | # NVIDIA bug 3263616: compute-sanitizer crashes when generating host backtraces, 2 | # but --show-backtrace=no does not survive execve. 3 | @not_if_sanitize begin 4 | 5 | function find_sources(path::String, sources=String[]) 6 | if isdir(path) 7 | for entry in readdir(path) 8 | find_sources(joinpath(path, entry), sources) 9 | end 10 | elseif endswith(path, ".jl") 11 | push!(sources, path) 12 | end 13 | sources 14 | end 15 | 16 | examples_dir = joinpath(@__DIR__, "..", "examples") 17 | examples = find_sources(examples_dir) 18 | filter!(file -> readline(file) != "# EXCLUDE FROM TESTING", examples) 19 | 20 | cd(examples_dir) do 21 | global examples 22 | examples = relpath.(examples, Ref(examples_dir)) 23 | @testset for example in examples 24 | cmd = Base.julia_cmd() 25 | if Base.JLOptions().project != C_NULL 26 | cmd = `$cmd --project=$(unsafe_string(Base.JLOptions().project))` 27 | end 28 | 29 | @test success(pipeline(`$cmd $example`, stderr=stderr)) 30 | end 31 | end 32 | 33 | end 34 | -------------------------------------------------------------------------------- /src/iterator.jl: -------------------------------------------------------------------------------- 1 | export CuIterator 2 | 3 | """ 4 | CuIterator(batches) 5 | 6 | Return a `CuIterator` that can iterate through the provided `batches` via `Base.iterate`. 7 | 8 | Upon each iteration, the current `batch` is adapted to the GPU (via `map(x -> adapt(CuArray, x), batch)`) 9 | and the previous iteration is marked as freeable from GPU memory (via `unsafe_free!`). 
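A minimal usage sketch (the data below is hypothetical; any iterable of tuples of arrays works):

```julia
batches = [(rand(Float32, 4, 8), rand(Float32, 8)) for _ in 1:3]
for (x, y) in CuIterator(batches)
    # x and y are CuArrays here; the arrays of the previous batch have already
    # been marked as freeable via `unsafe_free!`
end
```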
10 | 11 | This abstraction is useful for batching data into GPU memory in a manner that 12 | allows old iterations to potentially be freed (or marked as reusable) earlier 13 | than they otherwise would via CuArray's internal polling mechanism. 14 | """ 15 | mutable struct CuIterator{B} 16 | batches::B 17 | previous::Any 18 | CuIterator(batches) = new{typeof(batches)}(batches) 19 | end 20 | 21 | function Base.iterate(c::CuIterator, state...) 22 | item = iterate(c.batches, state...) 23 | isdefined(c, :previous) && foreach(unsafe_free!, c.previous) 24 | item === nothing && return nothing 25 | batch, next_state = item 26 | cubatch = map(x -> adapt(CuArray, x), batch) 27 | c.previous = cubatch 28 | return cubatch, next_state 29 | end 30 | -------------------------------------------------------------------------------- /lib/cutensor/tensor.jl: -------------------------------------------------------------------------------- 1 | export CuTensor 2 | 3 | mutable struct CuTensor{T, N} 4 | data::DenseCuArray{T, N} 5 | inds::Vector{Char} 6 | function CuTensor{T, N}(data::DenseCuArray{T, N}, inds::Vector{Char}) where {T<:Number, N} 7 | new(data, inds) 8 | end 9 | function CuTensor{T, N}(data::DenseCuArray{N, T}, inds::Vector{<:AbstractChar}) where {T<:Number, N} 10 | new(data, Char.(inds)) 11 | end 12 | end 13 | 14 | CuTensor(data::DenseCuArray{T, N}, inds::Vector{<:AbstractChar}) where {T<:Number, N} = 15 | CuTensor{T, N}(data, convert(Vector{Char}, inds)) 16 | 17 | CuTensor(data::DenseCuArray{T, N}, inds::Vector{Char}) where {T<:Number, N} = 18 | CuTensor{T, N}(data, inds) 19 | 20 | Base.size(T::CuTensor) = size(T.data) 21 | Base.size(T::CuTensor, i) = size(T.data, i) 22 | Base.length(T::CuTensor) = length(T.data) 23 | Base.ndims(T::CuTensor) = length(T.inds) 24 | Base.strides(T::CuTensor) = strides(T.data) 25 | Base.eltype(T::CuTensor) = eltype(T.data) 26 | Base.similar(T::CuTensor{Tv, N}) where {Tv, N} = CuTensor{Tv, N}(similar(T.data), copy(T.inds)) 27 | Base.collect(T::CuTensor) = (collect(T.data), T.inds) 28 | -------------------------------------------------------------------------------- /test/pool.jl: -------------------------------------------------------------------------------- 1 | CUDA.alloc(0) 2 | 3 | @test_throws OutOfGPUMemoryError CuArray{Int}(undef, 10^20) 4 | 5 | @testset "@allocated" begin 6 | @test (CUDA.@allocated CuArray{Int32}(undef,1)) == 4 7 | end 8 | 9 | @testset "@timed" begin 10 | out = CUDA.@timed CuArray{Int32}(undef, 1) 11 | @test isa(out.value, CuArray{Int32}) 12 | @test out.gpu_bytes > 0 13 | end 14 | 15 | @testset "@time" begin 16 | ret, out = @grab_output CUDA.@time CuArray{Int32}(undef, 1) 17 | @test isa(ret, CuArray{Int32}) 18 | @test occursin("1 GPU allocation: 4 bytes", out) 19 | 20 | x = CuArray{Int32}(undef, 6) 21 | ret, out = @grab_output CUDA.@time Base.unsafe_wrap(CuArray, pointer(x), (2, 3)) 22 | @test isa(ret, CuArray{Int32}) 23 | @test !occursin("GPU allocation", out) 24 | end 25 | 26 | @testset "reclaim" begin 27 | CUDA.reclaim(1024) 28 | CUDA.reclaim() 29 | 30 | @test CUDA.@retry_reclaim(isequal(42), 42) == 42 31 | @test CUDA.@retry_reclaim(isequal(42), 41) == 41 32 | end 33 | 34 | @testset "memory_status" begin 35 | CUDA.memory_status(devnull) 36 | CUDA.used_memory() 37 | CUDA.cached_memory() 38 | end 39 | -------------------------------------------------------------------------------- /examples/wmma/low-level.jl: -------------------------------------------------------------------------------- 1 | using CUDA 2 | if capability(device()) < v"7.0" 3 | 
exit() 4 | end 5 | 6 | ### START 7 | using Test 8 | 9 | using CUDA 10 | 11 | # Generate input matrices 12 | a = rand(Float16, (16, 16)) 13 | a_dev = CuArray(a) 14 | b = rand(Float16, (16, 16)) 15 | b_dev = CuArray(b) 16 | c = rand(Float32, (16, 16)) 17 | c_dev = CuArray(c) 18 | 19 | # Allocate space for result 20 | d_dev = similar(c_dev) 21 | 22 | # Matrix multiply-accumulate kernel (D = A * B + C) 23 | function kernel(a_dev, b_dev, c_dev, d_dev) 24 | a_frag = WMMA.llvm_wmma_load_a_col_m16n16k16_global_stride_f16(pointer(a_dev), 16) 25 | b_frag = WMMA.llvm_wmma_load_b_col_m16n16k16_global_stride_f16(pointer(b_dev), 16) 26 | c_frag = WMMA.llvm_wmma_load_c_col_m16n16k16_global_stride_f32(pointer(c_dev), 16) 27 | 28 | d_frag = WMMA.llvm_wmma_mma_col_col_m16n16k16_f32_f32(a_frag, b_frag, c_frag) 29 | 30 | WMMA.llvm_wmma_store_d_col_m16n16k16_global_stride_f32(pointer(d_dev), d_frag, 16) 31 | return 32 | end 33 | 34 | @cuda threads=32 kernel(a_dev, b_dev, c_dev, d_dev) 35 | @test all(isapprox.(a * b + c, Array(d_dev); rtol=0.01)) 36 | ### END 37 | -------------------------------------------------------------------------------- /.github/workflows/CompatHelper.yml: -------------------------------------------------------------------------------- 1 | name: CompatHelper 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' 6 | workflow_dispatch: 7 | 8 | jobs: 9 | CompatHelper: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Get Julia compatibility 14 | id: julia_compat 15 | # NOTE: this requires a Julia compat lower-bound with minor version! 16 | run : | 17 | version=$(grep '^julia = ' Project.toml | grep -o '".*"' | cut -d '"' -f2) 18 | echo "::set-output name=version::$version" 19 | - uses: julia-actions/setup-julia@v1 20 | with: 21 | version: ${{ steps.julia_compat.outputs.version }} 22 | - name: Install CompatHelper 23 | run: | 24 | import Pkg 25 | name = "CompatHelper" 26 | version = "2" 27 | Pkg.add(; name, version) 28 | shell: julia --color=yes {0} 29 | - name: Run CompatHelper 30 | run: | 31 | using CompatHelper 32 | CompatHelper.main() 33 | shell: julia --color=yes {0} 34 | env: 35 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 36 | -------------------------------------------------------------------------------- /docs/src/api/compiler.md: -------------------------------------------------------------------------------- 1 | # Compiler 2 | 3 | ## Execution 4 | 5 | The main entry-point to the compiler is the `@cuda` macro: 6 | 7 | ```@docs 8 | @cuda 9 | ``` 10 | 11 | If needed, you can use a lower-level API that lets you inspect the compiler kernel: 12 | 13 | ```@docs 14 | cudaconvert 15 | cufunction 16 | CUDA.HostKernel 17 | CUDA.version 18 | CUDA.maxthreads 19 | CUDA.registers 20 | CUDA.memory 21 | ``` 22 | 23 | 24 | ## Reflection 25 | 26 | If you want to inspect generated code, you can use macros that resemble functionality from 27 | the InteractiveUtils standard library: 28 | 29 | ``` 30 | @device_code_lowered 31 | @device_code_typed 32 | @device_code_warntype 33 | @device_code_llvm 34 | @device_code_ptx 35 | @device_code_sass 36 | @device_code 37 | ``` 38 | 39 | These macros are also available in function-form: 40 | 41 | ``` 42 | CUDA.code_typed 43 | CUDA.code_warntype 44 | CUDA.code_llvm 45 | CUDA.code_ptx 46 | CUDA.code_sass 47 | ``` 48 | 49 | For more information, please consult the GPUCompiler.jl documentation. 
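For example (a quick sketch with a hypothetical kernel; any `@cuda` invocation can be wrapped this way):

```julia
function vadd(c, a, b)
    i = threadIdx().x
    @inbounds c[i] = a[i] + b[i]
    return
end

a, b = CUDA.ones(4), CUDA.ones(4)
c = similar(a)

@device_code_ptx @cuda threads=4 vadd(c, a, b)       # print the generated PTX
@device_code_warntype @cuda threads=4 vadd(c, a, b)  # check for type instabilities
```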
Only the `code_sass` 50 | functionality is actually defined in CUDA.jl: 51 | 52 | ```@docs 53 | @device_code_sass 54 | CUDA.code_sass 55 | ``` 56 | -------------------------------------------------------------------------------- /perf/cudadevrt.jl: -------------------------------------------------------------------------------- 1 | module cudadevrt 2 | 3 | using CUDA, BenchmarkTools, Random 4 | 5 | const threads = 256 6 | #simple add matrix and vector kernel 7 | function kernel_add_mat_vec(m, x1, x2, y) 8 | # one block per column 9 | offset = (blockIdx().x-1) * m 10 | @inbounds xtmp = x2[blockIdx().x] 11 | for i = threadIdx().x : blockDim().x : m 12 | @inbounds y[offset + i] = x1[offset + i] + xtmp 13 | end 14 | return 15 | end 16 | 17 | function add!(y, x1, x2) 18 | m, n = size(x1) 19 | @cuda blocks = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y) 20 | end 21 | 22 | function main() 23 | Random.seed!(1) 24 | m, n = 3072, 1536 # 256 multiplier 25 | x1 = cu(randn(Float32, (m, n)) .+ Float32(0.5)) 26 | x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5)) 27 | y1 = similar(x1) 28 | 29 | results = @benchmark CUDA.@sync add!($y1, $x1, $x2) 30 | 31 | # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them 32 | CUDA.unsafe_free!(x1) 33 | CUDA.unsafe_free!(x2) 34 | CUDA.unsafe_free!(y1) 35 | 36 | return results 37 | end 38 | 39 | end 40 | 41 | cudadevrt.main() 42 | 43 | -------------------------------------------------------------------------------- /test/cusparse/conversions.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CUSPARSE, SparseArrays 2 | 3 | @testset "sparse" begin 4 | n, m = 4, 4 5 | I = [1,2,3] |> cu 6 | J = [2,3,4] |> cu 7 | V = Float32[1,2,3] |> cu 8 | 9 | dense = rand(3,3) |> cu 10 | 11 | # check defaults 12 | @test sparse(I, J, V) isa CuSparseMatrixCSC 13 | @test sparse(dense) isa CuSparseMatrixCSC 14 | 15 | for (fmt, T) in [(:coo, CuSparseMatrixCOO), 16 | (:csc, CuSparseMatrixCSC), 17 | (:csr, CuSparseMatrixCSR), 18 | (:bsr, CuSparseMatrixBSR) 19 | ] 20 | if fmt != :bsr # bsr not supported 21 | x = sparse(I, J, V; fmt=fmt) 22 | @test x isa T{Float32} 23 | @test size(x) == (3, 4) 24 | 25 | x = sparse(I, J, V, m, n; fmt=fmt) 26 | @test x isa T{Float32} 27 | @test size(x) == (4, 4) 28 | end 29 | 30 | if fmt != :coo # dense to COO not implemented 31 | x = sparse(dense; fmt=fmt) 32 | @test x isa T{Float32} 33 | @test collect(x) == collect(dense) 34 | end 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /test/utils.jl: -------------------------------------------------------------------------------- 1 | @testset "test utilities" begin 2 | mutable struct NoThrowTestSet <: Test.AbstractTestSet 3 | results::Vector 4 | NoThrowTestSet(desc) = new([]) 5 | end 6 | Test.record(ts::NoThrowTestSet, t::Test.Result) = (push!(ts.results, t); t) 7 | Test.finish(ts::NoThrowTestSet) = ts.results 8 | fails = @testset NoThrowTestSet begin 9 | # OK 10 | @test_throws_cuerror CUDA.ERROR_UNKNOWN throw(CuError(CUDA.ERROR_UNKNOWN)) 11 | # Fail, wrong CuError 12 | @test_throws_cuerror CUDA.ERROR_UNKNOWN throw(CuError(CUDA.ERROR_INVALID_VALUE)) 13 | # Fail, wrong Exception 14 | @test_throws_cuerror CUDA.ERROR_UNKNOWN error() 15 | end 16 | @test isa(fails[1], Test.Pass) 17 | @test isa(fails[2], Test.Fail) 18 | @test isa(fails[3], Test.Fail) 19 | end 20 | 21 | @testset "@sync" begin 22 | t = Base.@elapsed ret = CUDA.@sync begin 23 | # TODO: do something that takes a while on 
the GPU 24 | # (need to wrap clock64 for that) 25 | 42 26 | end 27 | @test t >= 0 28 | @test ret == 42 29 | end 30 | 31 | @testset "versioninfo" begin 32 | CUDA.versioninfo(devnull) 33 | end 34 | -------------------------------------------------------------------------------- /examples/driver/vadd.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-19856038 5 | // Cuda compilation tools, release 7.5, V7.5.17 6 | // Based on LLVM 3.4svn 7 | // 8 | 9 | .version 4.3 10 | .target sm_20 11 | .address_size 64 12 | 13 | // .globl kernel_vadd 14 | 15 | .visible .entry kernel_vadd( 16 | .param .u64 kernel_vadd_param_0, 17 | .param .u64 kernel_vadd_param_1, 18 | .param .u64 kernel_vadd_param_2 19 | ) 20 | { 21 | .reg .f32 %f<4>; 22 | .reg .b32 %r<5>; 23 | .reg .b64 %rd<11>; 24 | 25 | 26 | ld.param.u64 %rd1, [kernel_vadd_param_0]; 27 | ld.param.u64 %rd2, [kernel_vadd_param_1]; 28 | ld.param.u64 %rd3, [kernel_vadd_param_2]; 29 | cvta.to.global.u64 %rd4, %rd3; 30 | cvta.to.global.u64 %rd5, %rd2; 31 | cvta.to.global.u64 %rd6, %rd1; 32 | mov.u32 %r1, %ctaid.x; 33 | mov.u32 %r2, %ntid.x; 34 | mov.u32 %r3, %tid.x; 35 | mad.lo.s32 %r4, %r2, %r1, %r3; 36 | mul.wide.s32 %rd7, %r4, 4; 37 | add.s64 %rd8, %rd6, %rd7; 38 | ld.global.f32 %f1, [%rd8]; 39 | add.s64 %rd9, %rd5, %rd7; 40 | ld.global.f32 %f2, [%rd9]; 41 | add.f32 %f3, %f1, %f2; 42 | add.s64 %rd10, %rd4, %rd7; 43 | st.global.f32 [%rd10], %f3; 44 | ret; 45 | } 46 | 47 | 48 | -------------------------------------------------------------------------------- /lib/cudadrv/types.jl: -------------------------------------------------------------------------------- 1 | export CuDim3, CuDim 2 | 3 | """ 4 | CuDim3(x) 5 | 6 | CuDim3((x,)) 7 | CuDim3((x, y)) 8 | CuDim3((x, y, x)) 9 | 10 | A type used to specify dimensions, consisting of 3 integers for respectively the `x`, `y` 11 | and `z` dimension. Unspecified dimensions default to `1`. 12 | 13 | Often accepted as argument through the `CuDim` type alias, eg. in the case of 14 | [`cudacall`](@ref) or [`CUDA.launch`](@ref), allowing to pass dimensions as a plain integer 15 | or a tuple without having to construct an explicit `CuDim3` object. 16 | """ 17 | struct CuDim3 18 | x::Cuint 19 | y::Cuint 20 | z::Cuint 21 | end 22 | 23 | CuDim3(dims::Integer) = CuDim3(dims, Cuint(1), Cuint(1)) 24 | CuDim3(dims::NTuple{1,<:Integer}) = CuDim3(dims[1], Cuint(1), Cuint(1)) 25 | CuDim3(dims::NTuple{2,<:Integer}) = CuDim3(dims[1], dims[2], Cuint(1)) 26 | CuDim3(dims::NTuple{3,<:Integer}) = CuDim3(dims[1], dims[2], dims[3]) 27 | 28 | # Type alias for conveniently specifying the dimensions 29 | # (e.g. 
`(len, 2)` instead of `CuDim3((len, 2))`) 30 | const CuDim = Union{Integer, 31 | Tuple{Integer}, 32 | Tuple{Integer, Integer}, 33 | Tuple{Integer, Integer, Integer}} 34 | -------------------------------------------------------------------------------- /lib/cusparse/error.jl: -------------------------------------------------------------------------------- 1 | export CUSPARSEError 2 | 3 | struct CUSPARSEError <: Exception 4 | code::cusparseStatus_t 5 | end 6 | 7 | Base.convert(::Type{cusparseStatus_t}, err::CUSPARSEError) = err.code 8 | 9 | Base.showerror(io::IO, err::CUSPARSEError) = 10 | print(io, "CUSPARSEError: ", description(err), " (code $(reinterpret(Int32, err.code)), $(name(err)))") 11 | 12 | name(err::CUSPARSEError) = unsafe_string(cusparseGetErrorName(err)) 13 | 14 | description(err::CUSPARSEError) = unsafe_string(cusparseGetErrorString(err)) 15 | 16 | 17 | ## API call wrapper 18 | 19 | # outlined functionality to avoid GC frame allocation 20 | @noinline function throw_api_error(res) 21 | if res == CUSPARSE_STATUS_ALLOC_FAILED 22 | throw(OutOfGPUMemoryError()) 23 | else 24 | throw(CUSPARSEError(res)) 25 | end 26 | end 27 | 28 | macro check(ex, errs...) 29 | check = :(isequal(err, CUSPARSE_STATUS_ALLOC_FAILED)) 30 | for err in errs 31 | check = :($check || isequal(err, $(esc(err)))) 32 | end 33 | 34 | quote 35 | res = @retry_reclaim err->$check $(esc(ex)) 36 | if res != CUSPARSE_STATUS_SUCCESS 37 | throw_api_error(res) 38 | end 39 | 40 | nothing 41 | end 42 | end 43 | -------------------------------------------------------------------------------- /test/cudnn/dropout.jl: -------------------------------------------------------------------------------- 1 | using Statistics 2 | using CUDA.CUDNN: 3 | cudnnDropoutForward, 4 | cudnnDropoutForward!, 5 | cudnnDropoutBackward, 6 | cudnnDropoutSeed, 7 | cudnnDropoutDescriptor, 8 | cudnnDropoutDescriptor_t, 9 | cudnnCreateDropoutDescriptor, 10 | cudnnSetDropoutDescriptor, 11 | cudnnGetDropoutDescriptor, 12 | cudnnRestoreDropoutDescriptor, 13 | cudnnDestroyDropoutDescriptor, 14 | cudnnDropoutGetStatesSize, 15 | cudnnDropoutGetReserveSpaceSize 16 | 17 | @testset "cudnn/dropout" begin 18 | @test cudnnDropoutDescriptor(C_NULL) isa cudnnDropoutDescriptor 19 | @test Base.unsafe_convert(Ptr, cudnnDropoutDescriptor(C_NULL)) isa Ptr 20 | @test cudnnDropoutDescriptor(0.5) isa cudnnDropoutDescriptor 21 | 22 | N,P = 1000, 0.7 23 | x = CUDA.rand(N) 24 | d = cudnnDropoutDescriptor(P) 25 | cudnnDropoutSeed[] = 1 26 | y = cudnnDropoutForward(x; dropout = P) |> Array 27 | @test isapprox(mean(y.==0), P; atol = 3/sqrt(N)) 28 | @test y == cudnnDropoutForward(x, d) |> Array 29 | @test y == cudnnDropoutForward!(similar(x), x; dropout = P) |> Array 30 | @test y == cudnnDropoutForward!(similar(x), x, d) |> Array 31 | cudnnDropoutSeed[] = -1 32 | end 33 | -------------------------------------------------------------------------------- /test/cudnn/inplace.jl: -------------------------------------------------------------------------------- 1 | import CUDA.CUDNN: 2 | cudnnSetTensor!, 3 | cudnnScaleTensor!, 4 | cudnnScaleTensor, 5 | cudnnAddTensor!, 6 | cudnnAddTensor, 7 | CUDNN_TENSOR_NHWC 8 | 9 | @testset "cudnn/inplace" begin 10 | x = CUDA.rand(10) 11 | cudnnSetTensor!(x, 7) 12 | @test all(isequal(7), Array(x)) 13 | ax = rand(10) 14 | cx = CuArray(ax) 15 | @test 7*ax ≈ cudnnScaleTensor(cx, 7) |> Array 16 | @test 7*ax ≈ cudnnScaleTensor!(similar(cx), cx, 7) |> Array 17 | ax,ab = rand(5,4,3,2),rand(1,1,3,1) 18 | cx,cb = CuArray.((ax,ab)) 19 | @test ax .+ ab ≈ 
cudnnAddTensor(cx, cb) |> Array 20 | @test ax .+ 7*ab ≈ cudnnAddTensor(cx, cb, alpha=7) |> Array 21 | @test 7*ax .+ ab ≈ cudnnAddTensor(cx, cb, beta=7) |> Array 22 | @test ax .+ ab ≈ cudnnAddTensor!(similar(cx), cx, cb) |> Array 23 | @test ax .+ 7*ab ≈ cudnnAddTensor!(similar(cx), cx, cb, alpha=7) |> Array 24 | @test 7*ax .+ ab ≈ cudnnAddTensor!(similar(cx), cx, cb, beta=7) |> Array 25 | @test ax .+ ab ≈ cudnnAddTensor!(cx, cx, cb) |> Array 26 | @test ax .+ ab ≈ cx |> Array 27 | ax,ab = rand(3,5,4,2),rand(3,1,1,1) 28 | cx,cb = CuArray.((ax,ab)) 29 | @test ax .+ ab ≈ cudnnAddTensor(cx, cb, format=CUDNN_TENSOR_NHWC) |> Array 30 | end 31 | -------------------------------------------------------------------------------- /lib/nvml/error.jl: -------------------------------------------------------------------------------- 1 | export NVMLError 2 | 3 | struct NVMLError <: Exception 4 | code::nvmlReturn_t 5 | end 6 | 7 | Base.convert(::Type{nvmlReturn_t}, err::NVMLError) = err.code 8 | 9 | Base.showerror(io::IO, err::NVMLError) = 10 | print(io, "NVMLError: ", description(err), " (code $(reinterpret(Int32, err.code)))") 11 | 12 | description(err::NVMLError) = unsafe_string(nvmlErrorString(err)) 13 | 14 | @enum_without_prefix nvmlReturn_enum NVML_ 15 | 16 | 17 | ## API call wrapper 18 | 19 | # outlined functionality to avoid GC frame allocation 20 | @noinline function throw_api_error(res) 21 | throw(NVMLError(res)) 22 | end 23 | 24 | const initialized = Ref(false) 25 | function initialize_context() 26 | if !initialized[] 27 | res = unsafe_nvmlInitWithFlags(0) 28 | if res !== NVML_SUCCESS 29 | # NOTE: we can't call nvmlErrorString during initialization 30 | error("NVML could not be initialized ($res)") 31 | end 32 | atexit() do 33 | nvmlShutdown() 34 | end 35 | initialized[] = true 36 | end 37 | end 38 | 39 | macro check(ex) 40 | quote 41 | res = $(esc(ex)) 42 | if res != NVML_SUCCESS 43 | throw_api_error(res) 44 | end 45 | 46 | nothing 47 | end 48 | end 49 | -------------------------------------------------------------------------------- /src/compiler/exceptions.jl: -------------------------------------------------------------------------------- 1 | # support for device-side exceptions 2 | 3 | ## exception type 4 | 5 | struct KernelException <: Exception 6 | dev::CuDevice 7 | end 8 | 9 | function Base.showerror(io::IO, err::KernelException) 10 | print(io, "KernelException: exception thrown during kernel execution on device $(name(err.dev))") 11 | end 12 | 13 | 14 | ## exception handling 15 | 16 | const exception_flags = Dict{CuContext, Mem.HostBuffer}() 17 | 18 | # create a CPU/GPU exception flag for error signalling, and put it in the module 19 | function create_exceptions!(mod::CuModule) 20 | exception_flag = get!(exception_flags, mod.ctx, 21 | Mem.alloc(Mem.Host, sizeof(Int), Mem.HOSTALLOC_DEVICEMAP)) 22 | return reinterpret(Ptr{Cvoid}, convert(CuPtr{Cvoid}, exception_flag)) 23 | end 24 | 25 | # check the exception flags on every API call, similarly to how CUDA handles errors 26 | function check_exceptions() 27 | for (ctx,buf) in exception_flags 28 | if isvalid(ctx) 29 | ptr = convert(Ptr{Int}, buf) 30 | flag = unsafe_load(ptr) 31 | if flag != 0 32 | unsafe_store!(ptr, 0) 33 | dev = device(ctx) 34 | throw(KernelException(dev)) 35 | end 36 | end 37 | end 38 | return 39 | end 40 | -------------------------------------------------------------------------------- /lib/cudadrv/libcuda_deprecated.jl: -------------------------------------------------------------------------------- 1 | ## superseded in CUDA 
11.0 2 | 3 | @checked function cuDevicePrimaryCtxRelease(dev) 4 | ccall((:cuDevicePrimaryCtxRelease, libcuda()), CUresult, 5 | (CUdevice,), 6 | dev) 7 | end 8 | 9 | @checked function cuDevicePrimaryCtxSetFlags(dev, flags) 10 | ccall((:cuDevicePrimaryCtxSetFlags, libcuda()), CUresult, 11 | (CUdevice, UInt32), 12 | dev, flags) 13 | end 14 | 15 | @checked function cuDevicePrimaryCtxReset(dev) 16 | ccall((:cuDevicePrimaryCtxReset, libcuda()), CUresult, 17 | (CUdevice,), 18 | dev) 19 | end 20 | 21 | @checked function cuGraphInstantiate(phGraphExec, hGraph, phErrorNode, logBuffer, 22 | bufferSize) 23 | ccall((:cuGraphInstantiate, libcuda()), CUresult, 24 | (Ptr{CUgraphExec}, CUgraph, Ptr{CUgraphNode}, Cstring, Csize_t), 25 | phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize) 26 | end 27 | 28 | ## superseded in CUDA 11.1 29 | 30 | @checked function cuIpcOpenMemHandle(pdptr, handle, Flags) 31 | ccall((:cuIpcOpenMemHandle, libcuda()), CUresult, 32 | (Ptr{CUdeviceptr}, CUipcMemHandle, UInt32), 33 | pdptr, handle, Flags) 34 | end 35 | 36 | ## 37 | -------------------------------------------------------------------------------- /test/device/sparse.jl: -------------------------------------------------------------------------------- 1 | using Test 2 | using CUDA 3 | using CUDA.CUSPARSE 4 | using SparseArrays 5 | using CUDA: CuSparseDeviceVector, CuSparseDeviceMatrixCSC, CuSparseDeviceMatrixCSR, 6 | CuSparseDeviceMatrixBSR, CuSparseDeviceMatrixCOO 7 | 8 | @testset "cudaconvert" begin 9 | @test isbitstype(CuSparseDeviceVector{Float32, Cint, CUDA.AS.Global}) 10 | @test isbitstype(CuSparseDeviceMatrixCSC{Float32, Cint, CUDA.AS.Global}) 11 | @test isbitstype(CuSparseDeviceMatrixCSR{Float32, Cint, CUDA.AS.Global}) 12 | @test isbitstype(CuSparseDeviceMatrixBSR{Float32, Cint, CUDA.AS.Global}) 13 | @test isbitstype(CuSparseDeviceMatrixCOO{Float32, Cint, CUDA.AS.Global}) 14 | 15 | V = sprand(10, 0.5) 16 | cuV = CuSparseVector(V) 17 | @test cudaconvert(cuV) isa CuSparseDeviceVector{Float64, Cint, 1} 18 | 19 | A = sprand(10, 10, 0.5) 20 | cuA = CuSparseMatrixCSC(A) 21 | @test cudaconvert(cuA) isa CuSparseDeviceMatrixCSC{Float64, Cint, 1} 22 | 23 | cuA = CuSparseMatrixCSR(A) 24 | @test cudaconvert(cuA) isa CuSparseDeviceMatrixCSR{Float64, Cint, 1} 25 | 26 | cuA = CuSparseMatrixCOO(A) 27 | @test cudaconvert(cuA) isa CuSparseDeviceMatrixCOO{Float64, Cint, 1} 28 | 29 | # Roger-Luo: I'm not sure how to create a BSR matrix 30 | # cuA = CuSparseMatrixBSR(A) 31 | # @test cudaconvert(cuA) isa CuSparseDeviceMatrixBSR 32 | end 33 | -------------------------------------------------------------------------------- /perf/kernel.jl: -------------------------------------------------------------------------------- 1 | using CUDA: i32 2 | 3 | group = addgroup!(SUITE, "kernel") 4 | 5 | dummy_kernel() = nothing 6 | group["launch"] = @benchmarkable @cuda dummy_kernel() 7 | 8 | wanted_threads = 10000 9 | group["occupancy"] = @benchmarkable begin 10 | kernel = @cuda launch=false dummy_kernel() 11 | config = launch_configuration(kernel.fun) 12 | threads = min($wanted_threads, config.threads) 13 | blocks = cld($wanted_threads, threads) 14 | end 15 | 16 | src = CUDA.rand(Float32, 512, 1000) 17 | dest = similar(src) 18 | function indexing_kernel(dest, src) 19 | i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x 20 | @inbounds dest[i] = src[i] 21 | return 22 | end 23 | group["indexing"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $indexing_kernel($dest, $src) 24 | 25 | function checked_indexing_kernel(dest, 
src) 26 | i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x 27 | dest[i] = src[i] 28 | return 29 | end 30 | group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src) 31 | 32 | function rand_kernel(dest::AbstractArray{T}) where {T} 33 | i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x 34 | dest[i] = rand(T) 35 | return 36 | end 37 | group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest) 38 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The CUDA.jl package is licensed under the MIT "Expat" License: 2 | 3 | > Copyright (c) 2019-present: Julia Computing and other contributors 4 | > 5 | > Copyright (c) 2014-2018: Tim Besard 6 | > 7 | > Copyright (c) 2013: Dahua Lin 8 | > 9 | > All Rights Reserved. 10 | > 11 | > Permission is hereby granted, free of charge, to any person obtaining a copy 12 | > of this software and associated documentation files (the "Software"), to deal 13 | > in the Software without restriction, including without limitation the rights 14 | > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 15 | > copies of the Software, and to permit persons to whom the Software is 16 | > furnished to do so, subject to the following conditions: 17 | > 18 | > The above copyright notice and this permission notice shall be included in all 19 | > copies or substantial portions of the Software. 20 | > 21 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 24 | > AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 | > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 26 | > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 27 | > SOFTWARE. 
28 | > 29 | -------------------------------------------------------------------------------- /lib/cufft/util.jl: -------------------------------------------------------------------------------- 1 | const cufftNumber = Union{cufftDoubleReal,cufftReal,cufftDoubleComplex,cufftComplex} 2 | const cufftReals = Union{cufftDoubleReal,cufftReal} 3 | const cufftComplexes = Union{cufftDoubleComplex,cufftComplex} 4 | const cufftDouble = Union{cufftDoubleReal,cufftDoubleComplex} 5 | const cufftSingle = Union{cufftReal,cufftComplex} 6 | const cufftTypeDouble = Union{Type{cufftDoubleReal},Type{cufftDoubleComplex}} 7 | const cufftTypeSingle = Union{Type{cufftReal},Type{cufftComplex}} 8 | 9 | cufftfloat(x) = _cufftfloat(float(x)) 10 | _cufftfloat(::Type{T}) where {T<:cufftReals} = T 11 | _cufftfloat(::Type{Float16}) = Float32 12 | _cufftfloat(::Type{Complex{T}}) where {T} = Complex{_cufftfloat(T)} 13 | _cufftfloat(::Type{T}) where {T} = error("type $T not supported") 14 | _cufftfloat(x::T) where {T} = _cufftfloat(T)(x) 15 | 16 | complexfloat(x::DenseCuArray{Complex{<:cufftReals}}) = x 17 | realfloat(x::DenseCuArray{<:cufftReals}) = x 18 | 19 | complexfloat(x::DenseCuArray{T}) where {T<:Complex} = copy1(typeof(cufftfloat(zero(T))), x) 20 | complexfloat(x::DenseCuArray{T}) where {T<:Real} = copy1(typeof(complex(cufftfloat(zero(T)))), x) 21 | 22 | realfloat(x::DenseCuArray{T}) where {T<:Real} = copy1(typeof(cufftfloat(zero(T))), x) 23 | 24 | function copy1(::Type{T}, x) where T 25 | y = CuArray{T}(undef, map(length, axes(x))) 26 | #copy!(y, x) 27 | y .= broadcast(xi->convert(T,xi),x) 28 | end 29 | -------------------------------------------------------------------------------- /test/ptx/vadd_parent.ptx: -------------------------------------------------------------------------------- 1 | 2 | .version 3.1 3 | .target sm_20 4 | .address_size 64 5 | 6 | .extern .func (.param .b32 func_retval0) add 7 | ( 8 | .param .b32 add_param_0, 9 | .param .b32 add_param_1 10 | ) 11 | ; 12 | 13 | .visible .entry vadd( 14 | .param .u64 vadd_param_0, 15 | .param .u64 vadd_param_1, 16 | .param .u64 vadd_param_2 17 | ) 18 | { 19 | .reg .s32 %r<8>; 20 | .reg .f32 %f<4>; 21 | .reg .s64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [vadd_param_0]; 25 | ld.param.u64 %rd2, [vadd_param_1]; 26 | ld.param.u64 %rd3, [vadd_param_2]; 27 | cvta.to.global.u64 %rd4, %rd3; 28 | mov.u32 %r1, %ntid.x; 29 | mov.u32 %r2, %ctaid.x; 30 | mov.u32 %r3, %tid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; 32 | cvta.to.global.u64 %rd5, %rd1; 33 | mul.wide.s32 %rd6, %r4, 4; 34 | add.s64 %rd7, %rd5, %rd6; 35 | cvta.to.global.u64 %rd8, %rd2; 36 | add.s64 %rd9, %rd8, %rd6; 37 | ld.global.f32 %f1, [%rd9]; 38 | ld.global.f32 %f2, [%rd7]; 39 | // Callseq Start 0 40 | { 41 | .reg .b32 temp_param_reg; 42 | .param .b32 param0; 43 | st.param.f32 [param0+0], %f2; 44 | .param .b32 param1; 45 | st.param.f32 [param1+0], %f1; 46 | .param .b32 retval0; 47 | call.uni (retval0), 48 | add, 49 | ( 50 | param0, 51 | param1 52 | ); 53 | ld.param.f32 %f3, [retval0+0]; 54 | } 55 | // Callseq End 0 56 | add.s64 %rd10, %rd4, %rd6; 57 | st.global.f32 [%rd10], %f3; 58 | ret; 59 | } 60 | 61 | 62 | -------------------------------------------------------------------------------- /lib/cudadrv/version.jl: -------------------------------------------------------------------------------- 1 | # Version management 2 | 3 | # NVML.driver_version() wrongly reports the forward compatible version, 4 | # so we record the system libcuda version when we initialize the library. 
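# (For reference: the driver encodes its version as a single integer, 1000*major +
#  10*minor (+ patch); e.g. 11040 corresponds to CUDA 11.4. The `divrem` calls in
#  `version()` and `runtime_version()` below decode exactly that.)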
5 | const _system_version = Ref{VersionNumber}() 6 | 7 | """ 8 | system_version() 9 | 10 | Returns the latest version of CUDA supported by the system driver. 11 | """ 12 | function system_version() 13 | libcuda() # initializes _system_version 14 | _system_version[] 15 | end 16 | 17 | """ 18 | version() 19 | 20 | Returns the latest version of CUDA supported by the loaded driver. 21 | """ 22 | function version() 23 | version_ref = Ref{Cint}() 24 | cuDriverGetVersion(version_ref) 25 | major, ver = divrem(version_ref[], 1000) 26 | minor, patch = divrem(ver, 10) 27 | return VersionNumber(major, minor, patch) 28 | end 29 | 30 | """ 31 | release() 32 | 33 | Returns the CUDA release part of the version as returned by [`version`](@ref). 34 | """ 35 | release() = VersionNumber(version().major, version().minor) 36 | 37 | """ 38 | runtime_version() 39 | 40 | Returns the CUDA Runtime version. 41 | """ 42 | function runtime_version() 43 | version_ref = Ref{Cint}() 44 | @ccall libcudart().cudaRuntimeGetVersion(version_ref::Ptr{Cint})::CUresult 45 | major, ver = divrem(version_ref[], 1000) 46 | minor, patch = divrem(ver, 10) 47 | return VersionNumber(major, minor, patch) 48 | end 49 | -------------------------------------------------------------------------------- /lib/nvml/NVML.jl: -------------------------------------------------------------------------------- 1 | module NVML 2 | 3 | using ..APIUtils 4 | 5 | using ..CUDA 6 | 7 | using CEnum: @cenum 8 | 9 | import Libdl 10 | 11 | 12 | function libnvml() 13 | @memoize begin 14 | if Sys.iswindows() 15 | # the NVSMI dir isn't added to PATH by the installer 16 | nvsmi = joinpath(ENV["ProgramFiles"], "NVIDIA Corporation", "NVSMI") 17 | if isdir(nvsmi) 18 | joinpath(nvsmi, "nvml.dll") 19 | else 20 | # let's just hope for the best 21 | "nvml" 22 | end 23 | else 24 | "libnvidia-ml.so.1" 25 | end 26 | end::String 27 | end 28 | 29 | function has_nvml() 30 | @memoize begin 31 | if Libdl.dlopen(libnvml(); throw_error=false) === nothing 32 | return false 33 | end 34 | 35 | # JuliaGPU/CUDA.jl#860: initialization can fail on Windows 36 | try 37 | initialize_context() 38 | catch err 39 | @error "Cannot use NVML, as it failed to initialize" exception=(err, catch_backtrace()) 40 | return false 41 | end 42 | 43 | return true 44 | end::Bool 45 | end 46 | 47 | 48 | # core library 49 | include("libnvml_common.jl") 50 | include("error.jl") 51 | include("libnvml.jl") 52 | include("libnvml_deprecated.jl") 53 | 54 | # wrappers 55 | include("system.jl") 56 | include("device.jl") 57 | 58 | end 59 | -------------------------------------------------------------------------------- /perf/latency.jl: -------------------------------------------------------------------------------- 1 | module Latency 2 | 3 | using CUDA 4 | using BenchmarkTools 5 | 6 | function main() 7 | results = BenchmarkGroup() 8 | 9 | base_cmd = Base.julia_cmd() 10 | if Base.JLOptions().project != C_NULL 11 | base_cmd = `$base_cmd --project=$(unsafe_string(Base.JLOptions().project))` 12 | end 13 | 14 | # make sure all artifacts are downloaded 15 | CUDA.version() 16 | 17 | # time to precompile the package and its dependencies 18 | precompile_cmd = 19 | `$base_cmd -e "uuid = Base.UUID(\"052768ef-5323-5732-b1bb-66c8b64840ba\") 20 | id = Base.PkgId(uuid, \"CUDA\") 21 | Base.compilecache(id)"` 22 | results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60 23 | 24 | # time to actually import the package 25 | import_cmd = 26 | `$base_cmd -e "using CUDA"` 27 | results["import"] = @benchmark run($import_cmd) 
evals=1 seconds=30 28 | 29 | # time to initialize CUDA and all other libraries 30 | initialize_time = 31 | `$base_cmd -e "using CUDA 32 | CUDA.version()"` 33 | results["initialize"] = @benchmark run($initialize_time) evals=1 seconds=30 34 | 35 | # time to actually compile a kernel 36 | ttfp_cmd = 37 | `$base_cmd -e "using CUDA 38 | kernel() = return 39 | CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"` 40 | results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60 41 | 42 | results 43 | end 44 | 45 | end 46 | 47 | Latency.main() 48 | -------------------------------------------------------------------------------- /lib/curand/CURAND.jl: -------------------------------------------------------------------------------- 1 | module CURAND 2 | 3 | using ..APIUtils 4 | 5 | using ..CUDA 6 | using ..CUDA: CUstream, libraryPropertyType, DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK 7 | using ..CUDA: libcurand, @retry_reclaim, initialize_context 8 | 9 | using CEnum: @cenum 10 | 11 | 12 | # core library 13 | include("libcurand_common.jl") 14 | include("error.jl") 15 | include("libcurand.jl") 16 | 17 | # low-level wrappers 18 | include("wrappers.jl") 19 | 20 | # high-level integrations 21 | include("random.jl") 22 | 23 | # cache for created, but unused handles 24 | const idle_curand_rngs = HandleCache{CuContext,RNG}() 25 | 26 | function default_rng() 27 | cuda = CUDA.active_state() 28 | 29 | # every task maintains library state per device 30 | LibraryState = @NamedTuple{rng::RNG} 31 | states = get!(task_local_storage(), :CURAND) do 32 | Dict{CuContext,LibraryState}() 33 | end::Dict{CuContext,LibraryState} 34 | 35 | # get library state 36 | @noinline function new_state(cuda) 37 | new_rng = pop!(idle_curand_rngs, cuda.context) do 38 | RNG() 39 | end 40 | 41 | finalizer(current_task()) do task 42 | push!(idle_curand_rngs, cuda.context, new_rng) do 43 | # no need to do anything, as the RNG is collected by its finalizer 44 | end 45 | end 46 | 47 | Random.seed!(new_rng) 48 | (; rng=new_rng) 49 | end 50 | state = get!(states, cuda.context) do 51 | new_state(cuda) 52 | end 53 | 54 | return state.rng 55 | end 56 | 57 | @deprecate seed!() CUDA.seed!() 58 | @deprecate seed!(seed) CUDA.seed!(seed) 59 | 60 | end 61 | -------------------------------------------------------------------------------- /test/device/ldg.jl: -------------------------------------------------------------------------------- 1 | @testset "ldg" begin 2 | ir = sprint(io->CUDA.code_llvm(io, CUDA.pointerref_ldg, Tuple{Core.LLVMPtr{Int,AS.Global},Int,Val{1}})) 3 | @test occursin("@llvm.nvvm.ldg", ir) 4 | end 5 | 6 | 7 | capability(device()) >= v"3.2" && @testset "unsafe_cached_load" begin 8 | 9 | @testset for T in (Int8, UInt16, Int32, UInt32, Int64, UInt64, Int128, Float32, Float64) 10 | d_a = CuArray(ones(T)) 11 | d_b = CuArray(zeros(T)) 12 | @test Array(d_a) != Array(d_b) 13 | 14 | ptr_a = reinterpret(Core.LLVMPtr{T,AS.Global}, pointer(d_a)) 15 | ptr_b = reinterpret(Core.LLVMPtr{T,AS.Global}, pointer(d_b)) 16 | 17 | let ptr_a=ptr_a, ptr_b=ptr_b #JuliaLang/julia#15276 18 | @on_device unsafe_store!(ptr_b, unsafe_cached_load(ptr_a)) 19 | end 20 | 21 | @test Array(d_a) == Array(d_b) 22 | end 23 | 24 | @testset "Const" begin 25 | function kernel(a, b, i) 26 | @inbounds b[i] = Base.Experimental.Const(a)[i] 27 | return 28 | end 29 | 30 | buf = IOBuffer() 31 | 32 | a = CuArray([0]) 33 | b = CuArray([0]) 34 | @device_code_ptx io=buf @cuda kernel(a, b, 1) 35 | @test Array(a) == Array(b) 36 | 37 | asm = String(take!(copy(buf))) 38 | @test 
occursin("ld.global.nc", asm) 39 | 40 | 41 | function copy_const(A, _B) 42 | B = Base.Experimental.Const(_B) 43 | i = threadIdx().x 44 | if i <= length(A) 45 | @inbounds A[i] = B[i] 46 | end 47 | return 48 | end 49 | 50 | x = CUDA.zeros(Float64, 32) 51 | y = CUDA.ones(Float64, length(x)) 52 | 53 | @cuda threads=length(x) copy_const(x, y) 54 | @test Array(x) == Array(y) 55 | end 56 | 57 | end 58 | -------------------------------------------------------------------------------- /lib/cutensor/CUTENSOR.jl: -------------------------------------------------------------------------------- 1 | module CUTENSOR 2 | 3 | using ..APIUtils 4 | 5 | using ..CUDA 6 | using ..CUDA: CUstream, cudaDataType 7 | using ..CUDA: libcutensor, @retry_reclaim, initialize_context 8 | 9 | using CEnum: @cenum 10 | 11 | 12 | const cudaDataType_t = cudaDataType 13 | 14 | # core library 15 | include("libcutensor_common.jl") 16 | include("error.jl") 17 | include("libcutensor.jl") 18 | 19 | # low-level wrappers 20 | include("tensor.jl") 21 | include("wrappers.jl") 22 | 23 | # high-level integrations 24 | include("interfaces.jl") 25 | 26 | # cache for created, but unused handles 27 | const idle_handles = HandleCache{CuContext,Base.RefValue{cutensorHandle_t}}() 28 | 29 | function handle() 30 | cuda = CUDA.active_state() 31 | 32 | # every task maintains library state per device 33 | LibraryState = @NamedTuple{handle::Base.RefValue{cutensorHandle_t}} 34 | states = get!(task_local_storage(), :CUTENSOR) do 35 | Dict{CuContext,LibraryState}() 36 | end::Dict{CuContext,LibraryState} 37 | 38 | # get library state 39 | @noinline function new_state(cuda) 40 | new_handle = pop!(idle_handles, cuda.context) do 41 | handle = Ref{cutensorHandle_t}() 42 | cutensorInit(handle) 43 | handle 44 | end 45 | 46 | finalizer(current_task()) do task 47 | push!(idle_handles, cuda.context, new_handle) do 48 | # CUTENSOR doesn't need to actively destroy its handle 49 | end 50 | end 51 | 52 | (; handle=new_handle) 53 | end 54 | state = get!(states, cuda.context) do 55 | new_state(cuda) 56 | end 57 | 58 | return state.handle 59 | end 60 | 61 | end 62 | -------------------------------------------------------------------------------- /examples/peakflops.jl: -------------------------------------------------------------------------------- 1 | using CUDA 2 | using CUDA: i32 3 | 4 | using Test 5 | 6 | "Dummy kernel doing 100 FMAs." 
7 | function kernel_100fma(a, b, c, out) 8 | i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x 9 | @inbounds if i <= length(out) 10 | a_val = a[i] 11 | b_val = b[i] 12 | c_val = c[i] 13 | 14 | for j in 1:33 15 | a_val = CUDA.fma(a_val, b_val, c_val) 16 | b_val = CUDA.fma(a_val, b_val, c_val) 17 | c_val = CUDA.fma(a_val, b_val, c_val) 18 | end 19 | 20 | out[i] = CUDA.fma(a_val, b_val, c_val) 21 | end 22 | 23 | return 24 | end 25 | 26 | function peakflops(n::Integer=5000, dev::CuDevice=CuDevice(0)) 27 | device!(dev) do 28 | dims = (n, n) 29 | a = round.(rand(Float32, dims) * 100) 30 | b = round.(rand(Float32, dims) * 100) 31 | c = round.(rand(Float32, dims) * 100) 32 | out = similar(a) 33 | 34 | d_a = CuArray(a) 35 | d_b = CuArray(b) 36 | d_c = CuArray(c) 37 | d_out = CuArray(out) 38 | 39 | len = prod(dims) 40 | 41 | kernel = @cuda launch=false kernel_100fma(d_a, d_b, d_c, d_out) 42 | config = launch_configuration(kernel.fun) 43 | threads = min(len, config.threads) 44 | blocks = cld(len, threads) 45 | 46 | # warm-up 47 | kernel(d_a, d_b, d_c, d_out) 48 | synchronize() 49 | 50 | secs = CUDA.@elapsed begin 51 | kernel(d_a, d_b, d_c, d_out; threads=threads, blocks=blocks) 52 | end 53 | flopcount = 200*len 54 | flops = flopcount / secs 55 | 56 | return flops 57 | end 58 | end 59 | 60 | println(peakflops()) 61 | -------------------------------------------------------------------------------- /src/device/pointer.jl: -------------------------------------------------------------------------------- 1 | # CUDA-specific operations on pointers with address spaces 2 | 3 | ## adrspace aliases 4 | 5 | export AS 6 | 7 | module AS 8 | 9 | const Generic = 0 10 | const Global = 1 11 | const Shared = 3 12 | const Constant = 4 13 | const Local = 5 14 | 15 | end 16 | 17 | 18 | ## ldg 19 | 20 | const LDGTypes = (UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, 21 | Float32, Float64) 22 | 23 | # TODO: this functionality should throw The libdevice library is a collection of NVVM bitcode functions that implement common 22 | # > functions for NVIDIA GPU devices, including math primitives and bit-manipulation 23 | # > functions. These functions are optimized for particular GPU architectures, and are 24 | # > intended to be linked with an NVVM IR module during compilation to PTX. 25 | include("intrinsics/math.jl") 26 | # TODO: native mathematical functions, CUDA C programming guide" > "C language extensions" 27 | # https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__DOUBLE.html 28 | # see /path/to/cuda/include/sm_20_intrinsics.h 29 | 30 | # functionality from libcudadevrt 31 | # 32 | # The libcudadevrt library is a collection of PTX bitcode functions that implement parts of 33 | # the CUDA API for execution on the device, such as device synchronization primitives, 34 | # dynamic kernel APIs, etc. 
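# For illustration, a host-side sketch of the dynamic parallelism this enables
# (illustrative names; requires a device and toolkit that support device-side launches):
#
#   child() = return
#   function parent()
#       @cuda dynamic=true child()   # device-side launch, serviced by libcudadevrt
#       return
#   end
#   @cuda parent()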
35 | using CEnum: @cenum 36 | include("intrinsics/libcudadevrt_common.jl") 37 | include("intrinsics/libcudadevrt.jl") 38 | include("intrinsics/cooperative_groups.jl") 39 | include("intrinsics/dynamic_parallelism.jl") 40 | -------------------------------------------------------------------------------- /test/broadcast.jl: -------------------------------------------------------------------------------- 1 | @testset "broadcast" begin 2 | @test testf((x) -> fill!(x, 1), rand(3,3)) 3 | @test testf((x, y) -> map(+, x, y), rand(2, 3), rand(2, 3)) 4 | @test testf((x) -> sin.(x), rand(2, 3)) 5 | @test testf((x) -> log.(x) .+ 1, rand(2, 3)) 6 | @test testf((x) -> 2x, rand(2, 3)) 7 | @test testf((x) -> x .^ 0, rand(2, 3)) 8 | @test testf((x) -> x .^ 1, rand(2, 3)) 9 | @test testf((x) -> x .^ 2, rand(2, 3)) 10 | @test testf((x) -> x .^ 3, rand(2, 3)) 11 | @test testf((x) -> x .^ 5, rand(2, 3)) 12 | @test testf((x) -> (z = Int32(5); x .^ z), rand(2, 3)) 13 | @test testf((x) -> (z = Float64(π); x .^ z), rand(2, 3)) 14 | @test testf((x) -> (z = Float32(π); x .^ z), rand(Float32, 2, 3)) 15 | @test testf((x, y) -> x .+ y, rand(2, 3), rand(1, 3)) 16 | @test testf((z, x, y) -> z .= x .+ y, rand(2, 3), rand(2, 3), rand(2)) 17 | @test (CuArray{Ptr{Cvoid}}(undef, 1) .= C_NULL) == CuArray([C_NULL]) 18 | @test CuArray([1,2,3]) .+ CuArray([1.0,2.0,3.0]) == CuArray([2,4,6]) 19 | 20 | @eval struct Whatever{T} 21 | x::Int 22 | end 23 | @test Array(Whatever{Int}.(CuArray([1]))) == Whatever{Int}.([1]) 24 | end 25 | 26 | # https://github.com/JuliaGPU/CUDA.jl/issues/223 27 | @testset "Ref Broadcast" begin 28 | foobar(idx, A) = A[idx] 29 | @test CuArray([42]) == foobar.(CuArray([1]), Base.RefValue(CuArray([42]))) 30 | end 31 | 32 | @testset "Broadcast Fix" begin 33 | @test testf(x -> log.(x), rand(3,3)) 34 | @test testf((x,xs) -> log.(x.+xs), Ref(1), rand(3,3)) 35 | end 36 | 37 | # https://github.com/JuliaGPU/CUDA.jl/issues/261 38 | @testset "Broadcast Ref{<:Type}" begin 39 | A = CuArray{ComplexF64}(undef, (2,2)) 40 | @test eltype(convert.(ComplexF32, A)) == ComplexF32 41 | end 42 | -------------------------------------------------------------------------------- /lib/cutensor/interfaces.jl: -------------------------------------------------------------------------------- 1 | # interfacing with other packages 2 | 3 | ## Base 4 | 5 | function Base.:(+)(A::CuTensor, B::CuTensor) 6 | α = convert(eltype(A), 1.0) 7 | γ = convert(eltype(B), 1.0) 8 | C = similar(B) 9 | elementwiseBinary!(α, A, CUTENSOR_OP_IDENTITY, γ, B, CUTENSOR_OP_IDENTITY, C, CUTENSOR_OP_ADD) 10 | end 11 | 12 | function Base.:(-)(A::CuTensor, B::CuTensor) 13 | α = convert(eltype(A), 1.0) 14 | γ = convert(eltype(B), -1.0) 15 | C = similar(B) 16 | elementwiseBinary!(α, A, CUTENSOR_OP_IDENTITY, γ, B, CUTENSOR_OP_IDENTITY, C, CUTENSOR_OP_ADD) 17 | end 18 | 19 | function Base.:(*)(A::CuTensor, B::CuTensor) 20 | tC = promote_type(eltype(A), eltype(B)) 21 | A_uniqs = [(idx, i) for (idx, i) in enumerate(A.inds) if !(i in B.inds)] 22 | B_uniqs = [(idx, i) for (idx, i) in enumerate(B.inds) if !(i in A.inds)] 23 | A_sizes = map(x->size(A,x[1]), A_uniqs) 24 | B_sizes = map(x->size(B,x[1]), B_uniqs) 25 | A_inds = map(x->Char(x[2]), A_uniqs) 26 | B_inds = map(x->Char(x[2]), B_uniqs) 27 | C = CuTensor(CUDA.zeros(tC, Dims(vcat(A_sizes, B_sizes))), vcat(A_inds, B_inds)) 28 | return mul!(C, A, B) 29 | end 30 | 31 | 32 | ## LinearAlgebra 33 | 34 | using LinearAlgebra 35 | 36 | LinearAlgebra.axpy!(a, X::CuTensor, Y::CuTensor) = elementwiseBinary!(a, X, CUTENSOR_OP_IDENTITY, 
one(eltype(Y)), Y, CUTENSOR_OP_IDENTITY, similar(Y), CUTENSOR_OP_ADD) 37 | LinearAlgebra.axpby!(a, X::CuTensor, b, Y::CuTensor) = elementwiseBinary!(a, X, CUTENSOR_OP_IDENTITY, b, Y, CUTENSOR_OP_IDENTITY, similar(Y), CUTENSOR_OP_ADD) 38 | 39 | function LinearAlgebra.mul!(C::CuTensor, A::CuTensor, B::CuTensor) 40 | contraction!(one(eltype(C)), A.data, A.inds, CUTENSOR_OP_IDENTITY, B.data, B.inds, CUTENSOR_OP_IDENTITY, zero(eltype(C)), C.data, C.inds, CUTENSOR_OP_IDENTITY, CUTENSOR_OP_IDENTITY) 41 | return C 42 | end 43 | -------------------------------------------------------------------------------- /test/cudnn/softmax.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CUDNN: 2 | cudnnSoftmaxForward, 3 | cudnnSoftmaxForward!, 4 | cudnnSoftmaxBackward, 5 | cudnnSoftmaxAlgorithm_t, 6 | CUDNN_SOFTMAX_FAST, # 0, /* straightforward implementation */ 7 | CUDNN_SOFTMAX_ACCURATE, # 1, /* subtract max from every point to avoid overflow */ 8 | CUDNN_SOFTMAX_LOG, # 2 9 | cudnnSoftmaxMode_t, 10 | CUDNN_SOFTMAX_MODE_INSTANCE, # 0, /* compute the softmax over all C, H, W for each N */ 11 | CUDNN_SOFTMAX_MODE_CHANNEL # 1 /* compute the softmax over all C for each H, W, N */ 12 | 13 | @testset "cudnn/softmax" begin 14 | ax,ay = randn(Float32,10,10),randn(Float32,10,10) 15 | cx,cy = CuArray.((ax,ay)) 16 | 17 | function softmaxtest(; 18 | alpha=1, 19 | beta=0, 20 | mode=CUDNN_SOFTMAX_MODE_INSTANCE, 21 | algo=CUDNN_SOFTMAX_FAST 22 | ) 23 | d = mode === CUDNN_SOFTMAX_MODE_INSTANCE ? 1 : 2 24 | x = ax .- maximum(ax, dims=d) 25 | y = x .- log.(sum(exp.(x), dims=d)) 26 | if algo !== CUDNN_SOFTMAX_LOG; y = exp.(y); end 27 | add1(x)=reshape(x, (size(x)..., 1)) 28 | if mode === CUDNN_SOFTMAX_MODE_CHANNEL 29 | y,cx1,cy1 = add1.((y,cx,cy)) 30 | else 31 | cx1,cy1 = cx,cy 32 | end 33 | y0 = alpha * y 34 | y1 = y0 .+ beta * ay 35 | @test y0 ≈ cudnnSoftmaxForward(cx1; algo, mode, alpha) |> Array 36 | @test y1 ≈ cudnnSoftmaxForward!(copy(cy1), cx1; algo, mode, alpha, beta) |> Array 37 | end 38 | 39 | softmaxtest() 40 | softmaxtest(alpha=2) 41 | softmaxtest(beta=2) 42 | softmaxtest(mode=CUDNN_SOFTMAX_MODE_INSTANCE) 43 | softmaxtest(mode=CUDNN_SOFTMAX_MODE_CHANNEL) 44 | softmaxtest(algo=CUDNN_SOFTMAX_FAST) 45 | softmaxtest(algo=CUDNN_SOFTMAX_ACCURATE) 46 | softmaxtest(algo=CUDNN_SOFTMAX_LOG) 47 | end 48 | -------------------------------------------------------------------------------- /test/nvml.jl: -------------------------------------------------------------------------------- 1 | using CUDA.NVML 2 | 3 | macro maybe_unsupported(ex) 4 | quote 5 | try 6 | $(esc(ex)) 7 | catch err 8 | (isa(err, NVML.NVMLError) && err.code == NVML.ERROR_NOT_SUPPORTED) || rethrow() 9 | end 10 | end 11 | end 12 | 13 | @testset "system" begin 14 | @test NVML.version() isa VersionNumber 15 | @test NVML.driver_version() isa VersionNumber 16 | @test NVML.cuda_driver_version() == CUDA.version() 17 | end 18 | 19 | @testset "devices" begin 20 | let dev = NVML.Device(0) 21 | @test dev == first(NVML.devices()) 22 | @test NVML.index(dev) == 0 23 | 24 | str = sprint(io->show(io, "text/plain", dev)) 25 | @test occursin("NVML.Device(0)", str) 26 | end 27 | 28 | cuda_dev = CuDevice(0) 29 | mig = uuid(cuda_dev) != parent_uuid(cuda_dev) 30 | 31 | # tests for the parent device 32 | let dev = NVML.Device(parent_uuid(cuda_dev)) 33 | @test NVML.uuid(dev) == parent_uuid(cuda_dev) 34 | NVML.brand(dev) 35 | @test occursin(NVML.name(dev), name(cuda_dev)) 36 | @maybe_unsupported NVML.serial(dev) 37 | 38 | 
@maybe_unsupported NVML.power_usage(dev) 39 | @maybe_unsupported NVML.energy_consumption(dev) 40 | 41 | @maybe_unsupported NVML.utilization_rates(dev) 42 | 43 | NVML.compute_mode(dev) 44 | @test NVML.compute_capability(dev) == capability(cuda_dev) 45 | end 46 | 47 | # tests for the compute instance 48 | let dev = NVML.Device(uuid(cuda_dev); mig) 49 | @test NVML.uuid(dev) == uuid(cuda_dev) 50 | @test NVML.name(dev) == name(cuda_dev) 51 | 52 | NVML.memory_info(dev) 53 | 54 | context() 55 | # FIXME: https://github.com/NVIDIA/gpu-monitoring-tools/issues/63 56 | #@test getpid() in keys(NVML.compute_processes(dev)) 57 | NVML.compute_processes(dev) 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /docs/src/installation/troubleshooting.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | 3 | 4 | ## Could not find a suitable CUDA installation 5 | 6 | This means that CUDA.jl could not find or provide a CUDA toolkit. For more information, 7 | re-run with the `JULIA_DEBUG` environment variable set to `CUDA`. 8 | 9 | If you're encountering this error when disabling artifacts through by setting 10 | `JULIA_CUDA_USE_BINARYBUILDER=false`, it is your own responsibility to make sure CUDA.jl 11 | can detect the necessary pieces, e.g., by putting CUDA's binaries and libraries in 12 | discoverable locations (i.e. on PATH, and on the library search path). Additionally, the 13 | `CUDA_HOME` environment can be used to point CUDA.jl to where the CUDA toolkit is installed, 14 | but that will only help if the contents of that directory have not been reorganized. 15 | 16 | 17 | ## UNKNOWN_ERROR(999) 18 | 19 | If you encounter this error, there are several known issues that may be causing it: 20 | 21 | - a mismatch between the CUDA driver and driver library: on Linux, look for clues in `dmesg` 22 | - the CUDA driver is in a bad state: this can happen after resume. **Try rebooting**. 23 | 24 | Generally though, it's impossible to say what's the reason for the error, but Julia is 25 | likely not to blame. Make sure your set-up works (e.g., try executing `nvidia-smi`, a CUDA C 26 | binary, etc), and if everything looks good file an issue. 27 | 28 | 29 | ## NVML library not found (on Windows) 30 | 31 | Check and make sure the `NVSMI` folder is in your `PATH`. By default it may not be. Look in 32 | `C:\Program Files\NVIDIA Corporation` for the `NVSMI` folder - you should see `nvml.dll` 33 | within it. You can add this folder to your `PATH` and check that `nvidia-smi` runs properly. 34 | 35 | 36 | ## The specified module could not be found (on Windows) 37 | 38 | Ensure the [Visual C++ Redistributable](https://aka.ms/vs/16/release/vc_redist.x64.exe) is 39 | installed. 40 | -------------------------------------------------------------------------------- /test/threading.jl: -------------------------------------------------------------------------------- 1 | # FIXME: these tests regularly triggers illegal memory accesses 2 | # after having moved to distributed test execution, 3 | # regardless of the memory pool or system. 
4 | 5 | @testset "threaded execution" begin 6 | function kernel(a, tid, id) 7 | a[1] = tid 8 | a[2] = id 9 | return 10 | end 11 | 12 | test_lock = ReentrantLock() 13 | Threads.@threads for id in 1:10 14 | da = CuArray{Int}(undef, 2) 15 | tid = Threads.threadid() 16 | @cuda kernel(da, tid, id) 17 | 18 | a = Array(da) 19 | lock(test_lock) do 20 | @test a == [tid, id] 21 | end 22 | end 23 | end 24 | 25 | @testset "threaded arrays" begin 26 | test_lock = ReentrantLock() 27 | Threads.@threads for i in 1:Threads.nthreads()*100 28 | # uses libraries (rand, gemm) to test library handles 29 | # allocates and uses unsafe_free to cover the allocator 30 | da = CUDA.rand(64, 64) 31 | db = CUDA.rand(64, 64) 32 | yield() 33 | dc = da * db 34 | yield() 35 | 36 | # @testset is not thread safe 37 | a = Array(da) 38 | b = Array(db) 39 | c = Array(dc) 40 | lock(test_lock) do 41 | @test c ≈ a * b 42 | end 43 | 44 | yield() 45 | CUDA.unsafe_free!(da) 46 | CUDA.unsafe_free!(db) 47 | end 48 | end 49 | 50 | @testset "threaded device usage" begin 51 | test_lock = ReentrantLock() 52 | Threads.@threads for i in 1:Threads.nthreads()*100 53 | dev = rand(1:length(devices())) 54 | device!(dev-1) do 55 | da = CUDA.rand(64, 64) 56 | db = CUDA.rand(64, 64) 57 | yield() 58 | dc = da * (db .* 2) 59 | yield() 60 | 61 | a = Array(da) 62 | b = Array(db) 63 | c = Array(dc) 64 | lock(test_lock) do 65 | @test c ≈ a * (b .* 2) 66 | end 67 | 68 | yield() 69 | CUDA.unsafe_free!(da) 70 | CUDA.unsafe_free!(db) 71 | end 72 | end 73 | end 74 | -------------------------------------------------------------------------------- /lib/cudnn/util.jl: -------------------------------------------------------------------------------- 1 | # For low level cudnn functions that require a pointer to a number 2 | cptr(x,a::DenseCuArray{Float64})=Float64[x] 3 | cptr(x,a::DenseCuArray{Float32})=Float32[x] 4 | cptr(x,a::DenseCuArray{Float16})=Float32[x] 5 | 6 | # Conversion between Julia and CUDNN datatypes 7 | cudnnDataType(::Type{Float16})=CUDNN_DATA_HALF 8 | cudnnDataType(::Type{Float32})=CUDNN_DATA_FLOAT 9 | cudnnDataType(::Type{Float64})=CUDNN_DATA_DOUBLE 10 | cudnnDataType(::Type{Int8}) = CUDNN_DATA_INT8 11 | cudnnDataType(::Type{UInt8}) = CUDNN_DATA_UINT8 12 | cudnnDataType(::Type{Int32}) = CUDNN_DATA_INT32 13 | # The following are 32-bit elements each composed of 4 8-bit integers, only supported with CUDNN_TENSOR_NCHW_VECT_C 14 | # CUDNN_DATA_INT8x4, 15 | # CUDNN_DATA_UINT8x4, 16 | # CUDNN_DATA_INT8x32, 17 | juliaDataType(a)=(a==CUDNN_DATA_HALF ? Float16 : 18 | a==CUDNN_DATA_FLOAT ? Float32 : 19 | a==CUDNN_DATA_DOUBLE ? Float64 : 20 | a==CUDNN_DATA_INT8 ? Int8 : 21 | a==CUDNN_DATA_UINT8 ? UInt8 : 22 | a==CUDNN_DATA_INT32 ? Int32 : error()) 23 | 24 | tuple_strides(A::Tuple) = _strides((1,), A) 25 | _strides(out::Tuple{Int}, A::Tuple{}) = () 26 | _strides(out::NTuple{N,Int}, A::NTuple{N}) where {N} = out 27 | @inline function _strides(out::NTuple{M,Int}, A::Tuple) where M 28 | _strides((out..., out[M]*A[M]), A) 29 | end 30 | 31 | # The storage data types for alpha and beta are: 32 | # float for HALF and FLOAT tensors, and 33 | # double for DOUBLE tensors. 34 | scalingParameter(T, val) = error("Unknown tensor type $T") 35 | scalingParameter(::Type{Float16}, val) = Ref{Float32}(val) 36 | scalingParameter(::Type{Float32}, val) = Ref{Float32}(val) 37 | scalingParameter(::Type{Float64}, val) = Ref{Float64}(val) 38 | 39 | 40 | # Create temporary reserveSpace. Use 128 to avoid alignment issues. 41 | function cudnnTempSpace(nbytes) 42 | nbytes == 0 ? 
nothing : CuArray{Int128}(undef, (nbytes-1)÷sizeof(Int128)+1) 43 | end 44 | -------------------------------------------------------------------------------- /src/gpuarrays.jl: -------------------------------------------------------------------------------- 1 | # GPUArrays.jl interface 2 | 3 | 4 | # 5 | # Device functionality 6 | # 7 | 8 | 9 | ## execution 10 | 11 | struct CuArrayBackend <: AbstractGPUBackend end 12 | 13 | struct CuKernelContext <: AbstractKernelContext end 14 | 15 | @inline function GPUArrays.launch_heuristic(::CuArrayBackend, f::F, args::Vararg{Any,N}; 16 | elements::Int, elements_per_thread::Int) where {F,N} 17 | kernel = @cuda launch=false f(CuKernelContext(), args...) 18 | 19 | # launching many large blocks) lowers performance, as observed with broadcast, so cap 20 | # the block size if we don't have a grid-stride kernel (which would keep the grid small) 21 | if elements_per_thread > 1 22 | launch_configuration(kernel.fun) 23 | else 24 | launch_configuration(kernel.fun; max_threads=256) 25 | end 26 | end 27 | 28 | @inline function GPUArrays.gpu_call(::CuArrayBackend, f::F, args::TT, threads::Int, 29 | blocks::Int; name::Union{String,Nothing}) where {F,TT} 30 | @cuda threads=threads blocks=blocks name=name f(CuKernelContext(), args...) 31 | end 32 | 33 | 34 | ## on-device 35 | 36 | # indexing 37 | 38 | GPUArrays.blockidx(ctx::CuKernelContext) = blockIdx().x 39 | GPUArrays.blockdim(ctx::CuKernelContext) = blockDim().x 40 | GPUArrays.threadidx(ctx::CuKernelContext) = threadIdx().x 41 | GPUArrays.griddim(ctx::CuKernelContext) = gridDim().x 42 | 43 | # memory 44 | 45 | @inline function GPUArrays.LocalMemory(::CuKernelContext, ::Type{T}, ::Val{dims}, ::Val{id} 46 | ) where {T, dims, id} 47 | ptr = CUDA._shmem(Val(id), T, Val(prod(dims))) 48 | CuDeviceArray(dims, reinterpret(LLVMPtr{T, AS.Shared}, ptr)) 49 | end 50 | 51 | # synchronization 52 | 53 | @inline GPUArrays.synchronize_threads(::CuKernelContext) = sync_threads() 54 | 55 | 56 | 57 | # 58 | # Host abstractions 59 | # 60 | 61 | GPUArrays.backend(::Type{<:CuArray}) = CuArrayBackend() 62 | -------------------------------------------------------------------------------- /src/device/utils.jl: -------------------------------------------------------------------------------- 1 | # helpers for writing device functionality 2 | 3 | # helper type for writing Int32 literals 4 | # TODO: upstream this 5 | struct Literal{T} end 6 | Base.:(*)(x, ::Type{Literal{T}}) where {T} = T(x) 7 | const i32 = Literal{Int32} 8 | 9 | # local method table for device functions 10 | @static if isdefined(Base.Experimental, Symbol("@overlay")) 11 | Base.Experimental.@MethodTable(method_table) 12 | else 13 | const method_table = nothing 14 | end 15 | 16 | # list of overrides (only for Julia 1.6) 17 | const overrides = Expr[] 18 | 19 | macro device_override(ex) 20 | code = quote 21 | $GPUCompiler.@override(CUDA.method_table, $ex) 22 | end 23 | if isdefined(Base.Experimental, Symbol("@overlay")) 24 | return esc(code) 25 | else 26 | push!(overrides, code) 27 | return 28 | end 29 | end 30 | 31 | macro device_function(ex) 32 | ex = macroexpand(__module__, ex) 33 | def = splitdef(ex) 34 | 35 | # generate a function that errors 36 | def[:body] = quote 37 | error("This function is not intended for use on the CPU") 38 | end 39 | 40 | esc(quote 41 | $(combinedef(def)) 42 | @device_override $ex 43 | end) 44 | end 45 | 46 | macro device_functions(ex) 47 | ex = macroexpand(__module__, ex) 48 | 49 | # recursively prepend `@device_function` to all function definitions 
50 | function rewrite(block) 51 | out = Expr(:block) 52 | for arg in block.args 53 | if Meta.isexpr(arg, :block) 54 | # descend in blocks 55 | push!(out.args, rewrite(arg)) 56 | elseif Meta.isexpr(arg, [:function, :(=)]) 57 | # rewrite function definitions 58 | push!(out.args, :(@device_function $arg)) 59 | else 60 | # preserve all the rest 61 | push!(out.args, arg) 62 | end 63 | end 64 | out 65 | end 66 | 67 | esc(rewrite(ex)) 68 | end 69 | -------------------------------------------------------------------------------- /lib/nvtx/highlevel.jl: -------------------------------------------------------------------------------- 1 | # 2 | # domains 3 | # 4 | 5 | export Domain, domain 6 | 7 | struct Domain 8 | handle::nvtxDomainHandle_t 9 | 10 | function Domain(name::String) 11 | handle = nvtxDomainCreateA(name) 12 | new(handle) 13 | end 14 | end 15 | 16 | Base.unsafe_convert(::Type{nvtxDomainHandle_t}, dom::Domain) = dom.handle 17 | 18 | unsafe_destroy!(dom::Domain) = nvtxDomainDestroy(dom) 19 | 20 | function Domain(f::Function, name::String) 21 | dom = Domain(name) 22 | f(dom) 23 | unsafe_destroy!(dom) 24 | end 25 | 26 | 27 | # 28 | # markers 29 | # 30 | 31 | export mark 32 | 33 | mark(msg::String) = nvtxMarkA(msg) 34 | 35 | 36 | # 37 | # ranges 38 | # 39 | 40 | export Range, start_range, stop_range, @range 41 | 42 | struct Range 43 | id::nvtxRangeId_t 44 | end 45 | 46 | Base.convert(::Type{nvtxRangeId_t}, range::Range) = range.id 47 | 48 | """ 49 | start_range(msg) 50 | 51 | Create and start a new range. The range is not automatically stopped, use 52 | [`end_range(::Range)`](@ref) for that. 53 | 54 | Use this API if you need overlapping ranges, for scope-based use [`@range`](@ref) instead. 55 | """ 56 | start_range(msg::String) = nvtxRangeStartA(msg) 57 | end_range(r::Range) = nvtxRangeEnd(r) 58 | 59 | push_range(msg::String) = nvtxRangePushA(msg) 60 | pop_range() = nvtxRangePop() 61 | 62 | """ 63 | @range "msg" ex 64 | @range function ... end 65 | 66 | Create a new range and execute `ex`. The range is popped automatically afterwards. 67 | 68 | See also: [`range`](@ref) 69 | """ 70 | macro range(msg, ex) 71 | quote 72 | push_range($(esc(msg))) 73 | local ret = $(esc(ex)) 74 | pop_range() 75 | ret 76 | end 77 | end 78 | macro range(ex) 79 | def = splitdef(ex) 80 | def[:body] = quote 81 | $push_range($(string(def[:name]))) 82 | try 83 | $(def[:body]) 84 | finally 85 | $pop_range() 86 | end 87 | end 88 | esc(combinedef(def)) 89 | end 90 | -------------------------------------------------------------------------------- /lib/utils/threading.jl: -------------------------------------------------------------------------------- 1 | export @spinlock, @lock, LazyInitialized 2 | 3 | const var"@lock" = Base.var"@lock" 4 | 5 | # a safe way to acquire locks from finalizers, where we can't wait (which switches tasks) 6 | macro spinlock(l, ex) 7 | quote 8 | temp = $(esc(l)) 9 | while !trylock(temp) 10 | ccall(:jl_cpu_pause, Cvoid, ()) 11 | # Temporary solution before we have gc transition support in codegen. 12 | ccall(:jl_gc_safepoint, Cvoid, ()) 13 | # we can't yield here 14 | end 15 | try 16 | $(esc(ex)) 17 | finally 18 | unlock(temp) 19 | end 20 | end 21 | end 22 | 23 | 24 | """ 25 | LazyInitialized{T}() 26 | 27 | A thread-safe, lazily-initialized wrapper for a value of type `T`. Initialize and fetch the 28 | value by calling `get!`. The constructor is ensured to only be called once. 29 | 30 | This type is intended for lazy initialization of e.g. global structures, without using 31 | `__init__`. 
It is similar to protecting accesses using a lock, but is much cheaper. 32 | 33 | """ 34 | struct LazyInitialized{T} 35 | # 0: uninitialized 36 | # 1: initializing 37 | # 2: initialized 38 | guard::Threads.Atomic{Int} 39 | value::Base.RefValue{T} 40 | 41 | LazyInitialized{T}() where {T} = 42 | new(Threads.Atomic{Int}(0), Ref{T}()) 43 | end 44 | 45 | function Base.get!(constructor, x::LazyInitialized; hook=nothing) 46 | while x.guard[] != 2 47 | initialize!(x, constructor, hook) 48 | end 49 | assume(isassigned(x.value)) # to get rid of the check 50 | x.value[] 51 | end 52 | 53 | @noinline function initialize!(x::LazyInitialized, constructor::F1, hook::F2) where {F1, F2} 54 | status = Threads.atomic_cas!(x.guard, 0, 1) 55 | if status == 0 56 | try 57 | x.value[] = constructor() 58 | x.guard[] = 2 59 | catch 60 | x.guard[] = 0 61 | rethrow() 62 | end 63 | 64 | if hook !== nothing 65 | hook() 66 | end 67 | else 68 | yield() 69 | end 70 | return 71 | end 72 | -------------------------------------------------------------------------------- /test/cutensor/permutations.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CUTENSOR 2 | using CUDA 3 | using LinearAlgebra 4 | 5 | # using host memory with CUTENSOR doesn't work on Windows 6 | can_pin = !Sys.iswindows() 7 | 8 | eltypes = ((Float16, Float16), 9 | #(Float16, Float32), 10 | (Float32, Float32), 11 | #(Float32, Float64), 12 | (Float64, Float64), 13 | #(ComplexF16, ComplexF16), 14 | #(ComplexF16, ComplexF32), 15 | (ComplexF32, ComplexF32), 16 | #(ComplexF32, ComplexF64), 17 | (ComplexF64, ComplexF64)) 18 | @testset for N=2:5 19 | @testset for (eltyA, eltyC) in eltypes 20 | # setup 21 | dmax = 2^div(18,N) 22 | dims = rand(2:dmax, N) 23 | p = randperm(N) 24 | indsA = collect(('a':'z')[1:N]) 25 | indsC = indsA[p] 26 | dimsA = dims 27 | dimsC = dims[p] 28 | A = rand(eltyA, dimsA...) 29 | can_pin && Mem.pin(A) 30 | dA = CuArray(A) 31 | dC = similar(dA, eltyC, dimsC...) 32 | 33 | # simple case 34 | dC = CUTENSOR.permutation!(one(eltyA), dA, indsA, dC, indsC) 35 | C = collect(dC) 36 | @test C == permutedims(A, p) # exact equality 37 | if can_pin 38 | Csimple = zeros(eltyC, dimsC...) 39 | Mem.pin(Csimple) 40 | Csimple = CUDA.@sync CUTENSOR.permutation!(one(eltyA), A, indsA, Csimple, indsC) 41 | @test Csimple == permutedims(A, p) # exact equality 42 | end 43 | 44 | # with scalar 45 | α = rand(eltyA) 46 | dC = CUTENSOR.permutation!(α, dA, indsA, dC, indsC) 47 | C = collect(dC) 48 | @test C ≈ α * permutedims(A, p) # approximate, floating point rounding 49 | if can_pin 50 | Cscalar = zeros(eltyC, dimsC...) 51 | Mem.pin(Cscalar) 52 | Cscalar = CUDA.@sync CUTENSOR.permutation!(α, A, indsA, Cscalar, indsC) 53 | @test Cscalar ≈ α * permutedims(A, p) # approximate, floating point rounding 54 | end 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Sanity checks (read this first, then remove this section)** 11 | 12 | - [ ] Make sure you're reporting *a bug*; for general questions, please use Discourse or 13 | Slack. 14 | 15 | - [ ] If you're dealing with a performance issue, make sure you **disable scalar iteration** 16 | (`CUDA.allowscalar(false)`). 
Only file an issue if that shows scalar iteration happening 17 | in CUDA.jl or Base Julia, as opposed to your own code. 18 | 19 | - [ ] If you're seeing an error message, **follow the error message instructions**, if any 20 | (e.g. `inspect code with @device_code_warntype`). If you can't solve the problem using 21 | that information, make sure to post it as part of the issue. 22 | 23 | - [ ] Always ensure you're using the latest version of CUDA.jl, and if possible, please 24 | check the master branch to see if your issue hasn't been resolved yet. 25 | 26 | If your bug is still valid, please go ahead and fill out the template below. 27 | 28 | 29 | **Describe the bug** 30 | 31 | A clear and concise description of what the bug is. 32 | 33 | 34 | **To reproduce** 35 | 36 | The Minimal Working Example (MWE) for this bug: 37 | 38 | ```julia 39 | # some code here 40 | ``` 41 | 42 |
<details><summary>Manifest.toml</summary> 43 | 44 | 45 | ``` 46 | Paste your Manifest.toml here, or accurately describe which version of CUDA.jl and its dependencies (GPUArrays.jl, GPUCompiler.jl, LLVM.jl) you are using. 47 | ``` 48 | 49 | 50 | </details>
51 | 52 | 53 | **Expected behavior** 54 | 55 | A clear and concise description of what you expected to happen. 56 | 57 | 58 | **Version info** 59 | 60 | Details on Julia: 61 | 62 | ``` 63 | # please post the output of: 64 | versioninfo() 65 | ``` 66 | 67 | Details on CUDA: 68 | 69 | ``` 70 | # please post the output of: 71 | CUDA.versioninfo() 72 | ``` 73 | 74 | 75 | **Additional context** 76 | 77 | Add any other context about the problem here. 78 | -------------------------------------------------------------------------------- /lib/nvml/libnvml_deprecated.jl: -------------------------------------------------------------------------------- 1 | ## Deprecated in CUDA 11.1 2 | 3 | struct nvmlDeviceAttributesV1_st 4 | multiprocessorCount::UInt32 5 | sharedCopyEngineCount::UInt32 6 | sharedDecoderCount::UInt32 7 | sharedEncoderCount::UInt32 8 | sharedJpegCount::UInt32 9 | sharedOfaCount::UInt32 10 | end 11 | 12 | const nvmlDeviceAttributesV1_t = nvmlDeviceAttributesV1_st 13 | 14 | @checked function nvmlDeviceGetAttributes(device, attributes) 15 | initialize_context() 16 | ccall((:nvmlDeviceGetAttributes, libnvml()), nvmlReturn_t, 17 | (nvmlDevice_t, Ptr{nvmlDeviceAttributesV1_t}), 18 | device, attributes) 19 | end 20 | 21 | struct nvmlProcessInfoV1_st 22 | pid::UInt32 23 | usedGpuMemory::Culonglong 24 | end 25 | 26 | const nvmlProcessInfoV1_t = nvmlProcessInfoV1_st 27 | 28 | @checked function nvmlDeviceGetComputeRunningProcesses(device, infoCount, infos) 29 | initialize_context() 30 | ccall((:nvmlDeviceGetComputeRunningProcesses, libnvml()), nvmlReturn_t, 31 | (nvmlDevice_t, Ptr{UInt32}, Ptr{nvmlProcessInfoV1_t}), 32 | device, infoCount, infos) 33 | end 34 | 35 | @checked function nvmlDeviceGetGraphicsRunningProcesses(device, infoCount, infos) 36 | initialize_context() 37 | ccall((:nvmlDeviceGetGraphicsRunningProcesses, libnvml()), nvmlReturn_t, 38 | (nvmlDevice_t, Ptr{UInt32}, Ptr{nvmlProcessInfoV1_t}), 39 | device, infoCount, infos) 40 | end 41 | 42 | ## Superseded in CUDA 11.2 43 | 44 | struct nvmlComputeInstanceInfoV1_st 45 | device::nvmlDevice_t 46 | gpuInstance::nvmlGpuInstance_t 47 | id::UInt32 48 | profileId::UInt32 49 | end 50 | 51 | const nvmlComputeInstanceInfoV1_t = nvmlComputeInstanceInfoV1_st 52 | 53 | @checked function nvmlComputeInstanceGetInfo(computeInstance, info) 54 | initialize_context() 55 | ccall((:nvmlComputeInstanceGetInfo, libnvml()), nvmlReturn_t, 56 | (nvmlComputeInstance_t, Ptr{nvmlComputeInstanceInfoV1_t}), 57 | computeInstance, info) 58 | end 59 | 60 | ## 61 | -------------------------------------------------------------------------------- /src/device/intrinsics/version.jl: -------------------------------------------------------------------------------- 1 | # device intrinsics for querying the compute SimpleVersion and PTX ISA version 2 | 3 | 4 | ## a GPU-compatible version number 5 | 6 | export SimpleVersion, @sv_str 7 | 8 | struct SimpleVersion 9 | major::UInt32 10 | minor::UInt32 11 | 12 | SimpleVersion(major, minor=0) = new(major, minor) 13 | end 14 | 15 | function Base.tryparse(::Type{SimpleVersion}, v::AbstractString) 16 | parts = split(v, ".") 17 | 1 <= length(parts) <= 2 || return nothing 18 | 19 | int_parts = map(parts) do part 20 | tryparse(Int, part) 21 | end 22 | any(isnothing, int_parts) && return nothing 23 | 24 | SimpleVersion(int_parts...) 
25 | end 26 | 27 | function Base.parse(::Type{SimpleVersion}, v::AbstractString) 28 | ver = tryparse(SimpleVersion, v) 29 | ver === nothing && throw(ArgumentError("invalid SimpleVersion string: '$v'")) 30 | return ver 31 | end 32 | 33 | SimpleVersion(v::AbstractString) = parse(SimpleVersion, v) 34 | 35 | @inline function Base.isless(a::SimpleVersion, b::SimpleVersion) 36 | (a.major < b.major) && return true 37 | (a.major > b.major) && return false 38 | (a.minor < b.minor) && return true 39 | (a.minor > b.minor) && return false 40 | return false 41 | end 42 | 43 | macro sv_str(str) 44 | SimpleVersion(str) 45 | end 46 | 47 | 48 | ## accessors for the compute SimpleVersion and PTX ISA version 49 | 50 | export compute_capability, ptx_isa_version 51 | 52 | for var in ["sm_major", "sm_minor", "ptx_major", "ptx_minor"] 53 | @eval @inline $(Symbol(var))() = 54 | Base.llvmcall( 55 | $("""@$var = external global i32 56 | define i32 @entry() #0 { 57 | %val = load i32, i32* @$var 58 | ret i32 %val 59 | } 60 | attributes #0 = { alwaysinline } 61 | """, "entry"), UInt32, Tuple{}) 62 | end 63 | 64 | @device_function @inline compute_capability() = SimpleVersion(sm_major(), sm_minor()) 65 | @device_function @inline ptx_isa_version() = SimpleVersion(ptx_major(), ptx_minor()) 66 | 67 | -------------------------------------------------------------------------------- /lib/cusolver/error.jl: -------------------------------------------------------------------------------- 1 | export CUSOLVERError 2 | 3 | struct CUSOLVERError <: Exception 4 | code::cusolverStatus_t 5 | end 6 | 7 | Base.convert(::Type{cusolverStatus_t}, err::CUSOLVERError) = err.code 8 | 9 | Base.showerror(io::IO, err::CUSOLVERError) = 10 | print(io, "CUSOLVERError: ", description(err), " (code $(reinterpret(Int32, err.code)), $(name(err)))") 11 | 12 | name(err::CUSOLVERError) = string(err.code) 13 | 14 | ## COV_EXCL_START 15 | function description(err) 16 | if err.code == CUSOLVER_STATUS_SUCCESS 17 | "the operation completed successfully" 18 | elseif err.code == CUSOLVER_STATUS_NOT_INITIALIZED 19 | "the library was not initialized" 20 | elseif err.code == CUSOLVER_STATUS_ALLOC_FAILED 21 | "the resource allocation failed" 22 | elseif err.code == CUSOLVER_STATUS_INVALID_VALUE 23 | "an invalid value was used as an argument" 24 | elseif err.code == CUSOLVER_STATUS_ARCH_MISMATCH 25 | "an absent device architectural feature is required" 26 | elseif err.code == CUSOLVER_STATUS_EXECUTION_FAILED 27 | "the GPU program failed to execute" 28 | elseif err.code == CUSOLVER_STATUS_INTERNAL_ERROR 29 | "an internal operation failed" 30 | elseif err.code == CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED 31 | "the matrix type is not supported." 32 | else 33 | "no description for this error" 34 | end 35 | end 36 | ## COV_EXCL_STOP 37 | 38 | 39 | ## API call wrapper 40 | 41 | # outlined functionality to avoid GC frame allocation 42 | @noinline function throw_api_error(res) 43 | if res == CUSOLVER_STATUS_ALLOC_FAILED 44 | throw(OutOfGPUMemoryError()) 45 | else 46 | throw(CUSOLVERError(res)) 47 | end 48 | end 49 | 50 | macro check(ex, errs...) 
51 | check = :(isequal(err, CUSOLVER_STATUS_ALLOC_FAILED)) 52 | for err in errs 53 | check = :($check || isequal(err, $(esc(err)))) 54 | end 55 | 56 | quote 57 | res = @retry_reclaim err->$check $(esc(ex)) 58 | if res != CUSOLVER_STATUS_SUCCESS 59 | throw_api_error(res) 60 | end 61 | 62 | nothing 63 | end 64 | end 65 | -------------------------------------------------------------------------------- /docs/src/tutorials/custom_structs.jl: -------------------------------------------------------------------------------- 1 | # # Using custom structs 2 | # 3 | # This tutorial shows how to use custom structs on the GPU. Our example will be a one dimensional 4 | # interpolation. Lets start with the CPU version: 5 | using CUDA 6 | 7 | struct Interpolate{A} 8 | xs::A 9 | ys::A 10 | end 11 | 12 | function (itp::Interpolate)(x) 13 | i = searchsortedfirst(itp.xs, x) 14 | i = clamp(i, firstindex(itp.ys), lastindex(itp.ys)) 15 | @inbounds itp.ys[i] 16 | end 17 | 18 | xs_cpu = [1.0, 2.0, 3.0] 19 | ys_cpu = [10.0,20.0,30.0] 20 | itp_cpu = Interpolate(xs_cpu, ys_cpu) 21 | pts_cpu = [1.1,2.3] 22 | result_cpu = itp_cpu.(pts_cpu) 23 | 24 | # Ok the CPU code works, let's move our data to the GPU: 25 | itp = Interpolate(CuArray(xs_cpu), CuArray(ys_cpu)) 26 | pts = CuArray(pts_cpu); 27 | # If we try to call our interpolate `itp.(pts)`, we get an error however: 28 | # ``` 29 | # ... 30 | # KernelError: passing and using non-bitstype argument 31 | # ... 32 | # ``` 33 | # Why does it throw an error? Our calculation involves 34 | # a custom type `Interpolate{CuArray{Float64, 1}}`. 35 | # At the end of the day all arguments of a CUDA kernel need to 36 | # be bitstypes. However we have 37 | isbitstype(typeof(itp)) 38 | # How to fix this? The answer is, that there is a conversion mechanism, which adapts objects into 39 | # CUDA compatible bitstypes. 40 | # It is based on the [Adapt.jl](https://github.com/JuliaGPU/Adapt.jl) package and basic types like `CuArray` already participate in this mechanism. For custom types, 41 | # we just need to add a conversion rule like so: 42 | import Adapt 43 | function Adapt.adapt_structure(to, itp::Interpolate) 44 | xs = Adapt.adapt_structure(to, itp.xs) 45 | ys = Adapt.adapt_structure(to, itp.ys) 46 | Interpolate(xs, ys) 47 | end 48 | # Now our struct plays nicely with CUDA.jl: 49 | result = itp.(pts) 50 | # It works, we get the same result as on the CPU. 51 | @assert CuArray(result_cpu) == result 52 | # Alternatively instead of defining `Adapt.adapt_structure` explictly, we could have done 53 | # ```julia 54 | # Adapt.@adapt_structure Interpolate 55 | # ``` 56 | # which expands to the same code that we wrote manually. 
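# As a quick sanity check (a sketch, not part of the original tutorial: it assumes CUDA.jl's
# `cudaconvert`, which applies these Adapt rules when arguments are passed to a kernel), the
# adapted object should now be a bitstype:
# ```julia
# isbitstype(typeof(cudaconvert(itp)))  # true once the `adapt_structure` rule is defined
# ```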
57 | -------------------------------------------------------------------------------- /lib/cublas/README.md: -------------------------------------------------------------------------------- 1 | # CUBLAS implementation progress 2 | 3 | The following sections list the CUBLAS functions shown on the CUBLAS 4 | documentation page: 5 | 6 | http://docs.nvidia.com/cuda/cublas/index.html 7 | 8 | ## Level 1 (13 functions) 9 | 10 | CUBLAS functions: 11 | 12 | * [x] amax 13 | * [x] amin 14 | * [x] asum 15 | * [x] axpy 16 | * [x] copy 17 | * [x] dot, dotc, dotu 18 | * [x] nrm2 19 | * [ ] rot (not implemented in julia blas.jl) 20 | * [ ] rotg (not implemented in julia blas.jl) 21 | * [ ] rotm (not implemented in julia blas.jl) 22 | * [ ] rotmg (not implemented in julia blas.jl) 23 | * [x] scal 24 | * [ ] swap (not implemented in julia blas.jl) 25 | 26 | ## Level 2 27 | 28 | Key: 29 | * `ge`: general 30 | * `gb`: general banded 31 | * `sy`: symmetric 32 | * `sb`: symmetric banded 33 | * `sp`: symmetric packed 34 | * `tr`: triangular 35 | * `tb`: triangular banded 36 | * `tp`: triangular packed 37 | * `he`: hermitian 38 | * `hb`: hermitian banded 39 | * `hp`: hermitian packed 40 | 41 | CUBLAS functions: 42 | 43 | * [x] gbmv (in julia/blas.jl) 44 | * [x] gemv (in julia/blas.jl) 45 | * [x] ger (in julia/blas.jl) 46 | * [x] sbmv (in julia/blas.jl) 47 | * [ ] spmv 48 | * [ ] spr 49 | * [ ] spr2 50 | * [x] symv (in julia/blas.jl) 51 | * [x] syr (in julia/blas.jl) 52 | * [ ] syr2 53 | * [x] tbmv 54 | * [x] tbsv 55 | * [ ] tpmv 56 | * [ ] tpsv 57 | * [x] trmv (in julia/blas.jl) 58 | * [x] trsv (in julia/blas.jl) 59 | * [x] hemv (in julia/blas.jl) 60 | * [x] hbmv 61 | * [ ] hpmv 62 | * [x] her (in julia/blas.jl) 63 | * [x] her2 64 | * [ ] hpr 65 | * [ ] hpr2 66 | 67 | ## Level 3 68 | 69 | CUBLAS functions: 70 | 71 | * [x] gemm (in julia/blas.jl) 72 | * [x] gemmBatched 73 | * [x] symm (in julia/blas.jl) 74 | * [x] syrk (in julia/blas.jl) 75 | * [x] syr2k (in julia/blas.jl) 76 | * [ ] syrkx 77 | * [x] trmm (in julia/blas.jl) 78 | * [x] trsm (in julia/blas.jl) 79 | * [x] trsmBatched 80 | * [x] hemm 81 | * [x] herk (in julia/blas.jl) 82 | * [x] her2k (in julia/blas.jl) 83 | * [ ] herkx 84 | 85 | ## BLAS-like extensions 86 | 87 | * [x] geam 88 | * [x] dgmm 89 | * [x] getrfBatched 90 | * [x] getriBatched 91 | * [x] geqrfBatched 92 | * [x] gelsBatched 93 | * [ ] tpttr 94 | * [ ] trttp 95 | -------------------------------------------------------------------------------- /lib/cublas/util.jl: -------------------------------------------------------------------------------- 1 | # convert matrix to band storage 2 | function band(A::AbstractMatrix,kl,ku) 3 | m, n = size(A) 4 | AB = zeros(eltype(A),kl+ku+1,n) 5 | for j = 1:n 6 | for i = max(1,j-ku):min(m,j+kl) 7 | AB[ku+1-j+i,j] = A[i,j] 8 | end 9 | end 10 | return AB 11 | end 12 | 13 | # convert band storage to general matrix 14 | function unband(AB::AbstractMatrix,m,kl,ku) 15 | bm, n = size(AB) 16 | A = zeros(eltype(AB),m,n) 17 | for j = 1:n 18 | for i = max(1,j-ku):min(m,j+kl) 19 | A[i,j] = AB[ku+1-j+i,j] 20 | end 21 | end 22 | return A 23 | end 24 | 25 | # zero out elements not on matrix bands 26 | function bandex(A::AbstractMatrix,kl,ku) 27 | m, n = size(A) 28 | AB = band(A,kl,ku) 29 | B = unband(AB,m,kl,ku) 30 | return B 31 | end 32 | 33 | const CublasFloat = Union{Float64,Float32,ComplexF64,ComplexF32} 34 | const CublasReal = Union{Float64,Float32} 35 | const CublasComplex = Union{ComplexF64,ComplexF32} 36 | 37 | function Base.convert(::Type{cublasOperation_t}, 
trans::Char) 38 | if trans == 'N' 39 | return CUBLAS_OP_N 40 | elseif trans == 'T' 41 | return CUBLAS_OP_T 42 | elseif trans == 'C' 43 | return CUBLAS_OP_C 44 | else 45 | throw(ArgumentError("Unknown operation $trans")) 46 | end 47 | end 48 | 49 | function Base.convert(::Type{cublasFillMode_t}, uplo::Char) 50 | if uplo == 'U' 51 | return CUBLAS_FILL_MODE_UPPER 52 | elseif uplo == 'L' 53 | return CUBLAS_FILL_MODE_LOWER 54 | else 55 | throw(ArgumentError("Unknown fill mode $uplo")) 56 | end 57 | end 58 | 59 | function Base.convert(::Type{cublasDiagType_t}, diag::Char) 60 | if diag == 'U' 61 | return CUBLAS_DIAG_UNIT 62 | elseif diag == 'N' 63 | return CUBLAS_DIAG_NON_UNIT 64 | else 65 | throw(ArgumentError("Unknown diag mode $diag")) 66 | end 67 | end 68 | 69 | function Base.convert(::Type{cublasSideMode_t}, side::Char) 70 | if side == 'L' 71 | return CUBLAS_SIDE_LEFT 72 | elseif side == 'R' 73 | return CUBLAS_SIDE_RIGHT 74 | else 75 | throw(ArgumentError("Unknown side mode $side")) 76 | end 77 | end 78 | -------------------------------------------------------------------------------- /lib/cudadrv/module/global.jl: -------------------------------------------------------------------------------- 1 | # Module-scope global variables 2 | 3 | # TODO: improve this interface: 4 | # - should be more dict-like: get and setindex(::name), haskey(::name) 5 | # - globals(::Type)? 6 | 7 | export 8 | CuGlobal, get, set 9 | 10 | 11 | """ 12 | CuGlobal{T}(mod::CuModule, name::String) 13 | 14 | Acquires a typed global variable handle from a named global in a module. 15 | """ 16 | struct CuGlobal{T} 17 | buf::Mem.DeviceBuffer 18 | 19 | function CuGlobal{T}(mod::CuModule, name::String) where T 20 | ptr_ref = Ref{CuPtr{Cvoid}}() 21 | nbytes_ref = Ref{Csize_t}() 22 | cuModuleGetGlobal_v2(ptr_ref, nbytes_ref, mod, name) 23 | if nbytes_ref[] != sizeof(T) 24 | throw(ArgumentError("size of global '$name' does not match type parameter type $T")) 25 | end 26 | buf = Mem.DeviceBuffer(ptr_ref[], nbytes_ref[], false) 27 | 28 | return new{T}(buf) 29 | end 30 | end 31 | 32 | Base.cconvert(::Type{CuPtr{Cvoid}}, var::CuGlobal) = var.buf 33 | 34 | Base.:(==)(a::CuGlobal, b::CuGlobal) = a.handle == b.handle 35 | Base.hash(var::CuGlobal, h::UInt) = hash(var.ptr, h) 36 | 37 | """ 38 | eltype(var::CuGlobal) 39 | 40 | Return the element type of a global variable object. 41 | """ 42 | Base.eltype(::Type{CuGlobal{T}}) where {T} = T 43 | 44 | """ 45 | Base.getindex(var::CuGlobal) 46 | 47 | Return the current value of a global variable. 48 | """ 49 | function Base.getindex(var::CuGlobal{T}; async::Bool=false, stream::CuStream=stream()) where T 50 | val_ref = Ref{T}() 51 | if async 52 | cuMemcpyDtoHAsync_v2(val_ref, var, var.buf.bytesize, stream) 53 | else 54 | cuMemcpyDtoH_v2(val_ref, var, var.buf.bytesize) 55 | end 56 | return val_ref[] 57 | end 58 | # TODO: import Base: get? 
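# Illustrative usage sketch (assumes a loaded module that defines an `Int32` global named "flag"):
#
#   md = CuModuleFile("kernel.ptx")
#   flag = CuGlobal{Int32}(md, "flag")
#   flag[] = Int32(1)    # upload a value (see `setindex!` below)
#   val = flag[]         # download the current value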
59 | 60 | """ 61 | Base.setindex(var::CuGlobal{T}, val::T) 62 | 63 | Set the value of a global variable to `val` 64 | """ 65 | function Base.setindex!(var::CuGlobal{T}, val::T; async::Bool=false, stream::CuStream=stream()) where T 66 | val_ref = Ref{T}(val) 67 | if async 68 | cuMemcpyHtoDAsync_v2(var, val_ref, var.buf.bytesize, stream) 69 | else 70 | cuMemcpyHtoD_v2(var, val_ref, var.buf.bytesize) 71 | end 72 | end 73 | -------------------------------------------------------------------------------- /lib/utils/cache.jl: -------------------------------------------------------------------------------- 1 | # a cache for library handles 2 | 3 | export HandleCache 4 | 5 | struct HandleCache{K,V} 6 | active_handles::Set{Pair{K,V}} # for debugging, and to prevent handle finalization 7 | idle_handles::Dict{K,Vector{V}} 8 | lock::ReentrantLock 9 | 10 | max_entries::Int 11 | 12 | function HandleCache{K,V}(max_entries::Int=32) where {K,V} 13 | return new{K,V}(Set{Pair{K,V}}(), Dict{K,Vector{V}}(), ReentrantLock(), max_entries) 14 | end 15 | end 16 | 17 | # remove a handle from the cache, or create a new one 18 | function Base.pop!(f::Function, cache::HandleCache{K,V}, key) where {K,V} 19 | function check_cache(f::Function=()->nothing) 20 | try 21 | GC.enable_finalizers(false) 22 | lock(cache.lock) do 23 | handle = if !haskey(cache.idle_handles, key) || isempty(cache.idle_handles[key]) 24 | f() 25 | else 26 | pop!(cache.idle_handles[key]) 27 | end 28 | 29 | if handle !== nothing 30 | push!(cache.active_handles, key=>handle) 31 | end 32 | 33 | return handle 34 | end 35 | finally 36 | GC.enable_finalizers(true) 37 | end 38 | end 39 | 40 | handle = check_cache() 41 | 42 | if handle === nothing 43 | # if we didn't find anything, perform a quick GC collection to free up old handles. 44 | GC.gc(false) 45 | 46 | handle = check_cache(f) 47 | end 48 | 49 | return handle::V 50 | end 51 | 52 | # put a handle in the cache, or destroy it if it doesn't fit 53 | function Base.push!(f::Function, cache::HandleCache{K,V}, key::K, handle::V) where {K,V} 54 | # XXX: take this lock in a normal way once we have JuliaLang/julia#35689 55 | @spinlock cache.lock begin 56 | delete!(cache.active_handles, key=>handle) 57 | 58 | if haskey(cache.idle_handles, key) 59 | if length(cache.idle_handles[key]) > cache.max_entries 60 | f() 61 | else 62 | push!(cache.idle_handles[key], handle) 63 | end 64 | else 65 | cache.idle_handles[key] = [handle] 66 | end 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /res/wrap/README.md: -------------------------------------------------------------------------------- 1 | # Wrapping headers 2 | 3 | This directory contains scripts that can be used to automatically generate 4 | wrappers for C headers by NVIDIA, such as CUBLAS or CUDNN. This is done using 5 | Clang.jl, with some CSTParser.jl-based scripts to clean-up the result. 6 | 7 | In CUDA.jl, the wrappers need to know whether pointers passed into the 8 | library point to CPU or GPU memory (i.e. `Ptr` or `CuPtr`). This information is 9 | not available from the headers, and the headers will need to be reviewed up manually. 10 | 11 | 12 | 13 | # Usage 14 | 15 | Either run `wrap.jl` directly, or include it using Revise.jl and call the `main()` function. 16 | Be sure to activate the project environment in this folder, which will download CUDA from 17 | artifacts (if you want to upgrade the headers, be sure to update the relevant JLLs in the 18 | project environment). 
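A typical invocation might look as follows (a sketch; adjust paths and entry points as needed):

```julia
# from within the res/wrap folder
using Pkg
Pkg.activate(".")      # activate this folder's environment
include("wrap.jl")     # or `includet("wrap.jl")` with Revise.jl
main()                 # regenerate and post-process the wrappers
```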
19 | 20 | For each library, the script performs the following steps: 21 | 22 | - generate wrappers with Clang.jl 23 | - rewrite the headers: wrap functions that result status codes with `@check`, add calls to 24 | API initializers, etc. 25 | - apply manual patches: these are read from the `patches` folder, and can be used to 26 | incompatible code 27 | 28 | Clang.jl generates headers with two files: a `common` file with type definitions, aliases, 29 | etc, and a main wrapper that contains function definitions. The former will be copied over 30 | the existing files automatically, while for the latter we scan for changes: Removed 31 | functions are put in the `libXXX_deprecated.jl` file, new ones are concatenated to the 32 | `libXXX.jl` file. 33 | 34 | You should always review any changes to the headers! Specifically, to correct `Ptr` 35 | signature and possibly change them to: 36 | - `CuPtr`: if this pointer is a device pointer 37 | - `PtrOrCuPtr`: if this pointer can be either a device or host pointer 38 | - `Ref`: if the pointer represents a scalar or single-value argument on the host 39 | - `CuRef`: idem, but on the device 40 | - `RefOrCuRef`: idem, but either on the host or device 41 | 42 | Finally, it might also be useful to diff the generated wrapper (generated from scratch) in 43 | the `res/wrap` folder with the one in the `lib` folder (which is incrementally changed) to 44 | see if no function signatures have changed. 45 | -------------------------------------------------------------------------------- /lib/cublas/error.jl: -------------------------------------------------------------------------------- 1 | export CUBLASError 2 | 3 | struct CUBLASError <: Exception 4 | code::cublasStatus_t 5 | end 6 | 7 | Base.convert(::Type{cublasStatus_t}, err::CUBLASError) = err.code 8 | 9 | Base.showerror(io::IO, err::CUBLASError) = 10 | print(io, "CUBLASError: ", description(err), " (code $(reinterpret(Int32, err.code)), $(name(err)))") 11 | 12 | name(err::CUBLASError) = string(err.code) 13 | 14 | ## COV_EXCL_START 15 | function description(err) 16 | if err.code == CUBLAS_STATUS_SUCCESS 17 | "the operation completed successfully" 18 | elseif err.code == CUBLAS_STATUS_NOT_INITIALIZED 19 | "the library was not initialized" 20 | elseif err.code == CUBLAS_STATUS_ALLOC_FAILED 21 | "the resource allocation failed" 22 | elseif err.code == CUBLAS_STATUS_INVALID_VALUE 23 | "an invalid value was used as an argument" 24 | elseif err.code == CUBLAS_STATUS_ARCH_MISMATCH 25 | "an absent device architectural feature is required" 26 | elseif err.code == CUBLAS_STATUS_MAPPING_ERROR 27 | "an access to GPU memory space failed" 28 | elseif err.code == CUBLAS_STATUS_EXECUTION_FAILED 29 | "the GPU program failed to execute" 30 | elseif err.code == CUBLAS_STATUS_INTERNAL_ERROR 31 | "an internal operation failed" 32 | elseif err.code == CUBLAS_STATUS_NOT_SUPPORTED 33 | "the requested feature is not supported" 34 | elseif err.code == CUBLAS_STATUS_LICENSE_ERROR 35 | "error detected trying to check the license" 36 | else 37 | "no description for this error" 38 | end 39 | end 40 | ## COV_EXCL_STOP 41 | 42 | 43 | ## API call wrapper 44 | 45 | # outlined functionality to avoid GC frame allocation 46 | @noinline function throw_api_error(res) 47 | if res == CUBLAS_STATUS_ALLOC_FAILED 48 | throw(OutOfGPUMemoryError()) 49 | else 50 | throw(CUBLASError(res)) 51 | end 52 | end 53 | 54 | macro check(ex, errs...) 
55 | check = :(isequal(err, CUBLAS_STATUS_ALLOC_FAILED)) 56 | for err in errs 57 | check = :($check || isequal(err, $(esc(err)))) 58 | end 59 | 60 | quote 61 | res = @retry_reclaim err->$check $(esc(ex)) 62 | if res != CUBLAS_STATUS_SUCCESS 63 | throw_api_error(res) 64 | end 65 | 66 | nothing 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /lib/cudnn/softmax.jl: -------------------------------------------------------------------------------- 1 | """ 2 | cudnnSoftmaxForward(x; algo, mode, alpha) 3 | cudnnSoftmaxForward!(y, x; algo, mode, alpha, beta) 4 | 5 | Return the softmax or logsoftmax of the input `x` depending on the `algo` keyword argument. 6 | The `y` argument holds the result and it should be similar to `x` if specified. Keyword 7 | arguments: 8 | 9 | * `algo = (CUDA.math_mode()===CUDA.FAST_MATH ? CUDNN_SOFTMAX_FAST : CUDNN_SOFTMAX_ACCURATE)`: Options are `CUDNN_SOFTMAX_ACCURATE` which subtracts max from every point to avoid overflow, `CUDNN_SOFTMAX_FAST` which doesn't and `CUDNN_SOFTMAX_LOG` which returns logsoftmax. 10 | * `mode = CUDNN_SOFTMAX_MODE_INSTANCE`: Compute softmax per image (N) across the dimensions C,H,W. `CUDNN_SOFTMAX_MODE_CHANNEL` computes softmax per spatial location (H,W) per image (N) across the dimension C. 11 | * `alpha=1, beta=0` can be used for scaling, i.e. `y .= alpha * op(x1) .+ beta * y` 12 | """ 13 | cudnnSoftmaxForward, cudnnSoftmaxForward! 14 | 15 | # Public methods 16 | cudnnSoftmaxForward(x; o...) = cudnnSoftmaxForwardWithDefaults(x; o...) 17 | cudnnSoftmaxForward!(y, x; o...) = cudnnSoftmaxForwardWithDefaults(x; y, o...) 18 | 19 | 20 | # Private method 21 | function cudnnSoftmaxForwardWithDefaults( 22 | x; 23 | y = similar(x), 24 | algo::cudnnSoftmaxAlgorithm_t = (CUDA.math_mode()===CUDA.FAST_MATH ? CUDNN_SOFTMAX_FAST : CUDNN_SOFTMAX_ACCURATE), 25 | mode::cudnnSoftmaxMode_t = CUDNN_SOFTMAX_MODE_INSTANCE, 26 | alpha::Real = 1, 27 | beta::Real = 0, 28 | format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, 29 | xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x; format), 30 | yDesc::cudnnTensorDescriptor = xDesc, 31 | ) 32 | @assert size(y) == size(x) 33 | T = eltype(x) 34 | alpha, beta = scalingParameter(T,alpha), scalingParameter(T,beta) 35 | cudnnSoftmaxForwardAD(x; algo, mode, alpha, xDesc, beta, yDesc, y) 36 | end 37 | 38 | 39 | # AD method 40 | function cudnnSoftmaxForwardAD(x; algo, mode, alpha, xDesc, beta, yDesc, y) 41 | cudnnSoftmaxForward(handle(), algo, mode, alpha, xDesc, x, beta, yDesc, y) 42 | return y 43 | end 44 | 45 | 46 | # Deprecated methods 47 | function cudnnSoftmaxForward(x::DenseCuArray{T,4}, y::DenseCuArray{T,4}; o...) where T 48 | @warn "`cudnnSoftmaxForward(x,y)` is deprecated, please use one of the methods in `@doc cudnnSoftmaxForward`." maxlog=1 49 | cudnnSoftmaxForward!(y, x; o...) 50 | end 51 | 52 | -------------------------------------------------------------------------------- /.github/workflows/ManifestUpdater.yml: -------------------------------------------------------------------------------- 1 | name: ManifestUpdater 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * 1' 6 | workflow_dispatch: 7 | 8 | jobs: 9 | ManifestUpdater: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Get Julia compatibility 14 | id: julia_compat 15 | # NOTE: this requires a Julia compat lower-bound with minor version! 
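        # e.g. a compat entry `julia = "1.6"` in Project.toml makes this step output version "1.6" (illustrative)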
16 | run : | 17 | version=$(grep '^julia = ' Project.toml | grep -o '".*"' | cut -d '"' -f2) 18 | echo "::set-output name=version::$version" 19 | - uses: julia-actions/setup-julia@v1 20 | with: 21 | version: ${{ steps.julia_compat.outputs.version }} 22 | - name: Update packages 23 | id: pkg_update 24 | run: | 25 | log=$(julia --project -e 'using Pkg; Pkg.update()') 26 | log="${log//'%'/'%25'}" 27 | log="${log//$'\n'/'%0A'}" 28 | log="${log//$'\r'/'%0D'}" 29 | echo "::set-output name=log::$log" 30 | - name: Get status 31 | id: pkg_status 32 | run: | 33 | log=$(julia --project -e 'using Pkg; VERSION >= v"1.3" ? Pkg.status(diff=true) : Pkg.status()') 34 | log="${log//'%'/'%25'}" 35 | log="${log//$'\n'/'%0A'}" 36 | log="${log//$'\r'/'%0D'}" 37 | echo "::set-output name=log::$log" 38 | - name: Get Julia version 39 | id: version 40 | run: | 41 | log=$(julia -e "println(Base.VERSION)") 42 | echo "::set-output name=log::$log" 43 | - name: Create pull request 44 | uses: peter-evans/create-pull-request@v3 45 | with: 46 | token: ${{ secrets.GITHUB_TOKEN }} 47 | commit-message: | 48 | Update dependencies. 49 | 50 | ${{ steps.pkg_status.outputs.log }} 51 | title: Update manifest 52 | reviewers: maleadt 53 | body: | 54 | This pull request updates the manifest for Julia v${{ steps.version.outputs.log }}: 55 | 56 | ``` 57 | ${{ steps.pkg_status.outputs.log }} 58 | ``` 59 | 60 |
<details><summary>Click here for the full update log.</summary> 61 | 62 | 63 | ``` 64 | ${{ steps.pkg_update.outputs.log }} 65 | ``` 66 | 67 | </details> 68 |
69 | branch: update_manifest 70 | 71 | -------------------------------------------------------------------------------- /src/linalg.jl: -------------------------------------------------------------------------------- 1 | # integration with LinearAlgebra.jl 2 | 3 | CuMatOrAdj{T} = Union{CuMatrix, LinearAlgebra.Adjoint{T, <:CuMatrix{T}}, LinearAlgebra.Transpose{T, <:CuMatrix{T}}} 4 | CuOrAdj{T} = Union{CuVecOrMat, LinearAlgebra.Adjoint{T, <:CuVecOrMat{T}}, LinearAlgebra.Transpose{T, <:CuVecOrMat{T}}} 5 | 6 | 7 | # matrix division 8 | 9 | function Base.:\(_A::CuMatOrAdj, _B::CuOrAdj) 10 | A, B = copy(_A), copy(_B) 11 | A, ipiv = CUSOLVER.getrf!(A) 12 | return CUSOLVER.getrs!('N', A, ipiv, B) 13 | end 14 | 15 | # patch JuliaLang/julia#40899 to create a CuArray 16 | # (see https://github.com/JuliaLang/julia/pull/41331#issuecomment-868374522) 17 | if VERSION >= v"1.7-" 18 | _zeros(::Type{T}, b::AbstractVector, n::Integer) where {T} = CUDA.zeros(T, max(length(b), n)) 19 | _zeros(::Type{T}, B::AbstractMatrix, n::Integer) where {T} = CUDA.zeros(T, max(size(B, 1), n), size(B, 2)) 20 | function Base.:\(F::Union{LinearAlgebra.LAPACKFactorizations{<:Any,<:CuArray}, 21 | Adjoint{<:Any,<:LinearAlgebra.LAPACKFactorizations{<:Any,<:CuArray}}}, 22 | B::AbstractVecOrMat) 23 | m, n = size(F) 24 | if m != size(B, 1) 25 | throw(DimensionMismatch("arguments must have the same number of rows")) 26 | end 27 | 28 | TFB = typeof(oneunit(eltype(B)) / oneunit(eltype(F))) 29 | FF = Factorization{TFB}(F) 30 | 31 | # For wide problem we (often) compute a minimum norm solution. The solution 32 | # is larger than the right hand side so we use size(F, 2). 33 | BB = _zeros(TFB, B, n) 34 | 35 | if n > size(B, 1) 36 | # Underdetermined 37 | copyto!(view(BB, 1:m, :), B) 38 | else 39 | copyto!(BB, B) 40 | end 41 | 42 | ldiv!(FF, BB) 43 | 44 | # For tall problems, we compute a least squares solution so only part 45 | # of the rhs should be returned from \ while ldiv! uses (and returns) 46 | # the complete rhs 47 | return LinearAlgebra._cut_B(BB, 1:n) 48 | end 49 | end 50 | 51 | 52 | # qr 53 | 54 | using LinearAlgebra: AbstractQ 55 | 56 | # AbstractQ's `size` is the size of the full matrix, 57 | # while `Matrix(Q)` only gives the compact Q. 58 | # See JuliaLang/julia#26591 and JuliaGPU/CUDA.jl#969. 59 | CuMatrix{T}(Q::AbstractQ{S}) where {T,S} = convert(CuArray, Matrix{T}(Q)) 60 | CuMatrix(Q::AbstractQ{T}) where {T} = CuMatrix{T}(Q) 61 | CuArray{T}(Q::AbstractQ) where {T} = CuMatrix{T}(Q) 62 | CuArray(Q::AbstractQ) = CuMatrix(Q) 63 | -------------------------------------------------------------------------------- /lib/cudnn/inplace.jl: -------------------------------------------------------------------------------- 1 | """ 2 | cudnnSetTensor!(x, s) 3 | 4 | Set all elements of tensor `x` to scalar `s` and return `x`. 5 | """ 6 | function cudnnSetTensor!( 7 | x, s::Real; 8 | format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, 9 | xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x; format) 10 | ) 11 | cudnnSetTensor(handle(), xDesc, x, Ref(eltype(x)(s))) 12 | return x 13 | end 14 | 15 | 16 | """ 17 | cudnnScaleTensor(x, s) 18 | cudnnScaleTensor!(y, x, s) 19 | 20 | Scale all elements of tensor `x` with scale `s` and return the result. `cudnnScaleTensor` 21 | allocates a new array for the answer, `cudnnScaleTensor!` overwrites `y`. 22 | """ 23 | cudnnScaleTensor, cudnnScaleTensor! 
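# Usage sketch (hypothetical array sizes; only the two documented methods above are used):
#
#   x = CUDA.rand(Float32, 5, 4, 3, 2)
#   y = cudnnScaleTensor(x, 2)       # allocates a new array with y ≈ 2 .* x
#   cudnnScaleTensor!(y, x, 0.5)     # overwrites y with 0.5 .* x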
24 | 25 | function cudnnScaleTensor!( 26 | y, x, s::Real; 27 | format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, 28 | xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x; format) 29 | ) 30 | y === x || copyto!(y, x) 31 | cudnnScaleTensor(handle(), xDesc, y, Ref(eltype(y)(s))) 32 | return y 33 | end 34 | 35 | cudnnScaleTensor(x, s::Real; o...) = cudnnScaleTensor!(similar(x), x, s; o...) 36 | 37 | 38 | # cudnnAddTensor does not support all broadcasting dimensions, use cudnnOpTensor instead. 39 | # Compared to libknet8 x .+ b it is ~2x slower for (1,1,100,100), ~30% faster for (14,14,256,32) 40 | # CUDA.jl x .+ b is 2x slower than both 41 | 42 | """ 43 | cudnnAddTensor(x, b; alpha) 44 | cudnnAddTensor!(y, x, b; alpha, beta) 45 | 46 | Broadcast-add tensor `b` to tensor `x`. `alpha=1, beta=1` are used for scaling, i.e. `y .= 47 | alpha * b .+ beta * x`. `cudnnAddTensor` allocates a new array for the answer, 48 | `cudnnAddTensor!` overwrites `y`. Does not support all valid broadcasting dimensions. For 49 | more flexible broadcast operations see `cudnnOpTensor`. 50 | """ 51 | cudnnAddTensor, cudnnAddTensor! 52 | 53 | function cudnnAddTensor!( 54 | y, x, b; 55 | alpha::Real=1, 56 | beta::Real=1, 57 | format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, 58 | bDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(b; format), 59 | xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x; format), 60 | ) 61 | T = eltype(x) 62 | alpha, beta = scalingParameter(T, alpha), scalingParameter(T, beta) 63 | y === x || copyto!(y, x) 64 | cudnnAddTensor(handle(), alpha, bDesc, b, beta, xDesc, y) 65 | return y 66 | end 67 | 68 | cudnnAddTensor(x, b; o...) = cudnnAddTensor!(similar(x), x, b; o...) 69 | -------------------------------------------------------------------------------- /lib/cusparse/CUSPARSE.jl: -------------------------------------------------------------------------------- 1 | module CUSPARSE 2 | 3 | using ..APIUtils 4 | 5 | using ..CUDA 6 | using ..CUDA: CUstream, cuComplex, cuDoubleComplex, libraryPropertyType, cudaDataType 7 | using ..CUDA: libcusparse, unsafe_free!, @retry_reclaim, @context!, initialize_context 8 | 9 | using CEnum: @cenum 10 | 11 | using LinearAlgebra 12 | using LinearAlgebra: HermOrSym 13 | 14 | using Adapt: Adapt, adapt 15 | 16 | using SparseArrays 17 | 18 | const SparseChar = Char 19 | 20 | 21 | # core library 22 | include("libcusparse_common.jl") 23 | include("error.jl") 24 | include("libcusparse.jl") 25 | include("libcusparse_deprecated.jl") 26 | 27 | include("array.jl") 28 | include("util.jl") 29 | include("types.jl") 30 | 31 | # low-level wrappers 32 | include("helpers.jl") 33 | include("management.jl") 34 | include("level1.jl") 35 | include("level2.jl") 36 | include("level3.jl") 37 | include("extra.jl") 38 | include("preconditioners.jl") 39 | include("conversions.jl") 40 | include("generic.jl") 41 | 42 | # high-level integrations 43 | include("interfaces.jl") 44 | 45 | # cache for created, but unused handles 46 | const idle_handles = HandleCache{CuContext,cusparseHandle_t}() 47 | 48 | function handle() 49 | cuda = CUDA.active_state() 50 | 51 | # every task maintains library state per device 52 | LibraryState = @NamedTuple{handle::cusparseHandle_t, stream::CuStream} 53 | states = get!(task_local_storage(), :CUSPARSE) do 54 | Dict{CuContext,LibraryState}() 55 | end::Dict{CuContext,LibraryState} 56 | 57 | # get library state 58 | @noinline function new_state(cuda) 59 | new_handle = pop!(idle_handles, cuda.context) do 60 | cusparseCreate() 61 | end 62 | 63 | 
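        # once the owning task is finalized, return the handle to the idle cache
        # (the cache destroys it instead if it already holds enough entries)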
finalizer(current_task()) do task 64 | push!(idle_handles, cuda.context, new_handle) do 65 | @context! skip_destroyed=true cuda.context cusparseDestroy(new_handle) 66 | end 67 | end 68 | 69 | cusparseSetStream(new_handle, cuda.stream) 70 | 71 | (; handle=new_handle, cuda.stream) 72 | end 73 | state = get!(states, cuda.context) do 74 | new_state(cuda) 75 | end 76 | 77 | # update stream 78 | @noinline function update_stream(cuda, state) 79 | cusparseSetStream_v2(state.handle, cuda.stream) 80 | (; state.handle, cuda.stream) 81 | end 82 | if state.stream != cuda.stream 83 | states[cuda.context] = state = update_stream(cuda, state) 84 | end 85 | 86 | return state.handle 87 | end 88 | 89 | end 90 | -------------------------------------------------------------------------------- /src/device/quirks.jl: -------------------------------------------------------------------------------- 1 | macro print_and_throw(args...) 2 | quote 3 | @cuprintln "ERROR: " $(args...) "." 4 | throw(nothing) 5 | end 6 | end 7 | 8 | # math.jl 9 | @device_override @noinline Base.Math.throw_complex_domainerror(f::Symbol, x) = 10 | @print_and_throw "This operation requires a complex input to return a complex result" 11 | @device_override @noinline Base.Math.throw_exp_domainerror(f::Symbol, x) = 12 | @print_and_throw "Exponentiation yielding a complex result requires a complex argument" 13 | 14 | # intfuncs.jl 15 | @device_override @noinline Base.throw_domerr_powbysq(::Any, p) = 16 | @print_and_throw "Cannot raise an integer to a negative power" 17 | @device_override @noinline Base.throw_domerr_powbysq(::Integer, p) = 18 | @print_and_throw "Cannot raise an integer to a negative power" 19 | @device_override @noinline Base.throw_domerr_powbysq(::AbstractMatrix, p) = 20 | @print_and_throw "Cannot raise an integer to a negative power" 21 | 22 | # checked.jl 23 | @device_override @noinline Base.Checked.throw_overflowerr_binaryop(op, x, y) = 24 | @print_and_throw "Binary operation overflowed" 25 | @device_override @noinline Base.Checked.throw_overflowerr_negation(op, x, y) = 26 | @print_and_throw "Negation overflowed" 27 | 28 | # boot.jl 29 | @device_override @noinline Core.throw_inexacterror(f::Symbol, ::Type{T}, val) where {T} = 30 | @print_and_throw "Inexact conversion" 31 | 32 | # abstractarray.jl 33 | @device_override @noinline Base.throw_boundserror(A, I) = 34 | @print_and_throw "Out-of-bounds array access" 35 | 36 | # trig.jl 37 | @device_override @noinline Base.Math.sincos_domain_error(x) = 38 | @print_and_throw "sincos(x) is only defined for finite x." 39 | 40 | # multidimensional.jl 41 | @static if VERSION >= v"1.7-" 42 | # XXX: the boundscheck change in JuliaLang/julia#42119 has exposed additional issues 43 | # with bad code generation by ptxas on a = CuArray([1]) 23 | 1-element CuArray{Int64,1,Nothing}: 24 | 1 25 | 26 | julia> a[1] += 1 27 | ┌ Warning: Performing scalar indexing. 28 | │ ... 29 | └ @ GPUArrays ~/Julia/pkg/GPUArrays/src/host/indexing.jl:57 30 | 2 31 | ``` 32 | 33 | Scalar indexing is only allowed in an interactive session, e.g. the REPL, because it is 34 | convenient when porting CPU code to the GPU. If you want to disallow scalar indexing, e.g. 
35 | to verify that your application executes correctly on the GPU, call the `allowscalar` 36 | function: 37 | 38 | ```julia 39 | julia> CUDA.allowscalar(false) 40 | 41 | julia> a[1] .+ 1 42 | ERROR: scalar getindex is disallowed 43 | Stacktrace: 44 | [1] error(::String) at ./error.jl:33 45 | [2] assertscalar(::String) at GPUArrays/src/indexing.jl:14 46 | [3] getindex(::CuArray{Int64,1,Nothing}, ::Int64) at GPUArrays/src/indexing.jl:54 47 | [4] top-level scope at REPL[5]:1 48 | 49 | julia> a .+ 1 50 | 1-element CuArray{Int64,1,Nothing}: 51 | 2 52 | ``` 53 | 54 | In a non-interactive session, e.g. when running code from a script or application, scalar 55 | indexing is disallowed by default. There is no global toggle to allow scalar indexing; if 56 | you really need it, you can mark expressions using `allowscalar` with do-block syntax or 57 | `@allowscalar` macro: 58 | 59 | ```julia 60 | julia> a = CuArray([1]) 61 | 1-element CuArray{Int64, 1}: 62 | 1 63 | 64 | julia> CUDA.allowscalar(false) 65 | 66 | julia> CUDA.allowscalar() do 67 | a[1] += 1 68 | end 69 | 2 70 | 71 | julia> CUDA.@allowscalar a[1] += 1 72 | 3 73 | ``` 74 | -------------------------------------------------------------------------------- /lib/curand/error.jl: -------------------------------------------------------------------------------- 1 | export CURANDError 2 | 3 | struct CURANDError <: Exception 4 | code::curandStatus_t 5 | end 6 | 7 | Base.convert(::Type{curandStatus_t}, err::CURANDError) = err.code 8 | 9 | Base.showerror(io::IO, err::CURANDError) = 10 | print(io, "CURANDError: ", description(err), " (code $(reinterpret(Int32, err.code)), $(name(err)))") 11 | 12 | name(err::CURANDError) = string(err.code) 13 | 14 | ## COV_EXCL_START 15 | function description(err) 16 | if err.code == CURAND_STATUS_SUCCESS 17 | "generator was created successfully" 18 | elseif err.code == CURAND_STATUS_VERSION_MISMATCH 19 | "header file and linked library version do not match" 20 | elseif err.code == CURAND_STATUS_NOT_INITIALIZED 21 | "generator not initialized" 22 | elseif err.code == CURAND_STATUS_ALLOCATION_FAILED 23 | "memory allocation failed" 24 | elseif err.code == CURAND_STATUS_TYPE_ERROR 25 | "generator is wrong type" 26 | elseif err.code == CURAND_STATUS_OUT_OF_RANGE 27 | "argument out of range" 28 | elseif err.code == CURAND_STATUS_LENGTH_NOT_MULTIPLE 29 | "length requested is not a multple of dimension" 30 | elseif err.code == CURAND_STATUS_DOUBLE_PRECISION_REQUIRED 31 | "GPU does not have double precision required by MRG32k3a" 32 | elseif err.code == CURAND_STATUS_LAUNCH_FAILURE 33 | "kernel launch failure" 34 | elseif err.code == CURAND_STATUS_PREEXISTING_FAILURE 35 | "preexisting failure on library entry" 36 | elseif err.code == CURAND_STATUS_INITIALIZATION_FAILED 37 | "initialization of CUDA failed" 38 | elseif err.code == CURAND_STATUS_ARCH_MISMATCH 39 | "architecture mismatch, GPU does not support requested feature" 40 | elseif err.code == CURAND_STATUS_INTERNAL_ERROR 41 | "internal library error" 42 | else 43 | "no description for this error" 44 | end 45 | end 46 | ## COV_EXCL_STOP 47 | 48 | 49 | ## API call wrapper 50 | 51 | # outlined functionality to avoid GC frame allocation 52 | @noinline function throw_api_error(res) 53 | if res == CURAND_STATUS_ALLOCATION_FAILED 54 | throw(OutOfGPUMemoryError()) 55 | else 56 | throw(CURANDError(res)) 57 | end 58 | end 59 | 60 | macro check(ex, errs...) 
61 | check = :(isequal(err, CURAND_STATUS_ALLOCATION_FAILED)) 62 | for err in errs 63 | check = :($check || isequal(err, $(esc(err)))) 64 | end 65 | 66 | quote 67 | res = @retry_reclaim err->$check $(esc(ex)) 68 | if res != CURAND_STATUS_SUCCESS 69 | throw_api_error(res) 70 | end 71 | 72 | nothing 73 | end 74 | end 75 | -------------------------------------------------------------------------------- /docs/make.jl: -------------------------------------------------------------------------------- 1 | using Documenter, Literate 2 | using CUDA 3 | 4 | const src = "https://github.com/JuliaGPU/CUDA.jl" 5 | const dst = "https://cuda.juliagpu.org/stable/" 6 | 7 | function main() 8 | ci = get(ENV, "CI", "") == "true" 9 | 10 | @info "Building Literate.jl documentation" 11 | cd(@__DIR__) do 12 | Literate.markdown("src/tutorials/introduction.jl", "src/tutorials"; 13 | repo_root_url="$src/blob/master/docs") 14 | Literate.markdown("src/tutorials/custom_structs.jl", "src/tutorials"; 15 | repo_root_url="$src/blob/master/docs") 16 | end 17 | 18 | @info "Generating Documenter.jl site" 19 | DocMeta.setdocmeta!(CUDA, :DocTestSetup, :(using CUDA); recursive=true) 20 | makedocs( 21 | sitename = "CUDA.jl", 22 | authors = "Tim Besard", 23 | repo = "$src/blob/{commit}{path}#{line}", 24 | format = Documenter.HTML( 25 | # Use clean URLs on CI 26 | prettyurls = ci, 27 | canonical = dst, 28 | assets = ["assets/favicon.ico"], 29 | analytics = "UA-154489943-2", 30 | ), 31 | doctest = true, 32 | #strict = true, 33 | modules = [CUDA], 34 | pages = Any[ 35 | "Home" => "index.md", 36 | "Tutorials" => Any[ 37 | "tutorials/introduction.md", 38 | "tutorials/custom_structs.md", 39 | ], 40 | "Installation" => Any[ 41 | "installation/overview.md", 42 | "installation/conditional.md", 43 | "installation/troubleshooting.md", 44 | ], 45 | "Usage" => Any[ 46 | "usage/overview.md", 47 | "usage/workflow.md", 48 | "usage/array.md", 49 | "usage/memory.md", 50 | "usage/multitasking.md", 51 | "usage/multigpu.md", 52 | ], 53 | "Development" => Any[ 54 | "development/profiling.md", 55 | "development/troubleshooting.md", 56 | ], 57 | "API reference" => Any[ 58 | "api/essentials.md", 59 | "api/compiler.md", 60 | "api/kernel.md", 61 | "api/array.md", 62 | ], 63 | "Library reference" => Any[ 64 | "lib/driver.md", 65 | ], 66 | "FAQ" => "faq.md", 67 | ] 68 | ) 69 | 70 | if ci 71 | @info "Deploying to GitHub" 72 | deploydocs( 73 | repo = "github.com/JuliaGPU/CUDA.jl.git", 74 | push_preview = true 75 | ) 76 | end 77 | end 78 | 79 | isinteractive() || main() 80 | -------------------------------------------------------------------------------- /docs/src/index.md: -------------------------------------------------------------------------------- 1 | # CUDA programming in Julia 2 | 3 | The CUDA.jl package is the main entrypoint for programming NVIDIA GPUs in Julia. The package 4 | makes it possible to do so at various abstraction levels, from easy-to-use arrays down to 5 | hand-written kernels using low-level CUDA APIs. 6 | 7 | If you have any questions, please feel free to use the `#gpu` channel on the [Julia 8 | slack](https://julialang.slack.com/), or the [GPU domain of the Julia 9 | Discourse](https://discourse.julialang.org/c/domain/gpu). 
10 | 11 | 12 | ## Quick Start 13 | 14 | The Julia CUDA stack only requires a working NVIDIA driver; you don't need to install the 15 | entire CUDA toolkit, as it will automatically be downloaded when you first use the package: 16 | 17 | ```julia 18 | # install the package 19 | using Pkg 20 | Pkg.add("CUDA") 21 | 22 | # smoke test (this will download the CUDA toolkit) 23 | using CUDA 24 | CUDA.versioninfo() 25 | ``` 26 | 27 | If you want to ensure everything works as expected, you can execute the test suite: 28 | 29 | ```julia 30 | using Pkg 31 | Pkg.test("CUDA") # takes ~40 minutes if using 1 thread 32 | ``` 33 | 34 | For more details on the installation process, consult the [Installation](@ref 35 | InstallationOverview) section. To understand the toolchain in more detail, have a look at 36 | the tutorials in this manual. **It is highly recommended that new users start with the 37 | [Introduction](@ref) tutorial**. For an overview of the available functionality, read the 38 | [Usage](@ref UsageOverview) section. The following resources may also be of interest: 39 | 40 | - Effectively using GPUs with Julia: [video](https://www.youtube.com/watch?v=7Yq1UyncDNc), 41 | [slides](https://docs.google.com/presentation/d/1l-BuAtyKgoVYakJSijaSqaTL3friESDyTOnU2OLqGoA/) 42 | - How Julia is compiled to GPUs: [video](https://www.youtube.com/watch?v=Fz-ogmASMAE) 43 | 44 | 45 | ## Acknowledgements 46 | 47 | The Julia CUDA stack has been a collaborative effort by many individuals. Significant 48 | contributions have been made by the following individuals: 49 | 50 | - Tim Besard (@maleadt) (lead developer) 51 | - Valentin Churavy (@vchuravy) 52 | - Mike Innes (@MikeInnes) 53 | - Katharine Hyatt (@kshyatt) 54 | - Simon Danisch (@SimonDanisch) 55 | 56 | 57 | ## Supporting and Citing 58 | 59 | Much of the software in this ecosystem was developed as part of academic research. If you 60 | would like to help support it, please star the repository as such metrics may help us secure 61 | funding in the future. If you use our software as part of your research, teaching, or other 62 | activities, we would be grateful if you could cite our work. The 63 | [CITATION.bib](https://github.com/JuliaGPU/CUDA.jl/blob/master/CITATION.bib) file in the 64 | root of this repository lists the relevant papers. 65 | -------------------------------------------------------------------------------- /perf/byval.jl: -------------------------------------------------------------------------------- 1 | module ByVal 2 | 3 | using CUDA, BenchmarkTools, Random 4 | using CUDA: i32 5 | 6 | const threads = 256 7 | 8 | # simple add matrixes kernel 9 | function kernel_add_mat(n, x1, x2, y) 10 | i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x 11 | if i <= n 12 | @inbounds y[i] = x1[i] + x2[i] 13 | end 14 | return 15 | end 16 | 17 | @inline get_inputs3(indx_y, a, b, c) = (a, b, c) 18 | @inline get_inputs3(indx_y, a1, a2, b1, b2, c1, c2) = indx_y == 1 ? (a1, b1, c1) : (a2, b2, c2) 19 | @inline get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3) = indx_y == 1 ? (a1, b1, c1) : indx_y == 2 ? (a2, b2, c2) : (a3, b3, c3) 20 | 21 | # add arrays of matrixes kernel 22 | function kernel_add_mat_z_slices(n, vararg...) 23 | x1, x2, y = get_inputs3(blockIdx().y, vararg...) 
24 | i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x 25 | if i <= n 26 | @inbounds y[i] = x1[i] + x2[i] 27 | end 28 | return 29 | end 30 | 31 | function add_z_slices!(y, x1, x2) 32 | m1, n1 = size(x1[1]) #get size of first slice 33 | blocks = (m1 * n1 + threads - 1) ÷ threads 34 | # get length(x1) more blocks than needed to process 1 slice 35 | @cuda blocks = blocks, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...) 36 | end 37 | 38 | function add!(y, x1, x2) 39 | m1, n1 = size(x1) 40 | blocks = (m1 * n1 + threads - 1) ÷ threads 41 | @cuda blocks = blocks, 1 threads = threads kernel_add_mat(m1 * n1, x1, x2, y) 42 | end 43 | 44 | function main() 45 | results = BenchmarkGroup() 46 | 47 | num_z_slices = 3 48 | Random.seed!(1) 49 | 50 | #m, n = 7, 5 # tiny to measure overhead 51 | #m, n = 521, 111 52 | #m, n = 1521, 1111 53 | #m, n = 3001, 1511 # prime numbers to test memory access correctness 54 | m, n = 3072, 1536 # 256 multiplier 55 | #m, n = 6007, 3001 # prime numbers to test memory access correctness 56 | 57 | x1 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices] 58 | x2 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices] 59 | y1 = [similar(x1[1]) for i = 1:num_z_slices] 60 | 61 | # reference down to bones add on GPU 62 | results["reference"] = @benchmark CUDA.@sync add!($y1[1], $x1[1], $x2[1]) 63 | 64 | # adding arrays in an array 65 | for slices = 1:num_z_slices 66 | results["slices=$slices"] = @benchmark CUDA.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices]) 67 | end 68 | 69 | # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them 70 | CUDA.unsafe_free!.(x1) 71 | CUDA.unsafe_free!.(x2) 72 | CUDA.unsafe_free!.(y1) 73 | 74 | return results 75 | end 76 | 77 | end 78 | 79 | ByVal.main() 80 | -------------------------------------------------------------------------------- /test/cudnn/activation.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CUDNN: 2 | cudnnActivationForward, 3 | cudnnActivationForward!, 4 | cudnnActivationBackward, 5 | cudnnActivationDescriptor, 6 | cudnnActivationDescriptor_t, 7 | cudnnCreateActivationDescriptor, 8 | cudnnSetActivationDescriptor, 9 | cudnnGetActivationDescriptor, 10 | cudnnDestroyActivationDescriptor, 11 | cudnnActivationMode_t, 12 | CUDNN_ACTIVATION_SIGMOID, # 0 13 | CUDNN_ACTIVATION_RELU, # 1 14 | CUDNN_ACTIVATION_TANH, # 2 15 | CUDNN_ACTIVATION_CLIPPED_RELU, # 3 16 | CUDNN_ACTIVATION_ELU, # 4 17 | CUDNN_ACTIVATION_IDENTITY, # 5 18 | cudnnNanPropagation_t, 19 | CUDNN_NOT_PROPAGATE_NAN, # 0 20 | CUDNN_PROPAGATE_NAN # 1 21 | 22 | 23 | @testset "cudnn/activation" begin 24 | @test cudnnActivationDescriptor(C_NULL) isa cudnnActivationDescriptor 25 | @test Base.unsafe_convert(Ptr, cudnnActivationDescriptor(C_NULL)) isa Ptr 26 | @test cudnnActivationDescriptor(CUDNN_ACTIVATION_RELU,CUDNN_NOT_PROPAGATE_NAN,0) isa cudnnActivationDescriptor 27 | 28 | (ax,ay) = randn.((10,10)) 29 | (cx,cy) = CuArray.((ax,ay)) 30 | 31 | function activationtest(; 32 | mode=CUDNN_ACTIVATION_SIGMOID, 33 | nanOpt=CUDNN_NOT_PROPAGATE_NAN, 34 | coef=1, 35 | alpha=1, 36 | beta=0, 37 | ) 38 | fx = (mode === CUDNN_ACTIVATION_SIGMOID ? 1 ./ (1 .+ exp.(-ax)) : 39 | mode === CUDNN_ACTIVATION_RELU ? max.(0,ax) : 40 | mode === CUDNN_ACTIVATION_TANH ? tanh.(ax) : 41 | mode === CUDNN_ACTIVATION_CLIPPED_RELU ? clamp.(ax,0,coef) : 42 | mode === CUDNN_ACTIVATION_ELU ? (x->(x >= 0 ? 
x : coef*(exp(x)-1))).(ax) : 43 | error("Unknown activation")) 44 | d = cudnnActivationDescriptor(mode,nanOpt,Cfloat(coef)) 45 | y0 = alpha * fx 46 | y1 = y0 .+ beta * ay 47 | @test y0 ≈ cudnnActivationForward(cx; mode, nanOpt, coef, alpha) |> Array 48 | @test y0 ≈ cudnnActivationForward(cx, d; alpha) |> Array 49 | @test y1 ≈ cudnnActivationForward!(copy(cy), cx; mode, nanOpt, coef, alpha, beta) |> Array 50 | @test y1 ≈ cudnnActivationForward!(copy(cy), cx, d; alpha, beta) |> Array 51 | end 52 | 53 | activationtest(mode=CUDNN_ACTIVATION_SIGMOID) 54 | activationtest(mode=CUDNN_ACTIVATION_RELU) 55 | activationtest(mode=CUDNN_ACTIVATION_TANH) 56 | activationtest(mode=CUDNN_ACTIVATION_CLIPPED_RELU) 57 | activationtest(mode=CUDNN_ACTIVATION_ELU) 58 | activationtest(nanOpt=CUDNN_PROPAGATE_NAN) 59 | activationtest(coef=2,mode=CUDNN_ACTIVATION_CLIPPED_RELU) 60 | activationtest(coef=2,mode=CUDNN_ACTIVATION_ELU) 61 | activationtest(alpha=2) 62 | activationtest(beta=2) 63 | end 64 | -------------------------------------------------------------------------------- /lib/cutensor/error.jl: -------------------------------------------------------------------------------- 1 | export CUTENSORError 2 | 3 | struct CUTENSORError <: Exception 4 | code::cutensorStatus_t 5 | end 6 | 7 | Base.convert(::Type{cutensorStatus_t}, err::CUTENSORError) = err.code 8 | 9 | Base.showerror(io::IO, err::CUTENSORError) = 10 | print(io, "CUTENSORError: ", description(err), " (code $(reinterpret(Int32, err.code)), $(name(err)))") 11 | 12 | name(err::CUTENSORError) = unsafe_string(cutensorGetErrorString(err)) 13 | 14 | ## COV_EXCL_START 15 | function description(err::CUTENSORError) 16 | if err.code == CUTENSOR_STATUS_SUCCESS 17 | "the operation completed successfully" 18 | elseif err.code == CUTENSOR_STATUS_NOT_INITIALIZED 19 | "the library was not initialized" 20 | elseif err.code == CUTENSOR_STATUS_ALLOC_FAILED 21 | "the resource allocation failed" 22 | elseif err.code == CUTENSOR_STATUS_INVALID_VALUE 23 | "an invalid value was used as an argument" 24 | elseif err.code == CUTENSOR_STATUS_ARCH_MISMATCH 25 | "an absent device architectural feature is required" 26 | elseif err.code == CUTENSOR_STATUS_MAPPING_ERROR 27 | "an access to GPU memory space failed" 28 | elseif err.code == CUTENSOR_STATUS_EXECUTION_FAILED 29 | "the GPU program failed to execute" 30 | elseif err.code == CUTENSOR_STATUS_INTERNAL_ERROR 31 | "an internal operation failed" 32 | elseif err.code == CUTENSOR_STATUS_NOT_SUPPORTED 33 | "operation not supported (yet)" 34 | elseif err.code == CUTENSOR_STATUS_LICENSE_ERROR 35 | "error detected trying to check the license" 36 | elseif err.code == CUTENSOR_STATUS_CUBLAS_ERROR 37 | "error occurred during a CUBLAS operation" 38 | elseif err.code == CUTENSOR_STATUS_CUDA_ERROR 39 | "error occurred during a CUDA operation" 40 | elseif err.code == CUTENSOR_STATUS_INSUFFICIENT_WORKSPACE 41 | "insufficient workspace memory for this operation" 42 | elseif err.code == CUTENSOR_STATUS_INSUFFICIENT_DRIVER 43 | "insufficient driver version" 44 | else 45 | "no description for this error" 46 | end 47 | end 48 | ## COV_EXCL_STOP 49 | 50 | 51 | ## API call wrapper 52 | 53 | # outlined functionality to avoid GC frame allocation 54 | @noinline function throw_api_error(res) 55 | if res == CUTENSOR_STATUS_ALLOC_FAILED 56 | throw(OutOfGPUMemoryError()) 57 | else 58 | throw(CUTENSORError(res)) 59 | end 60 | end 61 | 62 | macro check(ex, errs...) 
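    # build a predicate over the returned status code: CUTENSOR_STATUS_ALLOC_FAILED, plus any extra codes passed to the macro, are treated as reclaimable out-of-memory conditions, so @retry_reclaim can free cached GPU memory and retry the call before an error is raised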
63 | check = :(isequal(err, CUTENSOR_STATUS_ALLOC_FAILED)) 64 | for err in errs 65 | check = :($check || isequal(err, $(esc(err)))) 66 | end 67 | 68 | quote 69 | res = @retry_reclaim err->$check $(esc(ex)) 70 | if res != CUTENSOR_STATUS_SUCCESS 71 | throw_api_error(res) 72 | end 73 | 74 | nothing 75 | end 76 | end 77 | -------------------------------------------------------------------------------- /test/cudnn/optensor.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CUDNN: 2 | cudnnOpTensor, 3 | cudnnOpTensor!, 4 | cudnnOpTensorDescriptor, 5 | cudnnOpTensorDescriptor_t, 6 | cudnnCreateOpTensorDescriptor, 7 | cudnnSetOpTensorDescriptor, 8 | cudnnGetOpTensorDescriptor, 9 | cudnnDestroyOpTensorDescriptor, 10 | cudnnOpTensorOp_t, 11 | CUDNN_OP_TENSOR_ADD, # 0, 12 | CUDNN_OP_TENSOR_MUL, # 1, 13 | CUDNN_OP_TENSOR_MIN, # 2, 14 | CUDNN_OP_TENSOR_MAX, # 3, 15 | CUDNN_OP_TENSOR_SQRT, # 4, performed only on first arg 16 | CUDNN_OP_TENSOR_NOT, # 5, performed only on first arg 17 | cudnnNanPropagation_t, 18 | CUDNN_NOT_PROPAGATE_NAN, # 0 19 | CUDNN_PROPAGATE_NAN, # 1 20 | cudnnDataType 21 | 22 | @testset "cudnn/optensor" begin 23 | @test cudnnOpTensorDescriptor(C_NULL) isa cudnnOpTensorDescriptor 24 | @test Base.unsafe_convert(Ptr, cudnnOpTensorDescriptor(C_NULL)) isa Ptr 25 | @test cudnnOpTensorDescriptor(CUDNN_OP_TENSOR_ADD,cudnnDataType(Float32),CUDNN_NOT_PROPAGATE_NAN) isa cudnnOpTensorDescriptor 26 | 27 | (ax1,ax2,ay) = rand.((10,10,10)) 28 | (cx1,cx2,cy) = CuArray.((ax1,ax2,ay)) 29 | 30 | function optensortest(; 31 | op=CUDNN_OP_TENSOR_ADD, 32 | nanOpt=CUDNN_NOT_PROPAGATE_NAN, 33 | compType=(eltype(ax1) <: Float64 ? Float64 : Float32), 34 | alpha1=1, 35 | alpha2=1, 36 | beta=0, 37 | ) 38 | f1 = (op === CUDNN_OP_TENSOR_ADD ? alpha1*ax1 .+ alpha2*ax2 : 39 | op === CUDNN_OP_TENSOR_MUL ? (alpha1*ax1) .* (alpha2*ax2) : 40 | op === CUDNN_OP_TENSOR_MIN ? min.(alpha1*ax1, alpha2*ax2) : 41 | op === CUDNN_OP_TENSOR_MAX ? max.(alpha1*ax1, alpha2*ax2) : 42 | op === CUDNN_OP_TENSOR_SQRT ? sqrt.(alpha1*ax1) : 43 | op === CUDNN_OP_TENSOR_NOT ? 1 .- ax1 : 44 | error("Unknown optensor")) 45 | f2 = f1 .+ beta * ay 46 | d = cudnnOpTensorDescriptor(op,cudnnDataType(compType),nanOpt) 47 | @test f1 ≈ cudnnOpTensor(cx1, cx2; op, compType, nanOpt, alpha1, alpha2) |> Array 48 | @test f1 ≈ cudnnOpTensor(cx1, cx2, d; alpha1, alpha2) |> Array 49 | @test f2 ≈ cudnnOpTensor!(copy(cy), cx1, cx2; op, compType, nanOpt, alpha1, alpha2, beta) |> Array 50 | @test f2 ≈ cudnnOpTensor!(copy(cy), cx1, cx2, d; alpha1, alpha2, beta) |> Array 51 | end 52 | 53 | optensortest(op = CUDNN_OP_TENSOR_ADD) 54 | optensortest(op = CUDNN_OP_TENSOR_MUL) 55 | optensortest(op = CUDNN_OP_TENSOR_MIN) 56 | optensortest(op = CUDNN_OP_TENSOR_MAX) 57 | optensortest(op = CUDNN_OP_TENSOR_SQRT) 58 | optensortest(op = CUDNN_OP_TENSOR_NOT) 59 | optensortest(nanOpt = CUDNN_PROPAGATE_NAN) 60 | optensortest(alpha1 = 2) 61 | optensortest(alpha2 = 2) 62 | optensortest(beta = 2) 63 | end 64 | -------------------------------------------------------------------------------- /lib/cusparse/extra.jl: -------------------------------------------------------------------------------- 1 | export geam 2 | 3 | """ 4 | geam(alpha::Number, A::CuSparseMatrix, beta::Number, B::CuSparseMatrix, index::SparseChar) 5 | 6 | Performs `C = alpha * A + beta * B`. `A` and `B` are sparse matrix defined in CSR storage format. 
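For example, `geam(one(T), A, one(T), B, 'O')` computes the sparse sum of two `CuSparseMatrixCSR{T}` matrices, with `'O'` selecting one-based indices as elsewhere in CUSPARSE. The result is returned as a new `CuSparseMatrixCSR`.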
7 | """
8 | geam(alpha::Number, A::CuSparseMatrixCSR, beta::Number, B::CuSparseMatrixCSR, index::SparseChar)
9 | 
10 | for (bname,gname,elty) in ((:cusparseScsrgeam2_bufferSizeExt, :cusparseScsrgeam2, :Float32),
11 | (:cusparseDcsrgeam2_bufferSizeExt, :cusparseDcsrgeam2, :Float64),
12 | (:cusparseCcsrgeam2_bufferSizeExt, :cusparseCcsrgeam2, :ComplexF32),
13 | (:cusparseZcsrgeam2_bufferSizeExt, :cusparseZcsrgeam2, :ComplexF64))
14 | @eval begin
15 | function geam(alpha::Number, A::CuSparseMatrixCSR{$elty}, beta::Number, B::CuSparseMatrixCSR{$elty}, index::SparseChar)
16 | m, n = size(A)
17 | (m, n) == size(B) || throw(DimensionMismatch("dimensions must match: a has dims $(axes(A)), b has dims $(axes(B))"))
18 | descrA = CuMatrixDescriptor('G', 'L', 'N', index)
19 | descrB = CuMatrixDescriptor('G', 'L', 'N', index)
20 | descrC = CuMatrixDescriptor('G', 'L', 'N', index)
21 | 
22 | rowPtrC = CuArray{Int32,1}(undef, m+1)
23 | 
24 | function bufferSize()
25 | out = Ref{Csize_t}(1)
26 | $bname(handle(), m, n,
27 | alpha, descrA, nnz(A), nonzeros(A), A.rowPtr, A.colVal,
28 | beta, descrB, nnz(B), nonzeros(B), B.rowPtr, B.colVal,
29 | descrC, CuArray{$elty,1}(undef, 0), rowPtrC, CuArray{Int32,1}(undef, 0),
30 | out)
31 | return out[]
32 | end
33 | 
34 | C = with_workspace(bufferSize) do buffer
35 | function get_nnzC(buffer)
36 | nnzTotalDevHostPtr = Ref{Cint}(1)
37 | cusparseXcsrgeam2Nnz(handle(), m, n,
38 | descrA, nnz(A), A.rowPtr, A.colVal,
39 | descrB, nnz(B), B.rowPtr, B.colVal,
40 | descrC, rowPtrC, nnzTotalDevHostPtr,
41 | buffer)
42 | return nnzTotalDevHostPtr[]
43 | end
44 | 
45 | nnzC = get_nnzC(buffer)
46 | colValC = CuArray{Int32,1}(undef, Int(nnzC))
47 | nzValC = CuArray{$elty,1}(undef, Int(nnzC))
48 | 
49 | $gname(handle(), m, n,
50 | alpha, descrA, nnz(A), nonzeros(A), A.rowPtr, A.colVal,
51 | beta, descrB, nnz(B), nonzeros(B), B.rowPtr, B.colVal,
52 | descrC, nzValC, rowPtrC, colValC,
53 | buffer)
54 | return CuSparseMatrixCSR(rowPtrC, colValC, nzValC, (m, n))
55 | end
56 | C
57 | end
58 | end
59 | end
60 | -------------------------------------------------------------------------------- /lib/cudnn/activation.jl: --------------------------------------------------------------------------------
1 | """
2 | cudnnActivationForward(x; mode, nanOpt, coef, alpha)
3 | cudnnActivationForward(x, d::cudnnActivationDescriptor; alpha)
4 | cudnnActivationForward!(y, x; mode, nanOpt, coef, alpha, beta)
5 | cudnnActivationForward!(y, x, d::cudnnActivationDescriptor; alpha, beta)
6 | 
7 | Return the result of the specified elementwise activation operation applied to `x`.
8 | Optionally `y` holds the result and `d` specifies the operation. `y` should be similar to
9 | `x` if specified. Keyword arguments `alpha=1, beta=0` can be used for scaling, i.e. `y .=
10 | alpha * op.(x) .+ beta * y`. The following keyword arguments specify the operation if `d` is
11 | not given:
12 | 
13 | * `mode = CUDNN_ACTIVATION_RELU`: Options are `SIGMOID`, `RELU`, `TANH`, `CLIPPED_RELU`, `ELU`, `IDENTITY`
14 | * `nanOpt = CUDNN_NOT_PROPAGATE_NAN`: NaN propagation policy, the other option is `CUDNN_PROPAGATE_NAN`
15 | * `coef=1`: When the activation mode is set to `CUDNN_ACTIVATION_CLIPPED_RELU`, this input specifies the clipping threshold; and when the activation mode is set to `CUDNN_ACTIVATION_ELU`, this input specifies the `alpha` parameter.
16 | """
17 | cudnnActivationForward, cudnnActivationForward!
18 | 
19 | 
20 | # Public methods
21 | cudnnActivationForward(x; o...) = cudnnActivationForwardWithDefaults(x; o...)
22 | cudnnActivationForward!(y, x; o...) = cudnnActivationForwardWithDefaults(x; y, o...) 23 | cudnnActivationForward(x, d::cudnnActivationDescriptor; o...) = cudnnActivationForwardWithDefaults(x; activationDesc=d, o...) 24 | cudnnActivationForward!(y, x, d::cudnnActivationDescriptor; o...) = cudnnActivationForwardWithDefaults(x; y, activationDesc=d, o...) 25 | 26 | 27 | # Private method 28 | function cudnnActivationForwardWithDefaults( 29 | x; 30 | y = similar(x), 31 | mode::cudnnActivationMode_t = CUDNN_ACTIVATION_RELU, 32 | nanOpt::cudnnNanPropagation_t = CUDNN_NOT_PROPAGATE_NAN, 33 | coef::Real=1, 34 | activationDesc::cudnnActivationDescriptor = cudnnActivationDescriptor(mode, nanOpt, Cdouble(coef)), 35 | alpha::Real=1, 36 | beta::Real=0, 37 | xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x), 38 | yDesc::cudnnTensorDescriptor = xDesc, 39 | ) 40 | T = eltype(x) 41 | alpha, beta = scalingParameter(T,alpha), scalingParameter(T,beta) 42 | cudnnActivationForwardAD(x; activationDesc, alpha, xDesc, beta, yDesc, y) 43 | end 44 | 45 | 46 | # AD method: 47 | function cudnnActivationForwardAD(x; activationDesc, alpha, xDesc, beta, yDesc, y) 48 | cudnnActivationForward(handle(), activationDesc, alpha, xDesc, x, beta, yDesc, y) 49 | return y 50 | end 51 | 52 | 53 | # Deprecated: 54 | function cudnnActivationForward(x::DenseCuArray{T,N}, y::DenseCuArray{T,N}; o...) where {T,N} 55 | @warn "`cudnnActivationForward(x,y)` is deprecated, please use one of the methods in `@doc cudnnActivationForward`." maxlog=1 56 | cudnnActivationForward!(y, x; o...) 57 | end 58 | -------------------------------------------------------------------------------- /lib/cudadrv/error.jl: -------------------------------------------------------------------------------- 1 | # Error type and decoding functionality 2 | 3 | export CuError 4 | 5 | 6 | """ 7 | CuError(code) 8 | CuError(code, meta) 9 | 10 | Create a CUDA error object with error code `code`. The optional `meta` parameter indicates 11 | whether extra information, such as error logs, is known. 12 | """ 13 | struct CuError <: Exception 14 | code::CUresult 15 | meta::Any 16 | 17 | CuError(code, meta=nothing) = new(code, meta) 18 | end 19 | 20 | Base.convert(::Type{CUresult}, err::CuError) = err.code 21 | 22 | Base.:(==)(x::CuError,y::CuError) = x.code == y.code 23 | 24 | """ 25 | name(err::CuError) 26 | 27 | Gets the string representation of an error code. 28 | 29 | ```jldoctest 30 | julia> err = CuError(CUDA.cudaError_enum(1)) 31 | CuError(CUDA_ERROR_INVALID_VALUE) 32 | 33 | julia> name(err) 34 | "ERROR_INVALID_VALUE" 35 | ``` 36 | """ 37 | function name(err::CuError) 38 | str_ref = Ref{Cstring}() 39 | cuGetErrorName(err, str_ref) 40 | unsafe_string(str_ref[])[6:end] 41 | end 42 | 43 | """ 44 | description(err::CuError) 45 | 46 | Gets the string description of an error code. 47 | """ 48 | function description(err::CuError) 49 | if err.code == -1%UInt32 50 | "Cannot use the CUDA stub libraries." 
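        # (this code, -1 % UInt32 == 0xffffffff, is the sentinel returned when only the CUDA stub libraries are loaded; the same value is special-cased in the @check macro below)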
51 | else
52 | str_ref = Ref{Cstring}()
53 | cuGetErrorString(err, str_ref)
54 | unsafe_string(str_ref[])
55 | end
56 | end
57 | 
58 | function Base.showerror(io::IO, err::CuError)
59 | try
60 | print(io, "CUDA error: $(description(err)) (code $(reinterpret(Int32, err.code)), $(name(err)))")
61 | catch
62 | # we might throw before the library is initialized
63 | print(io, "CUDA error (code $(reinterpret(Int32, err.code)), $(err.code))")
64 | end
65 | 
66 | if err.meta !== nothing
67 | print(io, "\n")
68 | print(io, err.meta)
69 | end
70 | end
71 | 
72 | Base.show(io::IO, ::MIME"text/plain", err::CuError) = print(io, "CuError($(err.code))")
73 | 
74 | @enum_without_prefix cudaError_enum CUDA_
75 | 
76 | 
77 | ## API call wrapper
78 | 
79 | @inline function initialize_context()
80 | prepare_cuda_state()
81 | return
82 | end
83 | 
84 | # outlined functionality to avoid GC frame allocation
85 | @noinline throw_stub_error() =
86 | error("Cannot use the CUDA stub libraries. You either don't have the NVIDIA driver installed, or it is not properly discoverable.")
87 | @noinline function throw_api_error(res)
88 | if res == ERROR_OUT_OF_MEMORY
89 | throw(OutOfGPUMemoryError())
90 | else
91 | throw(CuError(res))
92 | end
93 | end
94 | 
95 | macro check(ex)
96 | quote
97 | res = $(esc(ex))
98 | if res == 0xffffffff
99 | throw_stub_error()
100 | elseif res != SUCCESS
101 | throw_api_error(res)
102 | end
103 | 
104 | nothing
105 | end
106 | end
107 | -------------------------------------------------------------------------------- /lib/cudadrv/pool.jl: --------------------------------------------------------------------------------
1 | # Stream-ordered memory allocator
2 | 
3 | export CuMemoryPool, default_memory_pool, memory_pool, memory_pool!, trim, attribute, attribute!
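# Rough usage sketch (illustrative only):
#
#   dev  = CuDevice(0)
#   pool = CuMemoryPool(dev)       # create a stream-ordered pool for this device
#   memory_pool!(dev, pool)        # make it the device's current pool
#   @assert memory_pool(dev) == pool
#   trim(pool)                     # release unused pool memory back to the system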
4 | 
5 | @enum_without_prefix CUmemAllocationType CU_MEM_
6 | @enum_without_prefix CUmemAllocationHandleType CU_MEM_
7 | 
8 | mutable struct CuMemoryPool
9 | handle::CUmemoryPool
10 | ctx::CuContext
11 | 
12 | function CuMemoryPool(dev::CuDevice;
13 | alloc_type::CUmemAllocationType=ALLOCATION_TYPE_PINNED,
14 | handle_type::CUmemAllocationHandleType=HANDLE_TYPE_NONE)
15 | props = Ref(CUmemPoolProps(
16 | alloc_type,
17 | handle_type,
18 | CUmemLocation(
19 | CU_MEM_LOCATION_TYPE_DEVICE,
20 | deviceid(dev)
21 | ),
22 | C_NULL,
23 | ntuple(i->Cuchar(0), 64)
24 | ))
25 | handle_ref = Ref{CUmemoryPool}()
26 | cuMemPoolCreate(handle_ref, props)
27 | 
28 | ctx = current_context()
29 | obj = new(handle_ref[], ctx)
30 | finalizer(unsafe_destroy!, obj)
31 | return obj
32 | end
33 | 
34 | global function default_memory_pool(dev::CuDevice)
35 | handle_ref = Ref{CUmemoryPool}()
36 | cuDeviceGetDefaultMemPool(handle_ref, dev)
37 | 
38 | ctx = current_context()
39 | new(handle_ref[], ctx)
40 | end
41 | 
42 | global function memory_pool(dev::CuDevice)
43 | handle_ref = Ref{CUmemoryPool}()
44 | cuDeviceGetMemPool(handle_ref, dev)
45 | 
46 | ctx = current_context()
47 | new(handle_ref[], ctx)
48 | end
49 | end
50 | 
51 | function unsafe_destroy!(pool::CuMemoryPool)
52 | @finalize_in_ctx pool.ctx cuMemPoolDestroy(pool)
53 | end
54 | 
55 | Base.unsafe_convert(::Type{CUmemoryPool}, pool::CuMemoryPool) = pool.handle
56 | 
57 | Base.:(==)(a::CuMemoryPool, b::CuMemoryPool) = a.handle == b.handle
58 | Base.hash(pool::CuMemoryPool, h::UInt) = hash(pool.handle, h)
59 | 
60 | memory_pool!(dev::CuDevice, pool::CuMemoryPool) = cuDeviceSetMemPool(dev, pool)
61 | 
62 | trim(pool::CuMemoryPool, bytes_to_keep::Integer=0) = cuMemPoolTrimTo(pool, bytes_to_keep)
63 | 
64 | 
65 | ## pool attributes
66 | 
67 | @enum_without_prefix CUmemPool_attribute CU_
68 | 
69 | """
70 | attribute(X, pool::CuMemoryPool, attr)
71 | 
72 | Returns attribute `attr` about `pool`. The type of the returned value depends on the
73 | attribute, and as such must be passed as the `X` parameter.
74 | """
75 | function attribute(X::Type, pool::CuMemoryPool, attr::CUmemPool_attribute) where {T}
76 | value = Ref{X}()
77 | cuMemPoolGetAttribute(pool, attr, value)
78 | return value[]
79 | end
80 | 
81 | """
82 | attribute!(pool::CuMemoryPool, attr, val)
83 | 
84 | Sets attribute `attr` of memory pool `pool` to `val`.
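For example, `attribute!(pool, MEMPOOL_ATTR_RELEASE_THRESHOLD, UInt64(2^30))` would set the pool's release threshold to 1 GiB; attribute names follow the `CUmemPool_attribute` enum with the `CU_` prefix stripped (see `@enum_without_prefix` above), and the expected value type is determined by the driver.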
85 | """ 86 | function attribute!(pool::CuMemoryPool, attr::CUmemPool_attribute, value) where {T} 87 | cuMemPoolSetAttribute(pool, attr, Ref(value)) 88 | return 89 | end 90 | -------------------------------------------------------------------------------- /lib/cufft/error.jl: -------------------------------------------------------------------------------- 1 | export CUFFTError 2 | 3 | struct CUFFTError <: Exception 4 | code::cufftResult 5 | end 6 | 7 | Base.convert(::Type{cufftResult}, err::CUFFTError) = err.code 8 | 9 | Base.showerror(io::IO, err::CUFFTError) = 10 | print(io, "CUFFTError: ", description(err), " (code $(reinterpret(Int32, err.code)), $(name(err)))") 11 | 12 | name(err::CUFFTError) = string(err.code) 13 | 14 | ## COV_EXCL_START 15 | function description(err::CUFFTError) 16 | if err.code == CUFFT_SUCCESS 17 | "the operation completed successfully" 18 | elseif err.code == CUFFT_INVALID_PLAN 19 | "cuFFT was passed an invalid plan handle" 20 | elseif err.code == CUFFT_ALLOC_FAILED 21 | "cuFFT failed to allocate GPU or CPU memory" 22 | elseif err.code == CUFFT_INVALID_TYPE 23 | "cuFFT invalid type " # No longer used 24 | elseif err.code == CUFFT_INVALID_VALUE 25 | "user specified an invalid pointer or parameter" 26 | elseif err.code == CUFFT_INTERNAL_ERROR 27 | "driver or internal cuFFT library error" 28 | elseif err.code == CUFFT_EXEC_FAILED 29 | "failed to execute an FFT on the GPU" 30 | elseif err.code == CUFFT_SETUP_FAILED 31 | "the cuFFT library failed to initialize" 32 | elseif err.code == CUFFT_INVALID_SIZE 33 | "user specified an invalid transform size" 34 | elseif err.code == CUFFT_UNALIGNED_DATA 35 | "cuFFT unaligned data" # No longer used 36 | elseif err.code == CUFFT_INCOMPLETE_PARAMETER_LIST 37 | "missing parameters in call" 38 | elseif err.code == CUFFT_INVALID_DEVICE 39 | "execution of a plan was on different GPU than plan creation" 40 | elseif err.code == CUFFT_PARSE_ERROR 41 | "internal plan database error" 42 | elseif err.code == CUFFT_NO_WORKSPACE 43 | "no workspace has been provided prior to plan execution" 44 | elseif err.code == CUFFT_NOT_IMPLEMENTED 45 | "function does not implement functionality for parameters given." 46 | elseif err.code == CUFFT_LICENSE_ERROR 47 | "cuFFT license error" # Used in previous versions. 48 | elseif err.code == CUFFT_NOT_SUPPORTED 49 | "operation is not supported for parameters given." 50 | else 51 | "no description for this error" 52 | end 53 | end 54 | ## COV_EXCL_STOP 55 | 56 | 57 | ## API call wrapper 58 | 59 | # outlined functionality to avoid GC frame allocation 60 | @noinline function throw_api_error(res) 61 | if res == CUFFT_ALLOC_FAILED 62 | throw(OutOfGPUMemoryError()) 63 | else 64 | throw(CUFFTError(res)) 65 | end 66 | end 67 | 68 | macro check(ex, errs...) 
69 | check = :(isequal(err, CUFFT_ALLOC_FAILED)) 70 | for err in errs 71 | check = :($check || isequal(err, $(esc(err)))) 72 | end 73 | 74 | quote 75 | res = @retry_reclaim err->$check $(esc(ex)) 76 | if res != CUFFT_SUCCESS 77 | throw_api_error(res) 78 | end 79 | 80 | nothing 81 | end 82 | end 83 | -------------------------------------------------------------------------------- /docs/Manifest.toml: -------------------------------------------------------------------------------- 1 | # This file is machine-generated - editing it directly is not advised 2 | 3 | [[ANSIColoredPrinters]] 4 | git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c" 5 | uuid = "a4c015fc-c6ff-483c-b24f-f7ea428134e9" 6 | version = "0.0.1" 7 | 8 | [[Base64]] 9 | uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" 10 | 11 | [[Dates]] 12 | deps = ["Printf"] 13 | uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" 14 | 15 | [[DocStringExtensions]] 16 | deps = ["LibGit2"] 17 | git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" 18 | uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" 19 | version = "0.8.5" 20 | 21 | [[Documenter]] 22 | deps = ["ANSIColoredPrinters", "Base64", "Dates", "DocStringExtensions", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] 23 | git-tree-sha1 = "8b43e37cfb4f4edc2b6180409acc0cebce7fede8" 24 | uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" 25 | version = "0.27.7" 26 | 27 | [[IOCapture]] 28 | deps = ["Logging", "Random"] 29 | git-tree-sha1 = "f7be53659ab06ddc986428d3a9dcc95f6fa6705a" 30 | uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89" 31 | version = "0.2.2" 32 | 33 | [[InteractiveUtils]] 34 | deps = ["Markdown"] 35 | uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" 36 | 37 | [[JSON]] 38 | deps = ["Dates", "Mmap", "Parsers", "Unicode"] 39 | git-tree-sha1 = "8076680b162ada2a031f707ac7b4953e30667a37" 40 | uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" 41 | version = "0.21.2" 42 | 43 | [[LibGit2]] 44 | deps = ["Base64", "NetworkOptions", "Printf", "SHA"] 45 | uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" 46 | 47 | [[Literate]] 48 | deps = ["Base64", "IOCapture", "JSON", "REPL"] 49 | git-tree-sha1 = "bbebc3c14dbfbe76bfcbabf0937481ac84dc86ef" 50 | uuid = "98b081ad-f1c9-55d3-8b20-4c87d4299306" 51 | version = "2.9.3" 52 | 53 | [[Logging]] 54 | uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" 55 | 56 | [[Markdown]] 57 | deps = ["Base64"] 58 | uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" 59 | 60 | [[Mmap]] 61 | uuid = "a63ad114-7e13-5084-954f-fe012c677804" 62 | 63 | [[NetworkOptions]] 64 | uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" 65 | 66 | [[Parsers]] 67 | deps = ["Dates"] 68 | git-tree-sha1 = "a8709b968a1ea6abc2dc1967cb1db6ac9a00dfb6" 69 | uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" 70 | version = "2.0.5" 71 | 72 | [[Printf]] 73 | deps = ["Unicode"] 74 | uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" 75 | 76 | [[REPL]] 77 | deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] 78 | uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" 79 | 80 | [[Random]] 81 | deps = ["Serialization"] 82 | uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" 83 | 84 | [[SHA]] 85 | uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" 86 | 87 | [[Serialization]] 88 | uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" 89 | 90 | [[Sockets]] 91 | uuid = "6462fe0b-24de-5631-8697-dd941f90decc" 92 | 93 | [[Test]] 94 | deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] 95 | uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 96 | 97 | [[Unicode]] 98 | uuid = 
"4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" 99 | -------------------------------------------------------------------------------- /test/exceptions.jl: -------------------------------------------------------------------------------- 1 | # NVIDIA bug 3263616: compute-sanitizer crashes when generating host backtraces, 2 | # but --show-backtrace=no does not survive execve. 3 | @not_if_sanitize begin 4 | 5 | # these tests spawn subprocesses, so reset the current context to conserve memory 6 | CUDA.can_reset_device() && device_reset!() 7 | 8 | host_error_re = r"ERROR: (KernelException: exception thrown during kernel execution on device|CUDA error: an illegal instruction was encountered|CUDA error: unspecified launch failure)" 9 | device_error_re = r"ERROR: a \w+ was thrown during kernel execution" 10 | 11 | @testset "stack traces at different debug levels" begin 12 | 13 | script = """ 14 | function kernel(arr, val) 15 | arr[1] = val 16 | return 17 | end 18 | 19 | cpu = zeros(Int) 20 | gpu = CuArray(cpu) 21 | @cuda kernel(gpu, 1.2) 22 | synchronize() 23 | 24 | # FIXME: on some platforms (Windows...), for some users, the exception flag change 25 | # doesn't immediately propagate to the host, and gets caught during finalization. 26 | # this looks like a driver bug, since we threadfence_system() after setting the flag. 27 | # https://stackoverflow.com/questions/16417346/cuda-pinned-memory-flushing-from-the-device 28 | sleep(1) 29 | synchronize() 30 | """ 31 | 32 | # NOTE: kernel exceptions aren't always caught on the CPU as a KernelException. 33 | # on older devices, we emit a `trap` which causes a CUDA error... 34 | # 35 | 36 | let (code, out, err) = julia_script(script, `-g0`) 37 | @test code == 1 38 | @test occursin(host_error_re, err) 39 | @test !occursin(device_error_re, out) 40 | # NOTE: stdout sometimes contain a failure to free the CuArray with ILLEGAL_ACCESS 41 | end 42 | 43 | let (code, out, err) = julia_script(script, `-g1`) 44 | @test code == 1 45 | @test occursin(host_error_re, err) 46 | @test occursin(device_error_re, out) 47 | @test occursin("Run Julia on debug level 2 for device stack traces", out) 48 | end 49 | 50 | let (code, out, err) = julia_script(script, `-g2`, 51 | "JULIA_CUDA_DEBUG_INFO"=>false) # NVIDIA#3305774 52 | @test code == 1 53 | @test occursin(host_error_re, err) 54 | @test occursin(device_error_re, out) 55 | @test occursin("[1] Int64 at $(joinpath(".", "float.jl"))", out) 56 | @test occursin("[4] kernel at $(joinpath(".", "none")):5", out) 57 | end 58 | 59 | end 60 | 61 | @testset "#329" begin 62 | 63 | script = """ 64 | @noinline foo(a, i) = a[1] = i 65 | bar(a) = (foo(a, 42); nothing) 66 | 67 | ptr = reinterpret(Core.LLVMPtr{Int,AS.Global}, C_NULL) 68 | arr = CuDeviceArray{Int,1,AS.Global}((0,), ptr) 69 | 70 | CUDA.@sync @cuda bar(arr) 71 | """ 72 | 73 | let (code, out, err) = julia_script(script, `-g2`, 74 | "JULIA_CUDA_DEBUG_INFO"=>false) # NVIDIA#3305774 75 | @test code == 1 76 | @test occursin(host_error_re, err) 77 | @test occursin(device_error_re, out) 78 | @test occursin("foo at $(joinpath(".", "none")):4", out) 79 | @test occursin("bar at $(joinpath(".", "none")):5", out) 80 | end 81 | 82 | end 83 | 84 | end 85 | -------------------------------------------------------------------------------- /src/CUDA.jl: -------------------------------------------------------------------------------- 1 | module CUDA 2 | 3 | using GPUCompiler 4 | 5 | using GPUArrays 6 | 7 | using LLVM 8 | using LLVM.Interop 9 | using Core: LLVMPtr 10 | 11 | using Adapt: Adapt, adapt, 
WrappedArray 12 | 13 | using Requires: @require 14 | 15 | using LinearAlgebra 16 | 17 | using BFloat16s: BFloat16 18 | 19 | using ExprTools: splitdef, combinedef 20 | 21 | # TODO: set lib versions in bindeps or so 22 | 23 | # XXX: to be replaced by a JLL 24 | include("../deps/Deps.jl") 25 | using .Deps 26 | 27 | # only use TimerOutputs on non latency-critical CI, in part because 28 | # @timeit_debug isn't truely zero-cost (KristofferC/TimerOutputs.jl#120) 29 | if getenv("CI", false) && !getenv("BENCHMARKS", false) 30 | using TimerOutputs 31 | const to = TimerOutput() 32 | 33 | macro timeit_ci(args...) 34 | TimerOutputs.timer_expr(CUDA, false, :($CUDA.to), args...) 35 | end 36 | else 37 | macro timeit_ci(args...) 38 | esc(args[end]) 39 | end 40 | end 41 | 42 | 43 | ## source code includes 44 | 45 | include("pointer.jl") 46 | 47 | # core library 48 | include("../lib/utils/APIUtils.jl") 49 | include("../lib/cudadrv/CUDAdrv.jl") 50 | 51 | # essential stuff 52 | include("initialization.jl") 53 | include("state.jl") 54 | include("debug.jl") 55 | 56 | # device functionality (needs to be loaded first, because of generated functions) 57 | include("device/utils.jl") 58 | include("device/pointer.jl") 59 | include("device/array.jl") 60 | include("device/intrinsics.jl") 61 | include("device/runtime.jl") 62 | include("device/texture.jl") 63 | include("device/random.jl") 64 | include("device/sparse.jl") 65 | include("device/quirks.jl") 66 | 67 | # array essentials 68 | include("pool.jl") 69 | include("array.jl") 70 | 71 | # compiler libraries 72 | include("../lib/cupti/CUPTI.jl") 73 | include("../lib/nvtx/NVTX.jl") 74 | export CUPTI, NVTX 75 | 76 | # compiler implementation 77 | include("compiler/gpucompiler.jl") 78 | include("compiler/execution.jl") 79 | include("compiler/exceptions.jl") 80 | include("compiler/reflection.jl") 81 | 82 | # array implementation 83 | include("gpuarrays.jl") 84 | include("utilities.jl") 85 | include("texture.jl") 86 | 87 | # array libraries 88 | include("../lib/complex.jl") 89 | include("../lib/library_types.jl") 90 | include("../lib/cublas/CUBLAS.jl") 91 | include("../lib/cusparse/CUSPARSE.jl") 92 | include("../lib/cusolver/CUSOLVER.jl") 93 | include("../lib/cufft/CUFFT.jl") 94 | include("../lib/curand/CURAND.jl") 95 | include("../lib/cudnn/CUDNN.jl") 96 | include("../lib/cutensor/CUTENSOR.jl") 97 | export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND, CUDNN, CUTENSOR 98 | 99 | # integrations and specialized functionality 100 | include("indexing.jl") 101 | include("broadcast.jl") 102 | include("mapreduce.jl") 103 | include("accumulate.jl") 104 | include("reverse.jl") 105 | include("linalg.jl") 106 | include("iterator.jl") 107 | include("random.jl") 108 | include("sorting.jl") 109 | 110 | # other libraries 111 | include("../lib/nvml/NVML.jl") 112 | const has_nvml = NVML.has_nvml 113 | export NVML, has_nvml 114 | 115 | include("deprecated.jl") 116 | include("precompile.jl") 117 | 118 | end 119 | -------------------------------------------------------------------------------- /src/device/intrinsics/assertion.jl: -------------------------------------------------------------------------------- 1 | # Assertion (B.19) 2 | 3 | export @cuassert 4 | 5 | """ 6 | @assert cond [text] 7 | 8 | Signal assertion failure to the CUDA driver if `cond` is `false`. Preferred syntax for 9 | writing assertions, mimicking `Base.@assert`. Message `text` is optionally displayed upon 10 | assertion failure. 11 | 12 | !!! 
warning 13 | A failed assertion will crash the GPU, so use sparingly as a debugging tool. 14 | Furthermore, the assertion might be disabled at various optimization levels, and thus 15 | should not cause any side-effects. 16 | """ 17 | macro cuassert(ex, msgs...) 18 | # message handling copied from Base.@assert 19 | msg = isempty(msgs) ? ex : msgs[1] 20 | if isa(msg, AbstractString) 21 | msg = msg # pass-through 22 | elseif !isempty(msgs) && (isa(msg, Expr) || isa(msg, Symbol)) 23 | # message is an expression needing evaluating 24 | msg = :(Main.Base.string($(esc(msg)))) 25 | elseif isdefined(Main, :Base) && isdefined(Main.Base, :string) && applicable(Main.Base.string, msg) 26 | msg = Main.Base.string(msg) 27 | else 28 | # string() might not be defined during bootstrap 29 | msg = :(Main.Base.string($(Expr(:quote,msg)))) 30 | end 31 | 32 | return :($(esc(ex)) ? $(nothing) 33 | : cuassert_fail($(Val(Symbol(msg))), 34 | $(Val(__source__.file)), 35 | $(Val(__source__.line)))) 36 | end 37 | 38 | assert_counter = 0 39 | 40 | @generated function cuassert_fail(::Val{msg}, ::Val{file}, ::Val{line}) where 41 | {msg, file, line} 42 | Context() do ctx 43 | T_void = LLVM.VoidType(ctx) 44 | T_int32 = LLVM.Int32Type(ctx) 45 | T_pint8 = LLVM.PointerType(LLVM.Int8Type(ctx)) 46 | 47 | # create function 48 | llvm_f, _ = create_function(T_void) 49 | mod = LLVM.parent(llvm_f) 50 | 51 | # generate IR 52 | Builder(ctx) do builder 53 | entry = BasicBlock(llvm_f, "entry"; ctx) 54 | position!(builder, entry) 55 | 56 | global assert_counter 57 | assert_counter += 1 58 | 59 | message = globalstring_ptr!(builder, String(msg), "assert_message_$(assert_counter)") 60 | file = globalstring_ptr!(builder, String(file), "assert_file_$(assert_counter)") 61 | line = ConstantInt(T_int32, line) 62 | func = globalstring_ptr!(builder, "unknown", "assert_function_$(assert_counter)") 63 | charSize = ConstantInt(Csize_t(1); ctx) 64 | 65 | # invoke __assertfail and return 66 | # NOTE: we don't mark noreturn since that control flow might confuse ptxas 67 | assertfail_typ = 68 | LLVM.FunctionType(T_void, 69 | [T_pint8, T_pint8, T_int32, T_pint8, llvmtype(charSize)]) 70 | assertfail = LLVM.Function(mod, "__assertfail", assertfail_typ) 71 | call!(builder, assertfail, [message, file, line, func, charSize]) 72 | 73 | ret!(builder) 74 | end 75 | 76 | call_function(llvm_f, Nothing, Tuple{}) 77 | end 78 | end 79 | -------------------------------------------------------------------------------- /lib/utils/memoization.jl: -------------------------------------------------------------------------------- 1 | export @memoize 2 | 3 | """ 4 | @memoize [arg::T]... begin 5 | # expensive computation 6 | end::T 7 | 8 | Low-level, no-frills memoization macro that stores values in a thread-local, typed Dict. The 9 | types of the dictionary are derived from the syntactical type assertions. 10 | 11 | When there are no arguments to key the cache with, instead of a dictionary a simple array 12 | with per-thread elements is used. This further improves performance to 2ns per access. 13 | """ 14 | macro memoize(ex...) 15 | code = ex[end] 16 | args = ex[1:end-1] 17 | 18 | # decode the code body 19 | @assert Meta.isexpr(code, :(::)) 20 | rettyp = code.args[2] 21 | code = code.args[1] 22 | 23 | # decode the arguments 24 | argtyps = [] 25 | argvars = [] 26 | for arg in args 27 | @assert Meta.isexpr(arg, :(::)) 28 | push!(argvars, arg.args[1]) 29 | push!(argtyps, arg.args[2]) 30 | end 31 | 32 | # the global cache is an array with one entry per thread. 
if we don't have to key on 33 | # anything, that entry will be the memoized new_value, or else a dictionary of values. 34 | @gensym global_cache 35 | 36 | # generate code to access memoized values 37 | # (assuming the global_cache can be indexed with the thread ID) 38 | if isempty(args) 39 | # if we don't have to key on anything, use the global cache directly 40 | global_cache_eltyp = :(Union{Nothing,$rettyp}) 41 | ex = quote 42 | cache = get!($(esc(global_cache))) do 43 | [nothing for _ in 1:Threads.nthreads()] 44 | end 45 | cached_value = @inbounds cache[Threads.threadid()] 46 | if cached_value !== nothing 47 | cached_value 48 | else 49 | new_value = $(esc(code))::$rettyp 50 | @inbounds cache[Threads.threadid()] = new_value 51 | new_value 52 | end 53 | end 54 | else 55 | if length(args) == 1 56 | global_cache_eltyp = :(Dict{$(argtyps[1]),$rettyp}) 57 | global_init = :(Dict{$(argtyps[1]),$rettyp}()) 58 | key = :($(esc(argvars[1]))) 59 | else 60 | global_cache_eltyp = :(Dict{Tuple{$(argtyps...)},$rettyp}) 61 | global_init = :(Dict{Tuple{$(argtyps...)},$rettyp}()) 62 | key = :(tuple($(map(esc, argvars)...))) 63 | end 64 | ex = quote 65 | cache = get!($(esc(global_cache))) do 66 | [$global_init for _ in 1:Threads.nthreads()] 67 | end 68 | local_cache = @inbounds cache[Threads.threadid()] 69 | cached_value = get(local_cache, $key, nothing) 70 | if cached_value !== nothing 71 | cached_value 72 | else 73 | new_value = $(esc(code))::$rettyp 74 | local_cache[$key] = new_value 75 | new_value 76 | end 77 | end 78 | end 79 | 80 | # define the per-thread cache 81 | @eval __module__ begin 82 | const $global_cache = LazyInitialized{Vector{$(global_cache_eltyp)}}() 83 | end 84 | 85 | quote 86 | $ex 87 | end 88 | end 89 | -------------------------------------------------------------------------------- /test/device/intrinsics/math.jl: -------------------------------------------------------------------------------- 1 | using SpecialFunctions 2 | 3 | @testset "math" begin 4 | @testset "log10" begin 5 | @test testf(a->log10.(a), Float32[100]) 6 | end 7 | 8 | @testset "pow" begin 9 | for T in (Float16, Float32, Float64, ComplexF32, ComplexF64) 10 | range = (T<:Integer) ? 
(T(5):T(10)) : T 11 | @test testf((x,y)->x.^y, rand(Float32, 1), rand(range, 1)) 12 | @test testf((x,y)->x.^y, rand(Float32, 1), -rand(range, 1)) 13 | end 14 | end 15 | 16 | @testset "isinf" begin 17 | for x in (Inf32, Inf, NaN32, NaN) 18 | @test testf(x->isinf.(x), [x]) 19 | end 20 | end 21 | 22 | @testset "isnan" begin 23 | for x in (Inf32, Inf, NaN32, NaN) 24 | @test testf(x->isnan.(x), [x]) 25 | end 26 | end 27 | 28 | for op in (exp, angle, exp2, exp10,) 29 | @testset "$op" begin 30 | for T in (Float16, Float32, Float64) 31 | @test testf(x->op.(x), rand(T, 1)) 32 | @test testf(x->op.(x), -rand(T, 1)) 33 | end 34 | end 35 | end 36 | 37 | for op in (expm1,) 38 | @testset "$op" begin 39 | # FIXME: add expm1(::Float16) to Base 40 | for T in (Float32, Float64) 41 | @test testf(x->op.(x), rand(T, 1)) 42 | @test testf(x->op.(x), -rand(T, 1)) 43 | end 44 | end 45 | end 46 | 47 | for op in (exp, abs, abs2, angle, log) 48 | @testset "Complex - $op" begin 49 | for T in (ComplexF16, ComplexF32, ComplexF64) 50 | @test testf(x->op.(x), rand(T, 1)) 51 | @test testf(x->op.(x), -rand(T, 1)) 52 | end 53 | 54 | end 55 | end 56 | @testset "mod and rem" begin 57 | for T in (Float16, Float32, Float64) 58 | @test testf(a->rem.(a, T(2)), T[0, 1, 1.5, 2, -1]) 59 | @test testf(a->rem.(a, T(2), RoundNearest), T[0, 1, 1.5, 2, -1]) 60 | @test testf(a->mod.(a, T(2)), T[0, 1, 1.5, 2, -1]) 61 | end 62 | end 63 | 64 | @testset "rsqrt" begin 65 | # GPUCompiler.jl#173: a CUDA-only device function fails to validate 66 | function kernel(a) 67 | a[] = CUDA.rsqrt(a[]) 68 | return 69 | end 70 | 71 | # make sure this test uses an actual device function 72 | @test_throws ErrorException kernel(ones(1)) 73 | 74 | for T in (Float16, Float32) 75 | a = CuArray{T}([4]) 76 | @cuda kernel(a) 77 | @test Array(a) == [0.5] 78 | end 79 | end 80 | 81 | @testset "fma" begin 82 | for T in (Float16, Float32, Float64) 83 | @test testf((x,y,z)->fma.(x,y,z), rand(T, 1), rand(T, 1), rand(T, 1)) 84 | @test testf((x,y,z)->fma.(x,y,z), rand(T, 1), -rand(T, 1), -rand(T, 1)) 85 | end 86 | end 87 | 88 | # something from SpecialFunctions.jl 89 | @testset "erf" begin 90 | @test testf(a->SpecialFunctions.erf.(a), Float32[1.0]) 91 | end 92 | 93 | @testset "exp" begin 94 | # JuliaGPU/CUDA.jl#1085: exp uses Base.sincos performing a global CPU load 95 | @test testf(x->exp.(x), [1e7im]) 96 | end 97 | end 98 | -------------------------------------------------------------------------------- /docs/src/faq.md: -------------------------------------------------------------------------------- 1 | # Frequently Asked Questions 2 | 3 | This page is a compilation of frequently asked questions and answers. 4 | 5 | 6 | ## An old version of CUDA.jl keeps getting installed! 7 | 8 | Sometimes it happens that a breaking version of CUDA.jl or one of its dependencies is 9 | released. If any package you use isn't yet compatible with this release, this will block 10 | automatic upgrade of CUDA.jl. For example, with Flux.jl v0.11.1 we get CUDA.jl v1.3.3 11 | despite there being a v2.x release: 12 | 13 | ``` 14 | pkg> add Flux 15 | [587475ba] + Flux v0.11.1 16 | pkg> add CUDA 17 | [052768ef] + CUDA v1.3.3 18 | ``` 19 | 20 | To examine which package is holding back CUDA.jl, you can "force" an upgrade by specifically 21 | requesting a newer version. The resolver will then complain, and explain why this upgrade 22 | isn't possible: 23 | 24 | ``` 25 | pkg> add CUDA.jl@2 26 | Resolving package versions... 
27 | ERROR: Unsatisfiable requirements detected for package Adapt [79e6a3ab]: 28 | Adapt [79e6a3ab] log: 29 | ├─possible versions are: [0.3.0-0.3.1, 0.4.0-0.4.2, 1.0.0-1.0.1, 1.1.0, 2.0.0-2.0.2, 2.1.0, 2.2.0, 2.3.0] or uninstalled 30 | ├─restricted by compatibility requirements with CUDA [052768ef] to versions: [2.2.0, 2.3.0] 31 | │ └─CUDA [052768ef] log: 32 | │ ├─possible versions are: [0.1.0, 1.0.0-1.0.2, 1.1.0, 1.2.0-1.2.1, 1.3.0-1.3.3, 2.0.0-2.0.2] or uninstalled 33 | │ └─restricted to versions 2 by an explicit requirement, leaving only versions 2.0.0-2.0.2 34 | └─restricted by compatibility requirements with Flux [587475ba] to versions: [0.3.0-0.3.1, 0.4.0-0.4.2, 1.0.0-1.0.1, 1.1.0] — no versions left 35 | └─Flux [587475ba] log: 36 | ├─possible versions are: [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4, 0.11.0-0.11.1] or uninstalled 37 | ├─restricted to versions * by an explicit requirement, leaving only versions [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4, 0.11.0-0.11.1] 38 | └─restricted by compatibility requirements with CUDA [052768ef] to versions: [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4] or uninstalled, leaving only versions: [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4] 39 | └─CUDA [052768ef] log: see above 40 | ``` 41 | 42 | A common source of these incompatibilities is having both CUDA.jl and the older 43 | CUDAnative.jl/CuArrays.jl/CUDAdrv.jl stack installed: These are incompatible, and cannot 44 | coexist. You can inspect in the Pkg REPL which exact packages you have installed using the 45 | `status --manifest` option. 46 | 47 | 48 | ## Can you wrap this or that CUDA API? 49 | 50 | If a certain API isn't wrapped with some high-level functionality, you can always use the 51 | underlying C APIs which are always available as unexported methods. For example, you can 52 | access the CUDA driver library as `cu` prefixed, unexported functions like 53 | `CUDA.cuDriverGetVersion`. Similarly, vendor libraries like CUBLAS are available through 54 | their exported submodule handles, e.g., `CUBLAS.cublasGetVersion_v2`. 55 | 56 | Any help on designing or implementing high-level wrappers for this low-level functionality 57 | is greatly appreciated, so please consider contributing your uses of these APIs on the 58 | respective repositories. 59 | --------------------------------------------------------------------------------