├── test ├── ptx │ ├── empty.cu │ ├── global.cu │ ├── dummy.cu │ ├── empty.ptx │ ├── global.ptx │ ├── vadd_child.cu │ ├── dummy.ptx │ ├── vadd.cu │ ├── vadd_parent.cu │ ├── Makefile │ ├── vadd_child.ptx │ ├── vectorops.cu │ ├── vadd.ptx │ └── vadd_parent.ptx ├── .gitignore ├── curand.jl ├── nvtx.jl ├── iterator.jl ├── linalg.jl ├── cutensor │ ├── base.jl │ └── permutations.jl ├── apiutils.jl ├── Project.toml ├── cudnn │ ├── tensor.jl │ ├── dropout.jl │ ├── inplace.jl │ ├── softmax.jl │ ├── activation.jl │ └── optensor.jl ├── examples.jl ├── pool.jl ├── cusparse │ └── conversions.jl ├── utils.jl ├── device │ ├── sparse.jl │ ├── ldg.jl │ └── intrinsics │ │ └── math.jl ├── broadcast.jl ├── nvml.jl ├── threading.jl └── exceptions.jl ├── .github ├── FUNDING.yml ├── workflows │ ├── TagBot.yml │ ├── CompatHelper.yml │ └── ManifestUpdater.yml └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── res └── wrap │ ├── .gitignore │ ├── patches │ ├── cusolver │ │ └── cppversion.patch │ ├── cudadrv │ │ ├── cudeviceptr.patch │ │ ├── cuarray.patch │ │ └── batched_memop.patch │ ├── cudnn │ │ ├── severity.patch │ │ └── algorithm.patch │ ├── nvtx │ │ ├── unions.patch │ │ └── macro.patch │ ├── cusparse │ │ └── cppversion.patch │ └── cublas │ │ └── computetype.patch │ ├── Project.toml │ └── README.md ├── docs ├── src │ ├── assets │ │ ├── logo.png │ │ └── favicon.ico │ ├── development │ │ ├── nvvp.png │ │ ├── nsight_systems.png │ │ ├── nsight_compute-api.png │ │ ├── nsight_compute-attach.png │ │ ├── nsight_compute-kernel.png │ │ └── nsight_compute-resume.png │ ├── tutorials │ │ ├── intro1.png │ │ ├── common.jl │ │ └── custom_structs.jl │ ├── usage │ │ ├── multitasking │ │ │ ├── tasks.png │ │ │ └── tasks_pinned.png │ │ └── workflow.md │ ├── api │ │ ├── array.md │ │ ├── essentials.md │ │ └── compiler.md │ ├── installation │ │ └── troubleshooting.md │ ├── index.md │ └── faq.md ├── .gitignore ├── Project.toml ├── make.jl └── Manifest.toml ├── .gitignore ├── lib ├── cupti │ ├── wrappers.jl │ └── CUPTI.jl ├── complex.jl ├── utils │ ├── APIUtils.jl │ ├── enum.jl │ ├── threading.jl │ ├── cache.jl │ └── memoization.jl ├── nvtx │ ├── NVTX.jl │ └── highlevel.jl ├── cufft │ ├── wrappers.jl │ ├── CUFFT.jl │ ├── util.jl │ ├── libcufft_common.jl │ └── error.jl ├── cublas │ ├── libcublas_deprecated.jl │ ├── README.md │ ├── util.jl │ └── error.jl ├── cusparse │ ├── management.jl │ ├── util.jl │ ├── helpers.jl │ ├── error.jl │ ├── CUSPARSE.jl │ └── extra.jl ├── curand │ ├── wrappers.jl │ ├── CURAND.jl │ └── error.jl ├── cudnn │ ├── base.jl │ ├── error.jl │ ├── util.jl │ ├── softmax.jl │ ├── inplace.jl │ └── activation.jl ├── cudadrv │ ├── module │ │ ├── function.jl │ │ └── global.jl │ ├── types.jl │ ├── libcuda_deprecated.jl │ ├── version.jl │ ├── error.jl │ └── pool.jl ├── nvml │ ├── system.jl │ ├── error.jl │ ├── NVML.jl │ └── libnvml_deprecated.jl ├── cusolver │ ├── base.jl │ └── error.jl └── cutensor │ ├── tensor.jl │ ├── CUTENSOR.jl │ ├── interfaces.jl │ └── error.jl ├── src ├── debug.jl ├── deprecated.jl ├── device │ ├── intrinsics │ │ ├── cooperative_groups.jl │ │ ├── misc.jl │ │ ├── memory_dynamic.jl │ │ ├── version.jl │ │ └── assertion.jl │ ├── pointer.jl │ ├── intrinsics.jl │ ├── utils.jl │ └── quirks.jl ├── broadcast.jl ├── precompile.jl ├── iterator.jl ├── compiler │ └── exceptions.jl ├── gpuarrays.jl ├── linalg.jl └── CUDA.jl ├── codecov.yml ├── examples ├── driver │ ├── vadd.cu │ ├── Makefile │ ├── vadd.jl │ └── vadd.ptx ├── hello_world.jl ├── vadd.jl ├── wmma │ ├── high-level.jl │ └── low-level.jl └── 
peakflops.jl ├── deps ├── Deps.jl └── utils.jl ├── perf ├── Project.toml ├── cudadevrt.jl ├── kernel.jl ├── latency.jl └── byval.jl ├── CITATION.bib ├── LICENSE.md └── Project.toml /test/ptx/empty.cu: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/.gitignore: -------------------------------------------------------------------------------- 1 | Manifest.toml 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: JuliaLang 2 | -------------------------------------------------------------------------------- /test/ptx/global.cu: -------------------------------------------------------------------------------- 1 | __device__ int foobar; 2 | -------------------------------------------------------------------------------- /res/wrap/.gitignore: -------------------------------------------------------------------------------- 1 | LibTemplate.jl 2 | ctypes.jl 3 | lib*.jl 4 | -------------------------------------------------------------------------------- /test/ptx/dummy.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | 3 | __global__ void dummy() 4 | { 5 | } 6 | 7 | } 8 | -------------------------------------------------------------------------------- /test/ptx/empty.ptx: -------------------------------------------------------------------------------- 1 | 2 | .version 3.1 3 | .target sm_20 4 | .address_size 64 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /docs/src/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/assets/logo.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.jl.*.cov 2 | *.jl.cov 3 | *.jl.mem 4 | /docs/build/ 5 | .vscode 6 | lcov.info 7 | build/ 8 | -------------------------------------------------------------------------------- /docs/src/assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/assets/favicon.ico -------------------------------------------------------------------------------- /docs/src/development/nvvp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/development/nvvp.png -------------------------------------------------------------------------------- /docs/src/tutorials/intro1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/tutorials/intro1.png -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | 3 | # generated files 4 | src/tutorials/introduction.md 5 | src/tutorials/custom_structs.md 6 | 7 | -------------------------------------------------------------------------------- /docs/src/development/nsight_systems.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/development/nsight_systems.png -------------------------------------------------------------------------------- /docs/src/usage/multitasking/tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/usage/multitasking/tasks.png -------------------------------------------------------------------------------- /test/ptx/global.ptx: -------------------------------------------------------------------------------- 1 | 2 | .version 3.1 3 | .target sm_20 4 | .address_size 64 5 | 6 | .global .align 4 .u32 foobar; 7 | 8 | 9 | -------------------------------------------------------------------------------- /test/ptx/vadd_child.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | 3 | __device__ float add(float a, float b) 4 | { 5 | return a+b; 6 | } 7 | 8 | } 9 | -------------------------------------------------------------------------------- /docs/src/development/nsight_compute-api.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/development/nsight_compute-api.png -------------------------------------------------------------------------------- /docs/src/usage/multitasking/tasks_pinned.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/usage/multitasking/tasks_pinned.png -------------------------------------------------------------------------------- /docs/src/development/nsight_compute-attach.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/development/nsight_compute-attach.png -------------------------------------------------------------------------------- /docs/src/development/nsight_compute-kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/development/nsight_compute-kernel.png -------------------------------------------------------------------------------- /docs/src/development/nsight_compute-resume.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/CUDA.jl/master/docs/src/development/nsight_compute-resume.png -------------------------------------------------------------------------------- /lib/cupti/wrappers.jl: -------------------------------------------------------------------------------- 1 | function version() 2 | version_ref = Ref{Cuint}() 3 | cuptiGetVersion(version_ref) 4 | VersionNumber(version_ref[]) 5 | end 6 | -------------------------------------------------------------------------------- /docs/src/api/array.md: -------------------------------------------------------------------------------- 1 | # Array programming 2 | 3 | The CUDA array type, `CuArray`, generally implements the Base array interface and all of its 4 | expected methods. 
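For illustration, a minimal sketch of what that looks like in practice (array sizes and values here are arbitrary):

```julia
using CUDA

a = CuArray{Float32}(undef, 1024)   # uninitialized device array
b = CUDA.rand(Float32, 1024)        # random data generated on the GPU
a .= 2f0 .* b .+ 1f0                # broadcasting compiles to a fused GPU kernel
host = Array(a)                     # copy the result back to the CPU
```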
5 | -------------------------------------------------------------------------------- /src/debug.jl: -------------------------------------------------------------------------------- 1 | # debug functionality 2 | 3 | isdebug(group, mod=CUDA) = 4 | Base.CoreLogging.current_logger_for_env(Base.CoreLogging.Debug, group, mod) !== nothing 5 | -------------------------------------------------------------------------------- /test/ptx/dummy.ptx: -------------------------------------------------------------------------------- 1 | 2 | .version 3.1 3 | .target sm_20 4 | .address_size 64 5 | 6 | 7 | .visible .entry dummy( 8 | 9 | ) 10 | { 11 | 12 | 13 | 14 | ret; 15 | } 16 | 17 | 18 | -------------------------------------------------------------------------------- /test/ptx/vadd.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | 3 | __global__ void vadd(const float *a, const float *b, float *c) 4 | { 5 | int i = blockIdx.x *blockDim.x + threadIdx.x; 6 | c[i] = a[i] + b[i]; 7 | } 8 | 9 | } 10 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | ignore: 3 | - "lib/*/lib*.jl" 4 | - "src/device" 5 | - "res/" 6 | - "doc/" 7 | - "perf/" 8 | status: 9 | patch: false 10 | project: false 11 | changes: false 12 | -------------------------------------------------------------------------------- /examples/driver/vadd.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | 3 | __global__ void kernel_vadd(const float *a, const float *b, float *c) 4 | { 5 | int i = blockIdx.x *blockDim.x + threadIdx.x; 6 | c[i] = a[i] + b[i]; 7 | } 8 | 9 | } 10 | -------------------------------------------------------------------------------- /test/curand.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CURAND 2 | 3 | @test CURAND.version() isa VersionNumber 4 | 5 | rng = CURAND.default_rng() 6 | Random.seed!(rng) 7 | Random.seed!(rng, nothing) 8 | Random.seed!(rng, 1) 9 | Random.seed!(rng, 1, 0) 10 | -------------------------------------------------------------------------------- /examples/driver/Makefile: -------------------------------------------------------------------------------- 1 | OBJS=vadd.ptx 2 | 3 | NVCC=nvcc 4 | NVCCFLAGS= 5 | 6 | 7 | .PHONY: all 8 | all: $(OBJS) 9 | 10 | .PHONY: clean 11 | clean: 12 | $(RM) $(OBJS) 13 | 14 | 15 | %.ptx: %.cu 16 | $(NVCC) $(NVCCFLAGS) -ptx $^ -o $@ 17 | -------------------------------------------------------------------------------- /res/wrap/patches/cusolver/cppversion.patch: -------------------------------------------------------------------------------- 1 | --- a/libcusolver_common.jl 2 | +++ b/libcusolver_common.jl 3 | @@ -8,3 +7,0 @@ 4 | -const CUSOLVER_CPP_VERSION = __cplusplus 5 | -const CUSOLVER_DEPRECATED = new_func 6 | -const CUSOLVER_DEPRECATED_ENUM = new_enum 7 | -------------------------------------------------------------------------------- /test/ptx/vadd_parent.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | 3 | __device__ float add(float a, float b); 4 | 5 | __global__ void vadd(const float *a, const float *b, float *c) 6 | { 7 | int i = blockIdx.x *blockDim.x + threadIdx.x; 8 | c[i] = add(a[i], b[i]); 9 | } 10 | 11 | } 12 | -------------------------------------------------------------------------------- /deps/Deps.jl: 
-------------------------------------------------------------------------------- 1 | module Deps 2 | 3 | Base.Experimental.@compiler_options compile=min optimize=0 infer=false 4 | 5 | import ..CUDA 6 | import ..LLVM 7 | 8 | include("discovery.jl") 9 | include("compatibility.jl") 10 | include("bindeps.jl") 11 | include("utils.jl") 12 | 13 | end 14 | -------------------------------------------------------------------------------- /perf/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" 3 | HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" 4 | JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" 5 | StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" 6 | StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" 7 | -------------------------------------------------------------------------------- /docs/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" 3 | Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" 4 | Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" 5 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 6 | 7 | [compat] 8 | Documenter = "0.27" 9 | Literate = "2.2" 10 | -------------------------------------------------------------------------------- /lib/complex.jl: -------------------------------------------------------------------------------- 1 | # CUDA's complex types are defined in terms of vector types (float2, double2), 2 | # but those seem compatible with Julia's complex numbers, so use those. 3 | const cuFloatComplex = Complex{Float32} 4 | const cuDoubleComplex = Complex{Float64} 5 | 6 | # aliases 7 | const cuComplex = cuFloatComplex 8 | -------------------------------------------------------------------------------- /lib/utils/APIUtils.jl: -------------------------------------------------------------------------------- 1 | module APIUtils 2 | 3 | using ..CUDA 4 | 5 | using LLVM 6 | using LLVM.Interop 7 | 8 | # helpers that facilitate working with CUDA APIs 9 | include("call.jl") 10 | include("enum.jl") 11 | include("threading.jl") 12 | include("cache.jl") 13 | include("memoization.jl") 14 | 15 | end 16 | -------------------------------------------------------------------------------- /docs/src/api/essentials.md: -------------------------------------------------------------------------------- 1 | # Essentials 2 | 3 | 4 | ## Initialization 5 | 6 | ```@docs 7 | CUDA.functional(::Bool) 8 | has_cuda 9 | has_cuda_gpu 10 | ``` 11 | 12 | 13 | ## Global state 14 | 15 | ```@docs 16 | context 17 | context! 18 | device 19 | device! 20 | device_reset! 21 | stream 22 | stream! 
23 | ``` 24 | -------------------------------------------------------------------------------- /test/nvtx.jl: -------------------------------------------------------------------------------- 1 | # markers 2 | 3 | NVTX.mark("test") 4 | 5 | 6 | # ranges 7 | 8 | NVTX.@range "test" begin 9 | end 10 | 11 | NVTX.@range function test() 12 | end 13 | test() 14 | 15 | @eval test2() = nothing 16 | 17 | NVTX.@range function Main.test2(::Int) 18 | end 19 | 20 | NVTX.@range function Main.test2(::T) where T 21 | end 22 | 23 | NVTX.@range test3() = nothing 24 | -------------------------------------------------------------------------------- /test/ptx/Makefile: -------------------------------------------------------------------------------- 1 | SRCS=$(wildcard *.cu) 2 | OBJS=$(SRCS:.cu=.ptx) 3 | 4 | CUDA_ROOT=/usr 5 | NVCC=$(CUDA_ROOT)/bin/nvcc 6 | NVCCFLAGS=-arch=sm_20 7 | 8 | 9 | .PHONY: all 10 | all: $(OBJS) 11 | 12 | .PHONY: clean 13 | clean: 14 | $(RM) $(OBJS) 15 | 16 | 17 | %.ptx: %.cu 18 | $(NVCC) $(NVCCFLAGS) -ptx $^ -o $@ 19 | sed -i -e '/\.file/d' -e '/\.loc/d' -e '/^\/\//d' $@ 20 | -------------------------------------------------------------------------------- /res/wrap/patches/cudadrv/cudeviceptr.patch: -------------------------------------------------------------------------------- 1 | --- a/libcuda_common.jl 2019-10-15 15:11:11.826266035 +0200 2 | +++ b/libcuda_common.jl 2019-10-15 15:31:06.144762261 +0200 3 | @@ -42,7 +42,6 @@ 4 | 5 | const cuuint32_t = UInt32 6 | const cuuint64_t = UInt64 7 | -const CUdeviceptr = Culonglong 8 | const CUdevice = Cint 9 | const CUctx_st = Cvoid 10 | const CUcontext = Ptr{CUctx_st} 11 | -------------------------------------------------------------------------------- /lib/nvtx/NVTX.jl: -------------------------------------------------------------------------------- 1 | module NVTX 2 | 3 | using ..CUDA 4 | using ..CUDA: libnvtx, @checked 5 | using ..CUDA: CUstream, CUdevice, CUcontext, CUevent 6 | 7 | using CEnum: @cenum 8 | 9 | using ExprTools: splitdef, combinedef 10 | 11 | 12 | # core library 13 | initialize_context() = return 14 | include("libnvtx_common.jl") 15 | include("libnvtx.jl") 16 | 17 | include("highlevel.jl") 18 | 19 | end 20 | -------------------------------------------------------------------------------- /res/wrap/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | CSTParser = "00ebfdb7-1f24-5e51-bd34-a7502290713f" 3 | CUDA_full_jll = "4f82f1eb-248c-5f56-a42e-99106d144614" 4 | CUDNN_jll = "62b44479-cb7b-5706-934f-f13b2eb2e645" 5 | CUTENSOR_jll = "35b6c64b-1ee1-5834-92a3-3f624899209a" 6 | Clang = "40e3b903-d033-50b4-a0cc-940c62c95e31" 7 | Tokenize = "0796e94c-ce3b-5d07-9a54-7f471281c624" 8 | 9 | [compat] 10 | julia = "1.4" 11 | -------------------------------------------------------------------------------- /.github/workflows/TagBot.yml: -------------------------------------------------------------------------------- 1 | name: TagBot 2 | 3 | on: 4 | issue_comment: 5 | types: 6 | - created 7 | workflow_dispatch: 8 | 9 | jobs: 10 | TagBot: 11 | if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: JuliaRegistries/TagBot@v1 15 | with: 16 | token: ${{ secrets.GITHUB_TOKEN }} 17 | -------------------------------------------------------------------------------- /lib/cufft/wrappers.jl: -------------------------------------------------------------------------------- 1 | # wrappers of low-level functionality 2 | 3 | 
function cufftGetProperty(property::libraryPropertyType) 4 | value_ref = Ref{Cint}() 5 | cufftGetProperty(property, value_ref) 6 | value_ref[] 7 | end 8 | 9 | version() = VersionNumber(cufftGetProperty(CUDA.MAJOR_VERSION), 10 | cufftGetProperty(CUDA.MINOR_VERSION), 11 | cufftGetProperty(CUDA.PATCH_LEVEL)) 12 | -------------------------------------------------------------------------------- /res/wrap/patches/cudadrv/cuarray.patch: -------------------------------------------------------------------------------- 1 | --- a/libcuda_common.jl 2 | +++ b/libcuda_common.jl 3 | @@ -82,8 +82,6 @@ const CUmod_st = Cvoid 4 | const CUmodule = Ptr{CUmod_st} 5 | const CUfunc_st = Cvoid 6 | const CUfunction = Ptr{CUfunc_st} 7 | -const CUarray_st = Cvoid 8 | -const CUarray = Ptr{CUarray_st} 9 | const CUmipmappedArray_st = Cvoid 10 | const CUmipmappedArray = Ptr{CUmipmappedArray_st} 11 | const CUtexref_st = Cvoid 12 | -------------------------------------------------------------------------------- /test/ptx/vadd_child.ptx: -------------------------------------------------------------------------------- 1 | 2 | .version 3.1 3 | .target sm_20 4 | .address_size 64 5 | 6 | 7 | .visible .func (.param .b32 func_retval0) add( 8 | .param .b32 add_param_0, 9 | .param .b32 add_param_1 10 | ) 11 | { 12 | .reg .f32 %f<4>; 13 | 14 | 15 | ld.param.f32 %f1, [add_param_0]; 16 | ld.param.f32 %f2, [add_param_1]; 17 | add.f32 %f3, %f1, %f2; 18 | st.param.f32 [func_retval0+0], %f3; 19 | ret; 20 | } 21 | 22 | 23 | -------------------------------------------------------------------------------- /deps/utils.jl: -------------------------------------------------------------------------------- 1 | export getenv 2 | 3 | # robustly get and parse an env var 4 | function getenv(var, default::T) where T 5 | if haskey(ENV, var) 6 | result = tryparse(T, ENV[var]) 7 | if result === nothing 8 | @warn "Could not parse $(var)=$(ENV[var]), using default value '$default'" 9 | default 10 | else 11 | result 12 | end 13 | else 14 | default 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /res/wrap/patches/cudnn/severity.patch: -------------------------------------------------------------------------------- 1 | --- a/libcudnn_common.jl 2019-10-23 17:52:17.651150610 +0200 2 | +++ b/libcudnn_common.jl 2019-10-23 17:51:42.383658270 +0200 3 | @@ -26,9 +26,6 @@ 4 | end 5 | 6 | 7 | -const CUDNN_SEV_ERROR_EN = UInt32(1) << CUDNN_SEV_ERROR 8 | -const CUDNN_SEV_WARNING_EN = UInt32(1) << CUDNN_SEV_WARNING 9 | -const CUDNN_SEV_INFO_EN = UInt32(1) << CUDNN_SEV_INFO 10 | const cudnnContext = Cvoid 11 | const cudnnHandle_t = Ptr{cudnnContext} 12 | 13 | -------------------------------------------------------------------------------- /examples/hello_world.jl: -------------------------------------------------------------------------------- 1 | using CUDA 2 | 3 | if Sys.iswindows() 4 | function hello_world() 5 | @cuprintf("Greetings from block %lld, thread %lld!\n", Int64(blockIdx().x), Int64(threadIdx().x)) 6 | return 7 | end 8 | else 9 | function hello_world() 10 | @cuprintf("Greetings from block %ld, thread %ld!\n", Int64(blockIdx().x), Int64(threadIdx().x)) 11 | return 12 | end 13 | end 14 | @cuda blocks=2 threads=2 hello_world() 15 | synchronize() 16 | -------------------------------------------------------------------------------- /lib/cupti/CUPTI.jl: -------------------------------------------------------------------------------- 1 | module CUPTI 2 | 3 | using ..APIUtils 4 | 5 | using ..CUDA 6 | using 
..CUDA: libcupti, @retry_reclaim, initialize_context 7 | using ..CUDA: CUuuid, CUcontext, CUstream, CUdevice, CUdevice_attribute, 8 | CUgraph, CUgraphNode, CUgraphNodeType, CUgraphExec, CUaccessPolicyWindow 9 | 10 | using CEnum: @cenum 11 | 12 | 13 | # core library 14 | include("libcupti_common.jl") 15 | include("error.jl") 16 | include("libcupti.jl") 17 | 18 | include("wrappers.jl") 19 | 20 | end 21 | -------------------------------------------------------------------------------- /examples/driver/vadd.jl: -------------------------------------------------------------------------------- 1 | using Test 2 | 3 | using CUDA 4 | 5 | md = CuModuleFile(joinpath(@__DIR__, "vadd.ptx")) 6 | vadd = CuFunction(md, "kernel_vadd") 7 | 8 | dims = (3,4) 9 | a = round.(rand(Float32, dims) * 100) 10 | b = round.(rand(Float32, dims) * 100) 11 | c = similar(a) 12 | 13 | d_a = CuArray(a) 14 | d_b = CuArray(b) 15 | d_c = CuArray(c) 16 | 17 | len = prod(dims) 18 | cudacall(vadd, Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat}}, d_a, d_b, d_c; threads=len) 19 | 20 | @test a+b ≈ Array(d_c) 21 | -------------------------------------------------------------------------------- /examples/vadd.jl: -------------------------------------------------------------------------------- 1 | using Test 2 | 3 | using CUDA 4 | using CUDA: i32 5 | 6 | function vadd(a, b, c) 7 | i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x 8 | c[i] = a[i] + b[i] 9 | return 10 | end 11 | 12 | dims = (3,4) 13 | a = round.(rand(Float32, dims) * 100) 14 | b = round.(rand(Float32, dims) * 100) 15 | c = similar(a) 16 | 17 | d_a = CuArray(a) 18 | d_b = CuArray(b) 19 | d_c = CuArray(c) 20 | 21 | len = prod(dims) 22 | @cuda threads=len vadd(d_a, d_b, d_c) 23 | c = Array(d_c) 24 | @test a+b ≈ c 25 | -------------------------------------------------------------------------------- /res/wrap/patches/nvtx/unions.patch: -------------------------------------------------------------------------------- 1 | --- a/libnvtx_common.jl 2019-10-25 16:09:36.638690884 +0200 2 | +++ b/libnvtx_common.jl 2019-10-25 16:33:25.506013480 +0200 3 | @@ -72,6 +64,10 @@ 4 | end 5 | 6 | 7 | +struct payload_t 8 | + ullValue::UInt64 9 | +end 10 | + 11 | struct nvtxEventAttributes_v2 12 | version::UInt16 13 | size::UInt16 14 | @@ -96,6 +92,10 @@ 15 | end 16 | 17 | 18 | +struct identifier_t 19 | + ullValue::UInt64 20 | +end 21 | + 22 | struct nvtxResourceAttributes_v0 23 | version::UInt16 24 | size::UInt16 25 | -------------------------------------------------------------------------------- /src/deprecated.jl: -------------------------------------------------------------------------------- 1 | # Deprecated functionality 2 | 3 | @deprecate CuDevice(ctx::CuContext) device(ctx) 4 | @deprecate CuCurrentDevice() current_device() 5 | @deprecate CuCurrentContext() current_context() 6 | @deprecate CuContext(ptr::Union{Ptr,CuPtr}) context(ptr) 7 | @deprecate CuDevice(ptr::Union{Ptr,CuPtr}) device(ptr) 8 | 9 | @deprecate CuDefaultStream() default_stream() 10 | @deprecate CuStreamLegacy() legacy_stream() 11 | @deprecate CuStreamPerThread() per_thread_stream() 12 | @deprecate query(s::CuStream) isdone(s) 13 | @deprecate query(e::CuEvent) isdone(e) 14 | -------------------------------------------------------------------------------- /lib/cufft/CUFFT.jl: -------------------------------------------------------------------------------- 1 | module CUFFT 2 | 3 | using ..APIUtils 4 | 5 | using ..CUDA 6 | using ..CUDA: CUstream, cuComplex, cuDoubleComplex, libraryPropertyType 7 | using ..CUDA: 
libcufft, unsafe_free!, @retry_reclaim, @context!, initialize_context
8 | 
9 | using CEnum: @cenum
10 | 
11 | using Reexport: @reexport
12 | 
13 | 
14 | # core library
15 | include("libcufft_common.jl")
16 | include("error.jl")
17 | include("libcufft.jl")
18 | 
19 | # low-level wrappers
20 | include("util.jl")
21 | include("wrappers.jl")
22 | 
23 | # high-level integrations
24 | include("fft.jl")
25 | 
26 | end
27 | 
--------------------------------------------------------------------------------
/res/wrap/patches/nvtx/macro.patch:
--------------------------------------------------------------------------------
1 | --- a/libnvtx_common.jl	2019-10-25 16:09:36.638690884 +0200
2 | +++ b/libnvtx_common.jl	2019-10-25 16:25:03.653940666 +0200
3 | @@ -5,7 +5,7 @@
4 | 
5 |  # Skipping MacroDefinition: NVTX_INLINE_STATIC inline static
6 | 
7 | -const NVTX_DECLSPEC = NVTX_INLINE_STATIC
8 | +# Skipping MacroDefinition: NVTX_DECLSPEC
9 | 
10 | # Skipping MacroDefinition: NVTX_VERSIONED_IDENTIFIER_L3 ( NAME , VERSION ) NAME ## _v ## VERSION
11 | # Skipping MacroDefinition: NVTX_VERSIONED_IDENTIFIER_L2 ( NAME , VERSION ) NVTX_VERSIONED_IDENTIFIER_L3 ( NAME , VERSION )
12 | 
--------------------------------------------------------------------------------
/docs/src/tutorials/common.jl:
--------------------------------------------------------------------------------
1 | # function to run a Julia script outside of the current environment
2 | function script(code; wrapper=``, args=``)
3 |     if Base.JLOptions().project != C_NULL
4 |         args = `$args --project=$(unsafe_string(Base.JLOptions().project))`
5 |     end
6 |     mktemp() do path, io
7 |         write(io, code)
8 |         flush(io)
9 |         cmd = `$wrapper $(Base.julia_cmd()) $args $path`
10 |         # redirect stderr to stdout to have it picked up by Weave.jl
11 |         run(pipeline(ignorestatus(cmd), stderr=stdout))
12 |     end
13 |     nothing
14 | end
15 | 
--------------------------------------------------------------------------------
/lib/cublas/libcublas_deprecated.jl:
--------------------------------------------------------------------------------
1 | # Removed in CUDA 11.0
2 | 
3 | @checked function cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
4 |                               beta, C, ldc)
5 |     initialize_context()
6 |     ccall((:cublasHgemm, libcublas()), cublasStatus_t,
7 |           (cublasHandle_t, cublasOperation_t, cublasOperation_t, Cint, Cint,
8 |            Cint, RefOrCuRef{Float16}, CuPtr{Float16}, Cint, CuPtr{Float16}, Cint,
9 |            RefOrCuRef{Float16}, CuPtr{Float16}, Cint),
10 |           handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
11 | end
12 | 
--------------------------------------------------------------------------------
/lib/cusparse/management.jl:
--------------------------------------------------------------------------------
1 | # cuSPARSE functions for managing the library
2 | 
3 | function cusparseCreate()
4 |     handle = Ref{cusparseHandle_t}()
5 |     @check unsafe_cusparseCreate(handle) CUSPARSE_STATUS_NOT_INITIALIZED
6 |     handle[]
7 | end
8 | 
9 | function cusparseGetProperty(property::libraryPropertyType)
10 |     value_ref = Ref{Cint}()
11 |     cusparseGetProperty(property, value_ref)
12 |     value_ref[]
13 | end
14 | 
15 | version() = VersionNumber(cusparseGetProperty(CUDA.MAJOR_VERSION),
16 |                           cusparseGetProperty(CUDA.MINOR_VERSION),
17 |                           cusparseGetProperty(CUDA.PATCH_LEVEL))
18 | 
--------------------------------------------------------------------------------
/lib/curand/wrappers.jl:
--------------------------------------------------------------------------------
1 | # wrappers of low-level functionality
2 | 
3 | 
function curandCreateGenerator(typ) 4 | handle_ref = Ref{curandGenerator_t}() 5 | @check unsafe_curandCreateGenerator(handle_ref, typ) CURAND_STATUS_INITIALIZATION_FAILED 6 | handle_ref[] 7 | end 8 | 9 | function curandGetProperty(property::libraryPropertyType) 10 | value_ref = Ref{Cint}() 11 | curandGetProperty(property, value_ref) 12 | value_ref[] 13 | end 14 | 15 | version() = VersionNumber(curandGetProperty(CUDA.MAJOR_VERSION), 16 | curandGetProperty(CUDA.MINOR_VERSION), 17 | curandGetProperty(CUDA.PATCH_LEVEL)) 18 | -------------------------------------------------------------------------------- /res/wrap/patches/cusparse/cppversion.patch: -------------------------------------------------------------------------------- 1 | --- a/libcusparse_common.jl 2 | +++ b/libcusparse_common.jl 3 | @@ -10,7 +10,6 @@ const CUSPARSE_VER_MINOR = 1 4 | const CUSPARSE_VER_PATCH = 0 5 | const CUSPARSE_VER_BUILD = 218 6 | const CUSPARSE_VERSION = CUSPARSE_VER_MAJOR * 1000 + CUSPARSE_VER_MINOR * 100 + CUSPARSE_VER_PATCH 7 | -const CUSPARSE_CPP_VERSION = __cplusplus 8 | 9 | # Skipping MacroDefinition: CUSPARSE_DEPRECATED ( new_func ) __attribute__ ( ( deprecated ( "please use " # new_func " instead" ) ) ) 10 | # Skipping MacroDefinition: CUSPARSE_DEPRECATED_ENUM ( new_enum ) __attribute__ ( ( deprecated ( "please use " # new_enum " instead" ) ) ) 11 | -------------------------------------------------------------------------------- /res/wrap/patches/cudnn/algorithm.patch: -------------------------------------------------------------------------------- 1 | --- a/libcudnn_common.jl 2019-10-23 17:52:17.651150610 +0200 2 | +++ b/libcudnn_common.jl 2019-10-23 17:53:07.195648729 +0200 3 | @@ -403,5 +400,8 @@ 4 | -struct cudnnAlgorithmUnionStruct 5 | - algo::Algorithm 6 | -end 7 | - 8 | -const cudnnAlgorithm_t = cudnnAlgorithmUnionStruct 9 | +# FIXME: can't use such a union as the type in a ccall expression 10 | +#Algorithm = Union{cudnnConvolutionFwdAlgo_t, cudnnConvolutionBwdFilterAlgo_t, cudnnConvolutionBwdDataAlgo_t, cudnnRNNAlgo_t, cudnnCTCLossAlgo_t} 11 | +#struct cudnnAlgorithm_t 12 | +# algo::Algorithm 13 | +#end 14 | +# 15 | +#const cudnnAlgorithm_t = cudnnAlgorithmUnionStruct 16 | +const cudnnAlgorithm_t = Cint 17 | -------------------------------------------------------------------------------- /src/device/intrinsics/cooperative_groups.jl: -------------------------------------------------------------------------------- 1 | # C. Cooperative Groups 2 | 3 | export this_grid, sync_grid 4 | 5 | """ 6 | this_grid() 7 | 8 | Returns a `grid_handle` of the grid group this thread belongs to. Only available if a 9 | cooperative kernel is launched. 10 | """ 11 | this_grid() = cudaCGGetIntrinsicHandle(cudaCGScopeGrid) 12 | 13 | """ 14 | sync_grid(grid_handle::Culonglong) 15 | 16 | Waits until all threads in all blocks in the grid `grid_handle` have reached this point and 17 | all global memory accesses made by these threads prior to `sync_grid()` are visible to all 18 | threads in the grid. A 32-bit integer `cudaError_t` is returned. 
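A minimal usage sketch (illustrative only: the kernel body is hypothetical, and the kernel
must be launched cooperatively, e.g. via `@cuda cooperative=true`, for `this_grid()` to be valid):

```julia
function kernel(data)
    grid = this_grid()
    # phase 1: each thread writes its own part of `data`
    sync_grid(grid)      # wait until every thread in every block gets here
    # phase 2: it is now safe to read values written by other blocks
    return
end
# @cuda cooperative=true threads=256 blocks=4 kernel(data)
```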
19 | """ 20 | sync_grid(handle) = cudaCGSynchronize(handle, 0) 21 | -------------------------------------------------------------------------------- /test/ptx/vectorops.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | 3 | __global__ void vadd(const float *a, const float *b, float *c) 4 | { 5 | int i = blockIdx.x *blockDim.x + threadIdx.x; 6 | c[i] = a[i] + b[i]; 7 | } 8 | 9 | __global__ void vsub(const float *a, const float *b, float *c) 10 | { 11 | int i = blockIdx.x *blockDim.x + threadIdx.x; 12 | c[i] = a[i] - b[i]; 13 | } 14 | 15 | __global__ void vmul(const float *a, const float *b, float *c) 16 | { 17 | int i = blockIdx.x *blockDim.x + threadIdx.x; 18 | c[i] = a[i] * b[i]; 19 | } 20 | 21 | __global__ void vdiv(const float *a, const float *b, float *c) 22 | { 23 | int i = blockIdx.x *blockDim.x + threadIdx.x; 24 | c[i] = a[i] / b[i]; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /test/iterator.jl: -------------------------------------------------------------------------------- 1 | batch_count = 10 2 | max_batch_items = 3 3 | max_ndims = 3 4 | sizes = 20:50 5 | 6 | rand_shape = () -> rand(sizes, rand(1:max_ndims)) 7 | batches = [[rand(Float32, rand_shape()...) for _ in 1:rand(1:max_batch_items)] 8 | for _ in 1:batch_count] 9 | cubatches = CuIterator(batch for batch in batches) # ensure generators are accepted 10 | 11 | previous_cubatch = missing 12 | for (batch, cubatch) in zip(batches, cubatches) 13 | global previous_cubatch 14 | @test ismissing(previous_cubatch) || all(x -> x.storage === nothing, previous_cubatch) 15 | @test batch == Array.(cubatch) 16 | @test all(x -> x isa CuArray, cubatch) 17 | previous_cubatch = cubatch 18 | end 19 | -------------------------------------------------------------------------------- /test/linalg.jl: -------------------------------------------------------------------------------- 1 | using LinearAlgebra 2 | 3 | @testset "qr size mismatch" begin 4 | X = rand(Float32, 2, 1) 5 | Q,R = qr(X) 6 | 7 | @test collect(Q) == Array(collect(Q)) 8 | @test Array(Q) == Array(CuArray(Q)) 9 | @test Array{Float32}(Q) == Array(CuArray{Float32}(Q)) 10 | @test Matrix(Q) == Array(CuMatrix(Q)) 11 | @test Matrix{Float32}(Q) == Array(CuMatrix{Float32}(Q)) 12 | @test convert(Array, Q) == Array(convert(CuArray, Q)) 13 | @test convert(Array{Float32}, Q) == Array(convert(CuArray{Float32}, Q)) 14 | end 15 | 16 | @testset "normalize!" begin 17 | x = rand(ComplexF32, 10) 18 | dx = CuVector{ComplexF32}(x) 19 | @test isreal(norm(dx, 2)) 20 | @test norm(normalize!(dx)) ≈ 1 21 | end 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | 12 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 13 | 14 | 15 | **Describe the solution you'd like** 16 | 17 | A clear and concise description of what you want to happen. 18 | 19 | 20 | **Describe alternatives you've considered** 21 | 22 | A clear and concise description of any alternative solutions or features you've considered. 
23 | 24 | 25 | **Additional context** 26 | 27 | Add any other context or information about the feature request here. 28 | -------------------------------------------------------------------------------- /test/cutensor/base.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CUTENSOR 2 | using CUDA 3 | using LinearAlgebra 4 | 5 | @test has_cutensor() 6 | @test CUTENSOR.version() isa VersionNumber 7 | 8 | @testset "CuTensor type basics" begin 9 | N = 2 10 | dmax = 2^div(18,N) 11 | dims = rand(2:dmax, N) 12 | p = randperm(N) 13 | indsA = collect(('a':'z')[1:N]) 14 | dimsA = dims 15 | A = rand(Float64, dimsA...) 16 | dA = CuArray(A) 17 | p = randperm(N) 18 | indsA = collect(('a':'z')[1:N]) 19 | ctA = CuTensor(dA, indsA) 20 | @test length(ctA) == length(A) 21 | @test size(ctA) == size(A) 22 | @test size(ctA, 1) == size(A, 1) 23 | @test ndims(ctA) == ndims(A) 24 | @test strides(ctA) == strides(A) 25 | @test eltype(ctA) == eltype(A) 26 | end 27 | -------------------------------------------------------------------------------- /lib/cudnn/base.jl: -------------------------------------------------------------------------------- 1 | function cudnnCreate() 2 | handle_ref = Ref{cudnnHandle_t}() 3 | @check unsafe_cudnnCreate(handle_ref) CUDNN_STATUS_NOT_INITIALIZED CUDNN_STATUS_INTERNAL_ERROR 4 | return handle_ref[] 5 | end 6 | 7 | function cudnnGetProperty(property::CUDA.libraryPropertyType) 8 | value_ref = Ref{Cint}() 9 | cudnnGetProperty(property, value_ref) 10 | value_ref[] 11 | end 12 | 13 | function version() 14 | ver = cudnnGetVersion() 15 | major, ver = divrem(ver, 1000) 16 | minor, patch = divrem(ver, 10) 17 | 18 | VersionNumber(major, minor, patch) 19 | end 20 | 21 | function cuda_version() 22 | ver = cudnnGetCudartVersion() 23 | major, ver = divrem(ver, 1000) 24 | minor, patch = divrem(ver, 10) 25 | 26 | VersionNumber(major, minor, patch) 27 | end 28 | -------------------------------------------------------------------------------- /res/wrap/patches/cudadrv/batched_memop.patch: -------------------------------------------------------------------------------- 1 | --- a/libcuda.jl 2019-10-16 09:15:14.213122392 +0200 2 | +++ b/libcuda.jl 2019-10-16 09:15:29.233281015 +0200 3 | @@ -964,5 +963,0 @@ 4 | -@checked function cuStreamBatchMemOp(stream, count, paramArray, flags) 5 | - initialize_context() 6 | - ccall((:cuStreamBatchMemOp, libcuda), CUresult, (CUstream, UInt32, Ptr{CUstreamBatchMemOpParams}, UInt32), stream, count, paramArray, flags) 7 | -end 8 | - 9 | --- a/libcuda_common.jl 2019-10-16 09:15:14.289789877 +0200 10 | +++ b/libcuda_common.jl 2019-10-16 09:16:50.574087901 +0200 11 | @@ -214,6 +213,0 @@ 12 | -struct CUstreamBatchMemOpParams_union 13 | - waitValue::CUstreamMemOpWaitValueParams_st 14 | -end 15 | - 16 | -const CUstreamBatchMemOpParams = CUstreamBatchMemOpParams_union 17 | - 18 | -------------------------------------------------------------------------------- /src/broadcast.jl: -------------------------------------------------------------------------------- 1 | # broadcasting 2 | 3 | using Base.Broadcast: BroadcastStyle, Broadcasted 4 | 5 | struct CuArrayStyle{N} <: AbstractGPUArrayStyle{N} end 6 | CuArrayStyle(::Val{N}) where N = CuArrayStyle{N}() 7 | CuArrayStyle{M}(::Val{N}) where {N,M} = CuArrayStyle{N}() 8 | 9 | BroadcastStyle(::Type{<:CuArray{T,N}}) where {T,N} = CuArrayStyle{N}() 10 | 11 | Base.similar(bc::Broadcasted{CuArrayStyle{N}}, ::Type{T}) where {N,T} = 12 | similar(CuArray{T}, axes(bc)) 13 | 14 | 
Base.similar(bc::Broadcasted{CuArrayStyle{N}}, ::Type{T}, dims) where {N,T} =
15 |     CuArray{T}(undef, dims)
16 | 
17 | # broadcasting type ctors isn't GPU compatible
18 | Broadcast.broadcasted(::CuArrayStyle{N}, f::Type{T}, args...) where {N, T} =
19 |     Broadcasted{CuArrayStyle{N}}((x...) -> T(x...), args, nothing)
20 | 
--------------------------------------------------------------------------------
/lib/cudadrv/module/function.jl:
--------------------------------------------------------------------------------
1 | # Functions in modules
2 | 
3 | export
4 |     CuFunction
5 | 
6 | 
7 | """
8 |     CuFunction(mod::CuModule, name::String)
9 | 
10 | Acquires a function handle from a named function in a module.
11 | """
12 | struct CuFunction
13 |     handle::CUfunction
14 |     mod::CuModule
15 | 
16 |     "Get a handle to a kernel function in a CUDA module."
17 |     function CuFunction(mod::CuModule, name::String)
18 |         handle_ref = Ref{CUfunction}()
19 |         cuModuleGetFunction(handle_ref, mod, name)
20 |         new(handle_ref[], mod)
21 |     end
22 | end
23 | 
24 | Base.unsafe_convert(::Type{CUfunction}, fun::CuFunction) = fun.handle
25 | 
26 | Base.:(==)(a::CuFunction, b::CuFunction) = a.handle == b.handle
27 | Base.hash(fun::CuFunction, h::UInt) = hash(fun.handle, h)
28 | 
--------------------------------------------------------------------------------
/src/device/intrinsics/misc.jl:
--------------------------------------------------------------------------------
1 | export clock, nanosleep
2 | 
3 | """
4 |     clock(UInt32)
5 | 
6 | Returns the value of a per-multiprocessor counter that is incremented every clock cycle.
7 | """
8 | clock(::Type{UInt32}) = ccall("llvm.nvvm.read.ptx.sreg.clock", llvmcall, UInt32, ())
9 | 
10 | """
11 |     clock(UInt64)
12 | 
13 | Returns the value of a per-multiprocessor counter that is incremented every clock cycle.
14 | """
15 | clock(::Type{UInt64}) = ccall("llvm.nvvm.read.ptx.sreg.clock64", llvmcall, UInt64, ())
16 | 
17 | 
18 | """
19 |     nanosleep(t)
20 | 
21 | Puts a thread to sleep for a given amount of time `t` (in nanoseconds).
22 | 
23 | !!! 
note 24 | Requires CUDA >= 10.0 and sm_6.2 25 | """ 26 | @inline function nanosleep(t::Unsigned) 27 | @asmcall("nanosleep.u32 \$0;", "r", true, 28 | Cvoid, Tuple{UInt32}, convert(UInt32, t)) 29 | end 30 | -------------------------------------------------------------------------------- /lib/cusparse/util.jl: -------------------------------------------------------------------------------- 1 | # utility functions for the CUSPARSE wrappers 2 | 3 | """ 4 | check that the dimensions of matrix `X` and vector `Y` make sense for a multiplication 5 | """ 6 | function chkmvdims(X, n, Y, m) 7 | if length(X) != n 8 | throw(DimensionMismatch("X must have length $n, but has length $(length(X))")) 9 | elseif length(Y) != m 10 | throw(DimensionMismatch("Y must have length $m, but has length $(length(Y))")) 11 | end 12 | end 13 | 14 | """ 15 | check that the dimensions of matrices `X` and `Y` make sense for a multiplication 16 | """ 17 | function chkmmdims( B, C, k, l, m, n ) 18 | if size(B) != (k,l) 19 | throw(DimensionMismatch("B has dimensions $(size(B)) but needs ($k,$l)")) 20 | elseif size(C) != (m,n) 21 | throw(DimensionMismatch("C has dimensions $(size(C)) but needs ($m,$n)")) 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /src/precompile.jl: -------------------------------------------------------------------------------- 1 | 2 | # installation management 3 | precompile(__init_toolkit__, ()) 4 | precompile(libcuda, ()) 5 | 6 | # array 7 | precompile(CuArray, (Vector{Int},)) 8 | 9 | # compilation 10 | precompile(CUDACompilerTarget, (CuDevice,)) 11 | precompile(cufunction_compile, (CompilerJob,)) 12 | precompile(cufunction_link, (CompilerJob,NamedTuple{(:image, :entry, :external_gvars), Tuple{Vector{UInt8}, String, Vector{String}}})) 13 | precompile(cufunction_cache, (CuContext,)) 14 | precompile(create_exceptions!, (CuModule,)) 15 | precompile(run_and_collect, (Cmd,)) 16 | 17 | # launch 18 | precompile(cudaconvert, (Function,)) 19 | precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}})) 20 | precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction)) 21 | -------------------------------------------------------------------------------- /test/apiutils.jl: -------------------------------------------------------------------------------- 1 | using CUDA.APIUtils 2 | 3 | @testset "@enum_without_prefix" begin 4 | mod = @eval module $(gensym()) 5 | using CUDA.APIUtils 6 | @enum MY_ENUM MY_ENUM_VALUE 7 | @enum_without_prefix MY_ENUM MY_ 8 | end 9 | 10 | @test mod.ENUM_VALUE == mod.MY_ENUM_VALUE 11 | end 12 | 13 | @testset "@checked" begin 14 | mod = @eval module $(gensym()) 15 | using CUDA.APIUtils 16 | 17 | const checks = Ref(0) 18 | macro check(ex) 19 | esc(quote 20 | $checks[] += 1 21 | $ex 22 | end) 23 | end 24 | 25 | @checked function foo() 26 | ccall(:jl_getpid, Cint, ()) 27 | end 28 | end 29 | 30 | @test mod.checks[] == 0 31 | @test mod.foo() == getpid() 32 | @test mod.checks[] == 1 33 | @test mod.unsafe_foo() == getpid() 34 | @test mod.checks[] == 1 35 | end 36 | -------------------------------------------------------------------------------- /test/ptx/vadd.ptx: -------------------------------------------------------------------------------- 1 | 2 | .version 3.1 3 | .target sm_20 4 | .address_size 64 5 | 6 | 7 | .visible .entry vadd( 8 | .param .u64 vadd_param_0, 9 | .param .u64 vadd_param_1, 10 | .param .u64 
vadd_param_2 11 | ) 12 | { 13 | .reg .s32 %r<8>; 14 | .reg .f32 %f<4>; 15 | .reg .s64 %rd<11>; 16 | 17 | 18 | ld.param.u64 %rd1, [vadd_param_0]; 19 | ld.param.u64 %rd2, [vadd_param_1]; 20 | ld.param.u64 %rd3, [vadd_param_2]; 21 | cvta.to.global.u64 %rd4, %rd3; 22 | mov.u32 %r1, %ntid.x; 23 | mov.u32 %r2, %ctaid.x; 24 | mov.u32 %r3, %tid.x; 25 | mad.lo.s32 %r4, %r1, %r2, %r3; 26 | cvta.to.global.u64 %rd5, %rd1; 27 | mul.wide.s32 %rd6, %r4, 4; 28 | add.s64 %rd7, %rd5, %rd6; 29 | cvta.to.global.u64 %rd8, %rd2; 30 | add.s64 %rd9, %rd8, %rd6; 31 | ld.global.f32 %f1, [%rd9]; 32 | ld.global.f32 %f2, [%rd7]; 33 | add.f32 %f3, %f2, %f1; 34 | add.s64 %rd10, %rd4, %rd6; 35 | st.global.f32 [%rd10], %f3; 36 | ret; 37 | } 38 | 39 | 40 | -------------------------------------------------------------------------------- /lib/nvml/system.jl: -------------------------------------------------------------------------------- 1 | function version() 2 | buf = Vector{Cchar}(undef, NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE) 3 | nvmlSystemGetNVMLVersion(pointer(buf), length(buf)) 4 | 5 | # the version string is too long for Julia to handle, e.g. 11.450.36.06, 6 | # so split off the driver part into the build suffix 7 | ver = unsafe_string(pointer(buf)) 8 | parts = parse.(Int, split(ver, '.')) 9 | return VersionNumber(parts[1], 0, 0, (), Tuple(parts[2:end])) 10 | end 11 | 12 | function driver_version() 13 | buf = Vector{Cchar}(undef, NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE) 14 | nvmlSystemGetDriverVersion(pointer(buf), length(buf)) 15 | return VersionNumber(unsafe_string(pointer(buf))) 16 | end 17 | 18 | function cuda_driver_version() 19 | ref = Ref{Cint}() 20 | nvmlSystemGetCudaDriverVersion_v2(ref) 21 | major, ver = divrem(ref[], 1000) 22 | minor, patch = divrem(ver, 10) 23 | return VersionNumber(major, minor, patch) 24 | end 25 | -------------------------------------------------------------------------------- /lib/cusparse/helpers.jl: -------------------------------------------------------------------------------- 1 | # cuSPARSE helper functions 2 | 3 | 4 | ## matrix descriptor 5 | 6 | mutable struct CuMatrixDescriptor 7 | handle::cusparseMatDescr_t 8 | 9 | function CuMatrixDescriptor() 10 | descr_ref = Ref{cusparseMatDescr_t}() 11 | cusparseCreateMatDescr(descr_ref) 12 | obj = new(descr_ref[]) 13 | finalizer(cusparseDestroyMatDescr, obj) 14 | obj 15 | end 16 | end 17 | 18 | Base.unsafe_convert(::Type{cusparseMatDescr_t}, desc::CuMatrixDescriptor) = desc.handle 19 | 20 | function CuMatrixDescriptor(MatrixType::Char, FillMode::Char, DiagType::Char, IndexBase::Char) 21 | desc = CuMatrixDescriptor() 22 | if MatrixType != 'G' 23 | cusparseSetMatType(desc, MatrixType) 24 | end 25 | cusparseSetMatFillMode(desc, FillMode) 26 | cusparseSetMatDiagType(desc, DiagType) 27 | if IndexBase != 'Z' 28 | cusparseSetMatIndexBase(desc, IndexBase) 29 | end 30 | return desc 31 | end 32 | -------------------------------------------------------------------------------- /lib/utils/enum.jl: -------------------------------------------------------------------------------- 1 | export @enum_without_prefix 2 | 3 | 4 | ## redeclare enum values without a prefix 5 | 6 | # this is useful when enum values from an underlying C library, typically prefixed for the 7 | # lack of namespacing in C, are to be used in Julia where we do have module namespacing. 
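# For example, mirroring the usage exercised in test/apiutils.jl:
#
#     @enum MY_ENUM MY_ENUM_VALUE
#     @enum_without_prefix MY_ENUM MY_
#
# after which `ENUM_VALUE` is defined as a constant equal to `MY_ENUM_VALUE`,
# so callers can drop the C-style prefix.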
8 | macro enum_without_prefix(enum, prefix) 9 | if isa(enum, Symbol) 10 | mod = __module__ 11 | elseif Meta.isexpr(enum, :(.)) 12 | mod = getfield(__module__, enum.args[1]) 13 | enum = enum.args[2].value 14 | else 15 | error("Do not know how to refer to $enum") 16 | end 17 | enum = getfield(mod, enum) 18 | prefix = String(prefix) 19 | 20 | ex = quote end 21 | for instance in instances(enum) 22 | name = String(Symbol(instance)) 23 | @assert startswith(name, prefix) 24 | push!(ex.args, :(const $(Symbol(name[length(prefix)+1:end])) = $(mod).$(Symbol(name)))) 25 | end 26 | 27 | return esc(ex) 28 | end 29 | -------------------------------------------------------------------------------- /test/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" 3 | BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" 4 | DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" 5 | Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" 6 | Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" 7 | FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" 8 | GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" 9 | Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" 10 | LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" 11 | NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" 12 | Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" 13 | REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" 14 | Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" 15 | SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" 16 | SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" 17 | Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" 18 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 19 | TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" 20 | -------------------------------------------------------------------------------- /examples/wmma/high-level.jl: -------------------------------------------------------------------------------- 1 | using CUDA 2 | if capability(device()) < v"7.0" 3 | exit() 4 | end 5 | 6 | ### START 7 | using Test 8 | 9 | using CUDA 10 | 11 | a = rand(Float16, (16, 16)) 12 | b = rand(Float16, (16, 16)) 13 | c = rand(Float32, (16, 16)) 14 | 15 | a_dev = CuArray(a) 16 | b_dev = CuArray(b) 17 | c_dev = CuArray(c) 18 | d_dev = similar(c_dev) 19 | 20 | function kernel(a_dev, b_dev, c_dev, d_dev) 21 | conf = WMMA.Config{16, 16, 16, Float32} 22 | 23 | a_frag = WMMA.load_a(pointer(a_dev), 16, WMMA.ColMajor, conf) 24 | b_frag = WMMA.load_b(pointer(b_dev), 16, WMMA.ColMajor, conf) 25 | c_frag = WMMA.load_c(pointer(c_dev), 16, WMMA.ColMajor, conf) 26 | 27 | c_frag = 0.5f0 .* c_frag 28 | 29 | d_frag = WMMA.mma(a_frag, b_frag, c_frag, conf) 30 | 31 | WMMA.store_d(pointer(d_dev), d_frag, 16, WMMA.ColMajor, conf) 32 | 33 | return 34 | end 35 | 36 | @cuda threads=32 kernel(a_dev, b_dev, c_dev, d_dev) 37 | d = Array(d_dev) 38 | 39 | @test all(isapprox.(a * b + 0.5 * c, d; rtol=0.01)) 40 | ### END 41 | -------------------------------------------------------------------------------- /res/wrap/patches/cublas/computetype.patch: -------------------------------------------------------------------------------- 1 | --- a/libcublas.jl 2 | +++ b/libcublas.jl 3 | @@ -1414,5 +1414,5 @@ end 4 | @checked function cublasGemmEx(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo) 5 | initialize_api() 6 | - ccall((:cublasGemmEx, libcublas), cublasStatus_t, (cublasHandle_t, cublasOperation_t, cublasOperation_t, Cint, 
Cint, Cint, Ptr{Cvoid}, Ptr{Cvoid}, cudaDataType, Cint, Ptr{Cvoid}, cudaDataType, Cint, Ptr{Cvoid}, Ptr{Cvoid}, cudaDataType, Cint, cublasComputeType_t, cublasGemmAlgo_t), handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo) 7 | + ccall((:cublasGemmEx, libcublas), cublasStatus_t, (cublasHandle_t, cublasOperation_t, cublasOperation_t, Cint, Cint, Cint, Ptr{Cvoid}, Ptr{Cvoid}, cudaDataType, Cint, Ptr{Cvoid}, cudaDataType, Cint, Ptr{Cvoid}, Ptr{Cvoid}, cudaDataType, Cint, UInt32, cublasGemmAlgo_t), handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo) 8 | end 9 | -------------------------------------------------------------------------------- /CITATION.bib: -------------------------------------------------------------------------------- 1 | % primary paper, detailing the GPU compiler and relevant aspects 2 | @article{besard2018juliagpu, 3 | author = {Besard, Tim and Foket, Christophe and De Sutter, Bjorn}, 4 | title = {Effective Extensible Programming: Unleashing {Julia} on {GPUs}}, 5 | journal = {IEEE Transactions on Parallel and Distributed Systems}, 6 | year = {2018}, 7 | doi = {10.1109/TPDS.2018.2872064}, 8 | ISSN = {1045-9219}, 9 | archivePrefix = {arXiv}, 10 | eprint = {1712.03112}, 11 | primaryClass = {cs.PL}, 12 | } 13 | 14 | % specific paper on array programming for heterogeneous systems 15 | @article{besard2019prototyping, 16 | title = {Rapid software prototyping for heterogeneous and distributed platforms}, 17 | author = {Besard, Tim and Churavy, Valentin and Edelman, Alan and De Sutter, Bjorn}, 18 | journal = {Advances in Engineering Software}, 19 | volume = {132}, 20 | pages = {29--46}, 21 | year = {2019}, 22 | publisher = {Elsevier} 23 | } 24 | -------------------------------------------------------------------------------- /lib/cusolver/base.jl: -------------------------------------------------------------------------------- 1 | # wrappers of low-level functionality 2 | 3 | function cusolverGetProperty(property::libraryPropertyType) 4 | value_ref = Ref{Cint}() 5 | cusolverGetProperty(property, value_ref) 6 | value_ref[] 7 | end 8 | 9 | version() = VersionNumber(cusolverGetProperty(CUDA.MAJOR_VERSION), 10 | cusolverGetProperty(CUDA.MINOR_VERSION), 11 | cusolverGetProperty(CUDA.PATCH_LEVEL)) 12 | 13 | function Base.convert(::Type{cusolverEigType_t}, typ::Int) 14 | if typ == 1 15 | CUSOLVER_EIG_TYPE_1 16 | elseif typ == 2 17 | CUSOLVER_EIG_TYPE_2 18 | elseif typ == 3 19 | CUSOLVER_EIG_TYPE_3 20 | else 21 | throw(ArgumentError("Unknown eigenvalue solver type $typ.")) 22 | end 23 | end 24 | 25 | function Base.convert(::Type{cusolverEigMode_t}, jobz::Char) 26 | if jobz == 'N' 27 | CUSOLVER_EIG_MODE_NOVECTOR 28 | elseif jobz == 'V' 29 | CUSOLVER_EIG_MODE_VECTOR 30 | else 31 | throw(ArgumentError("Unknown eigenvalue solver mode $jobz.")) 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /lib/cudnn/error.jl: -------------------------------------------------------------------------------- 1 | export CUDNNError 2 | 3 | struct CUDNNError <: Exception 4 | code::cudnnStatus_t 5 | end 6 | 7 | Base.convert(::Type{cudnnStatus_t}, err::CUDNNError) = err.code 8 | 9 | Base.showerror(io::IO, err::CUDNNError) = 10 | print(io, "CUDNNError: ", name(err), " (code $(reinterpret(Int32, err.code)))") 11 | 12 | name(err::CUDNNError) = unsafe_string(cudnnGetErrorString(err)) 13 | 14 | 15 | ## API call wrapper 16 | 17 | # outlined functionality 
to avoid GC frame allocation 18 | @noinline function throw_api_error(res) 19 | if res == CUDNN_STATUS_ALLOC_FAILED 20 | throw(OutOfGPUMemoryError()) 21 | else 22 | throw(CUDNNError(res)) 23 | end 24 | end 25 | 26 | macro check(ex, errs...) 27 | check = :(isequal(err, CUDNN_STATUS_ALLOC_FAILED)) 28 | for err in errs 29 | check = :($check || isequal(err, $(esc(err)))) 30 | end 31 | 32 | quote 33 | res = @retry_reclaim err->$check $(esc(ex)) 34 | if res != CUDNN_STATUS_SUCCESS 35 | throw_api_error(res) 36 | end 37 | 38 | nothing 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /test/cudnn/tensor.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CUDNN: 2 | cudnnTensorDescriptor, 3 | cudnnCreateTensorDescriptor, 4 | cudnnFilterDescriptor, 5 | cudnnDataType, 6 | cudnnDataType_t, 7 | CUDNN_TENSOR_NCHW, 8 | CUDNN_STATUS_SUCCESS, 9 | @retry_reclaim 10 | 11 | @testset "cudnn/tensor" begin 12 | x = CUDA.rand(1,1,1,2) 13 | 14 | TD = cudnnTensorDescriptor 15 | FD = cudnnFilterDescriptor 16 | DT = cudnnDataType 17 | 18 | @test TD(x) isa TD 19 | @test TD(CUDNN_TENSOR_NCHW, DT(eltype(x)), Cint(ndims(x)), Cint[reverse(size(x))...]) isa TD 20 | td = TD(x) 21 | @test TD(td.ptr) isa TD 22 | @test Base.unsafe_convert(Ptr, TD(td.ptr)) isa Ptr 23 | 24 | @test FD(x) isa FD 25 | @test FD(DT(eltype(x)),CUDNN_TENSOR_NCHW,Cint(ndims(x)),Cint[reverse(size(x))...]) isa FD 26 | fd = FD(x) 27 | @test FD(fd.ptr) isa FD 28 | @test Base.unsafe_convert(Ptr, FD(fd.ptr)) isa Ptr 29 | 30 | @test DT(Float32) isa cudnnDataType_t 31 | 32 | @test (@retry_reclaim(x->(x!==CUDNN_STATUS_SUCCESS),cudnnCreateTensorDescriptor(Ref{Ptr{Cvoid}}(C_NULL)))) isa Nothing 33 | end 34 | -------------------------------------------------------------------------------- /test/examples.jl: -------------------------------------------------------------------------------- 1 | # NVIDIA bug 3263616: compute-sanitizer crashes when generating host backtraces, 2 | # but --show-backtrace=no does not survive execve. 3 | @not_if_sanitize begin 4 | 5 | function find_sources(path::String, sources=String[]) 6 | if isdir(path) 7 | for entry in readdir(path) 8 | find_sources(joinpath(path, entry), sources) 9 | end 10 | elseif endswith(path, ".jl") 11 | push!(sources, path) 12 | end 13 | sources 14 | end 15 | 16 | examples_dir = joinpath(@__DIR__, "..", "examples") 17 | examples = find_sources(examples_dir) 18 | filter!(file -> readline(file) != "# EXCLUDE FROM TESTING", examples) 19 | 20 | cd(examples_dir) do 21 | global examples 22 | examples = relpath.(examples, Ref(examples_dir)) 23 | @testset for example in examples 24 | cmd = Base.julia_cmd() 25 | if Base.JLOptions().project != C_NULL 26 | cmd = `$cmd --project=$(unsafe_string(Base.JLOptions().project))` 27 | end 28 | 29 | @test success(pipeline(`$cmd $example`, stderr=stderr)) 30 | end 31 | end 32 | 33 | end 34 | -------------------------------------------------------------------------------- /src/iterator.jl: -------------------------------------------------------------------------------- 1 | export CuIterator 2 | 3 | """ 4 | CuIterator(batches) 5 | 6 | Return a `CuIterator` that can iterate through the provided `batches` via `Base.iterate`. 7 | 8 | Upon each iteration, the current `batch` is adapted to the GPU (via `map(x -> adapt(CuArray, x), batch)`) 9 | and the previous iteration is marked as freeable from GPU memory (via `unsafe_free!`). 
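A minimal usage sketch (the data below is hypothetical; any iterable of tuples of arrays works):

```julia
batches = [(rand(Float32, 4, 8), rand(Float32, 8)) for _ in 1:3]
for (x, y) in CuIterator(batches)
    # x and y are CuArrays here; the arrays of the previous batch have already
    # been marked as freeable via `unsafe_free!`
end
```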
10 | 11 | This abstraction is useful for batching data into GPU memory in a manner that 12 | allows old iterations to potentially be freed (or marked as reusable) earlier 13 | than they otherwise would via CuArray's internal polling mechanism. 14 | """ 15 | mutable struct CuIterator{B} 16 | batches::B 17 | previous::Any 18 | CuIterator(batches) = new{typeof(batches)}(batches) 19 | end 20 | 21 | function Base.iterate(c::CuIterator, state...) 22 | item = iterate(c.batches, state...) 23 | isdefined(c, :previous) && foreach(unsafe_free!, c.previous) 24 | item === nothing && return nothing 25 | batch, next_state = item 26 | cubatch = map(x -> adapt(CuArray, x), batch) 27 | c.previous = cubatch 28 | return cubatch, next_state 29 | end 30 | -------------------------------------------------------------------------------- /lib/cutensor/tensor.jl: -------------------------------------------------------------------------------- 1 | export CuTensor 2 | 3 | mutable struct CuTensor{T, N} 4 | data::DenseCuArray{T, N} 5 | inds::Vector{Char} 6 | function CuTensor{T, N}(data::DenseCuArray{T, N}, inds::Vector{Char}) where {T<:Number, N} 7 | new(data, inds) 8 | end 9 | function CuTensor{T, N}(data::DenseCuArray{N, T}, inds::Vector{<:AbstractChar}) where {T<:Number, N} 10 | new(data, Char.(inds)) 11 | end 12 | end 13 | 14 | CuTensor(data::DenseCuArray{T, N}, inds::Vector{<:AbstractChar}) where {T<:Number, N} = 15 | CuTensor{T, N}(data, convert(Vector{Char}, inds)) 16 | 17 | CuTensor(data::DenseCuArray{T, N}, inds::Vector{Char}) where {T<:Number, N} = 18 | CuTensor{T, N}(data, inds) 19 | 20 | Base.size(T::CuTensor) = size(T.data) 21 | Base.size(T::CuTensor, i) = size(T.data, i) 22 | Base.length(T::CuTensor) = length(T.data) 23 | Base.ndims(T::CuTensor) = length(T.inds) 24 | Base.strides(T::CuTensor) = strides(T.data) 25 | Base.eltype(T::CuTensor) = eltype(T.data) 26 | Base.similar(T::CuTensor{Tv, N}) where {Tv, N} = CuTensor{Tv, N}(similar(T.data), copy(T.inds)) 27 | Base.collect(T::CuTensor) = (collect(T.data), T.inds) 28 | -------------------------------------------------------------------------------- /test/pool.jl: -------------------------------------------------------------------------------- 1 | CUDA.alloc(0) 2 | 3 | @test_throws OutOfGPUMemoryError CuArray{Int}(undef, 10^20) 4 | 5 | @testset "@allocated" begin 6 | @test (CUDA.@allocated CuArray{Int32}(undef,1)) == 4 7 | end 8 | 9 | @testset "@timed" begin 10 | out = CUDA.@timed CuArray{Int32}(undef, 1) 11 | @test isa(out.value, CuArray{Int32}) 12 | @test out.gpu_bytes > 0 13 | end 14 | 15 | @testset "@time" begin 16 | ret, out = @grab_output CUDA.@time CuArray{Int32}(undef, 1) 17 | @test isa(ret, CuArray{Int32}) 18 | @test occursin("1 GPU allocation: 4 bytes", out) 19 | 20 | x = CuArray{Int32}(undef, 6) 21 | ret, out = @grab_output CUDA.@time Base.unsafe_wrap(CuArray, pointer(x), (2, 3)) 22 | @test isa(ret, CuArray{Int32}) 23 | @test !occursin("GPU allocation", out) 24 | end 25 | 26 | @testset "reclaim" begin 27 | CUDA.reclaim(1024) 28 | CUDA.reclaim() 29 | 30 | @test CUDA.@retry_reclaim(isequal(42), 42) == 42 31 | @test CUDA.@retry_reclaim(isequal(42), 41) == 41 32 | end 33 | 34 | @testset "memory_status" begin 35 | CUDA.memory_status(devnull) 36 | CUDA.used_memory() 37 | CUDA.cached_memory() 38 | end 39 | -------------------------------------------------------------------------------- /examples/wmma/low-level.jl: -------------------------------------------------------------------------------- 1 | using CUDA 2 | if capability(device()) < v"7.0" 3 | 
exit() 4 | end 5 | 6 | ### START 7 | using Test 8 | 9 | using CUDA 10 | 11 | # Generate input matrices 12 | a = rand(Float16, (16, 16)) 13 | a_dev = CuArray(a) 14 | b = rand(Float16, (16, 16)) 15 | b_dev = CuArray(b) 16 | c = rand(Float32, (16, 16)) 17 | c_dev = CuArray(c) 18 | 19 | # Allocate space for result 20 | d_dev = similar(c_dev) 21 | 22 | # Matrix multiply-accumulate kernel (D = A * B + C) 23 | function kernel(a_dev, b_dev, c_dev, d_dev) 24 | a_frag = WMMA.llvm_wmma_load_a_col_m16n16k16_global_stride_f16(pointer(a_dev), 16) 25 | b_frag = WMMA.llvm_wmma_load_b_col_m16n16k16_global_stride_f16(pointer(b_dev), 16) 26 | c_frag = WMMA.llvm_wmma_load_c_col_m16n16k16_global_stride_f32(pointer(c_dev), 16) 27 | 28 | d_frag = WMMA.llvm_wmma_mma_col_col_m16n16k16_f32_f32(a_frag, b_frag, c_frag) 29 | 30 | WMMA.llvm_wmma_store_d_col_m16n16k16_global_stride_f32(pointer(d_dev), d_frag, 16) 31 | return 32 | end 33 | 34 | @cuda threads=32 kernel(a_dev, b_dev, c_dev, d_dev) 35 | @test all(isapprox.(a * b + c, Array(d_dev); rtol=0.01)) 36 | ### END 37 | -------------------------------------------------------------------------------- /.github/workflows/CompatHelper.yml: -------------------------------------------------------------------------------- 1 | name: CompatHelper 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' 6 | workflow_dispatch: 7 | 8 | jobs: 9 | CompatHelper: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Get Julia compatibility 14 | id: julia_compat 15 | # NOTE: this requires a Julia compat lower-bound with minor version! 16 | run : | 17 | version=$(grep '^julia = ' Project.toml | grep -o '".*"' | cut -d '"' -f2) 18 | echo "::set-output name=version::$version" 19 | - uses: julia-actions/setup-julia@v1 20 | with: 21 | version: ${{ steps.julia_compat.outputs.version }} 22 | - name: Install CompatHelper 23 | run: | 24 | import Pkg 25 | name = "CompatHelper" 26 | version = "2" 27 | Pkg.add(; name, version) 28 | shell: julia --color=yes {0} 29 | - name: Run CompatHelper 30 | run: | 31 | using CompatHelper 32 | CompatHelper.main() 33 | shell: julia --color=yes {0} 34 | env: 35 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 36 | -------------------------------------------------------------------------------- /docs/src/api/compiler.md: -------------------------------------------------------------------------------- 1 | # Compiler 2 | 3 | ## Execution 4 | 5 | The main entry-point to the compiler is the `@cuda` macro: 6 | 7 | ```@docs 8 | @cuda 9 | ``` 10 | 11 | If needed, you can use a lower-level API that lets you inspect the compiler kernel: 12 | 13 | ```@docs 14 | cudaconvert 15 | cufunction 16 | CUDA.HostKernel 17 | CUDA.version 18 | CUDA.maxthreads 19 | CUDA.registers 20 | CUDA.memory 21 | ``` 22 | 23 | 24 | ## Reflection 25 | 26 | If you want to inspect generated code, you can use macros that resemble functionality from 27 | the InteractiveUtils standard library: 28 | 29 | ``` 30 | @device_code_lowered 31 | @device_code_typed 32 | @device_code_warntype 33 | @device_code_llvm 34 | @device_code_ptx 35 | @device_code_sass 36 | @device_code 37 | ``` 38 | 39 | These macros are also available in function-form: 40 | 41 | ``` 42 | CUDA.code_typed 43 | CUDA.code_warntype 44 | CUDA.code_llvm 45 | CUDA.code_ptx 46 | CUDA.code_sass 47 | ``` 48 | 49 | For more information, please consult the GPUCompiler.jl documentation. 
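For example (a quick sketch with a hypothetical kernel; any `@cuda` invocation can be wrapped this way):

```julia
function vadd(c, a, b)
    i = threadIdx().x
    @inbounds c[i] = a[i] + b[i]
    return
end

a, b = CUDA.ones(4), CUDA.ones(4)
c = similar(a)

@device_code_ptx @cuda threads=4 vadd(c, a, b)       # print the generated PTX
@device_code_warntype @cuda threads=4 vadd(c, a, b)  # check for type instabilities
```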
Only the `code_sass` 50 | functionality is actually defined in CUDA.jl: 51 | 52 | ```@docs 53 | @device_code_sass 54 | CUDA.code_sass 55 | ``` 56 | -------------------------------------------------------------------------------- /perf/cudadevrt.jl: -------------------------------------------------------------------------------- 1 | module cudadevrt 2 | 3 | using CUDA, BenchmarkTools, Random 4 | 5 | const threads = 256 6 | #simple add matrix and vector kernel 7 | function kernel_add_mat_vec(m, x1, x2, y) 8 | # one block per column 9 | offset = (blockIdx().x-1) * m 10 | @inbounds xtmp = x2[blockIdx().x] 11 | for i = threadIdx().x : blockDim().x : m 12 | @inbounds y[offset + i] = x1[offset + i] + xtmp 13 | end 14 | return 15 | end 16 | 17 | function add!(y, x1, x2) 18 | m, n = size(x1) 19 | @cuda blocks = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y) 20 | end 21 | 22 | function main() 23 | Random.seed!(1) 24 | m, n = 3072, 1536 # 256 multiplier 25 | x1 = cu(randn(Float32, (m, n)) .+ Float32(0.5)) 26 | x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5)) 27 | y1 = similar(x1) 28 | 29 | results = @benchmark CUDA.@sync add!($y1, $x1, $x2) 30 | 31 | # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them 32 | CUDA.unsafe_free!(x1) 33 | CUDA.unsafe_free!(x2) 34 | CUDA.unsafe_free!(y1) 35 | 36 | return results 37 | end 38 | 39 | end 40 | 41 | cudadevrt.main() 42 | 43 | -------------------------------------------------------------------------------- /test/cusparse/conversions.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CUSPARSE, SparseArrays 2 | 3 | @testset "sparse" begin 4 | n, m = 4, 4 5 | I = [1,2,3] |> cu 6 | J = [2,3,4] |> cu 7 | V = Float32[1,2,3] |> cu 8 | 9 | dense = rand(3,3) |> cu 10 | 11 | # check defaults 12 | @test sparse(I, J, V) isa CuSparseMatrixCSC 13 | @test sparse(dense) isa CuSparseMatrixCSC 14 | 15 | for (fmt, T) in [(:coo, CuSparseMatrixCOO), 16 | (:csc, CuSparseMatrixCSC), 17 | (:csr, CuSparseMatrixCSR), 18 | (:bsr, CuSparseMatrixBSR) 19 | ] 20 | if fmt != :bsr # bsr not supported 21 | x = sparse(I, J, V; fmt=fmt) 22 | @test x isa T{Float32} 23 | @test size(x) == (3, 4) 24 | 25 | x = sparse(I, J, V, m, n; fmt=fmt) 26 | @test x isa T{Float32} 27 | @test size(x) == (4, 4) 28 | end 29 | 30 | if fmt != :coo # dense to COO not implemented 31 | x = sparse(dense; fmt=fmt) 32 | @test x isa T{Float32} 33 | @test collect(x) == collect(dense) 34 | end 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /test/utils.jl: -------------------------------------------------------------------------------- 1 | @testset "test utilities" begin 2 | mutable struct NoThrowTestSet <: Test.AbstractTestSet 3 | results::Vector 4 | NoThrowTestSet(desc) = new([]) 5 | end 6 | Test.record(ts::NoThrowTestSet, t::Test.Result) = (push!(ts.results, t); t) 7 | Test.finish(ts::NoThrowTestSet) = ts.results 8 | fails = @testset NoThrowTestSet begin 9 | # OK 10 | @test_throws_cuerror CUDA.ERROR_UNKNOWN throw(CuError(CUDA.ERROR_UNKNOWN)) 11 | # Fail, wrong CuError 12 | @test_throws_cuerror CUDA.ERROR_UNKNOWN throw(CuError(CUDA.ERROR_INVALID_VALUE)) 13 | # Fail, wrong Exception 14 | @test_throws_cuerror CUDA.ERROR_UNKNOWN error() 15 | end 16 | @test isa(fails[1], Test.Pass) 17 | @test isa(fails[2], Test.Fail) 18 | @test isa(fails[3], Test.Fail) 19 | end 20 | 21 | @testset "@sync" begin 22 | t = Base.@elapsed ret = CUDA.@sync begin 23 | # TODO: do something that takes a while on 
the GPU 24 | # (need to wrap clock64 for that) 25 | 42 26 | end 27 | @test t >= 0 28 | @test ret == 42 29 | end 30 | 31 | @testset "versioninfo" begin 32 | CUDA.versioninfo(devnull) 33 | end 34 | -------------------------------------------------------------------------------- /examples/driver/vadd.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-19856038 5 | // Cuda compilation tools, release 7.5, V7.5.17 6 | // Based on LLVM 3.4svn 7 | // 8 | 9 | .version 4.3 10 | .target sm_20 11 | .address_size 64 12 | 13 | // .globl kernel_vadd 14 | 15 | .visible .entry kernel_vadd( 16 | .param .u64 kernel_vadd_param_0, 17 | .param .u64 kernel_vadd_param_1, 18 | .param .u64 kernel_vadd_param_2 19 | ) 20 | { 21 | .reg .f32 %f<4>; 22 | .reg .b32 %r<5>; 23 | .reg .b64 %rd<11>; 24 | 25 | 26 | ld.param.u64 %rd1, [kernel_vadd_param_0]; 27 | ld.param.u64 %rd2, [kernel_vadd_param_1]; 28 | ld.param.u64 %rd3, [kernel_vadd_param_2]; 29 | cvta.to.global.u64 %rd4, %rd3; 30 | cvta.to.global.u64 %rd5, %rd2; 31 | cvta.to.global.u64 %rd6, %rd1; 32 | mov.u32 %r1, %ctaid.x; 33 | mov.u32 %r2, %ntid.x; 34 | mov.u32 %r3, %tid.x; 35 | mad.lo.s32 %r4, %r2, %r1, %r3; 36 | mul.wide.s32 %rd7, %r4, 4; 37 | add.s64 %rd8, %rd6, %rd7; 38 | ld.global.f32 %f1, [%rd8]; 39 | add.s64 %rd9, %rd5, %rd7; 40 | ld.global.f32 %f2, [%rd9]; 41 | add.f32 %f3, %f1, %f2; 42 | add.s64 %rd10, %rd4, %rd7; 43 | st.global.f32 [%rd10], %f3; 44 | ret; 45 | } 46 | 47 | 48 | -------------------------------------------------------------------------------- /lib/cudadrv/types.jl: -------------------------------------------------------------------------------- 1 | export CuDim3, CuDim 2 | 3 | """ 4 | CuDim3(x) 5 | 6 | CuDim3((x,)) 7 | CuDim3((x, y)) 8 | CuDim3((x, y, x)) 9 | 10 | A type used to specify dimensions, consisting of 3 integers for respectively the `x`, `y` 11 | and `z` dimension. Unspecified dimensions default to `1`. 12 | 13 | Often accepted as argument through the `CuDim` type alias, eg. in the case of 14 | [`cudacall`](@ref) or [`CUDA.launch`](@ref), allowing to pass dimensions as a plain integer 15 | or a tuple without having to construct an explicit `CuDim3` object. 16 | """ 17 | struct CuDim3 18 | x::Cuint 19 | y::Cuint 20 | z::Cuint 21 | end 22 | 23 | CuDim3(dims::Integer) = CuDim3(dims, Cuint(1), Cuint(1)) 24 | CuDim3(dims::NTuple{1,<:Integer}) = CuDim3(dims[1], Cuint(1), Cuint(1)) 25 | CuDim3(dims::NTuple{2,<:Integer}) = CuDim3(dims[1], dims[2], Cuint(1)) 26 | CuDim3(dims::NTuple{3,<:Integer}) = CuDim3(dims[1], dims[2], dims[3]) 27 | 28 | # Type alias for conveniently specifying the dimensions 29 | # (e.g. 
`(len, 2)` instead of `CuDim3((len, 2))`) 30 | const CuDim = Union{Integer, 31 | Tuple{Integer}, 32 | Tuple{Integer, Integer}, 33 | Tuple{Integer, Integer, Integer}} 34 | -------------------------------------------------------------------------------- /lib/cusparse/error.jl: -------------------------------------------------------------------------------- 1 | export CUSPARSEError 2 | 3 | struct CUSPARSEError <: Exception 4 | code::cusparseStatus_t 5 | end 6 | 7 | Base.convert(::Type{cusparseStatus_t}, err::CUSPARSEError) = err.code 8 | 9 | Base.showerror(io::IO, err::CUSPARSEError) = 10 | print(io, "CUSPARSEError: ", description(err), " (code $(reinterpret(Int32, err.code)), $(name(err)))") 11 | 12 | name(err::CUSPARSEError) = unsafe_string(cusparseGetErrorName(err)) 13 | 14 | description(err::CUSPARSEError) = unsafe_string(cusparseGetErrorString(err)) 15 | 16 | 17 | ## API call wrapper 18 | 19 | # outlined functionality to avoid GC frame allocation 20 | @noinline function throw_api_error(res) 21 | if res == CUSPARSE_STATUS_ALLOC_FAILED 22 | throw(OutOfGPUMemoryError()) 23 | else 24 | throw(CUSPARSEError(res)) 25 | end 26 | end 27 | 28 | macro check(ex, errs...) 29 | check = :(isequal(err, CUSPARSE_STATUS_ALLOC_FAILED)) 30 | for err in errs 31 | check = :($check || isequal(err, $(esc(err)))) 32 | end 33 | 34 | quote 35 | res = @retry_reclaim err->$check $(esc(ex)) 36 | if res != CUSPARSE_STATUS_SUCCESS 37 | throw_api_error(res) 38 | end 39 | 40 | nothing 41 | end 42 | end 43 | -------------------------------------------------------------------------------- /test/cudnn/dropout.jl: -------------------------------------------------------------------------------- 1 | using Statistics 2 | using CUDA.CUDNN: 3 | cudnnDropoutForward, 4 | cudnnDropoutForward!, 5 | cudnnDropoutBackward, 6 | cudnnDropoutSeed, 7 | cudnnDropoutDescriptor, 8 | cudnnDropoutDescriptor_t, 9 | cudnnCreateDropoutDescriptor, 10 | cudnnSetDropoutDescriptor, 11 | cudnnGetDropoutDescriptor, 12 | cudnnRestoreDropoutDescriptor, 13 | cudnnDestroyDropoutDescriptor, 14 | cudnnDropoutGetStatesSize, 15 | cudnnDropoutGetReserveSpaceSize 16 | 17 | @testset "cudnn/dropout" begin 18 | @test cudnnDropoutDescriptor(C_NULL) isa cudnnDropoutDescriptor 19 | @test Base.unsafe_convert(Ptr, cudnnDropoutDescriptor(C_NULL)) isa Ptr 20 | @test cudnnDropoutDescriptor(0.5) isa cudnnDropoutDescriptor 21 | 22 | N,P = 1000, 0.7 23 | x = CUDA.rand(N) 24 | d = cudnnDropoutDescriptor(P) 25 | cudnnDropoutSeed[] = 1 26 | y = cudnnDropoutForward(x; dropout = P) |> Array 27 | @test isapprox(mean(y.==0), P; atol = 3/sqrt(N)) 28 | @test y == cudnnDropoutForward(x, d) |> Array 29 | @test y == cudnnDropoutForward!(similar(x), x; dropout = P) |> Array 30 | @test y == cudnnDropoutForward!(similar(x), x, d) |> Array 31 | cudnnDropoutSeed[] = -1 32 | end 33 | -------------------------------------------------------------------------------- /test/cudnn/inplace.jl: -------------------------------------------------------------------------------- 1 | import CUDA.CUDNN: 2 | cudnnSetTensor!, 3 | cudnnScaleTensor!, 4 | cudnnScaleTensor, 5 | cudnnAddTensor!, 6 | cudnnAddTensor, 7 | CUDNN_TENSOR_NHWC 8 | 9 | @testset "cudnn/inplace" begin 10 | x = CUDA.rand(10) 11 | cudnnSetTensor!(x, 7) 12 | @test all(isequal(7), Array(x)) 13 | ax = rand(10) 14 | cx = CuArray(ax) 15 | @test 7*ax ≈ cudnnScaleTensor(cx, 7) |> Array 16 | @test 7*ax ≈ cudnnScaleTensor!(similar(cx), cx, 7) |> Array 17 | ax,ab = rand(5,4,3,2),rand(1,1,3,1) 18 | cx,cb = CuArray.((ax,ab)) 19 | @test ax .+ ab ≈ 
cudnnAddTensor(cx, cb) |> Array 20 | @test ax .+ 7*ab ≈ cudnnAddTensor(cx, cb, alpha=7) |> Array 21 | @test 7*ax .+ ab ≈ cudnnAddTensor(cx, cb, beta=7) |> Array 22 | @test ax .+ ab ≈ cudnnAddTensor!(similar(cx), cx, cb) |> Array 23 | @test ax .+ 7*ab ≈ cudnnAddTensor!(similar(cx), cx, cb, alpha=7) |> Array 24 | @test 7*ax .+ ab ≈ cudnnAddTensor!(similar(cx), cx, cb, beta=7) |> Array 25 | @test ax .+ ab ≈ cudnnAddTensor!(cx, cx, cb) |> Array 26 | @test ax .+ ab ≈ cx |> Array 27 | ax,ab = rand(3,5,4,2),rand(3,1,1,1) 28 | cx,cb = CuArray.((ax,ab)) 29 | @test ax .+ ab ≈ cudnnAddTensor(cx, cb, format=CUDNN_TENSOR_NHWC) |> Array 30 | end 31 | -------------------------------------------------------------------------------- /lib/nvml/error.jl: -------------------------------------------------------------------------------- 1 | export NVMLError 2 | 3 | struct NVMLError <: Exception 4 | code::nvmlReturn_t 5 | end 6 | 7 | Base.convert(::Type{nvmlReturn_t}, err::NVMLError) = err.code 8 | 9 | Base.showerror(io::IO, err::NVMLError) = 10 | print(io, "NVMLError: ", description(err), " (code $(reinterpret(Int32, err.code)))") 11 | 12 | description(err::NVMLError) = unsafe_string(nvmlErrorString(err)) 13 | 14 | @enum_without_prefix nvmlReturn_enum NVML_ 15 | 16 | 17 | ## API call wrapper 18 | 19 | # outlined functionality to avoid GC frame allocation 20 | @noinline function throw_api_error(res) 21 | throw(NVMLError(res)) 22 | end 23 | 24 | const initialized = Ref(false) 25 | function initialize_context() 26 | if !initialized[] 27 | res = unsafe_nvmlInitWithFlags(0) 28 | if res !== NVML_SUCCESS 29 | # NOTE: we can't call nvmlErrorString during initialization 30 | error("NVML could not be initialized ($res)") 31 | end 32 | atexit() do 33 | nvmlShutdown() 34 | end 35 | initialized[] = true 36 | end 37 | end 38 | 39 | macro check(ex) 40 | quote 41 | res = $(esc(ex)) 42 | if res != NVML_SUCCESS 43 | throw_api_error(res) 44 | end 45 | 46 | nothing 47 | end 48 | end 49 | -------------------------------------------------------------------------------- /src/compiler/exceptions.jl: -------------------------------------------------------------------------------- 1 | # support for device-side exceptions 2 | 3 | ## exception type 4 | 5 | struct KernelException <: Exception 6 | dev::CuDevice 7 | end 8 | 9 | function Base.showerror(io::IO, err::KernelException) 10 | print(io, "KernelException: exception thrown during kernel execution on device $(name(err.dev))") 11 | end 12 | 13 | 14 | ## exception handling 15 | 16 | const exception_flags = Dict{CuContext, Mem.HostBuffer}() 17 | 18 | # create a CPU/GPU exception flag for error signalling, and put it in the module 19 | function create_exceptions!(mod::CuModule) 20 | exception_flag = get!(exception_flags, mod.ctx, 21 | Mem.alloc(Mem.Host, sizeof(Int), Mem.HOSTALLOC_DEVICEMAP)) 22 | return reinterpret(Ptr{Cvoid}, convert(CuPtr{Cvoid}, exception_flag)) 23 | end 24 | 25 | # check the exception flags on every API call, similarly to how CUDA handles errors 26 | function check_exceptions() 27 | for (ctx,buf) in exception_flags 28 | if isvalid(ctx) 29 | ptr = convert(Ptr{Int}, buf) 30 | flag = unsafe_load(ptr) 31 | if flag != 0 32 | unsafe_store!(ptr, 0) 33 | dev = device(ctx) 34 | throw(KernelException(dev)) 35 | end 36 | end 37 | end 38 | return 39 | end 40 | -------------------------------------------------------------------------------- /lib/cudadrv/libcuda_deprecated.jl: -------------------------------------------------------------------------------- 1 | ## superseded in CUDA 
11.0 2 | 3 | @checked function cuDevicePrimaryCtxRelease(dev) 4 | ccall((:cuDevicePrimaryCtxRelease, libcuda()), CUresult, 5 | (CUdevice,), 6 | dev) 7 | end 8 | 9 | @checked function cuDevicePrimaryCtxSetFlags(dev, flags) 10 | ccall((:cuDevicePrimaryCtxSetFlags, libcuda()), CUresult, 11 | (CUdevice, UInt32), 12 | dev, flags) 13 | end 14 | 15 | @checked function cuDevicePrimaryCtxReset(dev) 16 | ccall((:cuDevicePrimaryCtxReset, libcuda()), CUresult, 17 | (CUdevice,), 18 | dev) 19 | end 20 | 21 | @checked function cuGraphInstantiate(phGraphExec, hGraph, phErrorNode, logBuffer, 22 | bufferSize) 23 | ccall((:cuGraphInstantiate, libcuda()), CUresult, 24 | (Ptr{CUgraphExec}, CUgraph, Ptr{CUgraphNode}, Cstring, Csize_t), 25 | phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize) 26 | end 27 | 28 | ## superseded in CUDA 11.1 29 | 30 | @checked function cuIpcOpenMemHandle(pdptr, handle, Flags) 31 | ccall((:cuIpcOpenMemHandle, libcuda()), CUresult, 32 | (Ptr{CUdeviceptr}, CUipcMemHandle, UInt32), 33 | pdptr, handle, Flags) 34 | end 35 | 36 | ## 37 | -------------------------------------------------------------------------------- /test/device/sparse.jl: -------------------------------------------------------------------------------- 1 | using Test 2 | using CUDA 3 | using CUDA.CUSPARSE 4 | using SparseArrays 5 | using CUDA: CuSparseDeviceVector, CuSparseDeviceMatrixCSC, CuSparseDeviceMatrixCSR, 6 | CuSparseDeviceMatrixBSR, CuSparseDeviceMatrixCOO 7 | 8 | @testset "cudaconvert" begin 9 | @test isbitstype(CuSparseDeviceVector{Float32, Cint, CUDA.AS.Global}) 10 | @test isbitstype(CuSparseDeviceMatrixCSC{Float32, Cint, CUDA.AS.Global}) 11 | @test isbitstype(CuSparseDeviceMatrixCSR{Float32, Cint, CUDA.AS.Global}) 12 | @test isbitstype(CuSparseDeviceMatrixBSR{Float32, Cint, CUDA.AS.Global}) 13 | @test isbitstype(CuSparseDeviceMatrixCOO{Float32, Cint, CUDA.AS.Global}) 14 | 15 | V = sprand(10, 0.5) 16 | cuV = CuSparseVector(V) 17 | @test cudaconvert(cuV) isa CuSparseDeviceVector{Float64, Cint, 1} 18 | 19 | A = sprand(10, 10, 0.5) 20 | cuA = CuSparseMatrixCSC(A) 21 | @test cudaconvert(cuA) isa CuSparseDeviceMatrixCSC{Float64, Cint, 1} 22 | 23 | cuA = CuSparseMatrixCSR(A) 24 | @test cudaconvert(cuA) isa CuSparseDeviceMatrixCSR{Float64, Cint, 1} 25 | 26 | cuA = CuSparseMatrixCOO(A) 27 | @test cudaconvert(cuA) isa CuSparseDeviceMatrixCOO{Float64, Cint, 1} 28 | 29 | # Roger-Luo: I'm not sure how to create a BSR matrix 30 | # cuA = CuSparseMatrixBSR(A) 31 | # @test cudaconvert(cuA) isa CuSparseDeviceMatrixBSR 32 | end 33 | -------------------------------------------------------------------------------- /perf/kernel.jl: -------------------------------------------------------------------------------- 1 | using CUDA: i32 2 | 3 | group = addgroup!(SUITE, "kernel") 4 | 5 | dummy_kernel() = nothing 6 | group["launch"] = @benchmarkable @cuda dummy_kernel() 7 | 8 | wanted_threads = 10000 9 | group["occupancy"] = @benchmarkable begin 10 | kernel = @cuda launch=false dummy_kernel() 11 | config = launch_configuration(kernel.fun) 12 | threads = min($wanted_threads, config.threads) 13 | blocks = cld($wanted_threads, threads) 14 | end 15 | 16 | src = CUDA.rand(Float32, 512, 1000) 17 | dest = similar(src) 18 | function indexing_kernel(dest, src) 19 | i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x 20 | @inbounds dest[i] = src[i] 21 | return 22 | end 23 | group["indexing"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $indexing_kernel($dest, $src) 24 | 25 | function checked_indexing_kernel(dest, 
src) 26 | i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x 27 | dest[i] = src[i] 28 | return 29 | end 30 | group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src) 31 | 32 | function rand_kernel(dest::AbstractArray{T}) where {T} 33 | i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x 34 | dest[i] = rand(T) 35 | return 36 | end 37 | group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest) 38 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The CUDA.jl package is licensed under the MIT "Expat" License: 2 | 3 | > Copyright (c) 2019-present: Julia Computing and other contributors 4 | > 5 | > Copyright (c) 2014-2018: Tim Besard 6 | > 7 | > Copyright (c) 2013: Dahua Lin 8 | > 9 | > All Rights Reserved. 10 | > 11 | > Permission is hereby granted, free of charge, to any person obtaining a copy 12 | > of this software and associated documentation files (the "Software"), to deal 13 | > in the Software without restriction, including without limitation the rights 14 | > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 15 | > copies of the Software, and to permit persons to whom the Software is 16 | > furnished to do so, subject to the following conditions: 17 | > 18 | > The above copyright notice and this permission notice shall be included in all 19 | > copies or substantial portions of the Software. 20 | > 21 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 24 | > AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 | > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 26 | > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 27 | > SOFTWARE. 
28 | > 29 | -------------------------------------------------------------------------------- /lib/cufft/util.jl: -------------------------------------------------------------------------------- 1 | const cufftNumber = Union{cufftDoubleReal,cufftReal,cufftDoubleComplex,cufftComplex} 2 | const cufftReals = Union{cufftDoubleReal,cufftReal} 3 | const cufftComplexes = Union{cufftDoubleComplex,cufftComplex} 4 | const cufftDouble = Union{cufftDoubleReal,cufftDoubleComplex} 5 | const cufftSingle = Union{cufftReal,cufftComplex} 6 | const cufftTypeDouble = Union{Type{cufftDoubleReal},Type{cufftDoubleComplex}} 7 | const cufftTypeSingle = Union{Type{cufftReal},Type{cufftComplex}} 8 | 9 | cufftfloat(x) = _cufftfloat(float(x)) 10 | _cufftfloat(::Type{T}) where {T<:cufftReals} = T 11 | _cufftfloat(::Type{Float16}) = Float32 12 | _cufftfloat(::Type{Complex{T}}) where {T} = Complex{_cufftfloat(T)} 13 | _cufftfloat(::Type{T}) where {T} = error("type $T not supported") 14 | _cufftfloat(x::T) where {T} = _cufftfloat(T)(x) 15 | 16 | complexfloat(x::DenseCuArray{Complex{<:cufftReals}}) = x 17 | realfloat(x::DenseCuArray{<:cufftReals}) = x 18 | 19 | complexfloat(x::DenseCuArray{T}) where {T<:Complex} = copy1(typeof(cufftfloat(zero(T))), x) 20 | complexfloat(x::DenseCuArray{T}) where {T<:Real} = copy1(typeof(complex(cufftfloat(zero(T)))), x) 21 | 22 | realfloat(x::DenseCuArray{T}) where {T<:Real} = copy1(typeof(cufftfloat(zero(T))), x) 23 | 24 | function copy1(::Type{T}, x) where T 25 | y = CuArray{T}(undef, map(length, axes(x))) 26 | #copy!(y, x) 27 | y .= broadcast(xi->convert(T,xi),x) 28 | end 29 | -------------------------------------------------------------------------------- /test/ptx/vadd_parent.ptx: -------------------------------------------------------------------------------- 1 | 2 | .version 3.1 3 | .target sm_20 4 | .address_size 64 5 | 6 | .extern .func (.param .b32 func_retval0) add 7 | ( 8 | .param .b32 add_param_0, 9 | .param .b32 add_param_1 10 | ) 11 | ; 12 | 13 | .visible .entry vadd( 14 | .param .u64 vadd_param_0, 15 | .param .u64 vadd_param_1, 16 | .param .u64 vadd_param_2 17 | ) 18 | { 19 | .reg .s32 %r<8>; 20 | .reg .f32 %f<4>; 21 | .reg .s64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [vadd_param_0]; 25 | ld.param.u64 %rd2, [vadd_param_1]; 26 | ld.param.u64 %rd3, [vadd_param_2]; 27 | cvta.to.global.u64 %rd4, %rd3; 28 | mov.u32 %r1, %ntid.x; 29 | mov.u32 %r2, %ctaid.x; 30 | mov.u32 %r3, %tid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; 32 | cvta.to.global.u64 %rd5, %rd1; 33 | mul.wide.s32 %rd6, %r4, 4; 34 | add.s64 %rd7, %rd5, %rd6; 35 | cvta.to.global.u64 %rd8, %rd2; 36 | add.s64 %rd9, %rd8, %rd6; 37 | ld.global.f32 %f1, [%rd9]; 38 | ld.global.f32 %f2, [%rd7]; 39 | // Callseq Start 0 40 | { 41 | .reg .b32 temp_param_reg; 42 | .param .b32 param0; 43 | st.param.f32 [param0+0], %f2; 44 | .param .b32 param1; 45 | st.param.f32 [param1+0], %f1; 46 | .param .b32 retval0; 47 | call.uni (retval0), 48 | add, 49 | ( 50 | param0, 51 | param1 52 | ); 53 | ld.param.f32 %f3, [retval0+0]; 54 | } 55 | // Callseq End 0 56 | add.s64 %rd10, %rd4, %rd6; 57 | st.global.f32 [%rd10], %f3; 58 | ret; 59 | } 60 | 61 | 62 | -------------------------------------------------------------------------------- /lib/cudadrv/version.jl: -------------------------------------------------------------------------------- 1 | # Version management 2 | 3 | # NVML.driver_version() wrongly reports the forward compatible version, 4 | # so we record the system libcuda version when we initialize the library. 
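# (For reference: the driver encodes its version as a single integer, 1000*major +
#  10*minor (+ patch); e.g. 11040 corresponds to CUDA 11.4. The `divrem` calls in
#  `version()` and `runtime_version()` below decode exactly that.)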
5 | const _system_version = Ref{VersionNumber}() 6 | 7 | """ 8 | system_version() 9 | 10 | Returns the latest version of CUDA supported by the system driver. 11 | """ 12 | function system_version() 13 | libcuda() # initializes _system_version 14 | _system_version[] 15 | end 16 | 17 | """ 18 | version() 19 | 20 | Returns the latest version of CUDA supported by the loaded driver. 21 | """ 22 | function version() 23 | version_ref = Ref{Cint}() 24 | cuDriverGetVersion(version_ref) 25 | major, ver = divrem(version_ref[], 1000) 26 | minor, patch = divrem(ver, 10) 27 | return VersionNumber(major, minor, patch) 28 | end 29 | 30 | """ 31 | release() 32 | 33 | Returns the CUDA release part of the version as returned by [`version`](@ref). 34 | """ 35 | release() = VersionNumber(version().major, version().minor) 36 | 37 | """ 38 | runtime_version() 39 | 40 | Returns the CUDA Runtime version. 41 | """ 42 | function runtime_version() 43 | version_ref = Ref{Cint}() 44 | @ccall libcudart().cudaRuntimeGetVersion(version_ref::Ptr{Cint})::CUresult 45 | major, ver = divrem(version_ref[], 1000) 46 | minor, patch = divrem(ver, 10) 47 | return VersionNumber(major, minor, patch) 48 | end 49 | -------------------------------------------------------------------------------- /lib/nvml/NVML.jl: -------------------------------------------------------------------------------- 1 | module NVML 2 | 3 | using ..APIUtils 4 | 5 | using ..CUDA 6 | 7 | using CEnum: @cenum 8 | 9 | import Libdl 10 | 11 | 12 | function libnvml() 13 | @memoize begin 14 | if Sys.iswindows() 15 | # the NVSMI dir isn't added to PATH by the installer 16 | nvsmi = joinpath(ENV["ProgramFiles"], "NVIDIA Corporation", "NVSMI") 17 | if isdir(nvsmi) 18 | joinpath(nvsmi, "nvml.dll") 19 | else 20 | # let's just hope for the best 21 | "nvml" 22 | end 23 | else 24 | "libnvidia-ml.so.1" 25 | end 26 | end::String 27 | end 28 | 29 | function has_nvml() 30 | @memoize begin 31 | if Libdl.dlopen(libnvml(); throw_error=false) === nothing 32 | return false 33 | end 34 | 35 | # JuliaGPU/CUDA.jl#860: initialization can fail on Windows 36 | try 37 | initialize_context() 38 | catch err 39 | @error "Cannot use NVML, as it failed to initialize" exception=(err, catch_backtrace()) 40 | return false 41 | end 42 | 43 | return true 44 | end::Bool 45 | end 46 | 47 | 48 | # core library 49 | include("libnvml_common.jl") 50 | include("error.jl") 51 | include("libnvml.jl") 52 | include("libnvml_deprecated.jl") 53 | 54 | # wrappers 55 | include("system.jl") 56 | include("device.jl") 57 | 58 | end 59 | -------------------------------------------------------------------------------- /perf/latency.jl: -------------------------------------------------------------------------------- 1 | module Latency 2 | 3 | using CUDA 4 | using BenchmarkTools 5 | 6 | function main() 7 | results = BenchmarkGroup() 8 | 9 | base_cmd = Base.julia_cmd() 10 | if Base.JLOptions().project != C_NULL 11 | base_cmd = `$base_cmd --project=$(unsafe_string(Base.JLOptions().project))` 12 | end 13 | 14 | # make sure all artifacts are downloaded 15 | CUDA.version() 16 | 17 | # time to precompile the package and its dependencies 18 | precompile_cmd = 19 | `$base_cmd -e "uuid = Base.UUID(\"052768ef-5323-5732-b1bb-66c8b64840ba\") 20 | id = Base.PkgId(uuid, \"CUDA\") 21 | Base.compilecache(id)"` 22 | results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60 23 | 24 | # time to actually import the package 25 | import_cmd = 26 | `$base_cmd -e "using CUDA"` 27 | results["import"] = @benchmark run($import_cmd) 
evals=1 seconds=30 28 | 29 | # time to initialize CUDA and all other libraries 30 | initialize_time = 31 | `$base_cmd -e "using CUDA 32 | CUDA.version()"` 33 | results["initialize"] = @benchmark run($initialize_time) evals=1 seconds=30 34 | 35 | # time to actually compile a kernel 36 | ttfp_cmd = 37 | `$base_cmd -e "using CUDA 38 | kernel() = return 39 | CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"` 40 | results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60 41 | 42 | results 43 | end 44 | 45 | end 46 | 47 | Latency.main() 48 | -------------------------------------------------------------------------------- /lib/curand/CURAND.jl: -------------------------------------------------------------------------------- 1 | module CURAND 2 | 3 | using ..APIUtils 4 | 5 | using ..CUDA 6 | using ..CUDA: CUstream, libraryPropertyType, DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK 7 | using ..CUDA: libcurand, @retry_reclaim, initialize_context 8 | 9 | using CEnum: @cenum 10 | 11 | 12 | # core library 13 | include("libcurand_common.jl") 14 | include("error.jl") 15 | include("libcurand.jl") 16 | 17 | # low-level wrappers 18 | include("wrappers.jl") 19 | 20 | # high-level integrations 21 | include("random.jl") 22 | 23 | # cache for created, but unused handles 24 | const idle_curand_rngs = HandleCache{CuContext,RNG}() 25 | 26 | function default_rng() 27 | cuda = CUDA.active_state() 28 | 29 | # every task maintains library state per device 30 | LibraryState = @NamedTuple{rng::RNG} 31 | states = get!(task_local_storage(), :CURAND) do 32 | Dict{CuContext,LibraryState}() 33 | end::Dict{CuContext,LibraryState} 34 | 35 | # get library state 36 | @noinline function new_state(cuda) 37 | new_rng = pop!(idle_curand_rngs, cuda.context) do 38 | RNG() 39 | end 40 | 41 | finalizer(current_task()) do task 42 | push!(idle_curand_rngs, cuda.context, new_rng) do 43 | # no need to do anything, as the RNG is collected by its finalizer 44 | end 45 | end 46 | 47 | Random.seed!(new_rng) 48 | (; rng=new_rng) 49 | end 50 | state = get!(states, cuda.context) do 51 | new_state(cuda) 52 | end 53 | 54 | return state.rng 55 | end 56 | 57 | @deprecate seed!() CUDA.seed!() 58 | @deprecate seed!(seed) CUDA.seed!(seed) 59 | 60 | end 61 | -------------------------------------------------------------------------------- /test/device/ldg.jl: -------------------------------------------------------------------------------- 1 | @testset "ldg" begin 2 | ir = sprint(io->CUDA.code_llvm(io, CUDA.pointerref_ldg, Tuple{Core.LLVMPtr{Int,AS.Global},Int,Val{1}})) 3 | @test occursin("@llvm.nvvm.ldg", ir) 4 | end 5 | 6 | 7 | capability(device()) >= v"3.2" && @testset "unsafe_cached_load" begin 8 | 9 | @testset for T in (Int8, UInt16, Int32, UInt32, Int64, UInt64, Int128, Float32, Float64) 10 | d_a = CuArray(ones(T)) 11 | d_b = CuArray(zeros(T)) 12 | @test Array(d_a) != Array(d_b) 13 | 14 | ptr_a = reinterpret(Core.LLVMPtr{T,AS.Global}, pointer(d_a)) 15 | ptr_b = reinterpret(Core.LLVMPtr{T,AS.Global}, pointer(d_b)) 16 | 17 | let ptr_a=ptr_a, ptr_b=ptr_b #JuliaLang/julia#15276 18 | @on_device unsafe_store!(ptr_b, unsafe_cached_load(ptr_a)) 19 | end 20 | 21 | @test Array(d_a) == Array(d_b) 22 | end 23 | 24 | @testset "Const" begin 25 | function kernel(a, b, i) 26 | @inbounds b[i] = Base.Experimental.Const(a)[i] 27 | return 28 | end 29 | 30 | buf = IOBuffer() 31 | 32 | a = CuArray([0]) 33 | b = CuArray([0]) 34 | @device_code_ptx io=buf @cuda kernel(a, b, 1) 35 | @test Array(a) == Array(b) 36 | 37 | asm = String(take!(copy(buf))) 38 | @test 
occursin("ld.global.nc", asm) 39 | 40 | 41 | function copy_const(A, _B) 42 | B = Base.Experimental.Const(_B) 43 | i = threadIdx().x 44 | if i <= length(A) 45 | @inbounds A[i] = B[i] 46 | end 47 | return 48 | end 49 | 50 | x = CUDA.zeros(Float64, 32) 51 | y = CUDA.ones(Float64, length(x)) 52 | 53 | @cuda threads=length(x) copy_const(x, y) 54 | @test Array(x) == Array(y) 55 | end 56 | 57 | end 58 | -------------------------------------------------------------------------------- /lib/cutensor/CUTENSOR.jl: -------------------------------------------------------------------------------- 1 | module CUTENSOR 2 | 3 | using ..APIUtils 4 | 5 | using ..CUDA 6 | using ..CUDA: CUstream, cudaDataType 7 | using ..CUDA: libcutensor, @retry_reclaim, initialize_context 8 | 9 | using CEnum: @cenum 10 | 11 | 12 | const cudaDataType_t = cudaDataType 13 | 14 | # core library 15 | include("libcutensor_common.jl") 16 | include("error.jl") 17 | include("libcutensor.jl") 18 | 19 | # low-level wrappers 20 | include("tensor.jl") 21 | include("wrappers.jl") 22 | 23 | # high-level integrations 24 | include("interfaces.jl") 25 | 26 | # cache for created, but unused handles 27 | const idle_handles = HandleCache{CuContext,Base.RefValue{cutensorHandle_t}}() 28 | 29 | function handle() 30 | cuda = CUDA.active_state() 31 | 32 | # every task maintains library state per device 33 | LibraryState = @NamedTuple{handle::Base.RefValue{cutensorHandle_t}} 34 | states = get!(task_local_storage(), :CUTENSOR) do 35 | Dict{CuContext,LibraryState}() 36 | end::Dict{CuContext,LibraryState} 37 | 38 | # get library state 39 | @noinline function new_state(cuda) 40 | new_handle = pop!(idle_handles, cuda.context) do 41 | handle = Ref{cutensorHandle_t}() 42 | cutensorInit(handle) 43 | handle 44 | end 45 | 46 | finalizer(current_task()) do task 47 | push!(idle_handles, cuda.context, new_handle) do 48 | # CUTENSOR doesn't need to actively destroy its handle 49 | end 50 | end 51 | 52 | (; handle=new_handle) 53 | end 54 | state = get!(states, cuda.context) do 55 | new_state(cuda) 56 | end 57 | 58 | return state.handle 59 | end 60 | 61 | end 62 | -------------------------------------------------------------------------------- /examples/peakflops.jl: -------------------------------------------------------------------------------- 1 | using CUDA 2 | using CUDA: i32 3 | 4 | using Test 5 | 6 | "Dummy kernel doing 100 FMAs." 
7 | function kernel_100fma(a, b, c, out) 8 | i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x 9 | @inbounds if i <= length(out) 10 | a_val = a[i] 11 | b_val = b[i] 12 | c_val = c[i] 13 | 14 | for j in 1:33 15 | a_val = CUDA.fma(a_val, b_val, c_val) 16 | b_val = CUDA.fma(a_val, b_val, c_val) 17 | c_val = CUDA.fma(a_val, b_val, c_val) 18 | end 19 | 20 | out[i] = CUDA.fma(a_val, b_val, c_val) 21 | end 22 | 23 | return 24 | end 25 | 26 | function peakflops(n::Integer=5000, dev::CuDevice=CuDevice(0)) 27 | device!(dev) do 28 | dims = (n, n) 29 | a = round.(rand(Float32, dims) * 100) 30 | b = round.(rand(Float32, dims) * 100) 31 | c = round.(rand(Float32, dims) * 100) 32 | out = similar(a) 33 | 34 | d_a = CuArray(a) 35 | d_b = CuArray(b) 36 | d_c = CuArray(c) 37 | d_out = CuArray(out) 38 | 39 | len = prod(dims) 40 | 41 | kernel = @cuda launch=false kernel_100fma(d_a, d_b, d_c, d_out) 42 | config = launch_configuration(kernel.fun) 43 | threads = min(len, config.threads) 44 | blocks = cld(len, threads) 45 | 46 | # warm-up 47 | kernel(d_a, d_b, d_c, d_out) 48 | synchronize() 49 | 50 | secs = CUDA.@elapsed begin 51 | kernel(d_a, d_b, d_c, d_out; threads=threads, blocks=blocks) 52 | end 53 | flopcount = 200*len 54 | flops = flopcount / secs 55 | 56 | return flops 57 | end 58 | end 59 | 60 | println(peakflops()) 61 | -------------------------------------------------------------------------------- /src/device/pointer.jl: -------------------------------------------------------------------------------- 1 | # CUDA-specific operations on pointers with address spaces 2 | 3 | ## adrspace aliases 4 | 5 | export AS 6 | 7 | module AS 8 | 9 | const Generic = 0 10 | const Global = 1 11 | const Shared = 3 12 | const Constant = 4 13 | const Local = 5 14 | 15 | end 16 | 17 | 18 | ## ldg 19 | 20 | const LDGTypes = (UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, 21 | Float32, Float64) 22 | 23 | # TODO: this functionality should throw The libdevice library is a collection of NVVM bitcode functions that implement common 22 | # > functions for NVIDIA GPU devices, including math primitives and bit-manipulation 23 | # > functions. These functions are optimized for particular GPU architectures, and are 24 | # > intended to be linked with an NVVM IR module during compilation to PTX. 25 | include("intrinsics/math.jl") 26 | # TODO: native mathematical functions, CUDA C programming guide" > "C language extensions" 27 | # https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__DOUBLE.html 28 | # see /path/to/cuda/include/sm_20_intrinsics.h 29 | 30 | # functionality from libcudadevrt 31 | # 32 | # The libcudadevrt library is a collection of PTX bitcode functions that implement parts of 33 | # the CUDA API for execution on the device, such as device synchronization primitives, 34 | # dynamic kernel APIs, etc. 
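# For illustration, a host-side sketch of the dynamic parallelism this enables
# (illustrative names; requires a device and toolkit that support device-side launches):
#
#   child() = return
#   function parent()
#       @cuda dynamic=true child()   # device-side launch, serviced by libcudadevrt
#       return
#   end
#   @cuda parent()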
35 | using CEnum: @cenum 36 | include("intrinsics/libcudadevrt_common.jl") 37 | include("intrinsics/libcudadevrt.jl") 38 | include("intrinsics/cooperative_groups.jl") 39 | include("intrinsics/dynamic_parallelism.jl") 40 | -------------------------------------------------------------------------------- /test/broadcast.jl: -------------------------------------------------------------------------------- 1 | @testset "broadcast" begin 2 | @test testf((x) -> fill!(x, 1), rand(3,3)) 3 | @test testf((x, y) -> map(+, x, y), rand(2, 3), rand(2, 3)) 4 | @test testf((x) -> sin.(x), rand(2, 3)) 5 | @test testf((x) -> log.(x) .+ 1, rand(2, 3)) 6 | @test testf((x) -> 2x, rand(2, 3)) 7 | @test testf((x) -> x .^ 0, rand(2, 3)) 8 | @test testf((x) -> x .^ 1, rand(2, 3)) 9 | @test testf((x) -> x .^ 2, rand(2, 3)) 10 | @test testf((x) -> x .^ 3, rand(2, 3)) 11 | @test testf((x) -> x .^ 5, rand(2, 3)) 12 | @test testf((x) -> (z = Int32(5); x .^ z), rand(2, 3)) 13 | @test testf((x) -> (z = Float64(π); x .^ z), rand(2, 3)) 14 | @test testf((x) -> (z = Float32(π); x .^ z), rand(Float32, 2, 3)) 15 | @test testf((x, y) -> x .+ y, rand(2, 3), rand(1, 3)) 16 | @test testf((z, x, y) -> z .= x .+ y, rand(2, 3), rand(2, 3), rand(2)) 17 | @test (CuArray{Ptr{Cvoid}}(undef, 1) .= C_NULL) == CuArray([C_NULL]) 18 | @test CuArray([1,2,3]) .+ CuArray([1.0,2.0,3.0]) == CuArray([2,4,6]) 19 | 20 | @eval struct Whatever{T} 21 | x::Int 22 | end 23 | @test Array(Whatever{Int}.(CuArray([1]))) == Whatever{Int}.([1]) 24 | end 25 | 26 | # https://github.com/JuliaGPU/CUDA.jl/issues/223 27 | @testset "Ref Broadcast" begin 28 | foobar(idx, A) = A[idx] 29 | @test CuArray([42]) == foobar.(CuArray([1]), Base.RefValue(CuArray([42]))) 30 | end 31 | 32 | @testset "Broadcast Fix" begin 33 | @test testf(x -> log.(x), rand(3,3)) 34 | @test testf((x,xs) -> log.(x.+xs), Ref(1), rand(3,3)) 35 | end 36 | 37 | # https://github.com/JuliaGPU/CUDA.jl/issues/261 38 | @testset "Broadcast Ref{<:Type}" begin 39 | A = CuArray{ComplexF64}(undef, (2,2)) 40 | @test eltype(convert.(ComplexF32, A)) == ComplexF32 41 | end 42 | -------------------------------------------------------------------------------- /lib/cutensor/interfaces.jl: -------------------------------------------------------------------------------- 1 | # interfacing with other packages 2 | 3 | ## Base 4 | 5 | function Base.:(+)(A::CuTensor, B::CuTensor) 6 | α = convert(eltype(A), 1.0) 7 | γ = convert(eltype(B), 1.0) 8 | C = similar(B) 9 | elementwiseBinary!(α, A, CUTENSOR_OP_IDENTITY, γ, B, CUTENSOR_OP_IDENTITY, C, CUTENSOR_OP_ADD) 10 | end 11 | 12 | function Base.:(-)(A::CuTensor, B::CuTensor) 13 | α = convert(eltype(A), 1.0) 14 | γ = convert(eltype(B), -1.0) 15 | C = similar(B) 16 | elementwiseBinary!(α, A, CUTENSOR_OP_IDENTITY, γ, B, CUTENSOR_OP_IDENTITY, C, CUTENSOR_OP_ADD) 17 | end 18 | 19 | function Base.:(*)(A::CuTensor, B::CuTensor) 20 | tC = promote_type(eltype(A), eltype(B)) 21 | A_uniqs = [(idx, i) for (idx, i) in enumerate(A.inds) if !(i in B.inds)] 22 | B_uniqs = [(idx, i) for (idx, i) in enumerate(B.inds) if !(i in A.inds)] 23 | A_sizes = map(x->size(A,x[1]), A_uniqs) 24 | B_sizes = map(x->size(B,x[1]), B_uniqs) 25 | A_inds = map(x->Char(x[2]), A_uniqs) 26 | B_inds = map(x->Char(x[2]), B_uniqs) 27 | C = CuTensor(CUDA.zeros(tC, Dims(vcat(A_sizes, B_sizes))), vcat(A_inds, B_inds)) 28 | return mul!(C, A, B) 29 | end 30 | 31 | 32 | ## LinearAlgebra 33 | 34 | using LinearAlgebra 35 | 36 | LinearAlgebra.axpy!(a, X::CuTensor, Y::CuTensor) = elementwiseBinary!(a, X, CUTENSOR_OP_IDENTITY, 
one(eltype(Y)), Y, CUTENSOR_OP_IDENTITY, similar(Y), CUTENSOR_OP_ADD) 37 | LinearAlgebra.axpby!(a, X::CuTensor, b, Y::CuTensor) = elementwiseBinary!(a, X, CUTENSOR_OP_IDENTITY, b, Y, CUTENSOR_OP_IDENTITY, similar(Y), CUTENSOR_OP_ADD) 38 | 39 | function LinearAlgebra.mul!(C::CuTensor, A::CuTensor, B::CuTensor) 40 | contraction!(one(eltype(C)), A.data, A.inds, CUTENSOR_OP_IDENTITY, B.data, B.inds, CUTENSOR_OP_IDENTITY, zero(eltype(C)), C.data, C.inds, CUTENSOR_OP_IDENTITY, CUTENSOR_OP_IDENTITY) 41 | return C 42 | end 43 | -------------------------------------------------------------------------------- /test/cudnn/softmax.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CUDNN: 2 | cudnnSoftmaxForward, 3 | cudnnSoftmaxForward!, 4 | cudnnSoftmaxBackward, 5 | cudnnSoftmaxAlgorithm_t, 6 | CUDNN_SOFTMAX_FAST, # 0, /* straightforward implementation */ 7 | CUDNN_SOFTMAX_ACCURATE, # 1, /* subtract max from every point to avoid overflow */ 8 | CUDNN_SOFTMAX_LOG, # 2 9 | cudnnSoftmaxMode_t, 10 | CUDNN_SOFTMAX_MODE_INSTANCE, # 0, /* compute the softmax over all C, H, W for each N */ 11 | CUDNN_SOFTMAX_MODE_CHANNEL # 1 /* compute the softmax over all C for each H, W, N */ 12 | 13 | @testset "cudnn/softmax" begin 14 | ax,ay = randn(Float32,10,10),randn(Float32,10,10) 15 | cx,cy = CuArray.((ax,ay)) 16 | 17 | function softmaxtest(; 18 | alpha=1, 19 | beta=0, 20 | mode=CUDNN_SOFTMAX_MODE_INSTANCE, 21 | algo=CUDNN_SOFTMAX_FAST 22 | ) 23 | d = mode === CUDNN_SOFTMAX_MODE_INSTANCE ? 1 : 2 24 | x = ax .- maximum(ax, dims=d) 25 | y = x .- log.(sum(exp.(x), dims=d)) 26 | if algo !== CUDNN_SOFTMAX_LOG; y = exp.(y); end 27 | add1(x)=reshape(x, (size(x)..., 1)) 28 | if mode === CUDNN_SOFTMAX_MODE_CHANNEL 29 | y,cx1,cy1 = add1.((y,cx,cy)) 30 | else 31 | cx1,cy1 = cx,cy 32 | end 33 | y0 = alpha * y 34 | y1 = y0 .+ beta * ay 35 | @test y0 ≈ cudnnSoftmaxForward(cx1; algo, mode, alpha) |> Array 36 | @test y1 ≈ cudnnSoftmaxForward!(copy(cy1), cx1; algo, mode, alpha, beta) |> Array 37 | end 38 | 39 | softmaxtest() 40 | softmaxtest(alpha=2) 41 | softmaxtest(beta=2) 42 | softmaxtest(mode=CUDNN_SOFTMAX_MODE_INSTANCE) 43 | softmaxtest(mode=CUDNN_SOFTMAX_MODE_CHANNEL) 44 | softmaxtest(algo=CUDNN_SOFTMAX_FAST) 45 | softmaxtest(algo=CUDNN_SOFTMAX_ACCURATE) 46 | softmaxtest(algo=CUDNN_SOFTMAX_LOG) 47 | end 48 | -------------------------------------------------------------------------------- /test/nvml.jl: -------------------------------------------------------------------------------- 1 | using CUDA.NVML 2 | 3 | macro maybe_unsupported(ex) 4 | quote 5 | try 6 | $(esc(ex)) 7 | catch err 8 | (isa(err, NVML.NVMLError) && err.code == NVML.ERROR_NOT_SUPPORTED) || rethrow() 9 | end 10 | end 11 | end 12 | 13 | @testset "system" begin 14 | @test NVML.version() isa VersionNumber 15 | @test NVML.driver_version() isa VersionNumber 16 | @test NVML.cuda_driver_version() == CUDA.version() 17 | end 18 | 19 | @testset "devices" begin 20 | let dev = NVML.Device(0) 21 | @test dev == first(NVML.devices()) 22 | @test NVML.index(dev) == 0 23 | 24 | str = sprint(io->show(io, "text/plain", dev)) 25 | @test occursin("NVML.Device(0)", str) 26 | end 27 | 28 | cuda_dev = CuDevice(0) 29 | mig = uuid(cuda_dev) != parent_uuid(cuda_dev) 30 | 31 | # tests for the parent device 32 | let dev = NVML.Device(parent_uuid(cuda_dev)) 33 | @test NVML.uuid(dev) == parent_uuid(cuda_dev) 34 | NVML.brand(dev) 35 | @test occursin(NVML.name(dev), name(cuda_dev)) 36 | @maybe_unsupported NVML.serial(dev) 37 | 38 | 
@maybe_unsupported NVML.power_usage(dev) 39 | @maybe_unsupported NVML.energy_consumption(dev) 40 | 41 | @maybe_unsupported NVML.utilization_rates(dev) 42 | 43 | NVML.compute_mode(dev) 44 | @test NVML.compute_capability(dev) == capability(cuda_dev) 45 | end 46 | 47 | # tests for the compute instance 48 | let dev = NVML.Device(uuid(cuda_dev); mig) 49 | @test NVML.uuid(dev) == uuid(cuda_dev) 50 | @test NVML.name(dev) == name(cuda_dev) 51 | 52 | NVML.memory_info(dev) 53 | 54 | context() 55 | # FIXME: https://github.com/NVIDIA/gpu-monitoring-tools/issues/63 56 | #@test getpid() in keys(NVML.compute_processes(dev)) 57 | NVML.compute_processes(dev) 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /docs/src/installation/troubleshooting.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | 3 | 4 | ## Could not find a suitable CUDA installation 5 | 6 | This means that CUDA.jl could not find or provide a CUDA toolkit. For more information, 7 | re-run with the `JULIA_DEBUG` environment variable set to `CUDA`. 8 | 9 | If you're encountering this error when disabling artifacts through by setting 10 | `JULIA_CUDA_USE_BINARYBUILDER=false`, it is your own responsibility to make sure CUDA.jl 11 | can detect the necessary pieces, e.g., by putting CUDA's binaries and libraries in 12 | discoverable locations (i.e. on PATH, and on the library search path). Additionally, the 13 | `CUDA_HOME` environment can be used to point CUDA.jl to where the CUDA toolkit is installed, 14 | but that will only help if the contents of that directory have not been reorganized. 15 | 16 | 17 | ## UNKNOWN_ERROR(999) 18 | 19 | If you encounter this error, there are several known issues that may be causing it: 20 | 21 | - a mismatch between the CUDA driver and driver library: on Linux, look for clues in `dmesg` 22 | - the CUDA driver is in a bad state: this can happen after resume. **Try rebooting**. 23 | 24 | Generally though, it's impossible to say what's the reason for the error, but Julia is 25 | likely not to blame. Make sure your set-up works (e.g., try executing `nvidia-smi`, a CUDA C 26 | binary, etc), and if everything looks good file an issue. 27 | 28 | 29 | ## NVML library not found (on Windows) 30 | 31 | Check and make sure the `NVSMI` folder is in your `PATH`. By default it may not be. Look in 32 | `C:\Program Files\NVIDIA Corporation` for the `NVSMI` folder - you should see `nvml.dll` 33 | within it. You can add this folder to your `PATH` and check that `nvidia-smi` runs properly. 34 | 35 | 36 | ## The specified module could not be found (on Windows) 37 | 38 | Ensure the [Visual C++ Redistributable](https://aka.ms/vs/16/release/vc_redist.x64.exe) is 39 | installed. 40 | -------------------------------------------------------------------------------- /test/threading.jl: -------------------------------------------------------------------------------- 1 | # FIXME: these tests regularly triggers illegal memory accesses 2 | # after having moved to distributed test execution, 3 | # regardless of the memory pool or system. 
4 | 5 | @testset "threaded execution" begin 6 | function kernel(a, tid, id) 7 | a[1] = tid 8 | a[2] = id 9 | return 10 | end 11 | 12 | test_lock = ReentrantLock() 13 | Threads.@threads for id in 1:10 14 | da = CuArray{Int}(undef, 2) 15 | tid = Threads.threadid() 16 | @cuda kernel(da, tid, id) 17 | 18 | a = Array(da) 19 | lock(test_lock) do 20 | @test a == [tid, id] 21 | end 22 | end 23 | end 24 | 25 | @testset "threaded arrays" begin 26 | test_lock = ReentrantLock() 27 | Threads.@threads for i in 1:Threads.nthreads()*100 28 | # uses libraries (rand, gemm) to test library handles 29 | # allocates and uses unsafe_free to cover the allocator 30 | da = CUDA.rand(64, 64) 31 | db = CUDA.rand(64, 64) 32 | yield() 33 | dc = da * db 34 | yield() 35 | 36 | # @testset is not thread safe 37 | a = Array(da) 38 | b = Array(db) 39 | c = Array(dc) 40 | lock(test_lock) do 41 | @test c ≈ a * b 42 | end 43 | 44 | yield() 45 | CUDA.unsafe_free!(da) 46 | CUDA.unsafe_free!(db) 47 | end 48 | end 49 | 50 | @testset "threaded device usage" begin 51 | test_lock = ReentrantLock() 52 | Threads.@threads for i in 1:Threads.nthreads()*100 53 | dev = rand(1:length(devices())) 54 | device!(dev-1) do 55 | da = CUDA.rand(64, 64) 56 | db = CUDA.rand(64, 64) 57 | yield() 58 | dc = da * (db .* 2) 59 | yield() 60 | 61 | a = Array(da) 62 | b = Array(db) 63 | c = Array(dc) 64 | lock(test_lock) do 65 | @test c ≈ a * (b .* 2) 66 | end 67 | 68 | yield() 69 | CUDA.unsafe_free!(da) 70 | CUDA.unsafe_free!(db) 71 | end 72 | end 73 | end 74 | -------------------------------------------------------------------------------- /lib/cudnn/util.jl: -------------------------------------------------------------------------------- 1 | # For low level cudnn functions that require a pointer to a number 2 | cptr(x,a::DenseCuArray{Float64})=Float64[x] 3 | cptr(x,a::DenseCuArray{Float32})=Float32[x] 4 | cptr(x,a::DenseCuArray{Float16})=Float32[x] 5 | 6 | # Conversion between Julia and CUDNN datatypes 7 | cudnnDataType(::Type{Float16})=CUDNN_DATA_HALF 8 | cudnnDataType(::Type{Float32})=CUDNN_DATA_FLOAT 9 | cudnnDataType(::Type{Float64})=CUDNN_DATA_DOUBLE 10 | cudnnDataType(::Type{Int8}) = CUDNN_DATA_INT8 11 | cudnnDataType(::Type{UInt8}) = CUDNN_DATA_UINT8 12 | cudnnDataType(::Type{Int32}) = CUDNN_DATA_INT32 13 | # The following are 32-bit elements each composed of 4 8-bit integers, only supported with CUDNN_TENSOR_NCHW_VECT_C 14 | # CUDNN_DATA_INT8x4, 15 | # CUDNN_DATA_UINT8x4, 16 | # CUDNN_DATA_INT8x32, 17 | juliaDataType(a)=(a==CUDNN_DATA_HALF ? Float16 : 18 | a==CUDNN_DATA_FLOAT ? Float32 : 19 | a==CUDNN_DATA_DOUBLE ? Float64 : 20 | a==CUDNN_DATA_INT8 ? Int8 : 21 | a==CUDNN_DATA_UINT8 ? UInt8 : 22 | a==CUDNN_DATA_INT32 ? Int32 : error()) 23 | 24 | tuple_strides(A::Tuple) = _strides((1,), A) 25 | _strides(out::Tuple{Int}, A::Tuple{}) = () 26 | _strides(out::NTuple{N,Int}, A::NTuple{N}) where {N} = out 27 | @inline function _strides(out::NTuple{M,Int}, A::Tuple) where M 28 | _strides((out..., out[M]*A[M]), A) 29 | end 30 | 31 | # The storage data types for alpha and beta are: 32 | # float for HALF and FLOAT tensors, and 33 | # double for DOUBLE tensors. 34 | scalingParameter(T, val) = error("Unknown tensor type $T") 35 | scalingParameter(::Type{Float16}, val) = Ref{Float32}(val) 36 | scalingParameter(::Type{Float32}, val) = Ref{Float32}(val) 37 | scalingParameter(::Type{Float64}, val) = Ref{Float64}(val) 38 | 39 | 40 | # Create temporary reserveSpace. Use 128 to avoid alignment issues. 41 | function cudnnTempSpace(nbytes) 42 | nbytes == 0 ? 
nothing : CuArray{Int128}(undef, (nbytes-1)÷sizeof(Int128)+1) 43 | end 44 | -------------------------------------------------------------------------------- /src/gpuarrays.jl: -------------------------------------------------------------------------------- 1 | # GPUArrays.jl interface 2 | 3 | 4 | # 5 | # Device functionality 6 | # 7 | 8 | 9 | ## execution 10 | 11 | struct CuArrayBackend <: AbstractGPUBackend end 12 | 13 | struct CuKernelContext <: AbstractKernelContext end 14 | 15 | @inline function GPUArrays.launch_heuristic(::CuArrayBackend, f::F, args::Vararg{Any,N}; 16 | elements::Int, elements_per_thread::Int) where {F,N} 17 | kernel = @cuda launch=false f(CuKernelContext(), args...) 18 | 19 | # launching many large blocks) lowers performance, as observed with broadcast, so cap 20 | # the block size if we don't have a grid-stride kernel (which would keep the grid small) 21 | if elements_per_thread > 1 22 | launch_configuration(kernel.fun) 23 | else 24 | launch_configuration(kernel.fun; max_threads=256) 25 | end 26 | end 27 | 28 | @inline function GPUArrays.gpu_call(::CuArrayBackend, f::F, args::TT, threads::Int, 29 | blocks::Int; name::Union{String,Nothing}) where {F,TT} 30 | @cuda threads=threads blocks=blocks name=name f(CuKernelContext(), args...) 31 | end 32 | 33 | 34 | ## on-device 35 | 36 | # indexing 37 | 38 | GPUArrays.blockidx(ctx::CuKernelContext) = blockIdx().x 39 | GPUArrays.blockdim(ctx::CuKernelContext) = blockDim().x 40 | GPUArrays.threadidx(ctx::CuKernelContext) = threadIdx().x 41 | GPUArrays.griddim(ctx::CuKernelContext) = gridDim().x 42 | 43 | # memory 44 | 45 | @inline function GPUArrays.LocalMemory(::CuKernelContext, ::Type{T}, ::Val{dims}, ::Val{id} 46 | ) where {T, dims, id} 47 | ptr = CUDA._shmem(Val(id), T, Val(prod(dims))) 48 | CuDeviceArray(dims, reinterpret(LLVMPtr{T, AS.Shared}, ptr)) 49 | end 50 | 51 | # synchronization 52 | 53 | @inline GPUArrays.synchronize_threads(::CuKernelContext) = sync_threads() 54 | 55 | 56 | 57 | # 58 | # Host abstractions 59 | # 60 | 61 | GPUArrays.backend(::Type{<:CuArray}) = CuArrayBackend() 62 | -------------------------------------------------------------------------------- /src/device/utils.jl: -------------------------------------------------------------------------------- 1 | # helpers for writing device functionality 2 | 3 | # helper type for writing Int32 literals 4 | # TODO: upstream this 5 | struct Literal{T} end 6 | Base.:(*)(x, ::Type{Literal{T}}) where {T} = T(x) 7 | const i32 = Literal{Int32} 8 | 9 | # local method table for device functions 10 | @static if isdefined(Base.Experimental, Symbol("@overlay")) 11 | Base.Experimental.@MethodTable(method_table) 12 | else 13 | const method_table = nothing 14 | end 15 | 16 | # list of overrides (only for Julia 1.6) 17 | const overrides = Expr[] 18 | 19 | macro device_override(ex) 20 | code = quote 21 | $GPUCompiler.@override(CUDA.method_table, $ex) 22 | end 23 | if isdefined(Base.Experimental, Symbol("@overlay")) 24 | return esc(code) 25 | else 26 | push!(overrides, code) 27 | return 28 | end 29 | end 30 | 31 | macro device_function(ex) 32 | ex = macroexpand(__module__, ex) 33 | def = splitdef(ex) 34 | 35 | # generate a function that errors 36 | def[:body] = quote 37 | error("This function is not intended for use on the CPU") 38 | end 39 | 40 | esc(quote 41 | $(combinedef(def)) 42 | @device_override $ex 43 | end) 44 | end 45 | 46 | macro device_functions(ex) 47 | ex = macroexpand(__module__, ex) 48 | 49 | # recursively prepend `@device_function` to all function definitions 
50 | function rewrite(block) 51 | out = Expr(:block) 52 | for arg in block.args 53 | if Meta.isexpr(arg, :block) 54 | # descend in blocks 55 | push!(out.args, rewrite(arg)) 56 | elseif Meta.isexpr(arg, [:function, :(=)]) 57 | # rewrite function definitions 58 | push!(out.args, :(@device_function $arg)) 59 | else 60 | # preserve all the rest 61 | push!(out.args, arg) 62 | end 63 | end 64 | out 65 | end 66 | 67 | esc(rewrite(ex)) 68 | end 69 | -------------------------------------------------------------------------------- /lib/nvtx/highlevel.jl: -------------------------------------------------------------------------------- 1 | # 2 | # domains 3 | # 4 | 5 | export Domain, domain 6 | 7 | struct Domain 8 | handle::nvtxDomainHandle_t 9 | 10 | function Domain(name::String) 11 | handle = nvtxDomainCreateA(name) 12 | new(handle) 13 | end 14 | end 15 | 16 | Base.unsafe_convert(::Type{nvtxDomainHandle_t}, dom::Domain) = dom.handle 17 | 18 | unsafe_destroy!(dom::Domain) = nvtxDomainDestroy(dom) 19 | 20 | function Domain(f::Function, name::String) 21 | dom = Domain(name) 22 | f(dom) 23 | unsafe_destroy!(dom) 24 | end 25 | 26 | 27 | # 28 | # markers 29 | # 30 | 31 | export mark 32 | 33 | mark(msg::String) = nvtxMarkA(msg) 34 | 35 | 36 | # 37 | # ranges 38 | # 39 | 40 | export Range, start_range, stop_range, @range 41 | 42 | struct Range 43 | id::nvtxRangeId_t 44 | end 45 | 46 | Base.convert(::Type{nvtxRangeId_t}, range::Range) = range.id 47 | 48 | """ 49 | start_range(msg) 50 | 51 | Create and start a new range. The range is not automatically stopped, use 52 | [`end_range(::Range)`](@ref) for that. 53 | 54 | Use this API if you need overlapping ranges, for scope-based use [`@range`](@ref) instead. 55 | """ 56 | start_range(msg::String) = nvtxRangeStartA(msg) 57 | end_range(r::Range) = nvtxRangeEnd(r) 58 | 59 | push_range(msg::String) = nvtxRangePushA(msg) 60 | pop_range() = nvtxRangePop() 61 | 62 | """ 63 | @range "msg" ex 64 | @range function ... end 65 | 66 | Create a new range and execute `ex`. The range is popped automatically afterwards. 67 | 68 | See also: [`range`](@ref) 69 | """ 70 | macro range(msg, ex) 71 | quote 72 | push_range($(esc(msg))) 73 | local ret = $(esc(ex)) 74 | pop_range() 75 | ret 76 | end 77 | end 78 | macro range(ex) 79 | def = splitdef(ex) 80 | def[:body] = quote 81 | $push_range($(string(def[:name]))) 82 | try 83 | $(def[:body]) 84 | finally 85 | $pop_range() 86 | end 87 | end 88 | esc(combinedef(def)) 89 | end 90 | -------------------------------------------------------------------------------- /lib/utils/threading.jl: -------------------------------------------------------------------------------- 1 | export @spinlock, @lock, LazyInitialized 2 | 3 | const var"@lock" = Base.var"@lock" 4 | 5 | # a safe way to acquire locks from finalizers, where we can't wait (which switches tasks) 6 | macro spinlock(l, ex) 7 | quote 8 | temp = $(esc(l)) 9 | while !trylock(temp) 10 | ccall(:jl_cpu_pause, Cvoid, ()) 11 | # Temporary solution before we have gc transition support in codegen. 12 | ccall(:jl_gc_safepoint, Cvoid, ()) 13 | # we can't yield here 14 | end 15 | try 16 | $(esc(ex)) 17 | finally 18 | unlock(temp) 19 | end 20 | end 21 | end 22 | 23 | 24 | """ 25 | LazyInitialized{T}() 26 | 27 | A thread-safe, lazily-initialized wrapper for a value of type `T`. Initialize and fetch the 28 | value by calling `get!`. The constructor is ensured to only be called once. 29 | 30 | This type is intended for lazy initialization of e.g. global structures, without using 31 | `__init__`. 
It is similar to protecting accesses using a lock, but is much cheaper. 32 | 33 | """ 34 | struct LazyInitialized{T} 35 | # 0: uninitialized 36 | # 1: initializing 37 | # 2: initialized 38 | guard::Threads.Atomic{Int} 39 | value::Base.RefValue{T} 40 | 41 | LazyInitialized{T}() where {T} = 42 | new(Threads.Atomic{Int}(0), Ref{T}()) 43 | end 44 | 45 | function Base.get!(constructor, x::LazyInitialized; hook=nothing) 46 | while x.guard[] != 2 47 | initialize!(x, constructor, hook) 48 | end 49 | assume(isassigned(x.value)) # to get rid of the check 50 | x.value[] 51 | end 52 | 53 | @noinline function initialize!(x::LazyInitialized, constructor::F1, hook::F2) where {F1, F2} 54 | status = Threads.atomic_cas!(x.guard, 0, 1) 55 | if status == 0 56 | try 57 | x.value[] = constructor() 58 | x.guard[] = 2 59 | catch 60 | x.guard[] = 0 61 | rethrow() 62 | end 63 | 64 | if hook !== nothing 65 | hook() 66 | end 67 | else 68 | yield() 69 | end 70 | return 71 | end 72 | -------------------------------------------------------------------------------- /test/cutensor/permutations.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CUTENSOR 2 | using CUDA 3 | using LinearAlgebra 4 | 5 | # using host memory with CUTENSOR doesn't work on Windows 6 | can_pin = !Sys.iswindows() 7 | 8 | eltypes = ((Float16, Float16), 9 | #(Float16, Float32), 10 | (Float32, Float32), 11 | #(Float32, Float64), 12 | (Float64, Float64), 13 | #(ComplexF16, ComplexF16), 14 | #(ComplexF16, ComplexF32), 15 | (ComplexF32, ComplexF32), 16 | #(ComplexF32, ComplexF64), 17 | (ComplexF64, ComplexF64)) 18 | @testset for N=2:5 19 | @testset for (eltyA, eltyC) in eltypes 20 | # setup 21 | dmax = 2^div(18,N) 22 | dims = rand(2:dmax, N) 23 | p = randperm(N) 24 | indsA = collect(('a':'z')[1:N]) 25 | indsC = indsA[p] 26 | dimsA = dims 27 | dimsC = dims[p] 28 | A = rand(eltyA, dimsA...) 29 | can_pin && Mem.pin(A) 30 | dA = CuArray(A) 31 | dC = similar(dA, eltyC, dimsC...) 32 | 33 | # simple case 34 | dC = CUTENSOR.permutation!(one(eltyA), dA, indsA, dC, indsC) 35 | C = collect(dC) 36 | @test C == permutedims(A, p) # exact equality 37 | if can_pin 38 | Csimple = zeros(eltyC, dimsC...) 39 | Mem.pin(Csimple) 40 | Csimple = CUDA.@sync CUTENSOR.permutation!(one(eltyA), A, indsA, Csimple, indsC) 41 | @test Csimple == permutedims(A, p) # exact equality 42 | end 43 | 44 | # with scalar 45 | α = rand(eltyA) 46 | dC = CUTENSOR.permutation!(α, dA, indsA, dC, indsC) 47 | C = collect(dC) 48 | @test C ≈ α * permutedims(A, p) # approximate, floating point rounding 49 | if can_pin 50 | Cscalar = zeros(eltyC, dimsC...) 51 | Mem.pin(Cscalar) 52 | Cscalar = CUDA.@sync CUTENSOR.permutation!(α, A, indsA, Cscalar, indsC) 53 | @test Cscalar ≈ α * permutedims(A, p) # approximate, floating point rounding 54 | end 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Sanity checks (read this first, then remove this section)** 11 | 12 | - [ ] Make sure you're reporting *a bug*; for general questions, please use Discourse or 13 | Slack. 14 | 15 | - [ ] If you're dealing with a performance issue, make sure you **disable scalar iteration** 16 | (`CUDA.allowscalar(false)`). 
Only file an issue if that shows scalar iteration happening 17 | in CUDA.jl or Base Julia, as opposed to your own code. 18 | 19 | - [ ] If you're seeing an error message, **follow the error message instructions**, if any 20 | (e.g. `inspect code with @device_code_warntype`). If you can't solve the problem using 21 | that information, make sure to post it as part of the issue. 22 | 23 | - [ ] Always ensure you're using the latest version of CUDA.jl, and if possible, please 24 | check the master branch to see if your issue hasn't been resolved yet. 25 | 26 | If your bug is still valid, please go ahead and fill out the template below. 27 | 28 | 29 | **Describe the bug** 30 | 31 | A clear and concise description of what the bug is. 32 | 33 | 34 | **To reproduce** 35 | 36 | The Minimal Working Example (MWE) for this bug: 37 | 38 | ```julia 39 | # some code here 40 | ``` 41 | 42 |
<details><summary>Manifest.toml</summary> 43 | 44 | 45 | ``` 46 | Paste your Manifest.toml here, or accurately describe which version of CUDA.jl and its dependencies (GPUArrays.jl, GPUCompiler.jl, LLVM.jl) you are using. 47 | ``` 48 | 49 | 50 | </details>
51 | 52 | 53 | **Expected behavior** 54 | 55 | A clear and concise description of what you expected to happen. 56 | 57 | 58 | **Version info** 59 | 60 | Details on Julia: 61 | 62 | ``` 63 | # please post the output of: 64 | versioninfo() 65 | ``` 66 | 67 | Details on CUDA: 68 | 69 | ``` 70 | # please post the output of: 71 | CUDA.versioninfo() 72 | ``` 73 | 74 | 75 | **Additional context** 76 | 77 | Add any other context about the problem here. 78 | -------------------------------------------------------------------------------- /lib/nvml/libnvml_deprecated.jl: -------------------------------------------------------------------------------- 1 | ## Deprecated in CUDA 11.1 2 | 3 | struct nvmlDeviceAttributesV1_st 4 | multiprocessorCount::UInt32 5 | sharedCopyEngineCount::UInt32 6 | sharedDecoderCount::UInt32 7 | sharedEncoderCount::UInt32 8 | sharedJpegCount::UInt32 9 | sharedOfaCount::UInt32 10 | end 11 | 12 | const nvmlDeviceAttributesV1_t = nvmlDeviceAttributesV1_st 13 | 14 | @checked function nvmlDeviceGetAttributes(device, attributes) 15 | initialize_context() 16 | ccall((:nvmlDeviceGetAttributes, libnvml()), nvmlReturn_t, 17 | (nvmlDevice_t, Ptr{nvmlDeviceAttributesV1_t}), 18 | device, attributes) 19 | end 20 | 21 | struct nvmlProcessInfoV1_st 22 | pid::UInt32 23 | usedGpuMemory::Culonglong 24 | end 25 | 26 | const nvmlProcessInfoV1_t = nvmlProcessInfoV1_st 27 | 28 | @checked function nvmlDeviceGetComputeRunningProcesses(device, infoCount, infos) 29 | initialize_context() 30 | ccall((:nvmlDeviceGetComputeRunningProcesses, libnvml()), nvmlReturn_t, 31 | (nvmlDevice_t, Ptr{UInt32}, Ptr{nvmlProcessInfoV1_t}), 32 | device, infoCount, infos) 33 | end 34 | 35 | @checked function nvmlDeviceGetGraphicsRunningProcesses(device, infoCount, infos) 36 | initialize_context() 37 | ccall((:nvmlDeviceGetGraphicsRunningProcesses, libnvml()), nvmlReturn_t, 38 | (nvmlDevice_t, Ptr{UInt32}, Ptr{nvmlProcessInfoV1_t}), 39 | device, infoCount, infos) 40 | end 41 | 42 | ## Superseded in CUDA 11.2 43 | 44 | struct nvmlComputeInstanceInfoV1_st 45 | device::nvmlDevice_t 46 | gpuInstance::nvmlGpuInstance_t 47 | id::UInt32 48 | profileId::UInt32 49 | end 50 | 51 | const nvmlComputeInstanceInfoV1_t = nvmlComputeInstanceInfoV1_st 52 | 53 | @checked function nvmlComputeInstanceGetInfo(computeInstance, info) 54 | initialize_context() 55 | ccall((:nvmlComputeInstanceGetInfo, libnvml()), nvmlReturn_t, 56 | (nvmlComputeInstance_t, Ptr{nvmlComputeInstanceInfoV1_t}), 57 | computeInstance, info) 58 | end 59 | 60 | ## 61 | -------------------------------------------------------------------------------- /src/device/intrinsics/version.jl: -------------------------------------------------------------------------------- 1 | # device intrinsics for querying the compute SimpleVersion and PTX ISA version 2 | 3 | 4 | ## a GPU-compatible version number 5 | 6 | export SimpleVersion, @sv_str 7 | 8 | struct SimpleVersion 9 | major::UInt32 10 | minor::UInt32 11 | 12 | SimpleVersion(major, minor=0) = new(major, minor) 13 | end 14 | 15 | function Base.tryparse(::Type{SimpleVersion}, v::AbstractString) 16 | parts = split(v, ".") 17 | 1 <= length(parts) <= 2 || return nothing 18 | 19 | int_parts = map(parts) do part 20 | tryparse(Int, part) 21 | end 22 | any(isnothing, int_parts) && return nothing 23 | 24 | SimpleVersion(int_parts...) 
25 | end 26 | 27 | function Base.parse(::Type{SimpleVersion}, v::AbstractString) 28 | ver = tryparse(SimpleVersion, v) 29 | ver === nothing && throw(ArgumentError("invalid SimpleVersion string: '$v'")) 30 | return ver 31 | end 32 | 33 | SimpleVersion(v::AbstractString) = parse(SimpleVersion, v) 34 | 35 | @inline function Base.isless(a::SimpleVersion, b::SimpleVersion) 36 | (a.major < b.major) && return true 37 | (a.major > b.major) && return false 38 | (a.minor < b.minor) && return true 39 | (a.minor > b.minor) && return false 40 | return false 41 | end 42 | 43 | macro sv_str(str) 44 | SimpleVersion(str) 45 | end 46 | 47 | 48 | ## accessors for the compute SimpleVersion and PTX ISA version 49 | 50 | export compute_capability, ptx_isa_version 51 | 52 | for var in ["sm_major", "sm_minor", "ptx_major", "ptx_minor"] 53 | @eval @inline $(Symbol(var))() = 54 | Base.llvmcall( 55 | $("""@$var = external global i32 56 | define i32 @entry() #0 { 57 | %val = load i32, i32* @$var 58 | ret i32 %val 59 | } 60 | attributes #0 = { alwaysinline } 61 | """, "entry"), UInt32, Tuple{}) 62 | end 63 | 64 | @device_function @inline compute_capability() = SimpleVersion(sm_major(), sm_minor()) 65 | @device_function @inline ptx_isa_version() = SimpleVersion(ptx_major(), ptx_minor()) 66 | 67 | -------------------------------------------------------------------------------- /lib/cusolver/error.jl: -------------------------------------------------------------------------------- 1 | export CUSOLVERError 2 | 3 | struct CUSOLVERError <: Exception 4 | code::cusolverStatus_t 5 | end 6 | 7 | Base.convert(::Type{cusolverStatus_t}, err::CUSOLVERError) = err.code 8 | 9 | Base.showerror(io::IO, err::CUSOLVERError) = 10 | print(io, "CUSOLVERError: ", description(err), " (code $(reinterpret(Int32, err.code)), $(name(err)))") 11 | 12 | name(err::CUSOLVERError) = string(err.code) 13 | 14 | ## COV_EXCL_START 15 | function description(err) 16 | if err.code == CUSOLVER_STATUS_SUCCESS 17 | "the operation completed successfully" 18 | elseif err.code == CUSOLVER_STATUS_NOT_INITIALIZED 19 | "the library was not initialized" 20 | elseif err.code == CUSOLVER_STATUS_ALLOC_FAILED 21 | "the resource allocation failed" 22 | elseif err.code == CUSOLVER_STATUS_INVALID_VALUE 23 | "an invalid value was used as an argument" 24 | elseif err.code == CUSOLVER_STATUS_ARCH_MISMATCH 25 | "an absent device architectural feature is required" 26 | elseif err.code == CUSOLVER_STATUS_EXECUTION_FAILED 27 | "the GPU program failed to execute" 28 | elseif err.code == CUSOLVER_STATUS_INTERNAL_ERROR 29 | "an internal operation failed" 30 | elseif err.code == CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED 31 | "the matrix type is not supported." 32 | else 33 | "no description for this error" 34 | end 35 | end 36 | ## COV_EXCL_STOP 37 | 38 | 39 | ## API call wrapper 40 | 41 | # outlined functionality to avoid GC frame allocation 42 | @noinline function throw_api_error(res) 43 | if res == CUSOLVER_STATUS_ALLOC_FAILED 44 | throw(OutOfGPUMemoryError()) 45 | else 46 | throw(CUSOLVERError(res)) 47 | end 48 | end 49 | 50 | macro check(ex, errs...) 
51 | check = :(isequal(err, CUSOLVER_STATUS_ALLOC_FAILED)) 52 | for err in errs 53 | check = :($check || isequal(err, $(esc(err)))) 54 | end 55 | 56 | quote 57 | res = @retry_reclaim err->$check $(esc(ex)) 58 | if res != CUSOLVER_STATUS_SUCCESS 59 | throw_api_error(res) 60 | end 61 | 62 | nothing 63 | end 64 | end 65 | -------------------------------------------------------------------------------- /docs/src/tutorials/custom_structs.jl: -------------------------------------------------------------------------------- 1 | # # Using custom structs 2 | # 3 | # This tutorial shows how to use custom structs on the GPU. Our example will be a one dimensional 4 | # interpolation. Lets start with the CPU version: 5 | using CUDA 6 | 7 | struct Interpolate{A} 8 | xs::A 9 | ys::A 10 | end 11 | 12 | function (itp::Interpolate)(x) 13 | i = searchsortedfirst(itp.xs, x) 14 | i = clamp(i, firstindex(itp.ys), lastindex(itp.ys)) 15 | @inbounds itp.ys[i] 16 | end 17 | 18 | xs_cpu = [1.0, 2.0, 3.0] 19 | ys_cpu = [10.0,20.0,30.0] 20 | itp_cpu = Interpolate(xs_cpu, ys_cpu) 21 | pts_cpu = [1.1,2.3] 22 | result_cpu = itp_cpu.(pts_cpu) 23 | 24 | # Ok the CPU code works, let's move our data to the GPU: 25 | itp = Interpolate(CuArray(xs_cpu), CuArray(ys_cpu)) 26 | pts = CuArray(pts_cpu); 27 | # If we try to call our interpolate `itp.(pts)`, we get an error however: 28 | # ``` 29 | # ... 30 | # KernelError: passing and using non-bitstype argument 31 | # ... 32 | # ``` 33 | # Why does it throw an error? Our calculation involves 34 | # a custom type `Interpolate{CuArray{Float64, 1}}`. 35 | # At the end of the day all arguments of a CUDA kernel need to 36 | # be bitstypes. However we have 37 | isbitstype(typeof(itp)) 38 | # How to fix this? The answer is, that there is a conversion mechanism, which adapts objects into 39 | # CUDA compatible bitstypes. 40 | # It is based on the [Adapt.jl](https://github.com/JuliaGPU/Adapt.jl) package and basic types like `CuArray` already participate in this mechanism. For custom types, 41 | # we just need to add a conversion rule like so: 42 | import Adapt 43 | function Adapt.adapt_structure(to, itp::Interpolate) 44 | xs = Adapt.adapt_structure(to, itp.xs) 45 | ys = Adapt.adapt_structure(to, itp.ys) 46 | Interpolate(xs, ys) 47 | end 48 | # Now our struct plays nicely with CUDA.jl: 49 | result = itp.(pts) 50 | # It works, we get the same result as on the CPU. 51 | @assert CuArray(result_cpu) == result 52 | # Alternatively instead of defining `Adapt.adapt_structure` explictly, we could have done 53 | # ```julia 54 | # Adapt.@adapt_structure Interpolate 55 | # ``` 56 | # which expands to the same code that we wrote manually. 
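# As a quick sanity check (a sketch, not part of the original tutorial: it assumes CUDA.jl's
# `cudaconvert`, which applies these Adapt rules when arguments are passed to a kernel), the
# adapted object should now be a bitstype:
# ```julia
# isbitstype(typeof(cudaconvert(itp)))  # true once the `adapt_structure` rule is defined
# ```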
57 | -------------------------------------------------------------------------------- /lib/cublas/README.md: -------------------------------------------------------------------------------- 1 | # CUBLAS implementation progress 2 | 3 | The following sections list the CUBLAS functions shown on the CUBLAS 4 | documentation page: 5 | 6 | http://docs.nvidia.com/cuda/cublas/index.html 7 | 8 | ## Level 1 (13 functions) 9 | 10 | CUBLAS functions: 11 | 12 | * [x] amax 13 | * [x] amin 14 | * [x] asum 15 | * [x] axpy 16 | * [x] copy 17 | * [x] dot, dotc, dotu 18 | * [x] nrm2 19 | * [ ] rot (not implemented in julia blas.jl) 20 | * [ ] rotg (not implemented in julia blas.jl) 21 | * [ ] rotm (not implemented in julia blas.jl) 22 | * [ ] rotmg (not implemented in julia blas.jl) 23 | * [x] scal 24 | * [ ] swap (not implemented in julia blas.jl) 25 | 26 | ## Level 2 27 | 28 | Key: 29 | * `ge`: general 30 | * `gb`: general banded 31 | * `sy`: symmetric 32 | * `sb`: symmetric banded 33 | * `sp`: symmetric packed 34 | * `tr`: triangular 35 | * `tb`: triangular banded 36 | * `tp`: triangular packed 37 | * `he`: hermitian 38 | * `hb`: hermitian banded 39 | * `hp`: hermitian packed 40 | 41 | CUBLAS functions: 42 | 43 | * [x] gbmv (in julia/blas.jl) 44 | * [x] gemv (in julia/blas.jl) 45 | * [x] ger (in julia/blas.jl) 46 | * [x] sbmv (in julia/blas.jl) 47 | * [ ] spmv 48 | * [ ] spr 49 | * [ ] spr2 50 | * [x] symv (in julia/blas.jl) 51 | * [x] syr (in julia/blas.jl) 52 | * [ ] syr2 53 | * [x] tbmv 54 | * [x] tbsv 55 | * [ ] tpmv 56 | * [ ] tpsv 57 | * [x] trmv (in julia/blas.jl) 58 | * [x] trsv (in julia/blas.jl) 59 | * [x] hemv (in julia/blas.jl) 60 | * [x] hbmv 61 | * [ ] hpmv 62 | * [x] her (in julia/blas.jl) 63 | * [x] her2 64 | * [ ] hpr 65 | * [ ] hpr2 66 | 67 | ## Level 3 68 | 69 | CUBLAS functions: 70 | 71 | * [x] gemm (in julia/blas.jl) 72 | * [x] gemmBatched 73 | * [x] symm (in julia/blas.jl) 74 | * [x] syrk (in julia/blas.jl) 75 | * [x] syr2k (in julia/blas.jl) 76 | * [ ] syrkx 77 | * [x] trmm (in julia/blas.jl) 78 | * [x] trsm (in julia/blas.jl) 79 | * [x] trsmBatched 80 | * [x] hemm 81 | * [x] herk (in julia/blas.jl) 82 | * [x] her2k (in julia/blas.jl) 83 | * [ ] herkx 84 | 85 | ## BLAS-like extensions 86 | 87 | * [x] geam 88 | * [x] dgmm 89 | * [x] getrfBatched 90 | * [x] getriBatched 91 | * [x] geqrfBatched 92 | * [x] gelsBatched 93 | * [ ] tpttr 94 | * [ ] trttp 95 | -------------------------------------------------------------------------------- /lib/cublas/util.jl: -------------------------------------------------------------------------------- 1 | # convert matrix to band storage 2 | function band(A::AbstractMatrix,kl,ku) 3 | m, n = size(A) 4 | AB = zeros(eltype(A),kl+ku+1,n) 5 | for j = 1:n 6 | for i = max(1,j-ku):min(m,j+kl) 7 | AB[ku+1-j+i,j] = A[i,j] 8 | end 9 | end 10 | return AB 11 | end 12 | 13 | # convert band storage to general matrix 14 | function unband(AB::AbstractMatrix,m,kl,ku) 15 | bm, n = size(AB) 16 | A = zeros(eltype(AB),m,n) 17 | for j = 1:n 18 | for i = max(1,j-ku):min(m,j+kl) 19 | A[i,j] = AB[ku+1-j+i,j] 20 | end 21 | end 22 | return A 23 | end 24 | 25 | # zero out elements not on matrix bands 26 | function bandex(A::AbstractMatrix,kl,ku) 27 | m, n = size(A) 28 | AB = band(A,kl,ku) 29 | B = unband(AB,m,kl,ku) 30 | return B 31 | end 32 | 33 | const CublasFloat = Union{Float64,Float32,ComplexF64,ComplexF32} 34 | const CublasReal = Union{Float64,Float32} 35 | const CublasComplex = Union{ComplexF64,ComplexF32} 36 | 37 | function Base.convert(::Type{cublasOperation_t}, 
trans::Char) 38 | if trans == 'N' 39 | return CUBLAS_OP_N 40 | elseif trans == 'T' 41 | return CUBLAS_OP_T 42 | elseif trans == 'C' 43 | return CUBLAS_OP_C 44 | else 45 | throw(ArgumentError("Unknown operation $trans")) 46 | end 47 | end 48 | 49 | function Base.convert(::Type{cublasFillMode_t}, uplo::Char) 50 | if uplo == 'U' 51 | return CUBLAS_FILL_MODE_UPPER 52 | elseif uplo == 'L' 53 | return CUBLAS_FILL_MODE_LOWER 54 | else 55 | throw(ArgumentError("Unknown fill mode $uplo")) 56 | end 57 | end 58 | 59 | function Base.convert(::Type{cublasDiagType_t}, diag::Char) 60 | if diag == 'U' 61 | return CUBLAS_DIAG_UNIT 62 | elseif diag == 'N' 63 | return CUBLAS_DIAG_NON_UNIT 64 | else 65 | throw(ArgumentError("Unknown diag mode $diag")) 66 | end 67 | end 68 | 69 | function Base.convert(::Type{cublasSideMode_t}, side::Char) 70 | if side == 'L' 71 | return CUBLAS_SIDE_LEFT 72 | elseif side == 'R' 73 | return CUBLAS_SIDE_RIGHT 74 | else 75 | throw(ArgumentError("Unknown side mode $side")) 76 | end 77 | end 78 | -------------------------------------------------------------------------------- /lib/cudadrv/module/global.jl: -------------------------------------------------------------------------------- 1 | # Module-scope global variables 2 | 3 | # TODO: improve this interface: 4 | # - should be more dict-like: get and setindex(::name), haskey(::name) 5 | # - globals(::Type)? 6 | 7 | export 8 | CuGlobal, get, set 9 | 10 | 11 | """ 12 | CuGlobal{T}(mod::CuModule, name::String) 13 | 14 | Acquires a typed global variable handle from a named global in a module. 15 | """ 16 | struct CuGlobal{T} 17 | buf::Mem.DeviceBuffer 18 | 19 | function CuGlobal{T}(mod::CuModule, name::String) where T 20 | ptr_ref = Ref{CuPtr{Cvoid}}() 21 | nbytes_ref = Ref{Csize_t}() 22 | cuModuleGetGlobal_v2(ptr_ref, nbytes_ref, mod, name) 23 | if nbytes_ref[] != sizeof(T) 24 | throw(ArgumentError("size of global '$name' does not match type parameter type $T")) 25 | end 26 | buf = Mem.DeviceBuffer(ptr_ref[], nbytes_ref[], false) 27 | 28 | return new{T}(buf) 29 | end 30 | end 31 | 32 | Base.cconvert(::Type{CuPtr{Cvoid}}, var::CuGlobal) = var.buf 33 | 34 | Base.:(==)(a::CuGlobal, b::CuGlobal) = a.handle == b.handle 35 | Base.hash(var::CuGlobal, h::UInt) = hash(var.ptr, h) 36 | 37 | """ 38 | eltype(var::CuGlobal) 39 | 40 | Return the element type of a global variable object. 41 | """ 42 | Base.eltype(::Type{CuGlobal{T}}) where {T} = T 43 | 44 | """ 45 | Base.getindex(var::CuGlobal) 46 | 47 | Return the current value of a global variable. 48 | """ 49 | function Base.getindex(var::CuGlobal{T}; async::Bool=false, stream::CuStream=stream()) where T 50 | val_ref = Ref{T}() 51 | if async 52 | cuMemcpyDtoHAsync_v2(val_ref, var, var.buf.bytesize, stream) 53 | else 54 | cuMemcpyDtoH_v2(val_ref, var, var.buf.bytesize) 55 | end 56 | return val_ref[] 57 | end 58 | # TODO: import Base: get? 
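# Illustrative usage sketch (assumes a loaded module that defines an `Int32` global named "flag"):
#
#   md = CuModuleFile("kernel.ptx")
#   flag = CuGlobal{Int32}(md, "flag")
#   flag[] = Int32(1)    # upload a value (see `setindex!` below)
#   val = flag[]         # download the current value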
59 | 60 | """ 61 | Base.setindex(var::CuGlobal{T}, val::T) 62 | 63 | Set the value of a global variable to `val` 64 | """ 65 | function Base.setindex!(var::CuGlobal{T}, val::T; async::Bool=false, stream::CuStream=stream()) where T 66 | val_ref = Ref{T}(val) 67 | if async 68 | cuMemcpyHtoDAsync_v2(var, val_ref, var.buf.bytesize, stream) 69 | else 70 | cuMemcpyHtoD_v2(var, val_ref, var.buf.bytesize) 71 | end 72 | end 73 | -------------------------------------------------------------------------------- /lib/utils/cache.jl: -------------------------------------------------------------------------------- 1 | # a cache for library handles 2 | 3 | export HandleCache 4 | 5 | struct HandleCache{K,V} 6 | active_handles::Set{Pair{K,V}} # for debugging, and to prevent handle finalization 7 | idle_handles::Dict{K,Vector{V}} 8 | lock::ReentrantLock 9 | 10 | max_entries::Int 11 | 12 | function HandleCache{K,V}(max_entries::Int=32) where {K,V} 13 | return new{K,V}(Set{Pair{K,V}}(), Dict{K,Vector{V}}(), ReentrantLock(), max_entries) 14 | end 15 | end 16 | 17 | # remove a handle from the cache, or create a new one 18 | function Base.pop!(f::Function, cache::HandleCache{K,V}, key) where {K,V} 19 | function check_cache(f::Function=()->nothing) 20 | try 21 | GC.enable_finalizers(false) 22 | lock(cache.lock) do 23 | handle = if !haskey(cache.idle_handles, key) || isempty(cache.idle_handles[key]) 24 | f() 25 | else 26 | pop!(cache.idle_handles[key]) 27 | end 28 | 29 | if handle !== nothing 30 | push!(cache.active_handles, key=>handle) 31 | end 32 | 33 | return handle 34 | end 35 | finally 36 | GC.enable_finalizers(true) 37 | end 38 | end 39 | 40 | handle = check_cache() 41 | 42 | if handle === nothing 43 | # if we didn't find anything, perform a quick GC collection to free up old handles. 44 | GC.gc(false) 45 | 46 | handle = check_cache(f) 47 | end 48 | 49 | return handle::V 50 | end 51 | 52 | # put a handle in the cache, or destroy it if it doesn't fit 53 | function Base.push!(f::Function, cache::HandleCache{K,V}, key::K, handle::V) where {K,V} 54 | # XXX: take this lock in a normal way once we have JuliaLang/julia#35689 55 | @spinlock cache.lock begin 56 | delete!(cache.active_handles, key=>handle) 57 | 58 | if haskey(cache.idle_handles, key) 59 | if length(cache.idle_handles[key]) > cache.max_entries 60 | f() 61 | else 62 | push!(cache.idle_handles[key], handle) 63 | end 64 | else 65 | cache.idle_handles[key] = [handle] 66 | end 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /res/wrap/README.md: -------------------------------------------------------------------------------- 1 | # Wrapping headers 2 | 3 | This directory contains scripts that can be used to automatically generate 4 | wrappers for C headers by NVIDIA, such as CUBLAS or CUDNN. This is done using 5 | Clang.jl, with some CSTParser.jl-based scripts to clean-up the result. 6 | 7 | In CUDA.jl, the wrappers need to know whether pointers passed into the 8 | library point to CPU or GPU memory (i.e. `Ptr` or `CuPtr`). This information is 9 | not available from the headers, and the headers will need to be reviewed up manually. 10 | 11 | 12 | 13 | # Usage 14 | 15 | Either run `wrap.jl` directly, or include it using Revise.jl and call the `main()` function. 16 | Be sure to activate the project environment in this folder, which will download CUDA from 17 | artifacts (if you want to upgrade the headers, be sure to update the relevant JLLs in the 18 | project environment). 
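A typical invocation might look as follows (a sketch; adjust paths and entry points as needed):

```julia
# from within the res/wrap folder
using Pkg
Pkg.activate(".")      # activate this folder's environment
include("wrap.jl")     # or `includet("wrap.jl")` with Revise.jl
main()                 # regenerate and post-process the wrappers
```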
19 | 20 | For each library, the script performs the following steps: 21 | 22 | - generate wrappers with Clang.jl 23 | - rewrite the headers: wrap functions that result status codes with `@check`, add calls to 24 | API initializers, etc. 25 | - apply manual patches: these are read from the `patches` folder, and can be used to 26 | incompatible code 27 | 28 | Clang.jl generates headers with two files: a `common` file with type definitions, aliases, 29 | etc, and a main wrapper that contains function definitions. The former will be copied over 30 | the existing files automatically, while for the latter we scan for changes: Removed 31 | functions are put in the `libXXX_deprecated.jl` file, new ones are concatenated to the 32 | `libXXX.jl` file. 33 | 34 | You should always review any changes to the headers! Specifically, to correct `Ptr` 35 | signature and possibly change them to: 36 | - `CuPtr`: if this pointer is a device pointer 37 | - `PtrOrCuPtr`: if this pointer can be either a device or host pointer 38 | - `Ref`: if the pointer represents a scalar or single-value argument on the host 39 | - `CuRef`: idem, but on the device 40 | - `RefOrCuRef`: idem, but either on the host or device 41 | 42 | Finally, it might also be useful to diff the generated wrapper (generated from scratch) in 43 | the `res/wrap` folder with the one in the `lib` folder (which is incrementally changed) to 44 | see if no function signatures have changed. 45 | -------------------------------------------------------------------------------- /lib/cublas/error.jl: -------------------------------------------------------------------------------- 1 | export CUBLASError 2 | 3 | struct CUBLASError <: Exception 4 | code::cublasStatus_t 5 | end 6 | 7 | Base.convert(::Type{cublasStatus_t}, err::CUBLASError) = err.code 8 | 9 | Base.showerror(io::IO, err::CUBLASError) = 10 | print(io, "CUBLASError: ", description(err), " (code $(reinterpret(Int32, err.code)), $(name(err)))") 11 | 12 | name(err::CUBLASError) = string(err.code) 13 | 14 | ## COV_EXCL_START 15 | function description(err) 16 | if err.code == CUBLAS_STATUS_SUCCESS 17 | "the operation completed successfully" 18 | elseif err.code == CUBLAS_STATUS_NOT_INITIALIZED 19 | "the library was not initialized" 20 | elseif err.code == CUBLAS_STATUS_ALLOC_FAILED 21 | "the resource allocation failed" 22 | elseif err.code == CUBLAS_STATUS_INVALID_VALUE 23 | "an invalid value was used as an argument" 24 | elseif err.code == CUBLAS_STATUS_ARCH_MISMATCH 25 | "an absent device architectural feature is required" 26 | elseif err.code == CUBLAS_STATUS_MAPPING_ERROR 27 | "an access to GPU memory space failed" 28 | elseif err.code == CUBLAS_STATUS_EXECUTION_FAILED 29 | "the GPU program failed to execute" 30 | elseif err.code == CUBLAS_STATUS_INTERNAL_ERROR 31 | "an internal operation failed" 32 | elseif err.code == CUBLAS_STATUS_NOT_SUPPORTED 33 | "the requested feature is not supported" 34 | elseif err.code == CUBLAS_STATUS_LICENSE_ERROR 35 | "error detected trying to check the license" 36 | else 37 | "no description for this error" 38 | end 39 | end 40 | ## COV_EXCL_STOP 41 | 42 | 43 | ## API call wrapper 44 | 45 | # outlined functionality to avoid GC frame allocation 46 | @noinline function throw_api_error(res) 47 | if res == CUBLAS_STATUS_ALLOC_FAILED 48 | throw(OutOfGPUMemoryError()) 49 | else 50 | throw(CUBLASError(res)) 51 | end 52 | end 53 | 54 | macro check(ex, errs...) 
55 | check = :(isequal(err, CUBLAS_STATUS_ALLOC_FAILED)) 56 | for err in errs 57 | check = :($check || isequal(err, $(esc(err)))) 58 | end 59 | 60 | quote 61 | res = @retry_reclaim err->$check $(esc(ex)) 62 | if res != CUBLAS_STATUS_SUCCESS 63 | throw_api_error(res) 64 | end 65 | 66 | nothing 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /lib/cudnn/softmax.jl: -------------------------------------------------------------------------------- 1 | """ 2 | cudnnSoftmaxForward(x; algo, mode, alpha) 3 | cudnnSoftmaxForward!(y, x; algo, mode, alpha, beta) 4 | 5 | Return the softmax or logsoftmax of the input `x` depending on the `algo` keyword argument. 6 | The `y` argument holds the result and it should be similar to `x` if specified. Keyword 7 | arguments: 8 | 9 | * `algo = (CUDA.math_mode()===CUDA.FAST_MATH ? CUDNN_SOFTMAX_FAST : CUDNN_SOFTMAX_ACCURATE)`: Options are `CUDNN_SOFTMAX_ACCURATE` which subtracts max from every point to avoid overflow, `CUDNN_SOFTMAX_FAST` which doesn't and `CUDNN_SOFTMAX_LOG` which returns logsoftmax. 10 | * `mode = CUDNN_SOFTMAX_MODE_INSTANCE`: Compute softmax per image (N) across the dimensions C,H,W. `CUDNN_SOFTMAX_MODE_CHANNEL` computes softmax per spatial location (H,W) per image (N) across the dimension C. 11 | * `alpha=1, beta=0` can be used for scaling, i.e. `y .= alpha * op(x1) .+ beta * y` 12 | """ 13 | cudnnSoftmaxForward, cudnnSoftmaxForward! 14 | 15 | # Public methods 16 | cudnnSoftmaxForward(x; o...) = cudnnSoftmaxForwardWithDefaults(x; o...) 17 | cudnnSoftmaxForward!(y, x; o...) = cudnnSoftmaxForwardWithDefaults(x; y, o...) 18 | 19 | 20 | # Private method 21 | function cudnnSoftmaxForwardWithDefaults( 22 | x; 23 | y = similar(x), 24 | algo::cudnnSoftmaxAlgorithm_t = (CUDA.math_mode()===CUDA.FAST_MATH ? CUDNN_SOFTMAX_FAST : CUDNN_SOFTMAX_ACCURATE), 25 | mode::cudnnSoftmaxMode_t = CUDNN_SOFTMAX_MODE_INSTANCE, 26 | alpha::Real = 1, 27 | beta::Real = 0, 28 | format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, 29 | xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x; format), 30 | yDesc::cudnnTensorDescriptor = xDesc, 31 | ) 32 | @assert size(y) == size(x) 33 | T = eltype(x) 34 | alpha, beta = scalingParameter(T,alpha), scalingParameter(T,beta) 35 | cudnnSoftmaxForwardAD(x; algo, mode, alpha, xDesc, beta, yDesc, y) 36 | end 37 | 38 | 39 | # AD method 40 | function cudnnSoftmaxForwardAD(x; algo, mode, alpha, xDesc, beta, yDesc, y) 41 | cudnnSoftmaxForward(handle(), algo, mode, alpha, xDesc, x, beta, yDesc, y) 42 | return y 43 | end 44 | 45 | 46 | # Deprecated methods 47 | function cudnnSoftmaxForward(x::DenseCuArray{T,4}, y::DenseCuArray{T,4}; o...) where T 48 | @warn "`cudnnSoftmaxForward(x,y)` is deprecated, please use one of the methods in `@doc cudnnSoftmaxForward`." maxlog=1 49 | cudnnSoftmaxForward!(y, x; o...) 50 | end 51 | 52 | -------------------------------------------------------------------------------- /.github/workflows/ManifestUpdater.yml: -------------------------------------------------------------------------------- 1 | name: ManifestUpdater 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * 1' 6 | workflow_dispatch: 7 | 8 | jobs: 9 | ManifestUpdater: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Get Julia compatibility 14 | id: julia_compat 15 | # NOTE: this requires a Julia compat lower-bound with minor version! 
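        # e.g. a compat entry `julia = "1.6"` in Project.toml makes this step output version "1.6" (illustrative)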
16 | run : | 17 | version=$(grep '^julia = ' Project.toml | grep -o '".*"' | cut -d '"' -f2) 18 | echo "::set-output name=version::$version" 19 | - uses: julia-actions/setup-julia@v1 20 | with: 21 | version: ${{ steps.julia_compat.outputs.version }} 22 | - name: Update packages 23 | id: pkg_update 24 | run: | 25 | log=$(julia --project -e 'using Pkg; Pkg.update()') 26 | log="${log//'%'/'%25'}" 27 | log="${log//$'\n'/'%0A'}" 28 | log="${log//$'\r'/'%0D'}" 29 | echo "::set-output name=log::$log" 30 | - name: Get status 31 | id: pkg_status 32 | run: | 33 | log=$(julia --project -e 'using Pkg; VERSION >= v"1.3" ? Pkg.status(diff=true) : Pkg.status()') 34 | log="${log//'%'/'%25'}" 35 | log="${log//$'\n'/'%0A'}" 36 | log="${log//$'\r'/'%0D'}" 37 | echo "::set-output name=log::$log" 38 | - name: Get Julia version 39 | id: version 40 | run: | 41 | log=$(julia -e "println(Base.VERSION)") 42 | echo "::set-output name=log::$log" 43 | - name: Create pull request 44 | uses: peter-evans/create-pull-request@v3 45 | with: 46 | token: ${{ secrets.GITHUB_TOKEN }} 47 | commit-message: | 48 | Update dependencies. 49 | 50 | ${{ steps.pkg_status.outputs.log }} 51 | title: Update manifest 52 | reviewers: maleadt 53 | body: | 54 | This pull request updates the manifest for Julia v${{ steps.version.outputs.log }}: 55 | 56 | ``` 57 | ${{ steps.pkg_status.outputs.log }} 58 | ``` 59 | 60 |
<details><summary>Click here for the full update log.</summary> 61 | 62 | 63 | ``` 64 | ${{ steps.pkg_update.outputs.log }} 65 | ``` 66 | 67 | </details> 68 |
69 | branch: update_manifest 70 | 71 | -------------------------------------------------------------------------------- /src/linalg.jl: -------------------------------------------------------------------------------- 1 | # integration with LinearAlgebra.jl 2 | 3 | CuMatOrAdj{T} = Union{CuMatrix, LinearAlgebra.Adjoint{T, <:CuMatrix{T}}, LinearAlgebra.Transpose{T, <:CuMatrix{T}}} 4 | CuOrAdj{T} = Union{CuVecOrMat, LinearAlgebra.Adjoint{T, <:CuVecOrMat{T}}, LinearAlgebra.Transpose{T, <:CuVecOrMat{T}}} 5 | 6 | 7 | # matrix division 8 | 9 | function Base.:\(_A::CuMatOrAdj, _B::CuOrAdj) 10 | A, B = copy(_A), copy(_B) 11 | A, ipiv = CUSOLVER.getrf!(A) 12 | return CUSOLVER.getrs!('N', A, ipiv, B) 13 | end 14 | 15 | # patch JuliaLang/julia#40899 to create a CuArray 16 | # (see https://github.com/JuliaLang/julia/pull/41331#issuecomment-868374522) 17 | if VERSION >= v"1.7-" 18 | _zeros(::Type{T}, b::AbstractVector, n::Integer) where {T} = CUDA.zeros(T, max(length(b), n)) 19 | _zeros(::Type{T}, B::AbstractMatrix, n::Integer) where {T} = CUDA.zeros(T, max(size(B, 1), n), size(B, 2)) 20 | function Base.:\(F::Union{LinearAlgebra.LAPACKFactorizations{<:Any,<:CuArray}, 21 | Adjoint{<:Any,<:LinearAlgebra.LAPACKFactorizations{<:Any,<:CuArray}}}, 22 | B::AbstractVecOrMat) 23 | m, n = size(F) 24 | if m != size(B, 1) 25 | throw(DimensionMismatch("arguments must have the same number of rows")) 26 | end 27 | 28 | TFB = typeof(oneunit(eltype(B)) / oneunit(eltype(F))) 29 | FF = Factorization{TFB}(F) 30 | 31 | # For wide problem we (often) compute a minimum norm solution. The solution 32 | # is larger than the right hand side so we use size(F, 2). 33 | BB = _zeros(TFB, B, n) 34 | 35 | if n > size(B, 1) 36 | # Underdetermined 37 | copyto!(view(BB, 1:m, :), B) 38 | else 39 | copyto!(BB, B) 40 | end 41 | 42 | ldiv!(FF, BB) 43 | 44 | # For tall problems, we compute a least squares solution so only part 45 | # of the rhs should be returned from \ while ldiv! uses (and returns) 46 | # the complete rhs 47 | return LinearAlgebra._cut_B(BB, 1:n) 48 | end 49 | end 50 | 51 | 52 | # qr 53 | 54 | using LinearAlgebra: AbstractQ 55 | 56 | # AbstractQ's `size` is the size of the full matrix, 57 | # while `Matrix(Q)` only gives the compact Q. 58 | # See JuliaLang/julia#26591 and JuliaGPU/CUDA.jl#969. 59 | CuMatrix{T}(Q::AbstractQ{S}) where {T,S} = convert(CuArray, Matrix{T}(Q)) 60 | CuMatrix(Q::AbstractQ{T}) where {T} = CuMatrix{T}(Q) 61 | CuArray{T}(Q::AbstractQ) where {T} = CuMatrix{T}(Q) 62 | CuArray(Q::AbstractQ) = CuMatrix(Q) 63 | -------------------------------------------------------------------------------- /lib/cudnn/inplace.jl: -------------------------------------------------------------------------------- 1 | """ 2 | cudnnSetTensor!(x, s) 3 | 4 | Set all elements of tensor `x` to scalar `s` and return `x`. 5 | """ 6 | function cudnnSetTensor!( 7 | x, s::Real; 8 | format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, 9 | xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x; format) 10 | ) 11 | cudnnSetTensor(handle(), xDesc, x, Ref(eltype(x)(s))) 12 | return x 13 | end 14 | 15 | 16 | """ 17 | cudnnScaleTensor(x, s) 18 | cudnnScaleTensor!(y, x, s) 19 | 20 | Scale all elements of tensor `x` with scale `s` and return the result. `cudnnScaleTensor` 21 | allocates a new array for the answer, `cudnnScaleTensor!` overwrites `y`. 22 | """ 23 | cudnnScaleTensor, cudnnScaleTensor! 
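# Usage sketch (hypothetical array sizes; only the two documented methods above are used):
#
#   x = CUDA.rand(Float32, 5, 4, 3, 2)
#   y = cudnnScaleTensor(x, 2)       # allocates a new array with y ≈ 2 .* x
#   cudnnScaleTensor!(y, x, 0.5)     # overwrites y with 0.5 .* x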
24 | 25 | function cudnnScaleTensor!( 26 | y, x, s::Real; 27 | format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, 28 | xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x; format) 29 | ) 30 | y === x || copyto!(y, x) 31 | cudnnScaleTensor(handle(), xDesc, y, Ref(eltype(y)(s))) 32 | return y 33 | end 34 | 35 | cudnnScaleTensor(x, s::Real; o...) = cudnnScaleTensor!(similar(x), x, s; o...) 36 | 37 | 38 | # cudnnAddTensor does not support all broadcasting dimensions, use cudnnOpTensor instead. 39 | # Compared to libknet8 x .+ b it is ~2x slower for (1,1,100,100), ~30% faster for (14,14,256,32) 40 | # CUDA.jl x .+ b is 2x slower than both 41 | 42 | """ 43 | cudnnAddTensor(x, b; alpha) 44 | cudnnAddTensor!(y, x, b; alpha, beta) 45 | 46 | Broadcast-add tensor `b` to tensor `x`. `alpha=1, beta=1` are used for scaling, i.e. `y .= 47 | alpha * b .+ beta * x`. `cudnnAddTensor` allocates a new array for the answer, 48 | `cudnnAddTensor!` overwrites `y`. Does not support all valid broadcasting dimensions. For 49 | more flexible broadcast operations see `cudnnOpTensor`. 50 | """ 51 | cudnnAddTensor, cudnnAddTensor! 52 | 53 | function cudnnAddTensor!( 54 | y, x, b; 55 | alpha::Real=1, 56 | beta::Real=1, 57 | format::cudnnTensorFormat_t = CUDNN_TENSOR_NCHW, 58 | bDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(b; format), 59 | xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x; format), 60 | ) 61 | T = eltype(x) 62 | alpha, beta = scalingParameter(T, alpha), scalingParameter(T, beta) 63 | y === x || copyto!(y, x) 64 | cudnnAddTensor(handle(), alpha, bDesc, b, beta, xDesc, y) 65 | return y 66 | end 67 | 68 | cudnnAddTensor(x, b; o...) = cudnnAddTensor!(similar(x), x, b; o...) 69 | -------------------------------------------------------------------------------- /lib/cusparse/CUSPARSE.jl: -------------------------------------------------------------------------------- 1 | module CUSPARSE 2 | 3 | using ..APIUtils 4 | 5 | using ..CUDA 6 | using ..CUDA: CUstream, cuComplex, cuDoubleComplex, libraryPropertyType, cudaDataType 7 | using ..CUDA: libcusparse, unsafe_free!, @retry_reclaim, @context!, initialize_context 8 | 9 | using CEnum: @cenum 10 | 11 | using LinearAlgebra 12 | using LinearAlgebra: HermOrSym 13 | 14 | using Adapt: Adapt, adapt 15 | 16 | using SparseArrays 17 | 18 | const SparseChar = Char 19 | 20 | 21 | # core library 22 | include("libcusparse_common.jl") 23 | include("error.jl") 24 | include("libcusparse.jl") 25 | include("libcusparse_deprecated.jl") 26 | 27 | include("array.jl") 28 | include("util.jl") 29 | include("types.jl") 30 | 31 | # low-level wrappers 32 | include("helpers.jl") 33 | include("management.jl") 34 | include("level1.jl") 35 | include("level2.jl") 36 | include("level3.jl") 37 | include("extra.jl") 38 | include("preconditioners.jl") 39 | include("conversions.jl") 40 | include("generic.jl") 41 | 42 | # high-level integrations 43 | include("interfaces.jl") 44 | 45 | # cache for created, but unused handles 46 | const idle_handles = HandleCache{CuContext,cusparseHandle_t}() 47 | 48 | function handle() 49 | cuda = CUDA.active_state() 50 | 51 | # every task maintains library state per device 52 | LibraryState = @NamedTuple{handle::cusparseHandle_t, stream::CuStream} 53 | states = get!(task_local_storage(), :CUSPARSE) do 54 | Dict{CuContext,LibraryState}() 55 | end::Dict{CuContext,LibraryState} 56 | 57 | # get library state 58 | @noinline function new_state(cuda) 59 | new_handle = pop!(idle_handles, cuda.context) do 60 | cusparseCreate() 61 | end 62 | 63 | 
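        # once the owning task is finalized, return the handle to the idle cache
        # (the cache destroys it instead if it already holds enough entries)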
finalizer(current_task()) do task 64 | push!(idle_handles, cuda.context, new_handle) do 65 | @context! skip_destroyed=true cuda.context cusparseDestroy(new_handle) 66 | end 67 | end 68 | 69 | cusparseSetStream(new_handle, cuda.stream) 70 | 71 | (; handle=new_handle, cuda.stream) 72 | end 73 | state = get!(states, cuda.context) do 74 | new_state(cuda) 75 | end 76 | 77 | # update stream 78 | @noinline function update_stream(cuda, state) 79 | cusparseSetStream_v2(state.handle, cuda.stream) 80 | (; state.handle, cuda.stream) 81 | end 82 | if state.stream != cuda.stream 83 | states[cuda.context] = state = update_stream(cuda, state) 84 | end 85 | 86 | return state.handle 87 | end 88 | 89 | end 90 | -------------------------------------------------------------------------------- /src/device/quirks.jl: -------------------------------------------------------------------------------- 1 | macro print_and_throw(args...) 2 | quote 3 | @cuprintln "ERROR: " $(args...) "." 4 | throw(nothing) 5 | end 6 | end 7 | 8 | # math.jl 9 | @device_override @noinline Base.Math.throw_complex_domainerror(f::Symbol, x) = 10 | @print_and_throw "This operation requires a complex input to return a complex result" 11 | @device_override @noinline Base.Math.throw_exp_domainerror(f::Symbol, x) = 12 | @print_and_throw "Exponentiation yielding a complex result requires a complex argument" 13 | 14 | # intfuncs.jl 15 | @device_override @noinline Base.throw_domerr_powbysq(::Any, p) = 16 | @print_and_throw "Cannot raise an integer to a negative power" 17 | @device_override @noinline Base.throw_domerr_powbysq(::Integer, p) = 18 | @print_and_throw "Cannot raise an integer to a negative power" 19 | @device_override @noinline Base.throw_domerr_powbysq(::AbstractMatrix, p) = 20 | @print_and_throw "Cannot raise an integer to a negative power" 21 | 22 | # checked.jl 23 | @device_override @noinline Base.Checked.throw_overflowerr_binaryop(op, x, y) = 24 | @print_and_throw "Binary operation overflowed" 25 | @device_override @noinline Base.Checked.throw_overflowerr_negation(op, x, y) = 26 | @print_and_throw "Negation overflowed" 27 | 28 | # boot.jl 29 | @device_override @noinline Core.throw_inexacterror(f::Symbol, ::Type{T}, val) where {T} = 30 | @print_and_throw "Inexact conversion" 31 | 32 | # abstractarray.jl 33 | @device_override @noinline Base.throw_boundserror(A, I) = 34 | @print_and_throw "Out-of-bounds array access" 35 | 36 | # trig.jl 37 | @device_override @noinline Base.Math.sincos_domain_error(x) = 38 | @print_and_throw "sincos(x) is only defined for finite x." 39 | 40 | # multidimensional.jl 41 | @static if VERSION >= v"1.7-" 42 | # XXX: the boundscheck change in JuliaLang/julia#42119 has exposed additional issues 43 | # with bad code generation by ptxas on a = CuArray([1]) 23 | 1-element CuArray{Int64,1,Nothing}: 24 | 1 25 | 26 | julia> a[1] += 1 27 | ┌ Warning: Performing scalar indexing. 28 | │ ... 29 | └ @ GPUArrays ~/Julia/pkg/GPUArrays/src/host/indexing.jl:57 30 | 2 31 | ``` 32 | 33 | Scalar indexing is only allowed in an interactive session, e.g. the REPL, because it is 34 | convenient when porting CPU code to the GPU. If you want to disallow scalar indexing, e.g. 
35 | to verify that your application executes correctly on the GPU, call the `allowscalar` 36 | function: 37 | 38 | ```julia 39 | julia> CUDA.allowscalar(false) 40 | 41 | julia> a[1] .+ 1 42 | ERROR: scalar getindex is disallowed 43 | Stacktrace: 44 | [1] error(::String) at ./error.jl:33 45 | [2] assertscalar(::String) at GPUArrays/src/indexing.jl:14 46 | [3] getindex(::CuArray{Int64,1,Nothing}, ::Int64) at GPUArrays/src/indexing.jl:54 47 | [4] top-level scope at REPL[5]:1 48 | 49 | julia> a .+ 1 50 | 1-element CuArray{Int64,1,Nothing}: 51 | 2 52 | ``` 53 | 54 | In a non-interactive session, e.g. when running code from a script or application, scalar 55 | indexing is disallowed by default. There is no global toggle to allow scalar indexing; if 56 | you really need it, you can mark expressions using `allowscalar` with do-block syntax or 57 | `@allowscalar` macro: 58 | 59 | ```julia 60 | julia> a = CuArray([1]) 61 | 1-element CuArray{Int64, 1}: 62 | 1 63 | 64 | julia> CUDA.allowscalar(false) 65 | 66 | julia> CUDA.allowscalar() do 67 | a[1] += 1 68 | end 69 | 2 70 | 71 | julia> CUDA.@allowscalar a[1] += 1 72 | 3 73 | ``` 74 | -------------------------------------------------------------------------------- /lib/curand/error.jl: -------------------------------------------------------------------------------- 1 | export CURANDError 2 | 3 | struct CURANDError <: Exception 4 | code::curandStatus_t 5 | end 6 | 7 | Base.convert(::Type{curandStatus_t}, err::CURANDError) = err.code 8 | 9 | Base.showerror(io::IO, err::CURANDError) = 10 | print(io, "CURANDError: ", description(err), " (code $(reinterpret(Int32, err.code)), $(name(err)))") 11 | 12 | name(err::CURANDError) = string(err.code) 13 | 14 | ## COV_EXCL_START 15 | function description(err) 16 | if err.code == CURAND_STATUS_SUCCESS 17 | "generator was created successfully" 18 | elseif err.code == CURAND_STATUS_VERSION_MISMATCH 19 | "header file and linked library version do not match" 20 | elseif err.code == CURAND_STATUS_NOT_INITIALIZED 21 | "generator not initialized" 22 | elseif err.code == CURAND_STATUS_ALLOCATION_FAILED 23 | "memory allocation failed" 24 | elseif err.code == CURAND_STATUS_TYPE_ERROR 25 | "generator is wrong type" 26 | elseif err.code == CURAND_STATUS_OUT_OF_RANGE 27 | "argument out of range" 28 | elseif err.code == CURAND_STATUS_LENGTH_NOT_MULTIPLE 29 | "length requested is not a multple of dimension" 30 | elseif err.code == CURAND_STATUS_DOUBLE_PRECISION_REQUIRED 31 | "GPU does not have double precision required by MRG32k3a" 32 | elseif err.code == CURAND_STATUS_LAUNCH_FAILURE 33 | "kernel launch failure" 34 | elseif err.code == CURAND_STATUS_PREEXISTING_FAILURE 35 | "preexisting failure on library entry" 36 | elseif err.code == CURAND_STATUS_INITIALIZATION_FAILED 37 | "initialization of CUDA failed" 38 | elseif err.code == CURAND_STATUS_ARCH_MISMATCH 39 | "architecture mismatch, GPU does not support requested feature" 40 | elseif err.code == CURAND_STATUS_INTERNAL_ERROR 41 | "internal library error" 42 | else 43 | "no description for this error" 44 | end 45 | end 46 | ## COV_EXCL_STOP 47 | 48 | 49 | ## API call wrapper 50 | 51 | # outlined functionality to avoid GC frame allocation 52 | @noinline function throw_api_error(res) 53 | if res == CURAND_STATUS_ALLOCATION_FAILED 54 | throw(OutOfGPUMemoryError()) 55 | else 56 | throw(CURANDError(res)) 57 | end 58 | end 59 | 60 | macro check(ex, errs...) 
61 | check = :(isequal(err, CURAND_STATUS_ALLOCATION_FAILED)) 62 | for err in errs 63 | check = :($check || isequal(err, $(esc(err)))) 64 | end 65 | 66 | quote 67 | res = @retry_reclaim err->$check $(esc(ex)) 68 | if res != CURAND_STATUS_SUCCESS 69 | throw_api_error(res) 70 | end 71 | 72 | nothing 73 | end 74 | end 75 | -------------------------------------------------------------------------------- /docs/make.jl: -------------------------------------------------------------------------------- 1 | using Documenter, Literate 2 | using CUDA 3 | 4 | const src = "https://github.com/JuliaGPU/CUDA.jl" 5 | const dst = "https://cuda.juliagpu.org/stable/" 6 | 7 | function main() 8 | ci = get(ENV, "CI", "") == "true" 9 | 10 | @info "Building Literate.jl documentation" 11 | cd(@__DIR__) do 12 | Literate.markdown("src/tutorials/introduction.jl", "src/tutorials"; 13 | repo_root_url="$src/blob/master/docs") 14 | Literate.markdown("src/tutorials/custom_structs.jl", "src/tutorials"; 15 | repo_root_url="$src/blob/master/docs") 16 | end 17 | 18 | @info "Generating Documenter.jl site" 19 | DocMeta.setdocmeta!(CUDA, :DocTestSetup, :(using CUDA); recursive=true) 20 | makedocs( 21 | sitename = "CUDA.jl", 22 | authors = "Tim Besard", 23 | repo = "$src/blob/{commit}{path}#{line}", 24 | format = Documenter.HTML( 25 | # Use clean URLs on CI 26 | prettyurls = ci, 27 | canonical = dst, 28 | assets = ["assets/favicon.ico"], 29 | analytics = "UA-154489943-2", 30 | ), 31 | doctest = true, 32 | #strict = true, 33 | modules = [CUDA], 34 | pages = Any[ 35 | "Home" => "index.md", 36 | "Tutorials" => Any[ 37 | "tutorials/introduction.md", 38 | "tutorials/custom_structs.md", 39 | ], 40 | "Installation" => Any[ 41 | "installation/overview.md", 42 | "installation/conditional.md", 43 | "installation/troubleshooting.md", 44 | ], 45 | "Usage" => Any[ 46 | "usage/overview.md", 47 | "usage/workflow.md", 48 | "usage/array.md", 49 | "usage/memory.md", 50 | "usage/multitasking.md", 51 | "usage/multigpu.md", 52 | ], 53 | "Development" => Any[ 54 | "development/profiling.md", 55 | "development/troubleshooting.md", 56 | ], 57 | "API reference" => Any[ 58 | "api/essentials.md", 59 | "api/compiler.md", 60 | "api/kernel.md", 61 | "api/array.md", 62 | ], 63 | "Library reference" => Any[ 64 | "lib/driver.md", 65 | ], 66 | "FAQ" => "faq.md", 67 | ] 68 | ) 69 | 70 | if ci 71 | @info "Deploying to GitHub" 72 | deploydocs( 73 | repo = "github.com/JuliaGPU/CUDA.jl.git", 74 | push_preview = true 75 | ) 76 | end 77 | end 78 | 79 | isinteractive() || main() 80 | -------------------------------------------------------------------------------- /docs/src/index.md: -------------------------------------------------------------------------------- 1 | # CUDA programming in Julia 2 | 3 | The CUDA.jl package is the main entrypoint for programming NVIDIA GPUs in Julia. The package 4 | makes it possible to do so at various abstraction levels, from easy-to-use arrays down to 5 | hand-written kernels using low-level CUDA APIs. 6 | 7 | If you have any questions, please feel free to use the `#gpu` channel on the [Julia 8 | slack](https://julialang.slack.com/), or the [GPU domain of the Julia 9 | Discourse](https://discourse.julialang.org/c/domain/gpu). 
10 | 11 | 12 | ## Quick Start 13 | 14 | The Julia CUDA stack only requires a working NVIDIA driver; you don't need to install the 15 | entire CUDA toolkit, as it will automatically be downloaded when you first use the package: 16 | 17 | ```julia 18 | # install the package 19 | using Pkg 20 | Pkg.add("CUDA") 21 | 22 | # smoke test (this will download the CUDA toolkit) 23 | using CUDA 24 | CUDA.versioninfo() 25 | ``` 26 | 27 | If you want to ensure everything works as expected, you can execute the test suite: 28 | 29 | ```julia 30 | using Pkg 31 | Pkg.test("CUDA") # takes ~40 minutes if using 1 thread 32 | ``` 33 | 34 | For more details on the installation process, consult the [Installation](@ref 35 | InstallationOverview) section. To understand the toolchain in more detail, have a look at 36 | the tutorials in this manual. **It is highly recommended that new users start with the 37 | [Introduction](@ref) tutorial**. For an overview of the available functionality, read the 38 | [Usage](@ref UsageOverview) section. The following resources may also be of interest: 39 | 40 | - Effectively using GPUs with Julia: [video](https://www.youtube.com/watch?v=7Yq1UyncDNc), 41 | [slides](https://docs.google.com/presentation/d/1l-BuAtyKgoVYakJSijaSqaTL3friESDyTOnU2OLqGoA/) 42 | - How Julia is compiled to GPUs: [video](https://www.youtube.com/watch?v=Fz-ogmASMAE) 43 | 44 | 45 | ## Acknowledgements 46 | 47 | The Julia CUDA stack has been a collaborative effort by many individuals. Significant 48 | contributions have been made by the following individuals: 49 | 50 | - Tim Besard (@maleadt) (lead developer) 51 | - Valentin Churavy (@vchuravy) 52 | - Mike Innes (@MikeInnes) 53 | - Katharine Hyatt (@kshyatt) 54 | - Simon Danisch (@SimonDanisch) 55 | 56 | 57 | ## Supporting and Citing 58 | 59 | Much of the software in this ecosystem was developed as part of academic research. If you 60 | would like to help support it, please star the repository as such metrics may help us secure 61 | funding in the future. If you use our software as part of your research, teaching, or other 62 | activities, we would be grateful if you could cite our work. The 63 | [CITATION.bib](https://github.com/JuliaGPU/CUDA.jl/blob/master/CITATION.bib) file in the 64 | root of this repository lists the relevant papers. 65 | -------------------------------------------------------------------------------- /perf/byval.jl: -------------------------------------------------------------------------------- 1 | module ByVal 2 | 3 | using CUDA, BenchmarkTools, Random 4 | using CUDA: i32 5 | 6 | const threads = 256 7 | 8 | # simple add matrixes kernel 9 | function kernel_add_mat(n, x1, x2, y) 10 | i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x 11 | if i <= n 12 | @inbounds y[i] = x1[i] + x2[i] 13 | end 14 | return 15 | end 16 | 17 | @inline get_inputs3(indx_y, a, b, c) = (a, b, c) 18 | @inline get_inputs3(indx_y, a1, a2, b1, b2, c1, c2) = indx_y == 1 ? (a1, b1, c1) : (a2, b2, c2) 19 | @inline get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3) = indx_y == 1 ? (a1, b1, c1) : indx_y == 2 ? (a2, b2, c2) : (a3, b3, c3) 20 | 21 | # add arrays of matrixes kernel 22 | function kernel_add_mat_z_slices(n, vararg...) 23 | x1, x2, y = get_inputs3(blockIdx().y, vararg...) 
24 | i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x 25 | if i <= n 26 | @inbounds y[i] = x1[i] + x2[i] 27 | end 28 | return 29 | end 30 | 31 | function add_z_slices!(y, x1, x2) 32 | m1, n1 = size(x1[1]) #get size of first slice 33 | blocks = (m1 * n1 + threads - 1) ÷ threads 34 | # get length(x1) more blocks than needed to process 1 slice 35 | @cuda blocks = blocks, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...) 36 | end 37 | 38 | function add!(y, x1, x2) 39 | m1, n1 = size(x1) 40 | blocks = (m1 * n1 + threads - 1) ÷ threads 41 | @cuda blocks = blocks, 1 threads = threads kernel_add_mat(m1 * n1, x1, x2, y) 42 | end 43 | 44 | function main() 45 | results = BenchmarkGroup() 46 | 47 | num_z_slices = 3 48 | Random.seed!(1) 49 | 50 | #m, n = 7, 5 # tiny to measure overhead 51 | #m, n = 521, 111 52 | #m, n = 1521, 1111 53 | #m, n = 3001, 1511 # prime numbers to test memory access correctness 54 | m, n = 3072, 1536 # 256 multiplier 55 | #m, n = 6007, 3001 # prime numbers to test memory access correctness 56 | 57 | x1 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices] 58 | x2 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices] 59 | y1 = [similar(x1[1]) for i = 1:num_z_slices] 60 | 61 | # reference down to bones add on GPU 62 | results["reference"] = @benchmark CUDA.@sync add!($y1[1], $x1[1], $x2[1]) 63 | 64 | # adding arrays in an array 65 | for slices = 1:num_z_slices 66 | results["slices=$slices"] = @benchmark CUDA.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices]) 67 | end 68 | 69 | # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them 70 | CUDA.unsafe_free!.(x1) 71 | CUDA.unsafe_free!.(x2) 72 | CUDA.unsafe_free!.(y1) 73 | 74 | return results 75 | end 76 | 77 | end 78 | 79 | ByVal.main() 80 | -------------------------------------------------------------------------------- /test/cudnn/activation.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CUDNN: 2 | cudnnActivationForward, 3 | cudnnActivationForward!, 4 | cudnnActivationBackward, 5 | cudnnActivationDescriptor, 6 | cudnnActivationDescriptor_t, 7 | cudnnCreateActivationDescriptor, 8 | cudnnSetActivationDescriptor, 9 | cudnnGetActivationDescriptor, 10 | cudnnDestroyActivationDescriptor, 11 | cudnnActivationMode_t, 12 | CUDNN_ACTIVATION_SIGMOID, # 0 13 | CUDNN_ACTIVATION_RELU, # 1 14 | CUDNN_ACTIVATION_TANH, # 2 15 | CUDNN_ACTIVATION_CLIPPED_RELU, # 3 16 | CUDNN_ACTIVATION_ELU, # 4 17 | CUDNN_ACTIVATION_IDENTITY, # 5 18 | cudnnNanPropagation_t, 19 | CUDNN_NOT_PROPAGATE_NAN, # 0 20 | CUDNN_PROPAGATE_NAN # 1 21 | 22 | 23 | @testset "cudnn/activation" begin 24 | @test cudnnActivationDescriptor(C_NULL) isa cudnnActivationDescriptor 25 | @test Base.unsafe_convert(Ptr, cudnnActivationDescriptor(C_NULL)) isa Ptr 26 | @test cudnnActivationDescriptor(CUDNN_ACTIVATION_RELU,CUDNN_NOT_PROPAGATE_NAN,0) isa cudnnActivationDescriptor 27 | 28 | (ax,ay) = randn.((10,10)) 29 | (cx,cy) = CuArray.((ax,ay)) 30 | 31 | function activationtest(; 32 | mode=CUDNN_ACTIVATION_SIGMOID, 33 | nanOpt=CUDNN_NOT_PROPAGATE_NAN, 34 | coef=1, 35 | alpha=1, 36 | beta=0, 37 | ) 38 | fx = (mode === CUDNN_ACTIVATION_SIGMOID ? 1 ./ (1 .+ exp.(-ax)) : 39 | mode === CUDNN_ACTIVATION_RELU ? max.(0,ax) : 40 | mode === CUDNN_ACTIVATION_TANH ? tanh.(ax) : 41 | mode === CUDNN_ACTIVATION_CLIPPED_RELU ? clamp.(ax,0,coef) : 42 | mode === CUDNN_ACTIVATION_ELU ? (x->(x >= 0 ? 
x : coef*(exp(x)-1))).(ax) : 43 | error("Unknown activation")) 44 | d = cudnnActivationDescriptor(mode,nanOpt,Cfloat(coef)) 45 | y0 = alpha * fx 46 | y1 = y0 .+ beta * ay 47 | @test y0 ≈ cudnnActivationForward(cx; mode, nanOpt, coef, alpha) |> Array 48 | @test y0 ≈ cudnnActivationForward(cx, d; alpha) |> Array 49 | @test y1 ≈ cudnnActivationForward!(copy(cy), cx; mode, nanOpt, coef, alpha, beta) |> Array 50 | @test y1 ≈ cudnnActivationForward!(copy(cy), cx, d; alpha, beta) |> Array 51 | end 52 | 53 | activationtest(mode=CUDNN_ACTIVATION_SIGMOID) 54 | activationtest(mode=CUDNN_ACTIVATION_RELU) 55 | activationtest(mode=CUDNN_ACTIVATION_TANH) 56 | activationtest(mode=CUDNN_ACTIVATION_CLIPPED_RELU) 57 | activationtest(mode=CUDNN_ACTIVATION_ELU) 58 | activationtest(nanOpt=CUDNN_PROPAGATE_NAN) 59 | activationtest(coef=2,mode=CUDNN_ACTIVATION_CLIPPED_RELU) 60 | activationtest(coef=2,mode=CUDNN_ACTIVATION_ELU) 61 | activationtest(alpha=2) 62 | activationtest(beta=2) 63 | end 64 | -------------------------------------------------------------------------------- /lib/cutensor/error.jl: -------------------------------------------------------------------------------- 1 | export CUTENSORError 2 | 3 | struct CUTENSORError <: Exception 4 | code::cutensorStatus_t 5 | end 6 | 7 | Base.convert(::Type{cutensorStatus_t}, err::CUTENSORError) = err.code 8 | 9 | Base.showerror(io::IO, err::CUTENSORError) = 10 | print(io, "CUTENSORError: ", description(err), " (code $(reinterpret(Int32, err.code)), $(name(err)))") 11 | 12 | name(err::CUTENSORError) = unsafe_string(cutensorGetErrorString(err)) 13 | 14 | ## COV_EXCL_START 15 | function description(err::CUTENSORError) 16 | if err.code == CUTENSOR_STATUS_SUCCESS 17 | "the operation completed successfully" 18 | elseif err.code == CUTENSOR_STATUS_NOT_INITIALIZED 19 | "the library was not initialized" 20 | elseif err.code == CUTENSOR_STATUS_ALLOC_FAILED 21 | "the resource allocation failed" 22 | elseif err.code == CUTENSOR_STATUS_INVALID_VALUE 23 | "an invalid value was used as an argument" 24 | elseif err.code == CUTENSOR_STATUS_ARCH_MISMATCH 25 | "an absent device architectural feature is required" 26 | elseif err.code == CUTENSOR_STATUS_MAPPING_ERROR 27 | "an access to GPU memory space failed" 28 | elseif err.code == CUTENSOR_STATUS_EXECUTION_FAILED 29 | "the GPU program failed to execute" 30 | elseif err.code == CUTENSOR_STATUS_INTERNAL_ERROR 31 | "an internal operation failed" 32 | elseif err.code == CUTENSOR_STATUS_NOT_SUPPORTED 33 | "operation not supported (yet)" 34 | elseif err.code == CUTENSOR_STATUS_LICENSE_ERROR 35 | "error detected trying to check the license" 36 | elseif err.code == CUTENSOR_STATUS_CUBLAS_ERROR 37 | "error occurred during a CUBLAS operation" 38 | elseif err.code == CUTENSOR_STATUS_CUDA_ERROR 39 | "error occurred during a CUDA operation" 40 | elseif err.code == CUTENSOR_STATUS_INSUFFICIENT_WORKSPACE 41 | "insufficient workspace memory for this operation" 42 | elseif err.code == CUTENSOR_STATUS_INSUFFICIENT_DRIVER 43 | "insufficient driver version" 44 | else 45 | "no description for this error" 46 | end 47 | end 48 | ## COV_EXCL_STOP 49 | 50 | 51 | ## API call wrapper 52 | 53 | # outlined functionality to avoid GC frame allocation 54 | @noinline function throw_api_error(res) 55 | if res == CUTENSOR_STATUS_ALLOC_FAILED 56 | throw(OutOfGPUMemoryError()) 57 | else 58 | throw(CUTENSORError(res)) 59 | end 60 | end 61 | 62 | macro check(ex, errs...) 
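    # build a predicate over the returned status code: CUTENSOR_STATUS_ALLOC_FAILED, plus any extra codes passed to the macro, are treated as reclaimable out-of-memory conditions, so @retry_reclaim can free cached GPU memory and retry the call before an error is raised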
63 | check = :(isequal(err, CUTENSOR_STATUS_ALLOC_FAILED)) 64 | for err in errs 65 | check = :($check || isequal(err, $(esc(err)))) 66 | end 67 | 68 | quote 69 | res = @retry_reclaim err->$check $(esc(ex)) 70 | if res != CUTENSOR_STATUS_SUCCESS 71 | throw_api_error(res) 72 | end 73 | 74 | nothing 75 | end 76 | end 77 | -------------------------------------------------------------------------------- /test/cudnn/optensor.jl: -------------------------------------------------------------------------------- 1 | using CUDA.CUDNN: 2 | cudnnOpTensor, 3 | cudnnOpTensor!, 4 | cudnnOpTensorDescriptor, 5 | cudnnOpTensorDescriptor_t, 6 | cudnnCreateOpTensorDescriptor, 7 | cudnnSetOpTensorDescriptor, 8 | cudnnGetOpTensorDescriptor, 9 | cudnnDestroyOpTensorDescriptor, 10 | cudnnOpTensorOp_t, 11 | CUDNN_OP_TENSOR_ADD, # 0, 12 | CUDNN_OP_TENSOR_MUL, # 1, 13 | CUDNN_OP_TENSOR_MIN, # 2, 14 | CUDNN_OP_TENSOR_MAX, # 3, 15 | CUDNN_OP_TENSOR_SQRT, # 4, performed only on first arg 16 | CUDNN_OP_TENSOR_NOT, # 5, performed only on first arg 17 | cudnnNanPropagation_t, 18 | CUDNN_NOT_PROPAGATE_NAN, # 0 19 | CUDNN_PROPAGATE_NAN, # 1 20 | cudnnDataType 21 | 22 | @testset "cudnn/optensor" begin 23 | @test cudnnOpTensorDescriptor(C_NULL) isa cudnnOpTensorDescriptor 24 | @test Base.unsafe_convert(Ptr, cudnnOpTensorDescriptor(C_NULL)) isa Ptr 25 | @test cudnnOpTensorDescriptor(CUDNN_OP_TENSOR_ADD,cudnnDataType(Float32),CUDNN_NOT_PROPAGATE_NAN) isa cudnnOpTensorDescriptor 26 | 27 | (ax1,ax2,ay) = rand.((10,10,10)) 28 | (cx1,cx2,cy) = CuArray.((ax1,ax2,ay)) 29 | 30 | function optensortest(; 31 | op=CUDNN_OP_TENSOR_ADD, 32 | nanOpt=CUDNN_NOT_PROPAGATE_NAN, 33 | compType=(eltype(ax1) <: Float64 ? Float64 : Float32), 34 | alpha1=1, 35 | alpha2=1, 36 | beta=0, 37 | ) 38 | f1 = (op === CUDNN_OP_TENSOR_ADD ? alpha1*ax1 .+ alpha2*ax2 : 39 | op === CUDNN_OP_TENSOR_MUL ? (alpha1*ax1) .* (alpha2*ax2) : 40 | op === CUDNN_OP_TENSOR_MIN ? min.(alpha1*ax1, alpha2*ax2) : 41 | op === CUDNN_OP_TENSOR_MAX ? max.(alpha1*ax1, alpha2*ax2) : 42 | op === CUDNN_OP_TENSOR_SQRT ? sqrt.(alpha1*ax1) : 43 | op === CUDNN_OP_TENSOR_NOT ? 1 .- ax1 : 44 | error("Unknown optensor")) 45 | f2 = f1 .+ beta * ay 46 | d = cudnnOpTensorDescriptor(op,cudnnDataType(compType),nanOpt) 47 | @test f1 ≈ cudnnOpTensor(cx1, cx2; op, compType, nanOpt, alpha1, alpha2) |> Array 48 | @test f1 ≈ cudnnOpTensor(cx1, cx2, d; alpha1, alpha2) |> Array 49 | @test f2 ≈ cudnnOpTensor!(copy(cy), cx1, cx2; op, compType, nanOpt, alpha1, alpha2, beta) |> Array 50 | @test f2 ≈ cudnnOpTensor!(copy(cy), cx1, cx2, d; alpha1, alpha2, beta) |> Array 51 | end 52 | 53 | optensortest(op = CUDNN_OP_TENSOR_ADD) 54 | optensortest(op = CUDNN_OP_TENSOR_MUL) 55 | optensortest(op = CUDNN_OP_TENSOR_MIN) 56 | optensortest(op = CUDNN_OP_TENSOR_MAX) 57 | optensortest(op = CUDNN_OP_TENSOR_SQRT) 58 | optensortest(op = CUDNN_OP_TENSOR_NOT) 59 | optensortest(nanOpt = CUDNN_PROPAGATE_NAN) 60 | optensortest(alpha1 = 2) 61 | optensortest(alpha2 = 2) 62 | optensortest(beta = 2) 63 | end 64 | -------------------------------------------------------------------------------- /lib/cusparse/extra.jl: -------------------------------------------------------------------------------- 1 | export geam 2 | 3 | """ 4 | geam(alpha::Number, A::CuSparseMatrix, beta::Number, B::CuSparseMatrix, index::SparseChar) 5 | 6 | Performs `C = alpha * A + beta * B`. `A` and `B` are sparse matrix defined in CSR storage format. 
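For example, `geam(one(T), A, one(T), B, 'O')` computes the sparse sum of two `CuSparseMatrixCSR{T}` matrices, with `'O'` selecting one-based indices as elsewhere in CUSPARSE. The result is returned as a new `CuSparseMatrixCSR`.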
7 | """
8 | geam(alpha::Number, A::CuSparseMatrixCSR, beta::Number, B::CuSparseMatrixCSR, index::SparseChar)
9 | 
10 | for (bname,gname,elty) in ((:cusparseScsrgeam2_bufferSizeExt, :cusparseScsrgeam2, :Float32),
11 | (:cusparseDcsrgeam2_bufferSizeExt, :cusparseDcsrgeam2, :Float64),
12 | (:cusparseCcsrgeam2_bufferSizeExt, :cusparseCcsrgeam2, :ComplexF32),
13 | (:cusparseZcsrgeam2_bufferSizeExt, :cusparseZcsrgeam2, :ComplexF64))
14 | @eval begin
15 | function geam(alpha::Number, A::CuSparseMatrixCSR{$elty}, beta::Number, B::CuSparseMatrixCSR{$elty}, index::SparseChar)
16 | m, n = size(A)
17 | (m, n) == size(B) || throw(DimensionMismatch("dimensions must match: a has dims $(axes(A)), b has dims $(axes(B))"))
18 | descrA = CuMatrixDescriptor('G', 'L', 'N', index)
19 | descrB = CuMatrixDescriptor('G', 'L', 'N', index)
20 | descrC = CuMatrixDescriptor('G', 'L', 'N', index)
21 | 
22 | rowPtrC = CuArray{Int32,1}(undef, m+1)
23 | 
24 | function bufferSize()
25 | out = Ref{Csize_t}(1)
26 | $bname(handle(), m, n,
27 | alpha, descrA, nnz(A), nonzeros(A), A.rowPtr, A.colVal,
28 | beta, descrB, nnz(B), nonzeros(B), B.rowPtr, B.colVal,
29 | descrC, CuArray{$elty,1}(undef, 0), rowPtrC, CuArray{Int32,1}(undef, 0),
30 | out)
31 | return out[]
32 | end
33 | 
34 | C = with_workspace(bufferSize) do buffer
35 | function get_nnzC(buffer)
36 | nnzTotalDevHostPtr = Ref{Cint}(1)
37 | cusparseXcsrgeam2Nnz(handle(), m, n,
38 | descrA, nnz(A), A.rowPtr, A.colVal,
39 | descrB, nnz(B), B.rowPtr, B.colVal,
40 | descrC, rowPtrC, nnzTotalDevHostPtr,
41 | buffer)
42 | return nnzTotalDevHostPtr[]
43 | end
44 | 
45 | nnzC = get_nnzC(buffer)
46 | colValC = CuArray{Int32,1}(undef, Int(nnzC))
47 | nzValC = CuArray{$elty,1}(undef, Int(nnzC))
48 | 
49 | $gname(handle(), m, n,
50 | alpha, descrA, nnz(A), nonzeros(A), A.rowPtr, A.colVal,
51 | beta, descrB, nnz(B), nonzeros(B), B.rowPtr, B.colVal,
52 | descrC, nzValC, rowPtrC, colValC,
53 | buffer)
54 | return CuSparseMatrixCSR(rowPtrC, colValC, nzValC, (m, n))
55 | end
56 | C
57 | end
58 | end
59 | end
60 | -------------------------------------------------------------------------------- /lib/cudnn/activation.jl: --------------------------------------------------------------------------------
1 | """
2 | cudnnActivationForward(x; mode, nanOpt, coef, alpha)
3 | cudnnActivationForward(x, d::cudnnActivationDescriptor; alpha)
4 | cudnnActivationForward!(y, x; mode, nanOpt, coef, alpha, beta)
5 | cudnnActivationForward!(y, x, d::cudnnActivationDescriptor; alpha, beta)
6 | 
7 | Return the result of the specified elementwise activation operation applied to `x`.
8 | Optionally `y` holds the result and `d` specifies the operation. `y` should be similar to
9 | `x` if specified. Keyword arguments `alpha=1, beta=0` can be used for scaling, i.e. `y .=
10 | alpha * op.(x) .+ beta * y`. The following keyword arguments specify the operation if `d` is
11 | not given:
12 | 
13 | * `mode = CUDNN_ACTIVATION_RELU`: Options are `SIGMOID`, `RELU`, `TANH`, `CLIPPED_RELU`, `ELU`, `IDENTITY`
14 | * `nanOpt = CUDNN_NOT_PROPAGATE_NAN`: NaN propagation policy, the other option is `CUDNN_PROPAGATE_NAN`
15 | * `coef=1`: When the activation mode is set to `CUDNN_ACTIVATION_CLIPPED_RELU`, this input specifies the clipping threshold; and when the activation mode is set to `CUDNN_ACTIVATION_ELU`, this input specifies the `alpha` parameter.
16 | """
17 | cudnnActivationForward, cudnnActivationForward!
18 | 
19 | 
20 | # Public methods
21 | cudnnActivationForward(x; o...) = cudnnActivationForwardWithDefaults(x; o...)
22 | cudnnActivationForward!(y, x; o...) = cudnnActivationForwardWithDefaults(x; y, o...) 23 | cudnnActivationForward(x, d::cudnnActivationDescriptor; o...) = cudnnActivationForwardWithDefaults(x; activationDesc=d, o...) 24 | cudnnActivationForward!(y, x, d::cudnnActivationDescriptor; o...) = cudnnActivationForwardWithDefaults(x; y, activationDesc=d, o...) 25 | 26 | 27 | # Private method 28 | function cudnnActivationForwardWithDefaults( 29 | x; 30 | y = similar(x), 31 | mode::cudnnActivationMode_t = CUDNN_ACTIVATION_RELU, 32 | nanOpt::cudnnNanPropagation_t = CUDNN_NOT_PROPAGATE_NAN, 33 | coef::Real=1, 34 | activationDesc::cudnnActivationDescriptor = cudnnActivationDescriptor(mode, nanOpt, Cdouble(coef)), 35 | alpha::Real=1, 36 | beta::Real=0, 37 | xDesc::cudnnTensorDescriptor = cudnnTensorDescriptor(x), 38 | yDesc::cudnnTensorDescriptor = xDesc, 39 | ) 40 | T = eltype(x) 41 | alpha, beta = scalingParameter(T,alpha), scalingParameter(T,beta) 42 | cudnnActivationForwardAD(x; activationDesc, alpha, xDesc, beta, yDesc, y) 43 | end 44 | 45 | 46 | # AD method: 47 | function cudnnActivationForwardAD(x; activationDesc, alpha, xDesc, beta, yDesc, y) 48 | cudnnActivationForward(handle(), activationDesc, alpha, xDesc, x, beta, yDesc, y) 49 | return y 50 | end 51 | 52 | 53 | # Deprecated: 54 | function cudnnActivationForward(x::DenseCuArray{T,N}, y::DenseCuArray{T,N}; o...) where {T,N} 55 | @warn "`cudnnActivationForward(x,y)` is deprecated, please use one of the methods in `@doc cudnnActivationForward`." maxlog=1 56 | cudnnActivationForward!(y, x; o...) 57 | end 58 | -------------------------------------------------------------------------------- /lib/cudadrv/error.jl: -------------------------------------------------------------------------------- 1 | # Error type and decoding functionality 2 | 3 | export CuError 4 | 5 | 6 | """ 7 | CuError(code) 8 | CuError(code, meta) 9 | 10 | Create a CUDA error object with error code `code`. The optional `meta` parameter indicates 11 | whether extra information, such as error logs, is known. 12 | """ 13 | struct CuError <: Exception 14 | code::CUresult 15 | meta::Any 16 | 17 | CuError(code, meta=nothing) = new(code, meta) 18 | end 19 | 20 | Base.convert(::Type{CUresult}, err::CuError) = err.code 21 | 22 | Base.:(==)(x::CuError,y::CuError) = x.code == y.code 23 | 24 | """ 25 | name(err::CuError) 26 | 27 | Gets the string representation of an error code. 28 | 29 | ```jldoctest 30 | julia> err = CuError(CUDA.cudaError_enum(1)) 31 | CuError(CUDA_ERROR_INVALID_VALUE) 32 | 33 | julia> name(err) 34 | "ERROR_INVALID_VALUE" 35 | ``` 36 | """ 37 | function name(err::CuError) 38 | str_ref = Ref{Cstring}() 39 | cuGetErrorName(err, str_ref) 40 | unsafe_string(str_ref[])[6:end] 41 | end 42 | 43 | """ 44 | description(err::CuError) 45 | 46 | Gets the string description of an error code. 47 | """ 48 | function description(err::CuError) 49 | if err.code == -1%UInt32 50 | "Cannot use the CUDA stub libraries." 
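        # (this code, -1 % UInt32 == 0xffffffff, is the sentinel returned when only the CUDA stub libraries are loaded; the same value is special-cased in the @check macro below)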
51 | else
52 | str_ref = Ref{Cstring}()
53 | cuGetErrorString(err, str_ref)
54 | unsafe_string(str_ref[])
55 | end
56 | end
57 | 
58 | function Base.showerror(io::IO, err::CuError)
59 | try
60 | print(io, "CUDA error: $(description(err)) (code $(reinterpret(Int32, err.code)), $(name(err)))")
61 | catch
62 | # we might throw before the library is initialized
63 | print(io, "CUDA error (code $(reinterpret(Int32, err.code)), $(err.code))")
64 | end
65 | 
66 | if err.meta !== nothing
67 | print(io, "\n")
68 | print(io, err.meta)
69 | end
70 | end
71 | 
72 | Base.show(io::IO, ::MIME"text/plain", err::CuError) = print(io, "CuError($(err.code))")
73 | 
74 | @enum_without_prefix cudaError_enum CUDA_
75 | 
76 | 
77 | ## API call wrapper
78 | 
79 | @inline function initialize_context()
80 | prepare_cuda_state()
81 | return
82 | end
83 | 
84 | # outlined functionality to avoid GC frame allocation
85 | @noinline throw_stub_error() =
86 | error("Cannot use the CUDA stub libraries. You either don't have the NVIDIA driver installed, or it is not properly discoverable.")
87 | @noinline function throw_api_error(res)
88 | if res == ERROR_OUT_OF_MEMORY
89 | throw(OutOfGPUMemoryError())
90 | else
91 | throw(CuError(res))
92 | end
93 | end
94 | 
95 | macro check(ex)
96 | quote
97 | res = $(esc(ex))
98 | if res == 0xffffffff
99 | throw_stub_error()
100 | elseif res != SUCCESS
101 | throw_api_error(res)
102 | end
103 | 
104 | nothing
105 | end
106 | end
107 | -------------------------------------------------------------------------------- /lib/cudadrv/pool.jl: --------------------------------------------------------------------------------
1 | # Stream-ordered memory allocator
2 | 
3 | export CuMemoryPool, default_memory_pool, memory_pool, memory_pool!, trim, attribute, attribute!
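# Rough usage sketch (illustrative only):
#
#   dev  = CuDevice(0)
#   pool = CuMemoryPool(dev)       # create a stream-ordered pool for this device
#   memory_pool!(dev, pool)        # make it the device's current pool
#   @assert memory_pool(dev) == pool
#   trim(pool)                     # release unused pool memory back to the system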
4 | 
5 | @enum_without_prefix CUmemAllocationType CU_MEM_
6 | @enum_without_prefix CUmemAllocationHandleType CU_MEM_
7 | 
8 | mutable struct CuMemoryPool
9 | handle::CUmemoryPool
10 | ctx::CuContext
11 | 
12 | function CuMemoryPool(dev::CuDevice;
13 | alloc_type::CUmemAllocationType=ALLOCATION_TYPE_PINNED,
14 | handle_type::CUmemAllocationHandleType=HANDLE_TYPE_NONE)
15 | props = Ref(CUmemPoolProps(
16 | alloc_type,
17 | handle_type,
18 | CUmemLocation(
19 | CU_MEM_LOCATION_TYPE_DEVICE,
20 | deviceid(dev)
21 | ),
22 | C_NULL,
23 | ntuple(i->Cuchar(0), 64)
24 | ))
25 | handle_ref = Ref{CUmemoryPool}()
26 | cuMemPoolCreate(handle_ref, props)
27 | 
28 | ctx = current_context()
29 | obj = new(handle_ref[], ctx)
30 | finalizer(unsafe_destroy!, obj)
31 | return obj
32 | end
33 | 
34 | global function default_memory_pool(dev::CuDevice)
35 | handle_ref = Ref{CUmemoryPool}()
36 | cuDeviceGetDefaultMemPool(handle_ref, dev)
37 | 
38 | ctx = current_context()
39 | new(handle_ref[], ctx)
40 | end
41 | 
42 | global function memory_pool(dev::CuDevice)
43 | handle_ref = Ref{CUmemoryPool}()
44 | cuDeviceGetMemPool(handle_ref, dev)
45 | 
46 | ctx = current_context()
47 | new(handle_ref[], ctx)
48 | end
49 | end
50 | 
51 | function unsafe_destroy!(pool::CuMemoryPool)
52 | @finalize_in_ctx pool.ctx cuMemPoolDestroy(pool)
53 | end
54 | 
55 | Base.unsafe_convert(::Type{CUmemoryPool}, pool::CuMemoryPool) = pool.handle
56 | 
57 | Base.:(==)(a::CuMemoryPool, b::CuMemoryPool) = a.handle == b.handle
58 | Base.hash(pool::CuMemoryPool, h::UInt) = hash(pool.handle, h)
59 | 
60 | memory_pool!(dev::CuDevice, pool::CuMemoryPool) = cuDeviceSetMemPool(dev, pool)
61 | 
62 | trim(pool::CuMemoryPool, bytes_to_keep::Integer=0) = cuMemPoolTrimTo(pool, bytes_to_keep)
63 | 
64 | 
65 | ## pool attributes
66 | 
67 | @enum_without_prefix CUmemPool_attribute CU_
68 | 
69 | """
70 | attribute(X, pool::CuMemoryPool, attr)
71 | 
72 | Returns attribute `attr` about `pool`. The type of the returned value depends on the
73 | attribute, and as such must be passed as the `X` parameter.
74 | """
75 | function attribute(X::Type, pool::CuMemoryPool, attr::CUmemPool_attribute) where {T}
76 | value = Ref{X}()
77 | cuMemPoolGetAttribute(pool, attr, value)
78 | return value[]
79 | end
80 | 
81 | """
82 | attribute!(pool::CuMemoryPool, attr, val)
83 | 
84 | Sets attribute `attr` of memory pool `pool` to `val`.
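For example, `attribute!(pool, MEMPOOL_ATTR_RELEASE_THRESHOLD, UInt64(2^30))` would set the pool's release threshold to 1 GiB; attribute names follow the `CUmemPool_attribute` enum with the `CU_` prefix stripped (see `@enum_without_prefix` above), and the expected value type is determined by the driver.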
85 | """ 86 | function attribute!(pool::CuMemoryPool, attr::CUmemPool_attribute, value) where {T} 87 | cuMemPoolSetAttribute(pool, attr, Ref(value)) 88 | return 89 | end 90 | -------------------------------------------------------------------------------- /lib/cufft/error.jl: -------------------------------------------------------------------------------- 1 | export CUFFTError 2 | 3 | struct CUFFTError <: Exception 4 | code::cufftResult 5 | end 6 | 7 | Base.convert(::Type{cufftResult}, err::CUFFTError) = err.code 8 | 9 | Base.showerror(io::IO, err::CUFFTError) = 10 | print(io, "CUFFTError: ", description(err), " (code $(reinterpret(Int32, err.code)), $(name(err)))") 11 | 12 | name(err::CUFFTError) = string(err.code) 13 | 14 | ## COV_EXCL_START 15 | function description(err::CUFFTError) 16 | if err.code == CUFFT_SUCCESS 17 | "the operation completed successfully" 18 | elseif err.code == CUFFT_INVALID_PLAN 19 | "cuFFT was passed an invalid plan handle" 20 | elseif err.code == CUFFT_ALLOC_FAILED 21 | "cuFFT failed to allocate GPU or CPU memory" 22 | elseif err.code == CUFFT_INVALID_TYPE 23 | "cuFFT invalid type " # No longer used 24 | elseif err.code == CUFFT_INVALID_VALUE 25 | "user specified an invalid pointer or parameter" 26 | elseif err.code == CUFFT_INTERNAL_ERROR 27 | "driver or internal cuFFT library error" 28 | elseif err.code == CUFFT_EXEC_FAILED 29 | "failed to execute an FFT on the GPU" 30 | elseif err.code == CUFFT_SETUP_FAILED 31 | "the cuFFT library failed to initialize" 32 | elseif err.code == CUFFT_INVALID_SIZE 33 | "user specified an invalid transform size" 34 | elseif err.code == CUFFT_UNALIGNED_DATA 35 | "cuFFT unaligned data" # No longer used 36 | elseif err.code == CUFFT_INCOMPLETE_PARAMETER_LIST 37 | "missing parameters in call" 38 | elseif err.code == CUFFT_INVALID_DEVICE 39 | "execution of a plan was on different GPU than plan creation" 40 | elseif err.code == CUFFT_PARSE_ERROR 41 | "internal plan database error" 42 | elseif err.code == CUFFT_NO_WORKSPACE 43 | "no workspace has been provided prior to plan execution" 44 | elseif err.code == CUFFT_NOT_IMPLEMENTED 45 | "function does not implement functionality for parameters given." 46 | elseif err.code == CUFFT_LICENSE_ERROR 47 | "cuFFT license error" # Used in previous versions. 48 | elseif err.code == CUFFT_NOT_SUPPORTED 49 | "operation is not supported for parameters given." 50 | else 51 | "no description for this error" 52 | end 53 | end 54 | ## COV_EXCL_STOP 55 | 56 | 57 | ## API call wrapper 58 | 59 | # outlined functionality to avoid GC frame allocation 60 | @noinline function throw_api_error(res) 61 | if res == CUFFT_ALLOC_FAILED 62 | throw(OutOfGPUMemoryError()) 63 | else 64 | throw(CUFFTError(res)) 65 | end 66 | end 67 | 68 | macro check(ex, errs...) 
69 | check = :(isequal(err, CUFFT_ALLOC_FAILED)) 70 | for err in errs 71 | check = :($check || isequal(err, $(esc(err)))) 72 | end 73 | 74 | quote 75 | res = @retry_reclaim err->$check $(esc(ex)) 76 | if res != CUFFT_SUCCESS 77 | throw_api_error(res) 78 | end 79 | 80 | nothing 81 | end 82 | end 83 | -------------------------------------------------------------------------------- /docs/Manifest.toml: -------------------------------------------------------------------------------- 1 | # This file is machine-generated - editing it directly is not advised 2 | 3 | [[ANSIColoredPrinters]] 4 | git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c" 5 | uuid = "a4c015fc-c6ff-483c-b24f-f7ea428134e9" 6 | version = "0.0.1" 7 | 8 | [[Base64]] 9 | uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" 10 | 11 | [[Dates]] 12 | deps = ["Printf"] 13 | uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" 14 | 15 | [[DocStringExtensions]] 16 | deps = ["LibGit2"] 17 | git-tree-sha1 = "a32185f5428d3986f47c2ab78b1f216d5e6cc96f" 18 | uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" 19 | version = "0.8.5" 20 | 21 | [[Documenter]] 22 | deps = ["ANSIColoredPrinters", "Base64", "Dates", "DocStringExtensions", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] 23 | git-tree-sha1 = "8b43e37cfb4f4edc2b6180409acc0cebce7fede8" 24 | uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" 25 | version = "0.27.7" 26 | 27 | [[IOCapture]] 28 | deps = ["Logging", "Random"] 29 | git-tree-sha1 = "f7be53659ab06ddc986428d3a9dcc95f6fa6705a" 30 | uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89" 31 | version = "0.2.2" 32 | 33 | [[InteractiveUtils]] 34 | deps = ["Markdown"] 35 | uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" 36 | 37 | [[JSON]] 38 | deps = ["Dates", "Mmap", "Parsers", "Unicode"] 39 | git-tree-sha1 = "8076680b162ada2a031f707ac7b4953e30667a37" 40 | uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" 41 | version = "0.21.2" 42 | 43 | [[LibGit2]] 44 | deps = ["Base64", "NetworkOptions", "Printf", "SHA"] 45 | uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" 46 | 47 | [[Literate]] 48 | deps = ["Base64", "IOCapture", "JSON", "REPL"] 49 | git-tree-sha1 = "bbebc3c14dbfbe76bfcbabf0937481ac84dc86ef" 50 | uuid = "98b081ad-f1c9-55d3-8b20-4c87d4299306" 51 | version = "2.9.3" 52 | 53 | [[Logging]] 54 | uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" 55 | 56 | [[Markdown]] 57 | deps = ["Base64"] 58 | uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" 59 | 60 | [[Mmap]] 61 | uuid = "a63ad114-7e13-5084-954f-fe012c677804" 62 | 63 | [[NetworkOptions]] 64 | uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" 65 | 66 | [[Parsers]] 67 | deps = ["Dates"] 68 | git-tree-sha1 = "a8709b968a1ea6abc2dc1967cb1db6ac9a00dfb6" 69 | uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" 70 | version = "2.0.5" 71 | 72 | [[Printf]] 73 | deps = ["Unicode"] 74 | uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" 75 | 76 | [[REPL]] 77 | deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] 78 | uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" 79 | 80 | [[Random]] 81 | deps = ["Serialization"] 82 | uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" 83 | 84 | [[SHA]] 85 | uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" 86 | 87 | [[Serialization]] 88 | uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" 89 | 90 | [[Sockets]] 91 | uuid = "6462fe0b-24de-5631-8697-dd941f90decc" 92 | 93 | [[Test]] 94 | deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] 95 | uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 96 | 97 | [[Unicode]] 98 | uuid = 
"4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" 99 | -------------------------------------------------------------------------------- /test/exceptions.jl: -------------------------------------------------------------------------------- 1 | # NVIDIA bug 3263616: compute-sanitizer crashes when generating host backtraces, 2 | # but --show-backtrace=no does not survive execve. 3 | @not_if_sanitize begin 4 | 5 | # these tests spawn subprocesses, so reset the current context to conserve memory 6 | CUDA.can_reset_device() && device_reset!() 7 | 8 | host_error_re = r"ERROR: (KernelException: exception thrown during kernel execution on device|CUDA error: an illegal instruction was encountered|CUDA error: unspecified launch failure)" 9 | device_error_re = r"ERROR: a \w+ was thrown during kernel execution" 10 | 11 | @testset "stack traces at different debug levels" begin 12 | 13 | script = """ 14 | function kernel(arr, val) 15 | arr[1] = val 16 | return 17 | end 18 | 19 | cpu = zeros(Int) 20 | gpu = CuArray(cpu) 21 | @cuda kernel(gpu, 1.2) 22 | synchronize() 23 | 24 | # FIXME: on some platforms (Windows...), for some users, the exception flag change 25 | # doesn't immediately propagate to the host, and gets caught during finalization. 26 | # this looks like a driver bug, since we threadfence_system() after setting the flag. 27 | # https://stackoverflow.com/questions/16417346/cuda-pinned-memory-flushing-from-the-device 28 | sleep(1) 29 | synchronize() 30 | """ 31 | 32 | # NOTE: kernel exceptions aren't always caught on the CPU as a KernelException. 33 | # on older devices, we emit a `trap` which causes a CUDA error... 34 | # 35 | 36 | let (code, out, err) = julia_script(script, `-g0`) 37 | @test code == 1 38 | @test occursin(host_error_re, err) 39 | @test !occursin(device_error_re, out) 40 | # NOTE: stdout sometimes contain a failure to free the CuArray with ILLEGAL_ACCESS 41 | end 42 | 43 | let (code, out, err) = julia_script(script, `-g1`) 44 | @test code == 1 45 | @test occursin(host_error_re, err) 46 | @test occursin(device_error_re, out) 47 | @test occursin("Run Julia on debug level 2 for device stack traces", out) 48 | end 49 | 50 | let (code, out, err) = julia_script(script, `-g2`, 51 | "JULIA_CUDA_DEBUG_INFO"=>false) # NVIDIA#3305774 52 | @test code == 1 53 | @test occursin(host_error_re, err) 54 | @test occursin(device_error_re, out) 55 | @test occursin("[1] Int64 at $(joinpath(".", "float.jl"))", out) 56 | @test occursin("[4] kernel at $(joinpath(".", "none")):5", out) 57 | end 58 | 59 | end 60 | 61 | @testset "#329" begin 62 | 63 | script = """ 64 | @noinline foo(a, i) = a[1] = i 65 | bar(a) = (foo(a, 42); nothing) 66 | 67 | ptr = reinterpret(Core.LLVMPtr{Int,AS.Global}, C_NULL) 68 | arr = CuDeviceArray{Int,1,AS.Global}((0,), ptr) 69 | 70 | CUDA.@sync @cuda bar(arr) 71 | """ 72 | 73 | let (code, out, err) = julia_script(script, `-g2`, 74 | "JULIA_CUDA_DEBUG_INFO"=>false) # NVIDIA#3305774 75 | @test code == 1 76 | @test occursin(host_error_re, err) 77 | @test occursin(device_error_re, out) 78 | @test occursin("foo at $(joinpath(".", "none")):4", out) 79 | @test occursin("bar at $(joinpath(".", "none")):5", out) 80 | end 81 | 82 | end 83 | 84 | end 85 | -------------------------------------------------------------------------------- /src/CUDA.jl: -------------------------------------------------------------------------------- 1 | module CUDA 2 | 3 | using GPUCompiler 4 | 5 | using GPUArrays 6 | 7 | using LLVM 8 | using LLVM.Interop 9 | using Core: LLVMPtr 10 | 11 | using Adapt: Adapt, adapt, 
WrappedArray 12 | 13 | using Requires: @require 14 | 15 | using LinearAlgebra 16 | 17 | using BFloat16s: BFloat16 18 | 19 | using ExprTools: splitdef, combinedef 20 | 21 | # TODO: set lib versions in bindeps or so 22 | 23 | # XXX: to be replaced by a JLL 24 | include("../deps/Deps.jl") 25 | using .Deps 26 | 27 | # only use TimerOutputs on non latency-critical CI, in part because 28 | # @timeit_debug isn't truely zero-cost (KristofferC/TimerOutputs.jl#120) 29 | if getenv("CI", false) && !getenv("BENCHMARKS", false) 30 | using TimerOutputs 31 | const to = TimerOutput() 32 | 33 | macro timeit_ci(args...) 34 | TimerOutputs.timer_expr(CUDA, false, :($CUDA.to), args...) 35 | end 36 | else 37 | macro timeit_ci(args...) 38 | esc(args[end]) 39 | end 40 | end 41 | 42 | 43 | ## source code includes 44 | 45 | include("pointer.jl") 46 | 47 | # core library 48 | include("../lib/utils/APIUtils.jl") 49 | include("../lib/cudadrv/CUDAdrv.jl") 50 | 51 | # essential stuff 52 | include("initialization.jl") 53 | include("state.jl") 54 | include("debug.jl") 55 | 56 | # device functionality (needs to be loaded first, because of generated functions) 57 | include("device/utils.jl") 58 | include("device/pointer.jl") 59 | include("device/array.jl") 60 | include("device/intrinsics.jl") 61 | include("device/runtime.jl") 62 | include("device/texture.jl") 63 | include("device/random.jl") 64 | include("device/sparse.jl") 65 | include("device/quirks.jl") 66 | 67 | # array essentials 68 | include("pool.jl") 69 | include("array.jl") 70 | 71 | # compiler libraries 72 | include("../lib/cupti/CUPTI.jl") 73 | include("../lib/nvtx/NVTX.jl") 74 | export CUPTI, NVTX 75 | 76 | # compiler implementation 77 | include("compiler/gpucompiler.jl") 78 | include("compiler/execution.jl") 79 | include("compiler/exceptions.jl") 80 | include("compiler/reflection.jl") 81 | 82 | # array implementation 83 | include("gpuarrays.jl") 84 | include("utilities.jl") 85 | include("texture.jl") 86 | 87 | # array libraries 88 | include("../lib/complex.jl") 89 | include("../lib/library_types.jl") 90 | include("../lib/cublas/CUBLAS.jl") 91 | include("../lib/cusparse/CUSPARSE.jl") 92 | include("../lib/cusolver/CUSOLVER.jl") 93 | include("../lib/cufft/CUFFT.jl") 94 | include("../lib/curand/CURAND.jl") 95 | include("../lib/cudnn/CUDNN.jl") 96 | include("../lib/cutensor/CUTENSOR.jl") 97 | export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND, CUDNN, CUTENSOR 98 | 99 | # integrations and specialized functionality 100 | include("indexing.jl") 101 | include("broadcast.jl") 102 | include("mapreduce.jl") 103 | include("accumulate.jl") 104 | include("reverse.jl") 105 | include("linalg.jl") 106 | include("iterator.jl") 107 | include("random.jl") 108 | include("sorting.jl") 109 | 110 | # other libraries 111 | include("../lib/nvml/NVML.jl") 112 | const has_nvml = NVML.has_nvml 113 | export NVML, has_nvml 114 | 115 | include("deprecated.jl") 116 | include("precompile.jl") 117 | 118 | end 119 | -------------------------------------------------------------------------------- /src/device/intrinsics/assertion.jl: -------------------------------------------------------------------------------- 1 | # Assertion (B.19) 2 | 3 | export @cuassert 4 | 5 | """ 6 | @assert cond [text] 7 | 8 | Signal assertion failure to the CUDA driver if `cond` is `false`. Preferred syntax for 9 | writing assertions, mimicking `Base.@assert`. Message `text` is optionally displayed upon 10 | assertion failure. 11 | 12 | !!! 
warning 13 | A failed assertion will crash the GPU, so use sparingly as a debugging tool. 14 | Furthermore, the assertion might be disabled at various optimization levels, and thus 15 | should not cause any side-effects. 16 | """ 17 | macro cuassert(ex, msgs...) 18 | # message handling copied from Base.@assert 19 | msg = isempty(msgs) ? ex : msgs[1] 20 | if isa(msg, AbstractString) 21 | msg = msg # pass-through 22 | elseif !isempty(msgs) && (isa(msg, Expr) || isa(msg, Symbol)) 23 | # message is an expression needing evaluating 24 | msg = :(Main.Base.string($(esc(msg)))) 25 | elseif isdefined(Main, :Base) && isdefined(Main.Base, :string) && applicable(Main.Base.string, msg) 26 | msg = Main.Base.string(msg) 27 | else 28 | # string() might not be defined during bootstrap 29 | msg = :(Main.Base.string($(Expr(:quote,msg)))) 30 | end 31 | 32 | return :($(esc(ex)) ? $(nothing) 33 | : cuassert_fail($(Val(Symbol(msg))), 34 | $(Val(__source__.file)), 35 | $(Val(__source__.line)))) 36 | end 37 | 38 | assert_counter = 0 39 | 40 | @generated function cuassert_fail(::Val{msg}, ::Val{file}, ::Val{line}) where 41 | {msg, file, line} 42 | Context() do ctx 43 | T_void = LLVM.VoidType(ctx) 44 | T_int32 = LLVM.Int32Type(ctx) 45 | T_pint8 = LLVM.PointerType(LLVM.Int8Type(ctx)) 46 | 47 | # create function 48 | llvm_f, _ = create_function(T_void) 49 | mod = LLVM.parent(llvm_f) 50 | 51 | # generate IR 52 | Builder(ctx) do builder 53 | entry = BasicBlock(llvm_f, "entry"; ctx) 54 | position!(builder, entry) 55 | 56 | global assert_counter 57 | assert_counter += 1 58 | 59 | message = globalstring_ptr!(builder, String(msg), "assert_message_$(assert_counter)") 60 | file = globalstring_ptr!(builder, String(file), "assert_file_$(assert_counter)") 61 | line = ConstantInt(T_int32, line) 62 | func = globalstring_ptr!(builder, "unknown", "assert_function_$(assert_counter)") 63 | charSize = ConstantInt(Csize_t(1); ctx) 64 | 65 | # invoke __assertfail and return 66 | # NOTE: we don't mark noreturn since that control flow might confuse ptxas 67 | assertfail_typ = 68 | LLVM.FunctionType(T_void, 69 | [T_pint8, T_pint8, T_int32, T_pint8, llvmtype(charSize)]) 70 | assertfail = LLVM.Function(mod, "__assertfail", assertfail_typ) 71 | call!(builder, assertfail, [message, file, line, func, charSize]) 72 | 73 | ret!(builder) 74 | end 75 | 76 | call_function(llvm_f, Nothing, Tuple{}) 77 | end 78 | end 79 | -------------------------------------------------------------------------------- /lib/utils/memoization.jl: -------------------------------------------------------------------------------- 1 | export @memoize 2 | 3 | """ 4 | @memoize [arg::T]... begin 5 | # expensive computation 6 | end::T 7 | 8 | Low-level, no-frills memoization macro that stores values in a thread-local, typed Dict. The 9 | types of the dictionary are derived from the syntactical type assertions. 10 | 11 | When there are no arguments to key the cache with, instead of a dictionary a simple array 12 | with per-thread elements is used. This further improves performance to 2ns per access. 13 | """ 14 | macro memoize(ex...) 15 | code = ex[end] 16 | args = ex[1:end-1] 17 | 18 | # decode the code body 19 | @assert Meta.isexpr(code, :(::)) 20 | rettyp = code.args[2] 21 | code = code.args[1] 22 | 23 | # decode the arguments 24 | argtyps = [] 25 | argvars = [] 26 | for arg in args 27 | @assert Meta.isexpr(arg, :(::)) 28 | push!(argvars, arg.args[1]) 29 | push!(argtyps, arg.args[2]) 30 | end 31 | 32 | # the global cache is an array with one entry per thread. 
if we don't have to key on 33 | # anything, that entry will be the memoized new_value, or else a dictionary of values. 34 | @gensym global_cache 35 | 36 | # generate code to access memoized values 37 | # (assuming the global_cache can be indexed with the thread ID) 38 | if isempty(args) 39 | # if we don't have to key on anything, use the global cache directly 40 | global_cache_eltyp = :(Union{Nothing,$rettyp}) 41 | ex = quote 42 | cache = get!($(esc(global_cache))) do 43 | [nothing for _ in 1:Threads.nthreads()] 44 | end 45 | cached_value = @inbounds cache[Threads.threadid()] 46 | if cached_value !== nothing 47 | cached_value 48 | else 49 | new_value = $(esc(code))::$rettyp 50 | @inbounds cache[Threads.threadid()] = new_value 51 | new_value 52 | end 53 | end 54 | else 55 | if length(args) == 1 56 | global_cache_eltyp = :(Dict{$(argtyps[1]),$rettyp}) 57 | global_init = :(Dict{$(argtyps[1]),$rettyp}()) 58 | key = :($(esc(argvars[1]))) 59 | else 60 | global_cache_eltyp = :(Dict{Tuple{$(argtyps...)},$rettyp}) 61 | global_init = :(Dict{Tuple{$(argtyps...)},$rettyp}()) 62 | key = :(tuple($(map(esc, argvars)...))) 63 | end 64 | ex = quote 65 | cache = get!($(esc(global_cache))) do 66 | [$global_init for _ in 1:Threads.nthreads()] 67 | end 68 | local_cache = @inbounds cache[Threads.threadid()] 69 | cached_value = get(local_cache, $key, nothing) 70 | if cached_value !== nothing 71 | cached_value 72 | else 73 | new_value = $(esc(code))::$rettyp 74 | local_cache[$key] = new_value 75 | new_value 76 | end 77 | end 78 | end 79 | 80 | # define the per-thread cache 81 | @eval __module__ begin 82 | const $global_cache = LazyInitialized{Vector{$(global_cache_eltyp)}}() 83 | end 84 | 85 | quote 86 | $ex 87 | end 88 | end 89 | -------------------------------------------------------------------------------- /test/device/intrinsics/math.jl: -------------------------------------------------------------------------------- 1 | using SpecialFunctions 2 | 3 | @testset "math" begin 4 | @testset "log10" begin 5 | @test testf(a->log10.(a), Float32[100]) 6 | end 7 | 8 | @testset "pow" begin 9 | for T in (Float16, Float32, Float64, ComplexF32, ComplexF64) 10 | range = (T<:Integer) ? 
(T(5):T(10)) : T 11 | @test testf((x,y)->x.^y, rand(Float32, 1), rand(range, 1)) 12 | @test testf((x,y)->x.^y, rand(Float32, 1), -rand(range, 1)) 13 | end 14 | end 15 | 16 | @testset "isinf" begin 17 | for x in (Inf32, Inf, NaN32, NaN) 18 | @test testf(x->isinf.(x), [x]) 19 | end 20 | end 21 | 22 | @testset "isnan" begin 23 | for x in (Inf32, Inf, NaN32, NaN) 24 | @test testf(x->isnan.(x), [x]) 25 | end 26 | end 27 | 28 | for op in (exp, angle, exp2, exp10,) 29 | @testset "$op" begin 30 | for T in (Float16, Float32, Float64) 31 | @test testf(x->op.(x), rand(T, 1)) 32 | @test testf(x->op.(x), -rand(T, 1)) 33 | end 34 | end 35 | end 36 | 37 | for op in (expm1,) 38 | @testset "$op" begin 39 | # FIXME: add expm1(::Float16) to Base 40 | for T in (Float32, Float64) 41 | @test testf(x->op.(x), rand(T, 1)) 42 | @test testf(x->op.(x), -rand(T, 1)) 43 | end 44 | end 45 | end 46 | 47 | for op in (exp, abs, abs2, angle, log) 48 | @testset "Complex - $op" begin 49 | for T in (ComplexF16, ComplexF32, ComplexF64) 50 | @test testf(x->op.(x), rand(T, 1)) 51 | @test testf(x->op.(x), -rand(T, 1)) 52 | end 53 | 54 | end 55 | end 56 | @testset "mod and rem" begin 57 | for T in (Float16, Float32, Float64) 58 | @test testf(a->rem.(a, T(2)), T[0, 1, 1.5, 2, -1]) 59 | @test testf(a->rem.(a, T(2), RoundNearest), T[0, 1, 1.5, 2, -1]) 60 | @test testf(a->mod.(a, T(2)), T[0, 1, 1.5, 2, -1]) 61 | end 62 | end 63 | 64 | @testset "rsqrt" begin 65 | # GPUCompiler.jl#173: a CUDA-only device function fails to validate 66 | function kernel(a) 67 | a[] = CUDA.rsqrt(a[]) 68 | return 69 | end 70 | 71 | # make sure this test uses an actual device function 72 | @test_throws ErrorException kernel(ones(1)) 73 | 74 | for T in (Float16, Float32) 75 | a = CuArray{T}([4]) 76 | @cuda kernel(a) 77 | @test Array(a) == [0.5] 78 | end 79 | end 80 | 81 | @testset "fma" begin 82 | for T in (Float16, Float32, Float64) 83 | @test testf((x,y,z)->fma.(x,y,z), rand(T, 1), rand(T, 1), rand(T, 1)) 84 | @test testf((x,y,z)->fma.(x,y,z), rand(T, 1), -rand(T, 1), -rand(T, 1)) 85 | end 86 | end 87 | 88 | # something from SpecialFunctions.jl 89 | @testset "erf" begin 90 | @test testf(a->SpecialFunctions.erf.(a), Float32[1.0]) 91 | end 92 | 93 | @testset "exp" begin 94 | # JuliaGPU/CUDA.jl#1085: exp uses Base.sincos performing a global CPU load 95 | @test testf(x->exp.(x), [1e7im]) 96 | end 97 | end 98 | -------------------------------------------------------------------------------- /docs/src/faq.md: -------------------------------------------------------------------------------- 1 | # Frequently Asked Questions 2 | 3 | This page is a compilation of frequently asked questions and answers. 4 | 5 | 6 | ## An old version of CUDA.jl keeps getting installed! 7 | 8 | Sometimes it happens that a breaking version of CUDA.jl or one of its dependencies is 9 | released. If any package you use isn't yet compatible with this release, this will block 10 | automatic upgrade of CUDA.jl. For example, with Flux.jl v0.11.1 we get CUDA.jl v1.3.3 11 | despite there being a v2.x release: 12 | 13 | ``` 14 | pkg> add Flux 15 | [587475ba] + Flux v0.11.1 16 | pkg> add CUDA 17 | [052768ef] + CUDA v1.3.3 18 | ``` 19 | 20 | To examine which package is holding back CUDA.jl, you can "force" an upgrade by specifically 21 | requesting a newer version. The resolver will then complain, and explain why this upgrade 22 | isn't possible: 23 | 24 | ``` 25 | pkg> add CUDA.jl@2 26 | Resolving package versions... 
27 | ERROR: Unsatisfiable requirements detected for package Adapt [79e6a3ab]: 28 | Adapt [79e6a3ab] log: 29 | ├─possible versions are: [0.3.0-0.3.1, 0.4.0-0.4.2, 1.0.0-1.0.1, 1.1.0, 2.0.0-2.0.2, 2.1.0, 2.2.0, 2.3.0] or uninstalled 30 | ├─restricted by compatibility requirements with CUDA [052768ef] to versions: [2.2.0, 2.3.0] 31 | │ └─CUDA [052768ef] log: 32 | │ ├─possible versions are: [0.1.0, 1.0.0-1.0.2, 1.1.0, 1.2.0-1.2.1, 1.3.0-1.3.3, 2.0.0-2.0.2] or uninstalled 33 | │ └─restricted to versions 2 by an explicit requirement, leaving only versions 2.0.0-2.0.2 34 | └─restricted by compatibility requirements with Flux [587475ba] to versions: [0.3.0-0.3.1, 0.4.0-0.4.2, 1.0.0-1.0.1, 1.1.0] — no versions left 35 | └─Flux [587475ba] log: 36 | ├─possible versions are: [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4, 0.11.0-0.11.1] or uninstalled 37 | ├─restricted to versions * by an explicit requirement, leaving only versions [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4, 0.11.0-0.11.1] 38 | └─restricted by compatibility requirements with CUDA [052768ef] to versions: [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4] or uninstalled, leaving only versions: [0.4.1, 0.5.0-0.5.4, 0.6.0-0.6.10, 0.7.0-0.7.3, 0.8.0-0.8.3, 0.9.0, 0.10.0-0.10.4] 39 | └─CUDA [052768ef] log: see above 40 | ``` 41 | 42 | A common source of these incompatibilities is having both CUDA.jl and the older 43 | CUDAnative.jl/CuArrays.jl/CUDAdrv.jl stack installed: These are incompatible, and cannot 44 | coexist. You can inspect in the Pkg REPL which exact packages you have installed using the 45 | `status --manifest` option. 46 | 47 | 48 | ## Can you wrap this or that CUDA API? 49 | 50 | If a certain API isn't wrapped with some high-level functionality, you can always use the 51 | underlying C APIs which are always available as unexported methods. For example, you can 52 | access the CUDA driver library as `cu` prefixed, unexported functions like 53 | `CUDA.cuDriverGetVersion`. Similarly, vendor libraries like CUBLAS are available through 54 | their exported submodule handles, e.g., `CUBLAS.cublasGetVersion_v2`. 55 | 56 | Any help on designing or implementing high-level wrappers for this low-level functionality 57 | is greatly appreciated, so please consider contributing your uses of these APIs on the 58 | respective repositories. 59 | --------------------------------------------------------------------------------