├── docs ├── .gitignore ├── Project.toml ├── src │ ├── lib │ │ ├── compilation.md │ │ ├── device │ │ │ ├── array.md │ │ │ ├── libdevice.md │ │ │ └── intrinsics.md │ │ └── reflection.md │ ├── index.md │ └── man │ │ ├── performance.md │ │ ├── troubleshooting.md │ │ ├── usage.md │ │ └── hacking.md └── make.jl ├── examples ├── reduce │ ├── .gitignore │ ├── verify.jl │ ├── benchmark.jl │ ├── reduce.cu │ └── reduce.jl ├── vadd.jl ├── hello_world.jl ├── oob.jl ├── multigpu.jl ├── peakflops.jl ├── scan.jl ├── pairwise.jl └── blackscholes.jl ├── deps ├── .gitignore └── build.jl ├── .gitignore ├── test ├── perf │ └── launch_overhead │ │ ├── .gitignore │ │ ├── cuda.cu │ │ ├── build.jl │ │ ├── cudanative.jl │ │ ├── cuda.jl │ │ ├── README.md │ │ └── cuda.c ├── examples.jl ├── base.jl ├── device │ ├── pointer.jl │ ├── codegen.jl │ └── array.jl ├── runtests.jl ├── pointer.jl └── util.jl ├── REQUIRE ├── bors.toml ├── codecov.yml ├── src ├── deprecated.jl ├── compiler.jl ├── utils.jl ├── device │ ├── cuda_intrinsics.jl │ ├── cuda_intrinsics │ │ ├── misc.jl │ │ ├── memory_dynamic.jl │ │ ├── warp_vote.jl │ │ ├── assertion.jl │ │ ├── indexing.jl │ │ ├── output.jl │ │ ├── synchronization.jl │ │ ├── memory_shared.jl │ │ └── warp_shuffle.jl │ ├── array.jl │ ├── runtime_intrinsics.jl │ ├── pointer.jl │ └── tools.jl ├── CUDAnative.jl ├── compiler │ ├── debug.jl │ ├── driver.jl │ ├── mcgen.jl │ ├── common.jl │ ├── rtlib.jl │ └── validation.jl ├── init.jl ├── execution.jl └── reflection.jl ├── Project.toml ├── LICENSE.md ├── README.md ├── .gitlab-ci.yml ├── res └── parse_libdevice.jl └── NEWS.md /docs/.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | -------------------------------------------------------------------------------- /examples/reduce/.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | *.ptx 3 | -------------------------------------------------------------------------------- /deps/.gitignore: -------------------------------------------------------------------------------- 1 | ext.jl 2 | ext.jl.bak 3 | build.log 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.jl.*.cov 2 | *.jl.mem 3 | Manifest.toml 4 | deps/runtime/ 5 | -------------------------------------------------------------------------------- /test/perf/launch_overhead/.gitignore: -------------------------------------------------------------------------------- 1 | cuda 2 | cuda.ptx 3 | *.nvvp 4 | 5 | -------------------------------------------------------------------------------- /REQUIRE: -------------------------------------------------------------------------------- 1 | julia 1.0 2 | CUDAdrv 1.1 3 | LLVM 0.9.14 4 | CUDAapi 0.4.0 5 | Adapt 0.4 6 | -------------------------------------------------------------------------------- /bors.toml: -------------------------------------------------------------------------------- 1 | status = [ 2 | "ci/gitlab/%" 3 | ] 4 | delete_merged_branches = true 5 | -------------------------------------------------------------------------------- /docs/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" 3 | Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" 4 | -------------------------------------------------------------------------------- 
/test/perf/launch_overhead/cuda.cu: -------------------------------------------------------------------------------- 1 | extern "C" __global__ void kernel_dummy(float *ptr) 2 | { 3 | ptr[blockIdx.x] = 0; 4 | } 5 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | ignore: 3 | - "deps/*" 4 | - "src/device/*" 5 | status: 6 | patch: false 7 | project: false 8 | changes: false 9 | comment: false 10 | -------------------------------------------------------------------------------- /src/deprecated.jl: -------------------------------------------------------------------------------- 1 | # Deprecated functionality 2 | 3 | macro profile(ex) 4 | Base.depwarn("`CUDAnative.@profile` is deprecated, use `CUDAdrv.@profile` instead", :profile) 5 | quote 6 | CUDAdrv.@profile begin 7 | $(esc(ex)) 8 | end 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /docs/src/lib/compilation.md: -------------------------------------------------------------------------------- 1 | # Compilation & Execution 2 | 3 | ```@docs 4 | CUDAnative.@cuda 5 | CUDAnative.cufunction 6 | CUDAnative.Kernel 7 | CUDAnative.compile 8 | CUDAnative.cudaconvert 9 | CUDAnative.nearest_warpsize 10 | ``` 11 | 12 | ## Devices 13 | 14 | ```@docs 15 | CUDAnative.device! 16 | ``` 17 | -------------------------------------------------------------------------------- /docs/src/lib/device/array.md: -------------------------------------------------------------------------------- 1 | # Arrays 2 | 3 | CUDAnative provides a primitive, lightweight array type to manage GPU data 4 | organized in a plain, dense fashion. This is the device counterpart to the 5 | `CuArray` from CuArrays.jl, and implements (part of) the array interface as well 6 | as other functionality for use _on_ the GPU: 7 | 8 | ```@docs 9 | CUDAnative.CuDeviceArray 10 | CUDAnative.ldg 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/src/lib/device/libdevice.md: -------------------------------------------------------------------------------- 1 | # libdevice 2 | 3 | CUDAnative.jl provides wrapper functions for the mathematical routines in `libdevice`, 4 | CUDA's device math library. Many of these functions offer an interface similar to 5 | their counterparts in `Base`, but it is currently impossible to transparently dispatch to 6 | these device functions. As a consequence, users should prefix calls to math functions (e.g. 7 | `sin` or `pow`) with the CUDAnative module name. 
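For illustration, a minimal kernel sketch (not part of the repository; the kernel and array names are made up) showing the module-prefixed math call described above:

```julia
using CUDAdrv, CUDAnative, CuArrays

function sin_kernel(x, y)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds y[i] = CUDAnative.sin(x[i])  # device-side sin from libdevice, not Base.sin
    return
end

x = CuArray(rand(Float32, 256))
y = similar(x)
@cuda threads=length(x) sin_kernel(x, y)
synchronize()
```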
8 | 9 | WIP 10 | -------------------------------------------------------------------------------- /test/perf/launch_overhead/build.jl: -------------------------------------------------------------------------------- 1 | using CUDAapi 2 | using CUDAdrv 3 | 4 | dev = CuDevice(0) 5 | cap = capability(dev) 6 | 7 | cd(@__DIR__) do 8 | toolkit = CUDAapi.find_toolkit() 9 | nvcc = CUDAapi.find_cuda_binary("nvcc", toolkit) 10 | toolchain = CUDAapi.find_toolchain(toolkit) 11 | flags = `-ccbin=$(toolchain.host_compiler) -arch=sm_$(cap.major)$(cap.minor)` 12 | run(`$nvcc $flags -ptx -o cuda.ptx cuda.cu`) 13 | run(`$nvcc $flags -lm -lcuda -o cuda cuda.c`) 14 | end 15 | -------------------------------------------------------------------------------- /examples/vadd.jl: -------------------------------------------------------------------------------- 1 | using CUDAdrv, CUDAnative, CuArrays 2 | 3 | using Test 4 | 5 | function vadd(a, b, c) 6 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 7 | c[i] = a[i] + b[i] 8 | return 9 | end 10 | 11 | dims = (3,4) 12 | a = round.(rand(Float32, dims) * 100) 13 | b = round.(rand(Float32, dims) * 100) 14 | 15 | d_a = CuArray(a) 16 | d_b = CuArray(b) 17 | d_c = similar(d_a) 18 | 19 | len = prod(dims) 20 | @cuda threads=len vadd(d_a, d_b, d_c) 21 | c = Array(d_c) 22 | @test a+b ≈ c 23 | -------------------------------------------------------------------------------- /examples/hello_world.jl: -------------------------------------------------------------------------------- 1 | using CUDAdrv, CUDAnative, CuArrays 2 | 3 | if Sys.iswindows() 4 | function hello_world() 5 | @cuprintf("Greetings from block %lld, thread %lld!\n", Int64(blockIdx().x), Int64(threadIdx().x)) 6 | return 7 | end 8 | else 9 | function hello_world() 10 | @cuprintf("Greetings from block %ld, thread %ld!\n", Int64(blockIdx().x), Int64(threadIdx().x)) 11 | return 12 | end 13 | end 14 | @cuda blocks=2 threads=2 hello_world() 15 | synchronize() 16 | -------------------------------------------------------------------------------- /examples/reduce/verify.jl: -------------------------------------------------------------------------------- 1 | using Test 2 | 3 | include("reduce.jl") 4 | 5 | if capability(device()) < v"3.0" 6 | @warn("this example requires a newer GPU") 7 | exit(0) 8 | end 9 | 10 | len = 10^7 11 | input = ones(Int32, len) 12 | 13 | # CPU 14 | cpu_val = reduce(+, input) 15 | 16 | # CUDAnative 17 | let 18 | gpu_input = CuArray(input) 19 | gpu_output = similar(gpu_input) 20 | gpu_reduce(+, gpu_input, gpu_output) 21 | gpu_val = Array(gpu_output)[1] 22 | @assert cpu_val == gpu_val 23 | end 24 | -------------------------------------------------------------------------------- /src/compiler.jl: -------------------------------------------------------------------------------- 1 | # JIT compilation of Julia code to PTX 2 | 3 | include(joinpath("compiler", "common.jl")) 4 | include(joinpath("compiler", "irgen.jl")) 5 | include(joinpath("compiler", "optim.jl")) 6 | include(joinpath("compiler", "validation.jl")) 7 | include(joinpath("compiler", "rtlib.jl")) 8 | include(joinpath("compiler", "mcgen.jl")) 9 | include(joinpath("compiler", "debug.jl")) 10 | include(joinpath("compiler", "driver.jl")) 11 | 12 | function __init_compiler__() 13 | # enable generation of FMA instructions to mimic behavior of nvcc 14 | LLVM.clopts("--nvptx-fma-level=1") 15 | end 16 | -------------------------------------------------------------------------------- /examples/oob.jl: 
-------------------------------------------------------------------------------- 1 | # EXCLUDE FROM TESTING 2 | # this example might fail (CUDA error, or runtime trap if bounds-checking is enabled) 3 | 4 | # Running this example under cuda-memcheck properly gives line number info, 5 | # demonstrating how we support existing CUDA tools. 6 | 7 | # TODO: make the actual error trap at run time 8 | 9 | using CUDAdrv, CUDAnative, CuArrays 10 | 11 | a = CuArray{Float32}(undef, 10) 12 | 13 | function memset(a, val) 14 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 15 | a[i] = val 16 | return 17 | end 18 | 19 | @cuda threads=11 memset(a, 0f0) 20 | synchronize() 21 | -------------------------------------------------------------------------------- /src/utils.jl: -------------------------------------------------------------------------------- 1 | # device capability handling 2 | 3 | # select the highest capability that is supported by both the toolchain and device 4 | function supported_capability(dev::CuDevice) 5 | dev_cap = capability(dev) 6 | compat_caps = filter(cap -> cap <= dev_cap, target_support) 7 | isempty(compat_caps) && 8 | error("Device capability v$dev_cap not supported by available toolchain") 9 | 10 | return maximum(compat_caps) 11 | end 12 | 13 | # return the capability of the current context's device, or a sane fall-back 14 | function current_capability() 15 | if initialized[] 16 | return supported_capability(device()) 17 | else 18 | # newer devices tend to support cleaner code (higher-level instructions, etc) 19 | # so target the most recent device as supported by this toolchain 20 | return maximum(target_support) 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /Project.toml: -------------------------------------------------------------------------------- 1 | name = "CUDAnative" 2 | uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" 3 | 4 | [deps] 5 | Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" 6 | CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" 7 | CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" 8 | InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" 9 | LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" 10 | Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" 11 | Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" 12 | Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" 13 | Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" 14 | 15 | [extras] 16 | BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" 17 | CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" 18 | SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" 19 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 20 | 21 | [targets] 22 | test = ["Test", "BenchmarkTools", "SpecialFunctions"] 23 | -------------------------------------------------------------------------------- /docs/make.jl: -------------------------------------------------------------------------------- 1 | using Documenter 2 | 3 | using Pkg 4 | if haskey(ENV, "GITLAB_CI") 5 | Pkg.add([PackageSpec(name = x; rev = "master") for x in ["CUDAdrv", "LLVM"]]) 6 | end 7 | 8 | using CUDAnative 9 | 10 | makedocs( 11 | modules = [CUDAnative], 12 | format = Documenter.HTML(prettyurls = get(ENV, "CI", nothing) == "true"), 13 | sitename = "CUDAnative.jl", 14 | pages = [ 15 | "Home" => "index.md", 16 | "Manual" => [ 17 | "man/usage.md", 18 | "man/troubleshooting.md", 19 | "man/performance.md", 20 | "man/hacking.md" 21 | ], 22 | "Library" => [ 23 | "lib/compilation.md", 24 | "lib/reflection.md", 25 | "Device Code" => [ 26 | 
"lib/device/intrinsics.md", 27 | "lib/device/array.md", 28 | "lib/device/libdevice.md" 29 | ] 30 | ] 31 | ], 32 | doctest = true 33 | ) 34 | -------------------------------------------------------------------------------- /src/device/cuda_intrinsics.jl: -------------------------------------------------------------------------------- 1 | # CUDA extensions to the C language 2 | 3 | # TODO: "CUDA C programming guide" > "C language extensions" lists mathematical functions, 4 | # without mentioning libdevice. Is this implied, by NVCC always using libdevice, 5 | # or are there some natively-supported math functions as well? 6 | 7 | # yes: https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__DOUBLE.html 8 | # see /home/tbesard/CUDA/toolkit/current/include/sm_20_intrinsics.h 9 | 10 | include(joinpath("cuda_intrinsics", "memory_shared.jl")) 11 | include(joinpath("cuda_intrinsics", "indexing.jl")) 12 | include(joinpath("cuda_intrinsics", "synchronization.jl")) 13 | include(joinpath("cuda_intrinsics", "warp_vote.jl")) 14 | include(joinpath("cuda_intrinsics", "warp_shuffle.jl")) 15 | include(joinpath("cuda_intrinsics", "output.jl")) 16 | include(joinpath("cuda_intrinsics", "assertion.jl")) 17 | include(joinpath("cuda_intrinsics", "memory_dynamic.jl")) 18 | include(joinpath("cuda_intrinsics", "misc.jl")) 19 | -------------------------------------------------------------------------------- /test/examples.jl: -------------------------------------------------------------------------------- 1 | @testset "examples" begin 2 | 3 | function find_sources(path::String, sources=String[]) 4 | if isdir(path) 5 | for entry in readdir(path) 6 | find_sources(joinpath(path, entry), sources) 7 | end 8 | elseif endswith(path, ".jl") 9 | push!(sources, path) 10 | end 11 | sources 12 | end 13 | 14 | examples_dir = joinpath(@__DIR__, "..", "examples") 15 | examples = find_sources(examples_dir) 16 | filter!(file -> readline(file) != "# EXCLUDE FROM TESTING", examples) 17 | 18 | cd(examples_dir) do 19 | examples = relpath.(examples, Ref(examples_dir)) 20 | @testset for example in examples 21 | cmd = `$(Base.julia_cmd())` 22 | if Base.JLOptions().project != C_NULL 23 | # --project isn't preserved by julia_cmd() 24 | cmd = `$cmd --project=$(unsafe_string(Base.JLOptions().project))` 25 | end 26 | cmd = `$cmd $example` 27 | @test success(pipeline(cmd, stderr=stderr)) 28 | end 29 | end 30 | 31 | end 32 | -------------------------------------------------------------------------------- /src/device/cuda_intrinsics/misc.jl: -------------------------------------------------------------------------------- 1 | export clock, nanosleep 2 | 3 | """ 4 | clock(UInt32) 5 | 6 | Returns the value of a per-multiprocessor counter that is incremented every clock cycle. 7 | """ 8 | clock(::Type{UInt32}) = ccall("llvm.nvvm.read.ptx.sreg.clock", llvmcall, UInt32, ()) 9 | 10 | """ 11 | clock(UInt64) 12 | 13 | Returns the value of a per-multiprocessor counter that is incremented every clock cycle. 14 | """ 15 | clock(::Type{UInt64}) = ccall("llvm.nvvm.read.ptx.sreg.clock64", llvmcall, UInt64, ()) 16 | 17 | 18 | """ 19 | nanosleep(t) 20 | 21 | Puts the thread to sleep for a given amount of time `t` (in nanoseconds). 22 | 23 | !!! 
note 24 | Requires CUDA >= 10.0 and sm_6.2 25 | """ 26 | nanosleep 27 | 28 | if cuda_driver_version >= v"10.0" && v"6.2" in ptx_support 29 | @inline function nanosleep(t::Unsigned) 30 | @asmcall("nanosleep.u32 \$0;", "r", true, 31 | Cvoid, Tuple{UInt32}, convert(UInt32, t)) 32 | end 33 | else 34 | @inline function nanosleep(t::Unsigned) 35 | return nothing 36 | end 37 | end 38 | -------------------------------------------------------------------------------- /docs/src/index.md: -------------------------------------------------------------------------------- 1 | # CUDAnative.jl 2 | 3 | *Support for compiling and executing native Julia kernels on CUDA hardware.* 4 | 5 | This package provides support for compiling and executing native Julia kernels on CUDA 6 | hardware. It is a work in progress, and only works on very recent versions of Julia . 7 | 8 | 9 | ## Installation 10 | 11 | Requirements: 12 | 13 | * Julia 1.0 14 | * CUDA toolkit 15 | * NVIDIA driver 16 | 17 | ``` 18 | Pkg.add("CUDAnative") 19 | using CUDAnative 20 | 21 | # optionally 22 | Pkg.test("CUDAnative") 23 | ``` 24 | 25 | The build step will discover the available CUDA and LLVM installations, and 26 | figure out which devices can be programmed using that set-up. It depends on 27 | CUDAdrv and LLVM being properly configured. 28 | 29 | Even if the build fails, CUDAnative.jl should always be loadable. This simplifies use by 30 | downstream packages, until there is proper language support for conditional modules. You can 31 | check whether the package has been built properly by inspecting the `CUDAnative.configured` 32 | global variable. 33 | -------------------------------------------------------------------------------- /src/CUDAnative.jl: -------------------------------------------------------------------------------- 1 | module CUDAnative 2 | 3 | using CUDAdrv 4 | 5 | using LLVM 6 | using LLVM.Interop 7 | 8 | using Adapt 9 | 10 | using Pkg 11 | using Libdl 12 | 13 | const ext = joinpath(@__DIR__, "..", "deps", "ext.jl") 14 | isfile(ext) || error("CUDAnative.jl has not been built, please run Pkg.build(\"CUDAnative\").") 15 | include(ext) 16 | if !configured 17 | # default (non-functional) values for critical variables, 18 | # making it possible to _load_ the package at all times. 
19 | const target_support = [v"2.0"] 20 | const cuda_driver_version = v"5.5" 21 | end 22 | 23 | include("utils.jl") 24 | 25 | # needs to be loaded _before_ the compiler infrastructure, because of generated functions 26 | include(joinpath("device", "tools.jl")) 27 | include(joinpath("device", "pointer.jl")) 28 | include(joinpath("device", "array.jl")) 29 | include(joinpath("device", "libdevice.jl")) 30 | include(joinpath("device", "cuda_intrinsics.jl")) 31 | include(joinpath("device", "runtime_intrinsics.jl")) 32 | 33 | include("compiler.jl") 34 | include("execution.jl") 35 | include("reflection.jl") 36 | 37 | include("deprecated.jl") 38 | 39 | include("init.jl") 40 | 41 | end 42 | -------------------------------------------------------------------------------- /docs/src/lib/reflection.md: -------------------------------------------------------------------------------- 1 | # Reflection 2 | 3 | Because of using a different compilation toolchain, CUDAnative.jl offers counterpart 4 | functions to the `code_` functionality from Base: 5 | 6 | ```@docs 7 | CUDAnative.code_llvm 8 | CUDAnative.code_ptx 9 | CUDAnative.code_sass 10 | ``` 11 | 12 | 13 | ## Convenience macros 14 | 15 | For ease of use, CUDAnative.jl also implements `@device_code_` macros wrapping 16 | the above reflection functionality. These macros evaluate the expression 17 | argument, while tracing compilation and finally printing or returning the code 18 | for every invoked CUDA kernel. Do note that this evaluation can have side 19 | effects, as opposed to similarly-named `@code_` macros in Base which are free of 20 | side effects. 21 | 22 | ```@docs 23 | CUDAnative.@device_code_lowered 24 | CUDAnative.@device_code_typed 25 | CUDAnative.@device_code_warntype 26 | CUDAnative.@device_code_llvm 27 | CUDAnative.@device_code_ptx 28 | CUDAnative.@device_code_sass 29 | CUDAnative.@device_code 30 | ``` 31 | 32 | ## Version and related queries 33 | 34 | ```@docs 35 | CUDAnative.version 36 | CUDAnative.maxthreads 37 | CUDAnative.registers 38 | CUDAnative.memory 39 | ``` 40 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright © 2013 Dahua Lin 4 | Copyright © 2014-2018 Tim Besard, and other contributors 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /test/base.jl: -------------------------------------------------------------------------------- 1 | @testset "base interface" begin 2 | 3 | ############################################################################################ 4 | 5 | @testset "method caching" begin 6 | 7 | import InteractiveUtils: _dump_function 8 | 9 | # #17057 fallout 10 | @noinline post17057_child(i) = sink(i) 11 | function post17057_parent(arr::Ptr{Int64}) 12 | i = post17057_child(0) 13 | unsafe_store!(arr, i, i) 14 | end 15 | 16 | # bug: default module activation segfaulted on NULL child function if cached=false 17 | params = Base.CodegenParams(cached=false) 18 | if VERSION >= v"1.1.0-DEV.762" 19 | _dump_function(post17057_parent, Tuple{Ptr{Int64}}, 20 | #=native=#false, #=wrapper=#false, #=strip=#false, 21 | #=dump_module=#true, #=syntax=#:att, #=optimize=#false, :none, 22 | params) 23 | else 24 | _dump_function(post17057_parent, Tuple{Ptr{Int64}}, 25 | #=native=#false, #=wrapper=#false, #=strip=#false, 26 | #=dump_module=#true, #=syntax=#:att, #=optimize=#false, 27 | params) 28 | end 29 | 30 | end 31 | 32 | ############################################################################################ 33 | 34 | end -------------------------------------------------------------------------------- /examples/multigpu.jl: -------------------------------------------------------------------------------- 1 | using CUDAdrv, CUDAnative, CuArrays 2 | 3 | using Test 4 | 5 | function vadd(gpu, a, b, c) 6 | i = threadIdx().x + blockDim().x * ((blockIdx().x-1) + (gpu-1) * gridDim().x) 7 | c[i] = a[i] + b[i] 8 | return 9 | end 10 | 11 | gpus = Int(length(devices())) 12 | 13 | dims = (gpus,3,4) 14 | a = round.(rand(Float32, dims) * 100) 15 | b = round.(rand(Float32, dims) * 100) 16 | 17 | # FIXME: CuArray doesn't tie in with unified memory yet 18 | buf_a = Mem.alloc(sizeof(a), true) 19 | Mem.upload!(buf_a, a) 20 | d_a = CuArray{Float32,3}(buf_a, dims) 21 | buf_b = Mem.alloc(sizeof(a), true) 22 | Mem.upload!(buf_b, b) 23 | d_b = CuArray{Float32,3}(buf_b, dims) 24 | buf_c = Mem.alloc(sizeof(a), true) 25 | d_c = CuArray{Float32,3}(buf_c, dims) 26 | 27 | len = prod(dims) 28 | blocks = gpus 29 | threads = len ÷ blocks 30 | 31 | for (gpu,dev) in enumerate(devices()) 32 | @debug "Allocating slice $gpu on device $(name(dev))" 33 | device!(dev) 34 | @cuda blocks=blocks÷gpus threads=threads vadd(gpu, d_a, d_b, d_c) 35 | end 36 | 37 | @debug "Synchronizing devices" 38 | for dev in devices() 39 | # NOTE: normally you'd use events and wait for them 40 | device!(dev) 41 | synchronize() 42 | end 43 | 44 | c = Array(d_c) 45 | @test a+b ≈ c 46 | -------------------------------------------------------------------------------- /test/perf/launch_overhead/cudanative.jl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env julia 2 | 3 | # CUDAnative.jl version 4 | 5 | using CUDAdrv, CUDAnative 6 | 7 | using Statistics 8 | using Printf 9 | 10 | function kernel_dummy(ptr) 11 | Base.pointerset(ptr, 0f0, Int(blockIdx().x), 8) 12 | return 13 | end 14 | 15 | const len = 1000 16 | const ITERATIONS = 100 17 | 18 | function benchmark(gpu_buf) 19 | @cuda threads=len kernel_dummy(Base.unsafe_convert(Ptr{Float32}, gpu_buf)) 20 | end 21 | 22 | function main() 23 | cpu_time = Vector{Float64}(undef, ITERATIONS) 24 | gpu_time = Vector{Float64}(undef, ITERATIONS) 25 | 26 | gpu_buf = Mem.alloc(len*sizeof(Float32)) 27 | for i in 
1:ITERATIONS 28 | i == ITERATIONS-4 && CUDAdrv.Profile.start() 29 | 30 | gpu_tic, gpu_toc = CuEvent(), CuEvent() 31 | 32 | cpu_tic = time_ns() 33 | record(gpu_tic) 34 | benchmark(gpu_buf) 35 | record(gpu_toc) 36 | synchronize(gpu_toc) 37 | cpu_toc = time_ns() 38 | 39 | cpu_time[i] = (cpu_toc-cpu_tic)/1000 40 | gpu_time[i] = CUDAdrv.elapsed(gpu_tic, gpu_toc)*1000000 41 | end 42 | CUDAdrv.Profile.stop() 43 | Mem.free(gpu_buf) 44 | 45 | popfirst!(cpu_time) 46 | popfirst!(gpu_time) 47 | 48 | @printf("CPU time: %.2f ± %.2f us\n", mean(cpu_time), std(cpu_time)) 49 | @printf("GPU time: %.2f ± %.2f us\n", mean(gpu_time), std(gpu_time)) 50 | end 51 | 52 | main() 53 | -------------------------------------------------------------------------------- /examples/peakflops.jl: -------------------------------------------------------------------------------- 1 | using CUDAdrv, CUDAnative, CuArrays 2 | 3 | using Test 4 | 5 | "Dummy kernel doing 100 FMAs." 6 | function kernel_100fma(a, b, c, out) 7 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 8 | @inbounds a_val = a[i] 9 | @inbounds b_val = b[i] 10 | @inbounds c_val = c[i] 11 | 12 | for j in 1:33 13 | a_val = CUDAnative.fma(a_val, b_val, c_val) 14 | b_val = CUDAnative.fma(a_val, b_val, c_val) 15 | c_val = CUDAnative.fma(a_val, b_val, c_val) 16 | end 17 | 18 | @inbounds out[i] = CUDAnative.fma(a_val, b_val, c_val) 19 | 20 | return 21 | end 22 | 23 | function peakflops(n::Integer=5000, dev::CuDevice=CuDevice(0)) 24 | ctx = CuContext(dev) 25 | 26 | dims = (n, n) 27 | a = round.(rand(Float32, dims) * 100) 28 | b = round.(rand(Float32, dims) * 100) 29 | c = round.(rand(Float32, dims) * 100) 30 | 31 | d_a = CuArray(a) 32 | d_b = CuArray(b) 33 | d_c = CuArray(c) 34 | d_out = similar(d_a) 35 | 36 | len = prod(dims) 37 | threads = min(len, 1024) 38 | blocks = len ÷ threads 39 | 40 | # warm-up 41 | @cuda kernel_100fma(d_a, d_b, d_c, d_out) 42 | synchronize(ctx) 43 | 44 | secs = CUDAdrv.@elapsed begin 45 | @cuda blocks=blocks threads=threads kernel_100fma(d_a, d_b, d_c, d_out) 46 | end 47 | flopcount = 200*len 48 | flops = flopcount / secs 49 | 50 | destroy!(ctx) 51 | return flops 52 | end 53 | 54 | println(peakflops()) 55 | -------------------------------------------------------------------------------- /test/perf/launch_overhead/cuda.jl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env julia 2 | 3 | # CUDAdrv.jl version 4 | 5 | using CUDAdrv 6 | 7 | using Statistics 8 | using Printf 9 | 10 | const len = 1000 11 | const ITERATIONS = 100 12 | 13 | # TODO: api-trace shows some attribute fetches, where do they come from? 
14 | 15 | const dev = CuDevice(0) 16 | const ctx = CuContext(dev) 17 | 18 | const mod = CuModuleFile("cuda.ptx") 19 | const fun = CuFunction(mod, "kernel_dummy") 20 | 21 | function benchmark(gpu_buf) 22 | cudacall(fun, (Ptr{Float32},), gpu_buf; threads=1) 23 | return 24 | end 25 | 26 | 27 | function main() 28 | cpu_time = Vector{Float64}(undef, ITERATIONS) 29 | gpu_time = Vector{Float64}(undef, ITERATIONS) 30 | 31 | gpu_buf = Mem.alloc(len*sizeof(Float32)) 32 | for i in 1:ITERATIONS 33 | i == ITERATIONS-4 && CUDAdrv.Profile.start() 34 | 35 | gpu_tic, gpu_toc = CuEvent(), CuEvent() 36 | 37 | cpu_tic = time_ns() 38 | record(gpu_tic) 39 | benchmark(gpu_buf) 40 | record(gpu_toc) 41 | synchronize(gpu_toc) 42 | cpu_toc = time_ns() 43 | 44 | cpu_time[i] = (cpu_toc-cpu_tic)/1000 45 | gpu_time[i] = CUDAdrv.elapsed(gpu_tic, gpu_toc)*1000000 46 | end 47 | CUDAdrv.Profile.stop() 48 | Mem.free(gpu_buf) 49 | 50 | popfirst!(cpu_time) 51 | popfirst!(gpu_time) 52 | 53 | @printf("CPU time: %.2f ± %.2f us\n", mean(cpu_time), std(cpu_time)) 54 | @printf("GPU time: %.2f ± %.2f us\n", mean(gpu_time), std(gpu_time)) 55 | 56 | destroy!(ctx) 57 | end 58 | 59 | main() 60 | -------------------------------------------------------------------------------- /src/device/cuda_intrinsics/memory_dynamic.jl: -------------------------------------------------------------------------------- 1 | # Dynamic Global Memory Allocation and Operations (B.21) 2 | 3 | export malloc 4 | 5 | @generated function malloc(sz::Csize_t) 6 | T_pint8 = LLVM.PointerType(LLVM.Int8Type(JuliaContext())) 7 | T_size = convert(LLVMType, Csize_t) 8 | T_ptr = convert(LLVMType, Ptr{Cvoid}) 9 | 10 | # create function 11 | llvm_f, _ = create_function(T_ptr, [T_size]) 12 | mod = LLVM.parent(llvm_f) 13 | 14 | # get the intrinsic 15 | # NOTE: LLVM doesn't have void*, Clang uses i8* for malloc too 16 | intr = LLVM.Function(mod, "malloc", LLVM.FunctionType(T_pint8, [T_size])) 17 | # should we attach some metadata here? 
julia.gc_alloc_obj has the following: 18 | #let attrs = function_attributes(intr) 19 | # AllocSizeNumElemsNotPresent = reinterpret(Cuint, Cint(-1)) 20 | # packed_allocsize = Int64(1) << 32 | AllocSizeNumElemsNotPresent 21 | # push!(attrs, EnumAttribute("allocsize", packed_allocsize, JuliaContext())) 22 | #end 23 | #let attrs = return_attributes(intr) 24 | # push!(attrs, EnumAttribute("noalias", 0, JuliaContext())) 25 | # push!(attrs, EnumAttribute("nonnull", 0, JuliaContext())) 26 | #end 27 | 28 | # generate IR 29 | Builder(JuliaContext()) do builder 30 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 31 | position!(builder, entry) 32 | 33 | ptr = call!(builder, intr, [parameters(llvm_f)[1]]) 34 | 35 | jlptr = ptrtoint!(builder, ptr, T_ptr) 36 | 37 | ret!(builder, jlptr) 38 | end 39 | 40 | call_function(llvm_f, Ptr{Cvoid}, Tuple{Csize_t}, :((sz,))) 41 | end 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | CUDAnative.jl 2 | ============= 3 | 4 | *Support for compiling and executing native Julia kernels on CUDA hardware.* 5 | 6 | [![][docs-latest-img]][docs-latest-url] [![][codecov-img]][codecov-url] [![][doi-img]][doi-url] 7 | 8 | [codecov-img]: https://codecov.io/gh/JuliaGPU/CUDAnative.jl/branch/master/graph/badge.svg 9 | [codecov-url]: https://codecov.io/gh/JuliaGPU/CUDAnative.jl 10 | 11 | [docs-latest-img]: https://img.shields.io/badge/docs-latest-blue.svg 12 | [docs-latest-url]: https://juliagpu.gitlab.io/CUDAnative.jl/ 13 | 14 | [doi-img]: https://zenodo.org/badge/DOI/10.1109/TPDS.2018.2872064.svg 15 | [doi-url]: https://doi.org/10.1109/TPDS.2018.2872064 16 | 17 | 18 | 19 | Installation 20 | ------------ 21 | 22 | CUDAnative is a registered package, and can be installed using the Julia package manager: 23 | 24 | ```julia 25 | Pkg.add("CUDAnative") 26 | ``` 27 | 28 | **NOTE**: the current version of this package requires Julia 1.0. Only older versions of this package, v0.6.x or older, work with Julia 0.6, and require a source-build of Julia. 29 | 30 | 31 | License 32 | ------- 33 | 34 | CUDAnative.jl is licensed under the [MIT license](LICENSE.md). 35 | 36 | If you use this package in your research, please cite the [following 37 | paper](https://ieeexplore.ieee.org/document/8471188): 38 | 39 | ``` 40 | @article{besard:2017, 41 | author = {Besard, Tim and Foket, Christophe and De Sutter, Bjorn}, 42 | title = {Effective Extensible Programming: Unleashing {Julia} on {GPUs}}, 43 | journal = {IEEE Transactions on Parallel and Distributed Systems}, 44 | year = {2018}, 45 | doi = {10.1109/TPDS.2018.2872064}, 46 | ISSN = {1045-9219}, 47 | } 48 | ``` 49 | -------------------------------------------------------------------------------- /test/device/pointer.jl: -------------------------------------------------------------------------------- 1 | @testset "pointer" begin 2 | 3 | @testset "unsafe_load & unsafe_store!" 
begin 4 | 5 | @eval struct LoadableStruct 6 | a::Int64 7 | b::UInt8 8 | end 9 | Base.one(::Type{LoadableStruct}) = LoadableStruct(1,1) 10 | Base.zero(::Type{LoadableStruct}) = LoadableStruct(0,0) 11 | 12 | @testset for T in (Int8, UInt16, Int32, UInt32, Int64, UInt64, Int128, 13 | Float32, Float64, 14 | LoadableStruct), 15 | cached in (false, true) 16 | d_a = Mem.upload(ones(T)) 17 | d_b = Mem.upload(zeros(T)) 18 | 19 | ptr_a = CUDAnative.DevicePtr{T,AS.Global}(Base.unsafe_convert(CuPtr{T}, d_a)) 20 | ptr_b = CUDAnative.DevicePtr{T,AS.Global}(Base.unsafe_convert(CuPtr{T}, d_b)) 21 | @test Mem.download(T, d_a) != Mem.download(T, d_b) 22 | 23 | let ptr_a=ptr_a, ptr_b=ptr_b #JuliaLang/julia#15276 24 | if cached && capability(dev) >= v"3.2" 25 | @on_device unsafe_store!(ptr_b, unsafe_cached_load(ptr_a)) 26 | else 27 | @on_device unsafe_store!(ptr_b, unsafe_load(ptr_a)) 28 | end 29 | end 30 | @test Mem.download(T, d_a) == Mem.download(T, d_b) 31 | end 32 | 33 | @testset "indexing" begin 34 | function kernel(src, dst) 35 | unsafe_store!(dst, CUDAnative.unsafe_cached_load(src, 4)) 36 | return 37 | end 38 | 39 | T = Complex{Int8} 40 | 41 | src = Mem.upload([T(1) T(9); T(3) T(4)]) 42 | dst = Mem.upload([0]) 43 | 44 | @cuda kernel( 45 | CUDAnative.DevicePtr{T,AS.Global}(CuPtr{T}(src.ptr)), 46 | CUDAnative.DevicePtr{T,AS.Global}(CuPtr{T}(dst.ptr)) 47 | ) 48 | 49 | @test Mem.download(T, src, 4)[4] == Mem.download(T, dst)[1] 50 | end 51 | 52 | end 53 | 54 | end 55 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | CI_IMAGE_TAG: 'cuda' 3 | 4 | include: 5 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/common.yml' 6 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/test_v1.0.yml' 7 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/test_v1.1.yml' 8 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/test_dev.yml' 9 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/test_v1.1.yml' 10 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/coverage_v1.1.yml' 11 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/documentation_v1.1.yml' 12 | 13 | test:v1.0: 14 | only: 15 | - master 16 | - staging 17 | - trying 18 | 19 | test:v1.1: 20 | only: 21 | - master 22 | - staging 23 | - trying 24 | 25 | test:dev: 26 | only: 27 | - master 28 | - staging 29 | - trying 30 | 31 | coverage: 32 | allow_failure: true 33 | only: 34 | - master 35 | - staging 36 | - trying 37 | 38 | documentation: 39 | only: 40 | - master 41 | - staging 42 | - trying 43 | 44 | pages: 45 | stage: deploy 46 | script: 47 | - mv docs/build public 48 | artifacts: 49 | paths: 50 | - public 51 | only: 52 | - master 53 | 54 | cuarrays: 55 | stage: test 56 | image: "juliagpu/julia:v1.1-cuda" 57 | script: 58 | - mkdir $JULIA_DEPOT_PATH # Pkg.jl#325 59 | - julia -e 'using Pkg; 60 | Pkg.develop(PackageSpec(path=pwd())); 61 | Pkg.build(); 62 | Pkg.add(PackageSpec(name="CuArrays", rev="master")); 63 | Pkg.test("CuArrays");' 64 | allow_failure: true 65 | only: 66 | - master 67 | - staging 68 | - trying 69 | -------------------------------------------------------------------------------- /src/device/cuda_intrinsics/warp_vote.jl: -------------------------------------------------------------------------------- 1 | # Warp Vote (B.13) 
2 | 3 | export vote_all, vote_any, vote_ballot 4 | 5 | """ 6 | vote_all(predicate::Bool) 7 | 8 | Evaluate `predicate` for all active threads of the warp and return non-zero if and only if 9 | `predicate` evaluates to non-zero for all of them. 10 | """ 11 | @inline function vote_all(pred::Bool) 12 | return @asmcall( 13 | """{ 14 | .reg .pred %p1; 15 | .reg .pred %p2; 16 | setp.ne.u32 %p1, \$1, 0; 17 | vote.all.pred %p2, %p1; 18 | selp.s32 \$0, 1, 0, %p2; 19 | }""", "=r,r", true, 20 | Int32, Tuple{Int32}, convert(Int32, pred)) != Int32(0) 21 | end 22 | 23 | """ 24 | vote_any(predicate::Bool) 25 | 26 | Evaluate `predicate` for all active threads of the warp and return non-zero if and only if 27 | `predicate` evaluates to non-zero for any of them. 28 | """ 29 | @inline function vote_any(pred::Bool) 30 | return @asmcall( 31 | """{ 32 | .reg .pred %p1; 33 | .reg .pred %p2; 34 | setp.ne.u32 %p1, \$1, 0; 35 | vote.any.pred %p2, %p1; 36 | selp.s32 \$0, 1, 0, %p2; 37 | }""", "=r,r", true, 38 | Int32, Tuple{Int32}, convert(Int32, pred)) != Int32(0) 39 | end 40 | 41 | """ 42 | vote_ballot(predicate::Bool) 43 | 44 | Evaluate `predicate` for all active threads of the warp and return an integer whose Nth bit 45 | is set if and only if `predicate` evaluates to non-zero for the Nth thread of the warp and 46 | the Nth thread is active. 47 | """ 48 | @inline function vote_ballot(pred::Bool) 49 | return @asmcall( 50 | """{ 51 | .reg .pred %p1; 52 | setp.ne.u32 %p1, \$1, 0; 53 | vote.ballot.b32 \$0, %p1; 54 | }""", "=r,r", true, 55 | UInt32, Tuple{Int32}, convert(Int32, pred)) 56 | end 57 | -------------------------------------------------------------------------------- /docs/src/man/performance.md: -------------------------------------------------------------------------------- 1 | # Performance 2 | 3 | GPU code written in CUDAnative.jl can be as fast or even outperform CUDA C compiled with 4 | `nvcc` (on the condition that the same hardware features are used). This section will 5 | describe how to do so, and what to be careful about. 6 | 7 | 8 | ## Profiling 9 | 10 | When optimizing code, it is important to know what to optimize. Luckily, the CUDA toolkit 11 | ships an excellent profiler, `nvprof`, with `nvvp` as the Eclipse-based UI. The CUDAnative 12 | compiler is fully compatible with these tools, and generates the required line number 13 | information to debug performance issues. To generate line number information, invoke Julia 14 | with the command-line option `-g1` (the default option). Using `-g2` puts the PTX JIT in 15 | debug mode, which significantly lowers performance of GPU code and currently does not 16 | improve debugging. 17 | 18 | Traces collected with these tools might be very large and sparse, because they capture the 19 | entire application including e.g. kernel compilation or initial data uploads. To avoid this, 20 | run the above profilers with the option "Start profiling at application start" disabled 21 | (`--profile-from-start off` with `nvprof`), make your application perform a warm-up 22 | iteration, and wrap subsequent iterations with `CUDAdrv.@profile`. This macro instructs any 23 | active profiler to start collecting information, resulting in much more focused traces. 24 | 25 | For true source-level profiling akin to `Base.@profile`, look at `nvvp`'s PC Sampling View 26 | (requires compute capability >= 5.2, CUDA >= 7.5). In the future, we might have a 27 | `CUDAnative.@profile` offering similar functionality, using the NVIDIA CUPTI library. 
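As a rough sketch of the warm-up-then-profile pattern described above (not part of the repository; the kernel, array size and iteration count are arbitrary assumptions), to be run under `nvprof --profile-from-start off`:

```julia
using CUDAdrv, CUDAnative, CuArrays

function dummy_kernel(a)
    @inbounds a[threadIdx().x] += 1f0
    return
end

a = CuArray(zeros(Float32, 256))

@cuda threads=256 dummy_kernel(a)  # warm-up iteration: triggers compilation
synchronize()

CUDAdrv.@profile begin             # only this region is collected by the profiler
    for _ in 1:100
        @cuda threads=256 dummy_kernel(a)
    end
    synchronize()
end
```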
28 | 29 | 30 | ## Optimizing 31 | 32 | This section is a WIP. Some things to consider: 33 | 34 | * `Float64` is expensive, but literal floats are `Float64`. Use `...f0` or cast. 35 | * Same for integers; although the performance hit is small, it increases register pressure. 36 | -------------------------------------------------------------------------------- /src/compiler/debug.jl: -------------------------------------------------------------------------------- 1 | # tools for dealing with compiler debug information 2 | 3 | # generate a pseudo-backtrace from LLVM IR instruction debug information 4 | # 5 | # this works by looking up the debug information of the instruction, and inspecting the call 6 | # sites of the containing function. if there's only one, repeat the process from that call. 7 | # finally, the debug information is converted to a Julia stack trace. 8 | function backtrace(inst::LLVM.Instruction, bt = StackTraces.StackFrame[]) 9 | name = Ref{Cstring}() 10 | filename = Ref{Cstring}() 11 | line = Ref{Cuint}() 12 | col = Ref{Cuint}() 13 | 14 | # look up the debug information from the current instruction 15 | depth = 0 16 | while LLVM.API.LLVMGetSourceLocation(LLVM.ref(inst), depth, name, filename, line, col) == 1 17 | frame = StackTraces.StackFrame(replace(unsafe_string(name[]), r";$"=>""), 18 | unsafe_string(filename[]), line[]) 19 | push!(bt, frame) 20 | depth += 1 21 | end 22 | 23 | # move up the call chain 24 | f = LLVM.parent(LLVM.parent(inst)) 25 | ## functions can be used as a *value* in eg. constant expressions, so filter those out 26 | callers = filter(val -> isa(user(val), LLVM.CallInst), collect(uses(f))) 27 | if !isempty(callers) 28 | # figure out the call sites of this instruction 29 | call_sites = unique(callers) do call 30 | # there could be multiple calls, originating from the same source location 31 | md = metadata(user(call)) 32 | if haskey(md, LLVM.MD_dbg) 33 | md[LLVM.MD_dbg] 34 | else 35 | nothing 36 | end 37 | end 38 | 39 | if length(call_sites) > 1 40 | frame = StackTraces.StackFrame("multiple call sites", "unknown", 0) 41 | push!(bt, frame) 42 | elseif length(call_sites) == 1 43 | backtrace(user(first(call_sites)), bt) 44 | end 45 | end 46 | 47 | return bt 48 | end 49 | -------------------------------------------------------------------------------- /examples/scan.jl: -------------------------------------------------------------------------------- 1 | # Work-inefficient inclusive scan 2 | # - uses shared memory to reduce 3 | # 4 | # Based on https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html 5 | 6 | using CUDAdrv, CUDAnative, CuArrays 7 | 8 | function cpu_accumulate!(op::Function, data::Matrix{T}) where {T} 9 | cols = size(data,2) 10 | for col in 1:cols 11 | accum = zero(T) 12 | rows = size(data,1) 13 | for row in 1:size(data,1) 14 | accum = op(accum, data[row,col]) 15 | data[row,col] = accum 16 | end 17 | end 18 | end 19 | 20 | function gpu_accumulate!(op::Function, data::CuDeviceMatrix{T}) where {T} 21 | col = blockIdx().x 22 | cols = gridDim().x 23 | 24 | row = threadIdx().x 25 | rows = blockDim().x 26 | 27 | if col <= cols && row <= rows 28 | shmem = @cuDynamicSharedMem(T, 2*rows) 29 | shmem[row] = data[row,col] 30 | sync_threads() 31 | 32 | # parallel reduction 33 | pin, pout = 1, 0 34 | offset = 1 35 | while offset < rows 36 | pout = 1 - pout 37 | pin = 1 - pin 38 | if row > offset 39 | shmem[pout * rows + row] = 40 | op(shmem[pin * rows + row], 41 | shmem[pin * rows + row - offset]) 42 | else 43 | shmem[pout * rows + row] = 44 | 
shmem[pin * rows + row] 45 | end 46 | sync_threads() 47 | offset *= UInt32(2) 48 | end 49 | shmem[pin * rows + row] = shmem[pout * rows + row] 50 | sync_threads() 51 | 52 | # write back results 53 | data[row,col] = shmem[row] 54 | end 55 | 56 | return 57 | end 58 | 59 | rows = 5 60 | cols = 4 61 | 62 | a = rand(Int, rows, cols) 63 | 64 | cpu_a = copy(a) 65 | cpu_accumulate!(+, cpu_a) 66 | 67 | gpu_a = CuArray(a) 68 | @cuda blocks=cols threads=rows shmem=2*rows*sizeof(eltype(a)) gpu_accumulate!(+, gpu_a) 69 | 70 | using Test 71 | 72 | @test cpu_a ≈ Array(gpu_a) 73 | 74 | 75 | # FURTHER IMPROVEMENTS: 76 | # - work efficiency 77 | # - avoid memory bank conflicts 78 | # - large array support 79 | -------------------------------------------------------------------------------- /docs/src/lib/device/intrinsics.md: -------------------------------------------------------------------------------- 1 | # Intrinsics 2 | 3 | This section lists the package's public functionality that corresponds to special CUDA 4 | functions to be used in device code. It is loosely organized according to the [C language 5 | extensions](http://docs.nvidia.com/cuda/cuda-c-programming-guide/#c-language-extensions) 6 | appendix from the CUDA C programming guide. For more information about certain intrinsics, 7 | refer to the aforementioned NVIDIA documentation. 8 | 9 | 10 | ## Indexing and Dimensions 11 | 12 | ```@docs 13 | CUDAnative.gridDim 14 | CUDAnative.blockIdx 15 | CUDAnative.blockDim 16 | CUDAnative.threadIdx 17 | CUDAnative.warpsize 18 | ``` 19 | 20 | 21 | ## Memory Types 22 | 23 | ### Shared Memory 24 | 25 | ```@docs 26 | CUDAnative.@cuStaticSharedMem 27 | CUDAnative.@cuDynamicSharedMem 28 | ``` 29 | 30 | 31 | ## Synchronization 32 | 33 | ```@docs 34 | CUDAnative.sync_threads 35 | CUDAnative.sync_warp 36 | CUDAnative.threadfence_block 37 | CUDAnative.threadfence 38 | CUDAnative.threadfence_system 39 | ``` 40 | 41 | ## Clock & Sleep 42 | 43 | ```@docs 44 | CUDAnative.clock 45 | CUDAnative.nanosleep 46 | ``` 47 | 48 | ## Warp Vote 49 | 50 | The warp vote functions allow the threads of a given warp to perform a 51 | reduction-and-broadcast operation. These functions take as input a boolean predicate from 52 | each thread in the warp and evaluate it. The results of that evaluation are combined 53 | (reduced) across the active threads of the warp in one of several ways, broadcasting a single 54 | return value to each participating thread. 55 | 56 | ```@docs 57 | CUDAnative.vote_all 58 | CUDAnative.vote_any 59 | CUDAnative.vote_ballot 60 | ``` 61 | 62 | 63 | ## Warp Shuffle 64 | 65 | ```@docs 66 | CUDAnative.shfl 67 | CUDAnative.shfl_up 68 | CUDAnative.shfl_down 69 | CUDAnative.shfl_xor 70 | ``` 71 | 72 | If using CUDA 9.0, and PTX ISA 6.0 is supported, synchronizing versions of these 73 | intrinsics are available as well: 74 | 75 | ```@docs 76 | CUDAnative.shfl_sync 77 | CUDAnative.shfl_up_sync 78 | CUDAnative.shfl_down_sync 79 | CUDAnative.shfl_xor_sync 80 | ``` 81 | 82 | 83 | ## Formatted Output 84 | 85 | ```@docs 86 | CUDAnative.@cuprintf 87 | ``` 88 | 89 | 90 | ## Assertions 91 | 92 | ```@docs 93 | CUDAnative.@cuassert 94 | ``` 95 | -------------------------------------------------------------------------------- /test/perf/launch_overhead/README.md: -------------------------------------------------------------------------------- 1 | Launch overhead measurement 2 | =========================== 3 | 4 | These tests allow measuring the overhead of launching a kernel, and comparing it to CUDA. 
5 | 6 | Use `nvvp` (the NVIDIA visual profiler) to visualize the overhead, disabling the option 7 | "Start execution with profiling enabled". 8 | 9 | For example: 10 | 11 | ``` 12 | $ nvprof --profile-from-start off ./cuda 13 | ==9929== NVPROF is profiling process 9929, command: ./cuda 14 | CPU time: 36.00us 15 | GPU time: 30.82us 16 | ==9929== Profiling application: ./cuda 17 | ==9929== Profiling result: 18 | Time(%) Time Calls Avg Min Max Name 19 | 100.00% 125.70us 5 25.139us 25.088us 25.281us kernel_dummy 20 | ``` 21 | 22 | This shows how launching a kernel takes 36us from Julia's POV, 30 us when using event 23 | counters, but even that contains some overhead because according to `nvprof` the kernel only 24 | took 25 us. 25 | 26 | Luckily, this was using CUDA, and CUDAdrv.jl doesn't perform much worse: 27 | 28 | ``` 29 | $ nvprof --profile-from-start off ./cuda.jl 30 | ==19694== NVPROF is profiling process 19694, command: julia ./cuda.jl 31 | CPU time: 36.23us 32 | GPU time: 31.62us 33 | ==19694== Profiling application: julia ./cuda.jl 34 | ==19694== Profiling result: 35 | Time(%) Time Calls Avg Min Max Name 36 | 100.00% 125.70us 5 25.139us 25.088us 25.312us kernel_dummy 37 | ``` 38 | 39 | But more importantly, CUDAnative.jl performs equally well: 40 | 41 | ``` 42 | $ nvprof --profile-from-start off ./cudanative.jl 43 | ==21135== NVPROF is profiling process 21135, command: julia ./cudanative.jl 44 | CPU time: 36.42us 45 | GPU time: 31.81us 46 | ==21135== Profiling application: julia ./cudanative.jl 47 | ==21135== Profiling result: 48 | Time(%) Time Calls Avg Min Max Name 49 | 100.00% 123.78us 5 24.755us 24.704us 24.928us julia_kernel_dummy_60488 50 | ``` 51 | 52 | Note that these are simple kernels, with more complex kernels Julia's heuristics start 53 | fighting us (eg. when dealing with long argument lists, inference performs worse and 54 | sometimes refuses to expand our generated functions). 55 | 56 | Also, when dealing with more arguments there's an overhead caused by CUDA copying over 57 | arguments, and cannot be avoided. For use of hardware counters, see the CUPTI library. 58 | -------------------------------------------------------------------------------- /test/runtests.jl: -------------------------------------------------------------------------------- 1 | using Test 2 | 3 | # development often happens in lockstep with other packages, 4 | # so check-out the master branch of those packages. 
5 | using Pkg 6 | if haskey(ENV, "GITLAB_CI") 7 | Pkg.add([PackageSpec(name = x; rev = "master") 8 | for x in ["CUDAdrv", "LLVM", "CuArrays"]]) 9 | end 10 | 11 | using CUDAnative, CUDAdrv 12 | import LLVM 13 | 14 | include("util.jl") 15 | 16 | @testset "CUDAnative" begin 17 | 18 | include("base.jl") 19 | include("pointer.jl") 20 | include("codegen.jl") 21 | 22 | if CUDAnative.configured 23 | @test length(devices()) > 0 24 | if length(devices()) > 0 25 | # the API shouldn't have been initialized 26 | @test CuCurrentContext() == nothing 27 | 28 | device_callbacked = nothing 29 | device_callback = (dev, ctx) -> begin 30 | device_callbacked = dev 31 | end 32 | push!(CUDAnative.device!_listeners, device_callback) 33 | 34 | # now cause initialization 35 | Mem.alloc(1) 36 | @test CuCurrentContext() != nothing 37 | @test device(CuCurrentContext()) == CuDevice(0) 38 | @test device_callbacked == CuDevice(0) 39 | 40 | device!(CuDevice(0)) 41 | device!(CuDevice(0)) do 42 | nothing 43 | end 44 | 45 | # test the device selection functionality 46 | if length(devices()) > 1 47 | device!(1) do 48 | @test device(CuCurrentContext()) == CuDevice(1) 49 | end 50 | @test device(CuCurrentContext()) == CuDevice(0) 51 | 52 | device!(1) 53 | @test device(CuCurrentContext()) == CuDevice(1) 54 | end 55 | 56 | # pick most recent device (based on compute capability) 57 | global dev = last(sort(collect(devices()); by=capability)) 58 | @info("Testing using device $(name(dev))") 59 | device!(dev) 60 | 61 | if capability(dev) < v"2.0" 62 | @warn("native execution not supported on SM < 2.0") 63 | else 64 | include("device/codegen.jl") 65 | include("device/execution.jl") 66 | include("device/pointer.jl") 67 | include("device/array.jl") 68 | include("device/intrinsics.jl") 69 | 70 | #include("examples.jl") 71 | end 72 | end 73 | else 74 | @warn("CUDAnative.jl has not been configured; skipping on-device tests.") 75 | end 76 | 77 | end 78 | -------------------------------------------------------------------------------- /test/pointer.jl: -------------------------------------------------------------------------------- 1 | @testset "pointer" begin 2 | 3 | # inner constructors 4 | 5 | voidptr_a = CuPtr{Cvoid}(Int(0xDEADBEEF)) 6 | generic_voidptr_a = CUDAnative.DevicePtr{Cvoid,AS.Generic}(voidptr_a) 7 | global_voidptr_a = CUDAnative.DevicePtr{Cvoid,AS.Global}(voidptr_a) 8 | local_voidptr_a = CUDAnative.DevicePtr{Cvoid,AS.Local}(voidptr_a) 9 | 10 | voidptr_b = CuPtr{Cvoid}(Int(0xCAFEBABE)) 11 | generic_voidptr_b = CUDAnative.DevicePtr{Cvoid,AS.Generic}(voidptr_b) 12 | global_voidptr_b = CUDAnative.DevicePtr{Cvoid,AS.Global}(voidptr_b) 13 | local_voidptr_b = CUDAnative.DevicePtr{Cvoid,AS.Local}(voidptr_b) 14 | 15 | intptr_b = convert(CuPtr{Int}, voidptr_b) 16 | generic_intptr_b = CUDAnative.DevicePtr{Int,AS.Generic}(intptr_b) 17 | global_intptr_b = CUDAnative.DevicePtr{Int,AS.Global}(intptr_b) 18 | local_intptr_b = CUDAnative.DevicePtr{Int,AS.Local}(intptr_b) 19 | 20 | # outer constructors 21 | @test CUDAnative.DevicePtr{Cvoid}(voidptr_a) == generic_voidptr_a 22 | @test CUDAnative.DevicePtr(voidptr_a) == generic_voidptr_a 23 | 24 | # getters 25 | @test eltype(generic_voidptr_a) == Cvoid 26 | @test eltype(global_intptr_b) == Int 27 | @test addrspace(generic_voidptr_a) == AS.Generic 28 | @test addrspace(global_voidptr_a) == AS.Global 29 | @test addrspace(local_voidptr_a) == AS.Local 30 | 31 | # comparisons 32 | @test generic_voidptr_a != global_voidptr_a 33 | @test generic_voidptr_a != generic_intptr_b 34 | 35 | 36 | @testset 
"conversions" begin 37 | 38 | # between host and device pointers 39 | 40 | @test convert(CuPtr{Cvoid}, generic_voidptr_a) == voidptr_a 41 | @test convert(CUDAnative.DevicePtr{Cvoid}, voidptr_a) == generic_voidptr_a 42 | @test convert(CUDAnative.DevicePtr{Cvoid,AS.Global}, voidptr_a) == global_voidptr_a 43 | 44 | 45 | # between device pointers 46 | 47 | @test_throws ArgumentError convert(typeof(local_voidptr_a), global_voidptr_a) 48 | @test convert(typeof(generic_voidptr_a), generic_voidptr_a) == generic_voidptr_a 49 | @test convert(typeof(global_voidptr_a), global_voidptr_a) == global_voidptr_a 50 | @test Base.unsafe_convert(typeof(local_voidptr_a), global_voidptr_a) == local_voidptr_a 51 | 52 | @test convert(typeof(global_voidptr_a), global_intptr_b) == global_voidptr_b 53 | @test convert(typeof(generic_voidptr_a), global_intptr_b) == generic_voidptr_b 54 | @test convert(typeof(global_voidptr_a), generic_intptr_b) == global_voidptr_b 55 | 56 | @test convert(CUDAnative.DevicePtr{Cvoid}, global_intptr_b) == global_voidptr_b 57 | 58 | end 59 | 60 | end 61 | -------------------------------------------------------------------------------- /test/perf/launch_overhead/cuda.c: -------------------------------------------------------------------------------- 1 | // C version 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | #define check(err) __check(err, __FILE__, __LINE__) 11 | void __check(CUresult err, const char *file, const int line) { 12 | if (CUDA_SUCCESS != err) { 13 | const char *msg; 14 | cuGetErrorName(err, &msg); 15 | fprintf(stderr, "CUDA error: %s (%04d) at %s:%i.\n", msg, err, file, line); 16 | exit(-1); 17 | } 18 | } 19 | 20 | const size_t len = 1000; 21 | const size_t ITERATIONS = 100; 22 | 23 | int main(int argc, char **argv) { 24 | check(cuInit(0x0)); 25 | 26 | CUdevice dev; 27 | check(cuDeviceGet(&dev, 0)); 28 | 29 | CUcontext ctx; 30 | check(cuCtxCreate(&ctx, 0, dev)); 31 | 32 | CUmodule mod; 33 | check(cuModuleLoad(&mod, "cuda.ptx")); 34 | 35 | CUfunction fun; 36 | check(cuModuleGetFunction(&fun, mod, "kernel_dummy")); 37 | 38 | CUdeviceptr gpu_arr; 39 | check(cuMemAlloc(&gpu_arr, sizeof(float) * len)); 40 | 41 | float cpu_time[ITERATIONS]; 42 | float gpu_time[ITERATIONS]; 43 | 44 | for (int i = 0; i < ITERATIONS; i++) { 45 | if (i == ITERATIONS - 5) 46 | check(cuProfilerStart()); 47 | 48 | struct timespec cpu_t0, cpu_t1; 49 | clock_gettime(CLOCK_MONOTONIC, &cpu_t0); 50 | 51 | CUevent gpu_t0, gpu_t1; 52 | check(cuEventCreate(&gpu_t0, 0x0)); 53 | check(cuEventCreate(&gpu_t1, 0x0)); 54 | 55 | check(cuEventRecord(gpu_t0, NULL)); 56 | 57 | void *args[3] = {&gpu_arr}; 58 | check(cuLaunchKernel(fun, len, 1, 1, 1, 1, 1, 0, 0, args, 0)); 59 | 60 | check(cuEventRecord(gpu_t1, NULL)); 61 | check(cuEventSynchronize(gpu_t1)); 62 | 63 | clock_gettime(CLOCK_MONOTONIC, &cpu_t1); 64 | 65 | check(cuEventElapsedTime(&gpu_time[i], gpu_t0, gpu_t1)); 66 | gpu_time[i] *= 1000; 67 | 68 | cpu_time[i] = (cpu_t1.tv_sec - cpu_t0.tv_sec) + 69 | (cpu_t1.tv_nsec - cpu_t0.tv_nsec) / 1000.; 70 | } 71 | check(cuProfilerStop()); 72 | 73 | double mean_cpu = 0; 74 | double mean_gpu = 0; 75 | int i; 76 | for (i = 1; i < ITERATIONS ; ++i) { 77 | mean_cpu += cpu_time[i]; 78 | mean_gpu += gpu_time[i]; 79 | } 80 | mean_cpu /= (ITERATIONS-1); 81 | mean_gpu /= (ITERATIONS-1); 82 | 83 | double std_cpu = 0; 84 | double std_gpu = 0; 85 | for (i = 1; i < ITERATIONS ; ++i ) { 86 | std_cpu += pow((cpu_time[i] - mean_cpu), 2); 87 | std_gpu += pow((gpu_time[i] - mean_gpu), 2); 88 | } 89 | 
std_cpu = sqrt(std_cpu / (ITERATIONS-1)); 90 | std_gpu = sqrt(std_gpu / (ITERATIONS-1)); 91 | 92 | printf("CPU time: %.2f +/- %.2f us\n", mean_cpu, std_cpu); 93 | printf("GPU time: %.2f +/- %.2f us\n", mean_gpu, std_gpu); 94 | 95 | return 0; 96 | } 97 | -------------------------------------------------------------------------------- /examples/reduce/benchmark.jl: -------------------------------------------------------------------------------- 1 | # EXCLUDE FROM TESTING 2 | 3 | using BenchmarkTools 4 | BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10 5 | BenchmarkTools.DEFAULT_PARAMETERS.gcsample = true 6 | 7 | include("reduce.jl") 8 | 9 | CUDAnative.initialize() 10 | const dev = device() 11 | const cap = capability(dev) 12 | @assert(cap >= v"3.0", "this example requires a newer GPU") 13 | 14 | len = 10^7 15 | input = ones(Int32, len) 16 | 17 | 18 | ## CPU 19 | 20 | benchmark_cpu = @benchmarkable begin 21 | reduce(+, input) 22 | end 23 | 24 | @show run(benchmark_cpu) 25 | 26 | 27 | 28 | ## CUDAnative 29 | 30 | # PTX generation 31 | open(joinpath(@__DIR__, "reduce.jl.ptx"), "w") do f 32 | CUDAnative.code_ptx(f, reduce_grid, Tuple{typeof(+), CuDeviceVector{Int32,AS.Global}, 33 | CuDeviceVector{Int32,AS.Global}, Int32}; 34 | cap=v"6.1.0") 35 | end 36 | 37 | benchmark_gpu = @benchmarkable begin 38 | gpu_reduce(+, gpu_input, gpu_output) 39 | val = Array(gpu_output)[1] 40 | end setup=( 41 | val = nothing; 42 | gpu_input = CuArray($input); 43 | gpu_output = similar(gpu_input) 44 | ) teardown=( 45 | gpu_input = nothing; 46 | gpu_output = nothing 47 | ) 48 | 49 | @show run(benchmark_gpu) 50 | 51 | 52 | ## CUDA 53 | 54 | using CUDAapi 55 | using Libdl 56 | 57 | cd(@__DIR__) do 58 | toolkit = CUDAapi.find_toolkit() 59 | nvcc = CUDAapi.find_cuda_binary("nvcc", toolkit) 60 | toolchain = CUDAapi.find_toolchain(toolkit) 61 | flags = `-ccbin=$(toolchain.host_compiler) -arch=sm_$(cap.major)$(cap.minor)` 62 | run(`$nvcc $flags -ptx -o reduce.cu.ptx reduce.cu`) 63 | run(`$nvcc $flags -shared --compiler-options '-fPIC' -o reduce.so reduce.cu`) 64 | end 65 | 66 | # Entry-point wrappers 67 | lib = Libdl.dlopen(joinpath(@__DIR__, "reduce.so")) 68 | setup_cuda(input) = ccall(Libdl.dlsym(lib, "setup"), Ptr{Cvoid}, 69 | (Ptr{Cint}, Csize_t), input, length(input)) 70 | run_cuda(state) = ccall(Libdl.dlsym(lib, "run"), Cint, 71 | (Ptr{Cvoid},), state) 72 | teardown_cuda(state) = ccall(Libdl.dlsym(lib, "teardown"), Cvoid, 73 | (Ptr{Cvoid},), state) 74 | 75 | # Correctness check (not part of verify.jl which is meant to run during testing) 76 | using Test 77 | let 78 | cuda_state = setup_cuda(input) 79 | cuda_val = run_cuda(cuda_state) 80 | teardown_cuda(cuda_state) 81 | @assert cuda_val == reduce(+, input) 82 | end 83 | 84 | benchmark_cuda = @benchmarkable begin 85 | val = run_cuda(state) 86 | end setup=( 87 | val = nothing; 88 | state = setup_cuda($input); 89 | ) teardown=( 90 | teardown_cuda(state) 91 | ) 92 | 93 | @show run(benchmark_cuda) 94 | -------------------------------------------------------------------------------- /docs/src/man/troubleshooting.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | 3 | To increase logging verbosity of the CUDAnative compiler, launch Julia with the 4 | `JULIA_DEBUG` environment variable set to `CUDAnative`. 5 | 6 | 7 | ## LLVM IR generated for ... is not GPU compatible 8 | 9 | Not all of Julia is supported by CUDAnative. 
Several commonly-used features, 10 | like strings or exceptions, will not compile to GPU code, because of their 11 | interactions with the CPU-only runtime library. 12 | 13 | When not using GPU-incompatible language features, you might still run into this 14 | compiler error when your code contains type instabilities or other dynamic 15 | behavior. These are often easily spotted by prefixing the failing function call 16 | with one of several `@device_code` macros. 17 | 18 | For example, say we define and execute the following kernel: 19 | 20 | ```julia 21 | julia> kernel(a) = @inbounds a[threadId().x] = 0 22 | kernel (generic function with 1 method) 23 | 24 | julia> @cuda kernel(CuArray([1])) 25 | ERROR: LLVM IR generated for Kernel(CuDeviceArray{Int64,1,CUDAnative.AS.Global}) is not GPU compatible 26 | ``` 27 | 28 | When running with `JULIA_DEBUG=CUDAnative`, you will get to see the actual 29 | incompatible IR constructs. Prefixing our kernel invocation with 30 | `@device_code_warntype` reveals our issue: 31 | 32 | ```julia 33 | julia> @device_code_warntype @cuda kernel(CuArray([1])) 34 | Variables: 35 | a::CuDeviceArray{Int64,1,CUDAnative.AS.Global} 36 | val 37 | 38 | Body: 39 | begin 40 | Core.SSAValue(1) = (Main.threadId)()::ANY 41 | Core.SSAValue(2) = (Base.getproperty)(Core.SSAValue(1), :x)::ANY 42 | (Base.setindex!)(a::CuDeviceArray{Int64,1,CUDAnative.AS.Global}, 0, Core.SSAValue(2))::ANY 43 | return 0 44 | end::Int64 45 | ERROR: LLVM IR generated for Kernel(CuDeviceArray{Int64,1,CUDAnative.AS.Global}) is not GPU compatible 46 | ``` 47 | 48 | Because of a typo, the call to `threadId` is untyped and returns `Any` (it 49 | should have been `threadIdx`). In the future, we expect to be able to catch such 50 | errors automatically. 51 | 52 | If you want to dump all forms of generated code to disk, for further inspection, 53 | have a look at the `@device_code` macro instead. 54 | 55 | 56 | ## Debug info and line-number information 57 | 58 | LLVM's NVPTX back-end does not support the undocumented PTX debug format, so we cannot 59 | generate the necessary DWARF sections. This means that debugging generated code with e.g. 60 | `cuda-gdb` will be an unpleasant experience. Nonetheless, the PTX JIT is configured to emit 61 | debug info (which corresponds with `nvcc -G`) when the Julia debug info level is 2 or 62 | higher (`julia -g2`). 63 | 64 | We do however support emitting line number information, which is useful for other CUDA tools 65 | like `cuda-memcheck`. The functionality (which corresponds with `nvcc -lineinfo`) is enabled 66 | when the Julia debug info level is 1 (the default value). It can be disabled by passing `-g0` 67 | instead. 
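Returning to the `threadId` example from above: with the typo fixed (and an explicit `nothing` return value, as kernel functions are expected to return `nothing`), the invocation compiles cleanly. The following is a minimal sketch that assumes CuArrays.jl is loaded to provide the `CuArray` constructor:

```julia
using CUDAnative, CuArrays

# the corrected kernel: `threadIdx` instead of the mistyped `threadId`
function kernel(a)
    @inbounds a[threadIdx().x] = 0
    return nothing
end

@cuda kernel(CuArray([1]))
```

Re-running the invocation under `@device_code_warntype` should now show concrete types instead of `ANY`-typed calls.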
68 | -------------------------------------------------------------------------------- /res/parse_libdevice.jl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env julia 2 | 3 | # Script to parse and compare the libdevice PDF manual against our list of intrinsics 4 | 5 | function parse_intrinsics(cb) 6 | fn = joinpath(@__DIR__, "..", "src", "device", "libdevice.jl") 7 | open(fn) do f 8 | for ln in eachline(f) 9 | m = match(r"@wrap ([\w.]+\(.+?\)::\w+)", ln) 10 | if m != nothing 11 | cb(replace(m.captures[1], r"\w+::", "::")) 12 | end 13 | end 14 | end 15 | end 16 | 17 | function parse_libdevice(fn, cb) 18 | open(fn) do f 19 | next_proto = false 20 | number = 0 21 | 22 | for ln in eachline(f) 23 | if (m = match(r"^\d\.(\d+)\..", ln); m != nothing) 24 | number = parse(Int, m.captures[1]) 25 | elseif occursin(r"^Prototype:", ln) 26 | next_proto = true 27 | elseif next_proto 28 | cb(chomp(ln), number) 29 | next_proto = false 30 | end 31 | end 32 | end 33 | end 34 | 35 | function main(args) 36 | if length(args) != 1 37 | println("Usage: $(basename(@__FILE__)) LIBDEVICE_PDF") 38 | exit(1) 39 | end 40 | pdf = args[1] 41 | isfile(pdf) || error("input PDF does not exist") 42 | 43 | wrapped = Set{String}() 44 | parse_intrinsics(intr -> push!(wrapped, intr)) 45 | 46 | intrinsics = Set{String}() 47 | numbering = Dict{String,Number}() 48 | txt = tempname() 49 | run(`pdftotext $pdf $txt`) 50 | parse_libdevice(txt, (proto, number) -> begin 51 | m = match(r"^(\w+) (@[\w.]+)\((.*?)\)", proto) 52 | if m != nothing 53 | rettype = m.captures[1] 54 | fn = m.captures[2] 55 | arglist = m.captures[3] 56 | 57 | argpairs = split(arglist, ", ") 58 | argtypes, args = zip(map(argpair -> split(argpair, " "), argpairs)...) 59 | 60 | wrap_fn = strip(fn, '@') 61 | wrap_argtypes = map(argtyp -> endswith(argtyp, '*') ? "Ptr{$(argtyp[1:end-1])}" 62 | : argtyp, argtypes) 63 | wrap_args = map(arg -> strip(arg, '%'), args) 64 | wrap_arglist = join(["$arg::$argtyp" for (arg, argtyp) in zip(wrap_args, wrap_argtypes)], ", ") 65 | 66 | intr = "$wrap_fn($wrap_arglist)::$rettype" 67 | push!(intrinsics, intr) 68 | numbering[intr] = number 69 | end 70 | end) 71 | rm(txt) 72 | 73 | missing = setdiff(intrinsics, wrapped) 74 | superfluous = setdiff(wrapped, intrinsics) 75 | 76 | println("Missing intrinsics:") 77 | for intr in sort(collect(missing), lt=(a,b)->numbering[a] When a function with a variable-length argument list is called, the variable 7 | # > arguments are passed using C's old ``default argument promotions.'' These say that 8 | # > types char and short int are automatically promoted to int, and type float is 9 | # > automatically promoted to double. Therefore, varargs functions will never receive 10 | # > arguments of type char, short int, or float. 11 | 12 | if arg == Cchar || arg == Cshort 13 | return :(Cint(arg)) 14 | elseif arg == Cfloat 15 | return :(Cdouble(arg)) 16 | else 17 | return :(arg) 18 | end 19 | end 20 | 21 | """ 22 | Print a formatted string in device context on the host standard output: 23 | 24 | @cuprintf("%Fmt", args...) 25 | 26 | Note that this is not a fully C-compliant `printf` implementation; see the CUDA 27 | documentation for supported options and inputs. 28 | 29 | Also beware that it is an untyped, and unforgiving `printf` implementation. Type widths need 30 | to match, eg. printing a 64-bit Julia integer requires the `%ld` formatting string. 31 | """ 32 | macro cuprintf(fmt::String, args...) 
33 | fmt_val = Val(Symbol(fmt)) 34 | 35 | return :(_cuprintf($fmt_val, $(map(arg -> :(promote_c_argument($arg)), esc.(args))...))) 36 | end 37 | 38 | @generated function _cuprintf(::Val{fmt}, argspec...) where {fmt} 39 | arg_exprs = [:( argspec[$i] ) for i in 1:length(argspec)] 40 | arg_types = [argspec...] 41 | 42 | T_void = LLVM.VoidType(JuliaContext()) 43 | T_int32 = LLVM.Int32Type(JuliaContext()) 44 | T_pint8 = LLVM.PointerType(LLVM.Int8Type(JuliaContext())) 45 | 46 | # create functions 47 | param_types = LLVMType[convert.(LLVMType, arg_types)...] 48 | llvm_f, _ = create_function(T_int32, param_types) 49 | mod = LLVM.parent(llvm_f) 50 | 51 | # generate IR 52 | Builder(JuliaContext()) do builder 53 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 54 | position!(builder, entry) 55 | 56 | str = globalstring_ptr!(builder, String(fmt)) 57 | 58 | # construct and fill args buffer 59 | if isempty(argspec) 60 | buffer = LLVM.PointerNull(T_pint8) 61 | else 62 | argtypes = LLVM.StructType("printf_args", JuliaContext()) 63 | elements!(argtypes, param_types) 64 | 65 | args = alloca!(builder, argtypes) 66 | for (i, param) in enumerate(parameters(llvm_f)) 67 | p = struct_gep!(builder, args, i-1) 68 | store!(builder, param, p) 69 | end 70 | 71 | buffer = bitcast!(builder, args, T_pint8) 72 | end 73 | 74 | # invoke vprintf and return 75 | vprintf_typ = LLVM.FunctionType(T_int32, [T_pint8, T_pint8]) 76 | vprintf = LLVM.Function(mod, "vprintf", vprintf_typ) 77 | chars = call!(builder, vprintf, [str, buffer]) 78 | 79 | ret!(builder, chars) 80 | end 81 | 82 | arg_tuple = Expr(:tuple, arg_exprs...) 83 | call_function(llvm_f, Int32, Tuple{arg_types...}, arg_tuple) 84 | end 85 | -------------------------------------------------------------------------------- /src/compiler/driver.jl: -------------------------------------------------------------------------------- 1 | # compiler driver and main interface 2 | 3 | # (::CompilerContext) 4 | const compile_hook = Ref{Union{Nothing,Function}}(nothing) 5 | 6 | """ 7 | compile(dev::CuDevice, f, tt; kwargs...) 8 | 9 | Compile a function `f` invoked with types `tt` for device `dev`, returning the compiled 10 | function module respectively of type `CuFuction` and `CuModule`. 11 | 12 | For a list of supported keyword arguments, refer to the documentation of 13 | [`cufunction`](@ref). 14 | """ 15 | function compile(dev::CuDevice, @nospecialize(f::Core.Function), @nospecialize(tt); kwargs...) 16 | CUDAnative.configured || error("CUDAnative.jl has not been configured; cannot JIT code.") 17 | 18 | module_asm, module_entry = compile(supported_capability(dev), f, tt; kwargs...) 19 | 20 | # enable debug options based on Julia's debug setting 21 | jit_options = Dict{CUDAdrv.CUjit_option,Any}() 22 | if Base.JLOptions().debug_level == 1 23 | jit_options[CUDAdrv.GENERATE_LINE_INFO] = true 24 | elseif Base.JLOptions().debug_level >= 2 25 | jit_options[CUDAdrv.GENERATE_DEBUG_INFO] = true 26 | end 27 | cuda_mod = CuModule(module_asm, jit_options) 28 | cuda_fun = CuFunction(cuda_mod, module_entry) 29 | 30 | return cuda_fun, cuda_mod 31 | end 32 | 33 | # same as above, but without an active device 34 | function compile(cap::VersionNumber, @nospecialize(f), @nospecialize(tt); 35 | kernel=true, kwargs...) 36 | ctx = CompilerContext(f, tt, cap, kernel; kwargs...) 
37 | 38 | return compile(ctx) 39 | end 40 | 41 | function compile(ctx::CompilerContext) 42 | if compile_hook[] != nothing 43 | hook = compile_hook[] 44 | compile_hook[] = nothing 45 | 46 | global globalUnique 47 | previous_globalUnique = globalUnique 48 | 49 | hook(ctx) 50 | 51 | globalUnique = previous_globalUnique 52 | compile_hook[] = hook 53 | end 54 | 55 | 56 | ## high-level code generation (Julia AST) 57 | 58 | @debug "(Re)compiling function" ctx 59 | 60 | check_method(ctx) 61 | 62 | 63 | ## low-level code generation (LLVM IR) 64 | 65 | mod, entry = irgen(ctx) 66 | 67 | need_library(lib) = any(f -> isdeclaration(f) && 68 | intrinsic_id(f) == 0 && 69 | haskey(functions(lib), LLVM.name(f)), 70 | functions(mod)) 71 | 72 | libdevice = load_libdevice(ctx.cap) 73 | if need_library(libdevice) 74 | link_libdevice!(ctx, mod, libdevice) 75 | end 76 | 77 | # optimize the IR 78 | entry = optimize!(ctx, mod, entry) 79 | 80 | runtime = load_runtime(ctx.cap) 81 | if need_library(runtime) 82 | link_library!(ctx, mod, runtime) 83 | end 84 | 85 | prepare_execution!(ctx, mod) 86 | 87 | check_invocation(ctx, entry) 88 | 89 | # check generated IR 90 | check_ir(ctx, mod) 91 | verify(mod) 92 | 93 | 94 | ## machine code generation (PTX assembly) 95 | 96 | module_asm = mcgen(ctx, mod, entry) 97 | 98 | return module_asm, LLVM.name(entry) 99 | end 100 | -------------------------------------------------------------------------------- /src/device/cuda_intrinsics/synchronization.jl: -------------------------------------------------------------------------------- 1 | # Synchronization (B.6) 2 | 3 | export sync_threads, sync_warp 4 | export threadfence, threadfence_block, threadfence_system 5 | 6 | """ 7 | sync_threads() 8 | 9 | Waits until all threads in the thread block have reached this point and all global and 10 | shared memory accesses made by these threads prior to `sync_threads()` are visible to all 11 | threads in the block. 12 | """ 13 | @inline sync_threads() = ccall("llvm.nvvm.barrier0", llvmcall, Cvoid, ()) 14 | 15 | """ 16 | sync_warp(mask::Integer=0xffffffff) 17 | 18 | Waits until the threads in the warp, selected by means of the bitmask `mask`, have reached this point 19 | and all global and shared memory accesses made by these threads prior to `sync_warp()` are 20 | visible to those threads in the warp. The default value for `mask` selects all threads in 21 | the warp. 22 | 23 | !!! note 24 | Requires CUDA >= 9.0 and sm_6.2 25 | """ 26 | sync_warp 27 | 28 | if cuda_driver_version >= v"9.0" && v"6.0" in ptx_support 29 | @inline function sync_warp(mask::Integer=0xffffffff) 30 | @asmcall("bar.warp.sync \$0;", "r", true, 31 | Cvoid, Tuple{UInt32}, convert(UInt32, mask)) 32 | end 33 | else 34 | @inline sync_warp(mask::Integer=0xffffffff) = nothing 35 | end 36 | 37 | """ 38 | threadfence_block() 39 | 40 | A memory fence that ensures that: 41 | - All writes to all memory made by the calling thread before the call to `threadfence_block()` 42 | are observed by all threads in the block of the calling thread as occurring before all writes 43 | to all memory made by the calling thread after the call to `threadfence_block()` 44 | - All reads from all memory made by the calling thread before the call to `threadfence_block()` 45 | are ordered before all reads from all memory made by the calling thread after the call to `threadfence_block()`.
46 | """ 47 | @inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ()) 48 | 49 | """ 50 | threadfence() 51 | 52 | A memory fence that acts as [`threadfence_block`](@ref) for all threads in the block of the 53 | calling thread and also ensures that no writes to all memory made by the calling thread after 54 | the call to `threadfence()` are observed by any thread in the device as occurring before any 55 | write to all memory made by the calling thread before the call to `threadfence()`. 56 | 57 | Note that for this ordering guarantee to be true, the observing threads must truly observe the 58 | memory and not cached versions of it; this is requires the use of volatile loads and stores, 59 | which is not available from Julia right now. 60 | """ 61 | @inline threadfence() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ()) 62 | 63 | """ 64 | threadfence_system() 65 | 66 | A memory fence that acts as [`threadfence_block`](@ref) for all threads in the block of the 67 | calling thread and also ensures that all writes to all memory made by the calling thread 68 | before the call to `threadfence_system()` are observed by all threads in the device, 69 | host threads, and all threads in peer devices as occurring before all writes to all 70 | memory made by the calling thread after the call to `threadfence_system()`. 71 | """ 72 | @inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ()) 73 | 74 | -------------------------------------------------------------------------------- /src/compiler/mcgen.jl: -------------------------------------------------------------------------------- 1 | # machine code generation 2 | 3 | function machine(cap::VersionNumber, triple::String) 4 | InitializeNVPTXTarget() 5 | InitializeNVPTXTargetInfo() 6 | t = Target(triple) 7 | 8 | InitializeNVPTXTargetMC() 9 | cpu = "sm_$(cap.major)$(cap.minor)" 10 | if cuda_driver_version >= v"9.0" && v"6.0" in ptx_support 11 | # in the case of CUDA 9, we use sync intrinsics from PTX ISA 6.0+ 12 | feat = "+ptx60" 13 | else 14 | feat = "" 15 | end 16 | tm = TargetMachine(t, triple, cpu, feat) 17 | asm_verbosity!(tm, true) 18 | 19 | return tm 20 | end 21 | 22 | # final preparations for the module to be compiled to PTX 23 | # these passes should not be run when e.g. compiling to write to disk. 24 | function prepare_execution!(ctx::CompilerContext, mod::LLVM.Module) 25 | let pm = ModulePassManager() 26 | global global_ctx 27 | global_ctx = ctx 28 | 29 | global_optimizer!(pm) 30 | 31 | add!(pm, ModulePass("ResolveCPUReferences", resolve_cpu_references!)) 32 | 33 | global_dce!(pm) 34 | strip_dead_prototypes!(pm) 35 | 36 | run!(pm, mod) 37 | dispose(pm) 38 | end 39 | 40 | return 41 | end 42 | 43 | # some Julia code contains references to objects in the CPU run-time, 44 | # without actually using the contents or functionality of those objects. 45 | # 46 | # prime example are type tags, which reference the address of the allocated type. 47 | # since those references are ephemeral, we can't eagerly resolve and emit them in the IR, 48 | # but at the same time the GPU can't resolve them at run-time. 49 | # 50 | # this pass performs that resolution at link time. 
51 | function resolve_cpu_references!(mod::LLVM.Module) 52 | ctx = global_ctx::CompilerContext 53 | changed = false 54 | 55 | for f in functions(mod) 56 | fn = LLVM.name(f) 57 | if isdeclaration(f) && intrinsic_id(f) == 0 && startswith(fn, "jl_") 58 | # eagerly resolve the address of the binding 59 | address = ccall(:jl_cglobal, Any, (Any, Any), fn, UInt) 60 | dereferenced = unsafe_load(address) 61 | dereferenced = LLVM.ConstantInt(dereferenced, JuliaContext()) 62 | 63 | function replace_bindings!(value) 64 | changed = false 65 | for use in uses(value) 66 | val = user(use) 67 | if isa(val, LLVM.ConstantExpr) 68 | # recurse 69 | changed |= replace_bindings!(val) 70 | elseif isa(val, LLVM.LoadInst) 71 | # resolve 72 | replace_uses!(val, dereferenced) 73 | unsafe_delete!(LLVM.parent(val), val) 74 | # FIXME: iterator invalidation? 75 | changed = true 76 | end 77 | end 78 | changed 79 | end 80 | 81 | changed |= replace_bindings!(f) 82 | end 83 | end 84 | 85 | return changed 86 | end 87 | 88 | function mcgen(ctx::CompilerContext, mod::LLVM.Module, f::LLVM.Function) 89 | tm = machine(ctx.cap, triple(mod)) 90 | 91 | InitializeNVPTXAsmPrinter() 92 | return String(emit(tm, mod, LLVM.API.LLVMAssemblyFile)) 93 | end 94 | -------------------------------------------------------------------------------- /src/compiler/common.jl: -------------------------------------------------------------------------------- 1 | # common functionality 2 | 3 | struct CompilerContext 4 | # core invocation 5 | f::Core.Function 6 | tt::DataType 7 | cap::VersionNumber 8 | kernel::Bool 9 | 10 | # optional properties 11 | minthreads::Union{Nothing,CuDim} 12 | maxthreads::Union{Nothing,CuDim} 13 | blocks_per_sm::Union{Nothing,Integer} 14 | maxregs::Union{Nothing,Integer} 15 | 16 | CompilerContext(f, tt, cap, kernel; 17 | minthreads=nothing, maxthreads=nothing, 18 | blocks_per_sm=nothing, maxregs=nothing) = 19 | new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs) 20 | end 21 | 22 | # global context reference 23 | # FIXME: thread through `ctx` everywhere (deadlocks the Julia compiler when doing so with 24 | # the LLVM passes in CUDAnative) 25 | global_ctx = nothing 26 | 27 | 28 | function signature(ctx::CompilerContext) 29 | fn = typeof(ctx.f).name.mt.name 30 | args = join(ctx.tt.parameters, ", ") 31 | return "$fn($(join(ctx.tt.parameters, ", ")))" 32 | end 33 | 34 | 35 | struct KernelError <: Exception 36 | ctx::CompilerContext 37 | message::String 38 | help::Union{Nothing,String} 39 | bt::StackTraces.StackTrace 40 | 41 | KernelError(ctx::CompilerContext, message::String, help=nothing; 42 | bt=StackTraces.StackTrace()) = 43 | new(ctx, message, help, bt) 44 | end 45 | 46 | function Base.showerror(io::IO, err::KernelError) 47 | println(io, "GPU compilation of $(signature(err.ctx)) failed") 48 | println(io, "KernelError: $(err.message)") 49 | println(io) 50 | println(io, something(err.help, "Try inspecting the generated code with any of the @device_code_... macros.")) 51 | Base.show_backtrace(io, err.bt) 52 | end 53 | 54 | 55 | struct InternalCompilerError <: Exception 56 | ctx::CompilerContext 57 | message::String 58 | meta::Dict 59 | InternalCompilerError(ctx, message; kwargs...) = new(ctx, message, kwargs) 60 | end 61 | 62 | function Base.showerror(io::IO, err::InternalCompilerError) 63 | println(io, """CUDAnative.jl encountered an unexpected internal compiler error. 
64 | Please file an issue attaching the following information, including the backtrace, 65 | as well as a reproducible example (if possible).""") 66 | 67 | println(io, "\nInternalCompilerError: $(err.message)") 68 | 69 | println(io, "\nCompiler invocation:") 70 | for field in fieldnames(CompilerContext) 71 | println(io, " - $field = $(repr(getfield(err.ctx, field)))") 72 | end 73 | 74 | if !isempty(err.meta) 75 | println(io, "\nAdditional information:") 76 | for (key,val) in err.meta 77 | println(io, " - $key = $(repr(val))") 78 | end 79 | end 80 | 81 | println(io, "\nInstalled packages:") 82 | for (pkg,ver) in Pkg.installed() 83 | println(io, " - $pkg = $ver") 84 | end 85 | 86 | println(io) 87 | versioninfo(io) 88 | end 89 | 90 | macro compiler_assert(ex, ctx, kwargs...) 91 | msg = "$ex, at $(__source__.file):$(__source__.line)" 92 | return :($(esc(ex)) ? $(nothing) 93 | : throw(InternalCompilerError($(esc(ctx)), $msg; 94 | $(map(esc, kwargs)...))) 95 | ) 96 | end 97 | 98 | 99 | # maintain our own "global unique" suffix for disambiguating kernels 100 | globalUnique = 0 101 | -------------------------------------------------------------------------------- /examples/reduce/reduce.cu: -------------------------------------------------------------------------------- 1 | // Fast parallel reduction for Kepler hardware 2 | // 3 | // Based on devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ 4 | 5 | #include 6 | #include 7 | 8 | #define APICALL(code) { check_code((code), __FILE__, __LINE__); } 9 | inline void check_code(cudaError_t code, const char *file, int line) 10 | { 11 | if (code != cudaSuccess) 12 | { 13 | fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line); 14 | exit(code); 15 | } 16 | } 17 | 18 | 19 | // 20 | // Main implementation 21 | // 22 | 23 | // Reduce a value across a warp 24 | __inline__ __device__ 25 | int sumReduce_warp(int val) { 26 | for (int offset = warpSize/2; offset > 0; offset /= 2) 27 | val += __shfl_down(val, offset); 28 | return val; 29 | } 30 | 31 | // Reduce a value across a block, using shared memory for communication 32 | __inline__ __device__ int sumReduce_block(int val) { 33 | // shared mem for 32 partial sums 34 | static __shared__ int shared[32]; 35 | 36 | int lane = threadIdx.x % warpSize; 37 | int wid = threadIdx.x / warpSize; 38 | 39 | // each warp performs partial reduction 40 | val = sumReduce_warp(val); 41 | 42 | // write reduced value to shared memory 43 | if (lane==0) shared[wid]=val; 44 | 45 | // wait for all partial reductions 46 | __syncthreads(); 47 | 48 | // read from shared memory only if that warp existed 49 | val = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : 0; 50 | 51 | // final reduce within first warp 52 | if (wid==0) { 53 | val = sumReduce_warp(val); 54 | } 55 | 56 | return val; 57 | } 58 | 59 | // Reduce an array across a complete grid 60 | __global__ void sumReduce_grid(int *input, int* output, int N) { 61 | int sum = 0; 62 | 63 | // reduce multiple elements per thread (grid-stride loop) 64 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; 65 | i < N; 66 | i += blockDim.x * gridDim.x) { 67 | sum += input[i]; 68 | } 69 | 70 | sum = sumReduce_block(sum); 71 | 72 | if (threadIdx.x==0) 73 | output[blockIdx.x]=sum; 74 | } 75 | 76 | void sumReduce(int *input, int* output, int N) { 77 | int threads = 512; 78 | int blocks = min((N + threads - 1) / threads, 1024); 79 | 80 | sumReduce_grid<<>>(input, output, N); 81 | sumReduce_grid<<<1, 1024>>>(output, output, blocks); 82 | } 83 | 84 | 85 | // 86 | // Benchmark entry-points 87 | // 88 | 89 | struct State 90 | { 91 | size_t len; 92 | int *gpu_input; 93 | int *gpu_output; 94 | }; 95 | 96 | extern "C" 97 | State *setup(int *input, size_t len) 98 | { 99 | State *state = new State(); 100 | 101 | state->len = len; 102 | 103 | APICALL(cudaMalloc(&state->gpu_input, len*sizeof(int))); 104 | APICALL(cudaMemcpy(state->gpu_input, input, len*sizeof(int), cudaMemcpyHostToDevice)); 105 | APICALL(cudaMalloc(&state->gpu_output, len*sizeof(int))); 106 | 107 | return state; 108 | } 109 | 110 | extern "C" 111 | int run(State *state) 112 | { 113 | sumReduce(state->gpu_input, state->gpu_output, state->len); 114 | 115 | int* output = (int*) malloc(state->len * sizeof(int)); 116 | APICALL(cudaMemcpy(output, state->gpu_output, state->len*sizeof(int), cudaMemcpyDeviceToHost)); 117 | int val = output[0]; 118 | free(output); 119 | 120 | return val; 121 | } 122 | 123 | extern "C" 124 | void teardown(State *state) 125 | { 126 | APICALL(cudaFree(state->gpu_output)); 127 | APICALL(cudaFree(state->gpu_input)); 128 | } 129 | -------------------------------------------------------------------------------- /src/init.jl: -------------------------------------------------------------------------------- 1 | # Initialization 2 | 3 | export device! 4 | 5 | 6 | const initialized = Ref{Bool}(false) 7 | const device_contexts = Dict{CuDevice,CuContext}() 8 | 9 | # FIXME: support for flags (see `cudaSetDeviceFlags`) 10 | 11 | # API calls that are allowed without lazily initializing the CUDA library 12 | # 13 | # this list isn't meant to be complete (ie. many other API calls are actually allowed 14 | # without setting-up a context), and only serves to make multi-device applications possible. 15 | # 16 | # feel free to open a PR adding additional API calls, if you have a specific use for them. 17 | const preinit_apicalls = Set{Symbol}([ 18 | :cuDriverGetVersion, 19 | # device calls, commonly used to determine the most appropriate device 20 | :cuDeviceGet, 21 | :cuDeviceGetAttribute, 22 | :cuDeviceGetCount, 23 | :cuDeviceGetName, 24 | :cuDeviceTotalMem, 25 | # context calls, for testing 26 | :cuCtxGetCurrent 27 | ]) 28 | 29 | function maybe_initialize(apicall) 30 | initialized[] && return 31 | apicall in preinit_apicalls && return 32 | @debug "Initializing CUDA after call to $apicall" 33 | initialize() 34 | end 35 | 36 | function initialize(dev = CuDevice(0)) 37 | # NOTE: we could do something smarter here, 38 | # eg. 
select the most powerful device, 39 | # or skip devices without free memory 40 | device!(dev) 41 | end 42 | 43 | const device!_listeners = Set{Function}() 44 | 45 | """ 46 | device!(dev) 47 | 48 | Sets `dev` as the current active device for the calling host thread. Devices can be 49 | specified by integer id, or as a `CuDevice`. This is intended to be a low-cost operation, 50 | only performing significant work when calling it for the first time for each device. 51 | 52 | If your library or code needs to perform an action when the active device changes, add a 53 | callback of the signature `(::CuDevice, ::CuContext)` to the `device!_listeners` set. 54 | """ 55 | function device!(dev::CuDevice) 56 | if !initialized[] 57 | initialized[] = true 58 | CUDAdrv.apicall_hook[] = nothing 59 | end 60 | 61 | # NOTE: although these conceptually match what the primary context is for, 62 | # we don't use that because it is refcounted separately 63 | # and might confuse / be confused by user operations 64 | # (eg. calling `unsafe_reset!` on a primary context) 65 | if haskey(device_contexts, dev) 66 | ctx = device_contexts[dev] 67 | activate(ctx) 68 | else 69 | device_contexts[dev] = CuContext(dev) 70 | end 71 | 72 | for listener in device!_listeners 73 | listener(dev, device_contexts[dev]) 74 | end 75 | end 76 | device!(dev::Integer) = device!(CuDevice(dev)) 77 | 78 | """ 79 | device!(f, dev) 80 | 81 | Sets the active device for the duration of `f`. 82 | """ 83 | function device!(f::Function, dev::CuDevice) 84 | # FIXME: should use Push/Pop 85 | old_ctx = CuCurrentContext() 86 | try 87 | device!(dev) 88 | f() 89 | finally 90 | if old_ctx != nothing 91 | activate(old_ctx) 92 | end 93 | end 94 | end 95 | device!(f::Function, dev::Integer) = device!(f, CuDevice(dev)) 96 | 97 | function __init__() 98 | configured || return 99 | 100 | if CUDAdrv.version() != cuda_driver_version 101 | error("Your set-up has changed. Please run Pkg.build(\"CUDAnative\") and restart Julia.") 102 | end 103 | 104 | CUDAdrv.apicall_hook[] = maybe_initialize 105 | __init_compiler__() 106 | end 107 | -------------------------------------------------------------------------------- /docs/src/man/usage.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | 4 | ## Quick start 5 | 6 | First you have to write the kernel function and make sure it only uses features from the 7 | CUDA-supported subset of Julia: 8 | 9 | ```julia 10 | using CUDAnative 11 | 12 | function kernel_vadd(a, b, c) 13 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 14 | c[i] = a[i] + b[i] 15 | return nothing 16 | end 17 | ``` 18 | 19 | Using the `@cuda` macro, you can launch the kernel on a GPU of your choice: 20 | 21 | ```julia 22 | using CUDAdrv, CUDAnative, CuArrays 23 | using Test 24 | 25 | # CUDAdrv functionality: generate and upload data 26 | a = round.(rand(Float32, (3, 4)) * 100) 27 | b = round.(rand(Float32, (3, 4)) * 100) 28 | d_a = CuArray(a) 29 | d_b = CuArray(b) 30 | d_c = similar(d_a) # output array 31 | 32 | # run the kernel and fetch results 33 | # syntax: @cuda [kwargs...] kernel(args...) 34 | @cuda threads=12 kernel_vadd(d_a, d_b, d_c) 35 | 36 | # CUDAdrv functionality: download data 37 | # this synchronizes the device 38 | c = Array(d_c) 39 | 40 | @test a+b ≈ c 41 | ``` 42 | 43 | This code is executed in a default, global context for the first device in your 44 | system. 
Similar to `cudaSetDevice`, you can switch devices by calling 45 | CUDAnative's `device!` function: 46 | 47 | ```julia 48 | # change the active device 49 | device!(1) 50 | 51 | # the same, but only temporarily 52 | device!(2) do 53 | # ... 54 | end 55 | ``` 56 | 57 | To enable debug logging, launch Julia with the `JULIA_DEBUG` environment 58 | variable set to `CUDAnative`. 59 | 60 | 61 | 62 | ## Julia support 63 | 64 | Only a limited subset of Julia is supported by this package. This subset is undocumented, as 65 | it is too much in flux. 66 | 67 | In general, GPU support of Julia code is determined by the language features used by the 68 | code. Several parts of the language are downright disallowed, such as calls to the Julia 69 | runtime, or garbage allocations. Other features might get reduced in strength, eg. throwing 70 | exceptions will result in a `trap`. 71 | 72 | If your code is incompatible with GPU execution, the compiler will mention the unsupported 73 | feature, and where the use came from: 74 | 75 | ``` 76 | julia> foo(i) = (print("can't do this"); return nothing) 77 | foo (generic function with 1 method) 78 | 79 | julia> @cuda foo(1) 80 | ERROR: error compiling foo: error compiling print: generic call to unsafe_write requires the runtime language feature 81 | ``` 82 | 83 | In addition, the JIT doesn't support certain modes of compilation. For example, recursive 84 | functions require a proper cached compilation, which is currently absent. 85 | 86 | 87 | ## CUDA support 88 | 89 | Not all of CUDA is supported, and because of time constraints the supported subset is again 90 | undocumented. The following (incomplete) list details the support and their CUDAnative.jl 91 | names. Most are implemented in `intrinsics.jl`, so have a look at that file for a more up to 92 | date list: 93 | 94 | * Indexing: `threadIdx().{x,y,z}`, `blockDim()`, `blockIdx()`, `gridDim()`, `warpsize()` 95 | * Shared memory: `@cuStaticSharedMemory`, `@cuDynamicSharedMemory` 96 | * Array type: `CuDeviceArray` (converted from input `CuArray`s, or shared memory) 97 | * I/O: `@cuprintf` 98 | * Synchronization: `sync_threads` 99 | * Communication: `vote_{all,any,ballot}` 100 | * Data movement: `shfl_{up,down,bfly,idx}` 101 | 102 | ### `libdevice` 103 | 104 | In addition to the native intrinsics listed above, math functionality from `libdevice` is 105 | wrapped and part of CUDAnative. For now, you need to fully qualify function calls to these 106 | intrinsics, which provide similar functionality to some of the low-level math functionality 107 | of Base which would otherwise call out to `libm`. 108 | -------------------------------------------------------------------------------- /examples/reduce/reduce.jl: -------------------------------------------------------------------------------- 1 | # EXCLUDE FROM TESTING 2 | # this file doesn't have an entry point, see `verify.jl` instead 3 | 4 | # Fast parallel reduction for Kepler hardware 5 | # - uses shuffle and shared memory to reduce efficiently 6 | # - support for large arrays 7 | # 8 | # Based on devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ 9 | 10 | using CUDAdrv, CUDAnative, CuArrays 11 | 12 | 13 | # 14 | # Main implementation 15 | # 16 | 17 | # Reduce a value across a warp 18 | @inline function reduce_warp(op::F, val::T)::T where {F<:Function,T} 19 | offset = CUDAnative.warpsize() ÷ 2 20 | # TODO: this can be unrolled if warpsize is known... 
21 | while offset > 0 22 | val = op(val, shfl_down(val, offset)) 23 | offset ÷= 2 24 | end 25 | return val 26 | end 27 | 28 | # Reduce a value across a block, using shared memory for communication 29 | @inline function reduce_block(op::F, val::T)::T where {F<:Function,T} 30 | # shared mem for 32 partial sums 31 | shared = @cuStaticSharedMem(T, 32) 32 | 33 | wid, lane = fldmod1(threadIdx().x, CUDAnative.warpsize()) 34 | 35 | # each warp performs partial reduction 36 | val = reduce_warp(op, val) 37 | 38 | # write reduced value to shared memory 39 | if lane == 1 40 | @inbounds shared[wid] = val 41 | end 42 | 43 | # wait for all partial reductions 44 | sync_threads() 45 | 46 | # read from shared memory only if that warp existed 47 | @inbounds val = (threadIdx().x <= fld(blockDim().x, CUDAnative.warpsize())) ? shared[lane] : zero(T) 48 | 49 | # final reduce within first warp 50 | if wid == 1 51 | val = reduce_warp(op, val) 52 | end 53 | 54 | return val 55 | end 56 | 57 | # Reduce an array across a complete grid 58 | function reduce_grid(op::F, input::CuDeviceVector{T}, output::CuDeviceVector{T}, 59 | len::Integer) where {F<:Function,T} 60 | 61 | # TODO: neutral element depends on the operator (see Base's 2 and 3 argument `reduce`) 62 | val = zero(T) 63 | 64 | # reduce multiple elements per thread (grid-stride loop) 65 | # TODO: step range (see JuliaGPU/CUDAnative.jl#12) 66 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 67 | step = blockDim().x * gridDim().x 68 | while i <= len 69 | @inbounds val = op(val, input[i]) 70 | i += step 71 | end 72 | 73 | val = reduce_block(op, val) 74 | 75 | if threadIdx().x == 1 76 | @inbounds output[blockIdx().x] = val 77 | end 78 | 79 | return 80 | end 81 | 82 | """ 83 | Reduce a large array. 84 | 85 | Kepler-specific implementation, ie. you need sm_30 or higher to run this code. 86 | """ 87 | function gpu_reduce(op::Function, input::CuVector{T}, output::CuVector{T}) where {T} 88 | len = length(input) 89 | 90 | # TODO: these values are hardware-dependent, with recent GPUs supporting more threads 91 | threads = 512 92 | blocks = min((len + threads - 1) ÷ threads, 1024) 93 | 94 | # the output array must have a size equal to or larger than the number of thread blocks 95 | # in the grid because each block writes to a unique location within the array. 96 | if length(output) < blocks 97 | throw(ArgumentError("output array too small, should be at least $blocks elements")) 98 | end 99 | 100 | @cuda blocks=blocks threads=threads reduce_grid(op, input, output, len) 101 | @cuda threads=1024 reduce_grid(op, output, output, blocks) 102 | end 103 | 104 | 105 | # FURTHER IMPROVEMENTS: 106 | # - use atomic memory operations 107 | # - dynamic block/grid size based on device capabilities 108 | # - vectorized memory access 109 | # devblogs.nvidia.com/parallelforall/cuda-pro-tip-increase-performance-with-vectorized-memory-access/ 110 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | CUDAnative v1.0 release notes 2 | ============================= 3 | 4 | This document describes major features and user-facing changes to CUDAnative. 5 | 6 | 7 | New features 8 | ------------ 9 | 10 | * `@device_code_...` macros make it easy to inspect generated device code even 11 | if the outermost function call isn't a `@cuda` invocation. This is especially 12 | useful in combination with, e.g., CuArrays. 
The `@device_code` macro dumps 13 | _all_ forms of intermediate code to a directory, for easy inspection ([#147]). 14 | 15 | * Fast versions of CUDA math intrinsics are now wrapped ([#152]). 16 | 17 | * Support for loading values through the texture cache, aka. `__ldg`, has been 18 | added. No `getindex`-based interfaced is available yet, manually use 19 | `unsafe_cached_load` instead ([#158]). 20 | 21 | * Multiple devices are supported, by calling `device!` to switch to another 22 | device. The CUDA API is now also initialized lazily, so be sure to call 23 | `device!` before performing any work to avoid allocating a context on device 24 | 0 ([#175]). 25 | 26 | * Support for object and closure kernel functions has been added ([#176]). 27 | 28 | * IR transformation passes have been introduced to rewrite exceptions, where 29 | possible, to generate user-friendly messages as well as prevent hitting 30 | issues in `ptxas` ([#241]). 31 | 32 | * Code generated by `@cuda` can now be recreated manually using a low-level 33 | kernel launch API. The kernel objects used in that API are useful for 34 | reflecting on hardware resource usage ([#266]). 35 | 36 | * A GPU runtime library has been introduced ([#303]), implementing certain functionality 37 | from the Julia runtime library that would previously have prevented GPU execution 38 | ([#314], [#318], [#321]). 39 | 40 | 41 | Changes 42 | ------- 43 | 44 | * Debug info generation now honors the `-g` flag as passed to the Julia command, 45 | and is no longer tied to the `DEBUG` environment variable. 46 | 47 | * Log messages are implemented using the new Base Julia logging system. Debug 48 | logging can be enabled by specifying the `JULIA_DEBUG=CUDAnative` environment 49 | variable. 50 | 51 | * The syntax of `@cuda` now takes keyword arguments, eg. `@cuda threads=1 52 | foo(...)`, instead of the old tuple syntax. See the documentation of `@cuda` 53 | for a list of supported arguments ([#154]). 54 | 55 | * Non isbits values can be passed to a kernel, as long as they are unused. This 56 | makes it easier to implement GPU-versions of existing functions, without 57 | requiring a different method signature ([#168]). 58 | 59 | * Indexing intrinsics now return `Int`, so no need to convert to `(U)Int32` 60 | anymore. Although this might require more registers, it allows LLVM to 61 | simplify code ([#182]). 62 | 63 | * Better error messages, showing backtraces into GPU code (#189) and detecting 64 | common pitfalls like recursion or use of Base intrinsics (#210). 65 | 66 | * Debug information is now stripped from LLVM and PTX reflection functions 67 | ([#208], [#214]). Use the `strip_ir_metadata` (cfr. Base) keyword argument 68 | to disable this. 69 | 70 | * Error handling and reporting has been improved. This includes 71 | GPU-incompatible `ccall`s which are now detected and decoded by the IR 72 | validator ([#248]). 73 | 74 | * A callback mechanism has been introduced to inform downstream users about 75 | device switches ([#226]). 76 | 77 | * Adapt.jl is now used for host-device argument conversions ([#269]). 78 | 79 | 80 | Deprecations and removals 81 | ------------------------- 82 | 83 | * `CUDAnative.@profile` has been removed, use `CUDAdrv.@profile` with a manual 84 | warm-up step instead. 85 | 86 | * The `KernelWrapper` has been removed since it prevented inferring varargs 87 | functions ([#254]). 88 | 89 | * Support for `CUDAdrv.CuArray` has been removed, the CuArrays.jl package should be used 90 | instead ([#284]). 
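To illustrate the `@profile` and `CuArray` deprecations above, a minimal migration sketch (with a hypothetical `increment` kernel, assuming CUDAdrv.jl, CUDAnative.jl and CuArrays.jl are installed):

```julia
using CUDAdrv, CUDAnative, CuArrays

# CuArrays.jl replaces the removed CUDAdrv.CuArray
a = CuArray(ones(Float32, 1024))

increment(x) = (@inbounds x[threadIdx().x] += 1f0; return)

# warm up manually so compilation time is excluded, then profile with CUDAdrv
@cuda threads=1024 increment(a)
CUDAdrv.@profile @cuda threads=1024 increment(a)
```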
-------------------------------------------------------------------------------- /test/util.jl: -------------------------------------------------------------------------------- 1 | # @test_throw, with additional testing for the exception message 2 | macro test_throws_message(f, typ, ex...) 3 | quote 4 | msg = "" 5 | @test_throws $(esc(typ)) try 6 | $(esc(ex...)) 7 | catch err 8 | msg = sprint(showerror, err) 9 | rethrow() 10 | end 11 | 12 | if !$(esc(f))(msg) 13 | # @test should return its result, but doesn't 14 | @error "Failed to validate error message\n$msg" 15 | end 16 | @test $(esc(f))(msg) 17 | end 18 | end 19 | 20 | # NOTE: based on test/pkg.jl::capture_stdout, but doesn't discard exceptions 21 | macro grab_output(ex) 22 | quote 23 | mktemp() do fname, fout 24 | ret = nothing 25 | open(fname, "w") do fout 26 | redirect_stdout(fout) do 27 | ret = $(esc(ex)) 28 | end 29 | end 30 | ret, read(fname, String) 31 | end 32 | end 33 | end 34 | 35 | # Run some code on-device, returning captured standard output 36 | macro on_device(ex) 37 | quote 38 | let 39 | function kernel() 40 | $(esc(ex)) 41 | return 42 | end 43 | 44 | @cuda kernel() 45 | synchronize() 46 | end 47 | end 48 | end 49 | 50 | # helper function for sinking a value to prevent the callee from getting optimized away 51 | @inline sink(i::Int32) = 52 | Base.llvmcall("""%slot = alloca i32 53 | store volatile i32 %0, i32* %slot 54 | %value = load volatile i32, i32* %slot 55 | ret i32 %value""", Int32, Tuple{Int32}, i) 56 | @inline sink(i::Int64) = 57 | Base.llvmcall("""%slot = alloca i64 58 | store volatile i64 %0, i64* %slot 59 | %value = load volatile i64, i64* %slot 60 | ret i64 %value""", Int64, Tuple{Int64}, i) 61 | 62 | function julia_script(code, args=``) 63 | # FIXME: this doesn't work when the compute mode is set to exclusive 64 | script = "using CUDAnative; import CUDAdrv; $code" 65 | out = Pipe() 66 | err = Pipe() 67 | cmd = `$(Base.julia_cmd()) -e $script` 68 | if Base.JLOptions().project != C_NULL 69 | # --project isn't preserved by julia_cmd() 70 | cmd = `$cmd --project=$(unsafe_string(Base.JLOptions().project))` 71 | end 72 | cmd = `$cmd $args` 73 | proc = run(pipeline(cmd, stdout=out, stderr=err), wait=false) 74 | close(out.in) 75 | close(err.in) 76 | wait(proc) 77 | proc.exitcode, read(out, String), read(err, String) 78 | end 79 | 80 | # a lightweight CUDA array type for testing purposes 81 | ## ctor & finalizer 82 | mutable struct CuTestArray{T,N} 83 | buf::Mem.Buffer 84 | shape::NTuple{N,Int} 85 | function CuTestArray{T,N}(shape::NTuple{N,Int}) where {T,N} 86 | len = prod(shape) 87 | buf = Mem.alloc(len*sizeof(T)) 88 | 89 | obj = new{T,N}(buf, shape) 90 | finalizer(unsafe_free!, obj) 91 | return obj 92 | end 93 | end 94 | function unsafe_free!(a::CuTestArray) 95 | CUDAdrv.isvalid(a.buf.ctx) && Mem.free(a.buf) 96 | end 97 | ## memory copy operations 98 | function CuTestArray(src::Array{T,N}) where {T,N} 99 | dst = CuTestArray{T,N}(size(src)) 100 | Mem.upload!(dst.buf, pointer(src), length(src) * sizeof(T)) 101 | return dst 102 | end 103 | function Base.Array(src::CuTestArray{T,N}) where {T,N} 104 | dst = Array{T,N}(undef, src.shape) 105 | Mem.download!(pointer(dst), src.buf, prod(src.shape) * sizeof(T)) 106 | return dst 107 | end 108 | ## conversions 109 | using Adapt 110 | function Adapt.adapt_storage(::CUDAnative.Adaptor, a::CuTestArray{T,N}) where {T,N} 111 | ptr = Base.unsafe_convert(CuPtr{T}, a.buf) 112 | devptr = CUDAnative.DevicePtr{T,AS.Global}(ptr) 113 | CuDeviceArray{T,N,AS.Global}(a.shape, devptr) 114 | 
end 115 | -------------------------------------------------------------------------------- /examples/pairwise.jl: -------------------------------------------------------------------------------- 1 | # calculate pairwise distance between every point in a vector 2 | 3 | using CUDAdrv, CUDAnative, CuArrays 4 | 5 | 6 | function haversine_cpu(lat1::Float32, lon1::Float32, lat2::Float32, lon2::Float32, radius::Float32) 7 | c1 = cospi(lat1 / 180.0f0) 8 | c2 = cospi(lat2 / 180.0f0) 9 | dlat = lat2 - lat1 10 | dlon = lon2 - lon1 11 | d1 = sinpi(dlat / 360.0f0) 12 | d2 = sinpi(dlon / 360.0f0) 13 | t = d2 * d2 * c1 * c2 14 | a = d1 * d1 + t 15 | c = 2.0f0 * asin(min(1.0f0, sqrt(a))) 16 | return radius * c 17 | end 18 | 19 | function pairwise_dist_cpu(lat::Vector{Float32}, lon::Vector{Float32}) 20 | # allocate 21 | n = length(lat) 22 | rowresult = Array{Float32}(undef, n, n) 23 | 24 | # brute force fill in each cell 25 | for i in 1:n, j in 1:n 26 | @inbounds rowresult[i, j] = haversine_cpu(lat[i], lon[i], lat[j], lon[j] , 6372.8f0) 27 | end 28 | 29 | return rowresult 30 | end 31 | 32 | # from https://devblogs.nvidia.com/parallelforall/fast-great-circle-distance-calculation-cuda-c/ 33 | function haversine_gpu(lat1::Float32, lon1::Float32, lat2::Float32, lon2::Float32, radius::Float32) 34 | # XXX: need to prefix math intrinsics with CUDAnative 35 | c1 = CUDAnative.cospi(lat1 / 180.0f0) 36 | c2 = CUDAnative.cospi(lat2 / 180.0f0) 37 | dlat = lat2 - lat1 38 | dlon = lon2 - lon1 39 | d1 = CUDAnative.sinpi(dlat / 360.0f0) 40 | d2 = CUDAnative.sinpi(dlon / 360.0f0) 41 | t = d2 * d2 * c1 * c2 42 | a = d1 * d1 + t 43 | c = 2.0f0 * CUDAnative.asin(CUDAnative.min(1.0f0, CUDAnative.sqrt(a))) 44 | return radius * c 45 | end 46 | 47 | # pairwise distance calculation kernel 48 | function pairwise_dist_kernel(lat::CuDeviceVector{Float32}, lon::CuDeviceVector{Float32}, 49 | rowresult::CuDeviceMatrix{Float32}, n) 50 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 51 | j = (blockIdx().y-1) * blockDim().y + threadIdx().y 52 | 53 | if i <= n && j <= n 54 | # store to shared memory 55 | shmem = @cuDynamicSharedMem(Float32, 2*blockDim().x + 2*blockDim().y) 56 | if threadIdx().y == 1 57 | shmem[threadIdx().x] = lat[i] 58 | shmem[blockDim().x + threadIdx().x] = lon[i] 59 | end 60 | if threadIdx().x == 1 61 | shmem[2*blockDim().x + threadIdx().y] = lat[j] 62 | shmem[2*blockDim().x + blockDim().y + threadIdx().y] = lon[j] 63 | end 64 | sync_threads() 65 | 66 | # load from shared memory 67 | lat_i = shmem[threadIdx().x] 68 | lon_i = shmem[blockDim().x + threadIdx().x] 69 | lat_j = shmem[2*blockDim().x + threadIdx().y] 70 | lon_j = shmem[2*blockDim().x + blockDim().y + threadIdx().y] 71 | 72 | @inbounds rowresult[i, j] = haversine_gpu(lat_i, lon_i, lat_j, lon_j, 6372.8f0) 73 | end 74 | 75 | return 76 | end 77 | 78 | function pairwise_dist_gpu(lat::Vector{Float32}, lon::Vector{Float32}) 79 | # upload 80 | lat_gpu = CuArray(lat) 81 | lon_gpu = CuArray(lon) 82 | 83 | # allocate 84 | n = length(lat) 85 | rowresult_gpu = CuArray{Float32}(undef, n, n) 86 | 87 | # calculate launch configuration 88 | # NOTE: we want our launch configuration to be as square as possible, 89 | # because that minimizes shared memory usage 90 | dev = device() 91 | total_threads = min(n, attribute(dev, CUDAdrv.MAX_THREADS_PER_BLOCK)) 92 | threads_x = floor(Int, sqrt(total_threads)) 93 | threads_y = total_threads ÷ threads_x 94 | threads = (threads_x, threads_y) 95 | blocks = ceil.(Int, n ./ threads) 96 | 97 | # calculate size of dynamic shared memory 98 | 
shmem = 2 * sum(threads) * sizeof(Float32) 99 | 100 | @cuda blocks=blocks threads=threads shmem=shmem pairwise_dist_kernel(lat_gpu, lon_gpu, rowresult_gpu, n) 101 | 102 | return Array(rowresult_gpu) 103 | end 104 | 105 | using Test 106 | 107 | # generate reasonable data 108 | function main(n = 10000) 109 | lat = rand(Float32, n) .* 45 110 | lon = rand(Float32, n) .* -120 111 | 112 | @test pairwise_dist_cpu(lat, lon) ≈ pairwise_dist_gpu(lat, lon) rtol=1e-2 113 | end 114 | main() 115 | -------------------------------------------------------------------------------- /src/device/cuda_intrinsics/memory_shared.jl: -------------------------------------------------------------------------------- 1 | # Shared Memory (part of B.2) 2 | 3 | export @cuStaticSharedMem, @cuDynamicSharedMem 4 | 5 | # FIXME: `shmem_id` increment in the macro isn't correct, as multiple parametrically typed 6 | # functions will alias the id (but the size might be a parameter). but incrementing in 7 | # the @generated function doesn't work, as it is supposed to be pure and identical 8 | # invocations will erroneously share (and even cause multiple shmem globals). 9 | shmem_id = 0 10 | 11 | """ 12 | @cuStaticSharedMem(T::Type, dims) -> CuDeviceArray{T,AS.Shared} 13 | 14 | Get an array of type `T` and dimensions `dims` (either an integer length or tuple shape) 15 | pointing to a statically-allocated piece of shared memory. The type should be statically 16 | inferable and the dimensions should be constant, or an error will be thrown and the 17 | generator function will be called dynamically. 18 | """ 19 | macro cuStaticSharedMem(T, dims) 20 | global shmem_id 21 | id = shmem_id::Int += 1 22 | 23 | quote 24 | len = prod($(esc(dims))) 25 | ptr = _shmem(Val($id), $(esc(T)), Val(len)) 26 | CuDeviceArray($(esc(dims)), ptr) 27 | end 28 | end 29 | 30 | """ 31 | @cuDynamicSharedMem(T::Type, dims, offset::Integer=0) -> CuDeviceArray{T,AS.Shared} 32 | 33 | Get an array of type `T` and dimensions `dims` (either an integer length or tuple shape) 34 | pointing to a dynamically-allocated piece of shared memory. The type should be statically 35 | inferable or an error will be thrown and the generator function will be called dynamically. 36 | 37 | Note that the amount of dynamic shared memory needs to specified when launching the kernel. 38 | 39 | Optionally, an offset parameter indicating how many bytes to add to the base shared memory 40 | pointer can be specified. This is useful when dealing with a heterogeneous buffer of dynamic 41 | shared memory; in the case of a homogeneous multi-part buffer it is preferred to use `view`. 
42 | """ 43 | macro cuDynamicSharedMem(T, dims, offset=0) 44 | global shmem_id 45 | id = shmem_id::Int += 1 46 | 47 | # TODO: boundscheck against %dynamic_smem_size (currently unsupported by LLVM) 48 | 49 | quote 50 | len = prod($(esc(dims))) 51 | ptr = _shmem(Val($id), $(esc(T))) + $(esc(offset)) 52 | CuDeviceArray($(esc(dims)), ptr) 53 | end 54 | end 55 | 56 | # get a pointer to shared memory, with known (static) or zero length (dynamic shared memory) 57 | @generated function _shmem(::Val{id}, ::Type{T}, ::Val{len}=Val(0)) where {id,T,len} 58 | eltyp = convert(LLVMType, T) 59 | 60 | T_ptr = convert(LLVMType, DevicePtr{T,AS.Shared}) 61 | T_actual_ptr = LLVM.PointerType(eltyp) 62 | 63 | # create a function 64 | llvm_f, _ = create_function(T_ptr) 65 | 66 | # create the global variable 67 | mod = LLVM.parent(llvm_f) 68 | gv_typ = LLVM.ArrayType(eltyp, len) 69 | gv = GlobalVariable(mod, gv_typ, "shmem$id", #=addrspace=# 3) 70 | if len > 0 71 | # static shared memory should be demoted to local variables, whenever possible. 72 | # this is done by the NVPTX ASM printer: 73 | # > Find out if a global variable can be demoted to local scope. 74 | # > Currently, this is valid for CUDA shared variables, which have local 75 | # > scope and global lifetime. So the conditions to check are : 76 | # > 1. Is the global variable in shared address space? 77 | # > 2. Does it have internal linkage? 78 | # > 3. Is the global variable referenced only in one function? 79 | linkage!(gv, LLVM.API.LLVMInternalLinkage) 80 | initializer!(gv, null(gv_typ)) 81 | end 82 | # by requesting a larger-than-datatype alignment, we might be able to vectorize. 83 | # we pick 16 bytes since this is the largest transaction size as supported by PTX. 84 | alignment!(gv, Base.max(16, datatype_align(T))) 85 | 86 | # generate IR 87 | Builder(JuliaContext()) do builder 88 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 89 | position!(builder, entry) 90 | 91 | ptr_with_as = gep!(builder, gv, [ConstantInt(0, JuliaContext()), 92 | ConstantInt(0, JuliaContext())]) 93 | 94 | ptr = addrspacecast!(builder, ptr_with_as, T_actual_ptr) 95 | val = ptrtoint!(builder, ptr, T_ptr) 96 | ret!(builder, val) 97 | end 98 | 99 | call_function(llvm_f, DevicePtr{T,AS.Shared}) 100 | end 101 | -------------------------------------------------------------------------------- /docs/src/man/hacking.md: -------------------------------------------------------------------------------- 1 | # Hacking 2 | 3 | ## Generated functions 4 | 5 | Generated functions are used heavily in CUDAnative.jl, in combination with LLVM.jl, to 6 | generate type-specialized code and IR. If evaluating the generator results in an error, 7 | Julia generates a dynamic call to the generator for you to inspect the error at run-time. 8 | This is a problem in the world of GPUs, where dynamic calls are prohibited. 
A band-aid is to 9 | print the exception during inference: 10 | 11 | ```patch 12 | diff --git a/base/inference.jl b/base/inference.jl 13 | index 6443665676..b03d78ddaa 100644 14 | --- a/base/inference.jl 15 | +++ b/base/inference.jl 16 | @@ -2430,7 +2430,10 @@ function typeinf_frame(linfo::MethodInstance, caller, optimize::Bool, cached::Bo 17 | try 18 | # user code might throw errors – ignore them 19 | src = get_staged(linfo) 20 | - catch 21 | + catch ex 22 | + println("WARNING: An error occurred during generated function execution.") 23 | + println(ex) 24 | + ccall(:jlbacktrace, Void, ()) 25 | return nothing 26 | end 27 | else 28 | ``` 29 | 30 | 31 | ## Adding intrinsics 32 | 33 | Adding intrinsics to `CUDAnative.jl` can be relatively convoluted, depending on the type of 34 | intrinsic. Most of them boil down to inlining a snippet of LLVM IR, using `llvmcall` (or 35 | `ccall` with the `llvmcall` calling convention). For more complex code, use LLVM.jl to build 36 | the IR string. 37 | 38 | 39 | ### `libdevice` intrinsics 40 | 41 | These intrinsics are represented by function calls to `libdevice`. Most of them should 42 | already be covered. There's a convenience macro, `@wrap`, simplifying the job of adding and 43 | exporting intrinsics, and converting arguments and return values. See the documentation of 44 | the macro for more details, and look at `src/device/libdevice.jl` for examples. 45 | 46 | 47 | ### LLVM back-end intrinsics 48 | 49 | Calls to functions like `llvm.nvvm.barrier0` are backed by the PTX LLVM back-end, and can be 50 | wrapped using `ccall` with the `llvmcall` calling convention. For more complex intrinsics, 51 | or when you're not actually calling an intrinsic function, you can still use `@wrap`. 52 | 53 | 54 | ### Inline PTX assembly 55 | 56 | When there's no corresponding `libdevice` function or PTX back-end intrinsic exposing the 57 | required functionality, you can use inline PTX assembly via `llvmcall`. This requires you to 58 | embed the PTX assembly in LLVM IR, which is often messy. 59 | 60 | If the source of the assembly instructions is CUDA C code, you can simplify this task by first 61 | compiling the CUDA code using Clang, and adapting the resulting LLVM IR for use within 62 | `llvmcall`.
For example, extracting the following function definition from the CUDA SDK: 63 | 64 | ```cuda 65 | __device__ unsigned int __ballot(int a) 66 | { 67 | int result; 68 | asm __volatile__ ("{ \n\t" 69 | ".reg .pred \t%%p1; \n\t" 70 | "setp.ne.u32 \t%%p1, %1, 0; \n\t" 71 | "vote.ballot.b32 \t%0, %%p1; \n\t" 72 | "}" : "=r"(result) : "r"(a)); 73 | return result; 74 | } 75 | ``` 76 | 77 | We can generate the following LLVM IR by executing `clang++ -Xclang -fcuda-is-device -S 78 | -emit-llvm -target nvptx64 ballot.cu -o -` (you might need to add [some CUDA 79 | boilerplate](https://gist.github.com/eliben/b014ac17cbe5a452803f)): 80 | 81 | ``` 82 | define i32 @_Z8__balloti(i32 %a) #0 { 83 | %1 = alloca i32, align 4 84 | %result = alloca i32, align 4 85 | store i32 %a, i32* %1, align 4 86 | %2 = load i32, i32* %1, align 4 87 | %3 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09vote.ballot.b32 \09$0, %p1; \0A\09}", "=r,r"(i32 %2) #1, !srcloc !1 88 | store i32 %3, i32* %result, align 4 89 | %4 = load i32, i32* %result, align 4 90 | ret i32 %4 91 | } 92 | ``` 93 | 94 | Finally, we use LLVM.jl's `@asmcall` macro to inline this assembly and call it: 95 | 96 | ```julia 97 | function vote_ballot(pred::Bool) 98 | return @asmcall( 99 | """{ 100 | .reg .pred %p1; 101 | setp.ne.u32 %p1, \$1, 0; 102 | vote.ballot.b32 \$0, %p1; 103 | }""", "=r,r", true, 104 | UInt32, Tuple{Int32}, convert(Int32, pred)) 105 | end 106 | ``` 107 | 108 | 109 | ### Other functionality 110 | 111 | For other functionality, like shared memory, or when some additional management is required, 112 | like storing a global variable for `printf`'s formatting string, you should use LLVM.jl to 113 | build the IR code instead of hacking strings together. As this doesn't touch global state, 114 | you can even do so from a `@generated` function. Do take care however to use Julia's LLVM 115 | context for all operations. 
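
As a minimal (and hypothetical) sketch of that pattern, the generated function below builds
a trivial piece of IR with LLVM.jl and splices it into the caller. It assumes the
`create_function`/`call_function` helpers used throughout this package are available, and it
performs every IR operation in Julia's LLVM context:

```julia
using LLVM
using LLVM.Interop

# made-up example: add one to an Int64, with the body written as raw LLVM IR
@generated function add_one(x::Int64)
    T_int = convert(LLVMType, Int64)

    # create a function taking (and returning) a single Int64
    llvm_f, _ = create_function(T_int, [T_int])

    # generate IR, creating every object in Julia's LLVM context
    Builder(JuliaContext()) do builder
        entry = BasicBlock(llvm_f, "entry", JuliaContext())
        position!(builder, entry)

        val = add!(builder, parameters(llvm_f)[1], LLVM.ConstantInt(T_int, 1))
        ret!(builder, val)
    end

    # splice the freshly-built function into the caller
    call_function(llvm_f, Int64, Tuple{Int64}, :((x,)))
end
```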
116 | -------------------------------------------------------------------------------- /test/device/array.jl: -------------------------------------------------------------------------------- 1 | @testset "device arrays" begin 2 | 3 | @testset "constructors" begin 4 | # inner constructors 5 | let 6 | dp = CUDAnative.DevicePtr{Int,AS.Generic}(0) 7 | CuDeviceArray{Int,1,AS.Generic}((1,), dp) 8 | end 9 | 10 | # outer constructors 11 | for I in [Int32,Int64] 12 | a = I(1) 13 | b = I(2) 14 | 15 | dp = CUDAnative.DevicePtr{I,AS.Generic}(0) 16 | 17 | # not parameterized 18 | CuDeviceArray(b, dp) 19 | CuDeviceArray((b,), dp) 20 | CuDeviceArray((b,a), dp) 21 | 22 | # partially parameterized 23 | CuDeviceArray{I}(b, dp) 24 | CuDeviceArray{I}((b,), dp) 25 | CuDeviceArray{I}((a,b), dp) 26 | CuDeviceArray{I,1}(b, dp) 27 | CuDeviceArray{I,1}((b,), dp) 28 | @test_throws MethodError CuDeviceArray{I,1}((a,b), dp) 29 | @test_throws MethodError CuDeviceArray{I,2}(b, dp) 30 | @test_throws MethodError CuDeviceArray{I,2}((b,), dp) 31 | CuDeviceArray{I,2}((a,b), dp) 32 | 33 | # fully parameterized 34 | CuDeviceArray{I,1,AS.Generic}(b, dp) 35 | CuDeviceArray{I,1,AS.Generic}((b,), dp) 36 | @test_throws MethodError CuDeviceArray{I,1,AS.Generic}((a,b), dp) 37 | @test_throws MethodError CuDeviceArray{I,1,AS.Shared}((a,b), dp) 38 | @test_throws MethodError CuDeviceArray{I,2,AS.Generic}(b, dp) 39 | @test_throws MethodError CuDeviceArray{I,2,AS.Generic}((b,), dp) 40 | CuDeviceArray{I,2,AS.Generic}((a,b), dp) 41 | 42 | # type aliases 43 | CuDeviceVector{I}(b, dp) 44 | CuDeviceMatrix{I}((a,b), dp) 45 | end 46 | end 47 | 48 | @testset "basics" begin # argument passing, get and setindex, length 49 | dims = (16, 16) 50 | len = prod(dims) 51 | 52 | function kernel(input::CuDeviceArray{Float32}, output::CuDeviceArray{Float32}) 53 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 54 | 55 | if i <= length(input) 56 | output[i] = Float64(input[i]) # force conversion upon setindex! 
57 | end 58 | 59 | return 60 | end 61 | 62 | input = round.(rand(Float32, dims) * 100) 63 | 64 | input_dev = CuTestArray(input) 65 | output_dev = CuTestArray(input) 66 | 67 | @cuda threads=len kernel(input_dev, output_dev) 68 | output = Array(output_dev) 69 | @test input ≈ output 70 | end 71 | 72 | @testset "iteration" begin # argument passing, get and setindex, length 73 | dims = (16, 16) 74 | function kernel(input::CuDeviceArray{T}, output::CuDeviceArray{T}) where {T} 75 | acc = zero(T) 76 | for elem in input 77 | acc += elem 78 | end 79 | output[1] = acc 80 | return 81 | end 82 | 83 | input = round.(rand(Float32, dims) * 100) 84 | 85 | input_dev = CuTestArray(input) 86 | output_dev = CuTestArray(Float32[0]) 87 | 88 | @cuda kernel(input_dev, output_dev) 89 | output = Array(output_dev) 90 | @test sum(input) ≈ output[1] 91 | end 92 | 93 | @testset "bounds checking" begin 94 | @testset "#313" begin 95 | function kernel(dest) 96 | dest[1] = 1 97 | nothing 98 | end 99 | tt = Tuple{SubArray{Float64,2,CuDeviceArray{Float64,2,AS.Global}, 100 | Tuple{UnitRange{Int64},UnitRange{Int64}},false}} 101 | 102 | ir = sprint(io->CUDAnative.code_llvm(io, kernel, tt)) 103 | @test !occursin("jl_invoke", ir) 104 | CUDAnative.code_ptx(devnull, kernel, tt) 105 | end 106 | end 107 | 108 | @testset "views" begin 109 | function kernel(array) 110 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 111 | 112 | _sub = view(array, 2:length(array)-1) 113 | if i <= length(_sub) 114 | _sub[i] = i 115 | end 116 | 117 | return 118 | end 119 | 120 | array = zeros(Int64, 100) 121 | array_dev = CuTestArray(array) 122 | 123 | sub = view(array, 2:length(array)-1) 124 | for i in 1:length(sub) 125 | sub[i] = i 126 | end 127 | 128 | @cuda threads=100 kernel(array_dev) 129 | @test array == Array(array_dev) 130 | end 131 | 132 | @testset "non-Int index to unsafe_load" begin 133 | function load_index(a) 134 | return a[UInt64(1)] 135 | end 136 | 137 | a = [1] 138 | p = pointer(a) 139 | dp = Base.bitcast(CUDAnative.DevicePtr{eltype(p), AS.Generic}, p) 140 | da = CUDAnative.CuDeviceArray(1, dp) 141 | load_index(da) 142 | end 143 | 144 | @testset "ldg" begin 145 | function kernel(a, b, i) 146 | b[i] = ldg(a, i) 147 | return 148 | end 149 | 150 | buf = IOBuffer() 151 | 152 | a = CuTestArray([0]) 153 | b = CuTestArray([0]) 154 | @device_code_ptx io=buf @cuda kernel(a, b, 1) 155 | @test Array(a) == Array(b) 156 | 157 | asm = String(take!(copy(buf))) 158 | @test occursin("ld.global.nc", asm) 159 | end 160 | 161 | end 162 | -------------------------------------------------------------------------------- /src/device/cuda_intrinsics/warp_shuffle.jl: -------------------------------------------------------------------------------- 1 | # Warp Shuffle (B.14) 2 | 3 | # TODO: does not work on sub-word (ie. Int16) or non-word divisible sized types 4 | 5 | # TODO: should shfl_idx conform to 1-based indexing? 
6 | 7 | # TODO: these functions should dispatch based on the actual warp size 8 | const ws = Int32(32) 9 | 10 | # TODO: this functionality should throw = v"9.0" && v"6.0" in ptx_support 27 | instruction = Symbol("shfl.sync.$mode.b32") 28 | fname_sync = Symbol("$(fname)_sync") 29 | 30 | # TODO: implement using LLVM intrinsics when we have D38090 31 | 32 | @eval begin 33 | export $fname_sync, $fname 34 | 35 | @inline $fname_sync(val::UInt32, src::UInt32, width::UInt32=$ws, 36 | threadmask::UInt32=0xffffffff) = 37 | @asmcall($"$instruction \$0, \$1, \$2, \$3, \$4;", "=r,r,r,r,r", true, 38 | UInt32, NTuple{4,UInt32}, 39 | val, src, pack(width, $mask), threadmask) 40 | 41 | # FIXME: replace this with a checked conversion once we have exceptions 42 | @inline $fname_sync(val::UInt32, src::Integer, width::Integer=$ws, 43 | threadmask::UInt32=0xffffffff) = 44 | $fname_sync(val, unsafe_trunc(UInt32, src), unsafe_trunc(UInt32, width), 45 | threadmask) 46 | 47 | @inline $fname(val::UInt32, src::Integer, width::Integer=$ws) = 48 | $fname_sync(val, src, width) 49 | end 50 | else 51 | intrinsic = Symbol("llvm.nvvm.shfl.$mode.i32") 52 | 53 | @eval begin 54 | export $fname 55 | @inline $fname(val::UInt32, src::UInt32, width::UInt32=$ws) = 56 | ccall($"$intrinsic", llvmcall, UInt32, 57 | (UInt32, UInt32, UInt32), 58 | val, src, pack(width, $mask)) 59 | 60 | # FIXME: replace this with a checked conversion once we have exceptions 61 | @inline $fname(val::UInt32, src::Integer, width::Integer=$ws) = 62 | $fname(val, unsafe_trunc(UInt32, src), unsafe_trunc(UInt32, width)) 63 | end 64 | end 65 | end 66 | 67 | 68 | # wide and aggregate intrinsics 69 | 70 | for name in ["_up", "_down", "_xor", ""] 71 | fname = Symbol("shfl$name") 72 | @eval @inline $fname(src, args...) = recurse_value_invocation($fname, src, args...) 73 | 74 | fname_sync = Symbol("$(fname)_sync") 75 | @eval @inline $fname_sync(src, args...) = recurse_value_invocation($fname, src, args...) 76 | end 77 | 78 | 79 | # documentation 80 | 81 | @doc """ 82 | shfl(val, lane::Integer, width::Integer=32) 83 | 84 | Shuffle a value from a directly indexed lane `lane`. 85 | """ shfl 86 | 87 | @doc """ 88 | shfl_up(val, delta::Integer, width::Integer=32) 89 | 90 | Shuffle a value from a lane with lower ID relative to caller. 91 | """ shfl_up 92 | 93 | @doc """ 94 | shfl_down(val, delta::Integer, width::Integer=32) 95 | 96 | Shuffle a value from a lane with higher ID relative to caller. 97 | """ shfl_down 98 | 99 | @doc """ 100 | shfl_xor(val, mask::Integer, width::Integer=32) 101 | 102 | Shuffle a value from a lane based on bitwise XOR of own lane ID with `mask`. 103 | """ shfl_xor 104 | 105 | 106 | @doc """ 107 | shfl_sync(val, lane::Integer, width::Integer=32, threadmask::UInt32=0xffffffff) 108 | 109 | Shuffle a value from a directly indexed lane `lane`. The default value for `threadmask` 110 | performs the shuffle on all threads in the warp. 111 | """ shfl_sync 112 | 113 | @doc """ 114 | shfl_up_sync(val, delta::Integer, width::Integer=32, threadmask::UInt32=0xffffffff) 115 | 116 | Shuffle a value from a lane with lower ID relative to caller. The default value for 117 | `threadmask` performs the shuffle on all threads in the warp. 118 | """ shfl_up_sync 119 | 120 | @doc """ 121 | shfl_down_sync(val, delta::Integer, width::Integer=32, threadmask::UInt32=0xffffffff) 122 | 123 | Shuffle a value from a lane with higher ID relative to caller. The default value for 124 | `threadmask` performs the shuffle on all threads in the warp. 
125 | """ shfl_down_sync 126 | 127 | @doc """ 128 | shfl_xor_sync(val, mask::Integer, width::Integer=32, threadmask::UInt32=0xffffffff) 129 | 130 | Shuffle a value from a lane based on bitwise XOR of own lane ID with `mask`. The default 131 | value for `threadmask` performs the shuffle on all threads in the warp. 132 | """ shfl_xor_sync 133 | -------------------------------------------------------------------------------- /src/device/array.jl: -------------------------------------------------------------------------------- 1 | # Contiguous on-device arrays 2 | 3 | export 4 | CuDeviceArray, CuDeviceVector, CuDeviceMatrix, CuBoundsError, ldg 5 | 6 | 7 | ## construction 8 | 9 | """ 10 | CuDeviceArray(dims, ptr) 11 | CuDeviceArray{T}(dims, ptr) 12 | CuDeviceArray{T,A}(dims, ptr) 13 | CuDeviceArray{T,A,N}(dims, ptr) 14 | 15 | Construct an `N`-dimensional dense CUDA device array with element type `T` wrapping a 16 | pointer, where `N` is determined from the length of `dims` and `T` is determined from the 17 | type of `ptr`. `dims` may be a single scalar, or a tuple of integers corresponding to the 18 | lengths in each dimension). If the rank `N` is supplied explicitly as in `Array{T,N}(dims)`, 19 | then it must match the length of `dims`. The same applies to the element type `T`, which 20 | should match the type of the pointer `ptr`. 21 | """ 22 | CuDeviceArray 23 | 24 | # NOTE: we can't support the typical `tuple or series of integer` style construction, 25 | # because we're currently requiring a trailing pointer argument. 26 | 27 | struct CuDeviceArray{T,N,A} <: AbstractArray{T,N} 28 | shape::Dims{N} 29 | ptr::DevicePtr{T,A} 30 | 31 | # inner constructors, fully parameterized, exact types (ie. Int not <:Integer) 32 | CuDeviceArray{T,N,A}(shape::Dims{N}, ptr::DevicePtr{T,A}) where {T,A,N} = new(shape,ptr) 33 | end 34 | 35 | const CuDeviceVector = CuDeviceArray{T,1,A} where {T,A} 36 | const CuDeviceMatrix = CuDeviceArray{T,2,A} where {T,A} 37 | 38 | # outer constructors, non-parameterized 39 | CuDeviceArray(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p) 40 | CuDeviceArray(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p) 41 | 42 | # outer constructors, partially parameterized 43 | CuDeviceArray{T}(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p) 44 | CuDeviceArray{T}(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p) 45 | CuDeviceArray{T,N}(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p) 46 | CuDeviceVector{T}(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p) 47 | 48 | # outer constructors, fully parameterized 49 | CuDeviceArray{T,N,A}(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(Int.(dims), p) 50 | CuDeviceVector{T,A}(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((Int(len),), p) 51 | 52 | 53 | ## getters 54 | 55 | Base.pointer(a::CuDeviceArray) = a.ptr 56 | 57 | Base.size(g::CuDeviceArray) = g.shape 58 | Base.length(g::CuDeviceArray) = prod(g.shape) 59 | 60 | 61 | ## conversions 62 | 63 | Base.unsafe_convert(::Type{DevicePtr{T,A}}, a::CuDeviceArray{T,N,A}) where {T,A,N} = pointer(a) 64 | 65 | 66 | ## indexing 67 | 68 | # TODO: arrays as allocated by the CUDA APIs are 256-byte aligned. we should keep track of 69 | # this information, because it enables optimizations like Load Store Vectorization 70 | # (cfr. 
shared memory and its wider-than-datatype alignment) 71 | 72 | @inline function Base.getindex(A::CuDeviceArray{T}, index::Integer) where {T} 73 | @boundscheck checkbounds(A, index) 74 | align = datatype_align(T) 75 | Base.unsafe_load(pointer(A), index, Val(align))::T 76 | end 77 | 78 | @inline function Base.setindex!(A::CuDeviceArray{T}, x, index::Integer) where {T} 79 | @boundscheck checkbounds(A, index) 80 | align = datatype_align(T) 81 | Base.unsafe_store!(pointer(A), x, index, Val(align)) 82 | end 83 | 84 | """ 85 | ldg(A, i) 86 | 87 | Index the array `A` with the linear index `i`, but loads the value through the read-only 88 | texture cache for improved cache behavior. You should make sure the array `A`, or any 89 | aliased instance, is not written to for the duration of the current kernel. 90 | 91 | This function can only be used on devices with compute capability 3.5 or higher. 92 | 93 | See also: [`Base.getindex`](@ref) 94 | """ 95 | @inline function ldg(A::CuDeviceArray{T}, index::Integer) where {T} 96 | # FIXME: this only works on sm_35+, but we can't verify that for now 97 | @boundscheck checkbounds(A, index) 98 | align = datatype_align(T) 99 | unsafe_cached_load(pointer(A), index, Val(align))::T 100 | end 101 | 102 | Base.IndexStyle(::Type{<:CuDeviceArray}) = Base.IndexLinear() 103 | 104 | 105 | ## other 106 | 107 | Base.show(io::IO, a::CuDeviceVector) = 108 | print(io, "$(length(a))-element device array at $(pointer(a))") 109 | Base.show(io::IO, a::CuDeviceArray) = 110 | print(io, "$(join(a.shape, '×')) device array at $(pointer(a))") 111 | 112 | Base.show(io::IO, mime::MIME"text/plain", a::CuDeviceArray) = show(io, a) 113 | 114 | @inline function Base.unsafe_view(A::CuDeviceVector{T}, I::Vararg{Base.ViewIndex,1}) where {T} 115 | ptr = pointer(A) + (I[1].start-1)*sizeof(T) 116 | len = I[1].stop - I[1].start + 1 117 | return CuDeviceArray(len, ptr) 118 | end 119 | 120 | @inline function Base.iterate(A::CuDeviceArray, i=1) 121 | if (i % UInt) - 1 < length(A) 122 | (@inbounds A[i], i + 1) 123 | else 124 | nothing 125 | end 126 | end 127 | -------------------------------------------------------------------------------- /src/compiler/rtlib.jl: -------------------------------------------------------------------------------- 1 | # compiler support for working with run-time libraries 2 | 3 | function link_library!(ctx::CompilerContext, mod::LLVM.Module, lib::LLVM.Module) 4 | # linking is destructive, so copy the library 5 | lib = LLVM.Module(lib) 6 | 7 | # save list of external functions 8 | exports = String[] 9 | for f in functions(mod) 10 | fn = LLVM.name(f) 11 | if !haskey(functions(lib), fn) 12 | push!(exports, fn) 13 | end 14 | end 15 | 16 | link!(mod, lib) 17 | 18 | ModulePassManager() do pm 19 | # internalize all functions that aren't exports 20 | internalize!(pm, exports) 21 | 22 | # eliminate all unused internal functions 23 | global_optimizer!(pm) 24 | global_dce!(pm) 25 | strip_dead_prototypes!(pm) 26 | 27 | run!(pm, mod) 28 | end 29 | end 30 | 31 | const libcache = Dict{String, LLVM.Module}() 32 | 33 | 34 | # 35 | # CUDA device library 36 | # 37 | 38 | function find_libdevice(cap) 39 | CUDAnative.configured || return 40 | global libdevice 41 | 42 | if isa(libdevice, Dict) 43 | # select the most recent & compatible library 44 | vers = keys(CUDAnative.libdevice) 45 | compat_vers = Set(ver for ver in vers if ver <= cap) 46 | isempty(compat_vers) && error("No compatible CUDA device library available") 47 | ver = maximum(compat_vers) 48 | path = libdevice[ver] 49 | else 50 | 
libdevice 51 | end 52 | end 53 | 54 | function load_libdevice(cap) 55 | path = find_libdevice(cap) 56 | 57 | get!(libcache, path) do 58 | open(path) do io 59 | parse(LLVM.Module, read(path), JuliaContext()) 60 | end 61 | end 62 | end 63 | 64 | function link_libdevice!(ctx::CompilerContext, mod::LLVM.Module, lib::LLVM.Module) 65 | # override libdevice's triple and datalayout to avoid warnings 66 | triple!(lib, triple(mod)) 67 | datalayout!(lib, datalayout(mod)) 68 | 69 | link_library!(ctx, mod, lib) 70 | 71 | ModulePassManager() do pm 72 | push!(metadata(mod), "nvvm-reflect-ftz", 73 | MDNode([ConstantInt(Int32(1), JuliaContext())])) 74 | # TODO: run the reflect pass? 75 | run!(pm, mod) 76 | end 77 | end 78 | 79 | 80 | # 81 | # CUDAnative run-time library 82 | # 83 | 84 | # remove existing runtime libraries globally, 85 | # so any change to CUDAnative triggers recompilation 86 | rm(joinpath(@__DIR__, "..", "..", "deps", "runtime"); recursive=true, force=true) 87 | 88 | 89 | ## higher-level functionality to work with runtime functions 90 | 91 | function LLVM.call!(builder, rt::Runtime.RuntimeMethodInstance, args=LLVM.Value[]) 92 | bb = position(builder) 93 | f = LLVM.parent(bb) 94 | mod = LLVM.parent(f) 95 | 96 | # get or create a function prototype 97 | if haskey(functions(mod), rt.llvm_name) 98 | f = functions(mod)[rt.llvm_name] 99 | ft = eltype(llvmtype(f)) 100 | else 101 | ft = LLVM.FunctionType(rt.llvm_return_type, rt.llvm_types) 102 | f = LLVM.Function(mod, rt.llvm_name, ft) 103 | end 104 | 105 | # runtime functions are written in Julia, while we're calling from LLVM, 106 | # this often results in argument type mismatches. try to fix some here. 107 | for (i,arg) in enumerate(args) 108 | if llvmtype(arg) != parameters(ft)[i] 109 | if (llvmtype(arg) isa LLVM.PointerType) && 110 | (parameters(ft)[i] isa LLVM.IntegerType) 111 | # Julia pointers are passed as integers 112 | args[i] = ptrtoint!(builder, args[i], parameters(ft)[i]) 113 | else 114 | error("Don't know how to convert ", arg, " argument to ", parameters(ft)[i]) 115 | end 116 | end 117 | end 118 | 119 | call!(builder, f, args) 120 | end 121 | 122 | 123 | ## functionality to build the runtime library 124 | 125 | function emit_function!(mod, cap, f, types, name) 126 | tt = Base.to_tuple_type(types) 127 | ctx = CompilerContext(f, tt, cap, #= kernel =# false) 128 | new_mod, entry = irgen(ctx) 129 | entry = optimize!(ctx, new_mod, entry) 130 | LLVM.name!(entry, name) 131 | 132 | link!(mod, new_mod) 133 | end 134 | 135 | function build_runtime(cap) 136 | mod = LLVM.Module("CUDAnative run-time library", JuliaContext()) 137 | 138 | for method in values(Runtime.methods) 139 | emit_function!(mod, cap, method.def, method.types, method.llvm_name) 140 | end 141 | 142 | mod 143 | end 144 | 145 | function load_runtime(cap) 146 | name = "cudanative.$(cap.major)$(cap.minor).bc" 147 | path = joinpath(@__DIR__, "..", "..", "deps", "runtime", name) 148 | mkpath(dirname(path)) 149 | 150 | get!(libcache, path) do 151 | if ispath(path) 152 | open(path) do io 153 | parse(LLVM.Module, read(io), JuliaContext()) 154 | end 155 | else 156 | @info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device, this might take a while..." 
157 | lib = build_runtime(cap) 158 | open(path, "w") do io 159 | write(io, lib) 160 | end 161 | lib 162 | end 163 | end 164 | end 165 | -------------------------------------------------------------------------------- /src/compiler/validation.jl: -------------------------------------------------------------------------------- 1 | # validation of properties and code 2 | 3 | function check_method(ctx::CompilerContext) 4 | # get the method 5 | ms = Base.methods(ctx.f, ctx.tt) 6 | isempty(ms) && throw(KernelError(ctx, "no method found")) 7 | length(ms)!=1 && throw(KernelError(ctx, "no unique matching method")) 8 | m = first(ms) 9 | 10 | # kernels can't return values 11 | if ctx.kernel 12 | rt = Base.return_types(ctx.f, ctx.tt)[1] 13 | if rt != Nothing 14 | throw(KernelError(ctx, "kernel returns a value of type `$rt`", 15 | """Make sure your kernel function ends in `return`, `return nothing` or `nothing`. 16 | If the returned value is of type `Union{}`, your Julia code probably throws an exception. 17 | Inspect the code with `@device_code_warntype` for more details.""")) 18 | end 19 | end 20 | 21 | return 22 | end 23 | 24 | function check_invocation(ctx::CompilerContext, entry::LLVM.Function) 25 | # make sure any non-isbits arguments are unused 26 | real_arg_i = 0 27 | sig = Base.signature_type(ctx.f, ctx.tt)::Type 28 | for (arg_i,dt) in enumerate(sig.parameters) 29 | isghosttype(dt) && continue 30 | real_arg_i += 1 31 | 32 | if !isbitstype(dt) 33 | param = parameters(entry)[real_arg_i] 34 | if !isempty(uses(param)) 35 | throw(KernelError(ctx, "passing and using non-bitstype argument", 36 | """Argument $arg_i to your kernel function is of type $dt. 37 | That type is not isbits, and such arguments are only allowed when they are unused by the kernel.""")) 38 | end 39 | end 40 | end 41 | 42 | return 43 | end 44 | 45 | 46 | ## IR validation 47 | 48 | const IRError = Tuple{String, StackTraces.StackTrace, Any} # kind, bt, meta 49 | 50 | struct InvalidIRError <: Exception 51 | ctx::CompilerContext 52 | errors::Vector{IRError} 53 | end 54 | 55 | const RUNTIME_FUNCTION = "call to the Julia runtime" 56 | const UNKNOWN_FUNCTION = "call to an unknown function" 57 | const POINTER_FUNCTION = "call through a literal pointer" 58 | 59 | function Base.showerror(io::IO, err::InvalidIRError) 60 | print(io, "InvalidIRError: compiling $(signature(err.ctx)) resulted in invalid LLVM IR") 61 | for (kind, bt, meta) in err.errors 62 | print(io, "\nReason: unsupported $kind") 63 | if meta != nothing 64 | if kind == RUNTIME_FUNCTION || kind == UNKNOWN_FUNCTION || kind == POINTER_FUNCTION 65 | print(io, " (call to ", meta, ")") 66 | end 67 | end 68 | Base.show_backtrace(io, bt) 69 | end 70 | return 71 | end 72 | 73 | function check_ir(ctx, args...) 74 | errors = check_ir!(ctx, IRError[], args...) 
75 | unique!(errors) 76 | if !isempty(errors) 77 | throw(InvalidIRError(ctx, errors)) 78 | end 79 | 80 | return 81 | end 82 | 83 | function check_ir!(ctx, errors::Vector{IRError}, mod::LLVM.Module) 84 | for f in functions(mod) 85 | check_ir!(ctx, errors, f) 86 | end 87 | 88 | return errors 89 | end 90 | 91 | function check_ir!(ctx, errors::Vector{IRError}, f::LLVM.Function) 92 | for bb in blocks(f), inst in instructions(bb) 93 | if isa(inst, LLVM.CallInst) 94 | check_ir!(ctx, errors, inst) 95 | end 96 | end 97 | 98 | return errors 99 | end 100 | 101 | const special_fns = ("vprintf", "__assertfail", "malloc", "free", "__nvvm_reflect") 102 | 103 | const libjulia = Ref{Ptr{Cvoid}}(C_NULL) 104 | 105 | function check_ir!(ctx, errors::Vector{IRError}, inst::LLVM.CallInst) 106 | dest = called_value(inst) 107 | if isa(dest, LLVM.Function) 108 | fn = LLVM.name(dest) 109 | 110 | # detect calls to undefined functions 111 | if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns) 112 | # figure out if the function lives in the Julia runtime library 113 | if libjulia[] == C_NULL 114 | paths = filter(Libdl.dllist()) do path 115 | name = splitdir(path)[2] 116 | startswith(name, "libjulia") 117 | end 118 | libjulia[] = Libdl.dlopen(first(paths)) 119 | end 120 | 121 | bt = backtrace(inst) 122 | if Libdl.dlsym_e(libjulia[], fn) != C_NULL 123 | push!(errors, (RUNTIME_FUNCTION, bt, LLVM.name(dest))) 124 | else 125 | push!(errors, (UNKNOWN_FUNCTION, bt, LLVM.name(dest))) 126 | end 127 | end 128 | elseif isa(dest, InlineAsm) 129 | # let's assume it's valid ASM 130 | elseif isa(dest, ConstantExpr) 131 | # detect calls to literal pointers 132 | # FIXME: can we detect these properly? 133 | # FIXME: jl_apply_generic and jl_invoke also have such arguments 134 | if occursin("inttoptr", string(dest)) 135 | # extract the literal pointer 136 | ptr_arg = first(operands(dest)) 137 | @compiler_assert isa(ptr_arg, ConstantInt) ctx 138 | ptr_val = convert(Int, ptr_arg) 139 | ptr = Ptr{Cvoid}(ptr_val) 140 | 141 | # look it up in the Julia JIT cache 142 | bt = backtrace(inst) 143 | frames = ccall(:jl_lookup_code_address, Any, (Ptr{Cvoid}, Cint,), ptr, 0) 144 | if length(frames) >= 1 145 | @compiler_assert length(frames) == 1 ctx frames=frames 146 | fn, file, line, linfo, fromC, inlined, ip = last(frames) 147 | push!(errors, (POINTER_FUNCTION, bt, fn)) 148 | else 149 | # the address could not be resolved to a Julia function; report it without a name 150 | push!(errors, (POINTER_FUNCTION, bt, nothing)) 151 | end 152 | end 153 | end 154 | 155 | return errors 156 | end 157 | -------------------------------------------------------------------------------- /deps/build.jl: -------------------------------------------------------------------------------- 1 | using CUDAapi 2 | using CUDAdrv 3 | using LLVM 4 | 5 | 6 | ## auxiliary routines 7 | 8 | function build_error(reason) 9 | println(""" 10 | $reason. 11 | 12 | This is not a fatal error, but GPU functionality will be unavailable. 13 | If you expected this to work, please open a thread on 14 | https://discourse.julialang.org/c/domain/gpu""") 15 | exit(1) 16 | end 17 | 18 | function llvm_support(version) 19 | @debug("Using LLVM $version") 20 | 21 | InitializeAllTargets() 22 | haskey(targets(), "nvptx") || 23 | build_error(""" 24 | Your LLVM does not support the NVPTX back-end.
25 | 26 | This is very strange; both the official binaries 27 | and an unmodified build should contain this back-end.""") 28 | 29 | target_support = sort(collect(CUDAapi.devices_for_llvm(version))) 30 | 31 | ptx_support = CUDAapi.isas_for_llvm(version) 32 | push!(ptx_support, v"6.0") # JuliaLang/julia#23817 33 | ptx_support = sort(collect(ptx_support)) 34 | 35 | @debug("LLVM support", targets=target_support, isas=ptx_support) 36 | return target_support, ptx_support 37 | end 38 | 39 | function cuda_support(driver_version, toolkit_version) 40 | @debug("Using CUDA driver $driver_version and toolkit $toolkit_version") 41 | 42 | # the toolkit version as reported contains major.minor.patch, 43 | # but the version number returned by libcuda is only major.minor. 44 | toolkit_version = VersionNumber(toolkit_version.major, toolkit_version.minor) 45 | if toolkit_version > driver_version 46 | build_error(""" 47 | CUDA $(toolkit_version.major).$(toolkit_version.minor) is not supported by 48 | your driver (which supports up to $(driver_version.major).$(driver_version.minor))""") 49 | end 50 | 51 | driver_target_support = CUDAapi.devices_for_cuda(driver_version) 52 | toolkit_target_support = CUDAapi.devices_for_cuda(toolkit_version) 53 | target_support = sort(collect(driver_target_support ∩ toolkit_target_support)) 54 | 55 | driver_ptx_support = CUDAapi.isas_for_cuda(driver_version) 56 | toolkit_ptx_support = CUDAapi.isas_for_cuda(toolkit_version) 57 | ptx_support = sort(collect(driver_ptx_support ∩ toolkit_ptx_support)) 58 | 59 | @debug("CUDA driver support", version=driver_version, 60 | targets=driver_target_support, isas=driver_ptx_support) 61 | @debug("CUDA toolkit support", version=toolkit_version, 62 | targets=toolkit_target_support, isas=toolkit_ptx_support) 63 | 64 | return target_support, ptx_support 65 | end 66 | 67 | 68 | ## main 69 | 70 | const config_path = joinpath(@__DIR__, "ext.jl") 71 | const previous_config_path = config_path * ".bak" 72 | 73 | function write_ext(config, path) 74 | open(path, "w") do io 75 | println(io, "# autogenerated file, do not edit") 76 | for (key,val) in config 77 | println(io, "const $key = $(repr(val))") 78 | end 79 | end 80 | end 81 | 82 | function read_ext(path) 83 | config = Dict{Symbol,Any}() 84 | r = r"^const (\w+) = (.+)$" 85 | open(path, "r") do io 86 | for line in eachline(io) 87 | m = match(r, line) 88 | if m != nothing 89 | config[Symbol(m.captures[1])] = eval(Meta.parse(m.captures[2])) 90 | end 91 | end 92 | end 93 | return config 94 | end 95 | 96 | function main() 97 | ispath(config_path) && mv(config_path, previous_config_path; force=true) 98 | config = Dict{Symbol,Any}(:configured => false) 99 | write_ext(config, config_path) 100 | 101 | 102 | ## gather info 103 | 104 | ### LLVM.jl 105 | 106 | LLVM.configured || build_error("Dependent package LLVM.jl has not been built successfully") 107 | 108 | LLVM.libllvm_system && build_error("CUDAnative.jl requires LLVM.jl to be built against Julia's LLVM library, not a system-provided one") 109 | 110 | llvm_version = LLVM.version() 111 | llvm_targets, llvm_isas = llvm_support(llvm_version) 112 | 113 | ### julia 114 | 115 | julia_llvm_version = Base.libllvm_version 116 | if julia_llvm_version != llvm_version 117 | build_error("LLVM $llvm_version incompatible with Julia's LLVM $julia_llvm_version") 118 | end 119 | 120 | ### CUDA 121 | 122 | CUDAdrv.configured || build_error("Dependent package CUDAdrv.jl has not been built successfully") 123 | 124 | toolkit_dirs = find_toolkit() 125 | cuda_toolkit_version = 
find_toolkit_version(toolkit_dirs) 126 | 127 | config[:cuda_driver_version] = CUDAdrv.version() 128 | cuda_targets, cuda_isas = cuda_support(config[:cuda_driver_version], cuda_toolkit_version) 129 | 130 | config[:target_support] = sort(collect(llvm_targets ∩ cuda_targets)) 131 | isempty(config[:target_support]) && build_error("Your toolchain does not support any device target") 132 | 133 | config[:ptx_support] = sort(collect(llvm_isas ∩ cuda_isas)) 134 | isempty(config[:ptx_support]) && build_error("Your toolchain does not support any PTX ISA") 135 | 136 | @debug("CUDAnative support", targets=config[:target_support], isas=config[:ptx_support]) 137 | 138 | # discover other CUDA toolkit artifacts 139 | ## required 140 | config[:libdevice] = find_libdevice(config[:target_support], toolkit_dirs) 141 | config[:libdevice] == nothing && build_error("Available CUDA toolchain does not provide libdevice") 142 | ## optional 143 | config[:nvdisasm] = find_cuda_binary("nvdisasm", toolkit_dirs) 144 | config[:ptxas] = find_cuda_binary("ptxas", toolkit_dirs) 145 | 146 | config[:configured] = true 147 | 148 | 149 | ## (re)generate ext.jl 150 | 151 | if isfile(previous_config_path) 152 | previous_config = read_ext(previous_config_path) 153 | 154 | if config == previous_config 155 | mv(previous_config_path, config_path; force=true) 156 | return 157 | end 158 | end 159 | 160 | write_ext(config, config_path) 161 | 162 | return 163 | end 164 | 165 | main() 166 | -------------------------------------------------------------------------------- /src/device/runtime_intrinsics.jl: -------------------------------------------------------------------------------- 1 | # CUDAnative run-time library 2 | # 3 | # This module defines method instances that will be compiled into a device-specific image 4 | # and will be available to the CUDAnative compiler to call after Julia has generated code. 5 | 6 | module Runtime 7 | 8 | using ..CUDAnative 9 | using LLVM 10 | using LLVM.Interop 11 | 12 | 13 | ## representation of a runtime method instance 14 | 15 | struct RuntimeMethodInstance 16 | def::Function 17 | 18 | return_type::Type 19 | types::Tuple 20 | name::Symbol 21 | 22 | # LLVM types cannot be cached, so we can't put them in the runtime method instance. 23 | # the actual types are constructed upon accessing them, based on a sentinel value: 24 | # - nothing: construct the LLVM type based on its Julia counterparts 25 | # - function: call this generator to get the type (when more control is needed) 26 | llvm_return_type::Union{Nothing, Function} 27 | llvm_types::Union{Nothing, Function} 28 | llvm_name::String 29 | end 30 | 31 | function Base.getproperty(rt::RuntimeMethodInstance, field::Symbol) 32 | value = getfield(rt, field) 33 | if field == :llvm_types 34 | if value == nothing 35 | LLVMType[convert.(LLVMType, typ) for typ in rt.types] 36 | else 37 | value() 38 | end 39 | elseif field == :llvm_return_type 40 | if value == nothing 41 | convert.(LLVMType, rt.return_type) 42 | else 43 | value() 44 | end 45 | else 46 | return value 47 | end 48 | end 49 | 50 | const methods = Dict{Symbol,RuntimeMethodInstance}() 51 | get(name::Symbol) = methods[name] 52 | 53 | # Register a Julia function `def` as a runtime library function identified by `name`. The 54 | # function will be compiled upon first use for argument types `types` and should return 55 | # `return_type`. Use `Runtime.get(name)` to get a reference to this method instance. 
56 | # 57 | # The corresponding LLVM types `llvm_types` and `llvm_return_type` will be deduced from 58 | # their Julia counterparts. To influence that conversion, pass a callable object instead; 59 | # this object will be evaluated at run-time and the returned value will be used instead. 60 | # 61 | # When generating multiple runtime functions from a single definition, make sure to specify 62 | # different values for `name`. The LLVM function name will be deduced from that name, but 63 | # you can always specify `llvm_name` to influence that. Never use an LLVM name that starts 64 | # with `julia_` or the function might clash with other compiled functions. 65 | function compile(def, return_type, types, llvm_return_type=nothing, llvm_types=nothing; 66 | name=typeof(def).name.mt.name, llvm_name="ptx_$name") 67 | meth = RuntimeMethodInstance(def, 68 | return_type, types, name, 69 | llvm_return_type, llvm_types, llvm_name) 70 | if haskey(methods, name) 71 | error("Runtime function $name has already been registered!") 72 | end 73 | methods[name] = meth 74 | meth 75 | end 76 | 77 | 78 | ## exception handling 79 | 80 | function report_exception(ex) 81 | @cuprintf(""" 82 | ERROR: a %s was thrown during kernel execution. 83 | Run Julia on debug level 2 for device stack traces. 84 | """, ex) 85 | return 86 | end 87 | 88 | compile(report_exception, Nothing, (Ptr{Cchar},)) 89 | 90 | function report_exception_name(ex) 91 | @cuprintf(""" 92 | ERROR: a %s was thrown during kernel execution. 93 | Stacktrace: 94 | """, ex) 95 | return 96 | end 97 | 98 | function report_exception_frame(idx, func, file, line) 99 | @cuprintf(" [%i] %s at %s:%i\n", idx, func, file, line) 100 | return 101 | end 102 | 103 | compile(report_exception_frame, Nothing, (Cint, Ptr{Cchar}, Ptr{Cchar}, Cint)) 104 | compile(report_exception_name, Nothing, (Ptr{Cchar},)) 105 | 106 | # NOTE: no throw functions are provided here, but replaced by an LLVM pass instead 107 | # in order to provide some debug information without stack unwinding. 
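# As a (hypothetical) illustration of the registration mechanism above, a custom runtime
# method would be defined and registered just like `report_exception`:
#
#     function report_oom()            # name made up for illustration purposes
#         @cuprintf("ERROR: out of GPU memory\n")
#         return
#     end
#     compile(report_oom, Nothing, ())
#
# The compiler can then fetch the instance with `Runtime.get(:report_oom)` and emit a call
# to it (cfr. `LLVM.call!` in `compiler/rtlib.jl`).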
108 | 109 | 110 | ## GC 111 | 112 | @enum AddressSpace begin 113 | Generic = 1 114 | Tracked = 10 115 | Derived = 11 116 | CalleeRooted = 12 117 | Loaded = 13 118 | end 119 | 120 | # LLVM type of a tracked pointer 121 | function T_prjlvalue() 122 | T_pjlvalue = convert(LLVMType, Any, true) 123 | LLVM.PointerType(eltype(T_pjlvalue), Tracked) 124 | end 125 | 126 | function gc_pool_alloc(sz::Csize_t) 127 | ptr = malloc(sz) 128 | return unsafe_pointer_to_objref(ptr) 129 | end 130 | 131 | compile(gc_pool_alloc, Any, (Csize_t,), T_prjlvalue) 132 | 133 | 134 | ## boxing and unboxing 135 | 136 | const tag_type = UInt 137 | const tag_size = sizeof(tag_type) 138 | 139 | const gc_bits = 0x3 # FIXME 140 | 141 | # get the type tag of a type at run-time 142 | @generated function type_tag(::Val{type_name}) where type_name 143 | T_tag = convert(LLVMType, tag_type) 144 | T_ptag = LLVM.PointerType(T_tag) 145 | 146 | T_pjlvalue = convert(LLVMType, Any, true) 147 | 148 | # create function 149 | llvm_f, _ = create_function(T_tag) 150 | mod = LLVM.parent(llvm_f) 151 | 152 | # this isn't really a function, but we abuse it to get the JIT to resolve the address 153 | typ = LLVM.Function(mod, "jl_" * String(type_name) * "_type", 154 | LLVM.FunctionType(T_pjlvalue)) 155 | 156 | # generate IR 157 | Builder(JuliaContext()) do builder 158 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 159 | position!(builder, entry) 160 | 161 | typ_var = bitcast!(builder, typ, T_ptag) 162 | 163 | tag = load!(builder, typ_var) 164 | 165 | ret!(builder, tag) 166 | end 167 | 168 | call_function(llvm_f, tag_type) 169 | end 170 | 171 | # we use `jl_value_ptr`, a Julia pseudo-intrinsic that can be used to box and unbox values 172 | 173 | @generated function box(val, ::Val{type_name}) where type_name 174 | sz = sizeof(val) 175 | allocsz = sz + tag_size 176 | 177 | # type-tags are ephemeral, so look them up at run time 178 | #tag = unsafe_load(convert(Ptr{tag_type}, type_name)) 179 | tag = :( type_tag(Val(type_name)) ) 180 | 181 | quote 182 | Base.@_inline_meta 183 | 184 | ptr = malloc($(Csize_t(allocsz))) 185 | 186 | # store the type tag 187 | ptr = convert(Ptr{tag_type}, ptr) 188 | Core.Intrinsics.pointerset(ptr, $tag | $gc_bits, #=index=# 1, #=align=# $tag_size) 189 | 190 | # store the value 191 | ptr = convert(Ptr{$val}, ptr+tag_size) 192 | Core.Intrinsics.pointerset(ptr, val, #=index=# 1, #=align=# $sz) 193 | 194 | unsafe_pointer_to_objref(ptr) 195 | end 196 | end 197 | 198 | @inline function unbox(obj, ::Type{T}) where T 199 | ptr = ccall(:jl_value_ptr, Ptr{Cvoid}, (Any,), obj) 200 | 201 | # load the value 202 | ptr = convert(Ptr{T}, ptr) 203 | Core.Intrinsics.pointerref(ptr, #=index=# 1, #=align=# sizeof(T)) 204 | end 205 | 206 | # generate functions functions that exist in the Julia runtime (see julia/src/datatype.c) 207 | for (T, t) in [Int8 => :int8, Int16 => :int16, Int32 => :int32, Int64 => :int64, 208 | UInt8 => :uint8, UInt16 => :uint16, UInt32 => :uint32, UInt64 => :uint64] 209 | box_fn = Symbol("box_$t") 210 | unbox_fn = Symbol("unbox_$t") 211 | @eval begin 212 | $box_fn(val) = box($T(val), Val($(QuoteNode(t)))) 213 | $unbox_fn(obj) = unbox(obj, $T) 214 | 215 | compile($box_fn, Any, ($T,), T_prjlvalue; llvm_name=$"jl_$box_fn") 216 | compile($unbox_fn, $T, (Any,); llvm_name=$"jl_$unbox_fn") 217 | end 218 | end 219 | 220 | 221 | end 222 | -------------------------------------------------------------------------------- /src/device/pointer.jl: -------------------------------------------------------------------------------- 1 
| # Pointers with address space information 2 | 3 | # 4 | # Address spaces 5 | # 6 | 7 | export AS, addrspace 8 | 9 | abstract type AddressSpace end 10 | 11 | module AS 12 | 13 | import ..AddressSpace 14 | 15 | struct Generic <: AddressSpace end 16 | struct Global <: AddressSpace end 17 | struct Shared <: AddressSpace end 18 | struct Constant <: AddressSpace end 19 | struct Local <: AddressSpace end 20 | 21 | end 22 | 23 | 24 | # 25 | # Device pointer 26 | # 27 | 28 | """ 29 | DevicePtr{T,A} 30 | 31 | A memory address that refers to data of type `T` that is accessible from the GPU. It is the 32 | on-device counterpart of `CUDAdrv.CuPtr`, additionally keeping track of the address space 33 | `A` where the data resides (shared, global, constant, etc). This information is used to 34 | provide optimized implementations of operations such as `unsafe_load` and `unsafe_store!.` 35 | """ 36 | DevicePtr 37 | 38 | if sizeof(Ptr{Cvoid}) == 8 39 | primitive type DevicePtr{T,A} 64 end 40 | else 41 | primitive type DevicePtr{T,A} 32 end 42 | end 43 | 44 | # constructors 45 | DevicePtr{T,A}(x::Union{Int,UInt,CuPtr,DevicePtr}) where {T,A<:AddressSpace} = Base.bitcast(DevicePtr{T,A}, x) 46 | DevicePtr{T}(ptr::CuPtr{T}) where {T} = DevicePtr{T,AS.Generic}(ptr) 47 | DevicePtr(ptr::CuPtr{T}) where {T} = DevicePtr{T,AS.Generic}(ptr) 48 | 49 | 50 | ## getters 51 | 52 | Base.eltype(::Type{<:DevicePtr{T}}) where {T} = T 53 | 54 | addrspace(x::DevicePtr) = addrspace(typeof(x)) 55 | addrspace(::Type{DevicePtr{T,A}}) where {T,A} = A 56 | 57 | 58 | ## conversions 59 | 60 | # to and from integers 61 | ## pointer to integer 62 | Base.convert(::Type{T}, x::DevicePtr) where {T<:Integer} = T(UInt(x)) 63 | ## integer to pointer 64 | Base.convert(::Type{DevicePtr{T,A}}, x::Union{Int,UInt}) where {T,A<:AddressSpace} = DevicePtr{T,A}(x) 65 | Int(x::DevicePtr) = Base.bitcast(Int, x) 66 | UInt(x::DevicePtr) = Base.bitcast(UInt, x) 67 | 68 | # between host and device pointers 69 | Base.convert(::Type{CuPtr{T}}, p::DevicePtr) where {T} = Base.bitcast(CuPtr{T}, p) 70 | Base.convert(::Type{DevicePtr{T,A}}, p::CuPtr) where {T,A<:AddressSpace} = Base.bitcast(DevicePtr{T,A}, p) 71 | Base.convert(::Type{DevicePtr{T}}, p::CuPtr) where {T} = Base.bitcast(DevicePtr{T,AS.Generic}, p) 72 | 73 | # between device pointers 74 | Base.convert(::Type{<:DevicePtr}, p::DevicePtr) = throw(ArgumentError("cannot convert between incompatible device pointer types")) 75 | Base.convert(::Type{DevicePtr{T,A}}, p::DevicePtr{T,A}) where {T,A} = p 76 | Base.unsafe_convert(::Type{DevicePtr{T,A}}, p::DevicePtr) where {T,A} = Base.bitcast(DevicePtr{T,A}, p) 77 | ## identical addrspaces 78 | Base.convert(::Type{DevicePtr{T,A}}, p::DevicePtr{U,A}) where {T,U,A} = Base.unsafe_convert(DevicePtr{T,A}, p) 79 | ## convert to & from generic 80 | Base.convert(::Type{DevicePtr{T,AS.Generic}}, p::DevicePtr) where {T} = Base.unsafe_convert(DevicePtr{T,AS.Generic}, p) 81 | Base.convert(::Type{DevicePtr{T,A}}, p::DevicePtr{U,AS.Generic}) where {T,U,A} = Base.unsafe_convert(DevicePtr{T,A}, p) 82 | Base.convert(::Type{DevicePtr{T,AS.Generic}}, p::DevicePtr{T,AS.Generic}) where {T} = p # avoid ambiguities 83 | ## unspecified, preserve source addrspace 84 | Base.convert(::Type{DevicePtr{T}}, p::DevicePtr{U,A}) where {T,U,A} = Base.unsafe_convert(DevicePtr{T,A}, p) 85 | 86 | # defer conversions to DevicePtr to unsafe_convert 87 | Base.cconvert(::Type{<:DevicePtr}, x) = x 88 | 89 | 90 | ## limited pointer arithmetic & comparison 91 | 92 | isequal(x::DevicePtr, y::DevicePtr) = (x === 
y) && addrspace(x) == addrspace(y) 93 | isless(x::DevicePtr{T,A}, y::DevicePtr{T,A}) where {T,A<:AddressSpace} = x < y 94 | 95 | Base.:(==)(x::DevicePtr, y::DevicePtr) = UInt(x) == UInt(y) && addrspace(x) == addrspace(y) 96 | Base.:(<)(x::DevicePtr, y::DevicePtr) = UInt(x) < UInt(y) 97 | Base.:(-)(x::DevicePtr, y::DevicePtr) = UInt(x) - UInt(y) 98 | 99 | Base.:(+)(x::DevicePtr, y::Integer) = oftype(x, Base.add_ptr(UInt(x), (y % UInt) % UInt)) 100 | Base.:(-)(x::DevicePtr, y::Integer) = oftype(x, Base.sub_ptr(UInt(x), (y % UInt) % UInt)) 101 | Base.:(+)(x::Integer, y::DevicePtr) = y + x 102 | 103 | 104 | 105 | ## memory operations 106 | 107 | Base.convert(::Type{Int}, ::Type{AS.Generic}) = 0 108 | Base.convert(::Type{Int}, ::Type{AS.Global}) = 1 109 | Base.convert(::Type{Int}, ::Type{AS.Shared}) = 3 110 | Base.convert(::Type{Int}, ::Type{AS.Constant}) = 4 111 | Base.convert(::Type{Int}, ::Type{AS.Local}) = 5 112 | 113 | function tbaa_make_child(name::String, constant::Bool=false; ctx::LLVM.Context=JuliaContext()) 114 | tbaa_root = MDNode([MDString("ptxtbaa", ctx)], ctx) 115 | tbaa_struct_type = 116 | MDNode([MDString("ptxtbaa_$name", ctx), 117 | tbaa_root, 118 | LLVM.ConstantInt(0, ctx)], ctx) 119 | tbaa_access_tag = 120 | MDNode([tbaa_struct_type, 121 | tbaa_struct_type, 122 | LLVM.ConstantInt(0, ctx), 123 | LLVM.ConstantInt(constant ? 1 : 0, ctx)], ctx) 124 | 125 | return tbaa_access_tag 126 | end 127 | 128 | tbaa_addrspace(as::Type{<:AddressSpace}) = tbaa_make_child(lowercase(String(as.name.name))) 129 | 130 | @generated function Base.unsafe_load(p::DevicePtr{T,A}, i::Integer=1, 131 | ::Val{align}=Val(1)) where {T,A,align} 132 | eltyp = convert(LLVMType, T) 133 | 134 | T_int = convert(LLVMType, Int) 135 | T_ptr = convert(LLVMType, DevicePtr{T,A}) 136 | 137 | T_actual_ptr = LLVM.PointerType(eltyp) 138 | 139 | # create a function 140 | param_types = [T_ptr, T_int] 141 | llvm_f, _ = create_function(eltyp, param_types) 142 | 143 | # generate IR 144 | Builder(JuliaContext()) do builder 145 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 146 | position!(builder, entry) 147 | 148 | ptr = inttoptr!(builder, parameters(llvm_f)[1], T_actual_ptr) 149 | 150 | ptr = gep!(builder, ptr, [parameters(llvm_f)[2]]) 151 | ptr_with_as = addrspacecast!(builder, ptr, LLVM.PointerType(eltyp, convert(Int, A))) 152 | ld = load!(builder, ptr_with_as) 153 | 154 | if A != AS.Generic 155 | metadata(ld)[LLVM.MD_tbaa] = tbaa_addrspace(A) 156 | end 157 | alignment!(ld, align) 158 | 159 | ret!(builder, ld) 160 | end 161 | 162 | call_function(llvm_f, T, Tuple{DevicePtr{T,A}, Int}, :((p, Int(i-one(i))))) 163 | end 164 | 165 | @generated function Base.unsafe_store!(p::DevicePtr{T,A}, x, i::Integer=1, 166 | ::Val{align}=Val(1)) where {T,A,align} 167 | eltyp = convert(LLVMType, T) 168 | 169 | T_int = convert(LLVMType, Int) 170 | T_ptr = convert(LLVMType, DevicePtr{T,A}) 171 | 172 | T_actual_ptr = LLVM.PointerType(eltyp) 173 | 174 | # create a function 175 | param_types = [T_ptr, eltyp, T_int] 176 | llvm_f, _ = create_function(LLVM.VoidType(JuliaContext()), param_types) 177 | 178 | # generate IR 179 | Builder(JuliaContext()) do builder 180 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 181 | position!(builder, entry) 182 | 183 | ptr = inttoptr!(builder, parameters(llvm_f)[1], T_actual_ptr) 184 | 185 | ptr = gep!(builder, ptr, [parameters(llvm_f)[3]]) 186 | ptr_with_as = addrspacecast!(builder, ptr, LLVM.PointerType(eltyp, convert(Int, A))) 187 | val = parameters(llvm_f)[2] 188 | st = store!(builder, val, 
ptr_with_as) 189 | 190 | if A != AS.Generic 191 | metadata(st)[LLVM.MD_tbaa] = tbaa_addrspace(A) 192 | end 193 | alignment!(st, align) 194 | 195 | ret!(builder) 196 | end 197 | 198 | call_function(llvm_f, Cvoid, Tuple{DevicePtr{T,A}, T, Int}, 199 | :((p, convert(T,x), Int(i-one(i))))) 200 | end 201 | 202 | ## loading through the texture cache 203 | 204 | export unsafe_cached_load 205 | 206 | # NOTE: CUDA 8.0 supports more caching modifiers, but those aren't supported by LLVM yet 207 | 208 | # TODO: this functionality should throw Nothing, 6 | :i8 => Int8, 7 | :i16 => Int16, 8 | :i32 => Int32, 9 | :i64 => Int64, 10 | :float => Float32, 11 | :double => Float64 12 | ) 13 | 14 | # Decode an expression of the form: 15 | # 16 | # function(arg::arg_type, arg::arg_type, ... arg::arg_type)::return_type 17 | # 18 | # Returns a tuple containing the function name, a vector of argument, a vector of argument 19 | # types and the return type (all in symbolic form). 20 | function decode_call(e) 21 | @assert e.head == :(::) 22 | 23 | # decode the return type expression: single symbol (the LLVM type), or a tuple of 2 24 | # symbols (the LLVM and corresponding Julia type) 25 | retspec = e.args[2] 26 | if isa(retspec, Symbol) 27 | rettype = retspec 28 | else 29 | @assert retspec.head == :tuple 30 | @assert length(retspec.args) == 2 31 | rettype = (retspec.args[1], retspec.args[2]) 32 | end 33 | 34 | call = e.args[1] 35 | @assert call.head == :call 36 | 37 | fn = Symbol(call.args[1]) 38 | args = Symbol[arg.args[1] for arg in call.args[2:end]] 39 | argtypes = Symbol[arg.args[2] for arg in call.args[2:end]] 40 | 41 | return fn, args, argtypes, rettype 42 | end 43 | 44 | # Generate a `llvmcall` statement calling an intrinsic specified as follows: 45 | # 46 | # intrinsic(arg::arg_type, arg::arg_type, ... arg::arg_type)::return_type [attr] 47 | # 48 | # The argument types should be valid LLVM type identifiers (eg. i32, float, double). 49 | # Conversions to the corresponding Julia type are automatically generated; make sure the 50 | # actual arguments are of the same type to make these conversions no-ops. The optional 51 | # argument `attr` indicates which LLVM function attributes (such as `readnone` or `nounwind`) 52 | # to add to the intrinsic declaration. 53 | 54 | # For example, the following call: 55 | # `@wrap __some_intrinsic(x::float, y::double)::float` 56 | # 57 | # will yield the following `llvmcall`: 58 | # ``` 59 | # Base.llvmcall(("declare float @__somme__intr(float, double)", 60 | # "%3 = call float @__somme__intr(float %0, double %1) 61 | # ret float %3"), 62 | # Float32, Tuple{Float32,Float64}, 63 | # convert(Float32,x), convert(Float64,y)) 64 | # ``` 65 | macro wrap(call, attrs="") 66 | intrinsic, args, argtypes, rettype = decode_call(call) 67 | 68 | # decide on intrinsic return type 69 | if isa(rettype, Symbol) 70 | # only LLVM return type specified, match against known LLVM/Julia type combinations 71 | llvm_ret_typ = rettype 72 | julia_ret_typ = jltypes[rettype] 73 | else 74 | # both specified (for when there is a mismatch, eg. 
i32 -> UInt32) 75 | llvm_ret_typ = rettype[1] 76 | julia_ret_typ = rettype[2] 77 | end 78 | 79 | llvm_args = String["%$i" for i in 0:length(argtypes)] 80 | if llvm_ret_typ == :void 81 | llvm_ret_asgn = "" 82 | llvm_ret = "void" 83 | else 84 | llvm_ret_var = "%$(length(argtypes)+1)" 85 | llvm_ret_asgn = "$llvm_ret_var = " 86 | llvm_ret = "$llvm_ret_typ $llvm_ret_var" 87 | end 88 | llvm_declargs = join(argtypes, ", ") 89 | llvm_defargs = join(("$t $arg" for (t,arg) in zip(argtypes, llvm_args)), ", ") 90 | 91 | julia_argtypes = (jltypes[t] for t in argtypes) 92 | julia_args = (:(convert($argtype, $(esc(arg)))) for (arg, argtype) in zip(args, julia_argtypes)) 93 | 94 | dest = ("""declare $llvm_ret_typ @$intrinsic($llvm_declargs)""", 95 | """$llvm_ret_asgn call $llvm_ret_typ @$intrinsic($llvm_defargs) 96 | ret $llvm_ret""") 97 | return quote 98 | Base.llvmcall($dest, $julia_ret_typ, Tuple{$(julia_argtypes...)}, $(julia_args...)) 99 | end 100 | end 101 | 102 | 103 | # julia.h: jl_datatype_align 104 | Base.@pure function datatype_align(::Type{T}) where {T} 105 | # typedef struct { 106 | # uint32_t nfields; 107 | # uint32_t alignment : 9; 108 | # uint32_t haspadding : 1; 109 | # uint32_t npointers : 20; 110 | # uint32_t fielddesc_type : 2; 111 | # } jl_datatype_layout_t; 112 | field = T.layout + sizeof(UInt32) 113 | unsafe_load(convert(Ptr{UInt16}, field)) & convert(Int16, 2^9-1) 114 | end 115 | 116 | 117 | # generalization of word-based primitives 118 | 119 | ## extract bits from a larger value 120 | @inline function extract_word(val, ::Val{i}) where {i} 121 | extract_value(val, UInt32, Val(32*(i-1))) 122 | end 123 | @generated function extract_value(val, ::Type{sub}, ::Val{offset}) where {sub, offset} 124 | T_val = convert(LLVMType, val) 125 | T_sub = convert(LLVMType, sub) 126 | 127 | bytes = Core.sizeof(val) 128 | T_int = LLVM.IntType(8*bytes, JuliaContext()) 129 | 130 | # create function 131 | llvm_f, _ = create_function(T_sub, [T_val]) 132 | mod = LLVM.parent(llvm_f) 133 | 134 | # generate IR 135 | Builder(JuliaContext()) do builder 136 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 137 | position!(builder, entry) 138 | 139 | equiv = bitcast!(builder, parameters(llvm_f)[1], T_int) 140 | shifted = lshr!(builder, equiv, LLVM.ConstantInt(T_int, offset)) 141 | # extracted = and!(builder, shifted, 2^32-1) 142 | extracted = trunc!(builder, shifted, T_sub) 143 | 144 | ret!(builder, extracted) 145 | end 146 | 147 | call_function(llvm_f, UInt32, Tuple{val}, :( (val,) )) 148 | end 149 | 150 | ## insert bits into a larger value 151 | @inline function insert_word(val, word::UInt32, ::Val{i}) where {i} 152 | insert_value(val, word, Val(32*(i-1))) 153 | end 154 | @generated function insert_value(val, sub, ::Val{offset}) where {offset} 155 | T_val = convert(LLVMType, val) 156 | T_sub = convert(LLVMType, sub) 157 | 158 | bytes = Core.sizeof(val) 159 | T_out_int = LLVM.IntType(8*bytes, JuliaContext()) 160 | 161 | # create function 162 | llvm_f, _ = create_function(T_val, [T_val, T_sub]) 163 | mod = LLVM.parent(llvm_f) 164 | 165 | # generate IR 166 | Builder(JuliaContext()) do builder 167 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 168 | position!(builder, entry) 169 | 170 | equiv = bitcast!(builder, parameters(llvm_f)[1], T_out_int) 171 | ext = zext!(builder, parameters(llvm_f)[2], T_out_int) 172 | shifted = shl!(builder, ext, LLVM.ConstantInt(T_out_int, offset)) 173 | inserted = or!(builder, equiv, shifted) 174 | orig = bitcast!(builder, inserted, T_val) 175 | 176 | ret!(builder, orig) 177 
| end 178 | 179 | call_function(llvm_f, val, Tuple{val, sub}, :( (val, sub) )) 180 | end 181 | 182 | # split the invocation of a function `op` on a value `val` with non-struct eltype 183 | # into multiple smaller invocations on byte-sized partial values. 184 | @generated function split_value_invocation(op::Function, val, args...) 185 | # TODO: control of lower-limit 186 | 187 | ex = quote 188 | Base.@_inline_meta 189 | end 190 | 191 | # disassemble into words 192 | words = Symbol[] 193 | for i in 1:Core.sizeof(val)÷4 194 | word = Symbol("word$i") 195 | push!(ex.args, :( $word = extract_word(val, Val($i)) )) 196 | push!(words, word) 197 | end 198 | 199 | # perform the operation 200 | for word in words 201 | push!(ex.args, :( $word = op($word, args...)) ) 202 | end 203 | 204 | # reassemble 205 | push!(ex.args, :( out = zero(val) )) 206 | for (i,word) in enumerate(words) 207 | push!(ex.args, :( out = insert_word(out, $word, Val($i)) )) 208 | end 209 | 210 | push!(ex.args, :( out )) 211 | return ex 212 | end 213 | 214 | # split the invocation of a function `op` on a value `val` 215 | # by invoking the function on each of its fields 216 | @generated function recurse_value_invocation(op::Function, val, args...) 217 | ex = quote 218 | Base.@_inline_meta 219 | end 220 | 221 | fields = fieldnames(val) 222 | if isempty(fields) 223 | push!(ex.args, :( split_value_invocation(op, val, args...) )) 224 | else 225 | ctor = Expr(:new, val) 226 | for field in fields 227 | push!(ctor.args, :( 228 | recurse_value_invocation(op, getfield(val, $(QuoteNode(field))), args...) )) 229 | end 230 | push!(ex.args, ctor) 231 | end 232 | 233 | return ex 234 | end 235 | 236 | # split the invocation of a function `op` on a pointer `ptr` with non-struct eltype 237 | # into multiple smaller invocations on any supported pointer as listed in `supported_ptrs`. 238 | @generated function split_pointer_invocation(op::Function, ptr, ::Type{supported_ptrs}, 239 | args...) where {supported_ptrs} 240 | T = eltype(ptr) 241 | elsize(x) = Core.sizeof(eltype(x)) 242 | supported_ptrs = reverse(Base.uniontypes(supported_ptrs)) 243 | 244 | ex = quote 245 | Base.@_inline_meta 246 | end 247 | 248 | # disassemble 249 | vals = Tuple{Symbol,Int,Type}[] 250 | offset = 0 251 | while offset < Core.sizeof(T) 252 | val = Symbol("value.$(length(vals)+1)") 253 | 254 | # greedy selection of next pointer type 255 | remaining = Core.sizeof(T)-offset 256 | valid = filter(ptr->elsize(ptr)<=remaining, supported_ptrs) 257 | if isempty(valid) 258 | error("Cannot partition $T into values of $supported_typs") 259 | end 260 | ptr = first(sort(collect(valid); by=elsize, rev=true)) 261 | 262 | push!(vals, (val, offset, ptr)) 263 | offset += elsize(ptr) 264 | end 265 | 266 | # perform the operation 267 | for (val, offset, ptr) in vals 268 | subptr = :(convert($ptr, ptr+$offset)) 269 | push!(ex.args, :( $val = op($subptr, args...)) ) 270 | end 271 | 272 | # reassemble 273 | push!(ex.args, :( out = zero($T) )) 274 | for (val, offset, ptr) in vals 275 | push!(ex.args, :( out = insert_value(out, $val, Val($offset)) )) 276 | end 277 | 278 | push!(ex.args, :( out )) 279 | return ex 280 | end 281 | 282 | # split the invocation of a function `op` on a pointer `ptr` 283 | # by invoking the function on a pointer to each of its fields 284 | @generated function recurse_pointer_invocation(op::Function, ptr, ::Type{supported_ptrs}, 285 | args...) 
where {supported_ptrs} 286 | T = eltype(ptr) 287 | 288 | ex = quote 289 | Base.@_inline_meta 290 | end 291 | 292 | fields = fieldnames(T) 293 | if isempty(fields) 294 | push!(ex.args, :( split_pointer_invocation(op, ptr, supported_ptrs, args...) )) 295 | else 296 | ctor = Expr(:new, T) 297 | for (i,field) in enumerate(fields) 298 | field_typ = fieldtype(T, i) 299 | field_offset = fieldoffset(T, i) 300 | field_ptr_typ = :($(ptr.name.wrapper){$field_typ}) 301 | # NOTE: this ctor is a leap of faith 302 | subptr = :(convert($field_ptr_typ, ptr+$field_offset)) 303 | push!(ctor.args, :( 304 | recurse_pointer_invocation(op, $subptr, supported_ptrs, args...) )) 305 | end 306 | push!(ex.args, ctor) 307 | end 308 | 309 | return ex 310 | end 311 | -------------------------------------------------------------------------------- /src/execution.jl: -------------------------------------------------------------------------------- 1 | # Native execution support 2 | 3 | export @cuda, cudaconvert, cufunction, nearest_warpsize 4 | 5 | 6 | ## kernel object and query functions 7 | 8 | struct Kernel{F,TT} 9 | ctx::CuContext 10 | mod::CuModule 11 | fun::CuFunction 12 | end 13 | 14 | """ 15 | version(k::Kernel) 16 | 17 | Queries the PTX and SM versions a kernel was compiled for. 18 | Returns a named tuple. 19 | """ 20 | function version(k::Kernel) 21 | attr = attributes(k.fun) 22 | binary_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_BINARY_VERSION],10)...) 23 | ptx_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_PTX_VERSION],10)...) 24 | return (ptx=ptx_ver, binary=binary_ver) 25 | end 26 | 27 | """ 28 | memory(k::Kernel) 29 | 30 | Queries the local, shared and constant memory usage of a compiled kernel in bytes. 31 | Returns a named tuple. 32 | """ 33 | function memory(k::Kernel) 34 | attr = attributes(k.fun) 35 | local_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES] 36 | shared_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_SHARED_SIZE_BYTES] 37 | constant_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_CONST_SIZE_BYTES] 38 | return (:local=>local_mem, shared=shared_mem, constant=constant_mem) 39 | end 40 | 41 | """ 42 | registers(k::Kernel) 43 | 44 | Queries the register usage of a kernel. 45 | """ 46 | function registers(k::Kernel) 47 | attr = attributes(k.fun) 48 | return attr[CUDAdrv.FUNC_ATTRIBUTE_NUM_REGS] 49 | end 50 | 51 | """ 52 | maxthreads(k::Kernel) 53 | 54 | Queries the maximum amount of threads a kernel can use in a single block. 
55 | """ 56 | function maxthreads(k::Kernel) 57 | attr = attributes(k.fun) 58 | return attr[CUDAdrv.FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK] 59 | end 60 | 61 | 62 | ## helper functions 63 | 64 | # split keyword arguments to `@cuda` into ones affecting the compiler, or the execution 65 | function split_kwargs(kwargs) 66 | compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs] 67 | call_kws = [:blocks, :threads, :shmem, :stream] 68 | compiler_kwargs = [] 69 | call_kwargs = [] 70 | for kwarg in kwargs 71 | if Meta.isexpr(kwarg, :(=)) 72 | key,val = kwarg.args 73 | if isa(key, Symbol) 74 | if key in compiler_kws 75 | push!(compiler_kwargs, kwarg) 76 | elseif key in call_kws 77 | push!(call_kwargs, kwarg) 78 | else 79 | throw(ArgumentError("unknown keyword argument '$key'")) 80 | end 81 | else 82 | throw(ArgumentError("non-symbolic keyword '$key'")) 83 | end 84 | else 85 | throw(ArgumentError("non-keyword argument like option '$kwarg'")) 86 | end 87 | end 88 | 89 | return compiler_kwargs, call_kwargs 90 | end 91 | 92 | # assign arguments to variables, handle splatting 93 | function assign_args!(code, args) 94 | # handle splatting 95 | splats = map(arg -> Meta.isexpr(arg, :(...)), args) 96 | args = map(args, splats) do arg, splat 97 | splat ? arg.args[1] : arg 98 | end 99 | 100 | # assign arguments to variables 101 | vars = Tuple(gensym() for arg in args) 102 | map(vars, args) do var,arg 103 | push!(code.args, :($var = $(esc(arg)))) 104 | end 105 | 106 | # convert the arguments, compile the function and call the kernel 107 | # while keeping the original arguments alive 108 | var_exprs = map(vars, args, splats) do var, arg, splat 109 | splat ? Expr(:(...), var) : var 110 | end 111 | 112 | return vars, var_exprs 113 | end 114 | 115 | # fast lookup of global world age 116 | world_age() = ccall(:jl_get_tls_world_age, UInt, ()) 117 | 118 | # slow lookup of local method age 119 | function method_age(f, tt)::UInt 120 | for m in Base._methods(f, tt, 1, typemax(UInt)) 121 | return m[3].min_world 122 | end 123 | throw(MethodError(f, tt)) 124 | end 125 | 126 | 127 | ## adaptors 128 | 129 | struct Adaptor end 130 | 131 | # convert CUDAdrv pointers to CUDAnative pointers 132 | Adapt.adapt_storage(to::Adaptor, p::CuPtr{T}) where {T} = DevicePtr{T,AS.Generic}(p) 133 | 134 | # Base.RefValue isn't GPU compatible, so provide a compatible alternative 135 | struct CuRefValue{T} <: Ref{T} 136 | x::T 137 | end 138 | Base.getindex(r::CuRefValue) = r.x 139 | Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[])) 140 | 141 | # convenience function 142 | """ 143 | cudaconvert(x) 144 | 145 | This function is called for every argument to be passed to a kernel, allowing it to be 146 | converted to a GPU-friendly format. By default, the function does nothing and returns the 147 | input object `x` as-is. 148 | 149 | Do not add methods to this function, but instead extend the underlying Adapt.jl package and 150 | register methods for the the `CUDAnative.Adaptor` type. 151 | """ 152 | cudaconvert(arg) = adapt(Adaptor(), arg) 153 | 154 | 155 | ## high-level @cuda interface 156 | 157 | """ 158 | @cuda [kwargs...] func(args...) 159 | 160 | High-level interface for executing code on a GPU. The `@cuda` macro should prefix a call, 161 | with `func` a callable function or object that should return nothing. It will be compiled to 162 | a CUDA function upon first use, and to a certain extent arguments will be converted and 163 | managed automatically using `cudaconvert`. 
Finally, a call to `CUDAdrv.cudacall` is 164 | performed, scheduling a kernel launch on the current CUDA context. 165 | 166 | Several keyword arguments are supported that influence kernel compilation and execution. For 167 | more information, refer to the documentation of [`cufunction`](@ref) and 168 | [`CUDAnative.Kernel`](@ref), respectively. 169 | 170 | The underlying operations (argument conversion, kernel compilation, kernel call) can be 171 | performed explicitly when more control is needed, e.g. to reflect on the resource usage of a 172 | kernel to determine the launch configuration: 173 | 174 | args = ... 175 | GC.@preserve args begin 176 | kernel_args = cudaconvert.(args) 177 | kernel_tt = Tuple{Core.Typeof.(kernel_args)...} 178 | kernel = cufunction(f, kernel_tt; compilation_kwargs) 179 | kernel(kernel_args...; launch_kwargs) 180 | end 181 | """ 182 | macro cuda(ex...) 183 | # destructure the `@cuda` expression 184 | if length(ex) > 0 && ex[1].head == :tuple 185 | error("The tuple argument to @cuda has been replaced by keywords: `@cuda threads=... fun(args...)`") 186 | end 187 | call = ex[end] 188 | kwargs = ex[1:end-1] 189 | 190 | # destructure the kernel call 191 | if call.head != :call 192 | throw(ArgumentError("final argument to @cuda should be a function call")) 193 | end 194 | f = call.args[1] 195 | args = call.args[2:end] 196 | 197 | code = quote end 198 | compiler_kwargs, call_kwargs = split_kwargs(kwargs) 199 | vars, var_exprs = assign_args!(code, args) 200 | 201 | # convert the arguments, call the compiler and launch the kernel 202 | # while keeping the original arguments alive 203 | push!(code.args, 204 | quote 205 | GC.@preserve $(vars...) begin 206 | local kernel_args = cudaconvert.(($(var_exprs...),)) 207 | local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} 208 | local kernel = cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) 209 | kernel(kernel_args...; $(map(esc, call_kwargs)...)) 210 | end 211 | end) 212 | return code 213 | end 214 | 215 | 216 | ## APIs for manual compilation 217 | 218 | const agecache = Dict{UInt, UInt}() 219 | const compilecache = Dict{UInt, Kernel}() 220 | 221 | """ 222 | cufunction(f, tt=Tuple{}; kwargs...) 223 | 224 | Low-level interface to compile a function invocation for the currently-active GPU, returning 225 | a callable kernel object. For a higher-level interface, use [`@cuda`](@ref). 226 | 227 | The following keyword arguments are supported: 228 | - minthreads: the required number of threads in a thread block. 229 | - maxthreads: the maximum number of threads in a thread block. 230 | - blocks_per_sm: a minimum number of thread blocks to be scheduled on a single 231 | multiprocessor. 232 | - maxregs: the maximum number of registers to be allocated to a single thread (only 233 | supported on LLVM 4.0+) 234 | 235 | The output of this function is automatically cached, i.e. you can simply call `cufunction` 236 | in a hot path without degrading performance. New code will be generated automatically when 237 | the function changes, or when different types or keyword arguments are provided. 238 | """ 239 | @generated function cufunction(f::Core.Function, tt::Type=Tuple{}; kwargs...)
240 | tt = Base.to_tuple_type(tt.parameters[1]) 241 | sig = Base.signature_type(f, tt) 242 | t = Tuple(tt.parameters) 243 | 244 | precomp_key = hash(sig) # precomputable part of the keys 245 | quote 246 | Base.@_inline_meta 247 | 248 | CUDAnative.maybe_initialize("cufunction") 249 | 250 | # look-up the method age 251 | key = hash(world_age(), $precomp_key) 252 | if haskey(agecache, key) 253 | age = agecache[key] 254 | else 255 | age = method_age(f, $t) 256 | agecache[key] = age 257 | end 258 | 259 | # compile the function 260 | ctx = CuCurrentContext() 261 | key = hash(age, $precomp_key) 262 | key = hash(ctx, key) 263 | key = hash(kwargs, key) 264 | for nf in 1:nfields(f) 265 | # mix in the values of any captured variable 266 | key = hash(getfield(f, nf), key) 267 | end 268 | if !haskey(compilecache, key) 269 | fun, mod = compile(device(ctx), f, tt; kwargs...) 270 | kernel = Kernel{f,tt}(ctx, mod, fun) 271 | @debug begin 272 | ver = version(kernel) 273 | mem = memory(kernel) 274 | reg = registers(kernel) 275 | """Compiled $f to PTX $(ver.ptx) for SM $(ver.binary) using $reg registers. 276 | Memory usage: $(Base.format_bytes(mem.local)) local, $(Base.format_bytes(mem.shared)) shared, $(Base.format_bytes(mem.constant)) constant""" 277 | end 278 | compilecache[key] = kernel 279 | end 280 | 281 | return compilecache[key]::Kernel{f,tt} 282 | end 283 | end 284 | 285 | @generated function (kernel::Kernel{F,TT})(args...; call_kwargs...) where {F,TT} 286 | sig = Base.signature_type(F, TT) 287 | args = (:F, (:( args[$i] ) for i in 1:length(args))...) 288 | 289 | # filter out ghost arguments that shouldn't be passed 290 | to_pass = map(!isghosttype, sig.parameters) 291 | call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] 292 | call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]] 293 | 294 | # replace non-isbits arguments (they should be unused, or compilation would have failed) 295 | # alternatively, make CUDAdrv allow `launch` with non-isbits arguments. 296 | for (i,dt) in enumerate(call_t) 297 | if !isbitstype(dt) 298 | call_t[i] = Ptr{Any} 299 | call_args[i] = :C_NULL 300 | end 301 | end 302 | 303 | # finalize types 304 | call_tt = Base.to_tuple_type(call_t) 305 | 306 | quote 307 | Base.@_inline_meta 308 | 309 | cudacall(kernel.fun, $call_tt, $(call_args...); call_kwargs...) 310 | end 311 | end 312 | 313 | # There doesn't seem to be a way to access the documentation for the call-syntax, 314 | # so attach it to the type 315 | """ 316 | (::Kernel)(args...; kwargs...) 317 | 318 | Low-level interface to call a compiled kernel, passing GPU-compatible arguments in `args`. 319 | For a higher-level interface, use [`@cuda`](@ref). 320 | 321 | The following keyword arguments are supported: 322 | - threads (defaults to 1) 323 | - blocks (defaults to 1) 324 | - shmem (defaults to 0) 325 | - stream (defaults to the default stream) 326 | """ 327 | Kernel 328 | 329 | ## other 330 | 331 | """ 332 | nearest_warpsize(dev::CuDevice, threads::Integer) 333 | 334 | Return the nearest number of threads that is a multiple of the warp size of a device. 335 | 336 | This is a common requirement, eg. when using shuffle intrinsics. 
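For example, on a device with a warp size of 32, `nearest_warpsize(dev, 60)` returns 64, while values that already are a multiple of the warp size are returned unchanged.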
337 | """ 338 | function nearest_warpsize(dev::CuDevice, threads::Integer) 339 | ws = CUDAdrv.warpsize(dev) 340 | return threads + (ws - threads % ws) % ws 341 | end 342 | -------------------------------------------------------------------------------- /examples/blackscholes.jl: -------------------------------------------------------------------------------- 1 | using CUDAapi, CUDAdrv, CUDAnative, CuArrays 2 | 3 | CUDAnative.initialize() 4 | const dev = device() 5 | const cap = capability(dev) 6 | 7 | using BenchmarkTools 8 | BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10 9 | BenchmarkTools.DEFAULT_PARAMETERS.gcsample = true 10 | 11 | using SpecialFunctions 12 | 13 | 14 | ## scalar CPU version 15 | 16 | @inline cndf2(in::Float32) = 0.5f0 + 0.5f0 * erf(0.707106781f0 * in) 17 | 18 | function blackscholes_cpu(sptprice::Float32, strike::Float32, rate::Float32, 19 | volatility::Float32, time::Float32) 20 | logterm = log10(sptprice / strike) 21 | powterm = .5f0 * volatility * volatility 22 | den = volatility * sqrt(time) 23 | d1 = (((rate + powterm) * time) + logterm) / den 24 | d2 = d1 - den 25 | NofXd1 = cndf2(d1) 26 | NofXd2 = cndf2(d2) 27 | futureValue = strike * exp(-rate * time) 28 | c1 = futureValue * NofXd2 29 | call = sptprice * NofXd1 - c1 30 | return call - futureValue + sptprice 31 | end 32 | 33 | 34 | ## vectorized CPU version 35 | 36 | @inline cndf2(in::AbstractArray{Float32}) = 0.5f0 .+ 0.5f0 .* erf.(0.707106781f0 .* in) 37 | 38 | function blackscholes_cpu(sptprice::AbstractArray{Float32}, 39 | strike::AbstractArray{Float32}, 40 | rate::AbstractArray{Float32}, 41 | volatility::AbstractArray{Float32}, 42 | time::AbstractArray{Float32}) 43 | logterm = log10.(sptprice ./ strike) 44 | powterm = .5f0 .* volatility .* volatility 45 | den = volatility .* sqrt.(time) 46 | d1 = (((rate .+ powterm) .* time) .+ logterm) ./ den 47 | d2 = d1 .- den 48 | NofXd1 = cndf2(d1) 49 | NofXd2 = cndf2(d2) 50 | futureValue = strike .* exp.(- rate .* time) 51 | c1 = futureValue .* NofXd2 52 | call = sptprice .* NofXd1 .- c1 53 | return call .- futureValue .+ sptprice 54 | end 55 | 56 | 57 | ## native CUDA version 58 | 59 | @inline cndf2_cuda(in::Float32) = 0.5f0 + 0.5f0 * CUDAnative.erf(0.707106781f0 * in) 60 | 61 | function blackscholes_kernel(sptprice::AbstractArray{Float32}, 62 | strike::AbstractArray{Float32}, 63 | rate::AbstractArray{Float32}, 64 | volatility::AbstractArray{Float32}, 65 | time::AbstractArray{Float32}, 66 | out::AbstractArray{Float32}) 67 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 68 | 69 | if i <= size(sptprice, 1) 70 | logterm = CUDAnative.log10(sptprice[i] / strike[i]) 71 | powterm = 0.5f0 * volatility[i] * volatility[i] 72 | den = volatility[i] * CUDAnative.sqrt(time[i]) 73 | d1 = (((rate[i] + powterm) * time[i]) + logterm) / den 74 | d2 = d1 - den 75 | NofXd1 = cndf2_cuda(d1) 76 | NofXd2 = cndf2_cuda(d2) 77 | futureValue = strike[i] * CUDAnative.exp(-rate[i] * time[i]) 78 | c1 = futureValue * NofXd2 79 | call = sptprice[i] * NofXd1 - c1 80 | out[i] = call - futureValue + sptprice[i] 81 | end 82 | 83 | return 84 | end 85 | 86 | 87 | ## scalar CuArrays version 88 | 89 | function blackscholes_gpu(sptprice::Float32, strike::Float32, rate::Float32, 90 | volatility::Float32, time::Float32) 91 | logterm = CUDAnative.log10(sptprice / strike) 92 | powterm = .5f0 * volatility * volatility 93 | den = volatility * CUDAnative.sqrt(time) 94 | d1 = (((rate + powterm) * time) + logterm) / den 95 | d2 = d1 - den 96 | NofXd1 = cndf2_cuda(d1) 97 | NofXd2 = cndf2_cuda(d2) 98 | futureValue 
= strike * CUDAnative.exp(-rate * time) 99 | c1 = futureValue * NofXd2 100 | call = sptprice * NofXd1 - c1 101 | return call - futureValue + sptprice 102 | end 103 | 104 | 105 | ## vectorized CuArrays version 106 | 107 | @inline cndf2_cuarr(in::AbstractArray{Float32}) = 0.5f0 .+ 0.5f0 .* CUDAnative.erf.(0.707106781f0 .* in) 108 | 109 | function blackscholes_gpu(sptprice::AbstractArray{Float32}, 110 | strike::AbstractArray{Float32}, 111 | rate::AbstractArray{Float32}, 112 | volatility::AbstractArray{Float32}, 113 | time::AbstractArray{Float32}) 114 | logterm = CUDAnative.log10.(sptprice ./ strike) 115 | powterm = .5f0 .* volatility .* volatility 116 | den = volatility .* CUDAnative.sqrt.(time) 117 | d1 = (((rate .+ powterm) .* time) .+ logterm) ./ den 118 | d2 = d1 .- den 119 | NofXd1 = cndf2_cuarr(d1) 120 | NofXd2 = cndf2_cuarr(d2) 121 | futureValue = strike .* CUDAnative.exp.(- rate .* time) 122 | c1 = futureValue .* NofXd2 123 | call = sptprice .* NofXd1 .- c1 124 | return call .- futureValue .+ sptprice 125 | end 126 | 127 | 128 | ## non-native CUDA C version 129 | 130 | const cuda_source = "$(tempname()).cu" 131 | const cuda_ptx = "$(tempname()).ptx" 132 | 133 | open(cuda_source, "w") do io 134 | print(io, """ 135 | extern "C" __global__ void blackscholes_kernel(const float *sptprice, 136 | const float *strike, 137 | const float *rate, 138 | const float *volatility, 139 | const float *time, 140 | float *out, 141 | size_t n) 142 | { 143 | int i = blockIdx.x * blockDim.x + threadIdx.x; 144 | if (i < n) { 145 | float logterm = log10(sptprice[i] / strike[i]); 146 | float powterm = 0.5 * volatility[i] * volatility[i]; 147 | float den = volatility[i] * sqrt(time[i]); 148 | float d1 = (((rate[i] + powterm) * time[i]) + logterm) / den; 149 | float d2 = d1 - den; 150 | float NofXd1 = 0.5 + 0.5 * erf(0.707106781 * d1); 151 | float NofXd2 = 0.5 + 0.5 * erf(0.707106781 * d2); 152 | float futureValue = strike[i] * exp(-rate[i] * time[i]); 153 | float c1 = futureValue * NofXd2; 154 | float call = sptprice[i] * NofXd1 - c1; 155 | out[i] = call - futureValue + sptprice[i]; 156 | } 157 | } 158 | """) 159 | end 160 | 161 | toolkit = CUDAapi.find_toolkit() 162 | nvcc = CUDAapi.find_cuda_binary("nvcc", toolkit) 163 | toolchain = CUDAapi.find_toolchain(toolkit) 164 | flags = `-ccbin=$(toolchain.host_compiler) -arch=sm_$(cap.major)$(cap.minor)` 165 | run(`$nvcc $flags -ptx -o $cuda_ptx $cuda_source`) 166 | 167 | const cuda_module = CuModuleFile(cuda_ptx) 168 | const cuda_function = CuFunction(cuda_module, "blackscholes_kernel") 169 | 170 | 171 | ## main 172 | 173 | function checksum(reference, result) 174 | reference_sum = sum(reference) 175 | result_sum = sum(result) 176 | diff = abs(1-reference_sum/result_sum) 177 | if diff>0.01 178 | warn("checksum failed: $result_sum instead of $reference_sum (relative difference: $diff)") 179 | println(stacktrace()) 180 | end 181 | end 182 | 183 | function main(iterations) 184 | sptprice = Float32[ 42.0 for i = 1:iterations ] 185 | strike = Float32[ 40.0 + (i / iterations) for i = 1:iterations ] 186 | rate = Float32[ 0.5 for i = 1:iterations ] 187 | volatility = Float32[ 0.2 for i = 1:iterations ] 188 | time = Float32[ 0.5 for i = 1:iterations ] 189 | 190 | timings = Dict() 191 | 192 | reference = blackscholes_cpu.(sptprice, strike, rate, volatility, time) 193 | 194 | let benchmark = @benchmarkable begin 195 | out .= blackscholes_cpu.($sptprice, $strike, $rate, 196 | $volatility, $time) 197 | end setup=( 198 | out = similar($strike) 199 | ) teardown=( 200 | 
checksum($reference, out) 201 | ) 202 | timings["Single-threaded (scalar)"] = run(benchmark) 203 | end 204 | 205 | let benchmark = @benchmarkable begin 206 | out = blackscholes_cpu($sptprice, $strike, $rate, 207 | $volatility, $time) 208 | end setup=( 209 | out = nothing 210 | ) teardown=( 211 | checksum($reference, out) 212 | ) 213 | timings["Single-threaded (vectorized)"] = run(benchmark) 214 | end 215 | 216 | let benchmark = @benchmarkable begin 217 | cudacall(cuda_function, 218 | Tuple{Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, 219 | Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Csize_t}, 220 | sptprice_dev, strike_dev, rate_dev, volatility_dev, 221 | time_dev, out, n; blocks=grid, threads=block) 222 | synchronize() 223 | end setup=( 224 | sptprice_dev = CuArray($sptprice); 225 | strike_dev = CuArray($strike); 226 | rate_dev = CuArray($rate); 227 | volatility_dev = CuArray($volatility); 228 | time_dev = CuArray($time); 229 | out = similar(strike_dev); 230 | 231 | n = size($sptprice, 1); 232 | block = min(n, 1024); 233 | grid = ceil(Integer, n/block) 234 | ) teardown=( 235 | checksum($reference, Array(out)) 236 | ) 237 | timings["CUDA C (kernel)"] = run(benchmark) 238 | end 239 | 240 | let benchmark = @benchmarkable begin 241 | @cuda blocks=grid threads=block blackscholes_kernel(sptprice_dev, strike_dev, rate_dev, 242 | volatility_dev, time_dev, out) 243 | synchronize() 244 | end setup=( 245 | sptprice_dev = CuArray($sptprice); 246 | strike_dev = CuArray($strike); 247 | rate_dev = CuArray($rate); 248 | volatility_dev = CuArray($volatility); 249 | time_dev = CuArray($time); 250 | out = similar(strike_dev); 251 | 252 | n = size($sptprice, 1); 253 | block = min(n, 1024); 254 | grid = ceil(Integer, n/block) 255 | ) teardown=( 256 | checksum($reference, Array(out)) 257 | ) 258 | timings["CUDAnative.jl (kernel)"] = run(benchmark) 259 | end 260 | 261 | let benchmark = @benchmarkable begin 262 | out .= blackscholes_gpu.(sptprice_dev, strike_dev, rate_dev, 263 | volatility_dev, time_dev) 264 | synchronize() 265 | end setup=( 266 | sptprice_dev = CuArray($sptprice); 267 | strike_dev = CuArray($strike); 268 | rate_dev = CuArray($rate); 269 | volatility_dev = CuArray($volatility); 270 | time_dev = CuArray($time); 271 | out = similar(strike_dev); 272 | ) teardown=( 273 | checksum($reference, Array(out)) 274 | ) 275 | timings["CuArrays.jl (scalar)"] = run(benchmark) 276 | end 277 | 278 | let benchmark = @benchmarkable begin 279 | out = blackscholes_gpu(sptprice_dev, strike_dev, rate_dev, 280 | volatility_dev, time_dev) 281 | synchronize() 282 | end setup=( 283 | sptprice_dev = CuArray($sptprice); 284 | strike_dev = CuArray($strike); 285 | rate_dev = CuArray($rate); 286 | volatility_dev = CuArray($volatility); 287 | time_dev = CuArray($time); 288 | out = nothing 289 | ) teardown=( 290 | checksum($reference, Array(out)) 291 | ) 292 | timings["CuArrays.jl (vectorized)"] = run(benchmark) 293 | end 294 | 295 | return timings 296 | end 297 | 298 | function main() 299 | iterations = 10^7 300 | timings = main(iterations) 301 | 302 | println() 303 | println("Timings:") 304 | for (test, trials) in timings 305 | println("* $test: ", BenchmarkTools.prettytime(time(trials))) 306 | end 307 | 308 | println() 309 | println("Rates:") 310 | for (test, trials) in timings 311 | println("* $test: ", 1e9*iterations/time(trials), " ops/sec") 312 | end 313 | end 314 | 315 | main() 316 | 317 | rm(cuda_source) 318 | rm(cuda_ptx) 319 | -------------------------------------------------------------------------------- /src/reflection.jl: 
-------------------------------------------------------------------------------- 1 | # code reflection entry-points 2 | 3 | using InteractiveUtils 4 | 5 | 6 | # 7 | # code_* replacements 8 | # 9 | 10 | # NOTE: these functions replicate parts of the main compiler driver in order to generate 11 | # more compact code (i.e. without the run-time library) and/or to support generating 12 | # otherwise invalid code (e.g. with missing symbols). 13 | 14 | """ 15 | code_llvm([io], f, types; optimize=true, cap::VersionNumber, kernel=false, 16 | dump_module=false, strip_ir_metadata=true) 17 | 18 | Prints the device LLVM IR generated for the method matching the given generic function and 19 | type signature to `io` which defaults to `stdout`. The IR is optimized according to 20 | `optimize` (defaults to true), which includes entry-point specific optimizations if `kernel` 21 | is set (defaults to false). The device capability `cap` to generate code for defaults to the 22 | current active device's capability, or v"2.0" if there is no such active context. The entire 23 | module, including headers and other functions, is dumped if `dump_module` is set (defaults 24 | to false). Finally, setting `strip_ir_metadata` removes all debug metadata (defaults to 25 | true). 26 | 27 | See also: [`@device_code_llvm`](@ref), [`InteractiveUtils.code_llvm`](@ref) 28 | """ 29 | function code_llvm(io::IO, @nospecialize(func::Core.Function), @nospecialize(types=Tuple); 30 | optimize::Bool=true, cap::VersionNumber=current_capability(), 31 | dump_module::Bool=false, strip_ir_metadata::Bool=true, 32 | kernel::Bool=false, kwargs...) 33 | tt = Base.to_tuple_type(types) 34 | ctx = CompilerContext(func, tt, cap, kernel; kwargs...) 35 | code_llvm(io, ctx; optimize=optimize, dump_module=dump_module, 36 | strip_ir_metadata=strip_ir_metadata) 37 | end 38 | function code_llvm(io::IO, ctx::CompilerContext; optimize::Bool=true, 39 | dump_module::Bool=false, strip_ir_metadata::Bool=true) 40 | check_method(ctx) 41 | mod, entry = irgen(ctx) 42 | if optimize 43 | entry = optimize!(ctx, mod, entry) 44 | end 45 | if strip_ir_metadata 46 | strip_debuginfo!(mod) 47 | end 48 | if dump_module 49 | show(io, mod) 50 | else 51 | show(io, entry) 52 | end 53 | end 54 | code_llvm(@nospecialize(func), @nospecialize(types=Tuple); kwargs...) = 55 | code_llvm(stdout, func, types; kwargs...) 56 | 57 | """ 58 | code_ptx([io], f, types; cap::VersionNumber, kernel=false, strip_ir_metadata=true) 59 | 60 | Prints the PTX assembly generated for the method matching the given generic function and 61 | type signature to `io` which defaults to `stdout`. The device capability `cap` to generate 62 | code for defaults to the current active device's capability, or v"2.0" if there is no such 63 | active context. The optional `kernel` parameter indicates whether the function in question 64 | is an entry-point function, or a regular device function. Finally, setting 65 | `strip_ir_metadata` removes all debug metadata (defaults to true). 66 | 67 | See also: [`@device_code_ptx`](@ref) 68 | """ 69 | function code_ptx(io::IO, @nospecialize(func::Core.Function), @nospecialize(types=Tuple); 70 | cap::VersionNumber=current_capability(), kernel::Bool=false, 71 | strip_ir_metadata::Bool=true, kwargs...) 72 | tt = Base.to_tuple_type(types) 73 | ctx = CompilerContext(func, tt, cap, kernel; kwargs...)
74 | code_ptx(io, ctx; strip_ir_metadata=strip_ir_metadata) 75 | end 76 | function code_ptx(io::IO, ctx::CompilerContext; strip_ir_metadata::Bool=true) 77 | check_method(ctx) 78 | mod, entry = irgen(ctx) 79 | entry = optimize!(ctx, mod, entry) 80 | if strip_ir_metadata 81 | strip_debuginfo!(mod) 82 | end 83 | prepare_execution!(ctx, mod) 84 | ptx = mcgen(ctx, mod, entry) 85 | print(io, ptx) 86 | end 87 | code_ptx(@nospecialize(func), @nospecialize(types=Tuple); kwargs...) = 88 | code_ptx(stdout, func, types; kwargs...) 89 | 90 | """ 91 | code_sass([io], f, types, cap::VersionNumber) 92 | 93 | Prints the SASS code generated for the method matching the given generic function and type 94 | signature to `io` which defaults to `stdout`. The device capability `cap` to generate code 95 | for defaults to the current active device's capability, or v"2.0" if there is no such active 96 | context. The method needs to be a valid entry-point kernel, eg. it should not return any 97 | values. 98 | 99 | See also: [`@device_code_sass`](@ref) 100 | """ 101 | function code_sass(io::IO, @nospecialize(func::Core.Function), @nospecialize(types=Tuple); 102 | cap::VersionNumber=current_capability(), kernel::Bool=true, kwargs...) 103 | tt = Base.to_tuple_type(types) 104 | ctx = CompilerContext(func, tt, cap, kernel; kwargs...) 105 | code_sass(io, ctx) 106 | end 107 | function code_sass(io::IO, ctx::CompilerContext) 108 | if !ctx.kernel 109 | error("Can only generate SASS code for kernel functions") 110 | end 111 | if ptxas === nothing || nvdisasm === nothing 112 | error("Your CUDA installation does not provide ptxas or nvdisasm, both of which are required for code_sass") 113 | end 114 | 115 | ptx,_ = compile(ctx) 116 | 117 | fn = tempname() 118 | gpu = "sm_$(ctx.cap.major)$(ctx.cap.minor)" 119 | # NOTE: this might not match what is being executed, due to the PTX->SASS conversion 120 | # by the driver possibly not matching what `ptxas` (part of the toolkit) does. 121 | # TODO: see how `nvvp` extracts SASS code when doing PC sampling, and copy that. 122 | Base.run(`$ptxas --gpu-name $gpu --output-file $fn --input-as-string $ptx`) 123 | try 124 | cmd = `$nvdisasm --print-code --print-line-info $fn` 125 | for line in readlines(cmd) 126 | # nvdisasm output is pretty verbose; 127 | # perform some clean-up and make it look like @code_native 128 | line = replace(line, r"/\*[0-9a-f]{4}\*/" => " ") # strip inst addr 129 | line = replace(line, r"^[ ]{30}" => " ") # reduce leading spaces 130 | line = replace(line, r"[\s+]//##" => ";") # change line info tag 131 | line = replace(line, r"^\." => "\n.") # break before new BBs 132 | line = replace(line, r"; File \"(.+?)\", line (\d+)" => s"; Location \1:\2") # rename line info 133 | println(io, line) 134 | end 135 | finally 136 | rm(fn) 137 | end 138 | end 139 | code_sass(@nospecialize(func), @nospecialize(types=Tuple); kwargs...) = 140 | code_sass(stdout, func, types; kwargs...) 141 | 142 | 143 | # 144 | # @device_code_* functions 145 | # 146 | 147 | export @device_code_lowered, @device_code_typed, @device_code_warntype, 148 | @device_code_llvm, @device_code_ptx, @device_code_sass, 149 | @device_code 150 | 151 | function emit_hooked_compilation(inner_hook, ex...) 
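    # NOTE: all leading arguments are user-supplied keyword options (e.g. an `io=...` expression) that get forwarded to `inner_hook`; the final argument is the expression to evaluate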
152 | user_code = ex[end] 153 | user_kwargs = ex[1:end-1] 154 | quote 155 | # wipe the compile cache to force recompilation 156 | empty!(CUDAnative.compilecache) 157 | 158 | local kernels = 0 159 | function outer_hook(ctx) 160 | kernels += 1 161 | $inner_hook(ctx; $(map(esc, user_kwargs)...)) 162 | end 163 | 164 | if CUDAnative.compile_hook[] != nothing 165 | error("Chaining multiple @device_code calls is unsupported") 166 | end 167 | try 168 | CUDAnative.compile_hook[] = outer_hook 169 | $(esc(user_code)) 170 | finally 171 | CUDAnative.compile_hook[] = nothing 172 | end 173 | 174 | if kernels == 0 175 | error("no kernels executed while evaluating the given expression") 176 | end 177 | 178 | nothing 179 | end 180 | end 181 | 182 | # NOTE: these hooks take both an `f` and an inner `f`, because of how `@cuda`/`_cuda` work: 183 | # kernels are automatically wrapped in a function returning nothing, for usability. 184 | # 185 | # Julia-level reflection (lowered/typed/warntype) skips these wrappers, because we 186 | # can't do call-site inlining and the kernel wrapper would hide any meaningful code. 187 | # 188 | # at the LLVM level, we inline everything so there's no need to hide the wrapper. 189 | 190 | """ 191 | @device_code_lowered ex 192 | 193 | Evaluates the expression `ex` and returns the result of 194 | [`InteractiveUtils.code_lowered`](@ref) for every compiled CUDA kernel. 195 | 196 | See also: [`InteractiveUtils.@code_lowered`](@ref) 197 | """ 198 | macro device_code_lowered(ex...) 199 | quote 200 | buf = Any[] 201 | function hook(ctx::CompilerContext) 202 | append!(buf, code_lowered(ctx.f, ctx.tt)) 203 | end 204 | $(emit_hooked_compilation(:hook, ex...)) 205 | buf 206 | end 207 | end 208 | 209 | """ 210 | @device_code_typed ex 211 | 212 | Evaluates the expression `ex` and returns the result of 213 | [`InteractiveUtils.code_typed`](@ref) for every compiled CUDA kernel. 214 | 215 | See also: [`InteractiveUtils.@code_typed`](@ref) 216 | """ 217 | macro device_code_typed(ex...) 218 | quote 219 | buf = Any[] 220 | function hook(ctx::CompilerContext) 221 | append!(buf, code_typed(ctx.f, ctx.tt)) 222 | end 223 | $(emit_hooked_compilation(:hook, ex...)) 224 | buf 225 | end 226 | end 227 | 228 | """ 229 | @device_code_warntype [io::IO=stdout] ex 230 | 231 | Evaluates the expression `ex` and prints the result of 232 | [`InteractiveUtils.code_warntype`](@ref) to `io` for every compiled CUDA kernel. 233 | 234 | See also: [`InteractiveUtils.@code_warntype`](@ref) 235 | """ 236 | macro device_code_warntype(ex...) 237 | function hook(ctx::CompilerContext; io::IO=stdout, kwargs...) 238 | code_warntype(io, ctx.f, ctx.tt; kwargs...) 239 | end 240 | emit_hooked_compilation(hook, ex...) 241 | end 242 | 243 | """ 244 | @device_code_llvm [io::IO=stdout, ...] ex 245 | 246 | Evaluates the expression `ex` and prints the result of [`InteractiveUtils.code_llvm`](@ref) 247 | to `io` for every compiled CUDA kernel. For other supported keywords, see 248 | [`CUDAnative.code_llvm`](@ref). 249 | 250 | See also: [`InteractiveUtils.@code_llvm`](@ref) 251 | """ 252 | macro device_code_llvm(ex...) 253 | hook(ctx::CompilerContext; io::IO=stdout, kwargs...) = code_llvm(io, ctx; kwargs...) 254 | emit_hooked_compilation(hook, ex...) 255 | end 256 | 257 | """ 258 | @device_code_ptx [io::IO=stdout, ...] ex 259 | 260 | Evaluates the expression `ex` and prints the result of [`CUDAnative.code_ptx`](@ref) to `io` 261 | for every compiled CUDA kernel. For other supported keywords, see 262 | [`CUDAnative.code_ptx`](@ref).
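For example, `@device_code_ptx io=devnull @cuda my_kernel(args...)` would compile and launch a hypothetical `my_kernel` while discarding the printed PTX.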
263 | """ 264 | macro device_code_ptx(ex...) 265 | hook(ctx::CompilerContext; io::IO=stdout, kwargs...) = code_ptx(io, ctx; kwargs...) 266 | emit_hooked_compilation(hook, ex...) 267 | end 268 | 269 | """ 270 | @device_code_sass [io::IO=stdout, ...] ex 271 | 272 | Evaluates the expression `ex` and prints the result of [`CUDAnative.code_sass`](@ref) to 273 | `io` for every compiled CUDA kernel. For other supported keywords, see 274 | [`CUDAnative.code_sass`](@ref). 275 | """ 276 | macro device_code_sass(ex...) 277 | hook(ctx::CompilerContext; io::IO=stdout, kwargs...) = code_sass(io, ctx; kwargs...) 278 | emit_hooked_compilation(hook, ex...) 279 | end 280 | 281 | """ 282 | @device_code dir::AbstractString=... [...] ex 283 | 284 | Evaluates the expression `ex` and dumps all intermediate forms of code to the directory 285 | `dir`. 286 | """ 287 | macro device_code(ex...) 288 | only(xs) = (@assert length(xs) == 1; first(xs)) 289 | function hook(ctx::CompilerContext; dir::AbstractString) 290 | fn = "$(typeof(ctx.f).name.mt.name)_$(globalUnique+1)" 291 | mkpath(dir) 292 | 293 | open(joinpath(dir, "$fn.lowered.jl"), "w") do io 294 | code = only(code_lowered(ctx.f, ctx.tt)) 295 | println(io, code) 296 | end 297 | 298 | open(joinpath(dir, "$fn.typed.jl"), "w") do io 299 | code = only(code_typed(ctx.f, ctx.tt)) 300 | println(io, code) 301 | end 302 | 303 | open(joinpath(dir, "$fn.unopt.ll"), "w") do io 304 | code_llvm(io, ctx; dump_module=true, strip_ir_metadata=false, optimize=false) 305 | end 306 | 307 | open(joinpath(dir, "$fn.opt.ll"), "w") do io 308 | code_llvm(io, ctx; dump_module=true, strip_ir_metadata=false) 309 | end 310 | 311 | open(joinpath(dir, "$fn.ptx"), "w") do io 312 | code_ptx(io, ctx) 313 | end 314 | 315 | open(joinpath(dir, "$fn.sass"), "w") do io 316 | code_sass(io, ctx) 317 | end 318 | end 319 | emit_hooked_compilation(hook, ex...) 320 | end 321 | --------------------------------------------------------------------------------