├── docs ├── .gitignore ├── Project.toml ├── src │ ├── lib │ │ ├── compilation.md │ │ ├── device │ │ │ ├── array.md │ │ │ ├── libdevice.md │ │ │ └── intrinsics.md │ │ └── reflection.md │ ├── index.md │ └── man │ │ ├── performance.md │ │ ├── troubleshooting.md │ │ ├── usage.md │ │ └── hacking.md └── make.jl ├── examples ├── reduce │ ├── .gitignore │ ├── verify.jl │ ├── benchmark.jl │ ├── reduce.cu │ └── reduce.jl ├── vadd.jl ├── hello_world.jl ├── oob.jl ├── multigpu.jl ├── peakflops.jl ├── scan.jl ├── pairwise.jl └── blackscholes.jl ├── deps ├── .gitignore └── build.jl ├── .gitignore ├── test ├── perf │ └── launch_overhead │ │ ├── .gitignore │ │ ├── cuda.cu │ │ ├── build.jl │ │ ├── cudanative.jl │ │ ├── cuda.jl │ │ ├── README.md │ │ └── cuda.c ├── examples.jl ├── base.jl ├── device │ ├── pointer.jl │ ├── codegen.jl │ └── array.jl ├── runtests.jl ├── pointer.jl └── util.jl ├── REQUIRE ├── bors.toml ├── codecov.yml ├── src ├── deprecated.jl ├── compiler.jl ├── utils.jl ├── device │ ├── cuda_intrinsics.jl │ ├── cuda_intrinsics │ │ ├── misc.jl │ │ ├── memory_dynamic.jl │ │ ├── warp_vote.jl │ │ ├── assertion.jl │ │ ├── indexing.jl │ │ ├── output.jl │ │ ├── synchronization.jl │ │ ├── memory_shared.jl │ │ └── warp_shuffle.jl │ ├── array.jl │ ├── runtime_intrinsics.jl │ ├── pointer.jl │ └── tools.jl ├── CUDAnative.jl ├── compiler │ ├── debug.jl │ ├── driver.jl │ ├── mcgen.jl │ ├── common.jl │ ├── rtlib.jl │ └── validation.jl ├── init.jl ├── execution.jl └── reflection.jl ├── Project.toml ├── LICENSE.md ├── README.md ├── .gitlab-ci.yml ├── res └── parse_libdevice.jl └── NEWS.md /docs/.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | -------------------------------------------------------------------------------- /examples/reduce/.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | *.ptx 3 | -------------------------------------------------------------------------------- /deps/.gitignore: -------------------------------------------------------------------------------- 1 | ext.jl 2 | ext.jl.bak 3 | build.log 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.jl.*.cov 2 | *.jl.mem 3 | Manifest.toml 4 | deps/runtime/ 5 | -------------------------------------------------------------------------------- /test/perf/launch_overhead/.gitignore: -------------------------------------------------------------------------------- 1 | cuda 2 | cuda.ptx 3 | *.nvvp 4 | 5 | -------------------------------------------------------------------------------- /REQUIRE: -------------------------------------------------------------------------------- 1 | julia 1.0 2 | CUDAdrv 1.1 3 | LLVM 0.9.14 4 | CUDAapi 0.4.0 5 | Adapt 0.4 6 | -------------------------------------------------------------------------------- /bors.toml: -------------------------------------------------------------------------------- 1 | status = [ 2 | "ci/gitlab/%" 3 | ] 4 | delete_merged_branches = true 5 | -------------------------------------------------------------------------------- /docs/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" 3 | Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" 4 | -------------------------------------------------------------------------------- 
/test/perf/launch_overhead/cuda.cu: -------------------------------------------------------------------------------- 1 | extern "C" __global__ void kernel_dummy(float *ptr) 2 | { 3 | ptr[blockIdx.x] = 0; 4 | } 5 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | ignore: 3 | - "deps/*" 4 | - "src/device/*" 5 | status: 6 | patch: false 7 | project: false 8 | changes: false 9 | comment: false 10 | -------------------------------------------------------------------------------- /src/deprecated.jl: -------------------------------------------------------------------------------- 1 | # Deprecated functionality 2 | 3 | macro profile(ex) 4 | Base.depwarn("`CUDAnative.@profile` is deprecated, use `CUDAdrv.@profile` instead", :profile) 5 | quote 6 | CUDAdrv.@profile begin 7 | $(esc(ex)) 8 | end 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /docs/src/lib/compilation.md: -------------------------------------------------------------------------------- 1 | # Compilation & Execution 2 | 3 | ```@docs 4 | CUDAnative.@cuda 5 | CUDAnative.cufunction 6 | CUDAnative.Kernel 7 | CUDAnative.compile 8 | CUDAnative.cudaconvert 9 | CUDAnative.nearest_warpsize 10 | ``` 11 | 12 | ## Devices 13 | 14 | ```@docs 15 | CUDAnative.device! 16 | ``` 17 | -------------------------------------------------------------------------------- /docs/src/lib/device/array.md: -------------------------------------------------------------------------------- 1 | # Arrays 2 | 3 | CUDAnative provides a primitive, lightweight array type to manage GPU data 4 | organized in a plain, dense fashion. This is the device counterpart to the 5 | `CuArray` from CuArrays.jl, and implements (part of) the array interface as well 6 | as other functionality for use _on_ the GPU: 7 | 8 | ```@docs 9 | CUDAnative.CuDeviceArray 10 | CUDAnative.ldg 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/src/lib/device/libdevice.md: -------------------------------------------------------------------------------- 1 | # libdevice 2 | 3 | CUDAnative.jl provides wrapper functions for the mathematical routines in `libdevice`, 4 | CUDA's device math library. Many of these functions offer an interface similar to 5 | their counterparts in `Base`, but it is currently impossible to transparently dispatch to 6 | these device functions. As a consequence, users should prefix calls to math functions (e.g. 7 | `sin` or `pow`) with the CUDAnative module name. 
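For illustration, a minimal kernel sketch (not part of the repository; the kernel and array names are made up) showing the module-prefixed math call described above:

```julia
using CUDAdrv, CUDAnative, CuArrays

function sin_kernel(x, y)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds y[i] = CUDAnative.sin(x[i])  # device-side sin from libdevice, not Base.sin
    return
end

x = CuArray(rand(Float32, 256))
y = similar(x)
@cuda threads=length(x) sin_kernel(x, y)
synchronize()
```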
8 | 9 | WIP 10 | -------------------------------------------------------------------------------- /test/perf/launch_overhead/build.jl: -------------------------------------------------------------------------------- 1 | using CUDAapi 2 | using CUDAdrv 3 | 4 | dev = CuDevice(0) 5 | cap = capability(dev) 6 | 7 | cd(@__DIR__) do 8 | toolkit = CUDAapi.find_toolkit() 9 | nvcc = CUDAapi.find_cuda_binary("nvcc", toolkit) 10 | toolchain = CUDAapi.find_toolchain(toolkit) 11 | flags = `-ccbin=$(toolchain.host_compiler) -arch=sm_$(cap.major)$(cap.minor)` 12 | run(`$nvcc $flags -ptx -o cuda.ptx cuda.cu`) 13 | run(`$nvcc $flags -lm -lcuda -o cuda cuda.c`) 14 | end 15 | -------------------------------------------------------------------------------- /examples/vadd.jl: -------------------------------------------------------------------------------- 1 | using CUDAdrv, CUDAnative, CuArrays 2 | 3 | using Test 4 | 5 | function vadd(a, b, c) 6 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 7 | c[i] = a[i] + b[i] 8 | return 9 | end 10 | 11 | dims = (3,4) 12 | a = round.(rand(Float32, dims) * 100) 13 | b = round.(rand(Float32, dims) * 100) 14 | 15 | d_a = CuArray(a) 16 | d_b = CuArray(b) 17 | d_c = similar(d_a) 18 | 19 | len = prod(dims) 20 | @cuda threads=len vadd(d_a, d_b, d_c) 21 | c = Array(d_c) 22 | @test a+b ≈ c 23 | -------------------------------------------------------------------------------- /examples/hello_world.jl: -------------------------------------------------------------------------------- 1 | using CUDAdrv, CUDAnative, CuArrays 2 | 3 | if Sys.iswindows() 4 | function hello_world() 5 | @cuprintf("Greetings from block %lld, thread %lld!\n", Int64(blockIdx().x), Int64(threadIdx().x)) 6 | return 7 | end 8 | else 9 | function hello_world() 10 | @cuprintf("Greetings from block %ld, thread %ld!\n", Int64(blockIdx().x), Int64(threadIdx().x)) 11 | return 12 | end 13 | end 14 | @cuda blocks=2 threads=2 hello_world() 15 | synchronize() 16 | -------------------------------------------------------------------------------- /examples/reduce/verify.jl: -------------------------------------------------------------------------------- 1 | using Test 2 | 3 | include("reduce.jl") 4 | 5 | if capability(device()) < v"3.0" 6 | @warn("this example requires a newer GPU") 7 | exit(0) 8 | end 9 | 10 | len = 10^7 11 | input = ones(Int32, len) 12 | 13 | # CPU 14 | cpu_val = reduce(+, input) 15 | 16 | # CUDAnative 17 | let 18 | gpu_input = CuArray(input) 19 | gpu_output = similar(gpu_input) 20 | gpu_reduce(+, gpu_input, gpu_output) 21 | gpu_val = Array(gpu_output)[1] 22 | @assert cpu_val == gpu_val 23 | end 24 | -------------------------------------------------------------------------------- /src/compiler.jl: -------------------------------------------------------------------------------- 1 | # JIT compilation of Julia code to PTX 2 | 3 | include(joinpath("compiler", "common.jl")) 4 | include(joinpath("compiler", "irgen.jl")) 5 | include(joinpath("compiler", "optim.jl")) 6 | include(joinpath("compiler", "validation.jl")) 7 | include(joinpath("compiler", "rtlib.jl")) 8 | include(joinpath("compiler", "mcgen.jl")) 9 | include(joinpath("compiler", "debug.jl")) 10 | include(joinpath("compiler", "driver.jl")) 11 | 12 | function __init_compiler__() 13 | # enable generation of FMA instructions to mimic behavior of nvcc 14 | LLVM.clopts("--nvptx-fma-level=1") 15 | end 16 | -------------------------------------------------------------------------------- /examples/oob.jl: 
-------------------------------------------------------------------------------- 1 | # EXCLUDE FROM TESTING 2 | # this example might fail (CUDA error, or runtime trap if bounds-checking is enabled) 3 | 4 | # Running this example under cuda-memcheck properly gives line number info, 5 | # demonstrating how we support existing CUDA tools. 6 | 7 | # TODO: make the actual error trap at run time 8 | 9 | using CUDAdrv, CUDAnative, CuArrays 10 | 11 | a = CuArray{Float32}(undef, 10) 12 | 13 | function memset(a, val) 14 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 15 | a[i] = val 16 | return 17 | end 18 | 19 | @cuda threads=11 memset(a, 0f0) 20 | synchronize() 21 | -------------------------------------------------------------------------------- /src/utils.jl: -------------------------------------------------------------------------------- 1 | # device capability handling 2 | 3 | # select the highest capability that is supported by both the toolchain and device 4 | function supported_capability(dev::CuDevice) 5 | dev_cap = capability(dev) 6 | compat_caps = filter(cap -> cap <= dev_cap, target_support) 7 | isempty(compat_caps) && 8 | error("Device capability v$dev_cap not supported by available toolchain") 9 | 10 | return maximum(compat_caps) 11 | end 12 | 13 | # return the capability of the current context's device, or a sane fall-back 14 | function current_capability() 15 | if initialized[] 16 | return supported_capability(device()) 17 | else 18 | # newer devices tend to support cleaner code (higher-level instructions, etc) 19 | # so target the most recent device as supported by this toolchain 20 | return maximum(target_support) 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /Project.toml: -------------------------------------------------------------------------------- 1 | name = "CUDAnative" 2 | uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" 3 | 4 | [deps] 5 | Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" 6 | CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" 7 | CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" 8 | InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" 9 | LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" 10 | Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" 11 | Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" 12 | Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" 13 | Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" 14 | 15 | [extras] 16 | BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" 17 | CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" 18 | SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" 19 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 20 | 21 | [targets] 22 | test = ["Test", "BenchmarkTools", "SpecialFunctions"] 23 | -------------------------------------------------------------------------------- /docs/make.jl: -------------------------------------------------------------------------------- 1 | using Documenter 2 | 3 | using Pkg 4 | if haskey(ENV, "GITLAB_CI") 5 | Pkg.add([PackageSpec(name = x; rev = "master") for x in ["CUDAdrv", "LLVM"]]) 6 | end 7 | 8 | using CUDAnative 9 | 10 | makedocs( 11 | modules = [CUDAnative], 12 | format = Documenter.HTML(prettyurls = get(ENV, "CI", nothing) == "true"), 13 | sitename = "CUDAnative.jl", 14 | pages = [ 15 | "Home" => "index.md", 16 | "Manual" => [ 17 | "man/usage.md", 18 | "man/troubleshooting.md", 19 | "man/performance.md", 20 | "man/hacking.md" 21 | ], 22 | "Library" => [ 23 | "lib/compilation.md", 24 | "lib/reflection.md", 25 | "Device Code" => [ 26 | 
"lib/device/intrinsics.md", 27 | "lib/device/array.md", 28 | "lib/device/libdevice.md" 29 | ] 30 | ] 31 | ], 32 | doctest = true 33 | ) 34 | -------------------------------------------------------------------------------- /src/device/cuda_intrinsics.jl: -------------------------------------------------------------------------------- 1 | # CUDA extensions to the C language 2 | 3 | # TODO: "CUDA C programming guide" > "C language extensions" lists mathematical functions, 4 | # without mentioning libdevice. Is this implied, by NVCC always using libdevice, 5 | # or are there some natively-supported math functions as well? 6 | 7 | # yes: https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__DOUBLE.html 8 | # see /home/tbesard/CUDA/toolkit/current/include/sm_20_intrinsics.h 9 | 10 | include(joinpath("cuda_intrinsics", "memory_shared.jl")) 11 | include(joinpath("cuda_intrinsics", "indexing.jl")) 12 | include(joinpath("cuda_intrinsics", "synchronization.jl")) 13 | include(joinpath("cuda_intrinsics", "warp_vote.jl")) 14 | include(joinpath("cuda_intrinsics", "warp_shuffle.jl")) 15 | include(joinpath("cuda_intrinsics", "output.jl")) 16 | include(joinpath("cuda_intrinsics", "assertion.jl")) 17 | include(joinpath("cuda_intrinsics", "memory_dynamic.jl")) 18 | include(joinpath("cuda_intrinsics", "misc.jl")) 19 | -------------------------------------------------------------------------------- /test/examples.jl: -------------------------------------------------------------------------------- 1 | @testset "examples" begin 2 | 3 | function find_sources(path::String, sources=String[]) 4 | if isdir(path) 5 | for entry in readdir(path) 6 | find_sources(joinpath(path, entry), sources) 7 | end 8 | elseif endswith(path, ".jl") 9 | push!(sources, path) 10 | end 11 | sources 12 | end 13 | 14 | examples_dir = joinpath(@__DIR__, "..", "examples") 15 | examples = find_sources(examples_dir) 16 | filter!(file -> readline(file) != "# EXCLUDE FROM TESTING", examples) 17 | 18 | cd(examples_dir) do 19 | examples = relpath.(examples, Ref(examples_dir)) 20 | @testset for example in examples 21 | cmd = `$(Base.julia_cmd())` 22 | if Base.JLOptions().project != C_NULL 23 | # --project isn't preserved by julia_cmd() 24 | cmd = `$cmd --project=$(unsafe_string(Base.JLOptions().project))` 25 | end 26 | cmd = `$cmd $example` 27 | @test success(pipeline(cmd, stderr=stderr)) 28 | end 29 | end 30 | 31 | end 32 | -------------------------------------------------------------------------------- /src/device/cuda_intrinsics/misc.jl: -------------------------------------------------------------------------------- 1 | export clock, nanosleep 2 | 3 | """ 4 | clock(UInt32) 5 | 6 | Returns the value of a per-multiprocessor counter that is incremented every clock cycle. 7 | """ 8 | clock(::Type{UInt32}) = ccall("llvm.nvvm.read.ptx.sreg.clock", llvmcall, UInt32, ()) 9 | 10 | """ 11 | clock(UInt64) 12 | 13 | Returns the value of a per-multiprocessor counter that is incremented every clock cycle. 14 | """ 15 | clock(::Type{UInt64}) = ccall("llvm.nvvm.read.ptx.sreg.clock64", llvmcall, UInt64, ()) 16 | 17 | 18 | """ 19 | nanosleep(t) 20 | 21 | Puts the thread to sleep for a given amount of time `t` (in nanoseconds). 22 | 23 | !!! 
note 24 | Requires CUDA >= 10.0 and sm_6.2 25 | """ 26 | nanosleep 27 | 28 | if cuda_driver_version >= v"10.0" && v"6.2" in ptx_support 29 | @inline function nanosleep(t::Unsigned) 30 | @asmcall("nanosleep.u32 \$0;", "r", true, 31 | Cvoid, Tuple{UInt32}, convert(UInt32, t)) 32 | end 33 | else 34 | @inline function nanosleep(t::Unsigned) 35 | return nothing 36 | end 37 | end 38 | -------------------------------------------------------------------------------- /docs/src/index.md: -------------------------------------------------------------------------------- 1 | # CUDAnative.jl 2 | 3 | *Support for compiling and executing native Julia kernels on CUDA hardware.* 4 | 5 | This package provides support for compiling and executing native Julia kernels on CUDA 6 | hardware. It is a work in progress, and only works on very recent versions of Julia . 7 | 8 | 9 | ## Installation 10 | 11 | Requirements: 12 | 13 | * Julia 1.0 14 | * CUDA toolkit 15 | * NVIDIA driver 16 | 17 | ``` 18 | Pkg.add("CUDAnative") 19 | using CUDAnative 20 | 21 | # optionally 22 | Pkg.test("CUDAnative") 23 | ``` 24 | 25 | The build step will discover the available CUDA and LLVM installations, and 26 | figure out which devices can be programmed using that set-up. It depends on 27 | CUDAdrv and LLVM being properly configured. 28 | 29 | Even if the build fails, CUDAnative.jl should always be loadable. This simplifies use by 30 | downstream packages, until there is proper language support for conditional modules. You can 31 | check whether the package has been built properly by inspecting the `CUDAnative.configured` 32 | global variable. 33 | -------------------------------------------------------------------------------- /src/CUDAnative.jl: -------------------------------------------------------------------------------- 1 | module CUDAnative 2 | 3 | using CUDAdrv 4 | 5 | using LLVM 6 | using LLVM.Interop 7 | 8 | using Adapt 9 | 10 | using Pkg 11 | using Libdl 12 | 13 | const ext = joinpath(@__DIR__, "..", "deps", "ext.jl") 14 | isfile(ext) || error("CUDAnative.jl has not been built, please run Pkg.build(\"CUDAnative\").") 15 | include(ext) 16 | if !configured 17 | # default (non-functional) values for critical variables, 18 | # making it possible to _load_ the package at all times. 
19 | const target_support = [v"2.0"] 20 | const cuda_driver_version = v"5.5" 21 | end 22 | 23 | include("utils.jl") 24 | 25 | # needs to be loaded _before_ the compiler infrastructure, because of generated functions 26 | include(joinpath("device", "tools.jl")) 27 | include(joinpath("device", "pointer.jl")) 28 | include(joinpath("device", "array.jl")) 29 | include(joinpath("device", "libdevice.jl")) 30 | include(joinpath("device", "cuda_intrinsics.jl")) 31 | include(joinpath("device", "runtime_intrinsics.jl")) 32 | 33 | include("compiler.jl") 34 | include("execution.jl") 35 | include("reflection.jl") 36 | 37 | include("deprecated.jl") 38 | 39 | include("init.jl") 40 | 41 | end 42 | -------------------------------------------------------------------------------- /docs/src/lib/reflection.md: -------------------------------------------------------------------------------- 1 | # Reflection 2 | 3 | Because of using a different compilation toolchain, CUDAnative.jl offers counterpart 4 | functions to the `code_` functionality from Base: 5 | 6 | ```@docs 7 | CUDAnative.code_llvm 8 | CUDAnative.code_ptx 9 | CUDAnative.code_sass 10 | ``` 11 | 12 | 13 | ## Convenience macros 14 | 15 | For ease of use, CUDAnative.jl also implements `@device_code_` macros wrapping 16 | the above reflection functionality. These macros evaluate the expression 17 | argument, while tracing compilation and finally printing or returning the code 18 | for every invoked CUDA kernel. Do note that this evaluation can have side 19 | effects, as opposed to similarly-named `@code_` macros in Base which are free of 20 | side effects. 21 | 22 | ```@docs 23 | CUDAnative.@device_code_lowered 24 | CUDAnative.@device_code_typed 25 | CUDAnative.@device_code_warntype 26 | CUDAnative.@device_code_llvm 27 | CUDAnative.@device_code_ptx 28 | CUDAnative.@device_code_sass 29 | CUDAnative.@device_code 30 | ``` 31 | 32 | ## Version and related queries 33 | 34 | ```@docs 35 | CUDAnative.version 36 | CUDAnative.maxthreads 37 | CUDAnative.registers 38 | CUDAnative.memory 39 | ``` 40 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright © 2013 Dahua Lin 4 | Copyright © 2014-2018 Tim Besard, and other contributors 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /test/base.jl: -------------------------------------------------------------------------------- 1 | @testset "base interface" begin 2 | 3 | ############################################################################################ 4 | 5 | @testset "method caching" begin 6 | 7 | import InteractiveUtils: _dump_function 8 | 9 | # #17057 fallout 10 | @noinline post17057_child(i) = sink(i) 11 | function post17057_parent(arr::Ptr{Int64}) 12 | i = post17057_child(0) 13 | unsafe_store!(arr, i, i) 14 | end 15 | 16 | # bug: default module activation segfaulted on NULL child function if cached=false 17 | params = Base.CodegenParams(cached=false) 18 | if VERSION >= v"1.1.0-DEV.762" 19 | _dump_function(post17057_parent, Tuple{Ptr{Int64}}, 20 | #=native=#false, #=wrapper=#false, #=strip=#false, 21 | #=dump_module=#true, #=syntax=#:att, #=optimize=#false, :none, 22 | params) 23 | else 24 | _dump_function(post17057_parent, Tuple{Ptr{Int64}}, 25 | #=native=#false, #=wrapper=#false, #=strip=#false, 26 | #=dump_module=#true, #=syntax=#:att, #=optimize=#false, 27 | params) 28 | end 29 | 30 | end 31 | 32 | ############################################################################################ 33 | 34 | end -------------------------------------------------------------------------------- /examples/multigpu.jl: -------------------------------------------------------------------------------- 1 | using CUDAdrv, CUDAnative, CuArrays 2 | 3 | using Test 4 | 5 | function vadd(gpu, a, b, c) 6 | i = threadIdx().x + blockDim().x * ((blockIdx().x-1) + (gpu-1) * gridDim().x) 7 | c[i] = a[i] + b[i] 8 | return 9 | end 10 | 11 | gpus = Int(length(devices())) 12 | 13 | dims = (gpus,3,4) 14 | a = round.(rand(Float32, dims) * 100) 15 | b = round.(rand(Float32, dims) * 100) 16 | 17 | # FIXME: CuArray doesn't tie in with unified memory yet 18 | buf_a = Mem.alloc(sizeof(a), true) 19 | Mem.upload!(buf_a, a) 20 | d_a = CuArray{Float32,3}(buf_a, dims) 21 | buf_b = Mem.alloc(sizeof(a), true) 22 | Mem.upload!(buf_b, b) 23 | d_b = CuArray{Float32,3}(buf_b, dims) 24 | buf_c = Mem.alloc(sizeof(a), true) 25 | d_c = CuArray{Float32,3}(buf_c, dims) 26 | 27 | len = prod(dims) 28 | blocks = gpus 29 | threads = len ÷ blocks 30 | 31 | for (gpu,dev) in enumerate(devices()) 32 | @debug "Allocating slice $gpu on device $(name(dev))" 33 | device!(dev) 34 | @cuda blocks=blocks÷gpus threads=threads vadd(gpu, d_a, d_b, d_c) 35 | end 36 | 37 | @debug "Synchronizing devices" 38 | for dev in devices() 39 | # NOTE: normally you'd use events and wait for them 40 | device!(dev) 41 | synchronize() 42 | end 43 | 44 | c = Array(d_c) 45 | @test a+b ≈ c 46 | -------------------------------------------------------------------------------- /test/perf/launch_overhead/cudanative.jl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env julia 2 | 3 | # CUDAnative.jl version 4 | 5 | using CUDAdrv, CUDAnative 6 | 7 | using Statistics 8 | using Printf 9 | 10 | function kernel_dummy(ptr) 11 | Base.pointerset(ptr, 0f0, Int(blockIdx().x), 8) 12 | return 13 | end 14 | 15 | const len = 1000 16 | const ITERATIONS = 100 17 | 18 | function benchmark(gpu_buf) 19 | @cuda threads=len kernel_dummy(Base.unsafe_convert(Ptr{Float32}, gpu_buf)) 20 | end 21 | 22 | function main() 23 | cpu_time = Vector{Float64}(undef, ITERATIONS) 24 | gpu_time = Vector{Float64}(undef, ITERATIONS) 25 | 26 | gpu_buf = Mem.alloc(len*sizeof(Float32)) 27 | for i in 
1:ITERATIONS 28 | i == ITERATIONS-4 && CUDAdrv.Profile.start() 29 | 30 | gpu_tic, gpu_toc = CuEvent(), CuEvent() 31 | 32 | cpu_tic = time_ns() 33 | record(gpu_tic) 34 | benchmark(gpu_buf) 35 | record(gpu_toc) 36 | synchronize(gpu_toc) 37 | cpu_toc = time_ns() 38 | 39 | cpu_time[i] = (cpu_toc-cpu_tic)/1000 40 | gpu_time[i] = CUDAdrv.elapsed(gpu_tic, gpu_toc)*1000000 41 | end 42 | CUDAdrv.Profile.stop() 43 | Mem.free(gpu_buf) 44 | 45 | popfirst!(cpu_time) 46 | popfirst!(gpu_time) 47 | 48 | @printf("CPU time: %.2f ± %.2f us\n", mean(cpu_time), std(cpu_time)) 49 | @printf("GPU time: %.2f ± %.2f us\n", mean(gpu_time), std(gpu_time)) 50 | end 51 | 52 | main() 53 | -------------------------------------------------------------------------------- /examples/peakflops.jl: -------------------------------------------------------------------------------- 1 | using CUDAdrv, CUDAnative, CuArrays 2 | 3 | using Test 4 | 5 | "Dummy kernel doing 100 FMAs." 6 | function kernel_100fma(a, b, c, out) 7 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 8 | @inbounds a_val = a[i] 9 | @inbounds b_val = b[i] 10 | @inbounds c_val = c[i] 11 | 12 | for j in 1:33 13 | a_val = CUDAnative.fma(a_val, b_val, c_val) 14 | b_val = CUDAnative.fma(a_val, b_val, c_val) 15 | c_val = CUDAnative.fma(a_val, b_val, c_val) 16 | end 17 | 18 | @inbounds out[i] = CUDAnative.fma(a_val, b_val, c_val) 19 | 20 | return 21 | end 22 | 23 | function peakflops(n::Integer=5000, dev::CuDevice=CuDevice(0)) 24 | ctx = CuContext(dev) 25 | 26 | dims = (n, n) 27 | a = round.(rand(Float32, dims) * 100) 28 | b = round.(rand(Float32, dims) * 100) 29 | c = round.(rand(Float32, dims) * 100) 30 | 31 | d_a = CuArray(a) 32 | d_b = CuArray(b) 33 | d_c = CuArray(c) 34 | d_out = similar(d_a) 35 | 36 | len = prod(dims) 37 | threads = min(len, 1024) 38 | blocks = len ÷ threads 39 | 40 | # warm-up 41 | @cuda kernel_100fma(d_a, d_b, d_c, d_out) 42 | synchronize(ctx) 43 | 44 | secs = CUDAdrv.@elapsed begin 45 | @cuda blocks=blocks threads=threads kernel_100fma(d_a, d_b, d_c, d_out) 46 | end 47 | flopcount = 200*len 48 | flops = flopcount / secs 49 | 50 | destroy!(ctx) 51 | return flops 52 | end 53 | 54 | println(peakflops()) 55 | -------------------------------------------------------------------------------- /test/perf/launch_overhead/cuda.jl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env julia 2 | 3 | # CUDAdrv.jl version 4 | 5 | using CUDAdrv 6 | 7 | using Statistics 8 | using Printf 9 | 10 | const len = 1000 11 | const ITERATIONS = 100 12 | 13 | # TODO: api-trace shows some attribute fetches, where do they come from? 
14 | 15 | const dev = CuDevice(0) 16 | const ctx = CuContext(dev) 17 | 18 | const mod = CuModuleFile("cuda.ptx") 19 | const fun = CuFunction(mod, "kernel_dummy") 20 | 21 | function benchmark(gpu_buf) 22 | cudacall(fun, (Ptr{Float32},), gpu_buf; threads=1) 23 | return 24 | end 25 | 26 | 27 | function main() 28 | cpu_time = Vector{Float64}(undef, ITERATIONS) 29 | gpu_time = Vector{Float64}(undef, ITERATIONS) 30 | 31 | gpu_buf = Mem.alloc(len*sizeof(Float32)) 32 | for i in 1:ITERATIONS 33 | i == ITERATIONS-4 && CUDAdrv.Profile.start() 34 | 35 | gpu_tic, gpu_toc = CuEvent(), CuEvent() 36 | 37 | cpu_tic = time_ns() 38 | record(gpu_tic) 39 | benchmark(gpu_buf) 40 | record(gpu_toc) 41 | synchronize(gpu_toc) 42 | cpu_toc = time_ns() 43 | 44 | cpu_time[i] = (cpu_toc-cpu_tic)/1000 45 | gpu_time[i] = CUDAdrv.elapsed(gpu_tic, gpu_toc)*1000000 46 | end 47 | CUDAdrv.Profile.stop() 48 | Mem.free(gpu_buf) 49 | 50 | popfirst!(cpu_time) 51 | popfirst!(gpu_time) 52 | 53 | @printf("CPU time: %.2f ± %.2f us\n", mean(cpu_time), std(cpu_time)) 54 | @printf("GPU time: %.2f ± %.2f us\n", mean(gpu_time), std(gpu_time)) 55 | 56 | destroy!(ctx) 57 | end 58 | 59 | main() 60 | -------------------------------------------------------------------------------- /src/device/cuda_intrinsics/memory_dynamic.jl: -------------------------------------------------------------------------------- 1 | # Dynamic Global Memory Allocation and Operations (B.21) 2 | 3 | export malloc 4 | 5 | @generated function malloc(sz::Csize_t) 6 | T_pint8 = LLVM.PointerType(LLVM.Int8Type(JuliaContext())) 7 | T_size = convert(LLVMType, Csize_t) 8 | T_ptr = convert(LLVMType, Ptr{Cvoid}) 9 | 10 | # create function 11 | llvm_f, _ = create_function(T_ptr, [T_size]) 12 | mod = LLVM.parent(llvm_f) 13 | 14 | # get the intrinsic 15 | # NOTE: LLVM doesn't have void*, Clang uses i8* for malloc too 16 | intr = LLVM.Function(mod, "malloc", LLVM.FunctionType(T_pint8, [T_size])) 17 | # should we attach some metadata here? 
julia.gc_alloc_obj has the following: 18 | #let attrs = function_attributes(intr) 19 | # AllocSizeNumElemsNotPresent = reinterpret(Cuint, Cint(-1)) 20 | # packed_allocsize = Int64(1) << 32 | AllocSizeNumElemsNotPresent 21 | # push!(attrs, EnumAttribute("allocsize", packed_allocsize, JuliaContext())) 22 | #end 23 | #let attrs = return_attributes(intr) 24 | # push!(attrs, EnumAttribute("noalias", 0, JuliaContext())) 25 | # push!(attrs, EnumAttribute("nonnull", 0, JuliaContext())) 26 | #end 27 | 28 | # generate IR 29 | Builder(JuliaContext()) do builder 30 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 31 | position!(builder, entry) 32 | 33 | ptr = call!(builder, intr, [parameters(llvm_f)[1]]) 34 | 35 | jlptr = ptrtoint!(builder, ptr, T_ptr) 36 | 37 | ret!(builder, jlptr) 38 | end 39 | 40 | call_function(llvm_f, Ptr{Cvoid}, Tuple{Csize_t}, :((sz,))) 41 | end 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | CUDAnative.jl 2 | ============= 3 | 4 | *Support for compiling and executing native Julia kernels on CUDA hardware.* 5 | 6 | [![][docs-latest-img]][docs-latest-url] [![][codecov-img]][codecov-url] [![][doi-img]][doi-url] 7 | 8 | [codecov-img]: https://codecov.io/gh/JuliaGPU/CUDAnative.jl/branch/master/graph/badge.svg 9 | [codecov-url]: https://codecov.io/gh/JuliaGPU/CUDAnative.jl 10 | 11 | [docs-latest-img]: https://img.shields.io/badge/docs-latest-blue.svg 12 | [docs-latest-url]: https://juliagpu.gitlab.io/CUDAnative.jl/ 13 | 14 | [doi-img]: https://zenodo.org/badge/DOI/10.1109/TPDS.2018.2872064.svg 15 | [doi-url]: https://doi.org/10.1109/TPDS.2018.2872064 16 | 17 | 18 | 19 | Installation 20 | ------------ 21 | 22 | CUDAnative is a registered package, and can be installed using the Julia package manager: 23 | 24 | ```julia 25 | Pkg.add("CUDAnative") 26 | ``` 27 | 28 | **NOTE**: the current version of this package requires Julia 1.0. Only older versions of this package, v0.6.x or older, work with Julia 0.6, and require a source-build of Julia. 29 | 30 | 31 | License 32 | ------- 33 | 34 | CUDAnative.jl is licensed under the [MIT license](LICENSE.md). 35 | 36 | If you use this package in your research, please cite the [following 37 | paper](https://ieeexplore.ieee.org/document/8471188): 38 | 39 | ``` 40 | @article{besard:2017, 41 | author = {Besard, Tim and Foket, Christophe and De Sutter, Bjorn}, 42 | title = {Effective Extensible Programming: Unleashing {Julia} on {GPUs}}, 43 | journal = {IEEE Transactions on Parallel and Distributed Systems}, 44 | year = {2018}, 45 | doi = {10.1109/TPDS.2018.2872064}, 46 | ISSN = {1045-9219}, 47 | } 48 | ``` 49 | -------------------------------------------------------------------------------- /test/device/pointer.jl: -------------------------------------------------------------------------------- 1 | @testset "pointer" begin 2 | 3 | @testset "unsafe_load & unsafe_store!" 
begin 4 | 5 | @eval struct LoadableStruct 6 | a::Int64 7 | b::UInt8 8 | end 9 | Base.one(::Type{LoadableStruct}) = LoadableStruct(1,1) 10 | Base.zero(::Type{LoadableStruct}) = LoadableStruct(0,0) 11 | 12 | @testset for T in (Int8, UInt16, Int32, UInt32, Int64, UInt64, Int128, 13 | Float32, Float64, 14 | LoadableStruct), 15 | cached in (false, true) 16 | d_a = Mem.upload(ones(T)) 17 | d_b = Mem.upload(zeros(T)) 18 | 19 | ptr_a = CUDAnative.DevicePtr{T,AS.Global}(Base.unsafe_convert(CuPtr{T}, d_a)) 20 | ptr_b = CUDAnative.DevicePtr{T,AS.Global}(Base.unsafe_convert(CuPtr{T}, d_b)) 21 | @test Mem.download(T, d_a) != Mem.download(T, d_b) 22 | 23 | let ptr_a=ptr_a, ptr_b=ptr_b #JuliaLang/julia#15276 24 | if cached && capability(dev) >= v"3.2" 25 | @on_device unsafe_store!(ptr_b, unsafe_cached_load(ptr_a)) 26 | else 27 | @on_device unsafe_store!(ptr_b, unsafe_load(ptr_a)) 28 | end 29 | end 30 | @test Mem.download(T, d_a) == Mem.download(T, d_b) 31 | end 32 | 33 | @testset "indexing" begin 34 | function kernel(src, dst) 35 | unsafe_store!(dst, CUDAnative.unsafe_cached_load(src, 4)) 36 | return 37 | end 38 | 39 | T = Complex{Int8} 40 | 41 | src = Mem.upload([T(1) T(9); T(3) T(4)]) 42 | dst = Mem.upload([0]) 43 | 44 | @cuda kernel( 45 | CUDAnative.DevicePtr{T,AS.Global}(CuPtr{T}(src.ptr)), 46 | CUDAnative.DevicePtr{T,AS.Global}(CuPtr{T}(dst.ptr)) 47 | ) 48 | 49 | @test Mem.download(T, src, 4)[4] == Mem.download(T, dst)[1] 50 | end 51 | 52 | end 53 | 54 | end 55 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | CI_IMAGE_TAG: 'cuda' 3 | 4 | include: 5 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/common.yml' 6 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/test_v1.0.yml' 7 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/test_v1.1.yml' 8 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/test_dev.yml' 9 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/test_v1.1.yml' 10 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/coverage_v1.1.yml' 11 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/documentation_v1.1.yml' 12 | 13 | test:v1.0: 14 | only: 15 | - master 16 | - staging 17 | - trying 18 | 19 | test:v1.1: 20 | only: 21 | - master 22 | - staging 23 | - trying 24 | 25 | test:dev: 26 | only: 27 | - master 28 | - staging 29 | - trying 30 | 31 | coverage: 32 | allow_failure: true 33 | only: 34 | - master 35 | - staging 36 | - trying 37 | 38 | documentation: 39 | only: 40 | - master 41 | - staging 42 | - trying 43 | 44 | pages: 45 | stage: deploy 46 | script: 47 | - mv docs/build public 48 | artifacts: 49 | paths: 50 | - public 51 | only: 52 | - master 53 | 54 | cuarrays: 55 | stage: test 56 | image: "juliagpu/julia:v1.1-cuda" 57 | script: 58 | - mkdir $JULIA_DEPOT_PATH # Pkg.jl#325 59 | - julia -e 'using Pkg; 60 | Pkg.develop(PackageSpec(path=pwd())); 61 | Pkg.build(); 62 | Pkg.add(PackageSpec(name="CuArrays", rev="master")); 63 | Pkg.test("CuArrays");' 64 | allow_failure: true 65 | only: 66 | - master 67 | - staging 68 | - trying 69 | -------------------------------------------------------------------------------- /src/device/cuda_intrinsics/warp_vote.jl: -------------------------------------------------------------------------------- 1 | # Warp Vote (B.13) 
2 | 3 | export vote_all, vote_any, vote_ballot 4 | 5 | """ 6 | vote_all(predicate::Bool) 7 | 8 | Evaluate `predicate` for all active threads of the warp and return non-zero if and only if 9 | `predicate` evaluates to non-zero for all of them. 10 | """ 11 | @inline function vote_all(pred::Bool) 12 | return @asmcall( 13 | """{ 14 | .reg .pred %p1; 15 | .reg .pred %p2; 16 | setp.ne.u32 %p1, \$1, 0; 17 | vote.all.pred %p2, %p1; 18 | selp.s32 \$0, 1, 0, %p2; 19 | }""", "=r,r", true, 20 | Int32, Tuple{Int32}, convert(Int32, pred)) != Int32(0) 21 | end 22 | 23 | """ 24 | vote_any(predicate::Bool) 25 | 26 | Evaluate `predicate` for all active threads of the warp and return non-zero if and only if 27 | `predicate` evaluates to non-zero for any of them. 28 | """ 29 | @inline function vote_any(pred::Bool) 30 | return @asmcall( 31 | """{ 32 | .reg .pred %p1; 33 | .reg .pred %p2; 34 | setp.ne.u32 %p1, \$1, 0; 35 | vote.any.pred %p2, %p1; 36 | selp.s32 \$0, 1, 0, %p2; 37 | }""", "=r,r", true, 38 | Int32, Tuple{Int32}, convert(Int32, pred)) != Int32(0) 39 | end 40 | 41 | """ 42 | vote_ballot(predicate::Bool) 43 | 44 | Evaluate `predicate` for all active threads of the warp and return an integer whose Nth bit 45 | is set if and only if `predicate` evaluates to non-zero for the Nth thread of the warp and 46 | the Nth thread is active. 47 | """ 48 | @inline function vote_ballot(pred::Bool) 49 | return @asmcall( 50 | """{ 51 | .reg .pred %p1; 52 | setp.ne.u32 %p1, \$1, 0; 53 | vote.ballot.b32 \$0, %p1; 54 | }""", "=r,r", true, 55 | UInt32, Tuple{Int32}, convert(Int32, pred)) 56 | end 57 | -------------------------------------------------------------------------------- /docs/src/man/performance.md: -------------------------------------------------------------------------------- 1 | # Performance 2 | 3 | GPU code written in CUDAnative.jl can be as fast or even outperform CUDA C compiled with 4 | `nvcc` (on the condition that the same hardware features are used). This section will 5 | describe how to do so, and what to be careful about. 6 | 7 | 8 | ## Profiling 9 | 10 | When optimizing code, it is important to know what to optimize. Luckily, the CUDA toolkit 11 | ships an excellent profiler, `nvprof`, with `nvvp` as the Eclipse-based UI. The CUDAnative 12 | compiler is fully compatible with these tools, and generates the required line number 13 | information to debug performance issues. To generate line number information, invoke Julia 14 | with the command-line option `-g1` (the default option). Using `-g2` puts the PTX JIT in 15 | debug mode, which significantly lowers performance of GPU code and currently does not 16 | improve debugging. 17 | 18 | Traces collected with these tools might be very large and sparse, because they capture the 19 | entire application including e.g. kernel compilation or initial data uploads. To avoid this, 20 | run the above profilers with the option "Start profiling at application start" disabled 21 | (`--profile-from-start off` with `nvprof`), make your application perform a warm-up 22 | iteration, and wrap subsequent iterations with `CUDAdrv.@profile`. This macro instructs any 23 | active profiler to start collecting information, resulting in much more focused traces. 24 | 25 | For true source-level profiling akin to `Base.@profile`, look at `nvvp`'s PC Sampling View 26 | (requires compute capability >= 5.2, CUDA >= 7.5). In the future, we might have a 27 | `CUDAnative.@profile` offering similar functionality, using the NVIDIA CUPTI library. 
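As a rough sketch of the warm-up-then-profile pattern described above (not part of the repository; the kernel, array size and iteration count are arbitrary assumptions), to be run under `nvprof --profile-from-start off`:

```julia
using CUDAdrv, CUDAnative, CuArrays

function dummy_kernel(a)
    @inbounds a[threadIdx().x] += 1f0
    return
end

a = CuArray(zeros(Float32, 256))

@cuda threads=256 dummy_kernel(a)  # warm-up iteration: triggers compilation
synchronize()

CUDAdrv.@profile begin             # only this region is collected by the profiler
    for _ in 1:100
        @cuda threads=256 dummy_kernel(a)
    end
    synchronize()
end
```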
28 | 29 | 30 | ## Optimizing 31 | 32 | This section is a WIP. Some things to consider: 33 | 34 | * `Float64` is expensive, but literal floats are `Float64`. Use `...f0` or cast. 35 | * Same for integers; although the performance hit is small, it increases register pressure. 36 | -------------------------------------------------------------------------------- /src/compiler/debug.jl: -------------------------------------------------------------------------------- 1 | # tools for dealing with compiler debug information 2 | 3 | # generate a pseudo-backtrace from LLVM IR instruction debug information 4 | # 5 | # this works by looking up the debug information of the instruction, and inspecting the call 6 | # sites of the containing function. if there's only one, repeat the process from that call. 7 | # finally, the debug information is converted to a Julia stack trace. 8 | function backtrace(inst::LLVM.Instruction, bt = StackTraces.StackFrame[]) 9 | name = Ref{Cstring}() 10 | filename = Ref{Cstring}() 11 | line = Ref{Cuint}() 12 | col = Ref{Cuint}() 13 | 14 | # look up the debug information from the current instruction 15 | depth = 0 16 | while LLVM.API.LLVMGetSourceLocation(LLVM.ref(inst), depth, name, filename, line, col) == 1 17 | frame = StackTraces.StackFrame(replace(unsafe_string(name[]), r";$"=>""), 18 | unsafe_string(filename[]), line[]) 19 | push!(bt, frame) 20 | depth += 1 21 | end 22 | 23 | # move up the call chain 24 | f = LLVM.parent(LLVM.parent(inst)) 25 | ## functions can be used as a *value* in eg. constant expressions, so filter those out 26 | callers = filter(val -> isa(user(val), LLVM.CallInst), collect(uses(f))) 27 | if !isempty(callers) 28 | # figure out the call sites of this instruction 29 | call_sites = unique(callers) do call 30 | # there could be multiple calls, originating from the same source location 31 | md = metadata(user(call)) 32 | if haskey(md, LLVM.MD_dbg) 33 | md[LLVM.MD_dbg] 34 | else 35 | nothing 36 | end 37 | end 38 | 39 | if length(call_sites) > 1 40 | frame = StackTraces.StackFrame("multiple call sites", "unknown", 0) 41 | push!(bt, frame) 42 | elseif length(call_sites) == 1 43 | backtrace(user(first(call_sites)), bt) 44 | end 45 | end 46 | 47 | return bt 48 | end 49 | -------------------------------------------------------------------------------- /examples/scan.jl: -------------------------------------------------------------------------------- 1 | # Work-inefficient inclusive scan 2 | # - uses shared memory to reduce 3 | # 4 | # Based on https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html 5 | 6 | using CUDAdrv, CUDAnative, CuArrays 7 | 8 | function cpu_accumulate!(op::Function, data::Matrix{T}) where {T} 9 | cols = size(data,2) 10 | for col in 1:cols 11 | accum = zero(T) 12 | rows = size(data,1) 13 | for row in 1:size(data,1) 14 | accum = op(accum, data[row,col]) 15 | data[row,col] = accum 16 | end 17 | end 18 | end 19 | 20 | function gpu_accumulate!(op::Function, data::CuDeviceMatrix{T}) where {T} 21 | col = blockIdx().x 22 | cols = gridDim().x 23 | 24 | row = threadIdx().x 25 | rows = blockDim().x 26 | 27 | if col <= cols && row <= rows 28 | shmem = @cuDynamicSharedMem(T, 2*rows) 29 | shmem[row] = data[row,col] 30 | sync_threads() 31 | 32 | # parallel reduction 33 | pin, pout = 1, 0 34 | offset = 1 35 | while offset < rows 36 | pout = 1 - pout 37 | pin = 1 - pin 38 | if row > offset 39 | shmem[pout * rows + row] = 40 | op(shmem[pin * rows + row], 41 | shmem[pin * rows + row - offset]) 42 | else 43 | shmem[pout * rows + row] = 44 | 
shmem[pin * rows + row] 45 | end 46 | sync_threads() 47 | offset *= UInt32(2) 48 | end 49 | shmem[pin * rows + row] = shmem[pout * rows + row] 50 | sync_threads() 51 | 52 | # write back results 53 | data[row,col] = shmem[row] 54 | end 55 | 56 | return 57 | end 58 | 59 | rows = 5 60 | cols = 4 61 | 62 | a = rand(Int, rows, cols) 63 | 64 | cpu_a = copy(a) 65 | cpu_accumulate!(+, cpu_a) 66 | 67 | gpu_a = CuArray(a) 68 | @cuda blocks=cols threads=rows shmem=2*rows*sizeof(eltype(a)) gpu_accumulate!(+, gpu_a) 69 | 70 | using Test 71 | 72 | @test cpu_a ≈ Array(gpu_a) 73 | 74 | 75 | # FURTHER IMPROVEMENTS: 76 | # - work efficiency 77 | # - avoid memory bank conflicts 78 | # - large array support 79 | -------------------------------------------------------------------------------- /docs/src/lib/device/intrinsics.md: -------------------------------------------------------------------------------- 1 | # Intrinsics 2 | 3 | This section lists the package's public functionality that corresponds to special CUDA 4 | functions to be used in device code. It is loosely organized according to the [C language 5 | extensions](http://docs.nvidia.com/cuda/cuda-c-programming-guide/#c-language-extensions) 6 | appendix from the CUDA C programming guide. For more information about certain intrinsics, 7 | refer to the aforementioned NVIDIA documentation. 8 | 9 | 10 | ## Indexing and Dimensions 11 | 12 | ```@docs 13 | CUDAnative.gridDim 14 | CUDAnative.blockIdx 15 | CUDAnative.blockDim 16 | CUDAnative.threadIdx 17 | CUDAnative.warpsize 18 | ``` 19 | 20 | 21 | ## Memory Types 22 | 23 | ### Shared Memory 24 | 25 | ```@docs 26 | CUDAnative.@cuStaticSharedMem 27 | CUDAnative.@cuDynamicSharedMem 28 | ``` 29 | 30 | 31 | ## Synchronization 32 | 33 | ```@docs 34 | CUDAnative.sync_threads 35 | CUDAnative.sync_warp 36 | CUDAnative.threadfence_block 37 | CUDAnative.threadfence 38 | CUDAnative.threadfence_system 39 | ``` 40 | 41 | ## Clock & Sleep 42 | 43 | ```@docs 44 | CUDAnative.clock 45 | CUDAnative.nanosleep 46 | ``` 47 | 48 | ## Warp Vote 49 | 50 | The warp vote functions allow the threads of a given warp to perform a 51 | reduction-and-broadcast operation. These functions take as input a boolean predicate from 52 | each thread in the warp and evaluate it. The results of that evaluation are combined 53 | (reduced) across the active threads of the warp in one of several ways, broadcasting a single 54 | return value to each participating thread. 55 | 56 | ```@docs 57 | CUDAnative.vote_all 58 | CUDAnative.vote_any 59 | CUDAnative.vote_ballot 60 | ``` 61 | 62 | 63 | ## Warp Shuffle 64 | 65 | ```@docs 66 | CUDAnative.shfl 67 | CUDAnative.shfl_up 68 | CUDAnative.shfl_down 69 | CUDAnative.shfl_xor 70 | ``` 71 | 72 | If using CUDA 9.0, and PTX ISA 6.0 is supported, synchronizing versions of these 73 | intrinsics are available as well: 74 | 75 | ```@docs 76 | CUDAnative.shfl_sync 77 | CUDAnative.shfl_up_sync 78 | CUDAnative.shfl_down_sync 79 | CUDAnative.shfl_xor_sync 80 | ``` 81 | 82 | 83 | ## Formatted Output 84 | 85 | ```@docs 86 | CUDAnative.@cuprintf 87 | ``` 88 | 89 | 90 | ## Assertions 91 | 92 | ```@docs 93 | CUDAnative.@cuassert 94 | ``` 95 | -------------------------------------------------------------------------------- /test/perf/launch_overhead/README.md: -------------------------------------------------------------------------------- 1 | Launch overhead measurement 2 | =========================== 3 | 4 | These tests allow measuring the overhead of launching a kernel, and comparing it to CUDA. 
5 | 6 | Use `nvvp` (the NVIDIA visual profiler) to visualize the overhead, disabling the option 7 | "Start execution with profiling enabled". 8 | 9 | For example: 10 | 11 | ``` 12 | $ nvprof --profile-from-start off ./cuda 13 | ==9929== NVPROF is profiling process 9929, command: ./cuda 14 | CPU time: 36.00us 15 | GPU time: 30.82us 16 | ==9929== Profiling application: ./cuda 17 | ==9929== Profiling result: 18 | Time(%) Time Calls Avg Min Max Name 19 | 100.00% 125.70us 5 25.139us 25.088us 25.281us kernel_dummy 20 | ``` 21 | 22 | This shows how launching a kernel takes 36us from Julia's POV, 30 us when using event 23 | counters, but even that contains some overhead because according to `nvprof` the kernel only 24 | took 25 us. 25 | 26 | Luckily, this was using CUDA, and CUDAdrv.jl doesn't perform much worse: 27 | 28 | ``` 29 | $ nvprof --profile-from-start off ./cuda.jl 30 | ==19694== NVPROF is profiling process 19694, command: julia ./cuda.jl 31 | CPU time: 36.23us 32 | GPU time: 31.62us 33 | ==19694== Profiling application: julia ./cuda.jl 34 | ==19694== Profiling result: 35 | Time(%) Time Calls Avg Min Max Name 36 | 100.00% 125.70us 5 25.139us 25.088us 25.312us kernel_dummy 37 | ``` 38 | 39 | But more importantly, CUDAnative.jl performs equally well: 40 | 41 | ``` 42 | $ nvprof --profile-from-start off ./cudanative.jl 43 | ==21135== NVPROF is profiling process 21135, command: julia ./cudanative.jl 44 | CPU time: 36.42us 45 | GPU time: 31.81us 46 | ==21135== Profiling application: julia ./cudanative.jl 47 | ==21135== Profiling result: 48 | Time(%) Time Calls Avg Min Max Name 49 | 100.00% 123.78us 5 24.755us 24.704us 24.928us julia_kernel_dummy_60488 50 | ``` 51 | 52 | Note that these are simple kernels, with more complex kernels Julia's heuristics start 53 | fighting us (eg. when dealing with long argument lists, inference performs worse and 54 | sometimes refuses to expand our generated functions). 55 | 56 | Also, when dealing with more arguments there's an overhead caused by CUDA copying over 57 | arguments, and cannot be avoided. For use of hardware counters, see the CUPTI library. 58 | -------------------------------------------------------------------------------- /test/runtests.jl: -------------------------------------------------------------------------------- 1 | using Test 2 | 3 | # development often happens in lockstep with other packages, 4 | # so check-out the master branch of those packages. 
5 | using Pkg 6 | if haskey(ENV, "GITLAB_CI") 7 | Pkg.add([PackageSpec(name = x; rev = "master") 8 | for x in ["CUDAdrv", "LLVM", "CuArrays"]]) 9 | end 10 | 11 | using CUDAnative, CUDAdrv 12 | import LLVM 13 | 14 | include("util.jl") 15 | 16 | @testset "CUDAnative" begin 17 | 18 | include("base.jl") 19 | include("pointer.jl") 20 | include("codegen.jl") 21 | 22 | if CUDAnative.configured 23 | @test length(devices()) > 0 24 | if length(devices()) > 0 25 | # the API shouldn't have been initialized 26 | @test CuCurrentContext() == nothing 27 | 28 | device_callbacked = nothing 29 | device_callback = (dev, ctx) -> begin 30 | device_callbacked = dev 31 | end 32 | push!(CUDAnative.device!_listeners, device_callback) 33 | 34 | # now cause initialization 35 | Mem.alloc(1) 36 | @test CuCurrentContext() != nothing 37 | @test device(CuCurrentContext()) == CuDevice(0) 38 | @test device_callbacked == CuDevice(0) 39 | 40 | device!(CuDevice(0)) 41 | device!(CuDevice(0)) do 42 | nothing 43 | end 44 | 45 | # test the device selection functionality 46 | if length(devices()) > 1 47 | device!(1) do 48 | @test device(CuCurrentContext()) == CuDevice(1) 49 | end 50 | @test device(CuCurrentContext()) == CuDevice(0) 51 | 52 | device!(1) 53 | @test device(CuCurrentContext()) == CuDevice(1) 54 | end 55 | 56 | # pick most recent device (based on compute capability) 57 | global dev = last(sort(collect(devices()); by=capability)) 58 | @info("Testing using device $(name(dev))") 59 | device!(dev) 60 | 61 | if capability(dev) < v"2.0" 62 | @warn("native execution not supported on SM < 2.0") 63 | else 64 | include("device/codegen.jl") 65 | include("device/execution.jl") 66 | include("device/pointer.jl") 67 | include("device/array.jl") 68 | include("device/intrinsics.jl") 69 | 70 | #include("examples.jl") 71 | end 72 | end 73 | else 74 | @warn("CUDAnative.jl has not been configured; skipping on-device tests.") 75 | end 76 | 77 | end 78 | -------------------------------------------------------------------------------- /test/pointer.jl: -------------------------------------------------------------------------------- 1 | @testset "pointer" begin 2 | 3 | # inner constructors 4 | 5 | voidptr_a = CuPtr{Cvoid}(Int(0xDEADBEEF)) 6 | generic_voidptr_a = CUDAnative.DevicePtr{Cvoid,AS.Generic}(voidptr_a) 7 | global_voidptr_a = CUDAnative.DevicePtr{Cvoid,AS.Global}(voidptr_a) 8 | local_voidptr_a = CUDAnative.DevicePtr{Cvoid,AS.Local}(voidptr_a) 9 | 10 | voidptr_b = CuPtr{Cvoid}(Int(0xCAFEBABE)) 11 | generic_voidptr_b = CUDAnative.DevicePtr{Cvoid,AS.Generic}(voidptr_b) 12 | global_voidptr_b = CUDAnative.DevicePtr{Cvoid,AS.Global}(voidptr_b) 13 | local_voidptr_b = CUDAnative.DevicePtr{Cvoid,AS.Local}(voidptr_b) 14 | 15 | intptr_b = convert(CuPtr{Int}, voidptr_b) 16 | generic_intptr_b = CUDAnative.DevicePtr{Int,AS.Generic}(intptr_b) 17 | global_intptr_b = CUDAnative.DevicePtr{Int,AS.Global}(intptr_b) 18 | local_intptr_b = CUDAnative.DevicePtr{Int,AS.Local}(intptr_b) 19 | 20 | # outer constructors 21 | @test CUDAnative.DevicePtr{Cvoid}(voidptr_a) == generic_voidptr_a 22 | @test CUDAnative.DevicePtr(voidptr_a) == generic_voidptr_a 23 | 24 | # getters 25 | @test eltype(generic_voidptr_a) == Cvoid 26 | @test eltype(global_intptr_b) == Int 27 | @test addrspace(generic_voidptr_a) == AS.Generic 28 | @test addrspace(global_voidptr_a) == AS.Global 29 | @test addrspace(local_voidptr_a) == AS.Local 30 | 31 | # comparisons 32 | @test generic_voidptr_a != global_voidptr_a 33 | @test generic_voidptr_a != generic_intptr_b 34 | 35 | 36 | @testset 
"conversions" begin 37 | 38 | # between host and device pointers 39 | 40 | @test convert(CuPtr{Cvoid}, generic_voidptr_a) == voidptr_a 41 | @test convert(CUDAnative.DevicePtr{Cvoid}, voidptr_a) == generic_voidptr_a 42 | @test convert(CUDAnative.DevicePtr{Cvoid,AS.Global}, voidptr_a) == global_voidptr_a 43 | 44 | 45 | # between device pointers 46 | 47 | @test_throws ArgumentError convert(typeof(local_voidptr_a), global_voidptr_a) 48 | @test convert(typeof(generic_voidptr_a), generic_voidptr_a) == generic_voidptr_a 49 | @test convert(typeof(global_voidptr_a), global_voidptr_a) == global_voidptr_a 50 | @test Base.unsafe_convert(typeof(local_voidptr_a), global_voidptr_a) == local_voidptr_a 51 | 52 | @test convert(typeof(global_voidptr_a), global_intptr_b) == global_voidptr_b 53 | @test convert(typeof(generic_voidptr_a), global_intptr_b) == generic_voidptr_b 54 | @test convert(typeof(global_voidptr_a), generic_intptr_b) == global_voidptr_b 55 | 56 | @test convert(CUDAnative.DevicePtr{Cvoid}, global_intptr_b) == global_voidptr_b 57 | 58 | end 59 | 60 | end 61 | -------------------------------------------------------------------------------- /test/perf/launch_overhead/cuda.c: -------------------------------------------------------------------------------- 1 | // C version 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | #define check(err) __check(err, __FILE__, __LINE__) 11 | void __check(CUresult err, const char *file, const int line) { 12 | if (CUDA_SUCCESS != err) { 13 | const char *msg; 14 | cuGetErrorName(err, &msg); 15 | fprintf(stderr, "CUDA error: %s (%04d) at %s:%i.\n", msg, err, file, line); 16 | exit(-1); 17 | } 18 | } 19 | 20 | const size_t len = 1000; 21 | const size_t ITERATIONS = 100; 22 | 23 | int main(int argc, char **argv) { 24 | check(cuInit(0x0)); 25 | 26 | CUdevice dev; 27 | check(cuDeviceGet(&dev, 0)); 28 | 29 | CUcontext ctx; 30 | check(cuCtxCreate(&ctx, 0, dev)); 31 | 32 | CUmodule mod; 33 | check(cuModuleLoad(&mod, "cuda.ptx")); 34 | 35 | CUfunction fun; 36 | check(cuModuleGetFunction(&fun, mod, "kernel_dummy")); 37 | 38 | CUdeviceptr gpu_arr; 39 | check(cuMemAlloc(&gpu_arr, sizeof(float) * len)); 40 | 41 | float cpu_time[ITERATIONS]; 42 | float gpu_time[ITERATIONS]; 43 | 44 | for (int i = 0; i < ITERATIONS; i++) { 45 | if (i == ITERATIONS - 5) 46 | check(cuProfilerStart()); 47 | 48 | struct timespec cpu_t0, cpu_t1; 49 | clock_gettime(CLOCK_MONOTONIC, &cpu_t0); 50 | 51 | CUevent gpu_t0, gpu_t1; 52 | check(cuEventCreate(&gpu_t0, 0x0)); 53 | check(cuEventCreate(&gpu_t1, 0x0)); 54 | 55 | check(cuEventRecord(gpu_t0, NULL)); 56 | 57 | void *args[3] = {&gpu_arr}; 58 | check(cuLaunchKernel(fun, len, 1, 1, 1, 1, 1, 0, 0, args, 0)); 59 | 60 | check(cuEventRecord(gpu_t1, NULL)); 61 | check(cuEventSynchronize(gpu_t1)); 62 | 63 | clock_gettime(CLOCK_MONOTONIC, &cpu_t1); 64 | 65 | check(cuEventElapsedTime(&gpu_time[i], gpu_t0, gpu_t1)); 66 | gpu_time[i] *= 1000; 67 | 68 | cpu_time[i] = (cpu_t1.tv_sec - cpu_t0.tv_sec) + 69 | (cpu_t1.tv_nsec - cpu_t0.tv_nsec) / 1000.; 70 | } 71 | check(cuProfilerStop()); 72 | 73 | double mean_cpu = 0; 74 | double mean_gpu = 0; 75 | int i; 76 | for (i = 1; i < ITERATIONS ; ++i) { 77 | mean_cpu += cpu_time[i]; 78 | mean_gpu += gpu_time[i]; 79 | } 80 | mean_cpu /= (ITERATIONS-1); 81 | mean_gpu /= (ITERATIONS-1); 82 | 83 | double std_cpu = 0; 84 | double std_gpu = 0; 85 | for (i = 1; i < ITERATIONS ; ++i ) { 86 | std_cpu += pow((cpu_time[i] - mean_cpu), 2); 87 | std_gpu += pow((gpu_time[i] - mean_gpu), 2); 88 | } 89 | 
std_cpu = sqrt(std_cpu / (ITERATIONS-1)); 90 | std_gpu = sqrt(std_gpu / (ITERATIONS-1)); 91 | 92 | printf("CPU time: %.2f +/- %.2f us\n", mean_cpu, std_cpu); 93 | printf("GPU time: %.2f +/- %.2f us\n", mean_gpu, std_gpu); 94 | 95 | return 0; 96 | } 97 | -------------------------------------------------------------------------------- /examples/reduce/benchmark.jl: -------------------------------------------------------------------------------- 1 | # EXCLUDE FROM TESTING 2 | 3 | using BenchmarkTools 4 | BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10 5 | BenchmarkTools.DEFAULT_PARAMETERS.gcsample = true 6 | 7 | include("reduce.jl") 8 | 9 | CUDAnative.initialize() 10 | const dev = device() 11 | const cap = capability(dev) 12 | @assert(cap >= v"3.0", "this example requires a newer GPU") 13 | 14 | len = 10^7 15 | input = ones(Int32, len) 16 | 17 | 18 | ## CPU 19 | 20 | benchmark_cpu = @benchmarkable begin 21 | reduce(+, input) 22 | end 23 | 24 | @show run(benchmark_cpu) 25 | 26 | 27 | 28 | ## CUDAnative 29 | 30 | # PTX generation 31 | open(joinpath(@__DIR__, "reduce.jl.ptx"), "w") do f 32 | CUDAnative.code_ptx(f, reduce_grid, Tuple{typeof(+), CuDeviceVector{Int32,AS.Global}, 33 | CuDeviceVector{Int32,AS.Global}, Int32}; 34 | cap=v"6.1.0") 35 | end 36 | 37 | benchmark_gpu = @benchmarkable begin 38 | gpu_reduce(+, gpu_input, gpu_output) 39 | val = Array(gpu_output)[1] 40 | end setup=( 41 | val = nothing; 42 | gpu_input = CuArray($input); 43 | gpu_output = similar(gpu_input) 44 | ) teardown=( 45 | gpu_input = nothing; 46 | gpu_output = nothing 47 | ) 48 | 49 | @show run(benchmark_gpu) 50 | 51 | 52 | ## CUDA 53 | 54 | using CUDAapi 55 | using Libdl 56 | 57 | cd(@__DIR__) do 58 | toolkit = CUDAapi.find_toolkit() 59 | nvcc = CUDAapi.find_cuda_binary("nvcc", toolkit) 60 | toolchain = CUDAapi.find_toolchain(toolkit) 61 | flags = `-ccbin=$(toolchain.host_compiler) -arch=sm_$(cap.major)$(cap.minor)` 62 | run(`$nvcc $flags -ptx -o reduce.cu.ptx reduce.cu`) 63 | run(`$nvcc $flags -shared --compiler-options '-fPIC' -o reduce.so reduce.cu`) 64 | end 65 | 66 | # Entry-point wrappers 67 | lib = Libdl.dlopen(joinpath(@__DIR__, "reduce.so")) 68 | setup_cuda(input) = ccall(Libdl.dlsym(lib, "setup"), Ptr{Cvoid}, 69 | (Ptr{Cint}, Csize_t), input, length(input)) 70 | run_cuda(state) = ccall(Libdl.dlsym(lib, "run"), Cint, 71 | (Ptr{Cvoid},), state) 72 | teardown_cuda(state) = ccall(Libdl.dlsym(lib, "teardown"), Cvoid, 73 | (Ptr{Cvoid},), state) 74 | 75 | # Correctness check (not part of verify.jl which is meant to run during testing) 76 | using Test 77 | let 78 | cuda_state = setup_cuda(input) 79 | cuda_val = run_cuda(cuda_state) 80 | teardown_cuda(cuda_state) 81 | @assert cuda_val == reduce(+, input) 82 | end 83 | 84 | benchmark_cuda = @benchmarkable begin 85 | val = run_cuda(state) 86 | end setup=( 87 | val = nothing; 88 | state = setup_cuda($input); 89 | ) teardown=( 90 | teardown_cuda(state) 91 | ) 92 | 93 | @show run(benchmark_cuda) 94 | -------------------------------------------------------------------------------- /docs/src/man/troubleshooting.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | 3 | To increase logging verbosity of the CUDAnative compiler, launch Julia with the 4 | `JULIA_DEBUG` environment variable set to `CUDAnative`. 5 | 6 | 7 | ## LLVM IR generated for ... is not GPU compatible 8 | 9 | Not all of Julia is supported by CUDAnative. 
Several commonly-used features, 10 | like strings or exceptions, will not compile to GPU code, because of their 11 | interactions with the CPU-only runtime library. 12 | 13 | When not using GPU-incompatible language features, you might still run into this 14 | compiler error when your code contains type instabilities or other dynamic 15 | behavior. These are often easily spotted by prefixing the failing function call 16 | with one of several `@device_code` macros. 17 | 18 | For example, say we define and execute the following kernel: 19 | 20 | ```julia 21 | julia> kernel(a) = @inbounds a[threadId().x] = 0 22 | kernel (generic function with 1 method) 23 | 24 | julia> @cuda kernel(CuArray([1])) 25 | ERROR: LLVM IR generated for Kernel(CuDeviceArray{Int64,1,CUDAnative.AS.Global}) is not GPU compatible 26 | ``` 27 | 28 | When running with `JULIA_DEBUG=CUDAnative`, you will get to see the actual 29 | incompatible IR constructs. Prefixing our kernel invocation with 30 | `@device_code_warntype` reveals our issue: 31 | 32 | ```julia 33 | julia> @device_code_warntype @cuda kernel(CuArray([1])) 34 | Variables: 35 | a::CuDeviceArray{Int64,1,CUDAnative.AS.Global} 36 | val 37 | 38 | Body: 39 | begin 40 | Core.SSAValue(1) = (Main.threadId)()::ANY 41 | Core.SSAValue(2) = (Base.getproperty)(Core.SSAValue(1), :x)::ANY 42 | (Base.setindex!)(a::CuDeviceArray{Int64,1,CUDAnative.AS.Global}, 0, Core.SSAValue(2))::ANY 43 | return 0 44 | end::Int64 45 | ERROR: LLVM IR generated for Kernel(CuDeviceArray{Int64,1,CUDAnative.AS.Global}) is not GPU compatible 46 | ``` 47 | 48 | Because of a typo, the call to `threadId` is untyped and returns `Any` (it 49 | should have been `threadIdx`). In the future, we expect to be able to catch such 50 | errors automatically. 51 | 52 | If you want to dump all forms of generated code to disk, for further inspection, 53 | have a look at the `@device_code` macro instead. 54 | 55 | 56 | ## Debug info and line-number information 57 | 58 | LLVM's NVPTX back-end does not support the undocumented PTX debug format, so we cannot 59 | generate the necessary DWARF sections. This means that debugging generated code with e.g. 60 | `cuda-gdb` will be an unpleasant experience. Nonetheless, the PTX JIT is configured to emit 61 | debug info (which corresponds with `nvcc -G`) when the Julia debug info level is 2 or 62 | higher (`julia -g2`). 63 | 64 | We do however support emitting line number information, which is useful for other CUDA tools 65 | like `cuda-memcheck`. The functionality (which corresponds with `nvcc -lineinfo`) is enabled 66 | when the Julia debug info level is 1 (the default value). It can be disabled by passing `-g0` 67 | instead. 
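Returning to the `threadId` example from above: with the typo fixed (and an explicit `nothing` return value, as kernel functions are expected to return `nothing`), the invocation compiles cleanly. The following is a minimal sketch that assumes CuArrays.jl is loaded to provide the `CuArray` constructor:

```julia
using CUDAnative, CuArrays

# the corrected kernel: `threadIdx` instead of the mistyped `threadId`
function kernel(a)
    @inbounds a[threadIdx().x] = 0
    return nothing
end

@cuda kernel(CuArray([1]))
```

Re-running the invocation under `@device_code_warntype` should now show concrete types instead of `ANY`-typed calls.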
68 | -------------------------------------------------------------------------------- /res/parse_libdevice.jl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env julia 2 | 3 | # Script to parse and compare the libdevice PDF manual against our list of intrinsics 4 | 5 | function parse_intrinsics(cb) 6 | fn = joinpath(@__DIR__, "..", "src", "device", "libdevice.jl") 7 | open(fn) do f 8 | for ln in eachline(f) 9 | m = match(r"@wrap ([\w.]+\(.+?\)::\w+)", ln) 10 | if m != nothing 11 | cb(replace(m.captures[1], r"\w+::", "::")) 12 | end 13 | end 14 | end 15 | end 16 | 17 | function parse_libdevice(fn, cb) 18 | open(fn) do f 19 | next_proto = false 20 | number = 0 21 | 22 | for ln in eachline(f) 23 | if (m = match(r"^\d\.(\d+)\..", ln); m != nothing) 24 | number = parse(Int, m.captures[1]) 25 | elseif occursin(r"^Prototype:", ln) 26 | next_proto = true 27 | elseif next_proto 28 | cb(chomp(ln), number) 29 | next_proto = false 30 | end 31 | end 32 | end 33 | end 34 | 35 | function main(args) 36 | if length(args) != 1 37 | println("Usage: $(basename(@__FILE__)) LIBDEVICE_PDF") 38 | exit(1) 39 | end 40 | pdf = args[1] 41 | isfile(pdf) || error("input PDF does not exist") 42 | 43 | wrapped = Set{String}() 44 | parse_intrinsics(intr -> push!(wrapped, intr)) 45 | 46 | intrinsics = Set{String}() 47 | numbering = Dict{String,Number}() 48 | txt = tempname() 49 | run(`pdftotext $pdf $txt`) 50 | parse_libdevice(txt, (proto, number) -> begin 51 | m = match(r"^(\w+) (@[\w.]+)\((.*?)\)", proto) 52 | if m != nothing 53 | rettype = m.captures[1] 54 | fn = m.captures[2] 55 | arglist = m.captures[3] 56 | 57 | argpairs = split(arglist, ", ") 58 | argtypes, args = zip(map(argpair -> split(argpair, " "), argpairs)...) 59 | 60 | wrap_fn = strip(fn, '@') 61 | wrap_argtypes = map(argtyp -> endswith(argtyp, '*') ? "Ptr{$(argtyp[1:end-1])}" 62 | : argtyp, argtypes) 63 | wrap_args = map(arg -> strip(arg, '%'), args) 64 | wrap_arglist = join(["$arg::$argtyp" for (arg, argtyp) in zip(wrap_args, wrap_argtypes)], ", ") 65 | 66 | intr = "$wrap_fn($wrap_arglist)::$rettype" 67 | push!(intrinsics, intr) 68 | numbering[intr] = number 69 | end 70 | end) 71 | rm(txt) 72 | 73 | missing = setdiff(intrinsics, wrapped) 74 | superfluous = setdiff(wrapped, intrinsics) 75 | 76 | println("Missing intrinsics:") 77 | for intr in sort(collect(missing), lt=(a,b)->numbering[a] When a function with a variable-length argument list is called, the variable 7 | # > arguments are passed using C's old ``default argument promotions.'' These say that 8 | # > types char and short int are automatically promoted to int, and type float is 9 | # > automatically promoted to double. Therefore, varargs functions will never receive 10 | # > arguments of type char, short int, or float. 11 | 12 | if arg == Cchar || arg == Cshort 13 | return :(Cint(arg)) 14 | elseif arg == Cfloat 15 | return :(Cdouble(arg)) 16 | else 17 | return :(arg) 18 | end 19 | end 20 | 21 | """ 22 | Print a formatted string in device context on the host standard output: 23 | 24 | @cuprintf("%Fmt", args...) 25 | 26 | Note that this is not a fully C-compliant `printf` implementation; see the CUDA 27 | documentation for supported options and inputs. 28 | 29 | Also beware that it is an untyped, and unforgiving `printf` implementation. Type widths need 30 | to match, eg. printing a 64-bit Julia integer requires the `%ld` formatting string. 31 | """ 32 | macro cuprintf(fmt::String, args...) 
33 | fmt_val = Val(Symbol(fmt)) 34 | 35 | return :(_cuprintf($fmt_val, $(map(arg -> :(promote_c_argument($arg)), esc.(args))...))) 36 | end 37 | 38 | @generated function _cuprintf(::Val{fmt}, argspec...) where {fmt} 39 | arg_exprs = [:( argspec[$i] ) for i in 1:length(argspec)] 40 | arg_types = [argspec...] 41 | 42 | T_void = LLVM.VoidType(JuliaContext()) 43 | T_int32 = LLVM.Int32Type(JuliaContext()) 44 | T_pint8 = LLVM.PointerType(LLVM.Int8Type(JuliaContext())) 45 | 46 | # create functions 47 | param_types = LLVMType[convert.(LLVMType, arg_types)...] 48 | llvm_f, _ = create_function(T_int32, param_types) 49 | mod = LLVM.parent(llvm_f) 50 | 51 | # generate IR 52 | Builder(JuliaContext()) do builder 53 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 54 | position!(builder, entry) 55 | 56 | str = globalstring_ptr!(builder, String(fmt)) 57 | 58 | # construct and fill args buffer 59 | if isempty(argspec) 60 | buffer = LLVM.PointerNull(T_pint8) 61 | else 62 | argtypes = LLVM.StructType("printf_args", JuliaContext()) 63 | elements!(argtypes, param_types) 64 | 65 | args = alloca!(builder, argtypes) 66 | for (i, param) in enumerate(parameters(llvm_f)) 67 | p = struct_gep!(builder, args, i-1) 68 | store!(builder, param, p) 69 | end 70 | 71 | buffer = bitcast!(builder, args, T_pint8) 72 | end 73 | 74 | # invoke vprintf and return 75 | vprintf_typ = LLVM.FunctionType(T_int32, [T_pint8, T_pint8]) 76 | vprintf = LLVM.Function(mod, "vprintf", vprintf_typ) 77 | chars = call!(builder, vprintf, [str, buffer]) 78 | 79 | ret!(builder, chars) 80 | end 81 | 82 | arg_tuple = Expr(:tuple, arg_exprs...) 83 | call_function(llvm_f, Int32, Tuple{arg_types...}, arg_tuple) 84 | end 85 | -------------------------------------------------------------------------------- /src/compiler/driver.jl: -------------------------------------------------------------------------------- 1 | # compiler driver and main interface 2 | 3 | # (::CompilerContext) 4 | const compile_hook = Ref{Union{Nothing,Function}}(nothing) 5 | 6 | """ 7 | compile(dev::CuDevice, f, tt; kwargs...) 8 | 9 | Compile a function `f` invoked with types `tt` for device `dev`, returning the compiled 10 | function module respectively of type `CuFuction` and `CuModule`. 11 | 12 | For a list of supported keyword arguments, refer to the documentation of 13 | [`cufunction`](@ref). 14 | """ 15 | function compile(dev::CuDevice, @nospecialize(f::Core.Function), @nospecialize(tt); kwargs...) 16 | CUDAnative.configured || error("CUDAnative.jl has not been configured; cannot JIT code.") 17 | 18 | module_asm, module_entry = compile(supported_capability(dev), f, tt; kwargs...) 19 | 20 | # enable debug options based on Julia's debug setting 21 | jit_options = Dict{CUDAdrv.CUjit_option,Any}() 22 | if Base.JLOptions().debug_level == 1 23 | jit_options[CUDAdrv.GENERATE_LINE_INFO] = true 24 | elseif Base.JLOptions().debug_level >= 2 25 | jit_options[CUDAdrv.GENERATE_DEBUG_INFO] = true 26 | end 27 | cuda_mod = CuModule(module_asm, jit_options) 28 | cuda_fun = CuFunction(cuda_mod, module_entry) 29 | 30 | return cuda_fun, cuda_mod 31 | end 32 | 33 | # same as above, but without an active device 34 | function compile(cap::VersionNumber, @nospecialize(f), @nospecialize(tt); 35 | kernel=true, kwargs...) 36 | ctx = CompilerContext(f, tt, cap, kernel; kwargs...) 
37 | 38 | return compile(ctx) 39 | end 40 | 41 | function compile(ctx::CompilerContext) 42 | if compile_hook[] != nothing 43 | hook = compile_hook[] 44 | compile_hook[] = nothing 45 | 46 | global globalUnique 47 | previous_globalUnique = globalUnique 48 | 49 | hook(ctx) 50 | 51 | globalUnique = previous_globalUnique 52 | compile_hook[] = hook 53 | end 54 | 55 | 56 | ## high-level code generation (Julia AST) 57 | 58 | @debug "(Re)compiling function" ctx 59 | 60 | check_method(ctx) 61 | 62 | 63 | ## low-level code generation (LLVM IR) 64 | 65 | mod, entry = irgen(ctx) 66 | 67 | need_library(lib) = any(f -> isdeclaration(f) && 68 | intrinsic_id(f) == 0 && 69 | haskey(functions(lib), LLVM.name(f)), 70 | functions(mod)) 71 | 72 | libdevice = load_libdevice(ctx.cap) 73 | if need_library(libdevice) 74 | link_libdevice!(ctx, mod, libdevice) 75 | end 76 | 77 | # optimize the IR 78 | entry = optimize!(ctx, mod, entry) 79 | 80 | runtime = load_runtime(ctx.cap) 81 | if need_library(runtime) 82 | link_library!(ctx, mod, runtime) 83 | end 84 | 85 | prepare_execution!(ctx, mod) 86 | 87 | check_invocation(ctx, entry) 88 | 89 | # check generated IR 90 | check_ir(ctx, mod) 91 | verify(mod) 92 | 93 | 94 | ## machine code generation (PTX assembly) 95 | 96 | module_asm = mcgen(ctx, mod, entry) 97 | 98 | return module_asm, LLVM.name(entry) 99 | end 100 | -------------------------------------------------------------------------------- /src/device/cuda_intrinsics/synchronization.jl: -------------------------------------------------------------------------------- 1 | # Synchronization (B.6) 2 | 3 | export sync_threads, sync_warp 4 | export threadfence, threadfence_block, threadfence_system 5 | 6 | """ 7 | sync_threads() 8 | 9 | Waits until all threads in the thread block have reached this point and all global and 10 | shared memory accesses made by these threads prior to `sync_threads()` are visible to all 11 | threads in the block. 12 | """ 13 | @inline sync_threads() = ccall("llvm.nvvm.barrier0", llvmcall, Cvoid, ()) 14 | 15 | """ 16 | sync_warp(mask::Integer=0xffffffff) 17 | 18 | Waits until the threads in the warp, selected by means of the bitmask `mask`, have reached this point 19 | and all global and shared memory accesses made by these threads prior to `sync_warp()` are 20 | visible to those threads in the warp. The default value for `mask` selects all threads in 21 | the warp. 22 | 23 | !!! note 24 | Requires CUDA >= 9.0 and sm_6.2 25 | """ 26 | sync_warp 27 | 28 | if cuda_driver_version >= v"9.0" && v"6.0" in ptx_support 29 | @inline function sync_warp(mask::Integer=0xffffffff) 30 | @asmcall("bar.warp.sync \$0;", "r", true, 31 | Cvoid, Tuple{UInt32}, convert(UInt32, mask)) 32 | end 33 | else 34 | @inline sync_warp(mask::Integer=0xffffffff) = nothing 35 | end 36 | 37 | """ 38 | threadfence_block() 39 | 40 | A memory fence that ensures that: 41 | - All writes to all memory made by the calling thread before the call to `threadfence_block()` 42 | are observed by all threads in the block of the calling thread as occurring before all writes 43 | to all memory made by the calling thread after the call to `threadfence_block()` 44 | - All reads from all memory made by the calling thread before the call to `threadfence_block()` 45 | are ordered before all reads from all memory made by the calling thread after the call to `threadfence_block()`.
46 | """ 47 | @inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ()) 48 | 49 | """ 50 | threadfence() 51 | 52 | A memory fence that acts as [`threadfence_block`](@ref) for all threads in the block of the 53 | calling thread and also ensures that no writes to all memory made by the calling thread after 54 | the call to `threadfence()` are observed by any thread in the device as occurring before any 55 | write to all memory made by the calling thread before the call to `threadfence()`. 56 | 57 | Note that for this ordering guarantee to be true, the observing threads must truly observe the 58 | memory and not cached versions of it; this is requires the use of volatile loads and stores, 59 | which is not available from Julia right now. 60 | """ 61 | @inline threadfence() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ()) 62 | 63 | """ 64 | threadfence_system() 65 | 66 | A memory fence that acts as [`threadfence_block`](@ref) for all threads in the block of the 67 | calling thread and also ensures that all writes to all memory made by the calling thread 68 | before the call to `threadfence_system()` are observed by all threads in the device, 69 | host threads, and all threads in peer devices as occurring before all writes to all 70 | memory made by the calling thread after the call to `threadfence_system()`. 71 | """ 72 | @inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ()) 73 | 74 | -------------------------------------------------------------------------------- /src/compiler/mcgen.jl: -------------------------------------------------------------------------------- 1 | # machine code generation 2 | 3 | function machine(cap::VersionNumber, triple::String) 4 | InitializeNVPTXTarget() 5 | InitializeNVPTXTargetInfo() 6 | t = Target(triple) 7 | 8 | InitializeNVPTXTargetMC() 9 | cpu = "sm_$(cap.major)$(cap.minor)" 10 | if cuda_driver_version >= v"9.0" && v"6.0" in ptx_support 11 | # in the case of CUDA 9, we use sync intrinsics from PTX ISA 6.0+ 12 | feat = "+ptx60" 13 | else 14 | feat = "" 15 | end 16 | tm = TargetMachine(t, triple, cpu, feat) 17 | asm_verbosity!(tm, true) 18 | 19 | return tm 20 | end 21 | 22 | # final preparations for the module to be compiled to PTX 23 | # these passes should not be run when e.g. compiling to write to disk. 24 | function prepare_execution!(ctx::CompilerContext, mod::LLVM.Module) 25 | let pm = ModulePassManager() 26 | global global_ctx 27 | global_ctx = ctx 28 | 29 | global_optimizer!(pm) 30 | 31 | add!(pm, ModulePass("ResolveCPUReferences", resolve_cpu_references!)) 32 | 33 | global_dce!(pm) 34 | strip_dead_prototypes!(pm) 35 | 36 | run!(pm, mod) 37 | dispose(pm) 38 | end 39 | 40 | return 41 | end 42 | 43 | # some Julia code contains references to objects in the CPU run-time, 44 | # without actually using the contents or functionality of those objects. 45 | # 46 | # prime example are type tags, which reference the address of the allocated type. 47 | # since those references are ephemeral, we can't eagerly resolve and emit them in the IR, 48 | # but at the same time the GPU can't resolve them at run-time. 49 | # 50 | # this pass performs that resolution at link time. 
51 | function resolve_cpu_references!(mod::LLVM.Module) 52 | ctx = global_ctx::CompilerContext 53 | changed = false 54 | 55 | for f in functions(mod) 56 | fn = LLVM.name(f) 57 | if isdeclaration(f) && intrinsic_id(f) == 0 && startswith(fn, "jl_") 58 | # eagerly resolve the address of the binding 59 | address = ccall(:jl_cglobal, Any, (Any, Any), fn, UInt) 60 | dereferenced = unsafe_load(address) 61 | dereferenced = LLVM.ConstantInt(dereferenced, JuliaContext()) 62 | 63 | function replace_bindings!(value) 64 | changed = false 65 | for use in uses(value) 66 | val = user(use) 67 | if isa(val, LLVM.ConstantExpr) 68 | # recurse 69 | changed |= replace_bindings!(val) 70 | elseif isa(val, LLVM.LoadInst) 71 | # resolve 72 | replace_uses!(val, dereferenced) 73 | unsafe_delete!(LLVM.parent(val), val) 74 | # FIXME: iterator invalidation? 75 | changed = true 76 | end 77 | end 78 | changed 79 | end 80 | 81 | changed |= replace_bindings!(f) 82 | end 83 | end 84 | 85 | return changed 86 | end 87 | 88 | function mcgen(ctx::CompilerContext, mod::LLVM.Module, f::LLVM.Function) 89 | tm = machine(ctx.cap, triple(mod)) 90 | 91 | InitializeNVPTXAsmPrinter() 92 | return String(emit(tm, mod, LLVM.API.LLVMAssemblyFile)) 93 | end 94 | -------------------------------------------------------------------------------- /src/compiler/common.jl: -------------------------------------------------------------------------------- 1 | # common functionality 2 | 3 | struct CompilerContext 4 | # core invocation 5 | f::Core.Function 6 | tt::DataType 7 | cap::VersionNumber 8 | kernel::Bool 9 | 10 | # optional properties 11 | minthreads::Union{Nothing,CuDim} 12 | maxthreads::Union{Nothing,CuDim} 13 | blocks_per_sm::Union{Nothing,Integer} 14 | maxregs::Union{Nothing,Integer} 15 | 16 | CompilerContext(f, tt, cap, kernel; 17 | minthreads=nothing, maxthreads=nothing, 18 | blocks_per_sm=nothing, maxregs=nothing) = 19 | new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs) 20 | end 21 | 22 | # global context reference 23 | # FIXME: thread through `ctx` everywhere (deadlocks the Julia compiler when doing so with 24 | # the LLVM passes in CUDAnative) 25 | global_ctx = nothing 26 | 27 | 28 | function signature(ctx::CompilerContext) 29 | fn = typeof(ctx.f).name.mt.name 30 | args = join(ctx.tt.parameters, ", ") 31 | return "$fn($(join(ctx.tt.parameters, ", ")))" 32 | end 33 | 34 | 35 | struct KernelError <: Exception 36 | ctx::CompilerContext 37 | message::String 38 | help::Union{Nothing,String} 39 | bt::StackTraces.StackTrace 40 | 41 | KernelError(ctx::CompilerContext, message::String, help=nothing; 42 | bt=StackTraces.StackTrace()) = 43 | new(ctx, message, help, bt) 44 | end 45 | 46 | function Base.showerror(io::IO, err::KernelError) 47 | println(io, "GPU compilation of $(signature(err.ctx)) failed") 48 | println(io, "KernelError: $(err.message)") 49 | println(io) 50 | println(io, something(err.help, "Try inspecting the generated code with any of the @device_code_... macros.")) 51 | Base.show_backtrace(io, err.bt) 52 | end 53 | 54 | 55 | struct InternalCompilerError <: Exception 56 | ctx::CompilerContext 57 | message::String 58 | meta::Dict 59 | InternalCompilerError(ctx, message; kwargs...) = new(ctx, message, kwargs) 60 | end 61 | 62 | function Base.showerror(io::IO, err::InternalCompilerError) 63 | println(io, """CUDAnative.jl encountered an unexpected internal compiler error. 
64 | Please file an issue attaching the following information, including the backtrace, 65 | as well as a reproducible example (if possible).""") 66 | 67 | println(io, "\nInternalCompilerError: $(err.message)") 68 | 69 | println(io, "\nCompiler invocation:") 70 | for field in fieldnames(CompilerContext) 71 | println(io, " - $field = $(repr(getfield(err.ctx, field)))") 72 | end 73 | 74 | if !isempty(err.meta) 75 | println(io, "\nAdditional information:") 76 | for (key,val) in err.meta 77 | println(io, " - $key = $(repr(val))") 78 | end 79 | end 80 | 81 | println(io, "\nInstalled packages:") 82 | for (pkg,ver) in Pkg.installed() 83 | println(io, " - $pkg = $ver") 84 | end 85 | 86 | println(io) 87 | versioninfo(io) 88 | end 89 | 90 | macro compiler_assert(ex, ctx, kwargs...) 91 | msg = "$ex, at $(__source__.file):$(__source__.line)" 92 | return :($(esc(ex)) ? $(nothing) 93 | : throw(InternalCompilerError($(esc(ctx)), $msg; 94 | $(map(esc, kwargs)...))) 95 | ) 96 | end 97 | 98 | 99 | # maintain our own "global unique" suffix for disambiguating kernels 100 | globalUnique = 0 101 | -------------------------------------------------------------------------------- /examples/reduce/reduce.cu: -------------------------------------------------------------------------------- 1 | // Fast parallel reduction for Kepler hardware 2 | // 3 | // Based on devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ 4 | 5 | #include 6 | #include 7 | 8 | #define APICALL(code) { check_code((code), __FILE__, __LINE__); } 9 | inline void check_code(cudaError_t code, const char *file, int line) 10 | { 11 | if (code != cudaSuccess) 12 | { 13 | fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line); 14 | exit(code); 15 | } 16 | } 17 | 18 | 19 | // 20 | // Main implementation 21 | // 22 | 23 | // Reduce a value across a warp 24 | __inline__ __device__ 25 | int sumReduce_warp(int val) { 26 | for (int offset = warpSize/2; offset > 0; offset /= 2) 27 | val += __shfl_down(val, offset); 28 | return val; 29 | } 30 | 31 | // Reduce a value across a block, using shared memory for communication 32 | __inline__ __device__ int sumReduce_block(int val) { 33 | // shared mem for 32 partial sums 34 | static __shared__ int shared[32]; 35 | 36 | int lane = threadIdx.x % warpSize; 37 | int wid = threadIdx.x / warpSize; 38 | 39 | // each warp performs partial reduction 40 | val = sumReduce_warp(val); 41 | 42 | // write reduced value to shared memory 43 | if (lane==0) shared[wid]=val; 44 | 45 | // wait for all partial reductions 46 | __syncthreads(); 47 | 48 | // read from shared memory only if that warp existed 49 | val = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : 0; 50 | 51 | // final reduce within first warp 52 | if (wid==0) { 53 | val = sumReduce_warp(val); 54 | } 55 | 56 | return val; 57 | } 58 | 59 | // Reduce an array across a complete grid 60 | __global__ void sumReduce_grid(int *input, int* output, int N) { 61 | int sum = 0; 62 | 63 | // reduce multiple elements per thread (grid-stride loop) 64 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; 65 | i < N; 66 | i += blockDim.x * gridDim.x) { 67 | sum += input[i]; 68 | } 69 | 70 | sum = sumReduce_block(sum); 71 | 72 | if (threadIdx.x==0) 73 | output[blockIdx.x]=sum; 74 | } 75 | 76 | void sumReduce(int *input, int* output, int N) { 77 | int threads = 512; 78 | int blocks = min((N + threads - 1) / threads, 1024); 79 | 80 | sumReduce_grid<<>>(input, output, N); 81 | sumReduce_grid<<<1, 1024>>>(output, output, blocks); 82 | } 83 | 84 | 85 | // 86 | // Benchmark entry-points 87 | // 88 | 89 | struct State 90 | { 91 | size_t len; 92 | int *gpu_input; 93 | int *gpu_output; 94 | }; 95 | 96 | extern "C" 97 | State *setup(int *input, size_t len) 98 | { 99 | State *state = new State(); 100 | 101 | state->len = len; 102 | 103 | APICALL(cudaMalloc(&state->gpu_input, len*sizeof(int))); 104 | APICALL(cudaMemcpy(state->gpu_input, input, len*sizeof(int), cudaMemcpyHostToDevice)); 105 | APICALL(cudaMalloc(&state->gpu_output, len*sizeof(int))); 106 | 107 | return state; 108 | } 109 | 110 | extern "C" 111 | int run(State *state) 112 | { 113 | sumReduce(state->gpu_input, state->gpu_output, state->len); 114 | 115 | int* output = (int*) malloc(state->len * sizeof(int)); 116 | APICALL(cudaMemcpy(output, state->gpu_output, state->len*sizeof(int), cudaMemcpyDeviceToHost)); 117 | int val = output[0]; 118 | free(output); 119 | 120 | return val; 121 | } 122 | 123 | extern "C" 124 | void teardown(State *state) 125 | { 126 | APICALL(cudaFree(state->gpu_output)); 127 | APICALL(cudaFree(state->gpu_input)); 128 | } 129 | -------------------------------------------------------------------------------- /src/init.jl: -------------------------------------------------------------------------------- 1 | # Initialization 2 | 3 | export device! 4 | 5 | 6 | const initialized = Ref{Bool}(false) 7 | const device_contexts = Dict{CuDevice,CuContext}() 8 | 9 | # FIXME: support for flags (see `cudaSetDeviceFlags`) 10 | 11 | # API calls that are allowed without lazily initializing the CUDA library 12 | # 13 | # this list isn't meant to be complete (ie. many other API calls are actually allowed 14 | # without setting-up a context), and only serves to make multi-device applications possible. 15 | # 16 | # feel free to open a PR adding additional API calls, if you have a specific use for them. 17 | const preinit_apicalls = Set{Symbol}([ 18 | :cuDriverGetVersion, 19 | # device calls, commonly used to determine the most appropriate device 20 | :cuDeviceGet, 21 | :cuDeviceGetAttribute, 22 | :cuDeviceGetCount, 23 | :cuDeviceGetName, 24 | :cuDeviceTotalMem, 25 | # context calls, for testing 26 | :cuCtxGetCurrent 27 | ]) 28 | 29 | function maybe_initialize(apicall) 30 | initialized[] && return 31 | apicall in preinit_apicalls && return 32 | @debug "Initializing CUDA after call to $apicall" 33 | initialize() 34 | end 35 | 36 | function initialize(dev = CuDevice(0)) 37 | # NOTE: we could do something smarter here, 38 | # eg. 
select the most powerful device, 39 | # or skip devices without free memory 40 | device!(dev) 41 | end 42 | 43 | const device!_listeners = Set{Function}() 44 | 45 | """ 46 | device!(dev) 47 | 48 | Sets `dev` as the current active device for the calling host thread. Devices can be 49 | specified by integer id, or as a `CuDevice`. This is intended to be a low-cost operation, 50 | only performing significant work when calling it for the first time for each device. 51 | 52 | If your library or code needs to perform an action when the active device changes, add a 53 | callback of the signature `(::CuDevice, ::CuContext)` to the `device!_listeners` set. 54 | """ 55 | function device!(dev::CuDevice) 56 | if !initialized[] 57 | initialized[] = true 58 | CUDAdrv.apicall_hook[] = nothing 59 | end 60 | 61 | # NOTE: although these conceptually match what the primary context is for, 62 | # we don't use that because it is refcounted separately 63 | # and might confuse / be confused by user operations 64 | # (eg. calling `unsafe_reset!` on a primary context) 65 | if haskey(device_contexts, dev) 66 | ctx = device_contexts[dev] 67 | activate(ctx) 68 | else 69 | device_contexts[dev] = CuContext(dev) 70 | end 71 | 72 | for listener in device!_listeners 73 | listener(dev, device_contexts[dev]) 74 | end 75 | end 76 | device!(dev::Integer) = device!(CuDevice(dev)) 77 | 78 | """ 79 | device!(f, dev) 80 | 81 | Sets the active device for the duration of `f`. 82 | """ 83 | function device!(f::Function, dev::CuDevice) 84 | # FIXME: should use Push/Pop 85 | old_ctx = CuCurrentContext() 86 | try 87 | device!(dev) 88 | f() 89 | finally 90 | if old_ctx != nothing 91 | activate(old_ctx) 92 | end 93 | end 94 | end 95 | device!(f::Function, dev::Integer) = device!(f, CuDevice(dev)) 96 | 97 | function __init__() 98 | configured || return 99 | 100 | if CUDAdrv.version() != cuda_driver_version 101 | error("Your set-up has changed. Please run Pkg.build(\"CUDAnative\") and restart Julia.") 102 | end 103 | 104 | CUDAdrv.apicall_hook[] = maybe_initialize 105 | __init_compiler__() 106 | end 107 | -------------------------------------------------------------------------------- /docs/src/man/usage.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | 4 | ## Quick start 5 | 6 | First you have to write the kernel function and make sure it only uses features from the 7 | CUDA-supported subset of Julia: 8 | 9 | ```julia 10 | using CUDAnative 11 | 12 | function kernel_vadd(a, b, c) 13 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 14 | c[i] = a[i] + b[i] 15 | return nothing 16 | end 17 | ``` 18 | 19 | Using the `@cuda` macro, you can launch the kernel on a GPU of your choice: 20 | 21 | ```julia 22 | using CUDAdrv, CUDAnative, CuArrays 23 | using Test 24 | 25 | # CUDAdrv functionality: generate and upload data 26 | a = round.(rand(Float32, (3, 4)) * 100) 27 | b = round.(rand(Float32, (3, 4)) * 100) 28 | d_a = CuArray(a) 29 | d_b = CuArray(b) 30 | d_c = similar(d_a) # output array 31 | 32 | # run the kernel and fetch results 33 | # syntax: @cuda [kwargs...] kernel(args...) 34 | @cuda threads=12 kernel_vadd(d_a, d_b, d_c) 35 | 36 | # CUDAdrv functionality: download data 37 | # this synchronizes the device 38 | c = Array(d_c) 39 | 40 | @test a+b ≈ c 41 | ``` 42 | 43 | This code is executed in a default, global context for the first device in your 44 | system. 
Similar to `cudaSetDevice`, you can switch devices by calling 45 | CUDAnative's `device!` function: 46 | 47 | ```julia 48 | # change the active device 49 | device!(1) 50 | 51 | # the same, but only temporarily 52 | device!(2) do 53 | # ... 54 | end 55 | ``` 56 | 57 | To enable debug logging, launch Julia with the `JULIA_DEBUG` environment 58 | variable set to `CUDAnative`. 59 | 60 | 61 | 62 | ## Julia support 63 | 64 | Only a limited subset of Julia is supported by this package. This subset is undocumented, as 65 | it is too much in flux. 66 | 67 | In general, GPU support of Julia code is determined by the language features used by the 68 | code. Several parts of the language are downright disallowed, such as calls to the Julia 69 | runtime, or garbage allocations. Other features might get reduced in strength, eg. throwing 70 | exceptions will result in a `trap`. 71 | 72 | If your code is incompatible with GPU execution, the compiler will mention the unsupported 73 | feature, and where the use came from: 74 | 75 | ``` 76 | julia> foo(i) = (print("can't do this"); return nothing) 77 | foo (generic function with 1 method) 78 | 79 | julia> @cuda foo(1) 80 | ERROR: error compiling foo: error compiling print: generic call to unsafe_write requires the runtime language feature 81 | ``` 82 | 83 | In addition, the JIT doesn't support certain modes of compilation. For example, recursive 84 | functions require a proper cached compilation, which is currently absent. 85 | 86 | 87 | ## CUDA support 88 | 89 | Not all of CUDA is supported, and because of time constraints the supported subset is again 90 | undocumented. The following (incomplete) list details the support and their CUDAnative.jl 91 | names. Most are implemented in `intrinsics.jl`, so have a look at that file for a more up to 92 | date list: 93 | 94 | * Indexing: `threadIdx().{x,y,z}`, `blockDim()`, `blockIdx()`, `gridDim()`, `warpsize()` 95 | * Shared memory: `@cuStaticSharedMemory`, `@cuDynamicSharedMemory` 96 | * Array type: `CuDeviceArray` (converted from input `CuArray`s, or shared memory) 97 | * I/O: `@cuprintf` 98 | * Synchronization: `sync_threads` 99 | * Communication: `vote_{all,any,ballot}` 100 | * Data movement: `shfl_{up,down,bfly,idx}` 101 | 102 | ### `libdevice` 103 | 104 | In addition to the native intrinsics listed above, math functionality from `libdevice` is 105 | wrapped and part of CUDAnative. For now, you need to fully qualify function calls to these 106 | intrinsics, which provide similar functionality to some of the low-level math functionality 107 | of Base which would otherwise call out to `libm`. 108 | -------------------------------------------------------------------------------- /examples/reduce/reduce.jl: -------------------------------------------------------------------------------- 1 | # EXCLUDE FROM TESTING 2 | # this file doesn't have an entry point, see `verify.jl` instead 3 | 4 | # Fast parallel reduction for Kepler hardware 5 | # - uses shuffle and shared memory to reduce efficiently 6 | # - support for large arrays 7 | # 8 | # Based on devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ 9 | 10 | using CUDAdrv, CUDAnative, CuArrays 11 | 12 | 13 | # 14 | # Main implementation 15 | # 16 | 17 | # Reduce a value across a warp 18 | @inline function reduce_warp(op::F, val::T)::T where {F<:Function,T} 19 | offset = CUDAnative.warpsize() ÷ 2 20 | # TODO: this can be unrolled if warpsize is known... 
21 | while offset > 0 22 | val = op(val, shfl_down(val, offset)) 23 | offset ÷= 2 24 | end 25 | return val 26 | end 27 | 28 | # Reduce a value across a block, using shared memory for communication 29 | @inline function reduce_block(op::F, val::T)::T where {F<:Function,T} 30 | # shared mem for 32 partial sums 31 | shared = @cuStaticSharedMem(T, 32) 32 | 33 | wid, lane = fldmod1(threadIdx().x, CUDAnative.warpsize()) 34 | 35 | # each warp performs partial reduction 36 | val = reduce_warp(op, val) 37 | 38 | # write reduced value to shared memory 39 | if lane == 1 40 | @inbounds shared[wid] = val 41 | end 42 | 43 | # wait for all partial reductions 44 | sync_threads() 45 | 46 | # read from shared memory only if that warp existed 47 | @inbounds val = (threadIdx().x <= fld(blockDim().x, CUDAnative.warpsize())) ? shared[lane] : zero(T) 48 | 49 | # final reduce within first warp 50 | if wid == 1 51 | val = reduce_warp(op, val) 52 | end 53 | 54 | return val 55 | end 56 | 57 | # Reduce an array across a complete grid 58 | function reduce_grid(op::F, input::CuDeviceVector{T}, output::CuDeviceVector{T}, 59 | len::Integer) where {F<:Function,T} 60 | 61 | # TODO: neutral element depends on the operator (see Base's 2 and 3 argument `reduce`) 62 | val = zero(T) 63 | 64 | # reduce multiple elements per thread (grid-stride loop) 65 | # TODO: step range (see JuliaGPU/CUDAnative.jl#12) 66 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 67 | step = blockDim().x * gridDim().x 68 | while i <= len 69 | @inbounds val = op(val, input[i]) 70 | i += step 71 | end 72 | 73 | val = reduce_block(op, val) 74 | 75 | if threadIdx().x == 1 76 | @inbounds output[blockIdx().x] = val 77 | end 78 | 79 | return 80 | end 81 | 82 | """ 83 | Reduce a large array. 84 | 85 | Kepler-specific implementation, ie. you need sm_30 or higher to run this code. 86 | """ 87 | function gpu_reduce(op::Function, input::CuVector{T}, output::CuVector{T}) where {T} 88 | len = length(input) 89 | 90 | # TODO: these values are hardware-dependent, with recent GPUs supporting more threads 91 | threads = 512 92 | blocks = min((len + threads - 1) ÷ threads, 1024) 93 | 94 | # the output array must have a size equal to or larger than the number of thread blocks 95 | # in the grid because each block writes to a unique location within the array. 96 | if length(output) < blocks 97 | throw(ArgumentError("output array too small, should be at least $blocks elements")) 98 | end 99 | 100 | @cuda blocks=blocks threads=threads reduce_grid(op, input, output, len) 101 | @cuda threads=1024 reduce_grid(op, output, output, blocks) 102 | end 103 | 104 | 105 | # FURTHER IMPROVEMENTS: 106 | # - use atomic memory operations 107 | # - dynamic block/grid size based on device capabilities 108 | # - vectorized memory access 109 | # devblogs.nvidia.com/parallelforall/cuda-pro-tip-increase-performance-with-vectorized-memory-access/ 110 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | CUDAnative v1.0 release notes 2 | ============================= 3 | 4 | This document describes major features and user-facing changes to CUDAnative. 5 | 6 | 7 | New features 8 | ------------ 9 | 10 | * `@device_code_...` macros make it easy to inspect generated device code even 11 | if the outermost function call isn't a `@cuda` invocation. This is especially 12 | useful in combination with, e.g., CuArrays. 
The `@device_code` macro dumps 13 | _all_ forms of intermediate code to a directory, for easy inspection ([#147]). 14 | 15 | * Fast versions of CUDA math intrinsics are now wrapped ([#152]). 16 | 17 | * Support for loading values through the texture cache, aka. `__ldg`, has been 18 | added. No `getindex`-based interfaced is available yet, manually use 19 | `unsafe_cached_load` instead ([#158]). 20 | 21 | * Multiple devices are supported, by calling `device!` to switch to another 22 | device. The CUDA API is now also initialized lazily, so be sure to call 23 | `device!` before performing any work to avoid allocating a context on device 24 | 0 ([#175]). 25 | 26 | * Support for object and closure kernel functions has been added ([#176]). 27 | 28 | * IR transformation passes have been introduced to rewrite exceptions, where 29 | possible, to generate user-friendly messages as well as prevent hitting 30 | issues in `ptxas` ([#241]). 31 | 32 | * Code generated by `@cuda` can now be recreated manually using a low-level 33 | kernel launch API. The kernel objects used in that API are useful for 34 | reflecting on hardware resource usage ([#266]). 35 | 36 | * A GPU runtime library has been introduced ([#303]), implementing certain functionality 37 | from the Julia runtime library that would previously have prevented GPU execution 38 | ([#314], [#318], [#321]). 39 | 40 | 41 | Changes 42 | ------- 43 | 44 | * Debug info generation now honors the `-g` flag as passed to the Julia command, 45 | and is no longer tied to the `DEBUG` environment variable. 46 | 47 | * Log messages are implemented using the new Base Julia logging system. Debug 48 | logging can be enabled by specifying the `JULIA_DEBUG=CUDAnative` environment 49 | variable. 50 | 51 | * The syntax of `@cuda` now takes keyword arguments, eg. `@cuda threads=1 52 | foo(...)`, instead of the old tuple syntax. See the documentation of `@cuda` 53 | for a list of supported arguments ([#154]). 54 | 55 | * Non isbits values can be passed to a kernel, as long as they are unused. This 56 | makes it easier to implement GPU-versions of existing functions, without 57 | requiring a different method signature ([#168]). 58 | 59 | * Indexing intrinsics now return `Int`, so no need to convert to `(U)Int32` 60 | anymore. Although this might require more registers, it allows LLVM to 61 | simplify code ([#182]). 62 | 63 | * Better error messages, showing backtraces into GPU code (#189) and detecting 64 | common pitfalls like recursion or use of Base intrinsics (#210). 65 | 66 | * Debug information is now stripped from LLVM and PTX reflection functions 67 | ([#208], [#214]). Use the `strip_ir_metadata` (cfr. Base) keyword argument 68 | to disable this. 69 | 70 | * Error handling and reporting has been improved. This includes 71 | GPU-incompatible `ccall`s which are now detected and decoded by the IR 72 | validator ([#248]). 73 | 74 | * A callback mechanism has been introduced to inform downstream users about 75 | device switches ([#226]). 76 | 77 | * Adapt.jl is now used for host-device argument conversions ([#269]). 78 | 79 | 80 | Deprecations and removals 81 | ------------------------- 82 | 83 | * `CUDAnative.@profile` has been removed, use `CUDAdrv.@profile` with a manual 84 | warm-up step instead. 85 | 86 | * The `KernelWrapper` has been removed since it prevented inferring varargs 87 | functions ([#254]). 88 | 89 | * Support for `CUDAdrv.CuArray` has been removed, the CuArrays.jl package should be used 90 | instead ([#284]). 
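To illustrate the `@profile` and `CuArray` deprecations above, a minimal migration sketch (with a hypothetical `increment` kernel, assuming CUDAdrv.jl, CUDAnative.jl and CuArrays.jl are installed):

```julia
using CUDAdrv, CUDAnative, CuArrays

# CuArrays.jl replaces the removed CUDAdrv.CuArray
a = CuArray(ones(Float32, 1024))

increment(x) = (@inbounds x[threadIdx().x] += 1f0; return)

# warm up manually so compilation time is excluded, then profile with CUDAdrv
@cuda threads=1024 increment(a)
CUDAdrv.@profile @cuda threads=1024 increment(a)
```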
-------------------------------------------------------------------------------- /test/util.jl: -------------------------------------------------------------------------------- 1 | # @test_throw, with additional testing for the exception message 2 | macro test_throws_message(f, typ, ex...) 3 | quote 4 | msg = "" 5 | @test_throws $(esc(typ)) try 6 | $(esc(ex...)) 7 | catch err 8 | msg = sprint(showerror, err) 9 | rethrow() 10 | end 11 | 12 | if !$(esc(f))(msg) 13 | # @test should return its result, but doesn't 14 | @error "Failed to validate error message\n$msg" 15 | end 16 | @test $(esc(f))(msg) 17 | end 18 | end 19 | 20 | # NOTE: based on test/pkg.jl::capture_stdout, but doesn't discard exceptions 21 | macro grab_output(ex) 22 | quote 23 | mktemp() do fname, fout 24 | ret = nothing 25 | open(fname, "w") do fout 26 | redirect_stdout(fout) do 27 | ret = $(esc(ex)) 28 | end 29 | end 30 | ret, read(fname, String) 31 | end 32 | end 33 | end 34 | 35 | # Run some code on-device, returning captured standard output 36 | macro on_device(ex) 37 | quote 38 | let 39 | function kernel() 40 | $(esc(ex)) 41 | return 42 | end 43 | 44 | @cuda kernel() 45 | synchronize() 46 | end 47 | end 48 | end 49 | 50 | # helper function for sinking a value to prevent the callee from getting optimized away 51 | @inline sink(i::Int32) = 52 | Base.llvmcall("""%slot = alloca i32 53 | store volatile i32 %0, i32* %slot 54 | %value = load volatile i32, i32* %slot 55 | ret i32 %value""", Int32, Tuple{Int32}, i) 56 | @inline sink(i::Int64) = 57 | Base.llvmcall("""%slot = alloca i64 58 | store volatile i64 %0, i64* %slot 59 | %value = load volatile i64, i64* %slot 60 | ret i64 %value""", Int64, Tuple{Int64}, i) 61 | 62 | function julia_script(code, args=``) 63 | # FIXME: this doesn't work when the compute mode is set to exclusive 64 | script = "using CUDAnative; import CUDAdrv; $code" 65 | out = Pipe() 66 | err = Pipe() 67 | cmd = `$(Base.julia_cmd()) -e $script` 68 | if Base.JLOptions().project != C_NULL 69 | # --project isn't preserved by julia_cmd() 70 | cmd = `$cmd --project=$(unsafe_string(Base.JLOptions().project))` 71 | end 72 | cmd = `$cmd $args` 73 | proc = run(pipeline(cmd, stdout=out, stderr=err), wait=false) 74 | close(out.in) 75 | close(err.in) 76 | wait(proc) 77 | proc.exitcode, read(out, String), read(err, String) 78 | end 79 | 80 | # a lightweight CUDA array type for testing purposes 81 | ## ctor & finalizer 82 | mutable struct CuTestArray{T,N} 83 | buf::Mem.Buffer 84 | shape::NTuple{N,Int} 85 | function CuTestArray{T,N}(shape::NTuple{N,Int}) where {T,N} 86 | len = prod(shape) 87 | buf = Mem.alloc(len*sizeof(T)) 88 | 89 | obj = new{T,N}(buf, shape) 90 | finalizer(unsafe_free!, obj) 91 | return obj 92 | end 93 | end 94 | function unsafe_free!(a::CuTestArray) 95 | CUDAdrv.isvalid(a.buf.ctx) && Mem.free(a.buf) 96 | end 97 | ## memory copy operations 98 | function CuTestArray(src::Array{T,N}) where {T,N} 99 | dst = CuTestArray{T,N}(size(src)) 100 | Mem.upload!(dst.buf, pointer(src), length(src) * sizeof(T)) 101 | return dst 102 | end 103 | function Base.Array(src::CuTestArray{T,N}) where {T,N} 104 | dst = Array{T,N}(undef, src.shape) 105 | Mem.download!(pointer(dst), src.buf, prod(src.shape) * sizeof(T)) 106 | return dst 107 | end 108 | ## conversions 109 | using Adapt 110 | function Adapt.adapt_storage(::CUDAnative.Adaptor, a::CuTestArray{T,N}) where {T,N} 111 | ptr = Base.unsafe_convert(CuPtr{T}, a.buf) 112 | devptr = CUDAnative.DevicePtr{T,AS.Global}(ptr) 113 | CuDeviceArray{T,N,AS.Global}(a.shape, devptr) 114 | 
end 115 | -------------------------------------------------------------------------------- /examples/pairwise.jl: -------------------------------------------------------------------------------- 1 | # calculate pairwise distance between every point in a vector 2 | 3 | using CUDAdrv, CUDAnative, CuArrays 4 | 5 | 6 | function haversine_cpu(lat1::Float32, lon1::Float32, lat2::Float32, lon2::Float32, radius::Float32) 7 | c1 = cospi(lat1 / 180.0f0) 8 | c2 = cospi(lat2 / 180.0f0) 9 | dlat = lat2 - lat1 10 | dlon = lon2 - lon1 11 | d1 = sinpi(dlat / 360.0f0) 12 | d2 = sinpi(dlon / 360.0f0) 13 | t = d2 * d2 * c1 * c2 14 | a = d1 * d1 + t 15 | c = 2.0f0 * asin(min(1.0f0, sqrt(a))) 16 | return radius * c 17 | end 18 | 19 | function pairwise_dist_cpu(lat::Vector{Float32}, lon::Vector{Float32}) 20 | # allocate 21 | n = length(lat) 22 | rowresult = Array{Float32}(undef, n, n) 23 | 24 | # brute force fill in each cell 25 | for i in 1:n, j in 1:n 26 | @inbounds rowresult[i, j] = haversine_cpu(lat[i], lon[i], lat[j], lon[j] , 6372.8f0) 27 | end 28 | 29 | return rowresult 30 | end 31 | 32 | # from https://devblogs.nvidia.com/parallelforall/fast-great-circle-distance-calculation-cuda-c/ 33 | function haversine_gpu(lat1::Float32, lon1::Float32, lat2::Float32, lon2::Float32, radius::Float32) 34 | # XXX: need to prefix math intrinsics with CUDAnative 35 | c1 = CUDAnative.cospi(lat1 / 180.0f0) 36 | c2 = CUDAnative.cospi(lat2 / 180.0f0) 37 | dlat = lat2 - lat1 38 | dlon = lon2 - lon1 39 | d1 = CUDAnative.sinpi(dlat / 360.0f0) 40 | d2 = CUDAnative.sinpi(dlon / 360.0f0) 41 | t = d2 * d2 * c1 * c2 42 | a = d1 * d1 + t 43 | c = 2.0f0 * CUDAnative.asin(CUDAnative.min(1.0f0, CUDAnative.sqrt(a))) 44 | return radius * c 45 | end 46 | 47 | # pairwise distance calculation kernel 48 | function pairwise_dist_kernel(lat::CuDeviceVector{Float32}, lon::CuDeviceVector{Float32}, 49 | rowresult::CuDeviceMatrix{Float32}, n) 50 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 51 | j = (blockIdx().y-1) * blockDim().y + threadIdx().y 52 | 53 | if i <= n && j <= n 54 | # store to shared memory 55 | shmem = @cuDynamicSharedMem(Float32, 2*blockDim().x + 2*blockDim().y) 56 | if threadIdx().y == 1 57 | shmem[threadIdx().x] = lat[i] 58 | shmem[blockDim().x + threadIdx().x] = lon[i] 59 | end 60 | if threadIdx().x == 1 61 | shmem[2*blockDim().x + threadIdx().y] = lat[j] 62 | shmem[2*blockDim().x + blockDim().y + threadIdx().y] = lon[j] 63 | end 64 | sync_threads() 65 | 66 | # load from shared memory 67 | lat_i = shmem[threadIdx().x] 68 | lon_i = shmem[blockDim().x + threadIdx().x] 69 | lat_j = shmem[2*blockDim().x + threadIdx().y] 70 | lon_j = shmem[2*blockDim().x + blockDim().y + threadIdx().y] 71 | 72 | @inbounds rowresult[i, j] = haversine_gpu(lat_i, lon_i, lat_j, lon_j, 6372.8f0) 73 | end 74 | 75 | return 76 | end 77 | 78 | function pairwise_dist_gpu(lat::Vector{Float32}, lon::Vector{Float32}) 79 | # upload 80 | lat_gpu = CuArray(lat) 81 | lon_gpu = CuArray(lon) 82 | 83 | # allocate 84 | n = length(lat) 85 | rowresult_gpu = CuArray{Float32}(undef, n, n) 86 | 87 | # calculate launch configuration 88 | # NOTE: we want our launch configuration to be as square as possible, 89 | # because that minimizes shared memory usage 90 | dev = device() 91 | total_threads = min(n, attribute(dev, CUDAdrv.MAX_THREADS_PER_BLOCK)) 92 | threads_x = floor(Int, sqrt(total_threads)) 93 | threads_y = total_threads ÷ threads_x 94 | threads = (threads_x, threads_y) 95 | blocks = ceil.(Int, n ./ threads) 96 | 97 | # calculate size of dynamic shared memory 98 | 
shmem = 2 * sum(threads) * sizeof(Float32) 99 | 100 | @cuda blocks=blocks threads=threads shmem=shmem pairwise_dist_kernel(lat_gpu, lon_gpu, rowresult_gpu, n) 101 | 102 | return Array(rowresult_gpu) 103 | end 104 | 105 | using Test 106 | 107 | # generate reasonable data 108 | function main(n = 10000) 109 | lat = rand(Float32, n) .* 45 110 | lon = rand(Float32, n) .* -120 111 | 112 | @test pairwise_dist_cpu(lat, lon) ≈ pairwise_dist_gpu(lat, lon) rtol=1e-2 113 | end 114 | main() 115 | -------------------------------------------------------------------------------- /src/device/cuda_intrinsics/memory_shared.jl: -------------------------------------------------------------------------------- 1 | # Shared Memory (part of B.2) 2 | 3 | export @cuStaticSharedMem, @cuDynamicSharedMem 4 | 5 | # FIXME: `shmem_id` increment in the macro isn't correct, as multiple parametrically typed 6 | # functions will alias the id (but the size might be a parameter). but incrementing in 7 | # the @generated function doesn't work, as it is supposed to be pure and identical 8 | # invocations will erroneously share (and even cause multiple shmem globals). 9 | shmem_id = 0 10 | 11 | """ 12 | @cuStaticSharedMem(T::Type, dims) -> CuDeviceArray{T,AS.Shared} 13 | 14 | Get an array of type `T` and dimensions `dims` (either an integer length or tuple shape) 15 | pointing to a statically-allocated piece of shared memory. The type should be statically 16 | inferable and the dimensions should be constant, or an error will be thrown and the 17 | generator function will be called dynamically. 18 | """ 19 | macro cuStaticSharedMem(T, dims) 20 | global shmem_id 21 | id = shmem_id::Int += 1 22 | 23 | quote 24 | len = prod($(esc(dims))) 25 | ptr = _shmem(Val($id), $(esc(T)), Val(len)) 26 | CuDeviceArray($(esc(dims)), ptr) 27 | end 28 | end 29 | 30 | """ 31 | @cuDynamicSharedMem(T::Type, dims, offset::Integer=0) -> CuDeviceArray{T,AS.Shared} 32 | 33 | Get an array of type `T` and dimensions `dims` (either an integer length or tuple shape) 34 | pointing to a dynamically-allocated piece of shared memory. The type should be statically 35 | inferable or an error will be thrown and the generator function will be called dynamically. 36 | 37 | Note that the amount of dynamic shared memory needs to specified when launching the kernel. 38 | 39 | Optionally, an offset parameter indicating how many bytes to add to the base shared memory 40 | pointer can be specified. This is useful when dealing with a heterogeneous buffer of dynamic 41 | shared memory; in the case of a homogeneous multi-part buffer it is preferred to use `view`. 
42 | """ 43 | macro cuDynamicSharedMem(T, dims, offset=0) 44 | global shmem_id 45 | id = shmem_id::Int += 1 46 | 47 | # TODO: boundscheck against %dynamic_smem_size (currently unsupported by LLVM) 48 | 49 | quote 50 | len = prod($(esc(dims))) 51 | ptr = _shmem(Val($id), $(esc(T))) + $(esc(offset)) 52 | CuDeviceArray($(esc(dims)), ptr) 53 | end 54 | end 55 | 56 | # get a pointer to shared memory, with known (static) or zero length (dynamic shared memory) 57 | @generated function _shmem(::Val{id}, ::Type{T}, ::Val{len}=Val(0)) where {id,T,len} 58 | eltyp = convert(LLVMType, T) 59 | 60 | T_ptr = convert(LLVMType, DevicePtr{T,AS.Shared}) 61 | T_actual_ptr = LLVM.PointerType(eltyp) 62 | 63 | # create a function 64 | llvm_f, _ = create_function(T_ptr) 65 | 66 | # create the global variable 67 | mod = LLVM.parent(llvm_f) 68 | gv_typ = LLVM.ArrayType(eltyp, len) 69 | gv = GlobalVariable(mod, gv_typ, "shmem$id", #=addrspace=# 3) 70 | if len > 0 71 | # static shared memory should be demoted to local variables, whenever possible. 72 | # this is done by the NVPTX ASM printer: 73 | # > Find out if a global variable can be demoted to local scope. 74 | # > Currently, this is valid for CUDA shared variables, which have local 75 | # > scope and global lifetime. So the conditions to check are : 76 | # > 1. Is the global variable in shared address space? 77 | # > 2. Does it have internal linkage? 78 | # > 3. Is the global variable referenced only in one function? 79 | linkage!(gv, LLVM.API.LLVMInternalLinkage) 80 | initializer!(gv, null(gv_typ)) 81 | end 82 | # by requesting a larger-than-datatype alignment, we might be able to vectorize. 83 | # we pick 16 bytes since this is the largest transaction size as supported by PTX. 84 | alignment!(gv, Base.max(16, datatype_align(T))) 85 | 86 | # generate IR 87 | Builder(JuliaContext()) do builder 88 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 89 | position!(builder, entry) 90 | 91 | ptr_with_as = gep!(builder, gv, [ConstantInt(0, JuliaContext()), 92 | ConstantInt(0, JuliaContext())]) 93 | 94 | ptr = addrspacecast!(builder, ptr_with_as, T_actual_ptr) 95 | val = ptrtoint!(builder, ptr, T_ptr) 96 | ret!(builder, val) 97 | end 98 | 99 | call_function(llvm_f, DevicePtr{T,AS.Shared}) 100 | end 101 | -------------------------------------------------------------------------------- /docs/src/man/hacking.md: -------------------------------------------------------------------------------- 1 | # Hacking 2 | 3 | ## Generated functions 4 | 5 | Generated functions are used heavily in CUDAnative.jl, in combination with LLVM.jl, to 6 | generate type-specialized code and IR. If evaluating the generator results in an error, 7 | Julia generates a dynamic call to the generator for you to inspect the error at run-time. 8 | This is a problem in the world of GPUs, where dynamic calls are prohibited. 
A band-aid is to 9 | print the exception during inference: 10 | 11 | ```patch 12 | diff --git a/base/inference.jl b/base/inference.jl 13 | index 6443665676..b03d78ddaa 100644 14 | --- a/base/inference.jl 15 | +++ b/base/inference.jl 16 | @@ -2430,7 +2430,10 @@ function typeinf_frame(linfo::MethodInstance, caller, optimize::Bool, cached::Bo 17 | try 18 | # user code might throw errors – ignore them 19 | src = get_staged(linfo) 20 | - catch 21 | + catch ex 22 | + println("WARNING: An error occurred during generated function execution.") 23 | + println(ex) 24 | + ccall(:jlbacktrace, Void, ()) 25 | return nothing 26 | end 27 | else 28 | ``` 29 | 30 | 31 | ## Adding intrinsics 32 | 33 | Adding intrinsics to `CUDAnative.jl` can be relatively convoluted, depending on the type of 34 | intrinsic. Most of them boil down to inlining a snippet of LLVM IR, using `llvmcall` (or 35 | `ccall` with the `llvmcall` calling convention). For more complex code, use LLVM.jl to build 36 | the IR string. 37 | 38 | 39 | ### `libdevice` intrinsics 40 | 41 | These intrinsics are represented by function calls to `libdevice`. Most of them should 42 | already be covered. There's a convenience macro, `@wrap`, simplifying the job of adding and 43 | exporting intrinsics, and converting arguments and return values. See the documentation of 44 | the macro for more details, and look at `src/device/libdevice.jl` for examples. 45 | 46 | 47 | ### LLVM back-end intrinsics 48 | 49 | Calls to functions like `llvm.nvvm.barrier0` are backed by the PTX LLVM back-end, and can be 50 | wrapped using `ccall` with the `llvmcall` calling convention. For more complex intrinsics, 51 | or when you're not actually calling an intrinsic function, you can still use `@wrap`. 52 | 53 | 54 | ### Inline PTX assembly 55 | 56 | When there's no corresponding `libdevice` function or PTX back-end intrinsic exposing the 57 | required functionality, you can use inline PTX assembly via `llvmcall`. This requires you to 58 | embed the PTX assembly in LLVM IR, which is often messy. 59 | 60 | If the source of the assembly instructions is CUDA C code, you can simplify this task by first 61 | compiling the CUDA code using Clang, and adapting the resulting LLVM IR for use within 62 | `llvmcall`.
For example, extracting the following function definition from the CUDA SDK: 63 | 64 | ```cuda 65 | __device__ unsigned int __ballot(int a) 66 | { 67 | int result; 68 | asm __volatile__ ("{ \n\t" 69 | ".reg .pred \t%%p1; \n\t" 70 | "setp.ne.u32 \t%%p1, %1, 0; \n\t" 71 | "vote.ballot.b32 \t%0, %%p1; \n\t" 72 | "}" : "=r"(result) : "r"(a)); 73 | return result; 74 | } 75 | ``` 76 | 77 | We can generate the following LLVM IR by executing `clang++ -Xclang -fcuda-is-device -S 78 | -emit-llvm -target nvptx64 ballot.cu -o -` (you might need to add [some CUDA 79 | boilerplate](https://gist.github.com/eliben/b014ac17cbe5a452803f)): 80 | 81 | ``` 82 | define i32 @_Z8__balloti(i32 %a) #0 { 83 | %1 = alloca i32, align 4 84 | %result = alloca i32, align 4 85 | store i32 %a, i32* %1, align 4 86 | %2 = load i32, i32* %1, align 4 87 | %3 = call i32 asm sideeffect "{ \0A\09.reg .pred \09%p1; \0A\09setp.ne.u32 \09%p1, $1, 0; \0A\09vote.ballot.b32 \09$0, %p1; \0A\09}", "=r,r"(i32 %2) #1, !srcloc !1 88 | store i32 %3, i32* %result, align 4 89 | %4 = load i32, i32* %result, align 4 90 | ret i32 %4 91 | } 92 | ``` 93 | 94 | Finally, we use LLVM.jl's `@asmcall` macro to inline this assembly and call it: 95 | 96 | ```julia 97 | function vote_ballot(pred::Bool) 98 | return @asmcall( 99 | """{ 100 | .reg .pred %p1; 101 | setp.ne.u32 %p1, \$1, 0; 102 | vote.ballot.b32 \$0, %p1; 103 | }""", "=r,r", true, 104 | UInt32, Tuple{Int32}, convert(Int32, pred)) 105 | end 106 | ``` 107 | 108 | 109 | ### Other functionality 110 | 111 | For other functionality, like shared memory, or when some additional management is required, 112 | like storing a global variable for `printf`'s formatting string, you should use LLVM.jl to 113 | build the IR code instead of hacking strings together. As this doesn't touch global state, 114 | you can even do so from a `@generated` function. Do take care however to use Julia's LLVM 115 | context for all operations. 
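
As a minimal (and hypothetical) sketch of that pattern, the generated function below builds
a trivial piece of IR with LLVM.jl and splices it into the caller. It assumes the
`create_function`/`call_function` helpers used throughout this package are available, and it
performs every IR operation in Julia's LLVM context:

```julia
using LLVM
using LLVM.Interop

# made-up example: add one to an Int64, with the body written as raw LLVM IR
@generated function add_one(x::Int64)
    T_int = convert(LLVMType, Int64)

    # create a function taking (and returning) a single Int64
    llvm_f, _ = create_function(T_int, [T_int])

    # generate IR, creating every object in Julia's LLVM context
    Builder(JuliaContext()) do builder
        entry = BasicBlock(llvm_f, "entry", JuliaContext())
        position!(builder, entry)

        val = add!(builder, parameters(llvm_f)[1], LLVM.ConstantInt(T_int, 1))
        ret!(builder, val)
    end

    # splice the freshly-built function into the caller
    call_function(llvm_f, Int64, Tuple{Int64}, :((x,)))
end
```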
116 | -------------------------------------------------------------------------------- /test/device/array.jl: -------------------------------------------------------------------------------- 1 | @testset "device arrays" begin 2 | 3 | @testset "constructors" begin 4 | # inner constructors 5 | let 6 | dp = CUDAnative.DevicePtr{Int,AS.Generic}(0) 7 | CuDeviceArray{Int,1,AS.Generic}((1,), dp) 8 | end 9 | 10 | # outer constructors 11 | for I in [Int32,Int64] 12 | a = I(1) 13 | b = I(2) 14 | 15 | dp = CUDAnative.DevicePtr{I,AS.Generic}(0) 16 | 17 | # not parameterized 18 | CuDeviceArray(b, dp) 19 | CuDeviceArray((b,), dp) 20 | CuDeviceArray((b,a), dp) 21 | 22 | # partially parameterized 23 | CuDeviceArray{I}(b, dp) 24 | CuDeviceArray{I}((b,), dp) 25 | CuDeviceArray{I}((a,b), dp) 26 | CuDeviceArray{I,1}(b, dp) 27 | CuDeviceArray{I,1}((b,), dp) 28 | @test_throws MethodError CuDeviceArray{I,1}((a,b), dp) 29 | @test_throws MethodError CuDeviceArray{I,2}(b, dp) 30 | @test_throws MethodError CuDeviceArray{I,2}((b,), dp) 31 | CuDeviceArray{I,2}((a,b), dp) 32 | 33 | # fully parameterized 34 | CuDeviceArray{I,1,AS.Generic}(b, dp) 35 | CuDeviceArray{I,1,AS.Generic}((b,), dp) 36 | @test_throws MethodError CuDeviceArray{I,1,AS.Generic}((a,b), dp) 37 | @test_throws MethodError CuDeviceArray{I,1,AS.Shared}((a,b), dp) 38 | @test_throws MethodError CuDeviceArray{I,2,AS.Generic}(b, dp) 39 | @test_throws MethodError CuDeviceArray{I,2,AS.Generic}((b,), dp) 40 | CuDeviceArray{I,2,AS.Generic}((a,b), dp) 41 | 42 | # type aliases 43 | CuDeviceVector{I}(b, dp) 44 | CuDeviceMatrix{I}((a,b), dp) 45 | end 46 | end 47 | 48 | @testset "basics" begin # argument passing, get and setindex, length 49 | dims = (16, 16) 50 | len = prod(dims) 51 | 52 | function kernel(input::CuDeviceArray{Float32}, output::CuDeviceArray{Float32}) 53 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 54 | 55 | if i <= length(input) 56 | output[i] = Float64(input[i]) # force conversion upon setindex! 
57 | end 58 | 59 | return 60 | end 61 | 62 | input = round.(rand(Float32, dims) * 100) 63 | 64 | input_dev = CuTestArray(input) 65 | output_dev = CuTestArray(input) 66 | 67 | @cuda threads=len kernel(input_dev, output_dev) 68 | output = Array(output_dev) 69 | @test input ≈ output 70 | end 71 | 72 | @testset "iteration" begin # argument passing, get and setindex, length 73 | dims = (16, 16) 74 | function kernel(input::CuDeviceArray{T}, output::CuDeviceArray{T}) where {T} 75 | acc = zero(T) 76 | for elem in input 77 | acc += elem 78 | end 79 | output[1] = acc 80 | return 81 | end 82 | 83 | input = round.(rand(Float32, dims) * 100) 84 | 85 | input_dev = CuTestArray(input) 86 | output_dev = CuTestArray(Float32[0]) 87 | 88 | @cuda kernel(input_dev, output_dev) 89 | output = Array(output_dev) 90 | @test sum(input) ≈ output[1] 91 | end 92 | 93 | @testset "bounds checking" begin 94 | @testset "#313" begin 95 | function kernel(dest) 96 | dest[1] = 1 97 | nothing 98 | end 99 | tt = Tuple{SubArray{Float64,2,CuDeviceArray{Float64,2,AS.Global}, 100 | Tuple{UnitRange{Int64},UnitRange{Int64}},false}} 101 | 102 | ir = sprint(io->CUDAnative.code_llvm(io, kernel, tt)) 103 | @test !occursin("jl_invoke", ir) 104 | CUDAnative.code_ptx(devnull, kernel, tt) 105 | end 106 | end 107 | 108 | @testset "views" begin 109 | function kernel(array) 110 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 111 | 112 | _sub = view(array, 2:length(array)-1) 113 | if i <= length(_sub) 114 | _sub[i] = i 115 | end 116 | 117 | return 118 | end 119 | 120 | array = zeros(Int64, 100) 121 | array_dev = CuTestArray(array) 122 | 123 | sub = view(array, 2:length(array)-1) 124 | for i in 1:length(sub) 125 | sub[i] = i 126 | end 127 | 128 | @cuda threads=100 kernel(array_dev) 129 | @test array == Array(array_dev) 130 | end 131 | 132 | @testset "non-Int index to unsafe_load" begin 133 | function load_index(a) 134 | return a[UInt64(1)] 135 | end 136 | 137 | a = [1] 138 | p = pointer(a) 139 | dp = Base.bitcast(CUDAnative.DevicePtr{eltype(p), AS.Generic}, p) 140 | da = CUDAnative.CuDeviceArray(1, dp) 141 | load_index(da) 142 | end 143 | 144 | @testset "ldg" begin 145 | function kernel(a, b, i) 146 | b[i] = ldg(a, i) 147 | return 148 | end 149 | 150 | buf = IOBuffer() 151 | 152 | a = CuTestArray([0]) 153 | b = CuTestArray([0]) 154 | @device_code_ptx io=buf @cuda kernel(a, b, 1) 155 | @test Array(a) == Array(b) 156 | 157 | asm = String(take!(copy(buf))) 158 | @test occursin("ld.global.nc", asm) 159 | end 160 | 161 | end 162 | -------------------------------------------------------------------------------- /src/device/cuda_intrinsics/warp_shuffle.jl: -------------------------------------------------------------------------------- 1 | # Warp Shuffle (B.14) 2 | 3 | # TODO: does not work on sub-word (ie. Int16) or non-word divisible sized types 4 | 5 | # TODO: should shfl_idx conform to 1-based indexing? 
6 | 7 | # TODO: these functions should dispatch based on the actual warp size 8 | const ws = Int32(32) 9 | 10 | # TODO: this functionality should throw = v"9.0" && v"6.0" in ptx_support 27 | instruction = Symbol("shfl.sync.$mode.b32") 28 | fname_sync = Symbol("$(fname)_sync") 29 | 30 | # TODO: implement using LLVM intrinsics when we have D38090 31 | 32 | @eval begin 33 | export $fname_sync, $fname 34 | 35 | @inline $fname_sync(val::UInt32, src::UInt32, width::UInt32=$ws, 36 | threadmask::UInt32=0xffffffff) = 37 | @asmcall($"$instruction \$0, \$1, \$2, \$3, \$4;", "=r,r,r,r,r", true, 38 | UInt32, NTuple{4,UInt32}, 39 | val, src, pack(width, $mask), threadmask) 40 | 41 | # FIXME: replace this with a checked conversion once we have exceptions 42 | @inline $fname_sync(val::UInt32, src::Integer, width::Integer=$ws, 43 | threadmask::UInt32=0xffffffff) = 44 | $fname_sync(val, unsafe_trunc(UInt32, src), unsafe_trunc(UInt32, width), 45 | threadmask) 46 | 47 | @inline $fname(val::UInt32, src::Integer, width::Integer=$ws) = 48 | $fname_sync(val, src, width) 49 | end 50 | else 51 | intrinsic = Symbol("llvm.nvvm.shfl.$mode.i32") 52 | 53 | @eval begin 54 | export $fname 55 | @inline $fname(val::UInt32, src::UInt32, width::UInt32=$ws) = 56 | ccall($"$intrinsic", llvmcall, UInt32, 57 | (UInt32, UInt32, UInt32), 58 | val, src, pack(width, $mask)) 59 | 60 | # FIXME: replace this with a checked conversion once we have exceptions 61 | @inline $fname(val::UInt32, src::Integer, width::Integer=$ws) = 62 | $fname(val, unsafe_trunc(UInt32, src), unsafe_trunc(UInt32, width)) 63 | end 64 | end 65 | end 66 | 67 | 68 | # wide and aggregate intrinsics 69 | 70 | for name in ["_up", "_down", "_xor", ""] 71 | fname = Symbol("shfl$name") 72 | @eval @inline $fname(src, args...) = recurse_value_invocation($fname, src, args...) 73 | 74 | fname_sync = Symbol("$(fname)_sync") 75 | @eval @inline $fname_sync(src, args...) = recurse_value_invocation($fname, src, args...) 76 | end 77 | 78 | 79 | # documentation 80 | 81 | @doc """ 82 | shfl(val, lane::Integer, width::Integer=32) 83 | 84 | Shuffle a value from a directly indexed lane `lane`. 85 | """ shfl 86 | 87 | @doc """ 88 | shfl_up(val, delta::Integer, width::Integer=32) 89 | 90 | Shuffle a value from a lane with lower ID relative to caller. 91 | """ shfl_up 92 | 93 | @doc """ 94 | shfl_down(val, delta::Integer, width::Integer=32) 95 | 96 | Shuffle a value from a lane with higher ID relative to caller. 97 | """ shfl_down 98 | 99 | @doc """ 100 | shfl_xor(val, mask::Integer, width::Integer=32) 101 | 102 | Shuffle a value from a lane based on bitwise XOR of own lane ID with `mask`. 103 | """ shfl_xor 104 | 105 | 106 | @doc """ 107 | shfl_sync(val, lane::Integer, width::Integer=32, threadmask::UInt32=0xffffffff) 108 | 109 | Shuffle a value from a directly indexed lane `lane`. The default value for `threadmask` 110 | performs the shuffle on all threads in the warp. 111 | """ shfl_sync 112 | 113 | @doc """ 114 | shfl_up_sync(val, delta::Integer, width::Integer=32, threadmask::UInt32=0xffffffff) 115 | 116 | Shuffle a value from a lane with lower ID relative to caller. The default value for 117 | `threadmask` performs the shuffle on all threads in the warp. 118 | """ shfl_up_sync 119 | 120 | @doc """ 121 | shfl_down_sync(val, delta::Integer, width::Integer=32, threadmask::UInt32=0xffffffff) 122 | 123 | Shuffle a value from a lane with higher ID relative to caller. The default value for 124 | `threadmask` performs the shuffle on all threads in the warp. 
125 | """ shfl_down_sync 126 | 127 | @doc """ 128 | shfl_xor_sync(val, mask::Integer, width::Integer=32, threadmask::UInt32=0xffffffff) 129 | 130 | Shuffle a value from a lane based on bitwise XOR of own lane ID with `mask`. The default 131 | value for `threadmask` performs the shuffle on all threads in the warp. 132 | """ shfl_xor_sync 133 | -------------------------------------------------------------------------------- /src/device/array.jl: -------------------------------------------------------------------------------- 1 | # Contiguous on-device arrays 2 | 3 | export 4 | CuDeviceArray, CuDeviceVector, CuDeviceMatrix, CuBoundsError, ldg 5 | 6 | 7 | ## construction 8 | 9 | """ 10 | CuDeviceArray(dims, ptr) 11 | CuDeviceArray{T}(dims, ptr) 12 | CuDeviceArray{T,A}(dims, ptr) 13 | CuDeviceArray{T,A,N}(dims, ptr) 14 | 15 | Construct an `N`-dimensional dense CUDA device array with element type `T` wrapping a 16 | pointer, where `N` is determined from the length of `dims` and `T` is determined from the 17 | type of `ptr`. `dims` may be a single scalar, or a tuple of integers corresponding to the 18 | lengths in each dimension). If the rank `N` is supplied explicitly as in `Array{T,N}(dims)`, 19 | then it must match the length of `dims`. The same applies to the element type `T`, which 20 | should match the type of the pointer `ptr`. 21 | """ 22 | CuDeviceArray 23 | 24 | # NOTE: we can't support the typical `tuple or series of integer` style construction, 25 | # because we're currently requiring a trailing pointer argument. 26 | 27 | struct CuDeviceArray{T,N,A} <: AbstractArray{T,N} 28 | shape::Dims{N} 29 | ptr::DevicePtr{T,A} 30 | 31 | # inner constructors, fully parameterized, exact types (ie. Int not <:Integer) 32 | CuDeviceArray{T,N,A}(shape::Dims{N}, ptr::DevicePtr{T,A}) where {T,A,N} = new(shape,ptr) 33 | end 34 | 35 | const CuDeviceVector = CuDeviceArray{T,1,A} where {T,A} 36 | const CuDeviceMatrix = CuDeviceArray{T,2,A} where {T,A} 37 | 38 | # outer constructors, non-parameterized 39 | CuDeviceArray(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p) 40 | CuDeviceArray(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p) 41 | 42 | # outer constructors, partially parameterized 43 | CuDeviceArray{T}(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p) 44 | CuDeviceArray{T}(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p) 45 | CuDeviceArray{T,N}(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(dims, p) 46 | CuDeviceVector{T}(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((len,), p) 47 | 48 | # outer constructors, fully parameterized 49 | CuDeviceArray{T,N,A}(dims::NTuple{N,<:Integer}, p::DevicePtr{T,A}) where {T,A,N} = CuDeviceArray{T,N,A}(Int.(dims), p) 50 | CuDeviceVector{T,A}(len::Integer, p::DevicePtr{T,A}) where {T,A} = CuDeviceVector{T,A}((Int(len),), p) 51 | 52 | 53 | ## getters 54 | 55 | Base.pointer(a::CuDeviceArray) = a.ptr 56 | 57 | Base.size(g::CuDeviceArray) = g.shape 58 | Base.length(g::CuDeviceArray) = prod(g.shape) 59 | 60 | 61 | ## conversions 62 | 63 | Base.unsafe_convert(::Type{DevicePtr{T,A}}, a::CuDeviceArray{T,N,A}) where {T,A,N} = pointer(a) 64 | 65 | 66 | ## indexing 67 | 68 | # TODO: arrays as allocated by the CUDA APIs are 256-byte aligned. we should keep track of 69 | # this information, because it enables optimizations like Load Store Vectorization 70 | # (cfr. 
shared memory and its wider-than-datatype alignment) 71 | 72 | @inline function Base.getindex(A::CuDeviceArray{T}, index::Integer) where {T} 73 | @boundscheck checkbounds(A, index) 74 | align = datatype_align(T) 75 | Base.unsafe_load(pointer(A), index, Val(align))::T 76 | end 77 | 78 | @inline function Base.setindex!(A::CuDeviceArray{T}, x, index::Integer) where {T} 79 | @boundscheck checkbounds(A, index) 80 | align = datatype_align(T) 81 | Base.unsafe_store!(pointer(A), x, index, Val(align)) 82 | end 83 | 84 | """ 85 | ldg(A, i) 86 | 87 | Index the array `A` with the linear index `i`, but loads the value through the read-only 88 | texture cache for improved cache behavior. You should make sure the array `A`, or any 89 | aliased instance, is not written to for the duration of the current kernel. 90 | 91 | This function can only be used on devices with compute capability 3.5 or higher. 92 | 93 | See also: [`Base.getindex`](@ref) 94 | """ 95 | @inline function ldg(A::CuDeviceArray{T}, index::Integer) where {T} 96 | # FIXME: this only works on sm_35+, but we can't verify that for now 97 | @boundscheck checkbounds(A, index) 98 | align = datatype_align(T) 99 | unsafe_cached_load(pointer(A), index, Val(align))::T 100 | end 101 | 102 | Base.IndexStyle(::Type{<:CuDeviceArray}) = Base.IndexLinear() 103 | 104 | 105 | ## other 106 | 107 | Base.show(io::IO, a::CuDeviceVector) = 108 | print(io, "$(length(a))-element device array at $(pointer(a))") 109 | Base.show(io::IO, a::CuDeviceArray) = 110 | print(io, "$(join(a.shape, '×')) device array at $(pointer(a))") 111 | 112 | Base.show(io::IO, mime::MIME"text/plain", a::CuDeviceArray) = show(io, a) 113 | 114 | @inline function Base.unsafe_view(A::CuDeviceVector{T}, I::Vararg{Base.ViewIndex,1}) where {T} 115 | ptr = pointer(A) + (I[1].start-1)*sizeof(T) 116 | len = I[1].stop - I[1].start + 1 117 | return CuDeviceArray(len, ptr) 118 | end 119 | 120 | @inline function Base.iterate(A::CuDeviceArray, i=1) 121 | if (i % UInt) - 1 < length(A) 122 | (@inbounds A[i], i + 1) 123 | else 124 | nothing 125 | end 126 | end 127 | -------------------------------------------------------------------------------- /src/compiler/rtlib.jl: -------------------------------------------------------------------------------- 1 | # compiler support for working with run-time libraries 2 | 3 | function link_library!(ctx::CompilerContext, mod::LLVM.Module, lib::LLVM.Module) 4 | # linking is destructive, so copy the library 5 | lib = LLVM.Module(lib) 6 | 7 | # save list of external functions 8 | exports = String[] 9 | for f in functions(mod) 10 | fn = LLVM.name(f) 11 | if !haskey(functions(lib), fn) 12 | push!(exports, fn) 13 | end 14 | end 15 | 16 | link!(mod, lib) 17 | 18 | ModulePassManager() do pm 19 | # internalize all functions that aren't exports 20 | internalize!(pm, exports) 21 | 22 | # eliminate all unused internal functions 23 | global_optimizer!(pm) 24 | global_dce!(pm) 25 | strip_dead_prototypes!(pm) 26 | 27 | run!(pm, mod) 28 | end 29 | end 30 | 31 | const libcache = Dict{String, LLVM.Module}() 32 | 33 | 34 | # 35 | # CUDA device library 36 | # 37 | 38 | function find_libdevice(cap) 39 | CUDAnative.configured || return 40 | global libdevice 41 | 42 | if isa(libdevice, Dict) 43 | # select the most recent & compatible library 44 | vers = keys(CUDAnative.libdevice) 45 | compat_vers = Set(ver for ver in vers if ver <= cap) 46 | isempty(compat_vers) && error("No compatible CUDA device library available") 47 | ver = maximum(compat_vers) 48 | path = libdevice[ver] 49 | else 50 | 
libdevice 51 | end 52 | end 53 | 54 | function load_libdevice(cap) 55 | path = find_libdevice(cap) 56 | 57 | get!(libcache, path) do 58 | open(path) do io 59 | parse(LLVM.Module, read(path), JuliaContext()) 60 | end 61 | end 62 | end 63 | 64 | function link_libdevice!(ctx::CompilerContext, mod::LLVM.Module, lib::LLVM.Module) 65 | # override libdevice's triple and datalayout to avoid warnings 66 | triple!(lib, triple(mod)) 67 | datalayout!(lib, datalayout(mod)) 68 | 69 | link_library!(ctx, mod, lib) 70 | 71 | ModulePassManager() do pm 72 | push!(metadata(mod), "nvvm-reflect-ftz", 73 | MDNode([ConstantInt(Int32(1), JuliaContext())])) 74 | # TODO: run the reflect pass? 75 | run!(pm, mod) 76 | end 77 | end 78 | 79 | 80 | # 81 | # CUDAnative run-time library 82 | # 83 | 84 | # remove existing runtime libraries globally, 85 | # so any change to CUDAnative triggers recompilation 86 | rm(joinpath(@__DIR__, "..", "..", "deps", "runtime"); recursive=true, force=true) 87 | 88 | 89 | ## higher-level functionality to work with runtime functions 90 | 91 | function LLVM.call!(builder, rt::Runtime.RuntimeMethodInstance, args=LLVM.Value[]) 92 | bb = position(builder) 93 | f = LLVM.parent(bb) 94 | mod = LLVM.parent(f) 95 | 96 | # get or create a function prototype 97 | if haskey(functions(mod), rt.llvm_name) 98 | f = functions(mod)[rt.llvm_name] 99 | ft = eltype(llvmtype(f)) 100 | else 101 | ft = LLVM.FunctionType(rt.llvm_return_type, rt.llvm_types) 102 | f = LLVM.Function(mod, rt.llvm_name, ft) 103 | end 104 | 105 | # runtime functions are written in Julia, while we're calling from LLVM, 106 | # this often results in argument type mismatches. try to fix some here. 107 | for (i,arg) in enumerate(args) 108 | if llvmtype(arg) != parameters(ft)[i] 109 | if (llvmtype(arg) isa LLVM.PointerType) && 110 | (parameters(ft)[i] isa LLVM.IntegerType) 111 | # Julia pointers are passed as integers 112 | args[i] = ptrtoint!(builder, args[i], parameters(ft)[i]) 113 | else 114 | error("Don't know how to convert ", arg, " argument to ", parameters(ft)[i]) 115 | end 116 | end 117 | end 118 | 119 | call!(builder, f, args) 120 | end 121 | 122 | 123 | ## functionality to build the runtime library 124 | 125 | function emit_function!(mod, cap, f, types, name) 126 | tt = Base.to_tuple_type(types) 127 | ctx = CompilerContext(f, tt, cap, #= kernel =# false) 128 | new_mod, entry = irgen(ctx) 129 | entry = optimize!(ctx, new_mod, entry) 130 | LLVM.name!(entry, name) 131 | 132 | link!(mod, new_mod) 133 | end 134 | 135 | function build_runtime(cap) 136 | mod = LLVM.Module("CUDAnative run-time library", JuliaContext()) 137 | 138 | for method in values(Runtime.methods) 139 | emit_function!(mod, cap, method.def, method.types, method.llvm_name) 140 | end 141 | 142 | mod 143 | end 144 | 145 | function load_runtime(cap) 146 | name = "cudanative.$(cap.major)$(cap.minor).bc" 147 | path = joinpath(@__DIR__, "..", "..", "deps", "runtime", name) 148 | mkpath(dirname(path)) 149 | 150 | get!(libcache, path) do 151 | if ispath(path) 152 | open(path) do io 153 | parse(LLVM.Module, read(io), JuliaContext()) 154 | end 155 | else 156 | @info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device, this might take a while..." 
157 | lib = build_runtime(cap) 158 | open(path, "w") do io 159 | write(io, lib) 160 | end 161 | lib 162 | end 163 | end 164 | end 165 | -------------------------------------------------------------------------------- /src/compiler/validation.jl: -------------------------------------------------------------------------------- 1 | # validation of properties and code 2 | 3 | function check_method(ctx::CompilerContext) 4 | # get the method 5 | ms = Base.methods(ctx.f, ctx.tt) 6 | isempty(ms) && throw(KernelError(ctx, "no method found")) 7 | length(ms)!=1 && throw(KernelError(ctx, "no unique matching method")) 8 | m = first(ms) 9 | 10 | # kernels can't return values 11 | if ctx.kernel 12 | rt = Base.return_types(ctx.f, ctx.tt)[1] 13 | if rt != Nothing 14 | throw(KernelError(ctx, "kernel returns a value of type `$rt`", 15 | """Make sure your kernel function ends in `return`, `return nothing` or `nothing`. 16 | If the returned value is of type `Union{}`, your Julia code probably throws an exception. 17 | Inspect the code with `@device_code_warntype` for more details.""")) 18 | end 19 | end 20 | 21 | return 22 | end 23 | 24 | function check_invocation(ctx::CompilerContext, entry::LLVM.Function) 25 | # make sure any non-isbits arguments are unused 26 | real_arg_i = 0 27 | sig = Base.signature_type(ctx.f, ctx.tt)::Type 28 | for (arg_i,dt) in enumerate(sig.parameters) 29 | isghosttype(dt) && continue 30 | real_arg_i += 1 31 | 32 | if !isbitstype(dt) 33 | param = parameters(entry)[real_arg_i] 34 | if !isempty(uses(param)) 35 | throw(KernelError(ctx, "passing and using non-bitstype argument", 36 | """Argument $arg_i to your kernel function is of type $dt. 37 | That type is not isbits, and such arguments are only allowed when they are unused by the kernel.""")) 38 | end 39 | end 40 | end 41 | 42 | return 43 | end 44 | 45 | 46 | ## IR validation 47 | 48 | const IRError = Tuple{String, StackTraces.StackTrace, Any} # kind, bt, meta 49 | 50 | struct InvalidIRError <: Exception 51 | ctx::CompilerContext 52 | errors::Vector{IRError} 53 | end 54 | 55 | const RUNTIME_FUNCTION = "call to the Julia runtime" 56 | const UNKNOWN_FUNCTION = "call to an unknown function" 57 | const POINTER_FUNCTION = "call through a literal pointer" 58 | 59 | function Base.showerror(io::IO, err::InvalidIRError) 60 | print(io, "InvalidIRError: compiling $(signature(err.ctx)) resulted in invalid LLVM IR") 61 | for (kind, bt, meta) in err.errors 62 | print(io, "\nReason: unsupported $kind") 63 | if meta != nothing 64 | if kind == RUNTIME_FUNCTION || kind == UNKNOWN_FUNCTION || kind == POINTER_FUNCTION 65 | print(io, " (call to ", meta, ")") 66 | end 67 | end 68 | Base.show_backtrace(io, bt) 69 | end 70 | return 71 | end 72 | 73 | function check_ir(ctx, args...) 74 | errors = check_ir!(ctx, IRError[], args...) 
75 | unique!(errors) 76 | if !isempty(errors) 77 | throw(InvalidIRError(ctx, errors)) 78 | end 79 | 80 | return 81 | end 82 | 83 | function check_ir!(ctx, errors::Vector{IRError}, mod::LLVM.Module) 84 | for f in functions(mod) 85 | check_ir!(ctx, errors, f) 86 | end 87 | 88 | return errors 89 | end 90 | 91 | function check_ir!(ctx, errors::Vector{IRError}, f::LLVM.Function) 92 | for bb in blocks(f), inst in instructions(bb) 93 | if isa(inst, LLVM.CallInst) 94 | check_ir!(ctx, errors, inst) 95 | end 96 | end 97 | 98 | return errors 99 | end 100 | 101 | const special_fns = ("vprintf", "__assertfail", "malloc", "free", "__nvvm_reflect") 102 | 103 | const libjulia = Ref{Ptr{Cvoid}}(C_NULL) 104 | 105 | function check_ir!(ctx, errors::Vector{IRError}, inst::LLVM.CallInst) 106 | dest = called_value(inst) 107 | if isa(dest, LLVM.Function) 108 | fn = LLVM.name(dest) 109 | 110 | # detect calls to undefined functions 111 | if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns) 112 | # figure out if the function lives in the Julia runtime library 113 | if libjulia[] == C_NULL 114 | paths = filter(Libdl.dllist()) do path 115 | name = splitdir(path)[2] 116 | startswith(name, "libjulia") 117 | end 118 | libjulia[] = Libdl.dlopen(first(paths)) 119 | end 120 | 121 | bt = backtrace(inst) 122 | if Libdl.dlsym_e(libjulia[], fn) != C_NULL 123 | push!(errors, (RUNTIME_FUNCTION, bt, LLVM.name(dest))) 124 | else 125 | push!(errors, (UNKNOWN_FUNCTION, bt, LLVM.name(dest))) 126 | end 127 | end 128 | elseif isa(dest, InlineAsm) 129 | # let's assume it's valid ASM 130 | elseif isa(dest, ConstantExpr) 131 | # detect calls to literal pointers 132 | # FIXME: can we detect these properly? 133 | # FIXME: jl_apply_generic and jl_invoke also have such arguments 134 | if occursin("inttoptr", string(dest)) 135 | # extract the literal pointer 136 | ptr_arg = first(operands(dest)) 137 | @compiler_assert isa(ptr_arg, ConstantInt) ctx 138 | ptr_val = convert(Int, ptr_arg) 139 | ptr = Ptr{Cvoid}(ptr_val) 140 | 141 | # look it up in the Julia JIT cache 142 | bt = backtrace(inst) 143 | frames = ccall(:jl_lookup_code_address, Any, (Ptr{Cvoid}, Cint,), ptr, 0) 144 | if length(frames) >= 1 145 | @compiler_assert length(frames) == 1 ctx frames=frames 146 | fn, file, line, linfo, fromC, inlined, ip = last(frames) 147 | push!(errors, (POINTER_FUNCTION, bt, fn)) 148 | else 149 | # the address could not be resolved to a Julia function; report it without a name 150 | push!(errors, (POINTER_FUNCTION, bt, nothing)) 151 | end 152 | end 153 | end 154 | 155 | return errors 156 | end 157 | -------------------------------------------------------------------------------- /deps/build.jl: -------------------------------------------------------------------------------- 1 | using CUDAapi 2 | using CUDAdrv 3 | using LLVM 4 | 5 | 6 | ## auxiliary routines 7 | 8 | function build_error(reason) 9 | println(""" 10 | $reason. 11 | 12 | This is not a fatal error, but GPU functionality will be unavailable. 13 | If you expected this to work, please open a thread on 14 | https://discourse.julialang.org/c/domain/gpu""") 15 | exit(1) 16 | end 17 | 18 | function llvm_support(version) 19 | @debug("Using LLVM $version") 20 | 21 | InitializeAllTargets() 22 | haskey(targets(), "nvptx") || 23 | build_error(""" 24 | Your LLVM does not support the NVPTX back-end.
25 | 26 | This is very strange; both the official binaries 27 | and an unmodified build should contain this back-end.""") 28 | 29 | target_support = sort(collect(CUDAapi.devices_for_llvm(version))) 30 | 31 | ptx_support = CUDAapi.isas_for_llvm(version) 32 | push!(ptx_support, v"6.0") # JuliaLang/julia#23817 33 | ptx_support = sort(collect(ptx_support)) 34 | 35 | @debug("LLVM support", targets=target_support, isas=ptx_support) 36 | return target_support, ptx_support 37 | end 38 | 39 | function cuda_support(driver_version, toolkit_version) 40 | @debug("Using CUDA driver $driver_version and toolkit $toolkit_version") 41 | 42 | # the toolkit version as reported contains major.minor.patch, 43 | # but the version number returned by libcuda is only major.minor. 44 | toolkit_version = VersionNumber(toolkit_version.major, toolkit_version.minor) 45 | if toolkit_version > driver_version 46 | build_error(""" 47 | CUDA $(toolkit_version.major).$(toolkit_version.minor) is not supported by 48 | your driver (which supports up to $(driver_version.major).$(driver_version.minor))""") 49 | end 50 | 51 | driver_target_support = CUDAapi.devices_for_cuda(driver_version) 52 | toolkit_target_support = CUDAapi.devices_for_cuda(toolkit_version) 53 | target_support = sort(collect(driver_target_support ∩ toolkit_target_support)) 54 | 55 | driver_ptx_support = CUDAapi.isas_for_cuda(driver_version) 56 | toolkit_ptx_support = CUDAapi.isas_for_cuda(toolkit_version) 57 | ptx_support = sort(collect(driver_ptx_support ∩ toolkit_ptx_support)) 58 | 59 | @debug("CUDA driver support", version=driver_version, 60 | targets=driver_target_support, isas=driver_ptx_support) 61 | @debug("CUDA toolkit support", version=toolkit_version, 62 | targets=toolkit_target_support, isas=toolkit_ptx_support) 63 | 64 | return target_support, ptx_support 65 | end 66 | 67 | 68 | ## main 69 | 70 | const config_path = joinpath(@__DIR__, "ext.jl") 71 | const previous_config_path = config_path * ".bak" 72 | 73 | function write_ext(config, path) 74 | open(path, "w") do io 75 | println(io, "# autogenerated file, do not edit") 76 | for (key,val) in config 77 | println(io, "const $key = $(repr(val))") 78 | end 79 | end 80 | end 81 | 82 | function read_ext(path) 83 | config = Dict{Symbol,Any}() 84 | r = r"^const (\w+) = (.+)$" 85 | open(path, "r") do io 86 | for line in eachline(io) 87 | m = match(r, line) 88 | if m != nothing 89 | config[Symbol(m.captures[1])] = eval(Meta.parse(m.captures[2])) 90 | end 91 | end 92 | end 93 | return config 94 | end 95 | 96 | function main() 97 | ispath(config_path) && mv(config_path, previous_config_path; force=true) 98 | config = Dict{Symbol,Any}(:configured => false) 99 | write_ext(config, config_path) 100 | 101 | 102 | ## gather info 103 | 104 | ### LLVM.jl 105 | 106 | LLVM.configured || build_error("Dependent package LLVM.jl has not been built successfully") 107 | 108 | LLVM.libllvm_system && build_error("CUDAnative.jl requires LLVM.jl to be built against Julia's LLVM library, not a system-provided one") 109 | 110 | llvm_version = LLVM.version() 111 | llvm_targets, llvm_isas = llvm_support(llvm_version) 112 | 113 | ### julia 114 | 115 | julia_llvm_version = Base.libllvm_version 116 | if julia_llvm_version != llvm_version 117 | build_error("LLVM $llvm_version incompatible with Julia's LLVM $julia_llvm_version") 118 | end 119 | 120 | ### CUDA 121 | 122 | CUDAdrv.configured || build_error("Dependent package CUDAdrv.jl has not been built successfully") 123 | 124 | toolkit_dirs = find_toolkit() 125 | cuda_toolkit_version = 
find_toolkit_version(toolkit_dirs) 126 | 127 | config[:cuda_driver_version] = CUDAdrv.version() 128 | cuda_targets, cuda_isas = cuda_support(config[:cuda_driver_version], cuda_toolkit_version) 129 | 130 | config[:target_support] = sort(collect(llvm_targets ∩ cuda_targets)) 131 | isempty(config[:target_support]) && build_error("Your toolchain does not support any device target") 132 | 133 | config[:ptx_support] = sort(collect(llvm_isas ∩ cuda_isas)) 134 | isempty(config[:ptx_support]) && build_error("Your toolchain does not support any PTX ISA") 135 | 136 | @debug("CUDAnative support", targets=config[:target_support], isas=config[:ptx_support]) 137 | 138 | # discover other CUDA toolkit artifacts 139 | ## required 140 | config[:libdevice] = find_libdevice(config[:target_support], toolkit_dirs) 141 | config[:libdevice] == nothing && build_error("Available CUDA toolchain does not provide libdevice") 142 | ## optional 143 | config[:nvdisasm] = find_cuda_binary("nvdisasm", toolkit_dirs) 144 | config[:ptxas] = find_cuda_binary("ptxas", toolkit_dirs) 145 | 146 | config[:configured] = true 147 | 148 | 149 | ## (re)generate ext.jl 150 | 151 | if isfile(previous_config_path) 152 | previous_config = read_ext(previous_config_path) 153 | 154 | if config == previous_config 155 | mv(previous_config_path, config_path; force=true) 156 | return 157 | end 158 | end 159 | 160 | write_ext(config, config_path) 161 | 162 | return 163 | end 164 | 165 | main() 166 | -------------------------------------------------------------------------------- /src/device/runtime_intrinsics.jl: -------------------------------------------------------------------------------- 1 | # CUDAnative run-time library 2 | # 3 | # This module defines method instances that will be compiled into a device-specific image 4 | # and will be available to the CUDAnative compiler to call after Julia has generated code. 5 | 6 | module Runtime 7 | 8 | using ..CUDAnative 9 | using LLVM 10 | using LLVM.Interop 11 | 12 | 13 | ## representation of a runtime method instance 14 | 15 | struct RuntimeMethodInstance 16 | def::Function 17 | 18 | return_type::Type 19 | types::Tuple 20 | name::Symbol 21 | 22 | # LLVM types cannot be cached, so we can't put them in the runtime method instance. 23 | # the actual types are constructed upon accessing them, based on a sentinel value: 24 | # - nothing: construct the LLVM type based on its Julia counterparts 25 | # - function: call this generator to get the type (when more control is needed) 26 | llvm_return_type::Union{Nothing, Function} 27 | llvm_types::Union{Nothing, Function} 28 | llvm_name::String 29 | end 30 | 31 | function Base.getproperty(rt::RuntimeMethodInstance, field::Symbol) 32 | value = getfield(rt, field) 33 | if field == :llvm_types 34 | if value == nothing 35 | LLVMType[convert.(LLVMType, typ) for typ in rt.types] 36 | else 37 | value() 38 | end 39 | elseif field == :llvm_return_type 40 | if value == nothing 41 | convert.(LLVMType, rt.return_type) 42 | else 43 | value() 44 | end 45 | else 46 | return value 47 | end 48 | end 49 | 50 | const methods = Dict{Symbol,RuntimeMethodInstance}() 51 | get(name::Symbol) = methods[name] 52 | 53 | # Register a Julia function `def` as a runtime library function identified by `name`. The 54 | # function will be compiled upon first use for argument types `types` and should return 55 | # `return_type`. Use `Runtime.get(name)` to get a reference to this method instance. 
56 | # 57 | # The corresponding LLVM types `llvm_types` and `llvm_return_type` will be deduced from 58 | # their Julia counterparts. To influence that conversion, pass a callable object instead; 59 | # this object will be evaluated at run-time and the returned value will be used instead. 60 | # 61 | # When generating multiple runtime functions from a single definition, make sure to specify 62 | # different values for `name`. The LLVM function name will be deduced from that name, but 63 | # you can always specify `llvm_name` to influence that. Never use an LLVM name that starts 64 | # with `julia_` or the function might clash with other compiled functions. 65 | function compile(def, return_type, types, llvm_return_type=nothing, llvm_types=nothing; 66 | name=typeof(def).name.mt.name, llvm_name="ptx_$name") 67 | meth = RuntimeMethodInstance(def, 68 | return_type, types, name, 69 | llvm_return_type, llvm_types, llvm_name) 70 | if haskey(methods, name) 71 | error("Runtime function $name has already been registered!") 72 | end 73 | methods[name] = meth 74 | meth 75 | end 76 | 77 | 78 | ## exception handling 79 | 80 | function report_exception(ex) 81 | @cuprintf(""" 82 | ERROR: a %s was thrown during kernel execution. 83 | Run Julia on debug level 2 for device stack traces. 84 | """, ex) 85 | return 86 | end 87 | 88 | compile(report_exception, Nothing, (Ptr{Cchar},)) 89 | 90 | function report_exception_name(ex) 91 | @cuprintf(""" 92 | ERROR: a %s was thrown during kernel execution. 93 | Stacktrace: 94 | """, ex) 95 | return 96 | end 97 | 98 | function report_exception_frame(idx, func, file, line) 99 | @cuprintf(" [%i] %s at %s:%i\n", idx, func, file, line) 100 | return 101 | end 102 | 103 | compile(report_exception_frame, Nothing, (Cint, Ptr{Cchar}, Ptr{Cchar}, Cint)) 104 | compile(report_exception_name, Nothing, (Ptr{Cchar},)) 105 | 106 | # NOTE: no throw functions are provided here, but replaced by an LLVM pass instead 107 | # in order to provide some debug information without stack unwinding. 
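# As a (hypothetical) illustration of the registration mechanism above, a custom runtime
# method would be defined and registered just like `report_exception`:
#
#     function report_oom()            # name made up for illustration purposes
#         @cuprintf("ERROR: out of GPU memory\n")
#         return
#     end
#     compile(report_oom, Nothing, ())
#
# The compiler can then fetch the instance with `Runtime.get(:report_oom)` and emit a call
# to it (cfr. `LLVM.call!` in `compiler/rtlib.jl`).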
108 | 109 | 110 | ## GC 111 | 112 | @enum AddressSpace begin 113 | Generic = 1 114 | Tracked = 10 115 | Derived = 11 116 | CalleeRooted = 12 117 | Loaded = 13 118 | end 119 | 120 | # LLVM type of a tracked pointer 121 | function T_prjlvalue() 122 | T_pjlvalue = convert(LLVMType, Any, true) 123 | LLVM.PointerType(eltype(T_pjlvalue), Tracked) 124 | end 125 | 126 | function gc_pool_alloc(sz::Csize_t) 127 | ptr = malloc(sz) 128 | return unsafe_pointer_to_objref(ptr) 129 | end 130 | 131 | compile(gc_pool_alloc, Any, (Csize_t,), T_prjlvalue) 132 | 133 | 134 | ## boxing and unboxing 135 | 136 | const tag_type = UInt 137 | const tag_size = sizeof(tag_type) 138 | 139 | const gc_bits = 0x3 # FIXME 140 | 141 | # get the type tag of a type at run-time 142 | @generated function type_tag(::Val{type_name}) where type_name 143 | T_tag = convert(LLVMType, tag_type) 144 | T_ptag = LLVM.PointerType(T_tag) 145 | 146 | T_pjlvalue = convert(LLVMType, Any, true) 147 | 148 | # create function 149 | llvm_f, _ = create_function(T_tag) 150 | mod = LLVM.parent(llvm_f) 151 | 152 | # this isn't really a function, but we abuse it to get the JIT to resolve the address 153 | typ = LLVM.Function(mod, "jl_" * String(type_name) * "_type", 154 | LLVM.FunctionType(T_pjlvalue)) 155 | 156 | # generate IR 157 | Builder(JuliaContext()) do builder 158 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 159 | position!(builder, entry) 160 | 161 | typ_var = bitcast!(builder, typ, T_ptag) 162 | 163 | tag = load!(builder, typ_var) 164 | 165 | ret!(builder, tag) 166 | end 167 | 168 | call_function(llvm_f, tag_type) 169 | end 170 | 171 | # we use `jl_value_ptr`, a Julia pseudo-intrinsic that can be used to box and unbox values 172 | 173 | @generated function box(val, ::Val{type_name}) where type_name 174 | sz = sizeof(val) 175 | allocsz = sz + tag_size 176 | 177 | # type-tags are ephemeral, so look them up at run time 178 | #tag = unsafe_load(convert(Ptr{tag_type}, type_name)) 179 | tag = :( type_tag(Val(type_name)) ) 180 | 181 | quote 182 | Base.@_inline_meta 183 | 184 | ptr = malloc($(Csize_t(allocsz))) 185 | 186 | # store the type tag 187 | ptr = convert(Ptr{tag_type}, ptr) 188 | Core.Intrinsics.pointerset(ptr, $tag | $gc_bits, #=index=# 1, #=align=# $tag_size) 189 | 190 | # store the value 191 | ptr = convert(Ptr{$val}, ptr+tag_size) 192 | Core.Intrinsics.pointerset(ptr, val, #=index=# 1, #=align=# $sz) 193 | 194 | unsafe_pointer_to_objref(ptr) 195 | end 196 | end 197 | 198 | @inline function unbox(obj, ::Type{T}) where T 199 | ptr = ccall(:jl_value_ptr, Ptr{Cvoid}, (Any,), obj) 200 | 201 | # load the value 202 | ptr = convert(Ptr{T}, ptr) 203 | Core.Intrinsics.pointerref(ptr, #=index=# 1, #=align=# sizeof(T)) 204 | end 205 | 206 | # generate functions functions that exist in the Julia runtime (see julia/src/datatype.c) 207 | for (T, t) in [Int8 => :int8, Int16 => :int16, Int32 => :int32, Int64 => :int64, 208 | UInt8 => :uint8, UInt16 => :uint16, UInt32 => :uint32, UInt64 => :uint64] 209 | box_fn = Symbol("box_$t") 210 | unbox_fn = Symbol("unbox_$t") 211 | @eval begin 212 | $box_fn(val) = box($T(val), Val($(QuoteNode(t)))) 213 | $unbox_fn(obj) = unbox(obj, $T) 214 | 215 | compile($box_fn, Any, ($T,), T_prjlvalue; llvm_name=$"jl_$box_fn") 216 | compile($unbox_fn, $T, (Any,); llvm_name=$"jl_$unbox_fn") 217 | end 218 | end 219 | 220 | 221 | end 222 | -------------------------------------------------------------------------------- /src/device/pointer.jl: -------------------------------------------------------------------------------- 1 
| # Pointers with address space information 2 | 3 | # 4 | # Address spaces 5 | # 6 | 7 | export AS, addrspace 8 | 9 | abstract type AddressSpace end 10 | 11 | module AS 12 | 13 | import ..AddressSpace 14 | 15 | struct Generic <: AddressSpace end 16 | struct Global <: AddressSpace end 17 | struct Shared <: AddressSpace end 18 | struct Constant <: AddressSpace end 19 | struct Local <: AddressSpace end 20 | 21 | end 22 | 23 | 24 | # 25 | # Device pointer 26 | # 27 | 28 | """ 29 | DevicePtr{T,A} 30 | 31 | A memory address that refers to data of type `T` that is accessible from the GPU. It is the 32 | on-device counterpart of `CUDAdrv.CuPtr`, additionally keeping track of the address space 33 | `A` where the data resides (shared, global, constant, etc). This information is used to 34 | provide optimized implementations of operations such as `unsafe_load` and `unsafe_store!.` 35 | """ 36 | DevicePtr 37 | 38 | if sizeof(Ptr{Cvoid}) == 8 39 | primitive type DevicePtr{T,A} 64 end 40 | else 41 | primitive type DevicePtr{T,A} 32 end 42 | end 43 | 44 | # constructors 45 | DevicePtr{T,A}(x::Union{Int,UInt,CuPtr,DevicePtr}) where {T,A<:AddressSpace} = Base.bitcast(DevicePtr{T,A}, x) 46 | DevicePtr{T}(ptr::CuPtr{T}) where {T} = DevicePtr{T,AS.Generic}(ptr) 47 | DevicePtr(ptr::CuPtr{T}) where {T} = DevicePtr{T,AS.Generic}(ptr) 48 | 49 | 50 | ## getters 51 | 52 | Base.eltype(::Type{<:DevicePtr{T}}) where {T} = T 53 | 54 | addrspace(x::DevicePtr) = addrspace(typeof(x)) 55 | addrspace(::Type{DevicePtr{T,A}}) where {T,A} = A 56 | 57 | 58 | ## conversions 59 | 60 | # to and from integers 61 | ## pointer to integer 62 | Base.convert(::Type{T}, x::DevicePtr) where {T<:Integer} = T(UInt(x)) 63 | ## integer to pointer 64 | Base.convert(::Type{DevicePtr{T,A}}, x::Union{Int,UInt}) where {T,A<:AddressSpace} = DevicePtr{T,A}(x) 65 | Int(x::DevicePtr) = Base.bitcast(Int, x) 66 | UInt(x::DevicePtr) = Base.bitcast(UInt, x) 67 | 68 | # between host and device pointers 69 | Base.convert(::Type{CuPtr{T}}, p::DevicePtr) where {T} = Base.bitcast(CuPtr{T}, p) 70 | Base.convert(::Type{DevicePtr{T,A}}, p::CuPtr) where {T,A<:AddressSpace} = Base.bitcast(DevicePtr{T,A}, p) 71 | Base.convert(::Type{DevicePtr{T}}, p::CuPtr) where {T} = Base.bitcast(DevicePtr{T,AS.Generic}, p) 72 | 73 | # between device pointers 74 | Base.convert(::Type{<:DevicePtr}, p::DevicePtr) = throw(ArgumentError("cannot convert between incompatible device pointer types")) 75 | Base.convert(::Type{DevicePtr{T,A}}, p::DevicePtr{T,A}) where {T,A} = p 76 | Base.unsafe_convert(::Type{DevicePtr{T,A}}, p::DevicePtr) where {T,A} = Base.bitcast(DevicePtr{T,A}, p) 77 | ## identical addrspaces 78 | Base.convert(::Type{DevicePtr{T,A}}, p::DevicePtr{U,A}) where {T,U,A} = Base.unsafe_convert(DevicePtr{T,A}, p) 79 | ## convert to & from generic 80 | Base.convert(::Type{DevicePtr{T,AS.Generic}}, p::DevicePtr) where {T} = Base.unsafe_convert(DevicePtr{T,AS.Generic}, p) 81 | Base.convert(::Type{DevicePtr{T,A}}, p::DevicePtr{U,AS.Generic}) where {T,U,A} = Base.unsafe_convert(DevicePtr{T,A}, p) 82 | Base.convert(::Type{DevicePtr{T,AS.Generic}}, p::DevicePtr{T,AS.Generic}) where {T} = p # avoid ambiguities 83 | ## unspecified, preserve source addrspace 84 | Base.convert(::Type{DevicePtr{T}}, p::DevicePtr{U,A}) where {T,U,A} = Base.unsafe_convert(DevicePtr{T,A}, p) 85 | 86 | # defer conversions to DevicePtr to unsafe_convert 87 | Base.cconvert(::Type{<:DevicePtr}, x) = x 88 | 89 | 90 | ## limited pointer arithmetic & comparison 91 | 92 | isequal(x::DevicePtr, y::DevicePtr) = (x === 
y) && addrspace(x) == addrspace(y) 93 | isless(x::DevicePtr{T,A}, y::DevicePtr{T,A}) where {T,A<:AddressSpace} = x < y 94 | 95 | Base.:(==)(x::DevicePtr, y::DevicePtr) = UInt(x) == UInt(y) && addrspace(x) == addrspace(y) 96 | Base.:(<)(x::DevicePtr, y::DevicePtr) = UInt(x) < UInt(y) 97 | Base.:(-)(x::DevicePtr, y::DevicePtr) = UInt(x) - UInt(y) 98 | 99 | Base.:(+)(x::DevicePtr, y::Integer) = oftype(x, Base.add_ptr(UInt(x), (y % UInt) % UInt)) 100 | Base.:(-)(x::DevicePtr, y::Integer) = oftype(x, Base.sub_ptr(UInt(x), (y % UInt) % UInt)) 101 | Base.:(+)(x::Integer, y::DevicePtr) = y + x 102 | 103 | 104 | 105 | ## memory operations 106 | 107 | Base.convert(::Type{Int}, ::Type{AS.Generic}) = 0 108 | Base.convert(::Type{Int}, ::Type{AS.Global}) = 1 109 | Base.convert(::Type{Int}, ::Type{AS.Shared}) = 3 110 | Base.convert(::Type{Int}, ::Type{AS.Constant}) = 4 111 | Base.convert(::Type{Int}, ::Type{AS.Local}) = 5 112 | 113 | function tbaa_make_child(name::String, constant::Bool=false; ctx::LLVM.Context=JuliaContext()) 114 | tbaa_root = MDNode([MDString("ptxtbaa", ctx)], ctx) 115 | tbaa_struct_type = 116 | MDNode([MDString("ptxtbaa_$name", ctx), 117 | tbaa_root, 118 | LLVM.ConstantInt(0, ctx)], ctx) 119 | tbaa_access_tag = 120 | MDNode([tbaa_struct_type, 121 | tbaa_struct_type, 122 | LLVM.ConstantInt(0, ctx), 123 | LLVM.ConstantInt(constant ? 1 : 0, ctx)], ctx) 124 | 125 | return tbaa_access_tag 126 | end 127 | 128 | tbaa_addrspace(as::Type{<:AddressSpace}) = tbaa_make_child(lowercase(String(as.name.name))) 129 | 130 | @generated function Base.unsafe_load(p::DevicePtr{T,A}, i::Integer=1, 131 | ::Val{align}=Val(1)) where {T,A,align} 132 | eltyp = convert(LLVMType, T) 133 | 134 | T_int = convert(LLVMType, Int) 135 | T_ptr = convert(LLVMType, DevicePtr{T,A}) 136 | 137 | T_actual_ptr = LLVM.PointerType(eltyp) 138 | 139 | # create a function 140 | param_types = [T_ptr, T_int] 141 | llvm_f, _ = create_function(eltyp, param_types) 142 | 143 | # generate IR 144 | Builder(JuliaContext()) do builder 145 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 146 | position!(builder, entry) 147 | 148 | ptr = inttoptr!(builder, parameters(llvm_f)[1], T_actual_ptr) 149 | 150 | ptr = gep!(builder, ptr, [parameters(llvm_f)[2]]) 151 | ptr_with_as = addrspacecast!(builder, ptr, LLVM.PointerType(eltyp, convert(Int, A))) 152 | ld = load!(builder, ptr_with_as) 153 | 154 | if A != AS.Generic 155 | metadata(ld)[LLVM.MD_tbaa] = tbaa_addrspace(A) 156 | end 157 | alignment!(ld, align) 158 | 159 | ret!(builder, ld) 160 | end 161 | 162 | call_function(llvm_f, T, Tuple{DevicePtr{T,A}, Int}, :((p, Int(i-one(i))))) 163 | end 164 | 165 | @generated function Base.unsafe_store!(p::DevicePtr{T,A}, x, i::Integer=1, 166 | ::Val{align}=Val(1)) where {T,A,align} 167 | eltyp = convert(LLVMType, T) 168 | 169 | T_int = convert(LLVMType, Int) 170 | T_ptr = convert(LLVMType, DevicePtr{T,A}) 171 | 172 | T_actual_ptr = LLVM.PointerType(eltyp) 173 | 174 | # create a function 175 | param_types = [T_ptr, eltyp, T_int] 176 | llvm_f, _ = create_function(LLVM.VoidType(JuliaContext()), param_types) 177 | 178 | # generate IR 179 | Builder(JuliaContext()) do builder 180 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 181 | position!(builder, entry) 182 | 183 | ptr = inttoptr!(builder, parameters(llvm_f)[1], T_actual_ptr) 184 | 185 | ptr = gep!(builder, ptr, [parameters(llvm_f)[3]]) 186 | ptr_with_as = addrspacecast!(builder, ptr, LLVM.PointerType(eltyp, convert(Int, A))) 187 | val = parameters(llvm_f)[2] 188 | st = store!(builder, val, 
ptr_with_as) 189 | 190 | if A != AS.Generic 191 | metadata(st)[LLVM.MD_tbaa] = tbaa_addrspace(A) 192 | end 193 | alignment!(st, align) 194 | 195 | ret!(builder) 196 | end 197 | 198 | call_function(llvm_f, Cvoid, Tuple{DevicePtr{T,A}, T, Int}, 199 | :((p, convert(T,x), Int(i-one(i))))) 200 | end 201 | 202 | ## loading through the texture cache 203 | 204 | export unsafe_cached_load 205 | 206 | # NOTE: CUDA 8.0 supports more caching modifiers, but those aren't supported by LLVM yet 207 | 208 | # TODO: this functionality should throw Nothing, 6 | :i8 => Int8, 7 | :i16 => Int16, 8 | :i32 => Int32, 9 | :i64 => Int64, 10 | :float => Float32, 11 | :double => Float64 12 | ) 13 | 14 | # Decode an expression of the form: 15 | # 16 | # function(arg::arg_type, arg::arg_type, ... arg::arg_type)::return_type 17 | # 18 | # Returns a tuple containing the function name, a vector of argument, a vector of argument 19 | # types and the return type (all in symbolic form). 20 | function decode_call(e) 21 | @assert e.head == :(::) 22 | 23 | # decode the return type expression: single symbol (the LLVM type), or a tuple of 2 24 | # symbols (the LLVM and corresponding Julia type) 25 | retspec = e.args[2] 26 | if isa(retspec, Symbol) 27 | rettype = retspec 28 | else 29 | @assert retspec.head == :tuple 30 | @assert length(retspec.args) == 2 31 | rettype = (retspec.args[1], retspec.args[2]) 32 | end 33 | 34 | call = e.args[1] 35 | @assert call.head == :call 36 | 37 | fn = Symbol(call.args[1]) 38 | args = Symbol[arg.args[1] for arg in call.args[2:end]] 39 | argtypes = Symbol[arg.args[2] for arg in call.args[2:end]] 40 | 41 | return fn, args, argtypes, rettype 42 | end 43 | 44 | # Generate a `llvmcall` statement calling an intrinsic specified as follows: 45 | # 46 | # intrinsic(arg::arg_type, arg::arg_type, ... arg::arg_type)::return_type [attr] 47 | # 48 | # The argument types should be valid LLVM type identifiers (eg. i32, float, double). 49 | # Conversions to the corresponding Julia type are automatically generated; make sure the 50 | # actual arguments are of the same type to make these conversions no-ops. The optional 51 | # argument `attr` indicates which LLVM function attributes (such as `readnone` or `nounwind`) 52 | # to add to the intrinsic declaration. 53 | 54 | # For example, the following call: 55 | # `@wrap __some_intrinsic(x::float, y::double)::float` 56 | # 57 | # will yield the following `llvmcall`: 58 | # ``` 59 | # Base.llvmcall(("declare float @__somme__intr(float, double)", 60 | # "%3 = call float @__somme__intr(float %0, double %1) 61 | # ret float %3"), 62 | # Float32, Tuple{Float32,Float64}, 63 | # convert(Float32,x), convert(Float64,y)) 64 | # ``` 65 | macro wrap(call, attrs="") 66 | intrinsic, args, argtypes, rettype = decode_call(call) 67 | 68 | # decide on intrinsic return type 69 | if isa(rettype, Symbol) 70 | # only LLVM return type specified, match against known LLVM/Julia type combinations 71 | llvm_ret_typ = rettype 72 | julia_ret_typ = jltypes[rettype] 73 | else 74 | # both specified (for when there is a mismatch, eg. 
i32 -> UInt32) 75 | llvm_ret_typ = rettype[1] 76 | julia_ret_typ = rettype[2] 77 | end 78 | 79 | llvm_args = String["%$i" for i in 0:length(argtypes)] 80 | if llvm_ret_typ == :void 81 | llvm_ret_asgn = "" 82 | llvm_ret = "void" 83 | else 84 | llvm_ret_var = "%$(length(argtypes)+1)" 85 | llvm_ret_asgn = "$llvm_ret_var = " 86 | llvm_ret = "$llvm_ret_typ $llvm_ret_var" 87 | end 88 | llvm_declargs = join(argtypes, ", ") 89 | llvm_defargs = join(("$t $arg" for (t,arg) in zip(argtypes, llvm_args)), ", ") 90 | 91 | julia_argtypes = (jltypes[t] for t in argtypes) 92 | julia_args = (:(convert($argtype, $(esc(arg)))) for (arg, argtype) in zip(args, julia_argtypes)) 93 | 94 | dest = ("""declare $llvm_ret_typ @$intrinsic($llvm_declargs)""", 95 | """$llvm_ret_asgn call $llvm_ret_typ @$intrinsic($llvm_defargs) 96 | ret $llvm_ret""") 97 | return quote 98 | Base.llvmcall($dest, $julia_ret_typ, Tuple{$(julia_argtypes...)}, $(julia_args...)) 99 | end 100 | end 101 | 102 | 103 | # julia.h: jl_datatype_align 104 | Base.@pure function datatype_align(::Type{T}) where {T} 105 | # typedef struct { 106 | # uint32_t nfields; 107 | # uint32_t alignment : 9; 108 | # uint32_t haspadding : 1; 109 | # uint32_t npointers : 20; 110 | # uint32_t fielddesc_type : 2; 111 | # } jl_datatype_layout_t; 112 | field = T.layout + sizeof(UInt32) 113 | unsafe_load(convert(Ptr{UInt16}, field)) & convert(Int16, 2^9-1) 114 | end 115 | 116 | 117 | # generalization of word-based primitives 118 | 119 | ## extract bits from a larger value 120 | @inline function extract_word(val, ::Val{i}) where {i} 121 | extract_value(val, UInt32, Val(32*(i-1))) 122 | end 123 | @generated function extract_value(val, ::Type{sub}, ::Val{offset}) where {sub, offset} 124 | T_val = convert(LLVMType, val) 125 | T_sub = convert(LLVMType, sub) 126 | 127 | bytes = Core.sizeof(val) 128 | T_int = LLVM.IntType(8*bytes, JuliaContext()) 129 | 130 | # create function 131 | llvm_f, _ = create_function(T_sub, [T_val]) 132 | mod = LLVM.parent(llvm_f) 133 | 134 | # generate IR 135 | Builder(JuliaContext()) do builder 136 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 137 | position!(builder, entry) 138 | 139 | equiv = bitcast!(builder, parameters(llvm_f)[1], T_int) 140 | shifted = lshr!(builder, equiv, LLVM.ConstantInt(T_int, offset)) 141 | # extracted = and!(builder, shifted, 2^32-1) 142 | extracted = trunc!(builder, shifted, T_sub) 143 | 144 | ret!(builder, extracted) 145 | end 146 | 147 | call_function(llvm_f, UInt32, Tuple{val}, :( (val,) )) 148 | end 149 | 150 | ## insert bits into a larger value 151 | @inline function insert_word(val, word::UInt32, ::Val{i}) where {i} 152 | insert_value(val, word, Val(32*(i-1))) 153 | end 154 | @generated function insert_value(val, sub, ::Val{offset}) where {offset} 155 | T_val = convert(LLVMType, val) 156 | T_sub = convert(LLVMType, sub) 157 | 158 | bytes = Core.sizeof(val) 159 | T_out_int = LLVM.IntType(8*bytes, JuliaContext()) 160 | 161 | # create function 162 | llvm_f, _ = create_function(T_val, [T_val, T_sub]) 163 | mod = LLVM.parent(llvm_f) 164 | 165 | # generate IR 166 | Builder(JuliaContext()) do builder 167 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 168 | position!(builder, entry) 169 | 170 | equiv = bitcast!(builder, parameters(llvm_f)[1], T_out_int) 171 | ext = zext!(builder, parameters(llvm_f)[2], T_out_int) 172 | shifted = shl!(builder, ext, LLVM.ConstantInt(T_out_int, offset)) 173 | inserted = or!(builder, equiv, shifted) 174 | orig = bitcast!(builder, inserted, T_val) 175 | 176 | ret!(builder, orig) 177 
| end 178 | 179 | call_function(llvm_f, val, Tuple{val, sub}, :( (val, sub) )) 180 | end 181 | 182 | # split the invocation of a function `op` on a value `val` with non-struct eltype 183 | # into multiple smaller invocations on byte-sized partial values. 184 | @generated function split_value_invocation(op::Function, val, args...) 185 | # TODO: control of lower-limit 186 | 187 | ex = quote 188 | Base.@_inline_meta 189 | end 190 | 191 | # disassemble into words 192 | words = Symbol[] 193 | for i in 1:Core.sizeof(val)÷4 194 | word = Symbol("word$i") 195 | push!(ex.args, :( $word = extract_word(val, Val($i)) )) 196 | push!(words, word) 197 | end 198 | 199 | # perform the operation 200 | for word in words 201 | push!(ex.args, :( $word = op($word, args...)) ) 202 | end 203 | 204 | # reassemble 205 | push!(ex.args, :( out = zero(val) )) 206 | for (i,word) in enumerate(words) 207 | push!(ex.args, :( out = insert_word(out, $word, Val($i)) )) 208 | end 209 | 210 | push!(ex.args, :( out )) 211 | return ex 212 | end 213 | 214 | # split the invocation of a function `op` on a value `val` 215 | # by invoking the function on each of its fields 216 | @generated function recurse_value_invocation(op::Function, val, args...) 217 | ex = quote 218 | Base.@_inline_meta 219 | end 220 | 221 | fields = fieldnames(val) 222 | if isempty(fields) 223 | push!(ex.args, :( split_value_invocation(op, val, args...) )) 224 | else 225 | ctor = Expr(:new, val) 226 | for field in fields 227 | push!(ctor.args, :( 228 | recurse_value_invocation(op, getfield(val, $(QuoteNode(field))), args...) )) 229 | end 230 | push!(ex.args, ctor) 231 | end 232 | 233 | return ex 234 | end 235 | 236 | # split the invocation of a function `op` on a pointer `ptr` with non-struct eltype 237 | # into multiple smaller invocations on any supported pointer as listed in `supported_ptrs`. 238 | @generated function split_pointer_invocation(op::Function, ptr, ::Type{supported_ptrs}, 239 | args...) where {supported_ptrs} 240 | T = eltype(ptr) 241 | elsize(x) = Core.sizeof(eltype(x)) 242 | supported_ptrs = reverse(Base.uniontypes(supported_ptrs)) 243 | 244 | ex = quote 245 | Base.@_inline_meta 246 | end 247 | 248 | # disassemble 249 | vals = Tuple{Symbol,Int,Type}[] 250 | offset = 0 251 | while offset < Core.sizeof(T) 252 | val = Symbol("value.$(length(vals)+1)") 253 | 254 | # greedy selection of next pointer type 255 | remaining = Core.sizeof(T)-offset 256 | valid = filter(ptr->elsize(ptr)<=remaining, supported_ptrs) 257 | if isempty(valid) 258 | error("Cannot partition $T into values of $supported_typs") 259 | end 260 | ptr = first(sort(collect(valid); by=elsize, rev=true)) 261 | 262 | push!(vals, (val, offset, ptr)) 263 | offset += elsize(ptr) 264 | end 265 | 266 | # perform the operation 267 | for (val, offset, ptr) in vals 268 | subptr = :(convert($ptr, ptr+$offset)) 269 | push!(ex.args, :( $val = op($subptr, args...)) ) 270 | end 271 | 272 | # reassemble 273 | push!(ex.args, :( out = zero($T) )) 274 | for (val, offset, ptr) in vals 275 | push!(ex.args, :( out = insert_value(out, $val, Val($offset)) )) 276 | end 277 | 278 | push!(ex.args, :( out )) 279 | return ex 280 | end 281 | 282 | # split the invocation of a function `op` on a pointer `ptr` 283 | # by invoking the function on a pointer to each of its fields 284 | @generated function recurse_pointer_invocation(op::Function, ptr, ::Type{supported_ptrs}, 285 | args...) 
where {supported_ptrs} 286 | T = eltype(ptr) 287 | 288 | ex = quote 289 | Base.@_inline_meta 290 | end 291 | 292 | fields = fieldnames(T) 293 | if isempty(fields) 294 | push!(ex.args, :( split_pointer_invocation(op, ptr, supported_ptrs, args...) )) 295 | else 296 | ctor = Expr(:new, T) 297 | for (i,field) in enumerate(fields) 298 | field_typ = fieldtype(T, i) 299 | field_offset = fieldoffset(T, i) 300 | field_ptr_typ = :($(ptr.name.wrapper){$field_typ}) 301 | # NOTE: this ctor is a leap of faith 302 | subptr = :(convert($field_ptr_typ, ptr+$field_offset)) 303 | push!(ctor.args, :( 304 | recurse_pointer_invocation(op, $subptr, supported_ptrs, args...) )) 305 | end 306 | push!(ex.args, ctor) 307 | end 308 | 309 | return ex 310 | end 311 | -------------------------------------------------------------------------------- /src/execution.jl: -------------------------------------------------------------------------------- 1 | # Native execution support 2 | 3 | export @cuda, cudaconvert, cufunction, nearest_warpsize 4 | 5 | 6 | ## kernel object and query functions 7 | 8 | struct Kernel{F,TT} 9 | ctx::CuContext 10 | mod::CuModule 11 | fun::CuFunction 12 | end 13 | 14 | """ 15 | version(k::Kernel) 16 | 17 | Queries the PTX and SM versions a kernel was compiled for. 18 | Returns a named tuple. 19 | """ 20 | function version(k::Kernel) 21 | attr = attributes(k.fun) 22 | binary_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_BINARY_VERSION],10)...) 23 | ptx_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_PTX_VERSION],10)...) 24 | return (ptx=ptx_ver, binary=binary_ver) 25 | end 26 | 27 | """ 28 | memory(k::Kernel) 29 | 30 | Queries the local, shared and constant memory usage of a compiled kernel in bytes. 31 | Returns a named tuple. 32 | """ 33 | function memory(k::Kernel) 34 | attr = attributes(k.fun) 35 | local_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES] 36 | shared_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_SHARED_SIZE_BYTES] 37 | constant_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_CONST_SIZE_BYTES] 38 | return (:local=>local_mem, shared=shared_mem, constant=constant_mem) 39 | end 40 | 41 | """ 42 | registers(k::Kernel) 43 | 44 | Queries the register usage of a kernel. 45 | """ 46 | function registers(k::Kernel) 47 | attr = attributes(k.fun) 48 | return attr[CUDAdrv.FUNC_ATTRIBUTE_NUM_REGS] 49 | end 50 | 51 | """ 52 | maxthreads(k::Kernel) 53 | 54 | Queries the maximum amount of threads a kernel can use in a single block. 
55 | """ 56 | function maxthreads(k::Kernel) 57 | attr = attributes(k.fun) 58 | return attr[CUDAdrv.FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK] 59 | end 60 | 61 | 62 | ## helper functions 63 | 64 | # split keyword arguments to `@cuda` into ones affecting the compiler, or the execution 65 | function split_kwargs(kwargs) 66 | compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs] 67 | call_kws = [:blocks, :threads, :shmem, :stream] 68 | compiler_kwargs = [] 69 | call_kwargs = [] 70 | for kwarg in kwargs 71 | if Meta.isexpr(kwarg, :(=)) 72 | key,val = kwarg.args 73 | if isa(key, Symbol) 74 | if key in compiler_kws 75 | push!(compiler_kwargs, kwarg) 76 | elseif key in call_kws 77 | push!(call_kwargs, kwarg) 78 | else 79 | throw(ArgumentError("unknown keyword argument '$key'")) 80 | end 81 | else 82 | throw(ArgumentError("non-symbolic keyword '$key'")) 83 | end 84 | else 85 | throw(ArgumentError("non-keyword argument like option '$kwarg'")) 86 | end 87 | end 88 | 89 | return compiler_kwargs, call_kwargs 90 | end 91 | 92 | # assign arguments to variables, handle splatting 93 | function assign_args!(code, args) 94 | # handle splatting 95 | splats = map(arg -> Meta.isexpr(arg, :(...)), args) 96 | args = map(args, splats) do arg, splat 97 | splat ? arg.args[1] : arg 98 | end 99 | 100 | # assign arguments to variables 101 | vars = Tuple(gensym() for arg in args) 102 | map(vars, args) do var,arg 103 | push!(code.args, :($var = $(esc(arg)))) 104 | end 105 | 106 | # convert the arguments, compile the function and call the kernel 107 | # while keeping the original arguments alive 108 | var_exprs = map(vars, args, splats) do var, arg, splat 109 | splat ? Expr(:(...), var) : var 110 | end 111 | 112 | return vars, var_exprs 113 | end 114 | 115 | # fast lookup of global world age 116 | world_age() = ccall(:jl_get_tls_world_age, UInt, ()) 117 | 118 | # slow lookup of local method age 119 | function method_age(f, tt)::UInt 120 | for m in Base._methods(f, tt, 1, typemax(UInt)) 121 | return m[3].min_world 122 | end 123 | throw(MethodError(f, tt)) 124 | end 125 | 126 | 127 | ## adaptors 128 | 129 | struct Adaptor end 130 | 131 | # convert CUDAdrv pointers to CUDAnative pointers 132 | Adapt.adapt_storage(to::Adaptor, p::CuPtr{T}) where {T} = DevicePtr{T,AS.Generic}(p) 133 | 134 | # Base.RefValue isn't GPU compatible, so provide a compatible alternative 135 | struct CuRefValue{T} <: Ref{T} 136 | x::T 137 | end 138 | Base.getindex(r::CuRefValue) = r.x 139 | Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[])) 140 | 141 | # convenience function 142 | """ 143 | cudaconvert(x) 144 | 145 | This function is called for every argument to be passed to a kernel, allowing it to be 146 | converted to a GPU-friendly format. By default, the function does nothing and returns the 147 | input object `x` as-is. 148 | 149 | Do not add methods to this function, but instead extend the underlying Adapt.jl package and 150 | register methods for the the `CUDAnative.Adaptor` type. 151 | """ 152 | cudaconvert(arg) = adapt(Adaptor(), arg) 153 | 154 | 155 | ## high-level @cuda interface 156 | 157 | """ 158 | @cuda [kwargs...] func(args...) 159 | 160 | High-level interface for executing code on a GPU. The `@cuda` macro should prefix a call, 161 | with `func` a callable function or object that should return nothing. It will be compiled to 162 | a CUDA function upon first use, and to a certain extent arguments will be converted and 163 | managed automatically using `cudaconvert`. 
Finally, a call to `CUDAdrv.cudacall` is 164 | performed, scheduling a kernel launch on the current CUDA context. 165 | 166 | Several keyword arguments are supported that influence kernel compilation and execution. For 167 | more information, refer to the documentation of [`cufunction`](@ref) and 168 | [`CUDAnative.Kernel`](@ref), respectively. 169 | 170 | The underlying operations (argument conversion, kernel compilation, kernel call) can be 171 | performed explicitly when more control is needed, e.g. to reflect on the resource usage of a 172 | kernel to determine the launch configuration: 173 | 174 | args = ... 175 | GC.@preserve args begin 176 | kernel_args = cudaconvert.(args) 177 | kernel_tt = Tuple{Core.Typeof.(kernel_args)...} 178 | kernel = cufunction(f, kernel_tt; compilation_kwargs) 179 | kernel(kernel_args...; launch_kwargs) 180 | end 181 | """ 182 | macro cuda(ex...) 183 | # destructure the `@cuda` expression 184 | if length(ex) > 0 && ex[1].head == :tuple 185 | error("The tuple argument to @cuda has been replaced by keywords: `@cuda threads=... fun(args...)`") 186 | end 187 | call = ex[end] 188 | kwargs = ex[1:end-1] 189 | 190 | # destructure the kernel call 191 | if call.head != :call 192 | throw(ArgumentError("final argument to @cuda should be a function call")) 193 | end 194 | f = call.args[1] 195 | args = call.args[2:end] 196 | 197 | code = quote end 198 | compiler_kwargs, call_kwargs = split_kwargs(kwargs) 199 | vars, var_exprs = assign_args!(code, args) 200 | 201 | # convert the arguments, call the compiler and launch the kernel 202 | # while keeping the original arguments alive 203 | push!(code.args, 204 | quote 205 | GC.@preserve $(vars...) begin 206 | local kernel_args = cudaconvert.(($(var_exprs...),)) 207 | local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} 208 | local kernel = cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) 209 | kernel(kernel_args...; $(map(esc, call_kwargs)...)) 210 | end 211 | end) 212 | return code 213 | end 214 | 215 | 216 | ## APIs for manual compilation 217 | 218 | const agecache = Dict{UInt, UInt}() 219 | const compilecache = Dict{UInt, Kernel}() 220 | 221 | """ 222 | cufunction(f, tt=Tuple{}; kwargs...) 223 | 224 | Low-level interface to compile a function invocation for the currently-active GPU, returning 225 | a callable kernel object. For a higher-level interface, use [`@cuda`](@ref). 226 | 227 | The following keyword arguments are supported: 228 | - minthreads: the required number of threads in a thread block. 229 | - maxthreads: the maximum number of threads in a thread block. 230 | - blocks_per_sm: a minimum number of thread blocks to be scheduled on a single 231 | multiprocessor. 232 | - maxregs: the maximum number of registers to be allocated to a single thread (only 233 | supported on LLVM 4.0+) 234 | 235 | The output of this function is automatically cached, i.e. you can simply call `cufunction` 236 | in a hot path without degrading performance. New code will be generated automatically when 237 | the function changes, or when different types or keyword arguments are provided. 238 | """ 239 | @generated function cufunction(f::Core.Function, tt::Type=Tuple{}; kwargs...)
240 | tt = Base.to_tuple_type(tt.parameters[1]) 241 | sig = Base.signature_type(f, tt) 242 | t = Tuple(tt.parameters) 243 | 244 | precomp_key = hash(sig) # precomputable part of the keys 245 | quote 246 | Base.@_inline_meta 247 | 248 | CUDAnative.maybe_initialize("cufunction") 249 | 250 | # look-up the method age 251 | key = hash(world_age(), $precomp_key) 252 | if haskey(agecache, key) 253 | age = agecache[key] 254 | else 255 | age = method_age(f, $t) 256 | agecache[key] = age 257 | end 258 | 259 | # compile the function 260 | ctx = CuCurrentContext() 261 | key = hash(age, $precomp_key) 262 | key = hash(ctx, key) 263 | key = hash(kwargs, key) 264 | for nf in 1:nfields(f) 265 | # mix in the values of any captured variable 266 | key = hash(getfield(f, nf), key) 267 | end 268 | if !haskey(compilecache, key) 269 | fun, mod = compile(device(ctx), f, tt; kwargs...) 270 | kernel = Kernel{f,tt}(ctx, mod, fun) 271 | @debug begin 272 | ver = version(kernel) 273 | mem = memory(kernel) 274 | reg = registers(kernel) 275 | """Compiled $f to PTX $(ver.ptx) for SM $(ver.binary) using $reg registers. 276 | Memory usage: $(Base.format_bytes(mem.local)) local, $(Base.format_bytes(mem.shared)) shared, $(Base.format_bytes(mem.constant)) constant""" 277 | end 278 | compilecache[key] = kernel 279 | end 280 | 281 | return compilecache[key]::Kernel{f,tt} 282 | end 283 | end 284 | 285 | @generated function (kernel::Kernel{F,TT})(args...; call_kwargs...) where {F,TT} 286 | sig = Base.signature_type(F, TT) 287 | args = (:F, (:( args[$i] ) for i in 1:length(args))...) 288 | 289 | # filter out ghost arguments that shouldn't be passed 290 | to_pass = map(!isghosttype, sig.parameters) 291 | call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] 292 | call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]] 293 | 294 | # replace non-isbits arguments (they should be unused, or compilation would have failed) 295 | # alternatively, make CUDAdrv allow `launch` with non-isbits arguments. 296 | for (i,dt) in enumerate(call_t) 297 | if !isbitstype(dt) 298 | call_t[i] = Ptr{Any} 299 | call_args[i] = :C_NULL 300 | end 301 | end 302 | 303 | # finalize types 304 | call_tt = Base.to_tuple_type(call_t) 305 | 306 | quote 307 | Base.@_inline_meta 308 | 309 | cudacall(kernel.fun, $call_tt, $(call_args...); call_kwargs...) 310 | end 311 | end 312 | 313 | # There doesn't seem to be a way to access the documentation for the call-syntax, 314 | # so attach it to the type 315 | """ 316 | (::Kernel)(args...; kwargs...) 317 | 318 | Low-level interface to call a compiled kernel, passing GPU-compatible arguments in `args`. 319 | For a higher-level interface, use [`@cuda`](@ref). 320 | 321 | The following keyword arguments are supported: 322 | - threads (defaults to 1) 323 | - blocks (defaults to 1) 324 | - shmem (defaults to 0) 325 | - stream (defaults to the default stream) 326 | """ 327 | Kernel 328 | 329 | ## other 330 | 331 | """ 332 | nearest_warpsize(dev::CuDevice, threads::Integer) 333 | 334 | Return the nearest number of threads that is a multiple of the warp size of a device. 335 | 336 | This is a common requirement, eg. when using shuffle intrinsics. 
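For example, on a device with a warp size of 32, `nearest_warpsize(dev, 60)` returns 64, while values that already are a multiple of the warp size are returned unchanged.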
337 | """ 338 | function nearest_warpsize(dev::CuDevice, threads::Integer) 339 | ws = CUDAdrv.warpsize(dev) 340 | return threads + (ws - threads % ws) % ws 341 | end 342 | -------------------------------------------------------------------------------- /examples/blackscholes.jl: -------------------------------------------------------------------------------- 1 | using CUDAapi, CUDAdrv, CUDAnative, CuArrays 2 | 3 | CUDAnative.initialize() 4 | const dev = device() 5 | const cap = capability(dev) 6 | 7 | using BenchmarkTools 8 | BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10 9 | BenchmarkTools.DEFAULT_PARAMETERS.gcsample = true 10 | 11 | using SpecialFunctions 12 | 13 | 14 | ## scalar CPU version 15 | 16 | @inline cndf2(in::Float32) = 0.5f0 + 0.5f0 * erf(0.707106781f0 * in) 17 | 18 | function blackscholes_cpu(sptprice::Float32, strike::Float32, rate::Float32, 19 | volatility::Float32, time::Float32) 20 | logterm = log10(sptprice / strike) 21 | powterm = .5f0 * volatility * volatility 22 | den = volatility * sqrt(time) 23 | d1 = (((rate + powterm) * time) + logterm) / den 24 | d2 = d1 - den 25 | NofXd1 = cndf2(d1) 26 | NofXd2 = cndf2(d2) 27 | futureValue = strike * exp(-rate * time) 28 | c1 = futureValue * NofXd2 29 | call = sptprice * NofXd1 - c1 30 | return call - futureValue + sptprice 31 | end 32 | 33 | 34 | ## vectorized CPU version 35 | 36 | @inline cndf2(in::AbstractArray{Float32}) = 0.5f0 .+ 0.5f0 .* erf.(0.707106781f0 .* in) 37 | 38 | function blackscholes_cpu(sptprice::AbstractArray{Float32}, 39 | strike::AbstractArray{Float32}, 40 | rate::AbstractArray{Float32}, 41 | volatility::AbstractArray{Float32}, 42 | time::AbstractArray{Float32}) 43 | logterm = log10.(sptprice ./ strike) 44 | powterm = .5f0 .* volatility .* volatility 45 | den = volatility .* sqrt.(time) 46 | d1 = (((rate .+ powterm) .* time) .+ logterm) ./ den 47 | d2 = d1 .- den 48 | NofXd1 = cndf2(d1) 49 | NofXd2 = cndf2(d2) 50 | futureValue = strike .* exp.(- rate .* time) 51 | c1 = futureValue .* NofXd2 52 | call = sptprice .* NofXd1 .- c1 53 | return call .- futureValue .+ sptprice 54 | end 55 | 56 | 57 | ## native CUDA version 58 | 59 | @inline cndf2_cuda(in::Float32) = 0.5f0 + 0.5f0 * CUDAnative.erf(0.707106781f0 * in) 60 | 61 | function blackscholes_kernel(sptprice::AbstractArray{Float32}, 62 | strike::AbstractArray{Float32}, 63 | rate::AbstractArray{Float32}, 64 | volatility::AbstractArray{Float32}, 65 | time::AbstractArray{Float32}, 66 | out::AbstractArray{Float32}) 67 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 68 | 69 | if i <= size(sptprice, 1) 70 | logterm = CUDAnative.log10(sptprice[i] / strike[i]) 71 | powterm = 0.5f0 * volatility[i] * volatility[i] 72 | den = volatility[i] * CUDAnative.sqrt(time[i]) 73 | d1 = (((rate[i] + powterm) * time[i]) + logterm) / den 74 | d2 = d1 - den 75 | NofXd1 = cndf2_cuda(d1) 76 | NofXd2 = cndf2_cuda(d2) 77 | futureValue = strike[i] * CUDAnative.exp(-rate[i] * time[i]) 78 | c1 = futureValue * NofXd2 79 | call = sptprice[i] * NofXd1 - c1 80 | out[i] = call - futureValue + sptprice[i] 81 | end 82 | 83 | return 84 | end 85 | 86 | 87 | ## scalar CuArrays version 88 | 89 | function blackscholes_gpu(sptprice::Float32, strike::Float32, rate::Float32, 90 | volatility::Float32, time::Float32) 91 | logterm = CUDAnative.log10(sptprice / strike) 92 | powterm = .5f0 * volatility * volatility 93 | den = volatility * CUDAnative.sqrt(time) 94 | d1 = (((rate + powterm) * time) + logterm) / den 95 | d2 = d1 - den 96 | NofXd1 = cndf2_cuda(d1) 97 | NofXd2 = cndf2_cuda(d2) 98 | futureValue 
= strike * CUDAnative.exp(-rate * time) 99 | c1 = futureValue * NofXd2 100 | call = sptprice * NofXd1 - c1 101 | return call - futureValue + sptprice 102 | end 103 | 104 | 105 | ## vectorized CuArrays version 106 | 107 | @inline cndf2_cuarr(in::AbstractArray{Float32}) = 0.5f0 .+ 0.5f0 .* CUDAnative.erf.(0.707106781f0 .* in) 108 | 109 | function blackscholes_gpu(sptprice::AbstractArray{Float32}, 110 | strike::AbstractArray{Float32}, 111 | rate::AbstractArray{Float32}, 112 | volatility::AbstractArray{Float32}, 113 | time::AbstractArray{Float32}) 114 | logterm = CUDAnative.log10.(sptprice ./ strike) 115 | powterm = .5f0 .* volatility .* volatility 116 | den = volatility .* CUDAnative.sqrt.(time) 117 | d1 = (((rate .+ powterm) .* time) .+ logterm) ./ den 118 | d2 = d1 .- den 119 | NofXd1 = cndf2_cuarr(d1) 120 | NofXd2 = cndf2_cuarr(d2) 121 | futureValue = strike .* CUDAnative.exp.(- rate .* time) 122 | c1 = futureValue .* NofXd2 123 | call = sptprice .* NofXd1 .- c1 124 | return call .- futureValue .+ sptprice 125 | end 126 | 127 | 128 | ## non-native CUDA C version 129 | 130 | const cuda_source = "$(tempname()).cu" 131 | const cuda_ptx = "$(tempname()).ptx" 132 | 133 | open(cuda_source, "w") do io 134 | print(io, """ 135 | extern "C" __global__ void blackscholes_kernel(const float *sptprice, 136 | const float *strike, 137 | const float *rate, 138 | const float *volatility, 139 | const float *time, 140 | float *out, 141 | size_t n) 142 | { 143 | int i = blockIdx.x * blockDim.x + threadIdx.x; 144 | if (i < n) { 145 | float logterm = log10(sptprice[i] / strike[i]); 146 | float powterm = 0.5 * volatility[i] * volatility[i]; 147 | float den = volatility[i] * sqrt(time[i]); 148 | float d1 = (((rate[i] + powterm) * time[i]) + logterm) / den; 149 | float d2 = d1 - den; 150 | float NofXd1 = 0.5 + 0.5 * erf(0.707106781 * d1); 151 | float NofXd2 = 0.5 + 0.5 * erf(0.707106781 * d2); 152 | float futureValue = strike[i] * exp(-rate[i] * time[i]); 153 | float c1 = futureValue * NofXd2; 154 | float call = sptprice[i] * NofXd1 - c1; 155 | out[i] = call - futureValue + sptprice[i]; 156 | } 157 | } 158 | """) 159 | end 160 | 161 | toolkit = CUDAapi.find_toolkit() 162 | nvcc = CUDAapi.find_cuda_binary("nvcc", toolkit) 163 | toolchain = CUDAapi.find_toolchain(toolkit) 164 | flags = `-ccbin=$(toolchain.host_compiler) -arch=sm_$(cap.major)$(cap.minor)` 165 | run(`$nvcc $flags -ptx -o $cuda_ptx $cuda_source`) 166 | 167 | const cuda_module = CuModuleFile(cuda_ptx) 168 | const cuda_function = CuFunction(cuda_module, "blackscholes_kernel") 169 | 170 | 171 | ## main 172 | 173 | function checksum(reference, result) 174 | reference_sum = sum(reference) 175 | result_sum = sum(result) 176 | diff = abs(1-reference_sum/result_sum) 177 | if diff>0.01 178 | warn("checksum failed: $result_sum instead of $reference_sum (relative difference: $diff)") 179 | println(stacktrace()) 180 | end 181 | end 182 | 183 | function main(iterations) 184 | sptprice = Float32[ 42.0 for i = 1:iterations ] 185 | strike = Float32[ 40.0 + (i / iterations) for i = 1:iterations ] 186 | rate = Float32[ 0.5 for i = 1:iterations ] 187 | volatility = Float32[ 0.2 for i = 1:iterations ] 188 | time = Float32[ 0.5 for i = 1:iterations ] 189 | 190 | timings = Dict() 191 | 192 | reference = blackscholes_cpu.(sptprice, strike, rate, volatility, time) 193 | 194 | let benchmark = @benchmarkable begin 195 | out .= blackscholes_cpu.($sptprice, $strike, $rate, 196 | $volatility, $time) 197 | end setup=( 198 | out = similar($strike) 199 | ) teardown=( 200 | 
checksum($reference, out) 201 | ) 202 | timings["Single-threaded (scalar)"] = run(benchmark) 203 | end 204 | 205 | let benchmark = @benchmarkable begin 206 | out = blackscholes_cpu($sptprice, $strike, $rate, 207 | $volatility, $time) 208 | end setup=( 209 | out = nothing 210 | ) teardown=( 211 | checksum($reference, out) 212 | ) 213 | timings["Single-threaded (vectorized)"] = run(benchmark) 214 | end 215 | 216 | let benchmark = @benchmarkable begin 217 | cudacall(cuda_function, 218 | Tuple{Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, 219 | Ptr{Cfloat}, Ptr{Cfloat}, Ptr{Cfloat}, Csize_t}, 220 | sptprice_dev, strike_dev, rate_dev, volatility_dev, 221 | time_dev, out, n; blocks=grid, threads=block) 222 | synchronize() 223 | end setup=( 224 | sptprice_dev = CuArray($sptprice); 225 | strike_dev = CuArray($strike); 226 | rate_dev = CuArray($rate); 227 | volatility_dev = CuArray($volatility); 228 | time_dev = CuArray($time); 229 | out = similar(strike_dev); 230 | 231 | n = size($sptprice, 1); 232 | block = min(n, 1024); 233 | grid = ceil(Integer, n/block) 234 | ) teardown=( 235 | checksum($reference, Array(out)) 236 | ) 237 | timings["CUDA C (kernel)"] = run(benchmark) 238 | end 239 | 240 | let benchmark = @benchmarkable begin 241 | @cuda blocks=grid threads=block blackscholes_kernel(sptprice_dev, strike_dev, rate_dev, 242 | volatility_dev, time_dev, out) 243 | synchronize() 244 | end setup=( 245 | sptprice_dev = CuArray($sptprice); 246 | strike_dev = CuArray($strike); 247 | rate_dev = CuArray($rate); 248 | volatility_dev = CuArray($volatility); 249 | time_dev = CuArray($time); 250 | out = similar(strike_dev); 251 | 252 | n = size($sptprice, 1); 253 | block = min(n, 1024); 254 | grid = ceil(Integer, n/block) 255 | ) teardown=( 256 | checksum($reference, Array(out)) 257 | ) 258 | timings["CUDAnative.jl (kernel)"] = run(benchmark) 259 | end 260 | 261 | let benchmark = @benchmarkable begin 262 | out .= blackscholes_gpu.(sptprice_dev, strike_dev, rate_dev, 263 | volatility_dev, time_dev) 264 | synchronize() 265 | end setup=( 266 | sptprice_dev = CuArray($sptprice); 267 | strike_dev = CuArray($strike); 268 | rate_dev = CuArray($rate); 269 | volatility_dev = CuArray($volatility); 270 | time_dev = CuArray($time); 271 | out = similar(strike_dev); 272 | ) teardown=( 273 | checksum($reference, Array(out)) 274 | ) 275 | timings["CuArrays.jl (scalar)"] = run(benchmark) 276 | end 277 | 278 | let benchmark = @benchmarkable begin 279 | out = blackscholes_gpu(sptprice_dev, strike_dev, rate_dev, 280 | volatility_dev, time_dev) 281 | synchronize() 282 | end setup=( 283 | sptprice_dev = CuArray($sptprice); 284 | strike_dev = CuArray($strike); 285 | rate_dev = CuArray($rate); 286 | volatility_dev = CuArray($volatility); 287 | time_dev = CuArray($time); 288 | out = nothing 289 | ) teardown=( 290 | checksum($reference, Array(out)) 291 | ) 292 | timings["CuArrays.jl (vectorized)"] = run(benchmark) 293 | end 294 | 295 | return timings 296 | end 297 | 298 | function main() 299 | iterations = 10^7 300 | timings = main(iterations) 301 | 302 | println() 303 | println("Timings:") 304 | for (test, trials) in timings 305 | println("* $test: ", BenchmarkTools.prettytime(time(trials))) 306 | end 307 | 308 | println() 309 | println("Rates:") 310 | for (test, trials) in timings 311 | println("* $test: ", 1e9*iterations/time(trials), " ops/sec") 312 | end 313 | end 314 | 315 | main() 316 | 317 | rm(cuda_source) 318 | rm(cuda_ptx) 319 | -------------------------------------------------------------------------------- /src/reflection.jl: 
-------------------------------------------------------------------------------- 1 | # code reflection entry-points 2 | 3 | using InteractiveUtils 4 | 5 | 6 | # 7 | # code_* replacements 8 | # 9 | 10 | # NOTE: these functions replicate parts of the main compiler driver in order to generate 11 | # more compact code (i.e. without the run-time library) and/or to support generating 12 | # otherwise invalid code (e.g. with missing symbols). 13 | 14 | """ 15 | code_llvm([io], f, types; optimize=true, cap::VersionNumber, kernel=false, 16 | dump_module=false, strip_ir_metadata=true) 17 | 18 | Prints the device LLVM IR generated for the method matching the given generic function and 19 | type signature to `io` which defaults to `stdout`. The IR is optimized according to 20 | `optimize` (defaults to true), which includes entry-point specific optimizations if `kernel` 21 | is set (defaults to false). The device capability `cap` to generate code for defaults to the 22 | current active device's capability, or v"2.0" if there is no such active context. The entire 23 | module, including headers and other functions, is dumped if `dump_module` is set (defaults 24 | to false). Finally, setting `strip_ir_metadata` removes all debug metadata (defaults to 25 | true). 26 | 27 | See also: [`@device_code_llvm`](@ref), [`InteractiveUtils.code_llvm`](@ref) 28 | """ 29 | function code_llvm(io::IO, @nospecialize(func::Core.Function), @nospecialize(types=Tuple); 30 | optimize::Bool=true, cap::VersionNumber=current_capability(), 31 | dump_module::Bool=false, strip_ir_metadata::Bool=true, 32 | kernel::Bool=false, kwargs...) 33 | tt = Base.to_tuple_type(types) 34 | ctx = CompilerContext(func, tt, cap, kernel; kwargs...) 35 | code_llvm(io, ctx; optimize=optimize, dump_module=dump_module, 36 | strip_ir_metadata=strip_ir_metadata) 37 | end 38 | function code_llvm(io::IO, ctx::CompilerContext; optimize::Bool=true, 39 | dump_module::Bool=false, strip_ir_metadata::Bool=true) 40 | check_method(ctx) 41 | mod, entry = irgen(ctx) 42 | if optimize 43 | entry = optimize!(ctx, mod, entry) 44 | end 45 | if strip_ir_metadata 46 | strip_debuginfo!(mod) 47 | end 48 | if dump_module 49 | show(io, mod) 50 | else 51 | show(io, entry) 52 | end 53 | end 54 | code_llvm(@nospecialize(func), @nospecialize(types=Tuple); kwargs...) = 55 | code_llvm(stdout, func, types; kwargs...) 56 | 57 | """ 58 | code_ptx([io], f, types; cap::VersionNumber, kernel=false, strip_ir_metadata=true) 59 | 60 | Prints the PTX assembly generated for the method matching the given generic function and 61 | type signature to `io` which defaults to `stdout`. The device capability `cap` to generate 62 | code for defaults to the current active device's capability, or v"2.0" if there is no such 63 | active context. The optional `kernel` parameter indicates whether the function in question 64 | is an entry-point function, or a regular device function. Finally, setting 65 | `strip_ir_metadata` removes all debug metadata (defaults to true). 66 | 67 | See also: [`@device_code_ptx`](@ref) 68 | """ 69 | function code_ptx(io::IO, @nospecialize(func::Core.Function), @nospecialize(types=Tuple); 70 | cap::VersionNumber=current_capability(), kernel::Bool=false, 71 | strip_ir_metadata::Bool=true, kwargs...) 72 | tt = Base.to_tuple_type(types) 73 | ctx = CompilerContext(func, tt, cap, kernel; kwargs...)
74 | code_ptx(io, ctx; strip_ir_metadata=strip_ir_metadata) 75 | end 76 | function code_ptx(io::IO, ctx::CompilerContext; strip_ir_metadata::Bool=true) 77 | check_method(ctx) 78 | mod, entry = irgen(ctx) 79 | entry = optimize!(ctx, mod, entry) 80 | if strip_ir_metadata 81 | strip_debuginfo!(mod) 82 | end 83 | prepare_execution!(ctx, mod) 84 | ptx = mcgen(ctx, mod, entry) 85 | print(io, ptx) 86 | end 87 | code_ptx(@nospecialize(func), @nospecialize(types=Tuple); kwargs...) = 88 | code_ptx(stdout, func, types; kwargs...) 89 | 90 | """ 91 | code_sass([io], f, types, cap::VersionNumber) 92 | 93 | Prints the SASS code generated for the method matching the given generic function and type 94 | signature to `io` which defaults to `stdout`. The device capability `cap` to generate code 95 | for defaults to the current active device's capability, or v"2.0" if there is no such active 96 | context. The method needs to be a valid entry-point kernel, eg. it should not return any 97 | values. 98 | 99 | See also: [`@device_code_sass`](@ref) 100 | """ 101 | function code_sass(io::IO, @nospecialize(func::Core.Function), @nospecialize(types=Tuple); 102 | cap::VersionNumber=current_capability(), kernel::Bool=true, kwargs...) 103 | tt = Base.to_tuple_type(types) 104 | ctx = CompilerContext(func, tt, cap, kernel; kwargs...) 105 | code_sass(io, ctx) 106 | end 107 | function code_sass(io::IO, ctx::CompilerContext) 108 | if !ctx.kernel 109 | error("Can only generate SASS code for kernel functions") 110 | end 111 | if ptxas === nothing || nvdisasm === nothing 112 | error("Your CUDA installation does not provide ptxas or nvdisasm, both of which are required for code_sass") 113 | end 114 | 115 | ptx,_ = compile(ctx) 116 | 117 | fn = tempname() 118 | gpu = "sm_$(ctx.cap.major)$(ctx.cap.minor)" 119 | # NOTE: this might not match what is being executed, due to the PTX->SASS conversion 120 | # by the driver possibly not matching what `ptxas` (part of the toolkit) does. 121 | # TODO: see how `nvvp` extracts SASS code when doing PC sampling, and copy that. 122 | Base.run(`$ptxas --gpu-name $gpu --output-file $fn --input-as-string $ptx`) 123 | try 124 | cmd = `$nvdisasm --print-code --print-line-info $fn` 125 | for line in readlines(cmd) 126 | # nvdisasm output is pretty verbose; 127 | # perform some clean-up and make it look like @code_native 128 | line = replace(line, r"/\*[0-9a-f]{4}\*/" => " ") # strip inst addr 129 | line = replace(line, r"^[ ]{30}" => " ") # reduce leading spaces 130 | line = replace(line, r"[\s+]//##" => ";") # change line info tag 131 | line = replace(line, r"^\." => "\n.") # break before new BBs 132 | line = replace(line, r"; File \"(.+?)\", line (\d+)" => s"; Location \1:\2") # rename line info 133 | println(io, line) 134 | end 135 | finally 136 | rm(fn) 137 | end 138 | end 139 | code_sass(@nospecialize(func), @nospecialize(types=Tuple); kwargs...) = 140 | code_sass(stdout, func, types; kwargs...) 141 | 142 | 143 | # 144 | # @device_code_* functions 145 | # 146 | 147 | export @device_code_lowered, @device_code_typed, @device_code_warntype, 148 | @device_code_llvm, @device_code_ptx, @device_code_sass, 149 | @device_code 150 | 151 | function emit_hooked_compilation(inner_hook, ex...) 
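    # NOTE: all leading arguments are user-supplied keyword options (e.g. an `io=...` expression) that get forwarded to `inner_hook`; the final argument is the expression to evaluate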
152 | user_code = ex[end] 153 | user_kwargs = ex[1:end-1] 154 | quote 155 | # wipe the compile cache to force recompilation 156 | empty!(CUDAnative.compilecache) 157 | 158 | local kernels = 0 159 | function outer_hook(ctx) 160 | kernels += 1 161 | $inner_hook(ctx; $(map(esc, user_kwargs)...)) 162 | end 163 | 164 | if CUDAnative.compile_hook[] != nothing 165 | error("Chaining multiple @device_code calls is unsupported") 166 | end 167 | try 168 | CUDAnative.compile_hook[] = outer_hook 169 | $(esc(user_code)) 170 | finally 171 | CUDAnative.compile_hook[] = nothing 172 | end 173 | 174 | if kernels == 0 175 | error("no kernels executed while evaluating the given expression") 176 | end 177 | 178 | nothing 179 | end 180 | end 181 | 182 | # NOTE: these hooks take both an `f` and an inner `f`, because of how `@cuda`/`_cuda` work: 183 | # kernels are automatically wrapped in a function returning nothing, for usability. 184 | # 185 | # Julia-level reflection (lowered/typed/warntype) skips these wrappers, because we 186 | # can't do call-site inlining and the kernel wrapper would hide any meaningful code. 187 | # 188 | # at the LLVM level, we inline everything so there's no need to hide the wrapper. 189 | 190 | """ 191 | @device_code_lowered ex 192 | 193 | Evaluates the expression `ex` and returns the result of 194 | [`InteractiveUtils.code_lowered`](@ref) for every compiled CUDA kernel. 195 | 196 | See also: [`InteractiveUtils.@code_lowered`](@ref) 197 | """ 198 | macro device_code_lowered(ex...) 199 | quote 200 | buf = Any[] 201 | function hook(ctx::CompilerContext) 202 | append!(buf, code_lowered(ctx.f, ctx.tt)) 203 | end 204 | $(emit_hooked_compilation(:hook, ex...)) 205 | buf 206 | end 207 | end 208 | 209 | """ 210 | @device_code_typed ex 211 | 212 | Evaluates the expression `ex` and returns the result of 213 | [`InteractiveUtils.code_typed`](@ref) for every compiled CUDA kernel. 214 | 215 | See also: [`InteractiveUtils.@code_typed`](@ref) 216 | """ 217 | macro device_code_typed(ex...) 218 | quote 219 | buf = Any[] 220 | function hook(ctx::CompilerContext) 221 | append!(buf, code_typed(ctx.f, ctx.tt)) 222 | end 223 | $(emit_hooked_compilation(:hook, ex...)) 224 | buf 225 | end 226 | end 227 | 228 | """ 229 | @device_code_warntype [io::IO=stdout] ex 230 | 231 | Evaluates the expression `ex` and prints the result of 232 | [`InteractiveUtils.code_warntype`](@ref) to `io` for every compiled CUDA kernel. 233 | 234 | See also: [`InteractiveUtils.@code_warntype`](@ref) 235 | """ 236 | macro device_code_warntype(ex...) 237 | function hook(ctx::CompilerContext; io::IO=stdout, kwargs...) 238 | code_warntype(io, ctx.f, ctx.tt; kwargs...) 239 | end 240 | emit_hooked_compilation(hook, ex...) 241 | end 242 | 243 | """ 244 | @device_code_llvm [io::IO=stdout, ...] ex 245 | 246 | Evaluates the expression `ex` and prints the result of [`InteractiveUtils.code_llvm`](@ref) 247 | to `io` for every compiled CUDA kernel. For other supported keywords, see 248 | [`CUDAnative.code_llvm`](@ref). 249 | 250 | See also: [`InteractiveUtils.@code_llvm`](@ref) 251 | """ 252 | macro device_code_llvm(ex...) 253 | hook(ctx::CompilerContext; io::IO=stdout, kwargs...) = code_llvm(io, ctx; kwargs...) 254 | emit_hooked_compilation(hook, ex...) 255 | end 256 | 257 | """ 258 | @device_code_ptx [io::IO=stdout, ...] ex 259 | 260 | Evaluates the expression `ex` and prints the result of [`CUDAnative.code_ptx`](@ref) to `io` 261 | for every compiled CUDA kernel. For other supported keywords, see 262 | [`CUDAnative.code_ptx`](@ref).
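For example, `@device_code_ptx io=devnull @cuda my_kernel(args...)` would compile and launch a hypothetical `my_kernel` while discarding the printed PTX.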
263 | """ 264 | macro device_code_ptx(ex...) 265 | hook(ctx::CompilerContext; io::IO=stdout, kwargs...) = code_ptx(io, ctx; kwargs...) 266 | emit_hooked_compilation(hook, ex...) 267 | end 268 | 269 | """ 270 | @device_code_sass [io::IO=stdout, ...] ex 271 | 272 | Evaluates the expression `ex` and prints the result of [`CUDAnative.code_sass`](@ref) to 273 | `io` for every compiled CUDA kernel. For other supported keywords, see 274 | [`CUDAnative.code_sass`](@ref). 275 | """ 276 | macro device_code_sass(ex...) 277 | hook(ctx::CompilerContext; io::IO=stdout, kwargs...) = code_sass(io, ctx; kwargs...) 278 | emit_hooked_compilation(hook, ex...) 279 | end 280 | 281 | """ 282 | @device_code dir::AbstractString=... [...] ex 283 | 284 | Evaluates the expression `ex` and dumps all intermediate forms of code to the directory 285 | `dir`. 286 | """ 287 | macro device_code(ex...) 288 | only(xs) = (@assert length(xs) == 1; first(xs)) 289 | function hook(ctx::CompilerContext; dir::AbstractString) 290 | fn = "$(typeof(ctx.f).name.mt.name)_$(globalUnique+1)" 291 | mkpath(dir) 292 | 293 | open(joinpath(dir, "$fn.lowered.jl"), "w") do io 294 | code = only(code_lowered(ctx.f, ctx.tt)) 295 | println(io, code) 296 | end 297 | 298 | open(joinpath(dir, "$fn.typed.jl"), "w") do io 299 | code = only(code_typed(ctx.f, ctx.tt)) 300 | println(io, code) 301 | end 302 | 303 | open(joinpath(dir, "$fn.unopt.ll"), "w") do io 304 | code_llvm(io, ctx; dump_module=true, strip_ir_metadata=false, optimize=false) 305 | end 306 | 307 | open(joinpath(dir, "$fn.opt.ll"), "w") do io 308 | code_llvm(io, ctx; dump_module=true, strip_ir_metadata=false) 309 | end 310 | 311 | open(joinpath(dir, "$fn.ptx"), "w") do io 312 | code_ptx(io, ctx) 313 | end 314 | 315 | open(joinpath(dir, "$fn.sass"), "w") do io 316 | code_sass(io, ctx) 317 | end 318 | end 319 | emit_hooked_compilation(hook, ex...) 320 | end 321 | --------------------------------------------------------------------------------