├── codecov.yml ├── .github └── workflows │ └── TagBot.yml ├── .travis.yml ├── test ├── runtests.jl ├── util.jl ├── definitions │ ├── gcn.jl │ ├── native.jl │ └── ptx.jl ├── gcn.jl ├── ptx.jl └── native.jl ├── Project.toml ├── src ├── native.jl ├── utils.jl ├── GPUCompiler.jl ├── execution.jl ├── debug.jl ├── error.jl ├── mcgen.jl ├── cache.jl ├── interface.jl ├── optim.jl ├── rtlib.jl ├── runtime.jl ├── reflection.jl ├── driver.jl ├── validation.jl ├── gcn.jl ├── ptx.jl └── irgen.jl ├── README.md ├── Manifest.toml └── .gitlab-ci.yml /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | patch: false 4 | project: false 5 | changes: false 6 | -------------------------------------------------------------------------------- /.github/workflows/TagBot.yml: -------------------------------------------------------------------------------- 1 | name: TagBot 2 | on: 3 | schedule: 4 | - cron: 0 * * * * 5 | jobs: 6 | TagBot: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: JuliaRegistries/TagBot@v1 10 | with: 11 | token: ${{ secrets.GITHUB_TOKEN }} 12 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: julia 2 | 3 | os: 4 | - linux 5 | - osx 6 | - windows 7 | 8 | julia: 9 | - 1.3 10 | - 1.4 11 | - 1.5 12 | - nightly 13 | 14 | jobs: 15 | allow_failures: 16 | - julia: nightly 17 | 18 | notifications: 19 | email: false 20 | 21 | codecov: true 22 | 23 | branches: 24 | only: 25 | - master 26 | -------------------------------------------------------------------------------- /test/runtests.jl: -------------------------------------------------------------------------------- 1 | using Test, Base.CoreLogging 2 | import Base.CoreLogging: Info 3 | 4 | using GPUCompiler 5 | 6 | using LLVM, LLVM.Interop 7 | 8 | include("util.jl") 9 | 10 | @testset "GPUCompiler" begin 11 | 12 | 
GPUCompiler.reset_runtime() 13 | 14 | GPUCompiler.enable_timings() 15 | 16 | include("native.jl") 17 | include("ptx.jl") 18 | if !parse(Bool, get(ENV, "CI_ASSERTS", "false")) && VERSION < v"1.4" 19 | include("gcn.jl") 20 | end 21 | 22 | haskey(ENV, "CI") && GPUCompiler.timings() 23 | 24 | end 25 | -------------------------------------------------------------------------------- /Project.toml: -------------------------------------------------------------------------------- 1 | name = "GPUCompiler" 2 | uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" 3 | authors = ["Tim Besard "] 4 | version = "0.4.0" 5 | 6 | [deps] 7 | DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" 8 | InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" 9 | LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" 10 | Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" 11 | TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" 12 | UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" 13 | 14 | [compat] 15 | DataStructures = "0.15, 0.16, 0.17" 16 | LLVM = "1.4.0" 17 | TimerOutputs = "0.5" 18 | julia = "1.3" 19 | 20 | [extras] 21 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 22 | 23 | [targets] 24 | test = ["Test"] 25 | -------------------------------------------------------------------------------- /src/native.jl: -------------------------------------------------------------------------------- 1 | # native target for CPU execution 2 | 3 | ## target 4 | 5 | export NativeCompilerTarget 6 | 7 | Base.@kwdef struct NativeCompilerTarget <: AbstractCompilerTarget 8 | cpu::String=(LLVM.version() < v"8") ? "" : unsafe_string(LLVM.API.LLVMGetHostCPUName()) 9 | features::String=(LLVM.version() < v"8") ? 
"" : unsafe_string(LLVM.API.LLVMGetHostCPUFeatures()) 10 | end 11 | 12 | llvm_triple(::NativeCompilerTarget) = Sys.MACHINE 13 | 14 | function llvm_machine(target::NativeCompilerTarget) 15 | triple = llvm_triple(target) 16 | 17 | t = Target(triple) 18 | 19 | tm = TargetMachine(t, triple, target.cpu, target.features) 20 | asm_verbosity!(tm, true) 21 | 22 | return tm 23 | end 24 | 25 | 26 | ## job 27 | 28 | runtime_slug(job::CompilerJob{NativeCompilerTarget}) = "native_$(job.target.cpu)-$(hash(job.target.features))" 29 | -------------------------------------------------------------------------------- /src/utils.jl: -------------------------------------------------------------------------------- 1 | export tbaa_make_child 2 | 3 | function tbaa_make_child(name::String, constant::Bool=false; ctx::LLVM.Context=JuliaContext()) 4 | tbaa_root = MDNode([MDString("gputbaa", ctx)], ctx) 5 | tbaa_struct_type = 6 | MDNode([MDString("gputbaa_$name", ctx), 7 | tbaa_root, 8 | LLVM.ConstantInt(0, ctx)], ctx) 9 | tbaa_access_tag = 10 | MDNode([tbaa_struct_type, 11 | tbaa_struct_type, 12 | LLVM.ConstantInt(0, ctx), 13 | LLVM.ConstantInt(constant ? 
module GPUCompiler

using LLVM
using LLVM.Interop

using DataStructures

using TimerOutputs

using Libdl

# global timer used to instrument the compiler; see `timings`/`enable_timings`
const to = TimerOutput()

# print the accumulated compilation timings to stdout
timings() = (TimerOutputs.print_timer(to); println())

# opt in to debug-level timing instrumentation for this module
enable_timings() = (TimerOutputs.enable_debug_timings(GPUCompiler); return)

include("utils.jl")

# compiler interface and implementations
# NOTE: include order matters — interface definitions must precede the targets
# that implement them, and the runtime builds on those targets.
include("interface.jl")
include("error.jl")
include("native.jl")
include("ptx.jl")
include("gcn.jl")

include("runtime.jl")

# compiler implementation
include("irgen.jl")
include("optim.jl")
include("validation.jl")
include("rtlib.jl")
include("mcgen.jl")
include("debug.jl")
include("driver.jl")

# other reusable functionality
include("cache.jl")
include("execution.jl")
include("reflection.jl")

function __init__()
    TimerOutputs.reset_timer!(to)
    # register every LLVM back-end so any of the supported targets can be used
    InitializeAllTargets()
    InitializeAllTargetInfos()
    InitializeAllAsmPrinters()
    InitializeAllAsmParsers()
    InitializeAllTargetMCs()

    return
end

end # module
|:--------------------------------------------------------------------------------------------------:|:-------------------------------:| 7 | | [![][gitlab-img]][gitlab-url] [![][travis-img]][travis-url] [![PkgEval][pkgeval-img]][pkgeval-url] | [![][codecov-img]][codecov-url] | 8 | 9 | [gitlab-img]: https://gitlab.com/JuliaGPU/GPUCompiler.jl/badges/master/pipeline.svg 10 | [gitlab-url]: https://gitlab.com/JuliaGPU/GPUCompiler.jl/commits/master 11 | 12 | [travis-img]: https://api.travis-ci.com/JuliaGPU/GPUCompiler.jl.svg?branch=master 13 | [travis-url]: https://travis-ci.com/JuliaGPU/GPUCompiler.jl 14 | 15 | [pkgeval-img]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/G/GPUCompiler.svg 16 | [pkgeval-url]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/G/GPUCompiler.html 17 | 18 | [codecov-img]: https://codecov.io/gh/JuliaGPU/GPUCompiler.jl/branch/master/graph/badge.svg 19 | [codecov-url]: https://codecov.io/gh/JuliaGPU/GPUCompiler.jl 20 | 21 | This package offers reusable compiler infrastructure and tooling for 22 | implementing GPU compilers in Julia. **It is not intended for end users!** 23 | Instead, you should use one of the packages that builds on GPUCompiler.jl, such 24 | as [CUDAnative](https://github.com/JuliaGPU/CUDAnative.jl). 25 | -------------------------------------------------------------------------------- /test/util.jl: -------------------------------------------------------------------------------- 1 | # @test_throw, with additional testing for the exception message 2 | macro test_throws_message(f, typ, ex...) 
# helper function for sinking a value to prevent the callee from getting optimized away:
# the value makes a round-trip through a volatile stack slot, which the optimizer
# may not elide, while the input is returned unchanged.
@inline sink(i::T) where T <: Union{Int32,UInt32} =
    Base.llvmcall("""%slot = alloca i32
                     store volatile i32 %0, i32* %slot
                     %value = load volatile i32, i32* %slot
                     ret i32 %value""", T, Tuple{T}, i)
# 64-bit variant of the same volatile round-trip
@inline sink(i::T) where T <: Union{Int64,UInt64} =
    Base.llvmcall("""%slot = alloca i64
                     store volatile i64 %0, i64* %slot
                     %value = load volatile i64, i64* %slot
                     ret i64 %value""", T, Tuple{T}, i)
# split keyword argument expressions into groups. returns vectors of keyword
# argument expressions, one more than the number of groups: keywords matching
# none of the requested groups end up in the trailing vector.
# intended for use in macros; the resulting groups can be used in expressions.
#
# throws `ArgumentError` for arguments that are not `key = value` expressions
# or whose key is not a plain symbol.
function split_kwargs(kwargs, kw_groups...)
    kwarg_groups = ntuple(_->[], length(kw_groups) + 1)
    for kwarg in kwargs
        # decode
        Meta.isexpr(kwarg, :(=)) || throw(ArgumentError("non-keyword argument like option '$kwarg'"))
        key, val = kwarg.args
        isa(key, Symbol) || throw(ArgumentError("non-symbolic keyword '$key'"))

        # find a matching group; default to the extra trailing vector so that
        # unmatched keywords are reported separately from the last real group
        # (the original `length(kw_groups)` default wrongly merged them into it).
        group = length(kwarg_groups)
        for (i, kws) in enumerate(kw_groups)
            if key in kws
                group = i
                break
            end
        end
        push!(kwarg_groups[group], kwarg)
    end

    return kwarg_groups
end
11 | source = FunctionSpec(func, Base.to_tuple_type(types), kernel) 12 | target = GCNCompilerTarget("gfx900") 13 | params = TestCompilerParams() 14 | CompilerJob(target, source, params), kwargs 15 | end 16 | 17 | function gcn_code_typed(@nospecialize(func), @nospecialize(types); kwargs...) 18 | job, kwargs = gcn_job(func, types; kwargs...) 19 | GPUCompiler.code_typed(job; kwargs...) 20 | end 21 | 22 | function gcn_code_warntype(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 23 | job, kwargs = gcn_job(func, types; kwargs...) 24 | GPUCompiler.code_warntype(io, job; kwargs...) 25 | end 26 | 27 | function gcn_code_llvm(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 28 | job, kwargs = gcn_job(func, types; kwargs...) 29 | GPUCompiler.code_llvm(io, job; kwargs...) 30 | end 31 | 32 | function gcn_code_native(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 33 | job, kwargs = gcn_job(func, types; kwargs...) 34 | GPUCompiler.code_native(io, job; kwargs...) 35 | end 36 | 37 | # aliases without ::IO argument 38 | for method in (:code_warntype, :code_llvm, :code_native) 39 | gcn_method = Symbol("gcn_$(method)") 40 | @eval begin 41 | $gcn_method(@nospecialize(func), @nospecialize(types); kwargs...) = 42 | $gcn_method(stdout, func, types; kwargs...) 43 | end 44 | end 45 | 46 | # simulates codegen for a kernel function: validates by default 47 | function gcn_code_execution(@nospecialize(func), @nospecialize(types); kwargs...) 48 | job, kwargs = gcn_job(func, types; kernel=true, kwargs...) 49 | GPUCompiler.compile(:asm, job; kwargs...) 
50 | end 51 | -------------------------------------------------------------------------------- /Manifest.toml: -------------------------------------------------------------------------------- 1 | # This file is machine-generated - editing it directly is not advised 2 | 3 | [[Base64]] 4 | uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" 5 | 6 | [[CEnum]] 7 | git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" 8 | uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" 9 | version = "0.4.1" 10 | 11 | [[DataStructures]] 12 | deps = ["InteractiveUtils", "OrderedCollections"] 13 | git-tree-sha1 = "73eb18320fe3ba58790c8b8f6f89420f0a622773" 14 | uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" 15 | version = "0.17.11" 16 | 17 | [[InteractiveUtils]] 18 | deps = ["Markdown"] 19 | uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" 20 | 21 | [[LLVM]] 22 | deps = ["CEnum", "Libdl", "Printf", "Unicode"] 23 | git-tree-sha1 = "e2ef4155563e7d72790e70817cff7caae7b106a4" 24 | repo-rev = "37794e110bfbe6b4e204c0c7916e7dae45774f2e" 25 | repo-url = "https://github.com/maleadt/LLVM.jl.git" 26 | uuid = "929cbde3-209d-540e-8aea-75f648917ca0" 27 | version = "1.5.1" 28 | 29 | [[Libdl]] 30 | uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" 31 | 32 | [[Markdown]] 33 | deps = ["Base64"] 34 | uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" 35 | 36 | [[OrderedCollections]] 37 | git-tree-sha1 = "12ce190210d278e12644bcadf5b21cbdcf225cd3" 38 | uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" 39 | version = "1.2.0" 40 | 41 | [[Printf]] 42 | deps = ["Unicode"] 43 | uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" 44 | 45 | [[Random]] 46 | deps = ["Serialization"] 47 | uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" 48 | 49 | [[SHA]] 50 | uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" 51 | 52 | [[Serialization]] 53 | uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" 54 | 55 | [[TimerOutputs]] 56 | deps = ["Printf"] 57 | git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03" 58 | uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" 59 | 
version = "0.5.3" 60 | 61 | [[UUIDs]] 62 | deps = ["Random", "SHA"] 63 | uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" 64 | 65 | [[Unicode]] 66 | uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" 67 | -------------------------------------------------------------------------------- /test/definitions/native.jl: -------------------------------------------------------------------------------- 1 | using GPUCompiler 2 | 3 | if !@isdefined(TestRuntime) 4 | include("../util.jl") 5 | end 6 | 7 | 8 | # create a native test compiler, and generate reflection methods for it 9 | 10 | function native_job(@nospecialize(func), @nospecialize(types); kernel::Bool=false, kwargs...) 11 | source = FunctionSpec(func, Base.to_tuple_type(types), kernel) 12 | target = NativeCompilerTarget() 13 | params = TestCompilerParams() 14 | CompilerJob(target, source, params), kwargs 15 | end 16 | 17 | function native_code_typed(@nospecialize(func), @nospecialize(types); kwargs...) 18 | job, kwargs = native_job(func, types; kwargs...) 19 | GPUCompiler.code_typed(job; kwargs...) 20 | end 21 | 22 | function native_code_warntype(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 23 | job, kwargs = native_job(func, types; kwargs...) 24 | GPUCompiler.code_warntype(io, job; kwargs...) 25 | end 26 | 27 | function native_code_llvm(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 28 | job, kwargs = native_job(func, types; kwargs...) 29 | GPUCompiler.code_llvm(io, job; kwargs...) 30 | end 31 | 32 | function native_code_native(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 33 | job, kwargs = native_job(func, types; kwargs...) 34 | GPUCompiler.code_native(io, job; kwargs...) 35 | end 36 | 37 | # aliases without ::IO argument 38 | for method in (:code_warntype, :code_llvm, :code_native) 39 | native_method = Symbol("native_$(method)") 40 | @eval begin 41 | $native_method(@nospecialize(func), @nospecialize(types); kwargs...) = 42 | $native_method(stdout, func, types; kwargs...) 
43 | end 44 | end 45 | 46 | # simulates codegen for a kernel function: validates by default 47 | function native_code_execution(@nospecialize(func), @nospecialize(types); kwargs...) 48 | job, kwargs = native_job(func, types; kernel=true, kwargs...) 49 | GPUCompiler.compile(:asm, job; kwargs...) 50 | end 51 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | include: 2 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v6.yml' 3 | 4 | 5 | # LLVM with assertions 6 | 7 | asserts:1.3: 8 | extends: 9 | - .julia:source 10 | - .test 11 | variables: 12 | CI_CLONE_ARGS: '-b v1.3.1' 13 | CI_BUILD_ARGS: 'BINARYBUILDER_LLVM_ASSERTS=1 LLVM_BB_URL_BASE=https://github.com/staticfloat/LLVMBuilder/releases/download/v6.0.1-7+nowasm/' 14 | CI_ASSERTS: 'true' 15 | 16 | # TODO: upgrade to 1.4.1 once it has LLVM+asserts artifacts 17 | asserts:1.4: 18 | extends: 19 | - .julia:source 20 | - .test 21 | variables: 22 | CI_CLONE_ARGS: '-b v1.4.0' 23 | CI_BUILD_ARGS: 'BINARYBUILDER_LLVM_ASSERTS=1' 24 | CI_ASSERTS: 'true' 25 | 26 | # TODO: add 1.5.0 once it has LLVM+asserts artifacts 27 | 28 | 29 | # CUDA.jl 30 | 31 | .test_cuda: 32 | extends: .test 33 | variables: 34 | JULIA_NUM_THREADS: '2' 35 | JULIA_CUDA_USE_BINARYBUILDER: 'false' # reduce CI network traffic 36 | script: 37 | - julia -e 'using Pkg; 38 | Pkg.develop(PackageSpec(path=pwd())); 39 | Pkg.build();' 40 | - julia -e 'using Pkg; 41 | Pkg.add(PackageSpec(name="CUDA", rev="master")); 42 | Pkg.test("CUDA");' 43 | 44 | cuda:1.4: 45 | extends: 46 | - .julia:1.4 47 | - .test_cuda 48 | tags: 49 | - nvidia 50 | allow_failure: true 51 | 52 | 53 | # AMDGPUnative.jl 54 | 55 | .test_amdgpunative: 56 | extends: .test 57 | image: rocm/dev-ubuntu-18.04 58 | script: 59 | - julia -e 'using Pkg; 60 | Pkg.develop(PackageSpec(path=pwd())); 61 | Pkg.build();' 62 | - julia -e 'using Pkg; 63 | 
# generate a pseudo-backtrace from LLVM IR instruction debug information
#
# this works by looking up the debug information of the instruction, and inspecting the call
# sites of the containing function. if there's only one, repeat the process from that call.
# finally, the debug information is converted to a Julia stack trace.
#
# returns the (mutated) vector of StackFrames in `bt`.
function backtrace(inst::LLVM.Instruction, bt = StackTraces.StackFrame[])
    name = Ref{Cstring}()
    filename = Ref{Cstring}()
    line = Ref{Cuint}()
    col = Ref{Cuint}()

    # look up the debug information from the current instruction,
    # walking the inlining chain one `depth` level at a time
    depth = 0
    while LLVM.API.LLVMGetSourceLocation(LLVM.ref(inst), depth, name, filename, line, col) == 1
        # strip a trailing ';' from the demangled name before building the frame
        frame = StackTraces.StackFrame(replace(unsafe_string(name[]), r";$"=>""),
                                       unsafe_string(filename[]), line[])
        push!(bt, frame)
        depth += 1
    end

    # move up the call chain
    f = LLVM.parent(LLVM.parent(inst))
    ## functions can be used as a *value* in eg. constant expressions, so filter those out
    callers = filter(val -> isa(user(val), LLVM.CallInst), collect(uses(f)))
    ## get rid of calls without debug info
    filter!(callers) do call
        md = metadata(user(call))
        haskey(md, LLVM.MD_dbg)
    end
    if !isempty(callers)
        # figure out the call sites of this instruction
        call_sites = unique(callers) do call
            # there could be multiple calls, originating from the same source location
            md = metadata(user(call))
            md[LLVM.MD_dbg]
        end

        if length(call_sites) > 1
            # ambiguous: record a sentinel frame instead of guessing a caller
            frame = StackTraces.StackFrame("multiple call sites", "unknown", 0)
            push!(bt, frame)
        elseif length(call_sites) == 1
            # single call site: recurse into the caller to extend the trace
            backtrace(user(first(call_sites)), bt)
        end
    end

    return bt
end
28 | GPUCompiler.code_warntype(io, job; kwargs...) 29 | end 30 | 31 | function ptx_code_llvm(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 32 | job, kwargs = ptx_job(func, types; kwargs...) 33 | GPUCompiler.code_llvm(io, job; kwargs...) 34 | end 35 | 36 | function ptx_code_native(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 37 | job, kwargs = ptx_job(func, types; kwargs...) 38 | GPUCompiler.code_native(io, job; kwargs...) 39 | end 40 | 41 | # aliases without ::IO argument 42 | for method in (:code_warntype, :code_llvm, :code_native) 43 | ptx_method = Symbol("ptx_$(method)") 44 | @eval begin 45 | $ptx_method(@nospecialize(func), @nospecialize(types); kwargs...) = 46 | $ptx_method(stdout, func, types; kwargs...) 47 | end 48 | end 49 | 50 | # simulates codegen for a kernel function: validates by default 51 | function ptx_code_execution(@nospecialize(func), @nospecialize(types); kwargs...) 52 | job, kwargs = ptx_job(func, types; kernel=true, kwargs...) 53 | GPUCompiler.compile(:asm, job; kwargs...) 54 | end 55 | -------------------------------------------------------------------------------- /src/error.jl: -------------------------------------------------------------------------------- 1 | # error handling 2 | 3 | export KernelError, InternalCompilerError 4 | 5 | struct KernelError <: Exception 6 | job::CompilerJob 7 | message::String 8 | help::Union{Nothing,String} 9 | bt::StackTraces.StackTrace 10 | 11 | KernelError(job::CompilerJob, message::String, help=nothing; 12 | bt=StackTraces.StackTrace()) = 13 | new(job, message, help, bt) 14 | end 15 | 16 | function Base.showerror(io::IO, err::KernelError) 17 | println(io, "GPU compilation of ", err.job.source, " failed") 18 | println(io, "KernelError: $(err.message)") 19 | println(io) 20 | println(io, something(err.help, "Try inspecting the generated code with any of the @device_code_... 
macros.")) 21 | Base.show_backtrace(io, err.bt) 22 | end 23 | 24 | 25 | struct InternalCompilerError <: Exception 26 | job::CompilerJob 27 | message::String 28 | meta::Dict 29 | InternalCompilerError(job, message; kwargs...) = new(job, message, kwargs) 30 | end 31 | 32 | function Base.showerror(io::IO, err::InternalCompilerError) 33 | println(io, """GPUCompiler.jl encountered an unexpected internal error. 34 | Please file an issue attaching the following information, including the backtrace, 35 | as well as a reproducible example (if possible).""") 36 | 37 | println(io, "\nInternalCompilerError: $(err.message)") 38 | 39 | println(io, "\nCompiler invocation: ", err.job) 40 | 41 | if !isempty(err.meta) 42 | println(io, "\nAdditional information:") 43 | for (key,val) in err.meta 44 | println(io, " - $key = $(repr(val))") 45 | end 46 | end 47 | 48 | let Pkg = Base.require(Base.PkgId(Base.UUID((0x44cfe95a1eb252ea, 0xb672e2afdf69b78f)), "Pkg")) 49 | println(io, "\nInstalled packages:") 50 | for (pkg,ver) in Pkg.installed() 51 | println(io, " - $pkg = $(repr(ver))") 52 | end 53 | end 54 | 55 | println(io) 56 | versioninfo(io) 57 | end 58 | 59 | macro compiler_assert(ex, job, kwargs...) 60 | msg = "$ex, at $(__source__.file):$(__source__.line)" 61 | return :($(esc(ex)) ? $(nothing) 62 | : throw(InternalCompilerError($(esc(job)), $msg; 63 | $(map(esc, kwargs)...))) 64 | ) 65 | end 66 | -------------------------------------------------------------------------------- /src/mcgen.jl: -------------------------------------------------------------------------------- 1 | # machine code generation 2 | 3 | # final preparations for the module to be compiled to PTX 4 | # these passes should not be run when e.g. compiling to write to disk. 
# final preparations for the module to be compiled to PTX
# these passes should not be run when e.g. compiling to write to disk.
#
# runs a small pass pipeline over `mod`: resolve ephemeral CPU references,
# then clean up with global DCE and dead-prototype stripping.
function prepare_execution!(job::CompilerJob, mod::LLVM.Module)
    let pm = ModulePassManager()
        # the custom pass reads its job through this global (passes get no context argument)
        global current_job
        current_job = job

        try
            global_optimizer!(pm)

            add!(pm, ModulePass("ResolveCPUReferences", resolve_cpu_references!))

            global_dce!(pm)
            strip_dead_prototypes!(pm)

            run!(pm, mod)
        finally
            # make sure the pass manager is released even if a pass throws
            dispose(pm)
        end
    end

    return
end
# generated function that crafts a custom code info to call the actual cufunction impl.
# this gives us the flexibility to insert manual back edges for automatic recompilation.
#
# we also increment a global specialization counter and pass it along to index the cache.

# monotonically increasing id, unique per specialization of this generated function
specialization_counter = 0

@generated function cached_compilation(driver::Core.Function, spec::FunctionSpec{f,tt},
                                       env::UInt=zero(UInt); kwargs...) where {f,tt}

    # get a hold of the method and code info of the kernel function
    sig = Tuple{f, tt.parameters...}
    mthds = _methods_by_ftype(sig, -1, typemax(UInt))
    Base.isdispatchtuple(tt) || return(:(error("$tt is not a dispatch tuple")))
    length(mthds) == 1 || return (:(throw(MethodError(spec.f,spec.tt))))
    mtypes, msp, m = mthds[1]
    mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance}, (Any, Any, Any), m, mtypes, msp)
    ci = retrieve_code_info(mi)
    @assert isa(ci, CodeInfo)

    # generate a unique id to represent this specialization
    global specialization_counter
    id = UInt(specialization_counter += 1)
    # TODO: save the mi/ci here (or embed it in the AST to pass to cufunction)
    #       and use that to drive compilation

    # prepare a new code info, emptied of the original method's body
    new_ci = copy(ci)
    empty!(new_ci.code)
    empty!(new_ci.codelocs)
    resize!(new_ci.linetable, 1)                # codegen assumes at least one entry on <1.5
    empty!(new_ci.ssaflags)
    new_ci.ssavaluetypes = 0
    new_ci.edges = MethodInstance[mi]
    # XXX: setting this edge does not give us proper method invalidation, see
    #      JuliaLang/julia#34962 which demonstrates we also need to "call" the kernel.
    #      invoking `code_llvm` also does the necessary codegen, as does calling the
    #      underlying C methods -- which GPUCompiler does, so everything Just Works.

    # prepare the slots: 1-2 are the keyword-call machinery, 3 is #self#,
    # 4-6 are this function's arguments.
    # NOTE(review): slot 6 is *named* :id but holds the `env` argument — looks
    # like a cosmetic naming mismatch; confirm before relying on slot names.
    new_ci.slotnames = Symbol[:kwfunc, :kwargs, Symbol("#self#"), :driver, :spec, :id]
    new_ci.slotflags = UInt8[0x00 for i = 1:6]
    kwargs = SlotNumber(2)
    driver = SlotNumber(4)
    spec = SlotNumber(5)
    env = SlotNumber(6)

    # call the compiler: kwfunc(check_cache)(merged_kwargs, check_cache, driver, spec, hash(env, id))
    append!(new_ci.code, [Expr(:call, Core.kwfunc, check_cache),
                          Expr(:call, merge, NamedTuple(), kwargs),
                          Expr(:call, hash, env, id),
                          Expr(:call, SSAValue(1), SSAValue(2), check_cache, driver, spec, SSAValue(3)),
                          Expr(:return, SSAValue(4))])
    append!(new_ci.codelocs, [0, 0, 0, 0, 0])
    new_ci.ssavaluetypes += 5

    return new_ci
end
## target

export AbstractCompilerTarget

# container for state handled by targets defined in GPUCompiler.jl

abstract type AbstractCompilerTarget end

# the LLVM target triple for this target; every concrete target must implement this.
llvm_triple(::AbstractCompilerTarget) = error("Not implemented")

# construct an LLVM TargetMachine for this target, based on its triple.
# asm verbosity is enabled so emitted assembly carries comments; targets that need
# a specific CPU or feature string should override this method.
function llvm_machine(target::AbstractCompilerTarget)
    triple = llvm_triple(target)

    t = Target(triple)

    tm = TargetMachine(t, triple)
    asm_verbosity!(tm, true)

    return tm
end

# the data layout implied by this target's machine
llvm_datalayout(target::AbstractCompilerTarget) = DataLayout(llvm_machine(target))


## params

export AbstractCompilerParams

# container for state handled by external users of GPUCompiler.jl

abstract type AbstractCompilerParams end


## function specification

export FunctionSpec

# what we'll be compiling

struct FunctionSpec{F,TT}
    f::Base.Callable
    tt::DataType
    kernel::Bool
    name::Union{Nothing,String}
end

# put the function and argument types in typevars
# so that we can access it from generated functions
FunctionSpec(f, tt=Tuple{}, kernel=true, name=nothing) =
    FunctionSpec{typeof(f),tt}(f, tt, kernel, name)

# render a human-readable `name(argtypes...)` string for this spec, preferring
# the user-provided name over the function's own name.
function signature(spec::FunctionSpec)
    fn = something(spec.name, nameof(spec.f))
    # NOTE: previously this recomputed the join inline, leaving `args` unused
    args = join(spec.tt.parameters, ", ")
    return "$fn($args)"
end

function Base.show(io::IO, spec::FunctionSpec)
    spec.kernel ? print(io, "kernel ") : print(io, "function ")
    print(io, signature(spec))
end


## job

export CompilerJob

# a specific invocation of the compiler, bundling everything needed to generate code

Base.@kwdef struct CompilerJob{T,P}
    target::T
    source::FunctionSpec
    params::P

    CompilerJob(target::AbstractCompilerTarget, source::FunctionSpec, params::AbstractCompilerParams) =
        new{typeof(target), typeof(params)}(target, source, params)
end

# derive a job that compiles a different function with the same target and params
Base.similar(job::CompilerJob, source::FunctionSpec) =
    CompilerJob(target=job.target, source=source, params=job.params)

function Base.show(io::IO, job::CompilerJob{T}) where {T}
    print(io, "CompilerJob of ", job.source, " for ", T)
end


## interfaces and fallback definitions

# the Julia module to look up target-specific runtime functions in (this includes both
# target-specific functions from the GPU runtime library, like `malloc`, but also
# replacements functions for operations like `Base.sin`)
runtime_module(::CompilerJob) = error("Not implemented")

# check if a function is an intrinsic that can assumed to be always available
isintrinsic(::CompilerJob, fn::String) = false

# does this target support throwing Julia exceptions with jl_throw?
# if not, calls to throw will be replaced with calls to the GPU runtime
can_throw(::CompilerJob) = false

# generate a string that represents the type of compilation, for selecting a compiled
# instance of the runtime library.
# this slug should encode everything that affects
# the generated code of this compiler job (with exception of the function source)
runtime_slug(::CompilerJob) = error("Not implemented")

# early processing of the newly generated LLVM IR module
process_module!(::CompilerJob, mod::LLVM.Module) = return

# early processing of the newly identified LLVM kernel function
process_kernel!(::CompilerJob, mod::LLVM.Module, kernel::LLVM.Function) = return kernel

# final processing of the IR module, right before validation and machine-code generation
finish_module!(::CompilerJob, mod::LLVM.Module) = return

# hook for targets to register lowering passes on the given pass manager
add_lowering_passes!(::CompilerJob, pm::LLVM.PassManager) = return

# hook for targets to register optimization passes on the given pass manager
add_optimization_passes!(::CompilerJob, pm::LLVM.PassManager) = return

# hook for targets to link their device libraries to resolve `undefined_fns` in `mod`
link_libraries!(::CompilerJob, mod::LLVM.Module, undefined_fns::Vector{String}) = return
--------------------------------------------------------------------------------
/src/optim.jl:
--------------------------------------------------------------------------------
# LLVM IR optimization

# run the Julia-level, intrinsic-lowering, target-specific, and interprocedural
# optimization pipelines (in that order) on `mod`.
function optimize!(job::CompilerJob, mod::LLVM.Module)
    tm = llvm_machine(job.target)

    function initialize!(pm)
        add_library_info!(pm, triple(mod))
        add_transform_info!(pm, tm)
    end

    # stash the job in a global so that the pass callbacks below (which only
    # receive a function/module) can get at it
    global current_job
    current_job = job

    # Julia-specific optimizations
    #
    # NOTE: we need to use multiple distinct pass managers to force pass ordering;
    # intrinsics should never get lowered before Julia has optimized them.

    ModulePassManager() do pm
        initialize!(pm)
        ccall(:jl_add_optimization_passes, Cvoid,
              (LLVM.API.LLVMPassManagerRef, Cint, Cint),
              LLVM.ref(pm), Base.JLOptions().opt_level, #=lower_intrinsics=# 0)
        run!(pm, mod)
    end

    ModulePassManager() do pm
        initialize!(pm)

        # lower intrinsics
        add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!))
        aggressive_dce!(pm) # remove dead uses of ptls
        add!(pm, ModulePass("LowerPTLS", lower_ptls!))

        # the Julia GC lowering pass also has some clean-up that is required
        late_lower_gc_frame!(pm)

        remove_julia_addrspaces!(pm)

        run!(pm, mod)
    end

    # target-specific optimizations
    ModulePassManager() do pm
        initialize!(pm)

        # Julia's operand bundles confuse the inliner, so repeat here now they are gone.
        # FIXME: we should fix the inliner so that inlined code gets optimized early-on
        always_inliner!(pm)

        add_optimization_passes!(job, pm)

        run!(pm, mod)
    end

    # we compile a module containing the entire call graph,
    # so perform some interprocedural optimizations.
    #
    # for some reason, these passes need to be distinct from the regular optimization chain,
    # or certain values (such as the constant arrays used to populate llvm.compiler.used as
    # part of the LateLowerGCFrame pass) aren't collected properly.
    #
    # these might not always be safe, as Julia's IR metadata isn't designed for IPO.
    ModulePassManager() do pm
        dead_arg_elimination!(pm)   # parent doesn't use return value --> ret void

        run!(pm, mod)
    end

    return
end


## lowering intrinsics

# lower object allocations to PTX malloc
#
# this is a PoC implementation that is very simple: allocate, and never free. it also runs
# _before_ Julia's GC lowering passes, so we don't get to use the results of its analyses.
# when we ever implement a more potent GC, we will need those results, but the relevant pass
# is currently very architecture/CPU specific: hard-coded pool sizes, TLS references, etc.
# such IR is hard to clean-up, so we probably will need to have the GC lowering pass emit
# lower-level intrinsics which then can be lowered to architecture-specific code.
#
# returns `true` if any instruction in `fun`'s module was rewritten.
function lower_gc_frame!(fun::LLVM.Function)
    job = current_job::CompilerJob
    mod = LLVM.parent(fun)
    changed = false

    # plain alloc: rewrite every call to `julia.gc_alloc_obj` into a call to the
    # GPU runtime's pool allocator
    if haskey(functions(mod), "julia.gc_alloc_obj")
        alloc_obj = functions(mod)["julia.gc_alloc_obj"]
        alloc_obj_ft = eltype(llvmtype(alloc_obj))
        T_prjlvalue = return_type(alloc_obj_ft)
        T_pjlvalue = convert(LLVMType, Any, true)

        for use in uses(alloc_obj)
            call = user(use)::LLVM.CallInst

            # decode the call
            ops = collect(operands(call))
            # NOTE(review): operand 2 presumably is the allocation size -- confirm
            # against the intrinsic's definition in julia/src/llvm-late-gc-lowering.cpp
            sz = ops[2]

            # replace with PTX alloc_obj
            let builder = Builder(JuliaContext())
                position!(builder, call)
                ptr = call!(builder, Runtime.get(:gc_pool_alloc), [sz])
                replace_uses!(call, ptr)
                dispose(builder)
            end

            unsafe_delete!(LLVM.parent(call), call)

            changed = true
        end

        # every use should have been rewritten above
        @compiler_assert isempty(uses(alloc_obj)) job
    end

    # we don't care about write barriers (the PoC scheme above allocates and never
    # frees, so there is nothing for a barrier to track): drop all calls
    if haskey(functions(mod), "julia.write_barrier")
        barrier = functions(mod)["julia.write_barrier"]

        for use in uses(barrier)
            call = user(use)::LLVM.CallInst
            unsafe_delete!(LLVM.parent(call), call)
            changed = true
        end

        @compiler_assert isempty(uses(barrier)) job
    end

    return changed
end

# lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible.
136 | # 137 | # this assumes and checks that the TLS is unused, which should be the case for most GPU code 138 | # after lowering the GC intrinsics to TLS-less code and having run DCE. 139 | # 140 | # TODO: maybe don't have Julia emit actual uses of the TLS, but use intrinsics instead, 141 | # making it easier to remove or reimplement that functionality here. 142 | function lower_ptls!(mod::LLVM.Module) 143 | job = current_job::CompilerJob 144 | changed = false 145 | 146 | if haskey(functions(mod), "julia.ptls_states") 147 | ptls_getter = functions(mod)["julia.ptls_states"] 148 | 149 | for use in uses(ptls_getter) 150 | val = user(use) 151 | if !isempty(uses(val)) 152 | error("Thread local storage is not implemented") 153 | end 154 | unsafe_delete!(LLVM.parent(val), val) 155 | changed = true 156 | end 157 | 158 | @compiler_assert isempty(uses(ptls_getter)) job 159 | end 160 | 161 | return changed 162 | end 163 | -------------------------------------------------------------------------------- /src/rtlib.jl: -------------------------------------------------------------------------------- 1 | # compiler support for working with run-time libraries 2 | 3 | link_library!(mod::LLVM.Module, lib::LLVM.Module) = link_library!(mod, [lib]) 4 | function link_library!(mod::LLVM.Module, libs::Vector{LLVM.Module}) 5 | # linking is destructive, so copy the libraries 6 | libs = [LLVM.Module(lib) for lib in libs] 7 | 8 | for lib in libs 9 | link!(mod, lib) 10 | end 11 | end 12 | 13 | 14 | # 15 | # GPU run-time library 16 | # 17 | 18 | const libcache = Dict{String, LLVM.Module}() 19 | 20 | # get the path to a directory where we can put cache files (machine-specific, ephemeral) 21 | # NOTE: maybe we should use XDG_CACHE_PATH/%LOCALAPPDATA%, but other Julia cache files 22 | # are put in .julia anyway so let's just follow suit for now. 23 | function cachedir(depot=DEPOT_PATH[1]) 24 | # this mimicks Base.compilecache. 
we can't just call the function, or we might actually 25 | # _generate_ a cache file, e.g., when running with `--compiled-modules=no`. 26 | if VERSION >= v"1.3.0-alpha.146" 27 | entrypath, entryfile = Base.cache_file_entry(Base.PkgId(GPUCompiler)) 28 | abspath(depot, entrypath, entryfile) 29 | else 30 | cachefile = abspath(depot, Base.cache_file_entry(Base.PkgId(GPUCompiler))) 31 | 32 | # the cachefile consists of `/depot/compiled/vXXX/GPUCompiler/$slug.ji` 33 | # transform that into `/depot/compiled/vXXX/GPUCompiler/$slug/` 34 | splitext(cachefile)[1] 35 | end 36 | end 37 | 38 | 39 | ## higher-level functionality to work with runtime functions 40 | 41 | function LLVM.call!(builder, rt::Runtime.RuntimeMethodInstance, args=LLVM.Value[]) 42 | bb = position(builder) 43 | f = LLVM.parent(bb) 44 | mod = LLVM.parent(f) 45 | 46 | # get or create a function prototype 47 | if haskey(functions(mod), rt.llvm_name) 48 | f = functions(mod)[rt.llvm_name] 49 | ft = eltype(llvmtype(f)) 50 | else 51 | ft = LLVM.FunctionType(rt.llvm_return_type, rt.llvm_types) 52 | f = LLVM.Function(mod, rt.llvm_name, ft) 53 | end 54 | 55 | # runtime functions are written in Julia, while we're calling from LLVM, 56 | # this often results in argument type mismatches. try to fix some here. 
57 | for (i,arg) in enumerate(args) 58 | if llvmtype(arg) != parameters(ft)[i] 59 | if (llvmtype(arg) isa LLVM.PointerType) && 60 | (parameters(ft)[i] isa LLVM.IntegerType) 61 | # Julia pointers are passed as integers 62 | args[i] = ptrtoint!(builder, args[i], parameters(ft)[i]) 63 | else 64 | error("Don't know how to convert ", arg, " argument to ", parameters(ft)[i]) 65 | end 66 | end 67 | end 68 | 69 | call!(builder, f, args) 70 | end 71 | 72 | 73 | ## functionality to build the runtime library 74 | 75 | function emit_function!(mod, job::CompilerJob, f, method) 76 | tt = Base.to_tuple_type(method.types) 77 | new_mod, entry = codegen(:llvm, similar(job, FunctionSpec(f, tt, #=kernel=# false)); 78 | optimize=false, libraries=false) 79 | if return_type(eltype(llvmtype(entry))) != method.llvm_return_type 80 | error("Invalid return type for runtime function '$(method.name)': expected $(method.llvm_return_type), got $(return_type(eltype(llvmtype(entry))))") 81 | end 82 | 83 | # recent Julia versions include prototypes for all runtime functions, even if unused 84 | if VERSION >= v"1.5-" 85 | pm = ModulePassManager() 86 | strip_dead_prototypes!(pm) 87 | run!(pm, new_mod) 88 | dispose(pm) 89 | end 90 | 91 | temp_name = LLVM.name(entry) 92 | link!(mod, new_mod) 93 | entry = functions(mod)[temp_name] 94 | 95 | # if a declaration already existed, replace it with the function to avoid aliasing 96 | # (and getting function names like gpu_signal_exception1) 97 | name = method.llvm_name 98 | if haskey(functions(mod), name) 99 | decl = functions(mod)[name] 100 | @assert llvmtype(decl) == llvmtype(entry) 101 | replace_uses!(decl, entry) 102 | unsafe_delete!(mod, decl) 103 | end 104 | LLVM.name!(entry, name) 105 | end 106 | 107 | function build_runtime(job::CompilerJob) 108 | mod = LLVM.Module("GPUCompiler run-time library", JuliaContext()) 109 | 110 | for method in values(Runtime.methods) 111 | def = if isa(method.def, Symbol) 112 | isdefined(runtime_module(job), method.def) || 
continue 113 | getfield(runtime_module(job), method.def) 114 | else 115 | method.def 116 | end 117 | emit_function!(mod, job, def, method) 118 | end 119 | 120 | optimize!(job, mod) 121 | 122 | mod 123 | end 124 | 125 | function load_runtime(job::CompilerJob) 126 | # find the first existing cache directory (for when dealing with layered depots) 127 | cachedirs = [cachedir(depot) for depot in DEPOT_PATH] 128 | filter!(isdir, cachedirs) 129 | input_dir = if isempty(cachedirs) 130 | nothing 131 | else 132 | first(cachedirs) 133 | end 134 | 135 | # we are only guaranteed to be able to write in the current depot 136 | output_dir = cachedir() 137 | 138 | # if both aren't equal, copy pregenerated runtime libraries to our depot 139 | # NOTE: we don't just lazily read from the one and write to the other, because 140 | # once we generate additional runtimes in the output dir we don't know if 141 | # it's safe to load from other layers (since those could have been invalidated) 142 | if input_dir !== nothing && input_dir != output_dir 143 | mkpath(dirname(output_dir)) 144 | cp(input_dir, output_dir) 145 | end 146 | 147 | slug = runtime_slug(job) 148 | name = "runtime_$(slug).bc" 149 | path = joinpath(output_dir, name) 150 | 151 | get!(libcache, path) do 152 | if ispath(path) 153 | open(path) do io 154 | parse(LLVM.Module, read(io), JuliaContext()) 155 | end 156 | else 157 | @debug "Building the GPU runtime library at $path" 158 | mkpath(output_dir) 159 | lib = build_runtime(job) 160 | open(path, "w") do io 161 | write(io, lib) 162 | end 163 | lib 164 | end 165 | end 166 | end 167 | 168 | # remove the existing cache 169 | # NOTE: call this function from global scope, so any change triggers recompilation. 170 | function reset_runtime() 171 | rm(cachedir(); recursive=true, force=true) 172 | # create an empty cache directory. since we only ever load from the first existing cachedir, 173 | # this effectively invalidates preexisting caches in lower layers of the depot. 
174 | mkpath(cachedir()) 175 | 176 | # wipe the cache so we can use this function at run-time too 177 | empty!(libcache) 178 | 179 | return 180 | end 181 | -------------------------------------------------------------------------------- /test/gcn.jl: -------------------------------------------------------------------------------- 1 | @testset "GCN" begin 2 | 3 | include("definitions/gcn.jl") 4 | 5 | ############################################################################################ 6 | 7 | @testset "IR" begin 8 | 9 | @testset "kernel calling convention" begin 10 | kernel() = return 11 | 12 | ir = sprint(io->gcn_code_llvm(io, kernel, Tuple{}; dump_module=true)) 13 | @test !occursin("amdgpu_kernel", ir) 14 | 15 | ir = sprint(io->gcn_code_llvm(io, kernel, Tuple{}; 16 | dump_module=true, kernel=true)) 17 | @test occursin("amdgpu_kernel", ir) 18 | end 19 | 20 | end 21 | 22 | ############################################################################################ 23 | 24 | @testset "assembly" begin 25 | 26 | @testset "skip scalar trap" begin 27 | workitem_idx_x() = ccall("llvm.amdgcn.workitem.id.x", llvmcall, Int32, ()) 28 | trap() = ccall("llvm.trap", llvmcall, Nothing, ()) 29 | function kernel() 30 | if workitem_idx_x() > 1 31 | trap() 32 | end 33 | return 34 | end 35 | 36 | asm = sprint(io->gcn_code_native(io, kernel, Tuple{})) 37 | @test occursin("s_trap 2", asm) 38 | @test_broken occursin("s_cbranch_execz", asm) 39 | if Base.libllvm_version < v"9" 40 | @test_broken occursin("v_readfirstlane", asm) 41 | end 42 | end 43 | 44 | @testset "child functions" begin 45 | # we often test using @noinline child functions, so test whether these survive 46 | # (despite not having side-effects) 47 | @noinline child(i) = sink(i) 48 | function parent(i) 49 | child(i) 50 | return 51 | end 52 | 53 | asm = sprint(io->gcn_code_native(io, parent, Tuple{Int64})) 54 | @test occursin(r"s_add_u32.*julia_child_.*@rel32@lo\+4", asm) 55 | @test 
occursin(r"s_addc_u32.*julia_child_.*@rel32@hi\+4", asm) 56 | end 57 | 58 | @testset "kernel functions" begin 59 | @noinline nonentry(i) = sink(i) 60 | function entry(i) 61 | nonentry(i) 62 | return 63 | end 64 | 65 | asm = sprint(io->gcn_code_native(io, entry, Tuple{Int64}; kernel=true)) 66 | @test occursin(r"\.amdgpu_hsa_kernel .*julia_entry", asm) 67 | @test !occursin(r"\.amdgpu_hsa_kernel .*julia_nonentry", asm) 68 | @test occursin(r"\.type.*julia_nonentry_\d*,@function", asm) 69 | end 70 | 71 | @testset "child function reuse" begin 72 | # bug: depending on a child function from multiple parents resulted in 73 | # the child only being present once 74 | 75 | @noinline child(i) = sink(i) 76 | function parent1(i) 77 | child(i) 78 | return 79 | end 80 | 81 | asm = sprint(io->gcn_code_native(io, parent1, Tuple{Int})) 82 | @test occursin(r"\.type.*julia__\d*_child_\d*,@function", asm) 83 | 84 | function parent2(i) 85 | child(i+1) 86 | return 87 | end 88 | 89 | asm = sprint(io->gcn_code_native(io, parent2, Tuple{Int})) 90 | @test occursin(r"\.type.*julia__\d*_child_\d*,@function", asm) 91 | end 92 | 93 | @testset "child function reuse bis" begin 94 | # bug: similar, but slightly different issue as above 95 | # in the case of two child functions 96 | @noinline child1(i) = sink(i) 97 | @noinline child2(i) = sink(i+1) 98 | function parent1(i) 99 | child1(i) + child2(i) 100 | return 101 | end 102 | gcn_code_native(devnull, parent1, Tuple{Int}) 103 | 104 | function parent2(i) 105 | child1(i+1) + child2(i+1) 106 | return 107 | end 108 | gcn_code_native(devnull, parent2, Tuple{Int}) 109 | end 110 | 111 | @testset "indirect sysimg function use" begin 112 | # issue #9: re-using sysimg functions should force recompilation 113 | # (host fldmod1->mod1 throws, so the GCN code shouldn't contain a throw) 114 | 115 | # NOTE: Int32 to test for #49 116 | 117 | function kernel(out) 118 | wid, lane = fldmod1(unsafe_load(out), Int32(32)) 119 | unsafe_store!(out, wid) 120 | return 121 | 
end 122 | 123 | asm = sprint(io->gcn_code_native(io, kernel, Tuple{Ptr{Int32}})) 124 | @test !occursin("jl_throw", asm) 125 | @test !occursin("jl_invoke", asm) # forced recompilation should still not invoke 126 | end 127 | 128 | @testset "LLVM intrinsics" begin 129 | # issue #13 (a): cannot select trunc 130 | function kernel(x) 131 | unsafe_trunc(Int, x) 132 | return 133 | end 134 | gcn_code_native(devnull, kernel, Tuple{Float64}) 135 | end 136 | 137 | @test_broken "exception arguments" 138 | #= FIXME: _ZNK4llvm14TargetLowering20scalarizeVectorStoreEPNS_11StoreSDNodeERNS_12SelectionDAGE 139 | @testset "exception arguments" begin 140 | function kernel(a) 141 | unsafe_store!(a, trunc(Int, unsafe_load(a))) 142 | return 143 | end 144 | 145 | gcn_code_native(devnull, kernel, Tuple{Ptr{Float64}}) 146 | end 147 | =# 148 | 149 | @test_broken "GC and TLS lowering" 150 | #= FIXME: in function julia_inner_18528 void (%jl_value_t addrspace(10)*): invalid addrspacecast 151 | @testset "GC and TLS lowering" begin 152 | @eval mutable struct PleaseAllocate 153 | y::Csize_t 154 | end 155 | 156 | # common pattern in Julia 0.7: outlined throw to avoid a GC frame in the calling code 157 | @noinline function inner(x) 158 | sink(x.y) 159 | nothing 160 | end 161 | 162 | function kernel(i) 163 | inner(PleaseAllocate(Csize_t(42))) 164 | nothing 165 | end 166 | 167 | asm = sprint(io->gcn_code_native(io, kernel, Tuple{Int})) 168 | @test occursin("gpu_gc_pool_alloc", asm) 169 | 170 | # make sure that we can still ellide allocations 171 | function ref_kernel(ptr, i) 172 | data = Ref{Int64}() 173 | data[] = 0 174 | if i > 1 175 | data[] = 1 176 | else 177 | data[] = 2 178 | end 179 | unsafe_store!(ptr, data[], i) 180 | return nothing 181 | end 182 | 183 | asm = sprint(io->gcn_code_native(io, ref_kernel, Tuple{Ptr{Int64}, Int})) 184 | 185 | 186 | if VERSION < v"1.2.0-DEV.375" 187 | @test_broken !occursin("gpu_gc_pool_alloc", asm) 188 | else 189 | @test !occursin("gpu_gc_pool_alloc", asm) 190 | 
end 191 | end 192 | =# 193 | 194 | @testset "float boxes" begin 195 | function kernel(a,b) 196 | c = Int32(a) 197 | # the conversion to Int32 may fail, in which case the input Float32 is boxed in order to 198 | # pass it to the @nospecialize exception constructor. we should really avoid that (eg. 199 | # by avoiding @nospecialize, or optimize the unused arguments away), but for now the box 200 | # should just work. 201 | unsafe_store!(b, c) 202 | return 203 | end 204 | 205 | ir = sprint(io->gcn_code_llvm(io, kernel, Tuple{Float32,Ptr{Float32}})) 206 | @test occursin("jl_box_float32", ir) 207 | gcn_code_native(devnull, kernel, Tuple{Float32,Ptr{Float32}}) 208 | end 209 | 210 | end 211 | 212 | ############################################################################################ 213 | 214 | end 215 | -------------------------------------------------------------------------------- /src/runtime.jl: -------------------------------------------------------------------------------- 1 | # GPU runtime library 2 | # 3 | # This module defines method instances that will be compiled into a target-specific image 4 | # and will be available to the GPU compiler to call after Julia has generated code. 5 | # 6 | # Most functions implement, or are used to support Julia runtime functions that are expected 7 | # by the Julia compiler to be available at run time, e.g., to dynamically allocate memory, 8 | # box values, etc. 9 | 10 | module Runtime 11 | 12 | using ..GPUCompiler 13 | using LLVM 14 | using LLVM.Interop 15 | 16 | 17 | ## representation of a runtime method instance 18 | 19 | struct RuntimeMethodInstance 20 | # either a function defined here, or a symbol to fetch a target-specific definition 21 | def::Union{Function,Symbol} 22 | 23 | return_type::Type 24 | types::Tuple 25 | name::Symbol 26 | 27 | # LLVM types cannot be cached, so we can't put them in the runtime method instance. 
28 | # the actual types are constructed upon accessing them, based on a sentinel value: 29 | # - nothing: construct the LLVM type based on its Julia counterparts 30 | # - function: call this generator to get the type (when more control is needed) 31 | llvm_return_type::Union{Nothing, Function} 32 | llvm_types::Union{Nothing, Function} 33 | llvm_name::String 34 | end 35 | 36 | function Base.getproperty(rt::RuntimeMethodInstance, field::Symbol) 37 | value = getfield(rt, field) 38 | if field == :llvm_types 39 | if value == nothing 40 | LLVMType[convert.(LLVMType, typ) for typ in rt.types] 41 | else 42 | value() 43 | end 44 | elseif field == :llvm_return_type 45 | if value == nothing 46 | convert(LLVMType, rt.return_type) 47 | else 48 | value() 49 | end 50 | else 51 | return value 52 | end 53 | end 54 | 55 | const methods = Dict{Symbol,RuntimeMethodInstance}() 56 | function get(name::Symbol) 57 | if !haskey(methods, name) 58 | display(methods) 59 | end 60 | methods[name] 61 | end 62 | 63 | # Register a Julia function `def` as a runtime library function identified by `name`. The 64 | # function will be compiled upon first use for argument types `types` and should return 65 | # `return_type`. Use `Runtime.get(name)` to get a reference to this method instance. 66 | # 67 | # The corresponding LLVM types `llvm_types` and `llvm_return_type` will be deduced from 68 | # their Julia counterparts. To influence that conversion, pass a callable object instead; 69 | # this object will be evaluated at run-time and the returned value will be used instead. 70 | # 71 | # When generating multiple runtime functions from a single definition, make sure to specify 72 | # different values for `name`. The LLVM function name will be deduced from that name, but 73 | # you can always specify `llvm_name` to influence that. Never use an LLVM name that starts 74 | # with `julia_` or the function might clash with other compiled functions. 
function compile(def, return_type, types, llvm_return_type=nothing, llvm_types=nothing;
                 name=isa(def,Symbol) ? def : nameof(def), llvm_name="gpu_$name")
    meth = RuntimeMethodInstance(def,
                                 return_type, types, name,
                                 llvm_return_type, llvm_types, llvm_name)
    # registering the same name twice is a programming error, not a run-time condition
    if haskey(methods, name)
        error("Runtime function $name has already been registered!")
    end
    methods[name] = meth

    # FIXME: if the function is a symbol, implying it will be specified by the target,
    # we won't be able to call this function here or we'll get UndefVarErrors.
    # work around that by generating an llvmcall stub. can we do better by
    # using the new nonrecursive codegen to handle function lookup ourselves?
    if def isa Symbol
        # fresh argument names, one per declared parameter type
        args = [gensym() for typ in types]
        @eval @inline $def($(args...)) =
            ccall($"extern $llvm_name", llvmcall, $return_type, ($(types...),), $(args...))
    end

    return
end


## exception handling

# expected functions for exception signalling
compile(:signal_exception, Nothing, ())

# expected functions for simple exception handling
compile(:report_exception, Nothing, (Ptr{Cchar},))
compile(:report_oom, Nothing, (Csize_t,))

# expected functions for verbose exception handling
compile(:report_exception_frame, Nothing, (Cint, Ptr{Cchar}, Ptr{Cchar}, Cint))
compile(:report_exception_name, Nothing, (Ptr{Cchar},))

# NOTE: no throw functions are provided here, but replaced by an LLVM pass instead
# in order to provide some debug information without stack unwinding.

## GC

if VERSION < v"1.4"

# Julia's GC-related LLVM address spaces (mirrors julia/src/llvm-gc-*.cpp)
@enum AddressSpace begin
    Generic = 1
    Tracked = 10
    Derived = 11
    CalleeRooted = 12
    Loaded = 13
end

# LLVM type of a tracked pointer
function T_prjlvalue()
    T_pjlvalue = convert(LLVMType, Any, true)
    LLVM.PointerType(eltype(T_pjlvalue), Tracked)
end

else

# FIXME: once we only support 1.4, get rid of this and allow boxed types
T_prjlvalue() = convert(LLVMType, Any, true)

end

# allocate `sz` bytes from the runtime's `malloc`; reports and throws on OOM.
# never frees -- see the PoC GC lowering in optim.jl.
function gc_pool_alloc(sz::Csize_t)
    ptr = malloc(sz)
    if ptr == C_NULL
        report_oom(sz)
        throw(OutOfMemoryError())
    end
    return unsafe_pointer_to_objref(ptr)
end

compile(gc_pool_alloc, Any, (Csize_t,), T_prjlvalue)

# expected functions for GC support
compile(:malloc, Ptr{Nothing}, (Csize_t,))


## boxing and unboxing

# the type-tag word that precedes every boxed value
const tag_type = UInt
const tag_size = sizeof(tag_type)

const gc_bits = 0x3 # FIXME

# get the type tag of a type at run-time
@generated function type_tag(::Val{type_name}) where type_name
    T_tag = convert(LLVMType, tag_type)
    T_ptag = LLVM.PointerType(T_tag)

    T_pjlvalue = convert(LLVMType, Any, true)

    # create function
    llvm_f, _ = create_function(T_tag)
    mod = LLVM.parent(llvm_f)

    # this isn't really a function, but we abuse it to get the JIT to resolve the address
    # of the `jl_<type_name>_type` global
    typ = LLVM.Function(mod, "jl_" * String(type_name) * "_type",
                        LLVM.FunctionType(T_pjlvalue))

    # generate IR: read the tag word at the resolved type address
    Builder(JuliaContext()) do builder
        entry = BasicBlock(llvm_f, "entry", JuliaContext())
        position!(builder, entry)

        typ_var = bitcast!(builder, typ, T_ptag)

        tag = load!(builder, typ_var)

        ret!(builder, tag)
    end

    call_function(llvm_f, tag_type)
end

# we use `jl_value_ptr`, a Julia pseudo-intrinsic that can be used to box and unbox values

# box a bits value: allocate tag+payload, write the run-time type tag, write the
# value, and reinterpret the payload pointer as an object reference.
@generated function box(val, ::Val{type_name}) where type_name
    sz = sizeof(val)
    allocsz = sz + tag_size

    # type-tags are ephemeral, so look them up at run time
    #tag = unsafe_load(convert(Ptr{tag_type}, type_name))
    tag = :( type_tag(Val(type_name)) )

    quote
        Base.@_inline_meta

        ptr = malloc($(Csize_t(allocsz)))

        # store the type tag
        ptr = convert(Ptr{tag_type}, ptr)
        Core.Intrinsics.pointerset(ptr, $tag | $gc_bits, #=index=# 1, #=align=# $tag_size)

        # store the value
        ptr = convert(Ptr{$val}, ptr+tag_size)
        Core.Intrinsics.pointerset(ptr, val, #=index=# 1, #=align=# $sz)

        unsafe_pointer_to_objref(ptr)
    end
end

# read a `T` back out of a boxed object's payload
@inline function unbox(obj, ::Type{T}) where T
    ptr = ccall(:jl_value_ptr, Ptr{Cvoid}, (Any,), obj)

    # load the value
    ptr = convert(Ptr{T}, ptr)
    Core.Intrinsics.pointerref(ptr, #=index=# 1, #=align=# sizeof(T))
end

# generate box/unbox functions that exist in the Julia runtime (see julia/src/datatype.c)
for (T, t) in [Int8 => :int8, Int16 => :int16, Int32 => :int32, Int64 => :int64,
               UInt8 => :uint8, UInt16 => :uint16, UInt32 => :uint32, UInt64 => :uint64,
               Bool => :bool, Float32 => :float32, Float64 => :float64]
    box_fn = Symbol("box_$t")
    unbox_fn = Symbol("unbox_$t")
    @eval begin
        $box_fn(val) = box($T(val), Val($(QuoteNode(t))))
        $unbox_fn(obj) = unbox(obj, $T)

        # the llvm_name matches the runtime symbol these functions replace
        compile($box_fn, Any, ($T,), T_prjlvalue; llvm_name=$"jl_$box_fn")
        compile($unbox_fn, $T, (Any,); llvm_name=$"jl_$unbox_fn")
    end
end


end
--------------------------------------------------------------------------------
/src/reflection.jl:
--------------------------------------------------------------------------------
using InteractiveUtils, UUIDs
const Cthulhu = Base.PkgId(UUID("f68482b8-f384-11e8-15f7-abe071a5a75f"), "Cthulhu")


#
# code_* replacements
#

# Lowered code for the job's function/signature, via InteractiveUtils.
code_lowered(job::CompilerJob; kwargs...) =
    InteractiveUtils.code_lowered(job.source.f, job.source.tt; kwargs...)

function code_typed(job::CompilerJob; interactive::Bool=false, kwargs...)
    # TODO: use the compiler driver to get the Julia method instance (we might rewrite it)
    if interactive
        # call Cthulhu without introducing a dependency on Cthulhu
        mod = get(Base.loaded_modules, Cthulhu, nothing)
        mod === nothing && error("Interactive code reflection requires Cthulhu; please install and load this package first.")
        descend_code_typed = getfield(mod, :descend_code_typed)
        descend_code_typed(job.source.f, job.source.tt; kwargs...)
    else
        InteractiveUtils.code_typed(job.source.f, job.source.tt; kwargs...)
    end
end

function code_warntype(io::IO, job::CompilerJob; interactive::Bool=false, kwargs...)
    # TODO: use the compiler driver to get the Julia method instance (we might rewrite it)
    if interactive
        # Cthulhu's descend only writes to stdout, so an explicit io makes no sense here
        @assert io == stdout
        # call Cthulhu without introducing a dependency on Cthulhu
        mod = get(Base.loaded_modules, Cthulhu, nothing)
        mod === nothing && error("Interactive code reflection requires Cthulhu; please install and load this package first.")
        descend_code_warntype = getfield(mod, :descend_code_warntype)
        descend_code_warntype(job.source.f, job.source.tt; kwargs...)
    else
        InteractiveUtils.code_warntype(io, job.source.f, job.source.tt; kwargs...)
    end
end
code_warntype(job::CompilerJob; kwargs...) = code_warntype(stdout, job; kwargs...)

"""
    code_llvm([io], job; optimize=true, raw=false, debuginfo=:default, dump_module=false)

Prints the device LLVM IR generated for the given compiler job to `io` (default `stdout`).

The following keyword arguments are supported:

- `optimize`: determines if the code is optimized, which includes kernel-specific
  optimizations if `kernel` is true
- `raw`: return the raw IR including all metadata
- `debuginfo`: debug info verbosity, forwarded to `jl_dump_function_ir`
- `dump_module`: display the entire module instead of just the function

See also: [`@device_code_llvm`](@ref), `InteractiveUtils.code_llvm`
"""
function code_llvm(io::IO, job::CompilerJob; optimize::Bool=true, raw::Bool=false,
                   debuginfo::Symbol=:default, dump_module::Bool=false)
    # NOTE: jl_dump_function_ir supports stripping metadata, so don't do it in the driver
    ir, entry = GPUCompiler.codegen(:llvm, job; optimize=optimize, strip=false, validate=false)
    str = ccall(:jl_dump_function_ir, Ref{String},
                (Ptr{Cvoid}, Bool, Bool, Ptr{UInt8}),
                LLVM.ref(entry), !raw, dump_module, debuginfo)
    print(io, str)
end
code_llvm(job::CompilerJob; kwargs...) = code_llvm(stdout, job; kwargs...)

"""
    code_native([io], job; raw=false)

Prints the native assembly generated for the given compiler job to `io` (default `stdout`).

The following keyword arguments are supported:

- `raw`: return the raw code including all metadata

See also: [`@device_code_native`](@ref), `InteractiveUtils.code_native`
"""
function code_native(io::IO, job::CompilerJob; raw::Bool=false)
    asm, _ = GPUCompiler.codegen(:asm, job; strip=!raw, validate=false)
    print(io, asm)
end
# BUG FIX: this convenience method used to forward the undefined variables `func` and
# `types`, raising an UndefVarError on every call; forward `job` instead, mirroring
# the single-argument methods of `code_llvm` and `code_warntype` above.
code_native(job::CompilerJob; kwargs...) =
    code_native(stdout, job; kwargs...)


#
# @device_code_* functions
#

# Splice `user_code` into a block that installs `inner_hook` as the compile hook, so the
# hook runs for every kernel compiled while the user expression executes. All but the
# last element of `ex` are treated as keyword arguments and forwarded to the hook.
function emit_hooked_compilation(inner_hook, ex...)
    user_code = ex[end]
    user_kwargs = ex[1:end-1]
    quote
        # wipe the compile cache to force recompilation
        empty!(GPUCompiler.compilecache)

        local kernels = 0
        function outer_hook(job)
            kernels += 1
            $inner_hook(job; $(map(esc, user_kwargs)...))
        end

        if GPUCompiler.compile_hook[] !== nothing
            error("Chaining multiple @device_code calls is unsupported")
        end
        try
            GPUCompiler.compile_hook[] = outer_hook
            $(esc(user_code))
        finally
            GPUCompiler.compile_hook[] = nothing
        end

        if kernels == 0
            error("no kernels executed while evaluating the given expression")
        end

        nothing
    end
end

"""
    @device_code_lowered ex

Evaluates the expression `ex` and returns the result of
`InteractiveUtils.code_lowered` for every compiled GPU kernel.

See also: `InteractiveUtils.@code_lowered`
"""
macro device_code_lowered(ex...)
    quote
        buf = Any[]
        function hook(job::CompilerJob)
            append!(buf, code_lowered(job))
        end
        $(emit_hooked_compilation(:hook, ex...))
        buf
    end
end

"""
    @device_code_typed ex

Evaluates the expression `ex` and returns the result of
`InteractiveUtils.code_typed` for every compiled GPU kernel.

See also: `InteractiveUtils.@code_typed`
"""
macro device_code_typed(ex...)
    quote
        output = Dict{CompilerJob,Any}()
        function hook(job::CompilerJob)
            output[job] = code_typed(job)
        end
        $(emit_hooked_compilation(:hook, ex...))
        output
    end
end

"""
    @device_code_warntype [io::IO=stdout] ex

Evaluates the expression `ex` and prints the result of
`InteractiveUtils.code_warntype` to `io` for every compiled GPU kernel.

See also: `InteractiveUtils.@code_warntype`
"""
macro device_code_warntype(ex...)
    function hook(job::CompilerJob; io::IO=stdout, kwargs...)
        println(io, "$job")
        println(io)
        code_warntype(io, job; kwargs...)
    end
    emit_hooked_compilation(hook, ex...)
end

"""
    @device_code_llvm [io::IO=stdout, ...] ex

Evaluates the expression `ex` and prints the result of InteractiveUtils.code_llvm
to `io` for every compiled GPU kernel. For other supported keywords, see
[`GPUCompiler.code_llvm`](@ref).

See also: InteractiveUtils.@code_llvm
"""
macro device_code_llvm(ex...)
    function hook(job::CompilerJob; io::IO=stdout, kwargs...)
        println(io, "; $job")
        code_llvm(io, job; kwargs...)
    end
    emit_hooked_compilation(hook, ex...)
end

"""
    @device_code_native [io::IO=stdout, ...] ex

Evaluates the expression `ex` and prints the result of [`GPUCompiler.code_native`](@ref) to `io`
for every compiled GPU kernel. For other supported keywords, see
[`GPUCompiler.code_native`](@ref).
"""
macro device_code_native(ex...)
    function hook(job::CompilerJob; io::IO=stdout, kwargs...)
        println(io, "// $job")
        println(io)
        code_native(io, job; kwargs...)
    end
    emit_hooked_compilation(hook, ex...)
end

"""
    @device_code dir::AbstractString=... [...] ex

Evaluates the expression `ex` and dumps all intermediate forms of code to the directory
`dir`.
"""
macro device_code(ex...)
    only(xs) = (@assert length(xs) == 1; first(xs))
    localUnique = 1
    function hook(job::CompilerJob; dir::AbstractString)
        name = something(job.source.name, nameof(job.source.f))
        fn = "$(name)_$(localUnique)"
        mkpath(dir)

        open(joinpath(dir, "$fn.lowered.jl"), "w") do io
            code = only(code_lowered(job))
            println(io, code)
        end

        open(joinpath(dir, "$fn.typed.jl"), "w") do io
            if VERSION >= v"1.1.0"
                code = only(code_typed(job; debuginfo=:source))
            else
                code = only(code_typed(job))
            end
            println(io, code)
        end

        open(joinpath(dir, "$fn.unopt.ll"), "w") do io
            code_llvm(io, job; dump_module=true, raw=true, optimize=false)
        end

        open(joinpath(dir, "$fn.opt.ll"), "w") do io
            code_llvm(io, job; dump_module=true, raw=true)
        end

        open(joinpath(dir, "$fn.asm"), "w") do io
            code_native(io, job)
        end

        localUnique += 1
    end
    emit_hooked_compilation(hook, ex...)
end
--------------------------------------------------------------------------------
/test/ptx.jl:
--------------------------------------------------------------------------------
@testset "PTX" begin

include("definitions/ptx.jl")

############################################################################################

@testset "IR" begin

@testset "exceptions" begin
    foobar() = throw(DivideError())
    ir = sprint(io->ptx_code_llvm(io, foobar, Tuple{}))

    # plain exceptions should get lowered to a call to the GPU run-time
    @test occursin("gpu_report_exception", ir)
    # not a jl_throw referencing a jl_value_t representing the exception
    @test !occursin("jl_throw", ir)
end

@testset "kernel functions" begin
@testset "kernel argument attributes" begin
    kernel(x) = return

    @eval struct Aggregate
        x::Int
    end

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{Aggregate}))
    if VERSION < v"1.5.0-DEV.802"
        @test occursin(r"@.*julia_kernel.+\(({ i64 }|\[1 x i64\]) addrspace\(\d+\)?\*", ir)
    else
        @test occursin(r"@.*julia_kernel.+\(({ i64 }|\[1 x i64\])\*", ir)
    end

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{Aggregate}; kernel=true))
    if VERSION < v"1.5.0-DEV.802"
        @test occursin(r"@.*julia_kernel.+\(({ i64 }|\[1 x i64\]) addrspace\(\d+\)?\*.+byval", ir)
    else
        @test occursin(r"@.*julia_kernel.+\(({ i64 }|\[1 x i64\])\*.+byval", ir)
    end
end

@testset "property_annotations" begin
    kernel() = return

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{}; dump_module=true))
    @test !occursin("nvvm.annotations", ir)

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{};
                                  dump_module=true, kernel=true))
    @test occursin("nvvm.annotations", ir)
    @test !occursin("maxntid", ir)
    @test !occursin("reqntid", ir)
    @test !occursin("minctasm", ir)
    @test !occursin("maxnreg", ir)

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{};
                                  dump_module=true, kernel=true, maxthreads=42))
    @test occursin("maxntidx\", i32 42", ir)
    @test occursin("maxntidy\", i32 1", ir)
    @test occursin("maxntidz\", i32 1", ir)

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{};
                                  dump_module=true, kernel=true, minthreads=42))
    @test occursin("reqntidx\", i32 42", ir)
    @test occursin("reqntidy\", i32 1", ir)
    @test occursin("reqntidz\", i32 1", ir)

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{};
                                  dump_module=true, kernel=true, blocks_per_sm=42))
    @test occursin("minctasm\", i32 42", ir)

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{};
                                  dump_module=true, kernel=true, maxregs=42))
    @test occursin("maxnreg\", i32 42", ir)
end

LLVM.version() >= v"8" && @testset "calling convention" begin
    kernel() = return

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{}; dump_module=true))
    @test !occursin("ptx_kernel", ir)

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{};
                                  dump_module=true, kernel=true))
    @test occursin("ptx_kernel", ir)
end
end

end

############################################################################################

@testset "assembly" begin

@testset "child functions" begin
    # we often test using @noinline child functions, so test whether these survive
    # (despite not having side-effects)
    @noinline child(i) = sink(i)
    function parent(i)
        child(i)
        return
    end

    asm = sprint(io->ptx_code_native(io, parent, Tuple{Int64}))
    @test occursin(r"call.uni\s+julia_.*child_"m, asm)
end

@testset "kernel functions" begin
    @noinline nonentry(i) = sink(i)
    function entry(i)
        nonentry(i)
        return
    end

    asm = sprint(io->ptx_code_native(io, entry, Tuple{Int64};
                                     kernel=true))
    @test occursin(r"\.visible \.entry .*julia_entry", asm)
    @test !occursin(r"\.visible \.func .*julia_nonentry", asm)
    @test occursin(r"\.func .*julia_nonentry", asm)

    @testset "property_annotations" begin
        asm = sprint(io->ptx_code_native(io, entry, Tuple{Int64}; kernel=true))
        @test !occursin("maxntid", asm)

        asm = sprint(io->ptx_code_native(io, entry, Tuple{Int64};
                                         kernel=true, maxthreads=42))
        @test occursin(".maxntid 42, 1, 1", asm)

        asm = sprint(io->ptx_code_native(io, entry, Tuple{Int64};
                                         kernel=true, minthreads=42))
        @test occursin(".reqntid 42, 1, 1", asm)

        asm = sprint(io->ptx_code_native(io, entry, Tuple{Int64};
                                         kernel=true, blocks_per_sm=42))
        @test occursin(".minnctapersm 42", asm)

        if LLVM.version() >= v"4.0"
            asm = sprint(io->ptx_code_native(io, entry, Tuple{Int64};
                                             kernel=true, maxregs=42))
            @test occursin(".maxnreg 42", asm)
        end
    end
end

@testset "child function reuse" begin
    # bug: depending on a child function from multiple parents resulted in
    # the child only being present once

    @noinline child(i) = sink(i)
    function parent1(i)
        child(i)
        return
    end

    asm = sprint(io->ptx_code_native(io, parent1, Tuple{Int}))
    @test occursin(r".func julia_.*child_", asm)

    function parent2(i)
        child(i+1)
        return
    end

    asm = sprint(io->ptx_code_native(io, parent2, Tuple{Int}))
    @test occursin(r".func julia_.*child_", asm)
end

@testset "child function reuse bis" begin
    # bug: similar, but slightly different issue as above
    # in the case of two child functions
    @noinline child1(i) = sink(i)
    @noinline child2(i) = sink(i+1)
    function parent1(i)
        child1(i) + child2(i)
        return
    end
    ptx_code_native(devnull, parent1, Tuple{Int})

    function parent2(i)
        child1(i+1) + child2(i+1)
        return
    end
    ptx_code_native(devnull, parent2, Tuple{Int})
end

@testset "indirect sysimg function use" begin
    # issue #9: re-using sysimg functions should force recompilation
    # (host fldmod1->mod1 throws, so the PTX code shouldn't contain a throw)

    # NOTE: Int32 to test for #49

    function kernel(out)
        wid, lane = fldmod1(unsafe_load(out), Int32(32))
        unsafe_store!(out, wid)
        return
    end

    asm = sprint(io->ptx_code_native(io, kernel, Tuple{Ptr{Int32}}))
    @test !occursin("jl_throw", asm)
    @test !occursin("jl_invoke", asm) # forced recompilation should still not invoke
end

@testset "LLVM intrinsics" begin
    # issue #13 (a): cannot select trunc
    function kernel(x)
        unsafe_trunc(Int, x)
        return
    end
    ptx_code_native(devnull, kernel, Tuple{Float64})
end

@testset "exception arguments" begin
    function kernel(a)
        unsafe_store!(a, trunc(Int, unsafe_load(a)))
        return
    end

    ptx_code_native(devnull, kernel, Tuple{Ptr{Float64}})
end

@testset "GC and TLS lowering" begin
    @eval mutable struct PleaseAllocate
        y::Csize_t
    end

    # common pattern in Julia 0.7: outlined throw to avoid a GC frame in the calling code
    @noinline function inner(x)
        sink(x.y)
        nothing
    end

    function kernel(i)
        inner(PleaseAllocate(Csize_t(42)))
        nothing
    end

    asm = sprint(io->ptx_code_native(io, kernel, Tuple{Int}))
    @test occursin("gpu_gc_pool_alloc", asm)

    # make sure that we can still ellide allocations
    function ref_kernel(ptr, i)
        data = Ref{Int64}()
        data[] = 0
        if i > 1
            data[] = 1
        else
            data[] = 2
        end
        unsafe_store!(ptr, data[], i)
        return nothing
    end

    asm = sprint(io->ptx_code_native(io, ref_kernel, Tuple{Ptr{Int64}, Int}))


    if VERSION < v"1.2.0-DEV.375"
        @test_broken !occursin("gpu_gc_pool_alloc", asm)
    else
        @test !occursin("gpu_gc_pool_alloc", asm)
    end
end

@testset "float boxes" begin
    function kernel(a,b)
        c = Int32(a)
        # the conversion to Int32 may fail, in which case the input Float32 is boxed in order to
        # pass it to the @nospecialize exception constructor. we should really avoid that (eg.
        # by avoiding @nospecialize, or optimize the unused arguments away), but for now the box
        # should just work.
        unsafe_store!(b, c)
        return
    end

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{Float32,Ptr{Float32}}))
    @test occursin("jl_box_float32", ir)
    ptx_code_native(devnull, kernel, Tuple{Float32,Ptr{Float32}})
end

end


############################################################################################

end
--------------------------------------------------------------------------------
/src/driver.jl:
--------------------------------------------------------------------------------
# compiler driver and main interface

# NOTE: the keyword arguments to compile/codegen control those aspects of compilation that
#       might have to be changed (e.g. set libraries=false when recursing, or set
#       strip=true for reflection). What remains defines the compilation job itself,
#       and those values are contained in the CompilerJob struct.

# (::CompilerJob)
const compile_hook = Ref{Union{Nothing,Function}}(nothing)

"""
    compile(target::Symbol, job::CompilerJob;
            libraries=true, deferred_codegen=true,
            optimize=true, strip=false, ...)

Compile a function `f` invoked with types `tt` for device capability `cap` to one of the
following formats as specified by the `target` argument: `:julia` for Julia IR, `:llvm` for
LLVM IR and `:asm` for machine code.

The following keyword arguments are supported:
- `libraries`: link the GPU runtime and `libdevice` libraries (if required)
- `deferred_codegen`: resolve deferred compiler invocations (if required)
- `optimize`: optimize the code (default: true)
- `strip`: strip non-functional metadata and debug information (default: false)
- `validate`: validate the generated IR before emitting machine code (default: true)

Other keyword arguments can be found in the documentation of [`cufunction`](@ref).
"""
function compile(target::Symbol, job::CompilerJob;
                 libraries::Bool=true, deferred_codegen::Bool=true,
                 optimize::Bool=true, strip::Bool=false, validate::Bool=true)
    if compile_hook[] !== nothing
        compile_hook[](job)
    end

    return codegen(target, job;
                   libraries=libraries, deferred_codegen=deferred_codegen,
                   optimize=optimize, strip=strip, validate=validate)
end

# primitive mechanism for deferred compilation, for implementing CUDA dynamic parallelism.
# this could both be generalized (e.g. supporting actual function calls, instead of
# returning a function pointer), and be integrated with the nonrecursive codegen.
# Registry of (function, argument-types) pairs that were requested via `deferred_codegen`;
# the index into this vector is the id embedded in the generated marker call below.
const deferred_codegen_jobs = Vector{Tuple{Core.Function,Type}}()

# Request deferred compilation of `f` applied to argument types `tt`. The generated code
# calls the external `deferred_codegen` marker with a unique id; `codegen` later resolves
# that marker by compiling the target and replacing the call with a function pointer.
@generated function deferred_codegen(::Val{f}, ::Val{tt}) where {f,tt}
    push!(deferred_codegen_jobs, (f,tt))
    id = length(deferred_codegen_jobs)

    quote
        # TODO: add an edge to this method instance to support method redefinitions
        ccall("extern deferred_codegen", llvmcall, Ptr{Cvoid}, (Int,), $id)
    end
end

"""
    codegen(output::Symbol, job::CompilerJob; libraries=true, deferred_codegen=true,
            optimize=true, strip=false, validate=true)

Lower `job` to the requested `output` format and return it:
`:julia` returns the method instance, `:llvm` returns `(ir, kernel)`, and
`:asm`/`:obj` return `(code, kernel_fn, undefined_fns, undefined_gbls)`.
See [`compile`](@ref) for the meaning of the keyword arguments.
"""
function codegen(output::Symbol, job::CompilerJob;
                 libraries::Bool=true, deferred_codegen::Bool=true, optimize::Bool=true,
                 strip::Bool=false, validate::Bool=true)
    ## Julia IR

    @timeit_debug to "validation" check_method(job)

    @timeit_debug to "Julia front-end" begin

        # get the method instance
        world = typemax(UInt)
        meth = which(job.source.f, job.source.tt)
        sig = Base.signature_type(job.source.f, job.source.tt)::Type
        (ti, env) = ccall(:jl_type_intersection_with_env, Any,
                          (Any, Any), sig, meth.sig)::Core.SimpleVector
        if VERSION >= v"1.2.0-DEV.320"
            meth = Base.func_for_method_checked(meth, ti, env)
        else
            meth = Base.func_for_method_checked(meth, ti)
        end
        method_instance = ccall(:jl_specializations_get_linfo, Ref{Core.MethodInstance},
                                (Any, Any, Any, UInt), meth, ti, env, world)

        # unbound type variables in the signature cannot be compiled for the device
        for var in env
            if var isa TypeVar
                throw(KernelError(job, "method captures a typevar (you probably use an unbound type variable)"))
            end
        end
    end

    output == :julia && return method_instance


    ## LLVM IR

    # always preload the runtime, and do so early; it cannot be part of any timing block
    # because it recurses into the compiler
    if libraries
        runtime = load_runtime(job)
        runtime_fns = LLVM.name.(defs(runtime))
    end

    @timeit_debug to "LLVM middle-end" begin
        ir, kernel = @timeit_debug to "IR generation" irgen(job, method_instance, world)
        kernel_fn = LLVM.name(kernel)

        # target-specific libraries
        if libraries
            undefined_fns = LLVM.name.(decls(ir))
            @timeit_debug to "target libraries" link_libraries!(job, ir, undefined_fns)
        end

        if optimize
            @timeit_debug to "optimization" optimize!(job, ir)

            # optimization may have replaced functions, so look the entry point up again
            kernel = functions(ir)[kernel_fn]
        end

        # only link the runtime library if the module actually references one of its
        # functions (re-scan declarations, as target libraries may have changed them)
        if libraries
            undefined_fns = LLVM.name.(decls(ir))
            if any(fn -> fn in runtime_fns, undefined_fns)
                @timeit_debug to "runtime library" link_library!(ir, runtime)
            end
        end

        if ccall(:jl_is_debugbuild, Cint, ()) == 1
            @timeit_debug to "verification" verify(ir)
        end

        # remove everything except for the kernel
        @timeit_debug to "clean-up" begin
            exports = String[kernel_fn]
            ModulePassManager() do pm
                # internalize all functions that aren't exports
                internalize!(pm, exports)

                # eliminate all unused internal functions
                global_optimizer!(pm)
                global_dce!(pm)
                strip_dead_prototypes!(pm)

                run!(pm, ir)
            end
        end
    end

    # deferred code generation: resolve every call to the `deferred_codegen` marker by
    # compiling the requested job, linking it in, and substituting a function pointer.
    if deferred_codegen && haskey(functions(ir), "deferred_codegen")
        dyn_marker = functions(ir)["deferred_codegen"]

        cache = Dict{CompilerJob, String}(job => kernel_fn)

        # iterative compilation (non-recursive); repeat until linking new code stops
        # introducing additional deferred_codegen calls
        changed = true
        while changed
            changed = false

            # find deferred compiler
            # TODO: recover this information earlier, from the Julia IR
            worklist = MultiDict{CompilerJob, LLVM.CallInst}()
            for use in uses(dyn_marker)
                # decode the call: its first operand is the id into deferred_codegen_jobs
                call = user(use)::LLVM.CallInst
                id = convert(Int, first(operands(call)))

                global deferred_codegen_jobs
                dyn_f, dyn_tt = deferred_codegen_jobs[id]
                dyn_job = similar(job, FunctionSpec(dyn_f, dyn_tt, #=kernel=# true))
                push!(worklist, dyn_job => call)
            end

            # compile and link
            for dyn_job in keys(worklist)
                # cached compilation
                dyn_kernel_fn = get!(cache, dyn_job) do
                    dyn_ir, dyn_kernel = codegen(:llvm, dyn_job; optimize=optimize,
                                                 strip=strip, validate=validate,
                                                 deferred_codegen=false)
                    dyn_kernel_fn = LLVM.name(dyn_kernel)
                    link!(ir, dyn_ir)
                    changed = true
                    dyn_kernel_fn
                end
                dyn_kernel = functions(ir)[dyn_kernel_fn]

                # insert a pointer to the function everywhere the kernel is used
                T_ptr = convert(LLVMType, Ptr{Cvoid})
                for call in worklist[dyn_job]
                    Builder(JuliaContext()) do builder
                        position!(builder, call)
                        fptr = ptrtoint!(builder, dyn_kernel, T_ptr)
                        replace_uses!(call, fptr)
                    end
                    unsafe_delete!(LLVM.parent(call), call)
                end
            end
        end

        # all deferred compilations should have been resolved
        @compiler_assert isempty(uses(dyn_marker)) job
        unsafe_delete!(ir, dyn_marker)
    end

    if output == :llvm
        if strip
            @timeit_debug to "strip debug info" strip_debuginfo!(ir)
        end

        return ir, kernel
    end


    ## machine code

    finish_module!(job, ir)

    if validate
        @timeit_debug to "validation" begin
            check_invocation(job, kernel)
            check_ir(job, ir)
        end
    end

    # NOTE: strip after validation to get better errors
    if strip
        @timeit_debug to "strip debug info" strip_debuginfo!(ir)
    end

    @timeit_debug to "LLVM back-end" begin
        @timeit_debug to "preparation" prepare_execution!(job, ir)

        if output == :asm
            code = @timeit_debug to "machine-code generation" mcgen(job, ir, kernel, LLVM.API.LLVMAssemblyFile)
        elseif output == :obj
            code = @timeit_debug to "machine-code generation" mcgen(job, ir, kernel, LLVM.API.LLVMObjectFile)
        end
    end

    # report remaining unresolved functions and globals alongside the generated code,
    # so the caller can link/resolve them
    undefined_fns = LLVM.name.(decls(ir))
    undefined_gbls = map(x->(name=LLVM.name(x),type=llvmtype(x),external=isextinit(x)), LLVM.globals(ir))

    (output == :asm || output == :obj) && return code, kernel_fn, undefined_fns, undefined_gbls


    error("Unknown compilation output $output")
end
--------------------------------------------------------------------------------
/src/validation.jl:
--------------------------------------------------------------------------------
# validation of properties and code

export InvalidIRError

# Check that the job's function/signature is compilable at all: it must be a generic
# function with exactly one matching method, and (when compiled as a kernel) must
# return `Nothing`. Throws a KernelError otherwise.
function check_method(job::CompilerJob)
    isa(job.source.f, Core.Builtin) && throw(KernelError(job, "function is not a generic function"))

    # get the method
    ms = Base.methods(job.source.f, job.source.tt)
    isempty(ms) && throw(KernelError(job, "no method found"))
    length(ms)!=1 && throw(KernelError(job, "no unique matching method"))
    m = first(ms)   # NOTE(review): `m` is unused below — candidate for removal

    # kernels can't return values
    if job.source.kernel
        rt = Base.return_types(job.source.f, job.source.tt)[1]
        if rt != Nothing
            throw(KernelError(job, "kernel returns a value of type `$rt`",
                """Make sure your kernel function ends in `return`, `return nothing` or `nothing`.
                   If the returned value is of type `Union{}`, your Julia code probably throws an exception.
                   Inspect the code with `@device_code_warntype` for more details."""))
        end
    end

    return
end

# compatibility shim: `fieldtypes` only exists in Base from 1.1 onwards
if VERSION < v"1.1.0-DEV.593"
    fieldtypes(@nospecialize(dt)) = ntuple(i->fieldtype(dt, i), fieldcount(dt))
end

# The actual check is rather complicated
# and might change from version to version...
34 | function hasfieldcount(@nospecialize(dt)) 35 | try 36 | fieldcount(dt) 37 | catch 38 | return false 39 | end 40 | return true 41 | end 42 | 43 | function explain_nonisbits(@nospecialize(dt), depth=1; maxdepth=10) 44 | dt===Module && return "" # work around JuliaLang/julia#33347 45 | depth > maxdepth && return "" 46 | hasfieldcount(dt) || return "" 47 | msg = "" 48 | for (ft, fn) in zip(fieldtypes(dt), fieldnames(dt)) 49 | if !isbitstype(ft) 50 | msg *= " "^depth * ".$fn is of type $ft which is not isbits.\n" 51 | msg *= explain_nonisbits(ft, depth+1) 52 | end 53 | end 54 | return msg 55 | end 56 | 57 | function check_invocation(job::CompilerJob, entry::LLVM.Function) 58 | # make sure any non-isbits arguments are unused 59 | real_arg_i = 0 60 | sig = Base.signature_type(job.source.f, job.source.tt)::Type 61 | for (arg_i,dt) in enumerate(sig.parameters) 62 | isghosttype(dt) && continue 63 | VERSION >= v"1.5.0-DEV.581" && Core.Compiler.isconstType(dt) && continue 64 | real_arg_i += 1 65 | 66 | if !isbitstype(dt) 67 | if VERSION >= v"1.5.0-DEV.581" 68 | throw(KernelError(job, "passing and using non-bitstype argument", 69 | """Argument $arg_i to your kernel function is of type $dt, which is not isbits: 70 | $(explain_nonisbits(dt))""")) 71 | else 72 | # be slightly more lenient pre 1.5, to support `function(::Type, ...)` 73 | param = parameters(entry)[real_arg_i] 74 | if !isempty(uses(param)) 75 | throw(KernelError(job, "passing and using non-bitstype argument", 76 | """Argument $arg_i to your kernel function is of type $dt, which is not isbits: 77 | $(explain_nonisbits(dt)) 78 | Passing non-isbits types is only allowed if they they are unused by the kernel.""")) 79 | end 80 | end 81 | end 82 | end 83 | 84 | return 85 | end 86 | 87 | 88 | ## IR validation 89 | 90 | const IRError = Tuple{String, StackTraces.StackTrace, Any} # kind, bt, meta 91 | 92 | struct InvalidIRError <: Exception 93 | job::CompilerJob 94 | errors::Vector{IRError} 95 | end 96 | 97 | const 
RUNTIME_FUNCTION = "call to the Julia runtime" 98 | const UNKNOWN_FUNCTION = "call to an unknown function" 99 | const POINTER_FUNCTION = "call through a literal pointer" 100 | const DELAYED_BINDING = "use of an undefined name" 101 | const DYNAMIC_CALL = "dynamic function invocation" 102 | 103 | function Base.showerror(io::IO, err::InvalidIRError) 104 | print(io, "InvalidIRError: compiling ", err.job.source, " resulted in invalid LLVM IR") 105 | for (kind, bt, meta) in err.errors 106 | print(io, "\nReason: unsupported $kind") 107 | if meta != nothing 108 | if kind == RUNTIME_FUNCTION || kind == UNKNOWN_FUNCTION || kind == POINTER_FUNCTION || kind == DYNAMIC_CALL 109 | print(io, " (call to ", meta, ")") 110 | elseif kind == DELAYED_BINDING 111 | print(io, " (use of '", meta, "')") 112 | end 113 | end 114 | Base.show_backtrace(io, bt) 115 | end 116 | return 117 | end 118 | 119 | function check_ir(job, args...) 120 | errors = check_ir!(job, IRError[], args...) 121 | unique!(errors) 122 | if !isempty(errors) 123 | throw(InvalidIRError(job, errors)) 124 | end 125 | 126 | return 127 | end 128 | 129 | function check_ir!(job, errors::Vector{IRError}, mod::LLVM.Module) 130 | for f in functions(mod) 131 | check_ir!(job, errors, f) 132 | end 133 | 134 | return errors 135 | end 136 | 137 | function check_ir!(job, errors::Vector{IRError}, f::LLVM.Function) 138 | for bb in blocks(f), inst in instructions(bb) 139 | if isa(inst, LLVM.CallInst) 140 | check_ir!(job, errors, inst) 141 | end 142 | end 143 | 144 | return errors 145 | end 146 | 147 | const libjulia = Ref{Ptr{Cvoid}}(C_NULL) 148 | 149 | function check_ir!(job, errors::Vector{IRError}, inst::LLVM.CallInst) 150 | bt = backtrace(inst) 151 | dest = called_value(inst) 152 | if isa(dest, LLVM.Function) 153 | fn = LLVM.name(dest) 154 | 155 | # some special handling for runtime functions that we don't implement 156 | if fn == "jl_get_binding_or_error" 157 | try 158 | m, sym, _ = operands(inst) 159 | sym = 
first(operands(sym::ConstantExpr))::ConstantInt 160 | sym = convert(Int, sym) 161 | sym = Ptr{Cvoid}(sym) 162 | sym = Base.unsafe_pointer_to_objref(sym) 163 | push!(errors, (DELAYED_BINDING, bt, sym)) 164 | catch e 165 | isa(e,TypeError) || rethrow() 166 | @debug "Decoding arguments to jl_get_binding_or_error failed" inst bb=LLVM.parent(inst) 167 | push!(errors, (DELAYED_BINDING, bt, nothing)) 168 | end 169 | elseif fn == "jl_invoke" 170 | try 171 | if VERSION < v"1.3.0-DEV.244" 172 | meth, args, nargs, _ = operands(inst) 173 | else 174 | f, args, nargs, meth = operands(inst) 175 | end 176 | if VERSION < v"1.5.0-DEV.802" 177 | # addrspacecast 178 | meth = first(operands(meth::ConstantExpr)) 179 | end 180 | meth = first(operands(meth::ConstantExpr))::ConstantInt 181 | meth = convert(Int, meth) 182 | meth = Ptr{Cvoid}(meth) 183 | meth = Base.unsafe_pointer_to_objref(meth)::Core.MethodInstance 184 | push!(errors, (DYNAMIC_CALL, bt, meth.def)) 185 | catch e 186 | isa(e,TypeError) || rethrow() 187 | @debug "Decoding arguments to jl_invoke failed" inst bb=LLVM.parent(inst) 188 | push!(errors, (DYNAMIC_CALL, bt, nothing)) 189 | end 190 | elseif fn == "jl_apply_generic" 191 | try 192 | if VERSION < v"1.3.0-DEV.244" 193 | args, nargs, _ = operands(inst) 194 | ## args is a buffer where arguments are stored in 195 | f, args = user.(uses(args)) 196 | ## first store into the args buffer is a direct store 197 | f = first(operands(f::LLVM.StoreInst))::ConstantExpr 198 | else 199 | f, args, nargs, _ = operands(inst) 200 | end 201 | 202 | if VERSION < v"1.5.0-DEV.802" 203 | f = first(operands(f::ConstantExpr)) # get rid of addrspacecast 204 | end 205 | f = first(operands(f))::ConstantInt # get rid of inttoptr 206 | f = convert(Int, f) 207 | f = Ptr{Cvoid}(f) 208 | f = Base.unsafe_pointer_to_objref(f) 209 | push!(errors, (DYNAMIC_CALL, bt, f)) 210 | catch e 211 | isa(e,TypeError) || rethrow() 212 | @debug "Decoding arguments to jl_apply_generic failed" inst bb=LLVM.parent(inst) 213 
| push!(errors, (DYNAMIC_CALL, bt, nothing)) 214 | end 215 | 216 | # detect calls to undefined functions 217 | elseif isdeclaration(dest) && intrinsic_id(dest) == 0 && !isintrinsic(job, fn) 218 | # figure out if the function lives in the Julia runtime library 219 | if libjulia[] == C_NULL 220 | paths = filter(Libdl.dllist()) do path 221 | name = splitdir(path)[2] 222 | startswith(name, "libjulia") 223 | end 224 | libjulia[] = Libdl.dlopen(first(paths)) 225 | end 226 | 227 | if Libdl.dlsym_e(libjulia[], fn) != C_NULL 228 | push!(errors, (RUNTIME_FUNCTION, bt, LLVM.name(dest))) 229 | else 230 | push!(errors, (UNKNOWN_FUNCTION, bt, LLVM.name(dest))) 231 | end 232 | end 233 | 234 | elseif isa(dest, InlineAsm) 235 | # let's assume it's valid ASM 236 | 237 | elseif isa(dest, ConstantExpr) 238 | # detect calls to literal pointers 239 | if occursin("inttoptr", string(dest)) 240 | # extract the literal pointer 241 | ptr_arg = first(operands(dest)) 242 | @compiler_assert isa(ptr_arg, ConstantInt) job 243 | ptr_val = convert(Int, ptr_arg) 244 | ptr = Ptr{Cvoid}(ptr_val) 245 | 246 | # look it up in the Julia JIT cache 247 | frames = ccall(:jl_lookup_code_address, Any, (Ptr{Cvoid}, Cint,), ptr, 0) 248 | if length(frames) >= 1 249 | @compiler_assert length(frames) == 1 job frames=frames 250 | if VERSION >= v"1.4.0-DEV.123" 251 | fn, file, line, linfo, fromC, inlined = last(frames) 252 | else 253 | fn, file, line, linfo, fromC, inlined, ip = last(frames) 254 | end 255 | push!(errors, (POINTER_FUNCTION, bt, fn)) 256 | else 257 | push!(errors, (POINTER_FUNCTION, bt, nothing)) 258 | end 259 | end 260 | end 261 | 262 | return errors 263 | end 264 | -------------------------------------------------------------------------------- /src/gcn.jl: -------------------------------------------------------------------------------- 1 | # implementation of the GPUCompiler interfaces for generating GCN code 2 | 3 | ## target 4 | 5 | export GCNCompilerTarget 6 | 7 | Base.@kwdef struct 
GCNCompilerTarget <: AbstractCompilerTarget 8 | dev_isa::String 9 | end 10 | 11 | llvm_triple(::GCNCompilerTarget) = "amdgcn-amd-amdhsa" 12 | 13 | function llvm_machine(target::GCNCompilerTarget) 14 | triple = llvm_triple(target) 15 | t = Target(triple) 16 | 17 | cpu = target.dev_isa 18 | feat = "" 19 | optlevel = LLVM.API.LLVMCodeGenLevelDefault 20 | reloc = LLVM.API.LLVMRelocPIC 21 | tm = TargetMachine(t, triple, cpu, feat, optlevel, reloc) 22 | asm_verbosity!(tm, true) 23 | 24 | return tm 25 | end 26 | 27 | 28 | ## job 29 | 30 | # TODO: encode debug build or not in the compiler job 31 | # https://github.com/JuliaGPU/CUDAnative.jl/issues/368 32 | runtime_slug(job::CompilerJob{GCNCompilerTarget}) = "gcn-$(job.target.dev_isa)" 33 | 34 | const gcn_intrinsics = () # TODO: ("vprintf", "__assertfail", "malloc", "free") 35 | isintrinsic(::CompilerJob{GCNCompilerTarget}, fn::String) = in(fn, gcn_intrinsics) 36 | 37 | function process_kernel!(job::CompilerJob{GCNCompilerTarget}, mod::LLVM.Module, kernel::LLVM.Function) 38 | kernel = wrap_entry!(job, mod, kernel) 39 | # AMDGPU kernel calling convention 40 | callconv!(kernel, LLVM.API.LLVMCallConv(91)) 41 | kernel 42 | end 43 | 44 | function add_lowering_passes!(job::CompilerJob{GCNCompilerTarget}, pm::LLVM.PassManager) 45 | add!(pm, ModulePass("LowerThrowExtra", lower_throw_extra!)) 46 | end 47 | 48 | function lower_throw_extra!(mod::LLVM.Module) 49 | job = current_job::CompilerJob 50 | changed = false 51 | @timeit_debug to "lower throw (extra)" begin 52 | 53 | throw_functions = [ 54 | r"julia_bounds_error.*", 55 | r"julia_throw_boundserror.*", 56 | r"julia_error_if_canonical_getindex.*", 57 | r"julia_error_if_canonical_setindex.*", 58 | r"julia___subarray_throw_boundserror.*", 59 | ] 60 | 61 | 62 | for f in functions(mod) 63 | f_name = LLVM.name(f) 64 | for fn in throw_functions 65 | if occursin(fn, f_name) 66 | for use in uses(f) 67 | call = user(use)::LLVM.CallInst 68 | 69 | # replace the throw with a trap 70 | let 
builder = Builder(JuliaContext()) 71 | position!(builder, call) 72 | emit_exception!(builder, f_name, call) 73 | dispose(builder) 74 | end 75 | 76 | # remove the call 77 | call_args = collect(operands(call))[1:end-1] # last arg is function itself 78 | unsafe_delete!(LLVM.parent(call), call) 79 | 80 | # HACK: kill the exceptions' unused arguments 81 | for arg in call_args 82 | # peek through casts 83 | if isa(arg, LLVM.AddrSpaceCastInst) 84 | cast = arg 85 | arg = first(operands(cast)) 86 | isempty(uses(cast)) && unsafe_delete!(LLVM.parent(cast), cast) 87 | end 88 | 89 | if isa(arg, LLVM.Instruction) && isempty(uses(arg)) 90 | unsafe_delete!(LLVM.parent(arg), arg) 91 | end 92 | end 93 | 94 | changed = true 95 | end 96 | 97 | @compiler_assert isempty(uses(f)) job 98 | end 99 | end 100 | end 101 | 102 | end 103 | return changed 104 | end 105 | 106 | function emit_trap!(job::CompilerJob{GCNCompilerTarget}, builder, mod, inst) 107 | trap = if haskey(functions(mod), "llvm.trap") 108 | functions(mod)["llvm.trap"] 109 | else 110 | LLVM.Function(mod, "llvm.trap", LLVM.FunctionType(LLVM.VoidType(JuliaContext()))) 111 | end 112 | if Base.libllvm_version < v"9" 113 | rl_ft = LLVM.FunctionType(LLVM.Int32Type(JuliaContext()), 114 | [LLVM.Int32Type(JuliaContext())]) 115 | rl = if haskey(functions(mod), "llvm.amdgcn.readfirstlane") 116 | functions(mod)["llvm.amdgcn.readfirstlane"] 117 | else 118 | LLVM.Function(mod, "llvm.amdgcn.readfirstlane", rl_ft) 119 | end 120 | # FIXME: Early versions of the AMDGPU target fail to skip machine 121 | # blocks with certain side effects when EXEC==0, except when certain 122 | # criteria are met within said block. We emit a v_readfirstlane_b32 123 | # instruction here, as that is sufficient to trigger a skip. Without 124 | # this, the target will only attempt to do a "masked branch", which 125 | # only works on vector instructions (trap is a scalar instruction, and 126 | # therefore it is executed even when EXEC==0). 
127 | rl_val = call!(builder, rl, [ConstantInt(Int32(32), JuliaContext())]) 128 | rl_bc = inttoptr!(builder, rl_val, LLVM.PointerType(LLVM.Int32Type(JuliaContext()))) 129 | store!(builder, rl_val, rl_bc) 130 | end 131 | call!(builder, trap) 132 | end 133 | 134 | # manual implementation of byval, as the backend doesn't support it for kernel args 135 | # https://reviews.llvm.org/D79744 136 | function wrapper_type(julia_t::Type, codegen_t::LLVMType)::LLVMType 137 | if !isbitstype(julia_t) 138 | # don't pass jl_value_t by value; it's an opaque structure 139 | return codegen_t 140 | elseif isa(codegen_t, LLVM.PointerType) && !(julia_t <: Ptr) 141 | # we didn't specify a pointer, but codegen passes one anyway. 142 | # make the wrapper accept the underlying value instead. 143 | return eltype(codegen_t) 144 | else 145 | return codegen_t 146 | end 147 | end 148 | # generate a kernel wrapper to fix & improve argument passing 149 | function wrap_entry!(job::CompilerJob, mod::LLVM.Module, entry_f::LLVM.Function) 150 | entry_ft = eltype(llvmtype(entry_f)::LLVM.PointerType)::LLVM.FunctionType 151 | @compiler_assert return_type(entry_ft) == LLVM.VoidType(JuliaContext()) job 152 | 153 | # filter out types which don't occur in the LLVM function signatures 154 | sig = Base.signature_type(job.source.f, job.source.tt)::Type 155 | julia_types = Type[] 156 | for dt::Type in sig.parameters 157 | if !isghosttype(dt) && (VERSION < v"1.5.0-DEV.581" || !Core.Compiler.isconstType(dt)) 158 | push!(julia_types, dt) 159 | end 160 | end 161 | 162 | # generate the wrapper function type & definition 163 | wrapper_types = LLVM.LLVMType[wrapper_type(julia_t, codegen_t) 164 | for (julia_t, codegen_t) 165 | in zip(julia_types, parameters(entry_ft))] 166 | wrapper_fn = LLVM.name(entry_f) 167 | LLVM.name!(entry_f, wrapper_fn * ".inner") 168 | wrapper_ft = LLVM.FunctionType(LLVM.VoidType(JuliaContext()), wrapper_types) 169 | wrapper_f = LLVM.Function(mod, wrapper_fn, wrapper_ft) 170 | 171 | # emit IR 
performing the "conversions" 172 | let builder = Builder(JuliaContext()) 173 | entry = BasicBlock(wrapper_f, "entry", JuliaContext()) 174 | position!(builder, entry) 175 | 176 | wrapper_args = Vector{LLVM.Value}() 177 | 178 | # perform argument conversions 179 | codegen_types = parameters(entry_ft) 180 | wrapper_params = parameters(wrapper_f) 181 | param_index = 0 182 | for (julia_t, codegen_t, wrapper_t, wrapper_param) in 183 | zip(julia_types, codegen_types, wrapper_types, wrapper_params) 184 | param_index += 1 185 | if codegen_t != wrapper_t 186 | # the wrapper argument doesn't match the kernel parameter type. 187 | # this only happens when codegen wants to pass a pointer. 188 | @compiler_assert isa(codegen_t, LLVM.PointerType) job 189 | @compiler_assert eltype(codegen_t) == wrapper_t job 190 | 191 | # copy the argument value to a stack slot, and reference it. 192 | ptr = alloca!(builder, wrapper_t) 193 | if LLVM.addrspace(codegen_t) != 0 194 | ptr = addrspacecast!(builder, ptr, codegen_t) 195 | end 196 | store!(builder, wrapper_param, ptr) 197 | push!(wrapper_args, ptr) 198 | else 199 | push!(wrapper_args, wrapper_param) 200 | for attr in collect(parameter_attributes(entry_f, param_index)) 201 | push!(parameter_attributes(wrapper_f, param_index), attr) 202 | end 203 | end 204 | end 205 | 206 | call!(builder, entry_f, wrapper_args) 207 | 208 | ret!(builder) 209 | 210 | dispose(builder) 211 | end 212 | 213 | # early-inline the original entry function into the wrapper 214 | push!(function_attributes(entry_f), EnumAttribute("alwaysinline", 0, JuliaContext())) 215 | linkage!(entry_f, LLVM.API.LLVMInternalLinkage) 216 | 217 | fixup_metadata!(entry_f) 218 | ModulePassManager() do pm 219 | always_inliner!(pm) 220 | run!(pm, mod) 221 | end 222 | 223 | return wrapper_f 224 | end 225 | # HACK: get rid of invariant.load and const TBAA metadata on loads from pointer args, 226 | # since storing to a stack slot violates the semantics of those attributes. 
227 | # TODO: can we emit a wrapper that doesn't violate Julia's metadata? 228 | function fixup_metadata!(f::LLVM.Function) 229 | for param in parameters(f) 230 | if isa(llvmtype(param), LLVM.PointerType) 231 | # collect all uses of the pointer 232 | worklist = Vector{LLVM.Instruction}(user.(collect(uses(param)))) 233 | while !isempty(worklist) 234 | value = popfirst!(worklist) 235 | 236 | # remove the invariant.load attribute 237 | md = metadata(value) 238 | if haskey(md, LLVM.MD_invariant_load) 239 | delete!(md, LLVM.MD_invariant_load) 240 | end 241 | if haskey(md, LLVM.MD_tbaa) 242 | delete!(md, LLVM.MD_tbaa) 243 | end 244 | 245 | # recurse on the output of some instructions 246 | if isa(value, LLVM.BitCastInst) || 247 | isa(value, LLVM.GetElementPtrInst) || 248 | isa(value, LLVM.AddrSpaceCastInst) 249 | append!(worklist, user.(collect(uses(value)))) 250 | end 251 | 252 | # IMPORTANT NOTE: if we ever want to inline functions at the LLVM level, 253 | # we need to recurse into call instructions here, and strip metadata from 254 | # called functions (see CUDAnative.jl#238). 
255 | end 256 | end 257 | end 258 | end 259 | -------------------------------------------------------------------------------- /test/native.jl: -------------------------------------------------------------------------------- 1 | @testset "native" begin 2 | 3 | include("definitions/native.jl") 4 | 5 | ############################################################################################ 6 | 7 | @testset "Compilation" begin 8 | kernel() = nothing 9 | 10 | output = native_code_execution(kernel, (); validate=false) 11 | @test occursin("kernel", output[2]) 12 | @test isempty(output[3]) 13 | @test isempty(output[4]) 14 | 15 | @testset "Undefined Functions" begin 16 | function undef_fn() 17 | ccall("extern somefunc", llvmcall, Cvoid, ()) 18 | nothing 19 | end 20 | 21 | output = native_code_execution(undef_fn, (); validate=false) 22 | @test length(output[3]) == 1 23 | @test output[3][1] == "somefunc" 24 | end 25 | 26 | @testset "Undefined Globals" begin 27 | @generated function makegbl(::Val{name}, ::Type{T}, ::Val{isext}) where {name,T,isext} 28 | T_gbl = convert(LLVMType, T) 29 | T_ptr = convert(LLVMType, Ptr{T}) 30 | llvm_f, _ = create_function(T_ptr) 31 | mod = LLVM.parent(llvm_f) 32 | gvar = GlobalVariable(mod, T_gbl, string(name)) 33 | isext && extinit!(gvar, true) 34 | Builder(JuliaContext()) do builder 35 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 36 | position!(builder, entry) 37 | result = ptrtoint!(builder, gvar, T_ptr) 38 | ret!(builder, result) 39 | end 40 | call_function(llvm_f, Ptr{T}) 41 | end 42 | function undef_gbl() 43 | ext_ptr = makegbl(Val(:someglobal), Int64, Val(true)) 44 | Base.unsafe_store!(ext_ptr, 1) 45 | ptr = makegbl(Val(:otherglobal), Float32, Val(false)) 46 | Base.unsafe_store!(ptr, 2f0) 47 | nothing 48 | end 49 | 50 | output = native_code_execution(undef_gbl, ()) 51 | @test length(output[4]) == 2 52 | @test output[4][1].name == "someglobal" 53 | @test eltype(output[4][1].type) isa LLVM.IntegerType 54 | @test 
output[4][1].external 55 | @test output[4][2].name == "otherglobal" 56 | @test eltype(output[4][2].type) isa LLVM.LLVMFloat 57 | @test !output[4][2].external 58 | end 59 | end 60 | 61 | ############################################################################################ 62 | 63 | @testset "IR" begin 64 | 65 | @testset "basic reflection" begin 66 | valid_kernel() = return 67 | invalid_kernel() = 1 68 | 69 | ir = sprint(io->native_code_llvm(io, valid_kernel, Tuple{}; optimize=false, dump_module=true)) 70 | 71 | # module should contain our function + a generic call wrapper 72 | @test occursin(r"define\ .* void\ @.*julia_valid_kernel.*\(\)"x, ir) 73 | @test !occursin("define %jl_value_t* @jlcall_", ir) 74 | 75 | # there should be no debug metadata 76 | @test !occursin("!dbg", ir) 77 | 78 | @test native_code_llvm(devnull, invalid_kernel, Tuple{}) == nothing 79 | @test_throws KernelError native_code_llvm(devnull, invalid_kernel, Tuple{}; kernel=true) == nothing 80 | end 81 | 82 | @testset "unbound typevars" begin 83 | invalid_kernel() where {unbound} = return 84 | @test_throws KernelError native_code_llvm(devnull, invalid_kernel, Tuple{}) 85 | end 86 | 87 | @testset "child functions" begin 88 | # we often test using `@noinline sink` child functions, so test whether these survive 89 | @noinline child(i) = sink(i) 90 | parent(i) = child(i) 91 | 92 | ir = sprint(io->native_code_llvm(io, parent, Tuple{Int})) 93 | @test occursin(r"call .+ @julia_child_", ir) 94 | end 95 | 96 | @testset "sysimg" begin 97 | # bug: use a system image function 98 | 99 | function foobar(a,i) 100 | Base.pointerset(a, 0, mod1(i,10), 8) 101 | end 102 | 103 | ir = sprint(io->native_code_llvm(io, foobar, Tuple{Ptr{Int},Int})) 104 | @test !occursin("jlsys_", ir) 105 | end 106 | 107 | @testset "tracked pointers" begin 108 | function kernel(a) 109 | a[1] = 1 110 | return 111 | end 112 | 113 | # this used to throw an LLVM assertion (#223) 114 | native_code_llvm(devnull, kernel, Tuple{Vector{Int}}; 
kernel=true) 115 | end 116 | 117 | if VERSION >= v"1.0.2" 118 | @testset "CUDAnative.jl#278" begin 119 | # codegen idempotency 120 | # NOTE: this isn't fixed, but surfaces here due to bad inference of checked_sub 121 | # NOTE: with the fix to print_to_string this doesn't error anymore, 122 | # but still have a test to make sure it doesn't regress 123 | native_code_llvm(devnull, Base.checked_sub, Tuple{Int,Int}; optimize=false) 124 | native_code_llvm(devnull, Base.checked_sub, Tuple{Int,Int}; optimize=false) 125 | 126 | # breaking recursion in print_to_string makes it possible to compile 127 | # even in the presence of the above bug 128 | native_code_llvm(devnull, Base.print_to_string, Tuple{Int,Int}; optimize=false) 129 | end 130 | end 131 | 132 | @testset "LLVM D32593" begin 133 | @eval struct D32593_struct 134 | foo::Float32 135 | bar::Float32 136 | end 137 | 138 | D32593(ptr) = unsafe_load(ptr).foo 139 | 140 | native_code_llvm(devnull, D32593, Tuple{Ptr{D32593_struct}}) 141 | end 142 | 143 | end 144 | 145 | ############################################################################################ 146 | 147 | @testset "assembly" begin 148 | 149 | @testset "basic reflection" begin 150 | valid_kernel() = return 151 | invalid_kernel() = 1 152 | 153 | @test native_code_native(devnull, valid_kernel, Tuple{}) == nothing 154 | @test native_code_native(devnull, invalid_kernel, Tuple{}) == nothing 155 | @test_throws KernelError native_code_native(devnull, invalid_kernel, Tuple{}; kernel=true) 156 | end 157 | 158 | @testset "idempotency" begin 159 | # bug: generate code twice for the same kernel (jl_to_ptx wasn't idempotent) 160 | 161 | kernel() = return 162 | native_code_native(devnull, kernel, Tuple{}) 163 | native_code_native(devnull, kernel, Tuple{}) 164 | end 165 | 166 | @testset "compile for host after gpu" begin 167 | # issue #11: re-using host functions after GPU compilation 168 | @noinline child(i) = sink(i+1) 169 | 170 | function fromhost() 171 | child(10) 172 
| end 173 | 174 | function fromptx() 175 | child(10) 176 | return 177 | end 178 | 179 | native_code_native(devnull, fromptx, Tuple{}) 180 | @test fromhost() == 11 181 | end 182 | 183 | end 184 | 185 | ############################################################################################ 186 | 187 | @testset "errors" begin 188 | 189 | # some validation happens in the emit_function hook, which is called by code_llvm 190 | 191 | @testset "base intrinsics" begin 192 | foobar(i) = sin(i) 193 | 194 | # NOTE: we don't use test_logs in order to test all of the warning (exception, backtrace) 195 | logs, _ = Test.collect_test_logs(min_level=Info) do 196 | withenv("JULIA_DEBUG" => nothing) do 197 | native_code_llvm(devnull, foobar, Tuple{Int}) 198 | end 199 | end 200 | @test length(logs) == 1 201 | record = logs[1] 202 | @test record.level == Base.CoreLogging.Warn 203 | @test record.message == "calls to Base intrinsics might be GPU incompatible" 204 | @test haskey(record.kwargs, :exception) 205 | err,bt = record.kwargs[:exception] 206 | err_msg = sprint(showerror, err) 207 | @test occursin(Regex("You called sin(.+) in Base.Math .+, maybe you intended to call sin(.+) in $TestRuntime .+ instead?"), err_msg) 208 | bt_msg = sprint(Base.show_backtrace, bt) 209 | @test occursin("[1] sin", bt_msg) 210 | @test occursin(r"\[2\] .+foobar", bt_msg) 211 | end 212 | 213 | # some validation happens in `compile` 214 | 215 | @eval Main begin 216 | struct CleverType{T} 217 | x::T 218 | end 219 | Base.unsafe_trunc(::Type{Int}, x::CleverType) = unsafe_trunc(Int, x.x) 220 | end 221 | 222 | @testset "non-isbits arguments" begin 223 | foobar(i) = (sink(unsafe_trunc(Int,i)); return) 224 | 225 | @test_throws_message(KernelError, 226 | native_code_execution(foobar, Tuple{BigInt})) do msg 227 | occursin("passing and using non-bitstype argument", msg) && 228 | occursin("BigInt", msg) 229 | end 230 | 231 | # test that we can handle abstract types 232 | @test_throws_message(KernelError, 233 | 
native_code_execution(foobar, Tuple{Any})) do msg 234 | occursin("passing and using non-bitstype argument", msg) && 235 | occursin("Any", msg) 236 | end 237 | 238 | @test_throws_message(KernelError, 239 | native_code_execution(foobar, Tuple{Union{Int32, Int64}})) do msg 240 | occursin("passing and using non-bitstype argument", msg) && 241 | occursin("Union{Int32, Int64}", msg) 242 | end 243 | 244 | @test_throws_message(KernelError, 245 | native_code_execution(foobar, Tuple{Union{Int32, Int64}})) do msg 246 | occursin("passing and using non-bitstype argument", msg) && 247 | occursin("Union{Int32, Int64}", msg) 248 | end 249 | 250 | # test that we get information about fields and reason why something is not isbits 251 | @test_throws_message(KernelError, 252 | native_code_execution(foobar, Tuple{CleverType{BigInt}})) do msg 253 | occursin("passing and using non-bitstype argument", msg) && 254 | occursin("CleverType", msg) && 255 | occursin("BigInt", msg) 256 | end 257 | end 258 | 259 | @testset "invalid LLVM IR" begin 260 | foobar(i) = println(i) 261 | 262 | @test_throws_message(InvalidIRError, 263 | native_code_execution(foobar, Tuple{Int})) do msg 264 | occursin("invalid LLVM IR", msg) && 265 | occursin(GPUCompiler.RUNTIME_FUNCTION, msg) && 266 | occursin("[1] println", msg) && 267 | occursin(r"\[2\] .+foobar", msg) 268 | end 269 | end 270 | 271 | @testset "invalid LLVM IR (ccall)" begin 272 | foobar(p) = (unsafe_store!(p, ccall(:time, Cint, ())); nothing) 273 | 274 | @test_throws_message(InvalidIRError, 275 | native_code_execution(foobar, Tuple{Ptr{Int}})) do msg 276 | occursin("invalid LLVM IR", msg) && 277 | occursin(GPUCompiler.POINTER_FUNCTION, msg) && 278 | occursin(r"\[1\] .+foobar", msg) 279 | end 280 | end 281 | 282 | @testset "delayed bindings" begin 283 | kernel() = (undefined; return) 284 | 285 | @test_throws_message(InvalidIRError, 286 | native_code_execution(kernel, Tuple{})) do msg 287 | occursin("invalid LLVM IR", msg) && 288 | 
occursin(GPUCompiler.DELAYED_BINDING, msg) && 289 | occursin("use of 'undefined'", msg) && 290 | occursin(r"\[1\] .+kernel", msg) 291 | end 292 | end 293 | 294 | @testset "dynamic call (invoke)" begin 295 | @eval @noinline nospecialize_child(@nospecialize(i)) = i 296 | kernel(a, b) = (unsafe_store!(b, nospecialize_child(a)); return) 297 | 298 | @test_throws_message(InvalidIRError, 299 | native_code_execution(kernel, Tuple{Int,Ptr{Int}})) do msg 300 | occursin("invalid LLVM IR", msg) && 301 | occursin(GPUCompiler.DYNAMIC_CALL, msg) && 302 | occursin("call to nospecialize_child", msg) && 303 | occursin(r"\[1\] .+kernel", msg) 304 | end 305 | end 306 | 307 | @testset "dynamic call (apply)" begin 308 | func() = println(1) 309 | 310 | @test_throws_message(InvalidIRError, 311 | native_code_execution(func, Tuple{})) do msg 312 | occursin("invalid LLVM IR", msg) && 313 | occursin(GPUCompiler.DYNAMIC_CALL, msg) && 314 | occursin("call to println", msg) && 315 | occursin("[2] func", msg) 316 | end 317 | end 318 | 319 | end 320 | 321 | ############################################################################################ 322 | 323 | end 324 | -------------------------------------------------------------------------------- /src/ptx.jl: -------------------------------------------------------------------------------- 1 | # implementation of the GPUCompiler interfaces for generating PTX code 2 | 3 | ## target 4 | 5 | export PTXCompilerTarget 6 | 7 | Base.@kwdef struct PTXCompilerTarget <: AbstractCompilerTarget 8 | cap::VersionNumber 9 | 10 | # optional properties 11 | minthreads::Union{Nothing,Int,NTuple{<:Any,Int}} = nothing 12 | maxthreads::Union{Nothing,Int,NTuple{<:Any,Int}} = nothing 13 | blocks_per_sm::Union{Nothing,Int} = nothing 14 | maxregs::Union{Nothing,Int} = nothing 15 | end 16 | 17 | llvm_triple(::PTXCompilerTarget) = Int===Int64 ? 
"nvptx64-nvidia-cuda" : "nvptx-nvidia-cuda" 18 | 19 | function llvm_machine(target::PTXCompilerTarget) 20 | triple = llvm_triple(target) 21 | t = Target(triple) 22 | 23 | cpu = "sm_$(target.cap.major)$(target.cap.minor)" 24 | feat = "+ptx60" # we only support CUDA 9.0+ and LLVM 6.0+ 25 | tm = TargetMachine(t, triple, cpu, feat) 26 | asm_verbosity!(tm, true) 27 | 28 | return tm 29 | end 30 | 31 | # the default datalayout does not match the one in the NVPTX user guide 32 | llvm_datalayout(::PTXCompilerTarget) = Int===Int64 ? 33 | "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64"* 34 | "-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" : 35 | "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64"* 36 | "-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" 37 | 38 | 39 | ## job 40 | 41 | function Base.show(io::IO, job::CompilerJob{PTXCompilerTarget}) 42 | print(io, "PTX CompilerJob of ", job.source) 43 | print(io, " for sm_$(job.target.cap.major)$(job.target.cap.minor)") 44 | 45 | job.target.minthreads !== nothing && print(io, ", minthreads=$(job.target.minthreads)") 46 | job.target.maxthreads !== nothing && print(io, ", maxthreads=$(job.target.maxthreads)") 47 | job.target.blocks_per_sm !== nothing && print(io, ", blocks_per_sm=$(job.target.blocks_per_sm)") 48 | job.target.maxregs !== nothing && print(io, ", maxregs=$(job.target.maxregs)") 49 | end 50 | 51 | const ptx_intrinsics = ("vprintf", "__assertfail", "malloc", "free") 52 | isintrinsic(::CompilerJob{PTXCompilerTarget}, fn::String) = in(fn, ptx_intrinsics) 53 | 54 | # TODO: encode debug build or not in the compiler job 55 | # https://github.com/JuliaGPU/CUDAnative.jl/issues/368 56 | runtime_slug(job::CompilerJob{PTXCompilerTarget}) = "ptx-sm_$(job.target.cap.major)$(job.target.cap.minor)" 57 | 58 | function process_kernel!(job::CompilerJob{PTXCompilerTarget}, mod::LLVM.Module, kernel::LLVM.Function) 59 | # property annotations 60 | annotations = 
LLVM.Value[kernel] 61 | 62 | ## kernel metadata 63 | append!(annotations, [MDString("kernel"), ConstantInt(Int32(1), JuliaContext())]) 64 | 65 | ## expected CTA sizes 66 | if job.target.minthreads != nothing 67 | for (dim, name) in enumerate([:x, :y, :z]) 68 | bound = dim <= length(job.target.minthreads) ? job.target.minthreads[dim] : 1 69 | append!(annotations, [MDString("reqntid$name"), 70 | ConstantInt(Int32(bound), JuliaContext())]) 71 | end 72 | end 73 | if job.target.maxthreads != nothing 74 | for (dim, name) in enumerate([:x, :y, :z]) 75 | bound = dim <= length(job.target.maxthreads) ? job.target.maxthreads[dim] : 1 76 | append!(annotations, [MDString("maxntid$name"), 77 | ConstantInt(Int32(bound), JuliaContext())]) 78 | end 79 | end 80 | 81 | if job.target.blocks_per_sm != nothing 82 | append!(annotations, [MDString("minctasm"), 83 | ConstantInt(Int32(job.target.blocks_per_sm), JuliaContext())]) 84 | end 85 | 86 | if job.target.maxregs != nothing 87 | append!(annotations, [MDString("maxnreg"), 88 | ConstantInt(Int32(job.target.maxregs), JuliaContext())]) 89 | end 90 | 91 | push!(metadata(mod), "nvvm.annotations", MDNode(annotations)) 92 | 93 | 94 | if LLVM.version() >= v"8" 95 | # calling convention 96 | for fun in functions(mod) 97 | callconv!(kernel, LLVM.API.LLVMPTXDeviceCallConv) 98 | end 99 | callconv!(kernel, LLVM.API.LLVMPTXKernelCallConv) 100 | end 101 | 102 | return kernel 103 | end 104 | 105 | function add_lowering_passes!(job::CompilerJob{PTXCompilerTarget}, pm::LLVM.PassManager) 106 | add!(pm, FunctionPass("HideUnreachable", hide_unreachable!)) 107 | add!(pm, ModulePass("HideTrap", hide_trap!)) 108 | end 109 | 110 | function add_optimization_passes!(job::CompilerJob{PTXCompilerTarget}, pm::LLVM.PassManager) 111 | # NVPTX's target machine info enables runtime unrolling, 112 | # but Julia's pass sequence only invokes the simple unroller. 
113 | loop_unroll!(pm) 114 | instruction_combining!(pm) # clean-up redundancy 115 | licm!(pm) # the inner runtime check might be outer loop invariant 116 | 117 | # the above loop unroll pass might have unrolled regular, non-runtime nested loops. 118 | # that code still needs to be optimized (arguably, multiple unroll passes should be 119 | # scheduled by the Julia optimizer). do so here, instead of re-optimizing entirely. 120 | early_csemem_ssa!(pm) # TODO: gvn instead? see NVPTXTargetMachine.cpp::addEarlyCSEOrGVNPass 121 | dead_store_elimination!(pm) 122 | 123 | constant_merge!(pm) 124 | 125 | cfgsimplification!(pm) 126 | 127 | # get rid of the internalized functions; now possible unused 128 | global_dce!(pm) 129 | end 130 | 131 | 132 | ## LLVM passes 133 | 134 | # HACK: this pass removes `unreachable` information from LLVM 135 | # 136 | # `ptxas` is buggy and cannot deal with thread-divergent control flow in the presence of 137 | # shared memory (see JuliaGPU/CUDAnative.jl#4). avoid that by rewriting control flow to fall 138 | # through any other block. this is semantically invalid, but the code is unreachable anyhow 139 | # (and we expect it to be preceded by eg. a noreturn function, or a trap). 140 | # 141 | # TODO: can LLVM do this with structured CFGs? It seems to have some support, but seemingly 142 | # only to prevent introducing non-structureness during optimization (ie. the front-end 143 | # is still responsible for generating structured control flow). 144 | function hide_unreachable!(fun::LLVM.Function) 145 | job = current_job::CompilerJob 146 | changed = false 147 | @timeit_debug to "hide unreachable" begin 148 | 149 | # remove `noreturn` attributes 150 | # 151 | # when calling a `noreturn` function, LLVM places an `unreachable` after the call. 152 | # this leads to an early `ret` from the function. 
153 | attrs = function_attributes(fun) 154 | delete!(attrs, EnumAttribute("noreturn", 0, JuliaContext())) 155 | 156 | # build a map of basic block predecessors 157 | predecessors = Dict(bb => Set{LLVM.BasicBlock}() for bb in blocks(fun)) 158 | @timeit_debug to "predecessors" for bb in blocks(fun) 159 | insts = instructions(bb) 160 | if !isempty(insts) 161 | inst = last(insts) 162 | if isterminator(inst) 163 | for bb′ in successors(inst) 164 | push!(predecessors[bb′], bb) 165 | end 166 | end 167 | end 168 | end 169 | 170 | # scan for unreachable terminators and alternative successors 171 | worklist = Pair{LLVM.BasicBlock, Union{Nothing,LLVM.BasicBlock}}[] 172 | @timeit_debug to "find" for bb in blocks(fun) 173 | unreachable = terminator(bb) 174 | if isa(unreachable, LLVM.UnreachableInst) 175 | unsafe_delete!(bb, unreachable) 176 | changed = true 177 | 178 | try 179 | terminator(bb) 180 | # the basic-block is still terminated properly, nothing to do 181 | # (this can happen with `ret; unreachable`) 182 | # TODO: `unreachable; unreachable` 183 | catch ex 184 | isa(ex, UndefRefError) || rethrow(ex) 185 | let builder = Builder(JuliaContext()) 186 | position!(builder, bb) 187 | 188 | # find the strict predecessors to this block 189 | preds = collect(predecessors[bb]) 190 | 191 | # find a fallthrough block: recursively look at predecessors 192 | # and find a successor that branches to any other block 193 | fallthrough = nothing 194 | while !isempty(preds) 195 | # find an alternative successor 196 | for pred in preds, succ in successors(terminator(pred)) 197 | if succ != bb 198 | fallthrough = succ 199 | break 200 | end 201 | end 202 | fallthrough === nothing || break 203 | 204 | # recurse upwards 205 | old_preds = copy(preds) 206 | empty!(preds) 207 | for pred in old_preds 208 | append!(preds, predecessors[pred]) 209 | end 210 | end 211 | push!(worklist, bb => fallthrough) 212 | 213 | dispose(builder) 214 | end 215 | end 216 | end 217 | end 218 | 219 | # apply the pending 
terminator rewrites 220 | @timeit_debug to "replace" if !isempty(worklist) 221 | let builder = Builder(JuliaContext()) 222 | for (bb, fallthrough) in worklist 223 | position!(builder, bb) 224 | if fallthrough !== nothing 225 | br!(builder, fallthrough) 226 | else 227 | # couldn't find any other successor. this happens with functions 228 | # that only contain a single block, or when the block is dead. 229 | ft = eltype(llvmtype(fun)) 230 | if return_type(ft) == LLVM.VoidType(JuliaContext()) 231 | # even though returning can lead to invalid control flow, 232 | # it mostly happens with functions that just throw, 233 | # and leaving the unreachable there would make the optimizer 234 | # place another after the call. 235 | ret!(builder) 236 | else 237 | unreachable!(builder) 238 | end 239 | end 240 | end 241 | end 242 | end 243 | 244 | end 245 | return changed 246 | end 247 | 248 | # HACK: this pass removes calls to `trap` and replaces them with inline assembly 249 | # 250 | # if LLVM knows we're trapping, code is marked `unreachable` (see `hide_unreachable!`). 251 | function hide_trap!(mod::LLVM.Module) 252 | job = current_job::CompilerJob 253 | changed = false 254 | @timeit_debug to "hide trap" begin 255 | 256 | # inline assembly to exit a thread, hiding control flow from LLVM 257 | exit_ft = LLVM.FunctionType(LLVM.VoidType(JuliaContext())) 258 | exit = if job.target.cap < v"7" 259 | # ptxas for old compute capabilities has a bug where it messes up the 260 | # synchronization stack in the presence of shared memory and thread-divergend exit. 
# (tail of the preceding lowering pass — header outside this chunk; reformatted, behavior unchanged)
        InlineAsm(exit_ft, "trap;", "", true)
    else
        InlineAsm(exit_ft, "exit;", "", true)
    end

    if haskey(functions(mod), "llvm.trap")
        trap = functions(mod)["llvm.trap"]

        for use in uses(trap)
            val = user(use)
            if isa(val, LLVM.CallInst)
                let builder = Builder(JuliaContext())
                    position!(builder, val)
                    call!(builder, exit)
                    dispose(builder)
                end
                unsafe_delete!(LLVM.parent(val), val)
                changed = true
            end
        end
    end

    end
    return changed
end
--------------------------------------------------------------------------------
/src/irgen.jl:
--------------------------------------------------------------------------------
# LLVM IR generation


## method compilation tracer

# this functionality is used to detect recursion, and functions that shouldn't be called.
# it is a hack, and should disappear over time. don't add new features to it.

# generate a pseudo-backtrace (a vector of StackFrames, caller last) from a stack of
# methods currently being emitted, for attaching to compiler warnings and errors.
function backtrace(job::CompilerJob, call_stack::Vector{Core.MethodInstance})
    bt = StackTraces.StackFrame[]
    for method_instance in call_stack
        method = method_instance.def
        if method.name === :overdub && isdefined(method, :generator)
            # The inline frames are maintained by the dwarf based backtrace, but here we only have the
            # calls to overdub directly, the backtrace therefore is collapsed and we have to
            # lookup the overdubbed function, but only if we likely are using the generated variant.
            actual_sig = Tuple{method_instance.specTypes.parameters[3:end]...}
            m = ccall(:jl_gf_invoke_lookup, Any, (Any, UInt), actual_sig, typemax(UInt))
            method = m.func::Method
        end
        frame = StackTraces.StackFrame(method.name, method.file, method.line)
        pushfirst!(bt, frame)
    end
    bt
end

# NOTE: we use an exception to be able to display a stack trace using the logging framework
struct MethodSubstitutionWarning <: Exception
    original::Method
    substitute::Method
end
Base.showerror(io::IO, err::MethodSubstitutionWarning) =
    print(io, "You called $(err.original), maybe you intended to call $(err.substitute) instead?")

# Base methods for which no GPU-package substitution warning should be emitted
const method_substitution_whitelist = [:hypot, :exp]

# tracks the stack of method instances being emitted, so that we can detect recursion
# and suspicious calls into Base while the Julia compiler runs our emission hooks
mutable struct MethodCompileTracer
    job::CompilerJob
    call_stack::Vector{Core.MethodInstance}
    # set by pop!; only valid after the first method has finished emitting
    last_method_instance::Union{Nothing,Core.MethodInstance}

    MethodCompileTracer(job, start) = new(job, Core.MethodInstance[start])
    MethodCompileTracer(job) = new(job, Core.MethodInstance[])
end

function Base.push!(tracer::MethodCompileTracer, method_instance)
    push!(tracer.call_stack, method_instance)

    if VERSION < v"1.5.0-DEV.393"
        # check for recursion
        if method_instance in tracer.call_stack[1:end-1]
            throw(KernelError(tracer.job, "recursion is currently not supported";
                              bt=backtrace(tracer.job, tracer.call_stack)))
        end
    end

    # check for Base functions that exist in the GPU package
    # FIXME: this might be too coarse
    method = method_instance.def
    if Base.moduleroot(method.module) == Base &&
       isdefined(runtime_module(tracer.job), method_instance.def.name) &&
       !in(method_instance.def.name, method_substitution_whitelist)
        substitute_function = getfield(runtime_module(tracer.job), method.name)
        tt = Tuple{method_instance.specTypes.parameters[2:end]...}
        if hasmethod(substitute_function, tt)
            method′ = which(substitute_function, tt)
            if method′.module == runtime_module(tracer.job)
                @warn "calls to Base intrinsics might be GPU incompatible" exception=(MethodSubstitutionWarning(method, method′), backtrace(tracer.job, tracer.call_stack))
            end
        end
    end
end

function Base.pop!(tracer::MethodCompileTracer, method_instance)
    @compiler_assert last(tracer.call_stack) == method_instance tracer.job
    tracer.last_method_instance = pop!(tracer.call_stack)
end

Base.last(tracer::MethodCompileTracer) = tracer.last_method_instance


## Julia compiler integration

if VERSION >= v"1.5.0-DEV.393"

# JuliaLang/julia#25984 significantly restructured the compiler

# emit `method_instance` (and everything it calls) to LLVM IR via jl_create_native,
# returning the specialized function and the containing module
function compile_method_instance(job::CompilerJob, method_instance::Core.MethodInstance, world)
    # set-up the compiler interface
    tracer = MethodCompileTracer(job, method_instance)
    hook_emit_function(method_instance, code) = push!(tracer, method_instance)
    hook_emitted_function(method_instance, code) = pop!(tracer, method_instance)
    param_kwargs = [:track_allocations  => false,
                    :code_coverage      => false,
                    :static_alloc       => false,
                    :prefer_specsig     => true,
                    :emit_function      => hook_emit_function,
                    :emitted_function   => hook_emitted_function]
    if LLVM.version() >= v"8.0" && VERSION >= v"1.3.0-DEV.547"
        push!(param_kwargs, :gnu_pubnames => false)

        debug_info_kind = if Base.JLOptions().debug_level == 0
            LLVM.API.LLVMDebugEmissionKindNoDebug
        elseif Base.JLOptions().debug_level == 1
            LLVM.API.LLVMDebugEmissionKindLineTablesOnly
        elseif Base.JLOptions().debug_level >= 2
            LLVM.API.LLVMDebugEmissionKindFullDebug
        end

        # LLVM's debug info crashes older CUDA assemblers
        if job.target isa PTXCompilerTarget # && driver_version(job.target) < v"10.2"
            # FIXME: this was supposed to be fixed on 10.2
            @debug "Incompatibility detected between CUDA and LLVM 8.0+; disabling debug info emission" maxlog=1
            debug_info_kind = LLVM.API.LLVMDebugEmissionKindNoDebug
        end

        push!(param_kwargs, :debug_info_kind => Cint(debug_info_kind))
    end
    params = Base.CodegenParams(;param_kwargs...)

    # generate IR
    if VERSION >= v"1.5.0-DEV.851"
        native_code = ccall(:jl_create_native, Ptr{Cvoid},
                            (Vector{Core.MethodInstance}, Base.CodegenParams, Cint),
                            [method_instance], params, #=extern policy=# 1)
    else
        native_code = ccall(:jl_create_native, Ptr{Cvoid},
                            (Vector{Core.MethodInstance}, Base.CodegenParams),
                            [method_instance], params)
    end
    @assert native_code != C_NULL
    llvm_mod_ref = ccall(:jl_get_llvm_module, LLVM.API.LLVMModuleRef,
                         (Ptr{Cvoid},), native_code)
    @assert llvm_mod_ref != C_NULL
    llvm_mod = LLVM.Module(llvm_mod_ref)

    # get the top-level code
    code = if VERSION >= v"1.6.0-DEV.12"
        # TODO: use our own interpreter
        interpreter = Core.Compiler.NativeInterpreter(world)
        Core.Compiler.inf_for_methodinstance(interpreter, method_instance, world, world)
    else
        Core.Compiler.inf_for_methodinstance(method_instance, world, world)
    end

    # get the top-level function index
    # NOTE: a stray `ccall(:jl_breakpoint, ...)` debugging leftover was removed here
    llvm_func_idx = Ref{Int32}(-1)
    llvm_specfunc_idx = Ref{Int32}(-1)
    ccall(:jl_get_function_id, Nothing,
          (Ptr{Cvoid}, Any, Ptr{Int32}, Ptr{Int32}),
          native_code, code, llvm_func_idx, llvm_specfunc_idx)
    @assert llvm_func_idx[] != -1
    @assert llvm_specfunc_idx[] != -1

    # get the top-level function
    llvm_func_ref = ccall(:jl_get_llvm_function, LLVM.API.LLVMValueRef,
                          (Ptr{Cvoid}, UInt32), native_code, llvm_func_idx[]-1)
    @assert llvm_func_ref != C_NULL
    llvm_func = LLVM.Function(llvm_func_ref)
    llvm_specfunc_ref = ccall(:jl_get_llvm_function, LLVM.API.LLVMValueRef,
                              (Ptr{Cvoid}, UInt32), native_code, llvm_specfunc_idx[]-1)
    @assert llvm_specfunc_ref != C_NULL
    llvm_specfunc = LLVM.Function(llvm_specfunc_ref)

    # configure the module
    triple!(llvm_mod, llvm_triple(job.target))
    if llvm_datalayout(job.target) !== nothing
        datalayout!(llvm_mod, llvm_datalayout(job.target))
    end

    return llvm_specfunc, llvm_mod
end

else

function module_setup(job::CompilerJob, mod::LLVM.Module)
    # configure the module
    triple!(mod, llvm_triple(job.target))
    datalayout!(mod, llvm_datalayout(job.target))

    # add debug info metadata
    if LLVM.version() >= v"8.0"
        # Set Dwarf Version to 2, the DI printer will downgrade to v2 automatically,
        # but this is technically correct and the only version supported by NVPTX
        LLVM.flags(mod)["Dwarf Version", LLVM.API.LLVMModuleFlagBehaviorWarning] =
            Metadata(ConstantInt(Int32(2), JuliaContext()))
        LLVM.flags(mod)["Debug Info Version", LLVM.API.LLVMModuleFlagBehaviorError] =
            Metadata(ConstantInt(DEBUG_METADATA_VERSION(), JuliaContext()))
    else
        push!(metadata(mod), "llvm.module.flags",
              MDNode([ConstantInt(Int32(1), JuliaContext()),    # llvm::Module::Error
                      MDString("Debug Info Version"),
                      ConstantInt(DEBUG_METADATA_VERSION(), JuliaContext())]))
    end
end

# pre-1.5 code path: emit via jl_get_llvmf_defn and manually link dependent modules
function compile_method_instance(job::CompilerJob, method_instance::Core.MethodInstance, world)
    function postprocess(ir)
        # get rid of jfptr wrappers
        for llvmf in functions(ir)
            startswith(LLVM.name(llvmf), "jfptr_") && unsafe_delete!(ir, llvmf)
        end

        return
    end

    # set-up the compiler interface
    tracer = MethodCompileTracer(job)
    hook_emit_function(method_instance, code, world) = push!(tracer, method_instance)
    hook_emitted_function(method_instance, code, world) = pop!(tracer, method_instance)
    dependencies = MultiDict{Core.MethodInstance,LLVM.Function}()
    function hook_module_setup(ref::Ptr{Cvoid})
        ref = convert(LLVM.API.LLVMModuleRef, ref)
        ir = LLVM.Module(ref)
        module_setup(job, ir)
    end
    function hook_module_activation(ref::Ptr{Cvoid})
        ref = convert(LLVM.API.LLVMModuleRef, ref)
        ir = LLVM.Module(ref)
        postprocess(ir)

        # find the function that this module defines
        llvmfs = filter(llvmf -> !isdeclaration(llvmf) &&
                                 linkage(llvmf) == LLVM.API.LLVMExternalLinkage,
                        collect(functions(ir)))

        llvmf = nothing
        if length(llvmfs) == 1
            llvmf = first(llvmfs)
        elseif length(llvmfs) > 1
            llvmfs = filter!(llvmf -> startswith(LLVM.name(llvmf), "julia_"), llvmfs)
            if length(llvmfs) == 1
                llvmf = first(llvmfs)
            end
        end

        @compiler_assert llvmf !== nothing job

        insert!(dependencies, last(tracer), llvmf)
    end
    param_kwargs = [:cached             => false,
                    :track_allocations  => false,
                    :code_coverage      => false,
                    :static_alloc       => false,
                    :prefer_specsig     => true,
                    :module_setup       => hook_module_setup,
                    :module_activation  => hook_module_activation,
                    :emit_function      => hook_emit_function,
                    :emitted_function   => hook_emitted_function]
    if LLVM.version() >= v"8.0" && VERSION >= v"1.3.0-DEV.547"
        push!(param_kwargs, :gnu_pubnames => false)

        debug_info_kind = if Base.JLOptions().debug_level == 0
            LLVM.API.LLVMDebugEmissionKindNoDebug
        elseif Base.JLOptions().debug_level == 1
            LLVM.API.LLVMDebugEmissionKindLineTablesOnly
        elseif Base.JLOptions().debug_level >= 2
            LLVM.API.LLVMDebugEmissionKindFullDebug
        end

        # LLVM's debug info crashes older CUDA assemblers
        if job.target isa PTXCompilerTarget # && driver_version(job.target) < v"10.2"
            # FIXME: this was supposed to be fixed on 10.2
            @debug "Incompatibility detected between CUDA and LLVM 8.0+; disabling debug info emission" maxlog=1
            debug_info_kind = LLVM.API.LLVMDebugEmissionKindNoDebug
        end

        push!(param_kwargs, :debug_info_kind => Cint(debug_info_kind))
    end
    params = Base.CodegenParams(;param_kwargs...)

    # get the code
    ref = ccall(:jl_get_llvmf_defn, LLVM.API.LLVMValueRef,
                (Any, UInt, Bool, Bool, Base.CodegenParams),
                method_instance, world, #=wrapper=#false, #=optimize=#false, params)
    if ref == C_NULL
        throw(InternalCompilerError(job, "the Julia compiler could not generate LLVM IR"))
    end
    llvmf = LLVM.Function(ref)
    ir = LLVM.parent(llvmf)
    postprocess(ir)

    # link in dependent modules
    entry = llvmf
    mod = LLVM.parent(entry)
    @timeit_debug to "linking" begin
        # we disable Julia's compilation cache not to poison it with GPU-specific code.
        # as a result, we might get multiple modules for a single method instance.
        cache = Dict{String,String}()

        for called_method_instance in keys(dependencies)
            llvmfs = dependencies[called_method_instance]

            # link the first module
            llvmf = popfirst!(llvmfs)
            llvmfn = LLVM.name(llvmf)
            link!(mod, LLVM.parent(llvmf))

            # process subsequent duplicate modules
            for dup_llvmf in llvmfs
                if Base.JLOptions().debug_level >= 2
                    # link them too, to ensure accurate backtrace reconstruction
                    link!(mod, LLVM.parent(dup_llvmf))
                else
                    # don't link them, but note the called function name in a cache
                    dup_llvmfn = LLVM.name(dup_llvmf)
                    cache[dup_llvmfn] = llvmfn
                end
            end
        end

        # resolve function declarations with cached entries
        for llvmf in filter(isdeclaration, collect(functions(mod)))
            llvmfn = LLVM.name(llvmf)
            if haskey(cache, llvmfn)
                def_llvmfn = cache[llvmfn]
                replace_uses!(llvmf, functions(mod)[def_llvmfn])

                @compiler_assert isempty(uses(llvmf)) job
                unsafe_delete!(LLVM.parent(llvmf), llvmf)
            end
        end
    end

    return entry, mod
end

end

# main entry point: generate LLVM IR for a method instance, clean it up, rename and
# (for kernels) promote the entry point, and run the minimal lowering passes.
# returns the module and the entry function.
function irgen(job::CompilerJob, method_instance::Core.MethodInstance, world)
    entry, mod = @timeit_debug to "emission" compile_method_instance(job, method_instance, world)

    # clean up incompatibilities
    @timeit_debug to "clean-up" begin
        for llvmf in functions(mod)
            # only occurs in debug builds
            delete!(function_attributes(llvmf), EnumAttribute("sspstrong", 0, JuliaContext()))

            if VERSION < v"1.5.0-DEV.393"
                # make function names safe for ptxas
                llvmfn = LLVM.name(llvmf)
                if !isdeclaration(llvmf)
                    llvmfn′ = safe_name(llvmfn)
                    if llvmfn != llvmfn′
                        LLVM.name!(llvmf, llvmfn′)
                        llvmfn = llvmfn′
                    end
                end
            end

            if Sys.iswindows()
                personality!(llvmf, nothing)
            end
        end

        # remove the exception-handling personality function
        # (haskey, matching the other lookups in this file; `in` iterates Function values
        # and would never match a String)
        if Sys.iswindows() && haskey(functions(mod), "__julia_personality")
            llvmf = functions(mod)["__julia_personality"]
            @compiler_assert isempty(uses(llvmf)) job
            unsafe_delete!(mod, llvmf)
        end
    end

    # target-specific processing
    process_module!(job, mod)

    # rename the entry point
    if job.source.name !== nothing
        llvmfn = safe_name(string("julia_", job.source.name))
    else
        # strip the globalUnique counter
        llvmfn = LLVM.name(entry)
    end
    LLVM.name!(entry, llvmfn)

    # promote entry-points to kernels and mangle their names
    if job.source.kernel
        entry = promote_kernel!(job, mod, entry)
        LLVM.name!(entry, mangle_call(entry, job.source.tt))
    end

    # minimal required optimization
    @timeit_debug to "rewrite" ModulePassManager() do pm
        global current_job
        current_job = job

        linkage!(entry, LLVM.API.LLVMExternalLinkage)
        internalize!(pm, [LLVM.name(entry)])

        can_throw(job) || add!(pm, ModulePass("LowerThrow", lower_throw!))

        add_lowering_passes!(job, pm)

        run!(pm, mod)

        # NOTE: if an optimization is missing, try scheduling an entirely new optimization
        # to see which passes need to be added to the target-specific list
        #LLVM.clopts("-print-after-all", "-filter-print-funcs=$(LLVM.name(entry))")
        #ModulePassManager() do pm
        #    add_library_info!(pm, triple(mod))
        #    add_transform_info!(pm, tm)
        #    PassManagerBuilder() do pmb
        #        populate!(pm, pmb)
        #    end
        #    run!(pm, mod)
        #end
    end

    return mod, entry
end


## name mangling

# we generate function names that look like C++ functions, because many NVIDIA tools
# support them, e.g., grouping different instantiations of the same kernel together.

# mangle a single type parameter in an Itanium-like fashion
function mangle_param(t)
    t == Nothing && return "v"

    if isa(t, DataType) || isa(t, Core.Function)
        tn = safe_name(t)
        str = "$(length(tn))$tn"

        # only DataTypes carry type parameters; function objects have no `parameters`
        # field, so guard the access to avoid erroring on them
        if isa(t, DataType) && !isempty(t.parameters)
            str *= "I"
            for t in t.parameters
                str *= mangle_param(t)
            end
            str *= "E"
        end

        str
    elseif isa(t, Integer)
        # literal integer parameter
        "Li$(t)E"
    else
        tn = safe_name(t)
        "$(length(tn))$tn"
    end
end

# mangle a call to `f` with argument types `tt` into a C++-style symbol name
function mangle_call(f, tt)
    fn = safe_name(f)
    str = "_Z$(length(fn))$fn"

    for t in tt.parameters
        str *= mangle_param(t)
    end

    return str
end

# make names safe for ptxas
safe_name(fn::String) = replace(fn, r"[^A-Za-z0-9_]"=>"_")
safe_name(f::Union{Core.Function,DataType}) = safe_name(String(nameof(f)))
safe_name(f::LLVM.Function) = safe_name(LLVM.name(f))
safe_name(x) = safe_name(repr(x))


## exception handling

# this pass lowers `jl_throw` and friends to GPU-compatible exceptions.
# this isn't strictly necessary, but has a couple of advantages:
# - we can kill off unused exception arguments that otherwise would allocate or invoke
# - we can fake debug information (lacking a stack unwinder)
#
# once we have thorough inference (ie. discarding `@nospecialize` and thus supporting
# exception arguments) and proper debug info to unwind the stack, this pass can go.
function lower_throw!(mod::LLVM.Module)
    job = current_job::CompilerJob
    changed = false
    @timeit_debug to "lower throw" begin

    throw_functions = Dict{String,String}(
        "jl_throw"                      => "exception",
        "jl_error"                      => "error",
        "jl_too_few_args"               => "too few arguments exception",
        "jl_too_many_args"              => "too many arguments exception",
        "jl_type_error"                 => "type error",
        "jl_type_error_rt"              => "type error",
        "jl_undefined_var_error"        => "undefined variable error",
        "jl_bounds_error"               => "bounds error",
        "jl_bounds_error_v"             => "bounds error",
        "jl_bounds_error_int"           => "bounds error",
        "jl_bounds_error_tuple_int"     => "bounds error",
        "jl_bounds_error_unboxed_int"   => "bounds error",
        "jl_bounds_error_ints"          => "bounds error",
        "jl_eof_error"                  => "EOF error"
    )

    for (fn, name) in throw_functions
        if haskey(functions(mod), fn)
            f = functions(mod)[fn]

            for use in uses(f)
                call = user(use)::LLVM.CallInst

                # replace the throw with a PTX-compatible exception
                let builder = Builder(JuliaContext())
                    position!(builder, call)
                    emit_exception!(builder, name, call)
                    dispose(builder)
                end

                # remove the call
                call_args = collect(operands(call))[1:end-1] # last arg is function itself
                unsafe_delete!(LLVM.parent(call), call)

                # HACK: kill the exceptions' unused arguments
                for arg in call_args
                    # peek through casts
                    if isa(arg, LLVM.AddrSpaceCastInst)
                        cast = arg
                        arg = first(operands(cast))
                        isempty(uses(cast)) && unsafe_delete!(LLVM.parent(cast), cast)
                    end

                    if isa(arg, LLVM.Instruction) && isempty(uses(arg))
                        unsafe_delete!(LLVM.parent(arg), arg)
                    end
                end

                changed = true
            end

            @compiler_assert isempty(uses(f)) job
        end
    end

    end
    return changed
end

# report an exception in a GPU-compatible manner
#
# the exact behavior depends on the debug level. in all cases, a `trap` will be emitted. On
# debug level 1, the exception name will be printed, and on debug level 2 the individual
# stack frames (as recovered from the LLVM debug information) will be printed as well.
function emit_exception!(builder, name, inst)
    job = current_job::CompilerJob
    bb = position(builder)
    fun = LLVM.parent(bb)
    mod = LLVM.parent(fun)

    # report the exception
    if Base.JLOptions().debug_level >= 1
        name = globalstring_ptr!(builder, name, "exception")
        if Base.JLOptions().debug_level == 1
            call!(builder, Runtime.get(:report_exception), [name])
        else
            call!(builder, Runtime.get(:report_exception_name), [name])
        end
    end

    # report each frame
    if Base.JLOptions().debug_level >= 2
        rt = Runtime.get(:report_exception_frame)
        bt = backtrace(inst)
        for (i,frame) in enumerate(bt)
            idx = ConstantInt(rt.llvm_types[1], i)
            func = globalstring_ptr!(builder, String(frame.func), "di_func")
            file = globalstring_ptr!(builder, String(frame.file), "di_file")
            line = ConstantInt(rt.llvm_types[4], frame.line)
            call!(builder, rt, [idx, func, file, line])
        end
    end

    # signal the exception
    call!(builder, Runtime.get(:signal_exception))

    emit_trap!(job, builder, mod, inst)
end

# emit a call to `llvm.trap`, declaring the intrinsic if the module lacks it
function emit_trap!(job::CompilerJob, builder, mod, inst)
    trap = if haskey(functions(mod), "llvm.trap")
        functions(mod)["llvm.trap"]
    else
        LLVM.Function(mod, "llvm.trap", LLVM.FunctionType(LLVM.VoidType(JuliaContext())))
    end
    call!(builder, trap)
end


## kernel promotion

# promote a function to a kernel
function promote_kernel!(job::CompilerJob, mod::LLVM.Module, kernel::LLVM.Function)
    # pass non-opaque pointer arguments by value (this improves performance,
    # and is mandated by certain back-ends like SPIR-V). only do so for values
    # that aren't a Julia pointer, so we can still pass those directly.
    kernel_ft = eltype(llvmtype(kernel)::LLVM.PointerType)::LLVM.FunctionType
    kernel_sig = Base.signature_type(job.source.f, job.source.tt)::Type
    kernel_types = filter(dt->!isghosttype(dt) &&
                              (VERSION < v"1.5.0-DEV.581" || !Core.Compiler.isconstType(dt)),
                          [kernel_sig.parameters...])
    @compiler_assert length(kernel_types) == length(parameters(kernel_ft)) job
    for (i, (param_ft,arg_typ)) in enumerate(zip(parameters(kernel_ft), kernel_types))
        if param_ft isa LLVM.PointerType && issized(eltype(param_ft)) &&
           !(arg_typ <: Ptr) && !(VERSION >= v"1.5" && arg_typ <: Core.LLVMPtr)
            push!(parameter_attributes(kernel, i), EnumAttribute("byval"))
        end
    end

    # target-specific processing
    kernel = process_kernel!(job, mod, kernel)

    return kernel
end
--------------------------------------------------------------------------------