├── codecov.yml ├── .github └── workflows │ └── TagBot.yml ├── .travis.yml ├── test ├── runtests.jl ├── util.jl ├── definitions │ ├── gcn.jl │ ├── native.jl │ └── ptx.jl ├── gcn.jl ├── ptx.jl └── native.jl ├── Project.toml ├── src ├── native.jl ├── utils.jl ├── GPUCompiler.jl ├── execution.jl ├── debug.jl ├── error.jl ├── mcgen.jl ├── cache.jl ├── interface.jl ├── optim.jl ├── rtlib.jl ├── runtime.jl ├── reflection.jl ├── driver.jl ├── validation.jl ├── gcn.jl ├── ptx.jl └── irgen.jl ├── README.md ├── Manifest.toml └── .gitlab-ci.yml /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | patch: false 4 | project: false 5 | changes: false 6 | -------------------------------------------------------------------------------- /.github/workflows/TagBot.yml: -------------------------------------------------------------------------------- 1 | name: TagBot 2 | on: 3 | schedule: 4 | - cron: 0 * * * * 5 | jobs: 6 | TagBot: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: JuliaRegistries/TagBot@v1 10 | with: 11 | token: ${{ secrets.GITHUB_TOKEN }} 12 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: julia 2 | 3 | os: 4 | - linux 5 | - osx 6 | - windows 7 | 8 | julia: 9 | - 1.3 10 | - 1.4 11 | - 1.5 12 | - nightly 13 | 14 | jobs: 15 | allow_failures: 16 | - julia: nightly 17 | 18 | notifications: 19 | email: false 20 | 21 | codecov: true 22 | 23 | branches: 24 | only: 25 | - master 26 | -------------------------------------------------------------------------------- /test/runtests.jl: -------------------------------------------------------------------------------- 1 | using Test, Base.CoreLogging 2 | import Base.CoreLogging: Info 3 | 4 | using GPUCompiler 5 | 6 | using LLVM, LLVM.Interop 7 | 8 | include("util.jl") 9 | 10 | @testset "GPUCompiler" begin 11 | 12 | 
GPUCompiler.reset_runtime() 13 | 14 | GPUCompiler.enable_timings() 15 | 16 | include("native.jl") 17 | include("ptx.jl") 18 | if !parse(Bool, get(ENV, "CI_ASSERTS", "false")) && VERSION < v"1.4" 19 | include("gcn.jl") 20 | end 21 | 22 | haskey(ENV, "CI") && GPUCompiler.timings() 23 | 24 | end 25 | -------------------------------------------------------------------------------- /Project.toml: -------------------------------------------------------------------------------- 1 | name = "GPUCompiler" 2 | uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" 3 | authors = ["Tim Besard "] 4 | version = "0.4.0" 5 | 6 | [deps] 7 | DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" 8 | InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" 9 | LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" 10 | Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" 11 | TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" 12 | UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" 13 | 14 | [compat] 15 | DataStructures = "0.15, 0.16, 0.17" 16 | LLVM = "1.4.0" 17 | TimerOutputs = "0.5" 18 | julia = "1.3" 19 | 20 | [extras] 21 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 22 | 23 | [targets] 24 | test = ["Test"] 25 | -------------------------------------------------------------------------------- /src/native.jl: -------------------------------------------------------------------------------- 1 | # native target for CPU execution 2 | 3 | ## target 4 | 5 | export NativeCompilerTarget 6 | 7 | Base.@kwdef struct NativeCompilerTarget <: AbstractCompilerTarget 8 | cpu::String=(LLVM.version() < v"8") ? "" : unsafe_string(LLVM.API.LLVMGetHostCPUName()) 9 | features::String=(LLVM.version() < v"8") ? 
"" : unsafe_string(LLVM.API.LLVMGetHostCPUFeatures()) 10 | end 11 | 12 | llvm_triple(::NativeCompilerTarget) = Sys.MACHINE 13 | 14 | function llvm_machine(target::NativeCompilerTarget) 15 | triple = llvm_triple(target) 16 | 17 | t = Target(triple) 18 | 19 | tm = TargetMachine(t, triple, target.cpu, target.features) 20 | asm_verbosity!(tm, true) 21 | 22 | return tm 23 | end 24 | 25 | 26 | ## job 27 | 28 | runtime_slug(job::CompilerJob{NativeCompilerTarget}) = "native_$(job.target.cpu)-$(hash(job.target.features))" 29 | -------------------------------------------------------------------------------- /src/utils.jl: -------------------------------------------------------------------------------- 1 | export tbaa_make_child 2 | 3 | function tbaa_make_child(name::String, constant::Bool=false; ctx::LLVM.Context=JuliaContext()) 4 | tbaa_root = MDNode([MDString("gputbaa", ctx)], ctx) 5 | tbaa_struct_type = 6 | MDNode([MDString("gputbaa_$name", ctx), 7 | tbaa_root, 8 | LLVM.ConstantInt(0, ctx)], ctx) 9 | tbaa_access_tag = 10 | MDNode([tbaa_struct_type, 11 | tbaa_struct_type, 12 | LLVM.ConstantInt(0, ctx), 13 | LLVM.ConstantInt(constant ? 
module GPUCompiler

using LLVM
using LLVM.Interop

using DataStructures

using TimerOutputs

using Libdl

# global timer used to instrument the compiler; see `timings`/`enable_timings`
const to = TimerOutput()

# print the accumulated compilation timings to stdout
timings() = (TimerOutputs.print_timer(to); println())

# opt in to debug-level timing instrumentation for this module
enable_timings() = (TimerOutputs.enable_debug_timings(GPUCompiler); return)

include("utils.jl")

# compiler interface and implementations
# NOTE: include order matters — interface definitions must precede the targets
# that implement them, and the runtime builds on those targets.
include("interface.jl")
include("error.jl")
include("native.jl")
include("ptx.jl")
include("gcn.jl")

include("runtime.jl")

# compiler implementation
include("irgen.jl")
include("optim.jl")
include("validation.jl")
include("rtlib.jl")
include("mcgen.jl")
include("debug.jl")
include("driver.jl")

# other reusable functionality
include("cache.jl")
include("execution.jl")
include("reflection.jl")

function __init__()
    TimerOutputs.reset_timer!(to)
    # register every LLVM back-end so any of the supported targets can be used
    InitializeAllTargets()
    InitializeAllTargetInfos()
    InitializeAllAsmPrinters()
    InitializeAllAsmParsers()
    InitializeAllTargetMCs()

    return
end

end # module
|:--------------------------------------------------------------------------------------------------:|:-------------------------------:| 7 | | [![][gitlab-img]][gitlab-url] [![][travis-img]][travis-url] [![PkgEval][pkgeval-img]][pkgeval-url] | [![][codecov-img]][codecov-url] | 8 | 9 | [gitlab-img]: https://gitlab.com/JuliaGPU/GPUCompiler.jl/badges/master/pipeline.svg 10 | [gitlab-url]: https://gitlab.com/JuliaGPU/GPUCompiler.jl/commits/master 11 | 12 | [travis-img]: https://api.travis-ci.com/JuliaGPU/GPUCompiler.jl.svg?branch=master 13 | [travis-url]: https://travis-ci.com/JuliaGPU/GPUCompiler.jl 14 | 15 | [pkgeval-img]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/G/GPUCompiler.svg 16 | [pkgeval-url]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/G/GPUCompiler.html 17 | 18 | [codecov-img]: https://codecov.io/gh/JuliaGPU/GPUCompiler.jl/branch/master/graph/badge.svg 19 | [codecov-url]: https://codecov.io/gh/JuliaGPU/GPUCompiler.jl 20 | 21 | This package offers reusable compiler infrastructure and tooling for 22 | implementing GPU compilers in Julia. **It is not intended for end users!** 23 | Instead, you should use one of the packages that builds on GPUCompiler.jl, such 24 | as [CUDAnative](https://github.com/JuliaGPU/CUDAnative.jl). 25 | -------------------------------------------------------------------------------- /test/util.jl: -------------------------------------------------------------------------------- 1 | # @test_throw, with additional testing for the exception message 2 | macro test_throws_message(f, typ, ex...) 
# helper function for sinking a value to prevent the callee from getting optimized away:
# the value makes a round-trip through a volatile stack slot, which the optimizer
# may not elide, while the input is returned unchanged.
@inline sink(i::T) where T <: Union{Int32,UInt32} =
    Base.llvmcall("""%slot = alloca i32
                     store volatile i32 %0, i32* %slot
                     %value = load volatile i32, i32* %slot
                     ret i32 %value""", T, Tuple{T}, i)
# 64-bit variant of the same volatile round-trip
@inline sink(i::T) where T <: Union{Int64,UInt64} =
    Base.llvmcall("""%slot = alloca i64
                     store volatile i64 %0, i64* %slot
                     %value = load volatile i64, i64* %slot
                     ret i64 %value""", T, Tuple{T}, i)
# split keyword argument expressions into groups. returns vectors of keyword
# argument expressions, one more than the number of groups: keywords matching
# none of the requested groups end up in the trailing vector.
# intended for use in macros; the resulting groups can be used in expressions.
#
# throws `ArgumentError` for arguments that are not `key = value` expressions
# or whose key is not a plain symbol.
function split_kwargs(kwargs, kw_groups...)
    kwarg_groups = ntuple(_->[], length(kw_groups) + 1)
    for kwarg in kwargs
        # decode
        Meta.isexpr(kwarg, :(=)) || throw(ArgumentError("non-keyword argument like option '$kwarg'"))
        key, val = kwarg.args
        isa(key, Symbol) || throw(ArgumentError("non-symbolic keyword '$key'"))

        # find a matching group; default to the extra trailing vector so that
        # unmatched keywords are reported separately from the last real group
        # (the original `length(kw_groups)` default wrongly merged them into it).
        group = length(kwarg_groups)
        for (i, kws) in enumerate(kw_groups)
            if key in kws
                group = i
                break
            end
        end
        push!(kwarg_groups[group], kwarg)
    end

    return kwarg_groups
end
11 | source = FunctionSpec(func, Base.to_tuple_type(types), kernel) 12 | target = GCNCompilerTarget("gfx900") 13 | params = TestCompilerParams() 14 | CompilerJob(target, source, params), kwargs 15 | end 16 | 17 | function gcn_code_typed(@nospecialize(func), @nospecialize(types); kwargs...) 18 | job, kwargs = gcn_job(func, types; kwargs...) 19 | GPUCompiler.code_typed(job; kwargs...) 20 | end 21 | 22 | function gcn_code_warntype(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 23 | job, kwargs = gcn_job(func, types; kwargs...) 24 | GPUCompiler.code_warntype(io, job; kwargs...) 25 | end 26 | 27 | function gcn_code_llvm(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 28 | job, kwargs = gcn_job(func, types; kwargs...) 29 | GPUCompiler.code_llvm(io, job; kwargs...) 30 | end 31 | 32 | function gcn_code_native(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 33 | job, kwargs = gcn_job(func, types; kwargs...) 34 | GPUCompiler.code_native(io, job; kwargs...) 35 | end 36 | 37 | # aliases without ::IO argument 38 | for method in (:code_warntype, :code_llvm, :code_native) 39 | gcn_method = Symbol("gcn_$(method)") 40 | @eval begin 41 | $gcn_method(@nospecialize(func), @nospecialize(types); kwargs...) = 42 | $gcn_method(stdout, func, types; kwargs...) 43 | end 44 | end 45 | 46 | # simulates codegen for a kernel function: validates by default 47 | function gcn_code_execution(@nospecialize(func), @nospecialize(types); kwargs...) 48 | job, kwargs = gcn_job(func, types; kernel=true, kwargs...) 49 | GPUCompiler.compile(:asm, job; kwargs...) 
50 | end 51 | -------------------------------------------------------------------------------- /Manifest.toml: -------------------------------------------------------------------------------- 1 | # This file is machine-generated - editing it directly is not advised 2 | 3 | [[Base64]] 4 | uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" 5 | 6 | [[CEnum]] 7 | git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" 8 | uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" 9 | version = "0.4.1" 10 | 11 | [[DataStructures]] 12 | deps = ["InteractiveUtils", "OrderedCollections"] 13 | git-tree-sha1 = "73eb18320fe3ba58790c8b8f6f89420f0a622773" 14 | uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" 15 | version = "0.17.11" 16 | 17 | [[InteractiveUtils]] 18 | deps = ["Markdown"] 19 | uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" 20 | 21 | [[LLVM]] 22 | deps = ["CEnum", "Libdl", "Printf", "Unicode"] 23 | git-tree-sha1 = "e2ef4155563e7d72790e70817cff7caae7b106a4" 24 | repo-rev = "37794e110bfbe6b4e204c0c7916e7dae45774f2e" 25 | repo-url = "https://github.com/maleadt/LLVM.jl.git" 26 | uuid = "929cbde3-209d-540e-8aea-75f648917ca0" 27 | version = "1.5.1" 28 | 29 | [[Libdl]] 30 | uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" 31 | 32 | [[Markdown]] 33 | deps = ["Base64"] 34 | uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" 35 | 36 | [[OrderedCollections]] 37 | git-tree-sha1 = "12ce190210d278e12644bcadf5b21cbdcf225cd3" 38 | uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" 39 | version = "1.2.0" 40 | 41 | [[Printf]] 42 | deps = ["Unicode"] 43 | uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" 44 | 45 | [[Random]] 46 | deps = ["Serialization"] 47 | uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" 48 | 49 | [[SHA]] 50 | uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" 51 | 52 | [[Serialization]] 53 | uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" 54 | 55 | [[TimerOutputs]] 56 | deps = ["Printf"] 57 | git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03" 58 | uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" 59 | 
version = "0.5.3" 60 | 61 | [[UUIDs]] 62 | deps = ["Random", "SHA"] 63 | uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" 64 | 65 | [[Unicode]] 66 | uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" 67 | -------------------------------------------------------------------------------- /test/definitions/native.jl: -------------------------------------------------------------------------------- 1 | using GPUCompiler 2 | 3 | if !@isdefined(TestRuntime) 4 | include("../util.jl") 5 | end 6 | 7 | 8 | # create a native test compiler, and generate reflection methods for it 9 | 10 | function native_job(@nospecialize(func), @nospecialize(types); kernel::Bool=false, kwargs...) 11 | source = FunctionSpec(func, Base.to_tuple_type(types), kernel) 12 | target = NativeCompilerTarget() 13 | params = TestCompilerParams() 14 | CompilerJob(target, source, params), kwargs 15 | end 16 | 17 | function native_code_typed(@nospecialize(func), @nospecialize(types); kwargs...) 18 | job, kwargs = native_job(func, types; kwargs...) 19 | GPUCompiler.code_typed(job; kwargs...) 20 | end 21 | 22 | function native_code_warntype(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 23 | job, kwargs = native_job(func, types; kwargs...) 24 | GPUCompiler.code_warntype(io, job; kwargs...) 25 | end 26 | 27 | function native_code_llvm(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 28 | job, kwargs = native_job(func, types; kwargs...) 29 | GPUCompiler.code_llvm(io, job; kwargs...) 30 | end 31 | 32 | function native_code_native(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 33 | job, kwargs = native_job(func, types; kwargs...) 34 | GPUCompiler.code_native(io, job; kwargs...) 35 | end 36 | 37 | # aliases without ::IO argument 38 | for method in (:code_warntype, :code_llvm, :code_native) 39 | native_method = Symbol("native_$(method)") 40 | @eval begin 41 | $native_method(@nospecialize(func), @nospecialize(types); kwargs...) = 42 | $native_method(stdout, func, types; kwargs...) 
43 | end 44 | end 45 | 46 | # simulates codegen for a kernel function: validates by default 47 | function native_code_execution(@nospecialize(func), @nospecialize(types); kwargs...) 48 | job, kwargs = native_job(func, types; kernel=true, kwargs...) 49 | GPUCompiler.compile(:asm, job; kwargs...) 50 | end 51 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | include: 2 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v6.yml' 3 | 4 | 5 | # LLVM with assertions 6 | 7 | asserts:1.3: 8 | extends: 9 | - .julia:source 10 | - .test 11 | variables: 12 | CI_CLONE_ARGS: '-b v1.3.1' 13 | CI_BUILD_ARGS: 'BINARYBUILDER_LLVM_ASSERTS=1 LLVM_BB_URL_BASE=https://github.com/staticfloat/LLVMBuilder/releases/download/v6.0.1-7+nowasm/' 14 | CI_ASSERTS: 'true' 15 | 16 | # TODO: upgrade to 1.4.1 once it has LLVM+asserts artifacts 17 | asserts:1.4: 18 | extends: 19 | - .julia:source 20 | - .test 21 | variables: 22 | CI_CLONE_ARGS: '-b v1.4.0' 23 | CI_BUILD_ARGS: 'BINARYBUILDER_LLVM_ASSERTS=1' 24 | CI_ASSERTS: 'true' 25 | 26 | # TODO: add 1.5.0 once it has LLVM+asserts artifacts 27 | 28 | 29 | # CUDA.jl 30 | 31 | .test_cuda: 32 | extends: .test 33 | variables: 34 | JULIA_NUM_THREADS: '2' 35 | JULIA_CUDA_USE_BINARYBUILDER: 'false' # reduce CI network traffic 36 | script: 37 | - julia -e 'using Pkg; 38 | Pkg.develop(PackageSpec(path=pwd())); 39 | Pkg.build();' 40 | - julia -e 'using Pkg; 41 | Pkg.add(PackageSpec(name="CUDA", rev="master")); 42 | Pkg.test("CUDA");' 43 | 44 | cuda:1.4: 45 | extends: 46 | - .julia:1.4 47 | - .test_cuda 48 | tags: 49 | - nvidia 50 | allow_failure: true 51 | 52 | 53 | # AMDGPUnative.jl 54 | 55 | .test_amdgpunative: 56 | extends: .test 57 | image: rocm/dev-ubuntu-18.04 58 | script: 59 | - julia -e 'using Pkg; 60 | Pkg.develop(PackageSpec(path=pwd())); 61 | Pkg.build();' 62 | - julia -e 'using Pkg; 63 | 
# generate a pseudo-backtrace from LLVM IR instruction debug information
#
# this works by looking up the debug information of the instruction, and inspecting the call
# sites of the containing function. if there's only one, repeat the process from that call.
# finally, the debug information is converted to a Julia stack trace.
#
# returns the (mutated) vector of StackFrames in `bt`.
function backtrace(inst::LLVM.Instruction, bt = StackTraces.StackFrame[])
    name = Ref{Cstring}()
    filename = Ref{Cstring}()
    line = Ref{Cuint}()
    col = Ref{Cuint}()

    # look up the debug information from the current instruction,
    # walking the inlining chain one `depth` level at a time
    depth = 0
    while LLVM.API.LLVMGetSourceLocation(LLVM.ref(inst), depth, name, filename, line, col) == 1
        # strip a trailing ';' from the demangled name before building the frame
        frame = StackTraces.StackFrame(replace(unsafe_string(name[]), r";$"=>""),
                                       unsafe_string(filename[]), line[])
        push!(bt, frame)
        depth += 1
    end

    # move up the call chain
    f = LLVM.parent(LLVM.parent(inst))
    ## functions can be used as a *value* in eg. constant expressions, so filter those out
    callers = filter(val -> isa(user(val), LLVM.CallInst), collect(uses(f)))
    ## get rid of calls without debug info
    filter!(callers) do call
        md = metadata(user(call))
        haskey(md, LLVM.MD_dbg)
    end
    if !isempty(callers)
        # figure out the call sites of this instruction
        call_sites = unique(callers) do call
            # there could be multiple calls, originating from the same source location
            md = metadata(user(call))
            md[LLVM.MD_dbg]
        end

        if length(call_sites) > 1
            # ambiguous: record a sentinel frame instead of guessing a caller
            frame = StackTraces.StackFrame("multiple call sites", "unknown", 0)
            push!(bt, frame)
        elseif length(call_sites) == 1
            # single call site: recurse into the caller to extend the trace
            backtrace(user(first(call_sites)), bt)
        end
    end

    return bt
end
28 | GPUCompiler.code_warntype(io, job; kwargs...) 29 | end 30 | 31 | function ptx_code_llvm(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 32 | job, kwargs = ptx_job(func, types; kwargs...) 33 | GPUCompiler.code_llvm(io, job; kwargs...) 34 | end 35 | 36 | function ptx_code_native(io::IO, @nospecialize(func), @nospecialize(types); kwargs...) 37 | job, kwargs = ptx_job(func, types; kwargs...) 38 | GPUCompiler.code_native(io, job; kwargs...) 39 | end 40 | 41 | # aliases without ::IO argument 42 | for method in (:code_warntype, :code_llvm, :code_native) 43 | ptx_method = Symbol("ptx_$(method)") 44 | @eval begin 45 | $ptx_method(@nospecialize(func), @nospecialize(types); kwargs...) = 46 | $ptx_method(stdout, func, types; kwargs...) 47 | end 48 | end 49 | 50 | # simulates codegen for a kernel function: validates by default 51 | function ptx_code_execution(@nospecialize(func), @nospecialize(types); kwargs...) 52 | job, kwargs = ptx_job(func, types; kernel=true, kwargs...) 53 | GPUCompiler.compile(:asm, job; kwargs...) 54 | end 55 | -------------------------------------------------------------------------------- /src/error.jl: -------------------------------------------------------------------------------- 1 | # error handling 2 | 3 | export KernelError, InternalCompilerError 4 | 5 | struct KernelError <: Exception 6 | job::CompilerJob 7 | message::String 8 | help::Union{Nothing,String} 9 | bt::StackTraces.StackTrace 10 | 11 | KernelError(job::CompilerJob, message::String, help=nothing; 12 | bt=StackTraces.StackTrace()) = 13 | new(job, message, help, bt) 14 | end 15 | 16 | function Base.showerror(io::IO, err::KernelError) 17 | println(io, "GPU compilation of ", err.job.source, " failed") 18 | println(io, "KernelError: $(err.message)") 19 | println(io) 20 | println(io, something(err.help, "Try inspecting the generated code with any of the @device_code_... 
macros.")) 21 | Base.show_backtrace(io, err.bt) 22 | end 23 | 24 | 25 | struct InternalCompilerError <: Exception 26 | job::CompilerJob 27 | message::String 28 | meta::Dict 29 | InternalCompilerError(job, message; kwargs...) = new(job, message, kwargs) 30 | end 31 | 32 | function Base.showerror(io::IO, err::InternalCompilerError) 33 | println(io, """GPUCompiler.jl encountered an unexpected internal error. 34 | Please file an issue attaching the following information, including the backtrace, 35 | as well as a reproducible example (if possible).""") 36 | 37 | println(io, "\nInternalCompilerError: $(err.message)") 38 | 39 | println(io, "\nCompiler invocation: ", err.job) 40 | 41 | if !isempty(err.meta) 42 | println(io, "\nAdditional information:") 43 | for (key,val) in err.meta 44 | println(io, " - $key = $(repr(val))") 45 | end 46 | end 47 | 48 | let Pkg = Base.require(Base.PkgId(Base.UUID((0x44cfe95a1eb252ea, 0xb672e2afdf69b78f)), "Pkg")) 49 | println(io, "\nInstalled packages:") 50 | for (pkg,ver) in Pkg.installed() 51 | println(io, " - $pkg = $(repr(ver))") 52 | end 53 | end 54 | 55 | println(io) 56 | versioninfo(io) 57 | end 58 | 59 | macro compiler_assert(ex, job, kwargs...) 60 | msg = "$ex, at $(__source__.file):$(__source__.line)" 61 | return :($(esc(ex)) ? $(nothing) 62 | : throw(InternalCompilerError($(esc(job)), $msg; 63 | $(map(esc, kwargs)...))) 64 | ) 65 | end 66 | -------------------------------------------------------------------------------- /src/mcgen.jl: -------------------------------------------------------------------------------- 1 | # machine code generation 2 | 3 | # final preparations for the module to be compiled to PTX 4 | # these passes should not be run when e.g. compiling to write to disk. 
# final preparations for the module to be compiled to PTX
# these passes should not be run when e.g. compiling to write to disk.
#
# runs a small pass pipeline over `mod`: resolve ephemeral CPU references,
# then clean up with global DCE and dead-prototype stripping.
function prepare_execution!(job::CompilerJob, mod::LLVM.Module)
    let pm = ModulePassManager()
        # the custom pass reads its job through this global (passes get no context argument)
        global current_job
        current_job = job

        try
            global_optimizer!(pm)

            add!(pm, ModulePass("ResolveCPUReferences", resolve_cpu_references!))

            global_dce!(pm)
            strip_dead_prototypes!(pm)

            run!(pm, mod)
        finally
            # make sure the pass manager is released even if a pass throws
            dispose(pm)
        end
    end

    return
end
# generated function that crafts a custom code info to call the actual cufunction impl.
# this gives us the flexibility to insert manual back edges for automatic recompilation.
#
# we also increment a global specialization counter and pass it along to index the cache.

# monotonically increasing id, unique per specialization of this generated function
specialization_counter = 0

@generated function cached_compilation(driver::Core.Function, spec::FunctionSpec{f,tt},
                                       env::UInt=zero(UInt); kwargs...) where {f,tt}

    # get a hold of the method and code info of the kernel function
    sig = Tuple{f, tt.parameters...}
    mthds = _methods_by_ftype(sig, -1, typemax(UInt))
    Base.isdispatchtuple(tt) || return(:(error("$tt is not a dispatch tuple")))
    length(mthds) == 1 || return (:(throw(MethodError(spec.f,spec.tt))))
    mtypes, msp, m = mthds[1]
    mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance}, (Any, Any, Any), m, mtypes, msp)
    ci = retrieve_code_info(mi)
    @assert isa(ci, CodeInfo)

    # generate a unique id to represent this specialization
    global specialization_counter
    id = UInt(specialization_counter += 1)
    # TODO: save the mi/ci here (or embed it in the AST to pass to cufunction)
    #       and use that to drive compilation

    # prepare a new code info, emptied of the original method's body
    new_ci = copy(ci)
    empty!(new_ci.code)
    empty!(new_ci.codelocs)
    resize!(new_ci.linetable, 1)                # codegen assumes at least one entry on <1.5
    empty!(new_ci.ssaflags)
    new_ci.ssavaluetypes = 0
    new_ci.edges = MethodInstance[mi]
    # XXX: setting this edge does not give us proper method invalidation, see
    #      JuliaLang/julia#34962 which demonstrates we also need to "call" the kernel.
    #      invoking `code_llvm` also does the necessary codegen, as does calling the
    #      underlying C methods -- which GPUCompiler does, so everything Just Works.

    # prepare the slots: 1-2 are the keyword-call machinery, 3 is #self#,
    # 4-6 are this function's arguments.
    # NOTE(review): slot 6 is *named* :id but holds the `env` argument — looks
    # like a cosmetic naming mismatch; confirm before relying on slot names.
    new_ci.slotnames = Symbol[:kwfunc, :kwargs, Symbol("#self#"), :driver, :spec, :id]
    new_ci.slotflags = UInt8[0x00 for i = 1:6]
    kwargs = SlotNumber(2)
    driver = SlotNumber(4)
    spec = SlotNumber(5)
    env = SlotNumber(6)

    # call the compiler: kwfunc(check_cache)(merged_kwargs, check_cache, driver, spec, hash(env, id))
    append!(new_ci.code, [Expr(:call, Core.kwfunc, check_cache),
                          Expr(:call, merge, NamedTuple(), kwargs),
                          Expr(:call, hash, env, id),
                          Expr(:call, SSAValue(1), SSAValue(2), check_cache, driver, spec, SSAValue(3)),
                          Expr(:return, SSAValue(4))])
    append!(new_ci.codelocs, [0, 0, 0, 0, 0])
    new_ci.ssavaluetypes += 5

    return new_ci
end
## target

export AbstractCompilerTarget

# container for state handled by targets defined in GPUCompiler.jl

abstract type AbstractCompilerTarget end

# the LLVM target triple for this target; every concrete target must implement this.
llvm_triple(::AbstractCompilerTarget) = error("Not implemented")

# construct an LLVM TargetMachine for this target, based on its triple.
# asm verbosity is enabled so emitted assembly carries comments; targets that need
# a specific CPU or feature string should override this method.
function llvm_machine(target::AbstractCompilerTarget)
    triple = llvm_triple(target)

    t = Target(triple)

    tm = TargetMachine(t, triple)
    asm_verbosity!(tm, true)

    return tm
end

# the data layout implied by this target's machine
llvm_datalayout(target::AbstractCompilerTarget) = DataLayout(llvm_machine(target))


## params

export AbstractCompilerParams

# container for state handled by external users of GPUCompiler.jl

abstract type AbstractCompilerParams end


## function specification

export FunctionSpec

# what we'll be compiling

struct FunctionSpec{F,TT}
    f::Base.Callable
    tt::DataType
    kernel::Bool
    name::Union{Nothing,String}
end

# put the function and argument types in typevars
# so that we can access it from generated functions
FunctionSpec(f, tt=Tuple{}, kernel=true, name=nothing) =
    FunctionSpec{typeof(f),tt}(f, tt, kernel, name)

# render a human-readable `name(argtypes...)` string for this spec, preferring
# the user-provided name over the function's own name.
function signature(spec::FunctionSpec)
    fn = something(spec.name, nameof(spec.f))
    # NOTE: previously this recomputed the join inline, leaving `args` unused
    args = join(spec.tt.parameters, ", ")
    return "$fn($args)"
end

function Base.show(io::IO, spec::FunctionSpec)
    spec.kernel ? print(io, "kernel ") : print(io, "function ")
    print(io, signature(spec))
end


## job

export CompilerJob

# a specific invocation of the compiler, bundling everything needed to generate code

Base.@kwdef struct CompilerJob{T,P}
    target::T
    source::FunctionSpec
    params::P

    CompilerJob(target::AbstractCompilerTarget, source::FunctionSpec, params::AbstractCompilerParams) =
        new{typeof(target), typeof(params)}(target, source, params)
end

# derive a job that compiles a different function with the same target and params
Base.similar(job::CompilerJob, source::FunctionSpec) =
    CompilerJob(target=job.target, source=source, params=job.params)

function Base.show(io::IO, job::CompilerJob{T}) where {T}
    print(io, "CompilerJob of ", job.source, " for ", T)
end


## interfaces and fallback definitions

# the Julia module to look up target-specific runtime functions in (this includes both
# target-specific functions from the GPU runtime library, like `malloc`, but also
# replacements functions for operations like `Base.sin`)
runtime_module(::CompilerJob) = error("Not implemented")

# check if a function is an intrinsic that can assumed to be always available
isintrinsic(::CompilerJob, fn::String) = false

# does this target support throwing Julia exceptions with jl_throw?
# if not, calls to throw will be replaced with calls to the GPU runtime
can_throw(::CompilerJob) = false

# generate a string that represents the type of compilation, for selecting a compiled
# instance of the runtime library.
# this slug should encode everything that affects
# the generated code of this compiler job (with exception of the function source)
runtime_slug(::CompilerJob) = error("Not implemented")

# early processing of the newly generated LLVM IR module
process_module!(::CompilerJob, mod::LLVM.Module) = return

# early processing of the newly identified LLVM kernel function
process_kernel!(::CompilerJob, mod::LLVM.Module, kernel::LLVM.Function) = return kernel

# final processing of the IR module, right before validation and machine-code generation
finish_module!(::CompilerJob, mod::LLVM.Module) = return

# hook for targets to register lowering passes on the given pass manager
add_lowering_passes!(::CompilerJob, pm::LLVM.PassManager) = return

# hook for targets to register optimization passes on the given pass manager
add_optimization_passes!(::CompilerJob, pm::LLVM.PassManager) = return

# hook for targets to link their device libraries to resolve `undefined_fns` in `mod`
link_libraries!(::CompilerJob, mod::LLVM.Module, undefined_fns::Vector{String}) = return
--------------------------------------------------------------------------------
/src/optim.jl:
--------------------------------------------------------------------------------
# LLVM IR optimization

# run the Julia-level, intrinsic-lowering, target-specific, and interprocedural
# optimization pipelines (in that order) on `mod`.
function optimize!(job::CompilerJob, mod::LLVM.Module)
    tm = llvm_machine(job.target)

    function initialize!(pm)
        add_library_info!(pm, triple(mod))
        add_transform_info!(pm, tm)
    end

    # stash the job in a global so that the pass callbacks below (which only
    # receive a function/module) can get at it
    global current_job
    current_job = job

    # Julia-specific optimizations
    #
    # NOTE: we need to use multiple distinct pass managers to force pass ordering;
    # intrinsics should never get lowered before Julia has optimized them.

    ModulePassManager() do pm
        initialize!(pm)
        ccall(:jl_add_optimization_passes, Cvoid,
              (LLVM.API.LLVMPassManagerRef, Cint, Cint),
              LLVM.ref(pm), Base.JLOptions().opt_level, #=lower_intrinsics=# 0)
        run!(pm, mod)
    end

    ModulePassManager() do pm
        initialize!(pm)

        # lower intrinsics
        add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!))
        aggressive_dce!(pm) # remove dead uses of ptls
        add!(pm, ModulePass("LowerPTLS", lower_ptls!))

        # the Julia GC lowering pass also has some clean-up that is required
        late_lower_gc_frame!(pm)

        remove_julia_addrspaces!(pm)

        run!(pm, mod)
    end

    # target-specific optimizations
    ModulePassManager() do pm
        initialize!(pm)

        # Julia's operand bundles confuse the inliner, so repeat here now they are gone.
        # FIXME: we should fix the inliner so that inlined code gets optimized early-on
        always_inliner!(pm)

        add_optimization_passes!(job, pm)

        run!(pm, mod)
    end

    # we compile a module containing the entire call graph,
    # so perform some interprocedural optimizations.
    #
    # for some reason, these passes need to be distinct from the regular optimization chain,
    # or certain values (such as the constant arrays used to populate llvm.compiler.used as
    # part of the LateLowerGCFrame pass) aren't collected properly.
    #
    # these might not always be safe, as Julia's IR metadata isn't designed for IPO.
    ModulePassManager() do pm
        dead_arg_elimination!(pm)   # parent doesn't use return value --> ret void

        run!(pm, mod)
    end

    return
end


## lowering intrinsics

# lower object allocations to PTX malloc
#
# this is a PoC implementation that is very simple: allocate, and never free. it also runs
# _before_ Julia's GC lowering passes, so we don't get to use the results of its analyses.
# when we ever implement a more potent GC, we will need those results, but the relevant pass
# is currently very architecture/CPU specific: hard-coded pool sizes, TLS references, etc.
# such IR is hard to clean-up, so we probably will need to have the GC lowering pass emit
# lower-level intrinsics which then can be lowered to architecture-specific code.
#
# returns `true` if any instruction in `fun`'s module was rewritten.
function lower_gc_frame!(fun::LLVM.Function)
    job = current_job::CompilerJob
    mod = LLVM.parent(fun)
    changed = false

    # plain alloc: rewrite every call to `julia.gc_alloc_obj` into a call to the
    # GPU runtime's pool allocator
    if haskey(functions(mod), "julia.gc_alloc_obj")
        alloc_obj = functions(mod)["julia.gc_alloc_obj"]
        alloc_obj_ft = eltype(llvmtype(alloc_obj))
        T_prjlvalue = return_type(alloc_obj_ft)
        T_pjlvalue = convert(LLVMType, Any, true)

        for use in uses(alloc_obj)
            call = user(use)::LLVM.CallInst

            # decode the call
            ops = collect(operands(call))
            # NOTE(review): operand 2 presumably is the allocation size -- confirm
            # against the intrinsic's definition in julia/src/llvm-late-gc-lowering.cpp
            sz = ops[2]

            # replace with PTX alloc_obj
            let builder = Builder(JuliaContext())
                position!(builder, call)
                ptr = call!(builder, Runtime.get(:gc_pool_alloc), [sz])
                replace_uses!(call, ptr)
                dispose(builder)
            end

            unsafe_delete!(LLVM.parent(call), call)

            changed = true
        end

        # every use should have been rewritten above
        @compiler_assert isempty(uses(alloc_obj)) job
    end

    # we don't care about write barriers (the PoC scheme above allocates and never
    # frees, so there is nothing for a barrier to track): drop all calls
    if haskey(functions(mod), "julia.write_barrier")
        barrier = functions(mod)["julia.write_barrier"]

        for use in uses(barrier)
            call = user(use)::LLVM.CallInst
            unsafe_delete!(LLVM.parent(call), call)
            changed = true
        end

        @compiler_assert isempty(uses(barrier)) job
    end

    return changed
end

# lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible.
136 | # 137 | # this assumes and checks that the TLS is unused, which should be the case for most GPU code 138 | # after lowering the GC intrinsics to TLS-less code and having run DCE. 139 | # 140 | # TODO: maybe don't have Julia emit actual uses of the TLS, but use intrinsics instead, 141 | # making it easier to remove or reimplement that functionality here. 142 | function lower_ptls!(mod::LLVM.Module) 143 | job = current_job::CompilerJob 144 | changed = false 145 | 146 | if haskey(functions(mod), "julia.ptls_states") 147 | ptls_getter = functions(mod)["julia.ptls_states"] 148 | 149 | for use in uses(ptls_getter) 150 | val = user(use) 151 | if !isempty(uses(val)) 152 | error("Thread local storage is not implemented") 153 | end 154 | unsafe_delete!(LLVM.parent(val), val) 155 | changed = true 156 | end 157 | 158 | @compiler_assert isempty(uses(ptls_getter)) job 159 | end 160 | 161 | return changed 162 | end 163 | -------------------------------------------------------------------------------- /src/rtlib.jl: -------------------------------------------------------------------------------- 1 | # compiler support for working with run-time libraries 2 | 3 | link_library!(mod::LLVM.Module, lib::LLVM.Module) = link_library!(mod, [lib]) 4 | function link_library!(mod::LLVM.Module, libs::Vector{LLVM.Module}) 5 | # linking is destructive, so copy the libraries 6 | libs = [LLVM.Module(lib) for lib in libs] 7 | 8 | for lib in libs 9 | link!(mod, lib) 10 | end 11 | end 12 | 13 | 14 | # 15 | # GPU run-time library 16 | # 17 | 18 | const libcache = Dict{String, LLVM.Module}() 19 | 20 | # get the path to a directory where we can put cache files (machine-specific, ephemeral) 21 | # NOTE: maybe we should use XDG_CACHE_PATH/%LOCALAPPDATA%, but other Julia cache files 22 | # are put in .julia anyway so let's just follow suit for now. 23 | function cachedir(depot=DEPOT_PATH[1]) 24 | # this mimicks Base.compilecache. 
we can't just call the function, or we might actually 25 | # _generate_ a cache file, e.g., when running with `--compiled-modules=no`. 26 | if VERSION >= v"1.3.0-alpha.146" 27 | entrypath, entryfile = Base.cache_file_entry(Base.PkgId(GPUCompiler)) 28 | abspath(depot, entrypath, entryfile) 29 | else 30 | cachefile = abspath(depot, Base.cache_file_entry(Base.PkgId(GPUCompiler))) 31 | 32 | # the cachefile consists of `/depot/compiled/vXXX/GPUCompiler/$slug.ji` 33 | # transform that into `/depot/compiled/vXXX/GPUCompiler/$slug/` 34 | splitext(cachefile)[1] 35 | end 36 | end 37 | 38 | 39 | ## higher-level functionality to work with runtime functions 40 | 41 | function LLVM.call!(builder, rt::Runtime.RuntimeMethodInstance, args=LLVM.Value[]) 42 | bb = position(builder) 43 | f = LLVM.parent(bb) 44 | mod = LLVM.parent(f) 45 | 46 | # get or create a function prototype 47 | if haskey(functions(mod), rt.llvm_name) 48 | f = functions(mod)[rt.llvm_name] 49 | ft = eltype(llvmtype(f)) 50 | else 51 | ft = LLVM.FunctionType(rt.llvm_return_type, rt.llvm_types) 52 | f = LLVM.Function(mod, rt.llvm_name, ft) 53 | end 54 | 55 | # runtime functions are written in Julia, while we're calling from LLVM, 56 | # this often results in argument type mismatches. try to fix some here. 
57 | for (i,arg) in enumerate(args) 58 | if llvmtype(arg) != parameters(ft)[i] 59 | if (llvmtype(arg) isa LLVM.PointerType) && 60 | (parameters(ft)[i] isa LLVM.IntegerType) 61 | # Julia pointers are passed as integers 62 | args[i] = ptrtoint!(builder, args[i], parameters(ft)[i]) 63 | else 64 | error("Don't know how to convert ", arg, " argument to ", parameters(ft)[i]) 65 | end 66 | end 67 | end 68 | 69 | call!(builder, f, args) 70 | end 71 | 72 | 73 | ## functionality to build the runtime library 74 | 75 | function emit_function!(mod, job::CompilerJob, f, method) 76 | tt = Base.to_tuple_type(method.types) 77 | new_mod, entry = codegen(:llvm, similar(job, FunctionSpec(f, tt, #=kernel=# false)); 78 | optimize=false, libraries=false) 79 | if return_type(eltype(llvmtype(entry))) != method.llvm_return_type 80 | error("Invalid return type for runtime function '$(method.name)': expected $(method.llvm_return_type), got $(return_type(eltype(llvmtype(entry))))") 81 | end 82 | 83 | # recent Julia versions include prototypes for all runtime functions, even if unused 84 | if VERSION >= v"1.5-" 85 | pm = ModulePassManager() 86 | strip_dead_prototypes!(pm) 87 | run!(pm, new_mod) 88 | dispose(pm) 89 | end 90 | 91 | temp_name = LLVM.name(entry) 92 | link!(mod, new_mod) 93 | entry = functions(mod)[temp_name] 94 | 95 | # if a declaration already existed, replace it with the function to avoid aliasing 96 | # (and getting function names like gpu_signal_exception1) 97 | name = method.llvm_name 98 | if haskey(functions(mod), name) 99 | decl = functions(mod)[name] 100 | @assert llvmtype(decl) == llvmtype(entry) 101 | replace_uses!(decl, entry) 102 | unsafe_delete!(mod, decl) 103 | end 104 | LLVM.name!(entry, name) 105 | end 106 | 107 | function build_runtime(job::CompilerJob) 108 | mod = LLVM.Module("GPUCompiler run-time library", JuliaContext()) 109 | 110 | for method in values(Runtime.methods) 111 | def = if isa(method.def, Symbol) 112 | isdefined(runtime_module(job), method.def) || 
continue 113 | getfield(runtime_module(job), method.def) 114 | else 115 | method.def 116 | end 117 | emit_function!(mod, job, def, method) 118 | end 119 | 120 | optimize!(job, mod) 121 | 122 | mod 123 | end 124 | 125 | function load_runtime(job::CompilerJob) 126 | # find the first existing cache directory (for when dealing with layered depots) 127 | cachedirs = [cachedir(depot) for depot in DEPOT_PATH] 128 | filter!(isdir, cachedirs) 129 | input_dir = if isempty(cachedirs) 130 | nothing 131 | else 132 | first(cachedirs) 133 | end 134 | 135 | # we are only guaranteed to be able to write in the current depot 136 | output_dir = cachedir() 137 | 138 | # if both aren't equal, copy pregenerated runtime libraries to our depot 139 | # NOTE: we don't just lazily read from the one and write to the other, because 140 | # once we generate additional runtimes in the output dir we don't know if 141 | # it's safe to load from other layers (since those could have been invalidated) 142 | if input_dir !== nothing && input_dir != output_dir 143 | mkpath(dirname(output_dir)) 144 | cp(input_dir, output_dir) 145 | end 146 | 147 | slug = runtime_slug(job) 148 | name = "runtime_$(slug).bc" 149 | path = joinpath(output_dir, name) 150 | 151 | get!(libcache, path) do 152 | if ispath(path) 153 | open(path) do io 154 | parse(LLVM.Module, read(io), JuliaContext()) 155 | end 156 | else 157 | @debug "Building the GPU runtime library at $path" 158 | mkpath(output_dir) 159 | lib = build_runtime(job) 160 | open(path, "w") do io 161 | write(io, lib) 162 | end 163 | lib 164 | end 165 | end 166 | end 167 | 168 | # remove the existing cache 169 | # NOTE: call this function from global scope, so any change triggers recompilation. 170 | function reset_runtime() 171 | rm(cachedir(); recursive=true, force=true) 172 | # create an empty cache directory. since we only ever load from the first existing cachedir, 173 | # this effectively invalidates preexisting caches in lower layers of the depot. 
174 | mkpath(cachedir()) 175 | 176 | # wipe the cache so we can use this function at run-time too 177 | empty!(libcache) 178 | 179 | return 180 | end 181 | -------------------------------------------------------------------------------- /test/gcn.jl: -------------------------------------------------------------------------------- 1 | @testset "GCN" begin 2 | 3 | include("definitions/gcn.jl") 4 | 5 | ############################################################################################ 6 | 7 | @testset "IR" begin 8 | 9 | @testset "kernel calling convention" begin 10 | kernel() = return 11 | 12 | ir = sprint(io->gcn_code_llvm(io, kernel, Tuple{}; dump_module=true)) 13 | @test !occursin("amdgpu_kernel", ir) 14 | 15 | ir = sprint(io->gcn_code_llvm(io, kernel, Tuple{}; 16 | dump_module=true, kernel=true)) 17 | @test occursin("amdgpu_kernel", ir) 18 | end 19 | 20 | end 21 | 22 | ############################################################################################ 23 | 24 | @testset "assembly" begin 25 | 26 | @testset "skip scalar trap" begin 27 | workitem_idx_x() = ccall("llvm.amdgcn.workitem.id.x", llvmcall, Int32, ()) 28 | trap() = ccall("llvm.trap", llvmcall, Nothing, ()) 29 | function kernel() 30 | if workitem_idx_x() > 1 31 | trap() 32 | end 33 | return 34 | end 35 | 36 | asm = sprint(io->gcn_code_native(io, kernel, Tuple{})) 37 | @test occursin("s_trap 2", asm) 38 | @test_broken occursin("s_cbranch_execz", asm) 39 | if Base.libllvm_version < v"9" 40 | @test_broken occursin("v_readfirstlane", asm) 41 | end 42 | end 43 | 44 | @testset "child functions" begin 45 | # we often test using @noinline child functions, so test whether these survive 46 | # (despite not having side-effects) 47 | @noinline child(i) = sink(i) 48 | function parent(i) 49 | child(i) 50 | return 51 | end 52 | 53 | asm = sprint(io->gcn_code_native(io, parent, Tuple{Int64})) 54 | @test occursin(r"s_add_u32.*julia_child_.*@rel32@lo\+4", asm) 55 | @test 
occursin(r"s_addc_u32.*julia_child_.*@rel32@hi\+4", asm) 56 | end 57 | 58 | @testset "kernel functions" begin 59 | @noinline nonentry(i) = sink(i) 60 | function entry(i) 61 | nonentry(i) 62 | return 63 | end 64 | 65 | asm = sprint(io->gcn_code_native(io, entry, Tuple{Int64}; kernel=true)) 66 | @test occursin(r"\.amdgpu_hsa_kernel .*julia_entry", asm) 67 | @test !occursin(r"\.amdgpu_hsa_kernel .*julia_nonentry", asm) 68 | @test occursin(r"\.type.*julia_nonentry_\d*,@function", asm) 69 | end 70 | 71 | @testset "child function reuse" begin 72 | # bug: depending on a child function from multiple parents resulted in 73 | # the child only being present once 74 | 75 | @noinline child(i) = sink(i) 76 | function parent1(i) 77 | child(i) 78 | return 79 | end 80 | 81 | asm = sprint(io->gcn_code_native(io, parent1, Tuple{Int})) 82 | @test occursin(r"\.type.*julia__\d*_child_\d*,@function", asm) 83 | 84 | function parent2(i) 85 | child(i+1) 86 | return 87 | end 88 | 89 | asm = sprint(io->gcn_code_native(io, parent2, Tuple{Int})) 90 | @test occursin(r"\.type.*julia__\d*_child_\d*,@function", asm) 91 | end 92 | 93 | @testset "child function reuse bis" begin 94 | # bug: similar, but slightly different issue as above 95 | # in the case of two child functions 96 | @noinline child1(i) = sink(i) 97 | @noinline child2(i) = sink(i+1) 98 | function parent1(i) 99 | child1(i) + child2(i) 100 | return 101 | end 102 | gcn_code_native(devnull, parent1, Tuple{Int}) 103 | 104 | function parent2(i) 105 | child1(i+1) + child2(i+1) 106 | return 107 | end 108 | gcn_code_native(devnull, parent2, Tuple{Int}) 109 | end 110 | 111 | @testset "indirect sysimg function use" begin 112 | # issue #9: re-using sysimg functions should force recompilation 113 | # (host fldmod1->mod1 throws, so the GCN code shouldn't contain a throw) 114 | 115 | # NOTE: Int32 to test for #49 116 | 117 | function kernel(out) 118 | wid, lane = fldmod1(unsafe_load(out), Int32(32)) 119 | unsafe_store!(out, wid) 120 | return 121 | 
end 122 | 123 | asm = sprint(io->gcn_code_native(io, kernel, Tuple{Ptr{Int32}})) 124 | @test !occursin("jl_throw", asm) 125 | @test !occursin("jl_invoke", asm) # forced recompilation should still not invoke 126 | end 127 | 128 | @testset "LLVM intrinsics" begin 129 | # issue #13 (a): cannot select trunc 130 | function kernel(x) 131 | unsafe_trunc(Int, x) 132 | return 133 | end 134 | gcn_code_native(devnull, kernel, Tuple{Float64}) 135 | end 136 | 137 | @test_broken "exception arguments" 138 | #= FIXME: _ZNK4llvm14TargetLowering20scalarizeVectorStoreEPNS_11StoreSDNodeERNS_12SelectionDAGE 139 | @testset "exception arguments" begin 140 | function kernel(a) 141 | unsafe_store!(a, trunc(Int, unsafe_load(a))) 142 | return 143 | end 144 | 145 | gcn_code_native(devnull, kernel, Tuple{Ptr{Float64}}) 146 | end 147 | =# 148 | 149 | @test_broken "GC and TLS lowering" 150 | #= FIXME: in function julia_inner_18528 void (%jl_value_t addrspace(10)*): invalid addrspacecast 151 | @testset "GC and TLS lowering" begin 152 | @eval mutable struct PleaseAllocate 153 | y::Csize_t 154 | end 155 | 156 | # common pattern in Julia 0.7: outlined throw to avoid a GC frame in the calling code 157 | @noinline function inner(x) 158 | sink(x.y) 159 | nothing 160 | end 161 | 162 | function kernel(i) 163 | inner(PleaseAllocate(Csize_t(42))) 164 | nothing 165 | end 166 | 167 | asm = sprint(io->gcn_code_native(io, kernel, Tuple{Int})) 168 | @test occursin("gpu_gc_pool_alloc", asm) 169 | 170 | # make sure that we can still ellide allocations 171 | function ref_kernel(ptr, i) 172 | data = Ref{Int64}() 173 | data[] = 0 174 | if i > 1 175 | data[] = 1 176 | else 177 | data[] = 2 178 | end 179 | unsafe_store!(ptr, data[], i) 180 | return nothing 181 | end 182 | 183 | asm = sprint(io->gcn_code_native(io, ref_kernel, Tuple{Ptr{Int64}, Int})) 184 | 185 | 186 | if VERSION < v"1.2.0-DEV.375" 187 | @test_broken !occursin("gpu_gc_pool_alloc", asm) 188 | else 189 | @test !occursin("gpu_gc_pool_alloc", asm) 190 | 
end 191 | end 192 | =# 193 | 194 | @testset "float boxes" begin 195 | function kernel(a,b) 196 | c = Int32(a) 197 | # the conversion to Int32 may fail, in which case the input Float32 is boxed in order to 198 | # pass it to the @nospecialize exception constructor. we should really avoid that (eg. 199 | # by avoiding @nospecialize, or optimize the unused arguments away), but for now the box 200 | # should just work. 201 | unsafe_store!(b, c) 202 | return 203 | end 204 | 205 | ir = sprint(io->gcn_code_llvm(io, kernel, Tuple{Float32,Ptr{Float32}})) 206 | @test occursin("jl_box_float32", ir) 207 | gcn_code_native(devnull, kernel, Tuple{Float32,Ptr{Float32}}) 208 | end 209 | 210 | end 211 | 212 | ############################################################################################ 213 | 214 | end 215 | -------------------------------------------------------------------------------- /src/runtime.jl: -------------------------------------------------------------------------------- 1 | # GPU runtime library 2 | # 3 | # This module defines method instances that will be compiled into a target-specific image 4 | # and will be available to the GPU compiler to call after Julia has generated code. 5 | # 6 | # Most functions implement, or are used to support Julia runtime functions that are expected 7 | # by the Julia compiler to be available at run time, e.g., to dynamically allocate memory, 8 | # box values, etc. 9 | 10 | module Runtime 11 | 12 | using ..GPUCompiler 13 | using LLVM 14 | using LLVM.Interop 15 | 16 | 17 | ## representation of a runtime method instance 18 | 19 | struct RuntimeMethodInstance 20 | # either a function defined here, or a symbol to fetch a target-specific definition 21 | def::Union{Function,Symbol} 22 | 23 | return_type::Type 24 | types::Tuple 25 | name::Symbol 26 | 27 | # LLVM types cannot be cached, so we can't put them in the runtime method instance. 
28 | # the actual types are constructed upon accessing them, based on a sentinel value: 29 | # - nothing: construct the LLVM type based on its Julia counterparts 30 | # - function: call this generator to get the type (when more control is needed) 31 | llvm_return_type::Union{Nothing, Function} 32 | llvm_types::Union{Nothing, Function} 33 | llvm_name::String 34 | end 35 | 36 | function Base.getproperty(rt::RuntimeMethodInstance, field::Symbol) 37 | value = getfield(rt, field) 38 | if field == :llvm_types 39 | if value == nothing 40 | LLVMType[convert.(LLVMType, typ) for typ in rt.types] 41 | else 42 | value() 43 | end 44 | elseif field == :llvm_return_type 45 | if value == nothing 46 | convert(LLVMType, rt.return_type) 47 | else 48 | value() 49 | end 50 | else 51 | return value 52 | end 53 | end 54 | 55 | const methods = Dict{Symbol,RuntimeMethodInstance}() 56 | function get(name::Symbol) 57 | if !haskey(methods, name) 58 | display(methods) 59 | end 60 | methods[name] 61 | end 62 | 63 | # Register a Julia function `def` as a runtime library function identified by `name`. The 64 | # function will be compiled upon first use for argument types `types` and should return 65 | # `return_type`. Use `Runtime.get(name)` to get a reference to this method instance. 66 | # 67 | # The corresponding LLVM types `llvm_types` and `llvm_return_type` will be deduced from 68 | # their Julia counterparts. To influence that conversion, pass a callable object instead; 69 | # this object will be evaluated at run-time and the returned value will be used instead. 70 | # 71 | # When generating multiple runtime functions from a single definition, make sure to specify 72 | # different values for `name`. The LLVM function name will be deduced from that name, but 73 | # you can always specify `llvm_name` to influence that. Never use an LLVM name that starts 74 | # with `julia_` or the function might clash with other compiled functions. 
function compile(def, return_type, types, llvm_return_type=nothing, llvm_types=nothing;
                 name=isa(def,Symbol) ? def : nameof(def), llvm_name="gpu_$name")
    meth = RuntimeMethodInstance(def,
                                 return_type, types, name,
                                 llvm_return_type, llvm_types, llvm_name)
    # registering the same name twice is a programming error, not a run-time condition
    if haskey(methods, name)
        error("Runtime function $name has already been registered!")
    end
    methods[name] = meth

    # FIXME: if the function is a symbol, implying it will be specified by the target,
    # we won't be able to call this function here or we'll get UndefVarErrors.
    # work around that by generating an llvmcall stub. can we do better by
    # using the new nonrecursive codegen to handle function lookup ourselves?
    if def isa Symbol
        # fresh argument names, one per declared parameter type
        args = [gensym() for typ in types]
        @eval @inline $def($(args...)) =
            ccall($"extern $llvm_name", llvmcall, $return_type, ($(types...),), $(args...))
    end

    return
end


## exception handling

# expected functions for exception signalling
compile(:signal_exception, Nothing, ())

# expected functions for simple exception handling
compile(:report_exception, Nothing, (Ptr{Cchar},))
compile(:report_oom, Nothing, (Csize_t,))

# expected functions for verbose exception handling
compile(:report_exception_frame, Nothing, (Cint, Ptr{Cchar}, Ptr{Cchar}, Cint))
compile(:report_exception_name, Nothing, (Ptr{Cchar},))

# NOTE: no throw functions are provided here, but replaced by an LLVM pass instead
# in order to provide some debug information without stack unwinding.

## GC

if VERSION < v"1.4"

# Julia's GC-related LLVM address spaces (mirrors julia/src/llvm-gc-*.cpp)
@enum AddressSpace begin
    Generic = 1
    Tracked = 10
    Derived = 11
    CalleeRooted = 12
    Loaded = 13
end

# LLVM type of a tracked pointer
function T_prjlvalue()
    T_pjlvalue = convert(LLVMType, Any, true)
    LLVM.PointerType(eltype(T_pjlvalue), Tracked)
end

else

# FIXME: once we only support 1.4, get rid of this and allow boxed types
T_prjlvalue() = convert(LLVMType, Any, true)

end

# allocate `sz` bytes from the runtime's `malloc`; reports and throws on OOM.
# never frees -- see the PoC GC lowering in optim.jl.
function gc_pool_alloc(sz::Csize_t)
    ptr = malloc(sz)
    if ptr == C_NULL
        report_oom(sz)
        throw(OutOfMemoryError())
    end
    return unsafe_pointer_to_objref(ptr)
end

compile(gc_pool_alloc, Any, (Csize_t,), T_prjlvalue)

# expected functions for GC support
compile(:malloc, Ptr{Nothing}, (Csize_t,))


## boxing and unboxing

# the type-tag word that precedes every boxed value
const tag_type = UInt
const tag_size = sizeof(tag_type)

const gc_bits = 0x3 # FIXME

# get the type tag of a type at run-time
@generated function type_tag(::Val{type_name}) where type_name
    T_tag = convert(LLVMType, tag_type)
    T_ptag = LLVM.PointerType(T_tag)

    T_pjlvalue = convert(LLVMType, Any, true)

    # create function
    llvm_f, _ = create_function(T_tag)
    mod = LLVM.parent(llvm_f)

    # this isn't really a function, but we abuse it to get the JIT to resolve the address
    # of the `jl_<type_name>_type` global
    typ = LLVM.Function(mod, "jl_" * String(type_name) * "_type",
                        LLVM.FunctionType(T_pjlvalue))

    # generate IR: read the tag word at the resolved type address
    Builder(JuliaContext()) do builder
        entry = BasicBlock(llvm_f, "entry", JuliaContext())
        position!(builder, entry)

        typ_var = bitcast!(builder, typ, T_ptag)

        tag = load!(builder, typ_var)

        ret!(builder, tag)
    end

    call_function(llvm_f, tag_type)
end

# we use `jl_value_ptr`, a Julia pseudo-intrinsic that can be used to box and unbox values

# box a bits value: allocate tag+payload, write the run-time type tag, write the
# value, and reinterpret the payload pointer as an object reference.
@generated function box(val, ::Val{type_name}) where type_name
    sz = sizeof(val)
    allocsz = sz + tag_size

    # type-tags are ephemeral, so look them up at run time
    #tag = unsafe_load(convert(Ptr{tag_type}, type_name))
    tag = :( type_tag(Val(type_name)) )

    quote
        Base.@_inline_meta

        ptr = malloc($(Csize_t(allocsz)))

        # store the type tag
        ptr = convert(Ptr{tag_type}, ptr)
        Core.Intrinsics.pointerset(ptr, $tag | $gc_bits, #=index=# 1, #=align=# $tag_size)

        # store the value
        ptr = convert(Ptr{$val}, ptr+tag_size)
        Core.Intrinsics.pointerset(ptr, val, #=index=# 1, #=align=# $sz)

        unsafe_pointer_to_objref(ptr)
    end
end

# read a `T` back out of a boxed object's payload
@inline function unbox(obj, ::Type{T}) where T
    ptr = ccall(:jl_value_ptr, Ptr{Cvoid}, (Any,), obj)

    # load the value
    ptr = convert(Ptr{T}, ptr)
    Core.Intrinsics.pointerref(ptr, #=index=# 1, #=align=# sizeof(T))
end

# generate box/unbox functions that exist in the Julia runtime (see julia/src/datatype.c)
for (T, t) in [Int8 => :int8, Int16 => :int16, Int32 => :int32, Int64 => :int64,
               UInt8 => :uint8, UInt16 => :uint16, UInt32 => :uint32, UInt64 => :uint64,
               Bool => :bool, Float32 => :float32, Float64 => :float64]
    box_fn = Symbol("box_$t")
    unbox_fn = Symbol("unbox_$t")
    @eval begin
        $box_fn(val) = box($T(val), Val($(QuoteNode(t))))
        $unbox_fn(obj) = unbox(obj, $T)

        # the llvm_name matches the runtime symbol these functions replace
        compile($box_fn, Any, ($T,), T_prjlvalue; llvm_name=$"jl_$box_fn")
        compile($unbox_fn, $T, (Any,); llvm_name=$"jl_$unbox_fn")
    end
end


end
--------------------------------------------------------------------------------
/src/reflection.jl:
--------------------------------------------------------------------------------
using InteractiveUtils, UUIDs
const Cthulhu = Base.PkgId(UUID("f68482b8-f384-11e8-15f7-abe071a5a75f"), "Cthulhu")


#
# code_* replacements
#

# Lowered code for the job's function/signature, via InteractiveUtils.
code_lowered(job::CompilerJob; kwargs...) =
    InteractiveUtils.code_lowered(job.source.f, job.source.tt; kwargs...)

function code_typed(job::CompilerJob; interactive::Bool=false, kwargs...)
    # TODO: use the compiler driver to get the Julia method instance (we might rewrite it)
    if interactive
        # call Cthulhu without introducing a dependency on Cthulhu
        mod = get(Base.loaded_modules, Cthulhu, nothing)
        mod === nothing && error("Interactive code reflection requires Cthulhu; please install and load this package first.")
        descend_code_typed = getfield(mod, :descend_code_typed)
        descend_code_typed(job.source.f, job.source.tt; kwargs...)
    else
        InteractiveUtils.code_typed(job.source.f, job.source.tt; kwargs...)
    end
end

function code_warntype(io::IO, job::CompilerJob; interactive::Bool=false, kwargs...)
    # TODO: use the compiler driver to get the Julia method instance (we might rewrite it)
    if interactive
        # Cthulhu's descend only writes to stdout, so an explicit io makes no sense here
        @assert io == stdout
        # call Cthulhu without introducing a dependency on Cthulhu
        mod = get(Base.loaded_modules, Cthulhu, nothing)
        mod === nothing && error("Interactive code reflection requires Cthulhu; please install and load this package first.")
        descend_code_warntype = getfield(mod, :descend_code_warntype)
        descend_code_warntype(job.source.f, job.source.tt; kwargs...)
    else
        InteractiveUtils.code_warntype(io, job.source.f, job.source.tt; kwargs...)
    end
end
code_warntype(job::CompilerJob; kwargs...) = code_warntype(stdout, job; kwargs...)

"""
    code_llvm([io], job; optimize=true, raw=false, debuginfo=:default, dump_module=false)

Prints the device LLVM IR generated for the given compiler job to `io` (default `stdout`).

The following keyword arguments are supported:

- `optimize`: determines if the code is optimized, which includes kernel-specific
  optimizations if `kernel` is true
- `raw`: return the raw IR including all metadata
- `debuginfo`: debug info verbosity, forwarded to `jl_dump_function_ir`
- `dump_module`: display the entire module instead of just the function

See also: [`@device_code_llvm`](@ref), `InteractiveUtils.code_llvm`
"""
function code_llvm(io::IO, job::CompilerJob; optimize::Bool=true, raw::Bool=false,
                   debuginfo::Symbol=:default, dump_module::Bool=false)
    # NOTE: jl_dump_function_ir supports stripping metadata, so don't do it in the driver
    ir, entry = GPUCompiler.codegen(:llvm, job; optimize=optimize, strip=false, validate=false)
    str = ccall(:jl_dump_function_ir, Ref{String},
                (Ptr{Cvoid}, Bool, Bool, Ptr{UInt8}),
                LLVM.ref(entry), !raw, dump_module, debuginfo)
    print(io, str)
end
code_llvm(job::CompilerJob; kwargs...) = code_llvm(stdout, job; kwargs...)

"""
    code_native([io], job; raw=false)

Prints the native assembly generated for the given compiler job to `io` (default `stdout`).

The following keyword arguments are supported:

- `raw`: return the raw code including all metadata

See also: [`@device_code_native`](@ref), `InteractiveUtils.code_native`
"""
function code_native(io::IO, job::CompilerJob; raw::Bool=false)
    asm, _ = GPUCompiler.codegen(:asm, job; strip=!raw, validate=false)
    print(io, asm)
end
# BUG FIX: this convenience method used to forward the undefined variables `func` and
# `types`, raising an UndefVarError on every call; forward `job` instead, mirroring
# the single-argument methods of `code_llvm` and `code_warntype` above.
code_native(job::CompilerJob; kwargs...) =
    code_native(stdout, job; kwargs...)


#
# @device_code_* functions
#

# Splice `user_code` into a block that installs `inner_hook` as the compile hook, so the
# hook runs for every kernel compiled while the user expression executes. All but the
# last element of `ex` are treated as keyword arguments and forwarded to the hook.
function emit_hooked_compilation(inner_hook, ex...)
    user_code = ex[end]
    user_kwargs = ex[1:end-1]
    quote
        # wipe the compile cache to force recompilation
        empty!(GPUCompiler.compilecache)

        local kernels = 0
        function outer_hook(job)
            kernels += 1
            $inner_hook(job; $(map(esc, user_kwargs)...))
        end

        if GPUCompiler.compile_hook[] !== nothing
            error("Chaining multiple @device_code calls is unsupported")
        end
        try
            GPUCompiler.compile_hook[] = outer_hook
            $(esc(user_code))
        finally
            GPUCompiler.compile_hook[] = nothing
        end

        if kernels == 0
            error("no kernels executed while evaluating the given expression")
        end

        nothing
    end
end

"""
    @device_code_lowered ex

Evaluates the expression `ex` and returns the result of
`InteractiveUtils.code_lowered` for every compiled GPU kernel.

See also: `InteractiveUtils.@code_lowered`
"""
macro device_code_lowered(ex...)
    quote
        buf = Any[]
        function hook(job::CompilerJob)
            append!(buf, code_lowered(job))
        end
        $(emit_hooked_compilation(:hook, ex...))
        buf
    end
end

"""
    @device_code_typed ex

Evaluates the expression `ex` and returns the result of
`InteractiveUtils.code_typed` for every compiled GPU kernel.

See also: `InteractiveUtils.@code_typed`
"""
macro device_code_typed(ex...)
    quote
        output = Dict{CompilerJob,Any}()
        function hook(job::CompilerJob)
            output[job] = code_typed(job)
        end
        $(emit_hooked_compilation(:hook, ex...))
        output
    end
end

"""
    @device_code_warntype [io::IO=stdout] ex

Evaluates the expression `ex` and prints the result of
`InteractiveUtils.code_warntype` to `io` for every compiled GPU kernel.

See also: `InteractiveUtils.@code_warntype`
"""
macro device_code_warntype(ex...)
    function hook(job::CompilerJob; io::IO=stdout, kwargs...)
        println(io, "$job")
        println(io)
        code_warntype(io, job; kwargs...)
    end
    emit_hooked_compilation(hook, ex...)
end

"""
    @device_code_llvm [io::IO=stdout, ...] ex

Evaluates the expression `ex` and prints the result of InteractiveUtils.code_llvm
to `io` for every compiled GPU kernel. For other supported keywords, see
[`GPUCompiler.code_llvm`](@ref).

See also: InteractiveUtils.@code_llvm
"""
macro device_code_llvm(ex...)
    function hook(job::CompilerJob; io::IO=stdout, kwargs...)
        println(io, "; $job")
        code_llvm(io, job; kwargs...)
    end
    emit_hooked_compilation(hook, ex...)
end

"""
    @device_code_native [io::IO=stdout, ...] ex

Evaluates the expression `ex` and prints the result of [`GPUCompiler.code_native`](@ref) to `io`
for every compiled GPU kernel. For other supported keywords, see
[`GPUCompiler.code_native`](@ref).
"""
macro device_code_native(ex...)
    function hook(job::CompilerJob; io::IO=stdout, kwargs...)
        println(io, "// $job")
        println(io)
        code_native(io, job; kwargs...)
    end
    emit_hooked_compilation(hook, ex...)
end

"""
    @device_code dir::AbstractString=... [...] ex

Evaluates the expression `ex` and dumps all intermediate forms of code to the directory
`dir`.
"""
macro device_code(ex...)
    only(xs) = (@assert length(xs) == 1; first(xs))
    localUnique = 1
    function hook(job::CompilerJob; dir::AbstractString)
        name = something(job.source.name, nameof(job.source.f))
        fn = "$(name)_$(localUnique)"
        mkpath(dir)

        open(joinpath(dir, "$fn.lowered.jl"), "w") do io
            code = only(code_lowered(job))
            println(io, code)
        end

        open(joinpath(dir, "$fn.typed.jl"), "w") do io
            if VERSION >= v"1.1.0"
                code = only(code_typed(job; debuginfo=:source))
            else
                code = only(code_typed(job))
            end
            println(io, code)
        end

        open(joinpath(dir, "$fn.unopt.ll"), "w") do io
            code_llvm(io, job; dump_module=true, raw=true, optimize=false)
        end

        open(joinpath(dir, "$fn.opt.ll"), "w") do io
            code_llvm(io, job; dump_module=true, raw=true)
        end

        open(joinpath(dir, "$fn.asm"), "w") do io
            code_native(io, job)
        end

        localUnique += 1
    end
    emit_hooked_compilation(hook, ex...)
end
--------------------------------------------------------------------------------
/test/ptx.jl:
--------------------------------------------------------------------------------
@testset "PTX" begin

include("definitions/ptx.jl")

############################################################################################

@testset "IR" begin

@testset "exceptions" begin
    foobar() = throw(DivideError())
    ir = sprint(io->ptx_code_llvm(io, foobar, Tuple{}))

    # plain exceptions should get lowered to a call to the GPU run-time
    @test occursin("gpu_report_exception", ir)
    # not a jl_throw referencing a jl_value_t representing the exception
    @test !occursin("jl_throw", ir)
end

@testset "kernel functions" begin
@testset "kernel argument attributes" begin
    kernel(x) = return

    @eval struct Aggregate
        x::Int
    end

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{Aggregate}))
    if VERSION < v"1.5.0-DEV.802"
        @test occursin(r"@.*julia_kernel.+\(({ i64 }|\[1 x i64\]) addrspace\(\d+\)?\*", ir)
    else
        @test occursin(r"@.*julia_kernel.+\(({ i64 }|\[1 x i64\])\*", ir)
    end

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{Aggregate}; kernel=true))
    if VERSION < v"1.5.0-DEV.802"
        @test occursin(r"@.*julia_kernel.+\(({ i64 }|\[1 x i64\]) addrspace\(\d+\)?\*.+byval", ir)
    else
        @test occursin(r"@.*julia_kernel.+\(({ i64 }|\[1 x i64\])\*.+byval", ir)
    end
end

@testset "property_annotations" begin
    kernel() = return

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{}; dump_module=true))
    @test !occursin("nvvm.annotations", ir)

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{};
                                  dump_module=true, kernel=true))
    @test occursin("nvvm.annotations", ir)
    @test !occursin("maxntid", ir)
    @test !occursin("reqntid", ir)
    @test !occursin("minctasm", ir)
    @test !occursin("maxnreg", ir)

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{};
                                  dump_module=true, kernel=true, maxthreads=42))
    @test occursin("maxntidx\", i32 42", ir)
    @test occursin("maxntidy\", i32 1", ir)
    @test occursin("maxntidz\", i32 1", ir)

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{};
                                  dump_module=true, kernel=true, minthreads=42))
    @test occursin("reqntidx\", i32 42", ir)
    @test occursin("reqntidy\", i32 1", ir)
    @test occursin("reqntidz\", i32 1", ir)

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{};
                                  dump_module=true, kernel=true, blocks_per_sm=42))
    @test occursin("minctasm\", i32 42", ir)

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{};
                                  dump_module=true, kernel=true, maxregs=42))
    @test occursin("maxnreg\", i32 42", ir)
end

LLVM.version() >= v"8" && @testset "calling convention" begin
    kernel() = return

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{}; dump_module=true))
    @test !occursin("ptx_kernel", ir)

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{};
                                  dump_module=true, kernel=true))
    @test occursin("ptx_kernel", ir)
end
end

end

############################################################################################

@testset "assembly" begin

@testset "child functions" begin
    # we often test using @noinline child functions, so test whether these survive
    # (despite not having side-effects)
    @noinline child(i) = sink(i)
    function parent(i)
        child(i)
        return
    end

    asm = sprint(io->ptx_code_native(io, parent, Tuple{Int64}))
    @test occursin(r"call.uni\s+julia_.*child_"m, asm)
end

@testset "kernel functions" begin
    @noinline nonentry(i) = sink(i)
    function entry(i)
        nonentry(i)
        return
    end

    asm = sprint(io->ptx_code_native(io, entry, Tuple{Int64};
                                     kernel=true))
    @test occursin(r"\.visible \.entry .*julia_entry", asm)
    @test !occursin(r"\.visible \.func .*julia_nonentry", asm)
    @test occursin(r"\.func .*julia_nonentry", asm)

    @testset "property_annotations" begin
        asm = sprint(io->ptx_code_native(io, entry, Tuple{Int64}; kernel=true))
        @test !occursin("maxntid", asm)

        asm = sprint(io->ptx_code_native(io, entry, Tuple{Int64};
                                         kernel=true, maxthreads=42))
        @test occursin(".maxntid 42, 1, 1", asm)

        asm = sprint(io->ptx_code_native(io, entry, Tuple{Int64};
                                         kernel=true, minthreads=42))
        @test occursin(".reqntid 42, 1, 1", asm)

        asm = sprint(io->ptx_code_native(io, entry, Tuple{Int64};
                                         kernel=true, blocks_per_sm=42))
        @test occursin(".minnctapersm 42", asm)

        if LLVM.version() >= v"4.0"
            asm = sprint(io->ptx_code_native(io, entry, Tuple{Int64};
                                             kernel=true, maxregs=42))
            @test occursin(".maxnreg 42", asm)
        end
    end
end

@testset "child function reuse" begin
    # bug: depending on a child function from multiple parents resulted in
    # the child only being present once

    @noinline child(i) = sink(i)
    function parent1(i)
        child(i)
        return
    end

    asm = sprint(io->ptx_code_native(io, parent1, Tuple{Int}))
    @test occursin(r".func julia_.*child_", asm)

    function parent2(i)
        child(i+1)
        return
    end

    asm = sprint(io->ptx_code_native(io, parent2, Tuple{Int}))
    @test occursin(r".func julia_.*child_", asm)
end

@testset "child function reuse bis" begin
    # bug: similar, but slightly different issue as above
    # in the case of two child functions
    @noinline child1(i) = sink(i)
    @noinline child2(i) = sink(i+1)
    function parent1(i)
        child1(i) + child2(i)
        return
    end
    ptx_code_native(devnull, parent1, Tuple{Int})

    function parent2(i)
        child1(i+1) + child2(i+1)
        return
    end
    ptx_code_native(devnull, parent2, Tuple{Int})
end

@testset "indirect sysimg function use" begin
    # issue #9: re-using sysimg functions should force recompilation
    # (host fldmod1->mod1 throws, so the PTX code shouldn't contain a throw)

    # NOTE: Int32 to test for #49

    function kernel(out)
        wid, lane = fldmod1(unsafe_load(out), Int32(32))
        unsafe_store!(out, wid)
        return
    end

    asm = sprint(io->ptx_code_native(io, kernel, Tuple{Ptr{Int32}}))
    @test !occursin("jl_throw", asm)
    @test !occursin("jl_invoke", asm) # forced recompilation should still not invoke
end

@testset "LLVM intrinsics" begin
    # issue #13 (a): cannot select trunc
    function kernel(x)
        unsafe_trunc(Int, x)
        return
    end
    ptx_code_native(devnull, kernel, Tuple{Float64})
end

@testset "exception arguments" begin
    function kernel(a)
        unsafe_store!(a, trunc(Int, unsafe_load(a)))
        return
    end

    ptx_code_native(devnull, kernel, Tuple{Ptr{Float64}})
end

@testset "GC and TLS lowering" begin
    @eval mutable struct PleaseAllocate
        y::Csize_t
    end

    # common pattern in Julia 0.7: outlined throw to avoid a GC frame in the calling code
    @noinline function inner(x)
        sink(x.y)
        nothing
    end

    function kernel(i)
        inner(PleaseAllocate(Csize_t(42)))
        nothing
    end

    asm = sprint(io->ptx_code_native(io, kernel, Tuple{Int}))
    @test occursin("gpu_gc_pool_alloc", asm)

    # make sure that we can still ellide allocations
    function ref_kernel(ptr, i)
        data = Ref{Int64}()
        data[] = 0
        if i > 1
            data[] = 1
        else
            data[] = 2
        end
        unsafe_store!(ptr, data[], i)
        return nothing
    end

    asm = sprint(io->ptx_code_native(io, ref_kernel, Tuple{Ptr{Int64}, Int}))


    if VERSION < v"1.2.0-DEV.375"
        @test_broken !occursin("gpu_gc_pool_alloc", asm)
    else
        @test !occursin("gpu_gc_pool_alloc", asm)
    end
end

@testset "float boxes" begin
    function kernel(a,b)
        c = Int32(a)
        # the conversion to Int32 may fail, in which case the input Float32 is boxed in order to
        # pass it to the @nospecialize exception constructor. we should really avoid that (eg.
        # by avoiding @nospecialize, or optimize the unused arguments away), but for now the box
        # should just work.
        unsafe_store!(b, c)
        return
    end

    ir = sprint(io->ptx_code_llvm(io, kernel, Tuple{Float32,Ptr{Float32}}))
    @test occursin("jl_box_float32", ir)
    ptx_code_native(devnull, kernel, Tuple{Float32,Ptr{Float32}})
end

end


############################################################################################

end
--------------------------------------------------------------------------------
/src/driver.jl:
--------------------------------------------------------------------------------
# compiler driver and main interface

# NOTE: the keyword arguments to compile/codegen control those aspects of compilation that
#       might have to be changed (e.g. set libraries=false when recursing, or set
#       strip=true for reflection). What remains defines the compilation job itself,
#       and those values are contained in the CompilerJob struct.

# (::CompilerJob)
const compile_hook = Ref{Union{Nothing,Function}}(nothing)

"""
    compile(target::Symbol, job::CompilerJob;
            libraries=true, deferred_codegen=true,
            optimize=true, strip=false, ...)

Compile a function `f` invoked with types `tt` for device capability `cap` to one of the
following formats as specified by the `target` argument: `:julia` for Julia IR, `:llvm` for
LLVM IR and `:asm` for machine code.

The following keyword arguments are supported:
- `libraries`: link the GPU runtime and `libdevice` libraries (if required)
- `deferred_codegen`: resolve deferred compiler invocations (if required)
- `optimize`: optimize the code (default: true)
- `strip`: strip non-functional metadata and debug information (default: false)
- `validate`: validate the generated IR before emitting machine code (default: true)

Other keyword arguments can be found in the documentation of [`cufunction`](@ref).
"""
function compile(target::Symbol, job::CompilerJob;
                 libraries::Bool=true, deferred_codegen::Bool=true,
                 optimize::Bool=true, strip::Bool=false, validate::Bool=true)
    if compile_hook[] !== nothing
        compile_hook[](job)
    end

    return codegen(target, job;
                   libraries=libraries, deferred_codegen=deferred_codegen,
                   optimize=optimize, strip=strip, validate=validate)
end

# primitive mechanism for deferred compilation, for implementing CUDA dynamic parallelism.
# this could both be generalized (e.g. supporting actual function calls, instead of
# returning a function pointer), and be integrated with the nonrecursive codegen.
# Registry of (function, argument-types) pairs that were requested via `deferred_codegen`;
# the index into this vector is the id embedded in the generated marker call below.
const deferred_codegen_jobs = Vector{Tuple{Core.Function,Type}}()

# Request deferred compilation of `f` applied to argument types `tt`. The generated code
# calls the external `deferred_codegen` marker with a unique id; `codegen` later resolves
# that marker by compiling the target and replacing the call with a function pointer.
@generated function deferred_codegen(::Val{f}, ::Val{tt}) where {f,tt}
    push!(deferred_codegen_jobs, (f,tt))
    id = length(deferred_codegen_jobs)

    quote
        # TODO: add an edge to this method instance to support method redefinitions
        ccall("extern deferred_codegen", llvmcall, Ptr{Cvoid}, (Int,), $id)
    end
end

"""
    codegen(output::Symbol, job::CompilerJob; libraries=true, deferred_codegen=true,
            optimize=true, strip=false, validate=true)

Lower `job` to the requested `output` format and return it:
`:julia` returns the method instance, `:llvm` returns `(ir, kernel)`, and
`:asm`/`:obj` return `(code, kernel_fn, undefined_fns, undefined_gbls)`.
See [`compile`](@ref) for the meaning of the keyword arguments.
"""
function codegen(output::Symbol, job::CompilerJob;
                 libraries::Bool=true, deferred_codegen::Bool=true, optimize::Bool=true,
                 strip::Bool=false, validate::Bool=true)
    ## Julia IR

    @timeit_debug to "validation" check_method(job)

    @timeit_debug to "Julia front-end" begin

        # get the method instance
        world = typemax(UInt)
        meth = which(job.source.f, job.source.tt)
        sig = Base.signature_type(job.source.f, job.source.tt)::Type
        (ti, env) = ccall(:jl_type_intersection_with_env, Any,
                          (Any, Any), sig, meth.sig)::Core.SimpleVector
        if VERSION >= v"1.2.0-DEV.320"
            meth = Base.func_for_method_checked(meth, ti, env)
        else
            meth = Base.func_for_method_checked(meth, ti)
        end
        method_instance = ccall(:jl_specializations_get_linfo, Ref{Core.MethodInstance},
                                (Any, Any, Any, UInt), meth, ti, env, world)

        # unbound type variables in the signature cannot be compiled for the device
        for var in env
            if var isa TypeVar
                throw(KernelError(job, "method captures a typevar (you probably use an unbound type variable)"))
            end
        end
    end

    output == :julia && return method_instance


    ## LLVM IR

    # always preload the runtime, and do so early; it cannot be part of any timing block
    # because it recurses into the compiler
    if libraries
        runtime = load_runtime(job)
        runtime_fns = LLVM.name.(defs(runtime))
    end

    @timeit_debug to "LLVM middle-end" begin
        ir, kernel = @timeit_debug to "IR generation" irgen(job, method_instance, world)
        kernel_fn = LLVM.name(kernel)

        # target-specific libraries
        if libraries
            undefined_fns = LLVM.name.(decls(ir))
            @timeit_debug to "target libraries" link_libraries!(job, ir, undefined_fns)
        end

        if optimize
            @timeit_debug to "optimization" optimize!(job, ir)

            # optimization may have replaced functions, so look the entry point up again
            kernel = functions(ir)[kernel_fn]
        end

        # only link the runtime library if the module actually references one of its
        # functions (re-scan declarations, as target libraries may have changed them)
        if libraries
            undefined_fns = LLVM.name.(decls(ir))
            if any(fn -> fn in runtime_fns, undefined_fns)
                @timeit_debug to "runtime library" link_library!(ir, runtime)
            end
        end

        if ccall(:jl_is_debugbuild, Cint, ()) == 1
            @timeit_debug to "verification" verify(ir)
        end

        # remove everything except for the kernel
        @timeit_debug to "clean-up" begin
            exports = String[kernel_fn]
            ModulePassManager() do pm
                # internalize all functions that aren't exports
                internalize!(pm, exports)

                # eliminate all unused internal functions
                global_optimizer!(pm)
                global_dce!(pm)
                strip_dead_prototypes!(pm)

                run!(pm, ir)
            end
        end
    end

    # deferred code generation: resolve every call to the `deferred_codegen` marker by
    # compiling the requested job, linking it in, and substituting a function pointer.
    if deferred_codegen && haskey(functions(ir), "deferred_codegen")
        dyn_marker = functions(ir)["deferred_codegen"]

        cache = Dict{CompilerJob, String}(job => kernel_fn)

        # iterative compilation (non-recursive); repeat until linking new code stops
        # introducing additional deferred_codegen calls
        changed = true
        while changed
            changed = false

            # find deferred compiler
            # TODO: recover this information earlier, from the Julia IR
            worklist = MultiDict{CompilerJob, LLVM.CallInst}()
            for use in uses(dyn_marker)
                # decode the call: its first operand is the id into deferred_codegen_jobs
                call = user(use)::LLVM.CallInst
                id = convert(Int, first(operands(call)))

                global deferred_codegen_jobs
                dyn_f, dyn_tt = deferred_codegen_jobs[id]
                dyn_job = similar(job, FunctionSpec(dyn_f, dyn_tt, #=kernel=# true))
                push!(worklist, dyn_job => call)
            end

            # compile and link
            for dyn_job in keys(worklist)
                # cached compilation
                dyn_kernel_fn = get!(cache, dyn_job) do
                    dyn_ir, dyn_kernel = codegen(:llvm, dyn_job; optimize=optimize,
                                                 strip=strip, validate=validate,
                                                 deferred_codegen=false)
                    dyn_kernel_fn = LLVM.name(dyn_kernel)
                    link!(ir, dyn_ir)
                    changed = true
                    dyn_kernel_fn
                end
                dyn_kernel = functions(ir)[dyn_kernel_fn]

                # insert a pointer to the function everywhere the kernel is used
                T_ptr = convert(LLVMType, Ptr{Cvoid})
                for call in worklist[dyn_job]
                    Builder(JuliaContext()) do builder
                        position!(builder, call)
                        fptr = ptrtoint!(builder, dyn_kernel, T_ptr)
                        replace_uses!(call, fptr)
                    end
                    unsafe_delete!(LLVM.parent(call), call)
                end
            end
        end

        # all deferred compilations should have been resolved
        @compiler_assert isempty(uses(dyn_marker)) job
        unsafe_delete!(ir, dyn_marker)
    end

    if output == :llvm
        if strip
            @timeit_debug to "strip debug info" strip_debuginfo!(ir)
        end

        return ir, kernel
    end


    ## machine code

    finish_module!(job, ir)

    if validate
        @timeit_debug to "validation" begin
            check_invocation(job, kernel)
            check_ir(job, ir)
        end
    end

    # NOTE: strip after validation to get better errors
    if strip
        @timeit_debug to "strip debug info" strip_debuginfo!(ir)
    end

    @timeit_debug to "LLVM back-end" begin
        @timeit_debug to "preparation" prepare_execution!(job, ir)

        if output == :asm
            code = @timeit_debug to "machine-code generation" mcgen(job, ir, kernel, LLVM.API.LLVMAssemblyFile)
        elseif output == :obj
            code = @timeit_debug to "machine-code generation" mcgen(job, ir, kernel, LLVM.API.LLVMObjectFile)
        end
    end

    # report remaining unresolved functions and globals alongside the generated code,
    # so the caller can link/resolve them
    undefined_fns = LLVM.name.(decls(ir))
    undefined_gbls = map(x->(name=LLVM.name(x),type=llvmtype(x),external=isextinit(x)), LLVM.globals(ir))

    (output == :asm || output == :obj) && return code, kernel_fn, undefined_fns, undefined_gbls


    error("Unknown compilation output $output")
end
--------------------------------------------------------------------------------
/src/validation.jl:
--------------------------------------------------------------------------------
# validation of properties and code

export InvalidIRError

# Check that the job's function/signature is compilable at all: it must be a generic
# function with exactly one matching method, and (when compiled as a kernel) must
# return `Nothing`. Throws a KernelError otherwise.
function check_method(job::CompilerJob)
    isa(job.source.f, Core.Builtin) && throw(KernelError(job, "function is not a generic function"))

    # get the method
    ms = Base.methods(job.source.f, job.source.tt)
    isempty(ms) && throw(KernelError(job, "no method found"))
    length(ms)!=1 && throw(KernelError(job, "no unique matching method"))
    m = first(ms)   # NOTE(review): `m` is unused below — candidate for removal

    # kernels can't return values
    if job.source.kernel
        rt = Base.return_types(job.source.f, job.source.tt)[1]
        if rt != Nothing
            throw(KernelError(job, "kernel returns a value of type `$rt`",
                """Make sure your kernel function ends in `return`, `return nothing` or `nothing`.
                   If the returned value is of type `Union{}`, your Julia code probably throws an exception.
                   Inspect the code with `@device_code_warntype` for more details."""))
        end
    end

    return
end

# compatibility shim: `fieldtypes` only exists in Base from 1.1 onwards
if VERSION < v"1.1.0-DEV.593"
    fieldtypes(@nospecialize(dt)) = ntuple(i->fieldtype(dt, i), fieldcount(dt))
end

# The actual check is rather complicated
# and might change from version to version...
34 | function hasfieldcount(@nospecialize(dt)) 35 | try 36 | fieldcount(dt) 37 | catch 38 | return false 39 | end 40 | return true 41 | end 42 | 43 | function explain_nonisbits(@nospecialize(dt), depth=1; maxdepth=10) 44 | dt===Module && return "" # work around JuliaLang/julia#33347 45 | depth > maxdepth && return "" 46 | hasfieldcount(dt) || return "" 47 | msg = "" 48 | for (ft, fn) in zip(fieldtypes(dt), fieldnames(dt)) 49 | if !isbitstype(ft) 50 | msg *= " "^depth * ".$fn is of type $ft which is not isbits.\n" 51 | msg *= explain_nonisbits(ft, depth+1) 52 | end 53 | end 54 | return msg 55 | end 56 | 57 | function check_invocation(job::CompilerJob, entry::LLVM.Function) 58 | # make sure any non-isbits arguments are unused 59 | real_arg_i = 0 60 | sig = Base.signature_type(job.source.f, job.source.tt)::Type 61 | for (arg_i,dt) in enumerate(sig.parameters) 62 | isghosttype(dt) && continue 63 | VERSION >= v"1.5.0-DEV.581" && Core.Compiler.isconstType(dt) && continue 64 | real_arg_i += 1 65 | 66 | if !isbitstype(dt) 67 | if VERSION >= v"1.5.0-DEV.581" 68 | throw(KernelError(job, "passing and using non-bitstype argument", 69 | """Argument $arg_i to your kernel function is of type $dt, which is not isbits: 70 | $(explain_nonisbits(dt))""")) 71 | else 72 | # be slightly more lenient pre 1.5, to support `function(::Type, ...)` 73 | param = parameters(entry)[real_arg_i] 74 | if !isempty(uses(param)) 75 | throw(KernelError(job, "passing and using non-bitstype argument", 76 | """Argument $arg_i to your kernel function is of type $dt, which is not isbits: 77 | $(explain_nonisbits(dt)) 78 | Passing non-isbits types is only allowed if they they are unused by the kernel.""")) 79 | end 80 | end 81 | end 82 | end 83 | 84 | return 85 | end 86 | 87 | 88 | ## IR validation 89 | 90 | const IRError = Tuple{String, StackTraces.StackTrace, Any} # kind, bt, meta 91 | 92 | struct InvalidIRError <: Exception 93 | job::CompilerJob 94 | errors::Vector{IRError} 95 | end 96 | 97 | const 
RUNTIME_FUNCTION = "call to the Julia runtime" 98 | const UNKNOWN_FUNCTION = "call to an unknown function" 99 | const POINTER_FUNCTION = "call through a literal pointer" 100 | const DELAYED_BINDING = "use of an undefined name" 101 | const DYNAMIC_CALL = "dynamic function invocation" 102 | 103 | function Base.showerror(io::IO, err::InvalidIRError) 104 | print(io, "InvalidIRError: compiling ", err.job.source, " resulted in invalid LLVM IR") 105 | for (kind, bt, meta) in err.errors 106 | print(io, "\nReason: unsupported $kind") 107 | if meta != nothing 108 | if kind == RUNTIME_FUNCTION || kind == UNKNOWN_FUNCTION || kind == POINTER_FUNCTION || kind == DYNAMIC_CALL 109 | print(io, " (call to ", meta, ")") 110 | elseif kind == DELAYED_BINDING 111 | print(io, " (use of '", meta, "')") 112 | end 113 | end 114 | Base.show_backtrace(io, bt) 115 | end 116 | return 117 | end 118 | 119 | function check_ir(job, args...) 120 | errors = check_ir!(job, IRError[], args...) 121 | unique!(errors) 122 | if !isempty(errors) 123 | throw(InvalidIRError(job, errors)) 124 | end 125 | 126 | return 127 | end 128 | 129 | function check_ir!(job, errors::Vector{IRError}, mod::LLVM.Module) 130 | for f in functions(mod) 131 | check_ir!(job, errors, f) 132 | end 133 | 134 | return errors 135 | end 136 | 137 | function check_ir!(job, errors::Vector{IRError}, f::LLVM.Function) 138 | for bb in blocks(f), inst in instructions(bb) 139 | if isa(inst, LLVM.CallInst) 140 | check_ir!(job, errors, inst) 141 | end 142 | end 143 | 144 | return errors 145 | end 146 | 147 | const libjulia = Ref{Ptr{Cvoid}}(C_NULL) 148 | 149 | function check_ir!(job, errors::Vector{IRError}, inst::LLVM.CallInst) 150 | bt = backtrace(inst) 151 | dest = called_value(inst) 152 | if isa(dest, LLVM.Function) 153 | fn = LLVM.name(dest) 154 | 155 | # some special handling for runtime functions that we don't implement 156 | if fn == "jl_get_binding_or_error" 157 | try 158 | m, sym, _ = operands(inst) 159 | sym = 
first(operands(sym::ConstantExpr))::ConstantInt 160 | sym = convert(Int, sym) 161 | sym = Ptr{Cvoid}(sym) 162 | sym = Base.unsafe_pointer_to_objref(sym) 163 | push!(errors, (DELAYED_BINDING, bt, sym)) 164 | catch e 165 | isa(e,TypeError) || rethrow() 166 | @debug "Decoding arguments to jl_get_binding_or_error failed" inst bb=LLVM.parent(inst) 167 | push!(errors, (DELAYED_BINDING, bt, nothing)) 168 | end 169 | elseif fn == "jl_invoke" 170 | try 171 | if VERSION < v"1.3.0-DEV.244" 172 | meth, args, nargs, _ = operands(inst) 173 | else 174 | f, args, nargs, meth = operands(inst) 175 | end 176 | if VERSION < v"1.5.0-DEV.802" 177 | # addrspacecast 178 | meth = first(operands(meth::ConstantExpr)) 179 | end 180 | meth = first(operands(meth::ConstantExpr))::ConstantInt 181 | meth = convert(Int, meth) 182 | meth = Ptr{Cvoid}(meth) 183 | meth = Base.unsafe_pointer_to_objref(meth)::Core.MethodInstance 184 | push!(errors, (DYNAMIC_CALL, bt, meth.def)) 185 | catch e 186 | isa(e,TypeError) || rethrow() 187 | @debug "Decoding arguments to jl_invoke failed" inst bb=LLVM.parent(inst) 188 | push!(errors, (DYNAMIC_CALL, bt, nothing)) 189 | end 190 | elseif fn == "jl_apply_generic" 191 | try 192 | if VERSION < v"1.3.0-DEV.244" 193 | args, nargs, _ = operands(inst) 194 | ## args is a buffer where arguments are stored in 195 | f, args = user.(uses(args)) 196 | ## first store into the args buffer is a direct store 197 | f = first(operands(f::LLVM.StoreInst))::ConstantExpr 198 | else 199 | f, args, nargs, _ = operands(inst) 200 | end 201 | 202 | if VERSION < v"1.5.0-DEV.802" 203 | f = first(operands(f::ConstantExpr)) # get rid of addrspacecast 204 | end 205 | f = first(operands(f))::ConstantInt # get rid of inttoptr 206 | f = convert(Int, f) 207 | f = Ptr{Cvoid}(f) 208 | f = Base.unsafe_pointer_to_objref(f) 209 | push!(errors, (DYNAMIC_CALL, bt, f)) 210 | catch e 211 | isa(e,TypeError) || rethrow() 212 | @debug "Decoding arguments to jl_apply_generic failed" inst bb=LLVM.parent(inst) 213 
| push!(errors, (DYNAMIC_CALL, bt, nothing)) 214 | end 215 | 216 | # detect calls to undefined functions 217 | elseif isdeclaration(dest) && intrinsic_id(dest) == 0 && !isintrinsic(job, fn) 218 | # figure out if the function lives in the Julia runtime library 219 | if libjulia[] == C_NULL 220 | paths = filter(Libdl.dllist()) do path 221 | name = splitdir(path)[2] 222 | startswith(name, "libjulia") 223 | end 224 | libjulia[] = Libdl.dlopen(first(paths)) 225 | end 226 | 227 | if Libdl.dlsym_e(libjulia[], fn) != C_NULL 228 | push!(errors, (RUNTIME_FUNCTION, bt, LLVM.name(dest))) 229 | else 230 | push!(errors, (UNKNOWN_FUNCTION, bt, LLVM.name(dest))) 231 | end 232 | end 233 | 234 | elseif isa(dest, InlineAsm) 235 | # let's assume it's valid ASM 236 | 237 | elseif isa(dest, ConstantExpr) 238 | # detect calls to literal pointers 239 | if occursin("inttoptr", string(dest)) 240 | # extract the literal pointer 241 | ptr_arg = first(operands(dest)) 242 | @compiler_assert isa(ptr_arg, ConstantInt) job 243 | ptr_val = convert(Int, ptr_arg) 244 | ptr = Ptr{Cvoid}(ptr_val) 245 | 246 | # look it up in the Julia JIT cache 247 | frames = ccall(:jl_lookup_code_address, Any, (Ptr{Cvoid}, Cint,), ptr, 0) 248 | if length(frames) >= 1 249 | @compiler_assert length(frames) == 1 job frames=frames 250 | if VERSION >= v"1.4.0-DEV.123" 251 | fn, file, line, linfo, fromC, inlined = last(frames) 252 | else 253 | fn, file, line, linfo, fromC, inlined, ip = last(frames) 254 | end 255 | push!(errors, (POINTER_FUNCTION, bt, fn)) 256 | else 257 | push!(errors, (POINTER_FUNCTION, bt, nothing)) 258 | end 259 | end 260 | end 261 | 262 | return errors 263 | end 264 | -------------------------------------------------------------------------------- /src/gcn.jl: -------------------------------------------------------------------------------- 1 | # implementation of the GPUCompiler interfaces for generating GCN code 2 | 3 | ## target 4 | 5 | export GCNCompilerTarget 6 | 7 | Base.@kwdef struct 
GCNCompilerTarget <: AbstractCompilerTarget 8 | dev_isa::String 9 | end 10 | 11 | llvm_triple(::GCNCompilerTarget) = "amdgcn-amd-amdhsa" 12 | 13 | function llvm_machine(target::GCNCompilerTarget) 14 | triple = llvm_triple(target) 15 | t = Target(triple) 16 | 17 | cpu = target.dev_isa 18 | feat = "" 19 | optlevel = LLVM.API.LLVMCodeGenLevelDefault 20 | reloc = LLVM.API.LLVMRelocPIC 21 | tm = TargetMachine(t, triple, cpu, feat, optlevel, reloc) 22 | asm_verbosity!(tm, true) 23 | 24 | return tm 25 | end 26 | 27 | 28 | ## job 29 | 30 | # TODO: encode debug build or not in the compiler job 31 | # https://github.com/JuliaGPU/CUDAnative.jl/issues/368 32 | runtime_slug(job::CompilerJob{GCNCompilerTarget}) = "gcn-$(job.target.dev_isa)" 33 | 34 | const gcn_intrinsics = () # TODO: ("vprintf", "__assertfail", "malloc", "free") 35 | isintrinsic(::CompilerJob{GCNCompilerTarget}, fn::String) = in(fn, gcn_intrinsics) 36 | 37 | function process_kernel!(job::CompilerJob{GCNCompilerTarget}, mod::LLVM.Module, kernel::LLVM.Function) 38 | kernel = wrap_entry!(job, mod, kernel) 39 | # AMDGPU kernel calling convention 40 | callconv!(kernel, LLVM.API.LLVMCallConv(91)) 41 | kernel 42 | end 43 | 44 | function add_lowering_passes!(job::CompilerJob{GCNCompilerTarget}, pm::LLVM.PassManager) 45 | add!(pm, ModulePass("LowerThrowExtra", lower_throw_extra!)) 46 | end 47 | 48 | function lower_throw_extra!(mod::LLVM.Module) 49 | job = current_job::CompilerJob 50 | changed = false 51 | @timeit_debug to "lower throw (extra)" begin 52 | 53 | throw_functions = [ 54 | r"julia_bounds_error.*", 55 | r"julia_throw_boundserror.*", 56 | r"julia_error_if_canonical_getindex.*", 57 | r"julia_error_if_canonical_setindex.*", 58 | r"julia___subarray_throw_boundserror.*", 59 | ] 60 | 61 | 62 | for f in functions(mod) 63 | f_name = LLVM.name(f) 64 | for fn in throw_functions 65 | if occursin(fn, f_name) 66 | for use in uses(f) 67 | call = user(use)::LLVM.CallInst 68 | 69 | # replace the throw with a trap 70 | let 
builder = Builder(JuliaContext()) 71 | position!(builder, call) 72 | emit_exception!(builder, f_name, call) 73 | dispose(builder) 74 | end 75 | 76 | # remove the call 77 | call_args = collect(operands(call))[1:end-1] # last arg is function itself 78 | unsafe_delete!(LLVM.parent(call), call) 79 | 80 | # HACK: kill the exceptions' unused arguments 81 | for arg in call_args 82 | # peek through casts 83 | if isa(arg, LLVM.AddrSpaceCastInst) 84 | cast = arg 85 | arg = first(operands(cast)) 86 | isempty(uses(cast)) && unsafe_delete!(LLVM.parent(cast), cast) 87 | end 88 | 89 | if isa(arg, LLVM.Instruction) && isempty(uses(arg)) 90 | unsafe_delete!(LLVM.parent(arg), arg) 91 | end 92 | end 93 | 94 | changed = true 95 | end 96 | 97 | @compiler_assert isempty(uses(f)) job 98 | end 99 | end 100 | end 101 | 102 | end 103 | return changed 104 | end 105 | 106 | function emit_trap!(job::CompilerJob{GCNCompilerTarget}, builder, mod, inst) 107 | trap = if haskey(functions(mod), "llvm.trap") 108 | functions(mod)["llvm.trap"] 109 | else 110 | LLVM.Function(mod, "llvm.trap", LLVM.FunctionType(LLVM.VoidType(JuliaContext()))) 111 | end 112 | if Base.libllvm_version < v"9" 113 | rl_ft = LLVM.FunctionType(LLVM.Int32Type(JuliaContext()), 114 | [LLVM.Int32Type(JuliaContext())]) 115 | rl = if haskey(functions(mod), "llvm.amdgcn.readfirstlane") 116 | functions(mod)["llvm.amdgcn.readfirstlane"] 117 | else 118 | LLVM.Function(mod, "llvm.amdgcn.readfirstlane", rl_ft) 119 | end 120 | # FIXME: Early versions of the AMDGPU target fail to skip machine 121 | # blocks with certain side effects when EXEC==0, except when certain 122 | # criteria are met within said block. We emit a v_readfirstlane_b32 123 | # instruction here, as that is sufficient to trigger a skip. Without 124 | # this, the target will only attempt to do a "masked branch", which 125 | # only works on vector instructions (trap is a scalar instruction, and 126 | # therefore it is executed even when EXEC==0). 
127 | rl_val = call!(builder, rl, [ConstantInt(Int32(32), JuliaContext())]) 128 | rl_bc = inttoptr!(builder, rl_val, LLVM.PointerType(LLVM.Int32Type(JuliaContext()))) 129 | store!(builder, rl_val, rl_bc) 130 | end 131 | call!(builder, trap) 132 | end 133 | 134 | # manual implementation of byval, as the backend doesn't support it for kernel args 135 | # https://reviews.llvm.org/D79744 136 | function wrapper_type(julia_t::Type, codegen_t::LLVMType)::LLVMType 137 | if !isbitstype(julia_t) 138 | # don't pass jl_value_t by value; it's an opaque structure 139 | return codegen_t 140 | elseif isa(codegen_t, LLVM.PointerType) && !(julia_t <: Ptr) 141 | # we didn't specify a pointer, but codegen passes one anyway. 142 | # make the wrapper accept the underlying value instead. 143 | return eltype(codegen_t) 144 | else 145 | return codegen_t 146 | end 147 | end 148 | # generate a kernel wrapper to fix & improve argument passing 149 | function wrap_entry!(job::CompilerJob, mod::LLVM.Module, entry_f::LLVM.Function) 150 | entry_ft = eltype(llvmtype(entry_f)::LLVM.PointerType)::LLVM.FunctionType 151 | @compiler_assert return_type(entry_ft) == LLVM.VoidType(JuliaContext()) job 152 | 153 | # filter out types which don't occur in the LLVM function signatures 154 | sig = Base.signature_type(job.source.f, job.source.tt)::Type 155 | julia_types = Type[] 156 | for dt::Type in sig.parameters 157 | if !isghosttype(dt) && (VERSION < v"1.5.0-DEV.581" || !Core.Compiler.isconstType(dt)) 158 | push!(julia_types, dt) 159 | end 160 | end 161 | 162 | # generate the wrapper function type & definition 163 | wrapper_types = LLVM.LLVMType[wrapper_type(julia_t, codegen_t) 164 | for (julia_t, codegen_t) 165 | in zip(julia_types, parameters(entry_ft))] 166 | wrapper_fn = LLVM.name(entry_f) 167 | LLVM.name!(entry_f, wrapper_fn * ".inner") 168 | wrapper_ft = LLVM.FunctionType(LLVM.VoidType(JuliaContext()), wrapper_types) 169 | wrapper_f = LLVM.Function(mod, wrapper_fn, wrapper_ft) 170 | 171 | # emit IR 
performing the "conversions" 172 | let builder = Builder(JuliaContext()) 173 | entry = BasicBlock(wrapper_f, "entry", JuliaContext()) 174 | position!(builder, entry) 175 | 176 | wrapper_args = Vector{LLVM.Value}() 177 | 178 | # perform argument conversions 179 | codegen_types = parameters(entry_ft) 180 | wrapper_params = parameters(wrapper_f) 181 | param_index = 0 182 | for (julia_t, codegen_t, wrapper_t, wrapper_param) in 183 | zip(julia_types, codegen_types, wrapper_types, wrapper_params) 184 | param_index += 1 185 | if codegen_t != wrapper_t 186 | # the wrapper argument doesn't match the kernel parameter type. 187 | # this only happens when codegen wants to pass a pointer. 188 | @compiler_assert isa(codegen_t, LLVM.PointerType) job 189 | @compiler_assert eltype(codegen_t) == wrapper_t job 190 | 191 | # copy the argument value to a stack slot, and reference it. 192 | ptr = alloca!(builder, wrapper_t) 193 | if LLVM.addrspace(codegen_t) != 0 194 | ptr = addrspacecast!(builder, ptr, codegen_t) 195 | end 196 | store!(builder, wrapper_param, ptr) 197 | push!(wrapper_args, ptr) 198 | else 199 | push!(wrapper_args, wrapper_param) 200 | for attr in collect(parameter_attributes(entry_f, param_index)) 201 | push!(parameter_attributes(wrapper_f, param_index), attr) 202 | end 203 | end 204 | end 205 | 206 | call!(builder, entry_f, wrapper_args) 207 | 208 | ret!(builder) 209 | 210 | dispose(builder) 211 | end 212 | 213 | # early-inline the original entry function into the wrapper 214 | push!(function_attributes(entry_f), EnumAttribute("alwaysinline", 0, JuliaContext())) 215 | linkage!(entry_f, LLVM.API.LLVMInternalLinkage) 216 | 217 | fixup_metadata!(entry_f) 218 | ModulePassManager() do pm 219 | always_inliner!(pm) 220 | run!(pm, mod) 221 | end 222 | 223 | return wrapper_f 224 | end 225 | # HACK: get rid of invariant.load and const TBAA metadata on loads from pointer args, 226 | # since storing to a stack slot violates the semantics of those attributes. 
227 | # TODO: can we emit a wrapper that doesn't violate Julia's metadata? 228 | function fixup_metadata!(f::LLVM.Function) 229 | for param in parameters(f) 230 | if isa(llvmtype(param), LLVM.PointerType) 231 | # collect all uses of the pointer 232 | worklist = Vector{LLVM.Instruction}(user.(collect(uses(param)))) 233 | while !isempty(worklist) 234 | value = popfirst!(worklist) 235 | 236 | # remove the invariant.load attribute 237 | md = metadata(value) 238 | if haskey(md, LLVM.MD_invariant_load) 239 | delete!(md, LLVM.MD_invariant_load) 240 | end 241 | if haskey(md, LLVM.MD_tbaa) 242 | delete!(md, LLVM.MD_tbaa) 243 | end 244 | 245 | # recurse on the output of some instructions 246 | if isa(value, LLVM.BitCastInst) || 247 | isa(value, LLVM.GetElementPtrInst) || 248 | isa(value, LLVM.AddrSpaceCastInst) 249 | append!(worklist, user.(collect(uses(value)))) 250 | end 251 | 252 | # IMPORTANT NOTE: if we ever want to inline functions at the LLVM level, 253 | # we need to recurse into call instructions here, and strip metadata from 254 | # called functions (see CUDAnative.jl#238). 
255 | end 256 | end 257 | end 258 | end 259 | -------------------------------------------------------------------------------- /test/native.jl: -------------------------------------------------------------------------------- 1 | @testset "native" begin 2 | 3 | include("definitions/native.jl") 4 | 5 | ############################################################################################ 6 | 7 | @testset "Compilation" begin 8 | kernel() = nothing 9 | 10 | output = native_code_execution(kernel, (); validate=false) 11 | @test occursin("kernel", output[2]) 12 | @test isempty(output[3]) 13 | @test isempty(output[4]) 14 | 15 | @testset "Undefined Functions" begin 16 | function undef_fn() 17 | ccall("extern somefunc", llvmcall, Cvoid, ()) 18 | nothing 19 | end 20 | 21 | output = native_code_execution(undef_fn, (); validate=false) 22 | @test length(output[3]) == 1 23 | @test output[3][1] == "somefunc" 24 | end 25 | 26 | @testset "Undefined Globals" begin 27 | @generated function makegbl(::Val{name}, ::Type{T}, ::Val{isext}) where {name,T,isext} 28 | T_gbl = convert(LLVMType, T) 29 | T_ptr = convert(LLVMType, Ptr{T}) 30 | llvm_f, _ = create_function(T_ptr) 31 | mod = LLVM.parent(llvm_f) 32 | gvar = GlobalVariable(mod, T_gbl, string(name)) 33 | isext && extinit!(gvar, true) 34 | Builder(JuliaContext()) do builder 35 | entry = BasicBlock(llvm_f, "entry", JuliaContext()) 36 | position!(builder, entry) 37 | result = ptrtoint!(builder, gvar, T_ptr) 38 | ret!(builder, result) 39 | end 40 | call_function(llvm_f, Ptr{T}) 41 | end 42 | function undef_gbl() 43 | ext_ptr = makegbl(Val(:someglobal), Int64, Val(true)) 44 | Base.unsafe_store!(ext_ptr, 1) 45 | ptr = makegbl(Val(:otherglobal), Float32, Val(false)) 46 | Base.unsafe_store!(ptr, 2f0) 47 | nothing 48 | end 49 | 50 | output = native_code_execution(undef_gbl, ()) 51 | @test length(output[4]) == 2 52 | @test output[4][1].name == "someglobal" 53 | @test eltype(output[4][1].type) isa LLVM.IntegerType 54 | @test 
output[4][1].external 55 | @test output[4][2].name == "otherglobal" 56 | @test eltype(output[4][2].type) isa LLVM.LLVMFloat 57 | @test !output[4][2].external 58 | end 59 | end 60 | 61 | ############################################################################################ 62 | 63 | @testset "IR" begin 64 | 65 | @testset "basic reflection" begin 66 | valid_kernel() = return 67 | invalid_kernel() = 1 68 | 69 | ir = sprint(io->native_code_llvm(io, valid_kernel, Tuple{}; optimize=false, dump_module=true)) 70 | 71 | # module should contain our function + a generic call wrapper 72 | @test occursin(r"define\ .* void\ @.*julia_valid_kernel.*\(\)"x, ir) 73 | @test !occursin("define %jl_value_t* @jlcall_", ir) 74 | 75 | # there should be no debug metadata 76 | @test !occursin("!dbg", ir) 77 | 78 | @test native_code_llvm(devnull, invalid_kernel, Tuple{}) == nothing 79 | @test_throws KernelError native_code_llvm(devnull, invalid_kernel, Tuple{}; kernel=true) == nothing 80 | end 81 | 82 | @testset "unbound typevars" begin 83 | invalid_kernel() where {unbound} = return 84 | @test_throws KernelError native_code_llvm(devnull, invalid_kernel, Tuple{}) 85 | end 86 | 87 | @testset "child functions" begin 88 | # we often test using `@noinline sink` child functions, so test whether these survive 89 | @noinline child(i) = sink(i) 90 | parent(i) = child(i) 91 | 92 | ir = sprint(io->native_code_llvm(io, parent, Tuple{Int})) 93 | @test occursin(r"call .+ @julia_child_", ir) 94 | end 95 | 96 | @testset "sysimg" begin 97 | # bug: use a system image function 98 | 99 | function foobar(a,i) 100 | Base.pointerset(a, 0, mod1(i,10), 8) 101 | end 102 | 103 | ir = sprint(io->native_code_llvm(io, foobar, Tuple{Ptr{Int},Int})) 104 | @test !occursin("jlsys_", ir) 105 | end 106 | 107 | @testset "tracked pointers" begin 108 | function kernel(a) 109 | a[1] = 1 110 | return 111 | end 112 | 113 | # this used to throw an LLVM assertion (#223) 114 | native_code_llvm(devnull, kernel, Tuple{Vector{Int}}; 
kernel=true) 115 | end 116 | 117 | if VERSION >= v"1.0.2" 118 | @testset "CUDAnative.jl#278" begin 119 | # codegen idempotency 120 | # NOTE: this isn't fixed, but surfaces here due to bad inference of checked_sub 121 | # NOTE: with the fix to print_to_string this doesn't error anymore, 122 | # but still have a test to make sure it doesn't regress 123 | native_code_llvm(devnull, Base.checked_sub, Tuple{Int,Int}; optimize=false) 124 | native_code_llvm(devnull, Base.checked_sub, Tuple{Int,Int}; optimize=false) 125 | 126 | # breaking recursion in print_to_string makes it possible to compile 127 | # even in the presence of the above bug 128 | native_code_llvm(devnull, Base.print_to_string, Tuple{Int,Int}; optimize=false) 129 | end 130 | end 131 | 132 | @testset "LLVM D32593" begin 133 | @eval struct D32593_struct 134 | foo::Float32 135 | bar::Float32 136 | end 137 | 138 | D32593(ptr) = unsafe_load(ptr).foo 139 | 140 | native_code_llvm(devnull, D32593, Tuple{Ptr{D32593_struct}}) 141 | end 142 | 143 | end 144 | 145 | ############################################################################################ 146 | 147 | @testset "assembly" begin 148 | 149 | @testset "basic reflection" begin 150 | valid_kernel() = return 151 | invalid_kernel() = 1 152 | 153 | @test native_code_native(devnull, valid_kernel, Tuple{}) == nothing 154 | @test native_code_native(devnull, invalid_kernel, Tuple{}) == nothing 155 | @test_throws KernelError native_code_native(devnull, invalid_kernel, Tuple{}; kernel=true) 156 | end 157 | 158 | @testset "idempotency" begin 159 | # bug: generate code twice for the same kernel (jl_to_ptx wasn't idempotent) 160 | 161 | kernel() = return 162 | native_code_native(devnull, kernel, Tuple{}) 163 | native_code_native(devnull, kernel, Tuple{}) 164 | end 165 | 166 | @testset "compile for host after gpu" begin 167 | # issue #11: re-using host functions after GPU compilation 168 | @noinline child(i) = sink(i+1) 169 | 170 | function fromhost() 171 | child(10) 172 
| end 173 | 174 | function fromptx() 175 | child(10) 176 | return 177 | end 178 | 179 | native_code_native(devnull, fromptx, Tuple{}) 180 | @test fromhost() == 11 181 | end 182 | 183 | end 184 | 185 | ############################################################################################ 186 | 187 | @testset "errors" begin 188 | 189 | # some validation happens in the emit_function hook, which is called by code_llvm 190 | 191 | @testset "base intrinsics" begin 192 | foobar(i) = sin(i) 193 | 194 | # NOTE: we don't use test_logs in order to test all of the warning (exception, backtrace) 195 | logs, _ = Test.collect_test_logs(min_level=Info) do 196 | withenv("JULIA_DEBUG" => nothing) do 197 | native_code_llvm(devnull, foobar, Tuple{Int}) 198 | end 199 | end 200 | @test length(logs) == 1 201 | record = logs[1] 202 | @test record.level == Base.CoreLogging.Warn 203 | @test record.message == "calls to Base intrinsics might be GPU incompatible" 204 | @test haskey(record.kwargs, :exception) 205 | err,bt = record.kwargs[:exception] 206 | err_msg = sprint(showerror, err) 207 | @test occursin(Regex("You called sin(.+) in Base.Math .+, maybe you intended to call sin(.+) in $TestRuntime .+ instead?"), err_msg) 208 | bt_msg = sprint(Base.show_backtrace, bt) 209 | @test occursin("[1] sin", bt_msg) 210 | @test occursin(r"\[2\] .+foobar", bt_msg) 211 | end 212 | 213 | # some validation happens in `compile` 214 | 215 | @eval Main begin 216 | struct CleverType{T} 217 | x::T 218 | end 219 | Base.unsafe_trunc(::Type{Int}, x::CleverType) = unsafe_trunc(Int, x.x) 220 | end 221 | 222 | @testset "non-isbits arguments" begin 223 | foobar(i) = (sink(unsafe_trunc(Int,i)); return) 224 | 225 | @test_throws_message(KernelError, 226 | native_code_execution(foobar, Tuple{BigInt})) do msg 227 | occursin("passing and using non-bitstype argument", msg) && 228 | occursin("BigInt", msg) 229 | end 230 | 231 | # test that we can handle abstract types 232 | @test_throws_message(KernelError, 233 | 
native_code_execution(foobar, Tuple{Any})) do msg 234 | occursin("passing and using non-bitstype argument", msg) && 235 | occursin("Any", msg) 236 | end 237 | 238 | @test_throws_message(KernelError, 239 | native_code_execution(foobar, Tuple{Union{Int32, Int64}})) do msg 240 | occursin("passing and using non-bitstype argument", msg) && 241 | occursin("Union{Int32, Int64}", msg) 242 | end 243 | 244 | @test_throws_message(KernelError, 245 | native_code_execution(foobar, Tuple{Union{Int32, Int64}})) do msg 246 | occursin("passing and using non-bitstype argument", msg) && 247 | occursin("Union{Int32, Int64}", msg) 248 | end 249 | 250 | # test that we get information about fields and reason why something is not isbits 251 | @test_throws_message(KernelError, 252 | native_code_execution(foobar, Tuple{CleverType{BigInt}})) do msg 253 | occursin("passing and using non-bitstype argument", msg) && 254 | occursin("CleverType", msg) && 255 | occursin("BigInt", msg) 256 | end 257 | end 258 | 259 | @testset "invalid LLVM IR" begin 260 | foobar(i) = println(i) 261 | 262 | @test_throws_message(InvalidIRError, 263 | native_code_execution(foobar, Tuple{Int})) do msg 264 | occursin("invalid LLVM IR", msg) && 265 | occursin(GPUCompiler.RUNTIME_FUNCTION, msg) && 266 | occursin("[1] println", msg) && 267 | occursin(r"\[2\] .+foobar", msg) 268 | end 269 | end 270 | 271 | @testset "invalid LLVM IR (ccall)" begin 272 | foobar(p) = (unsafe_store!(p, ccall(:time, Cint, ())); nothing) 273 | 274 | @test_throws_message(InvalidIRError, 275 | native_code_execution(foobar, Tuple{Ptr{Int}})) do msg 276 | occursin("invalid LLVM IR", msg) && 277 | occursin(GPUCompiler.POINTER_FUNCTION, msg) && 278 | occursin(r"\[1\] .+foobar", msg) 279 | end 280 | end 281 | 282 | @testset "delayed bindings" begin 283 | kernel() = (undefined; return) 284 | 285 | @test_throws_message(InvalidIRError, 286 | native_code_execution(kernel, Tuple{})) do msg 287 | occursin("invalid LLVM IR", msg) && 288 | 
occursin(GPUCompiler.DELAYED_BINDING, msg) && 289 | occursin("use of 'undefined'", msg) && 290 | occursin(r"\[1\] .+kernel", msg) 291 | end 292 | end 293 | 294 | @testset "dynamic call (invoke)" begin 295 | @eval @noinline nospecialize_child(@nospecialize(i)) = i 296 | kernel(a, b) = (unsafe_store!(b, nospecialize_child(a)); return) 297 | 298 | @test_throws_message(InvalidIRError, 299 | native_code_execution(kernel, Tuple{Int,Ptr{Int}})) do msg 300 | occursin("invalid LLVM IR", msg) && 301 | occursin(GPUCompiler.DYNAMIC_CALL, msg) && 302 | occursin("call to nospecialize_child", msg) && 303 | occursin(r"\[1\] .+kernel", msg) 304 | end 305 | end 306 | 307 | @testset "dynamic call (apply)" begin 308 | func() = println(1) 309 | 310 | @test_throws_message(InvalidIRError, 311 | native_code_execution(func, Tuple{})) do msg 312 | occursin("invalid LLVM IR", msg) && 313 | occursin(GPUCompiler.DYNAMIC_CALL, msg) && 314 | occursin("call to println", msg) && 315 | occursin("[2] func", msg) 316 | end 317 | end 318 | 319 | end 320 | 321 | ############################################################################################ 322 | 323 | end 324 | -------------------------------------------------------------------------------- /src/ptx.jl: -------------------------------------------------------------------------------- 1 | # implementation of the GPUCompiler interfaces for generating PTX code 2 | 3 | ## target 4 | 5 | export PTXCompilerTarget 6 | 7 | Base.@kwdef struct PTXCompilerTarget <: AbstractCompilerTarget 8 | cap::VersionNumber 9 | 10 | # optional properties 11 | minthreads::Union{Nothing,Int,NTuple{<:Any,Int}} = nothing 12 | maxthreads::Union{Nothing,Int,NTuple{<:Any,Int}} = nothing 13 | blocks_per_sm::Union{Nothing,Int} = nothing 14 | maxregs::Union{Nothing,Int} = nothing 15 | end 16 | 17 | llvm_triple(::PTXCompilerTarget) = Int===Int64 ? 
"nvptx64-nvidia-cuda" : "nvptx-nvidia-cuda" 18 | 19 | function llvm_machine(target::PTXCompilerTarget) 20 | triple = llvm_triple(target) 21 | t = Target(triple) 22 | 23 | cpu = "sm_$(target.cap.major)$(target.cap.minor)" 24 | feat = "+ptx60" # we only support CUDA 9.0+ and LLVM 6.0+ 25 | tm = TargetMachine(t, triple, cpu, feat) 26 | asm_verbosity!(tm, true) 27 | 28 | return tm 29 | end 30 | 31 | # the default datalayout does not match the one in the NVPTX user guide 32 | llvm_datalayout(::PTXCompilerTarget) = Int===Int64 ? 33 | "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64"* 34 | "-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" : 35 | "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64"* 36 | "-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" 37 | 38 | 39 | ## job 40 | 41 | function Base.show(io::IO, job::CompilerJob{PTXCompilerTarget}) 42 | print(io, "PTX CompilerJob of ", job.source) 43 | print(io, " for sm_$(job.target.cap.major)$(job.target.cap.minor)") 44 | 45 | job.target.minthreads !== nothing && print(io, ", minthreads=$(job.target.minthreads)") 46 | job.target.maxthreads !== nothing && print(io, ", maxthreads=$(job.target.maxthreads)") 47 | job.target.blocks_per_sm !== nothing && print(io, ", blocks_per_sm=$(job.target.blocks_per_sm)") 48 | job.target.maxregs !== nothing && print(io, ", maxregs=$(job.target.maxregs)") 49 | end 50 | 51 | const ptx_intrinsics = ("vprintf", "__assertfail", "malloc", "free") 52 | isintrinsic(::CompilerJob{PTXCompilerTarget}, fn::String) = in(fn, ptx_intrinsics) 53 | 54 | # TODO: encode debug build or not in the compiler job 55 | # https://github.com/JuliaGPU/CUDAnative.jl/issues/368 56 | runtime_slug(job::CompilerJob{PTXCompilerTarget}) = "ptx-sm_$(job.target.cap.major)$(job.target.cap.minor)" 57 | 58 | function process_kernel!(job::CompilerJob{PTXCompilerTarget}, mod::LLVM.Module, kernel::LLVM.Function) 59 | # property annotations 60 | annotations = 
LLVM.Value[kernel] 61 | 62 | ## kernel metadata 63 | append!(annotations, [MDString("kernel"), ConstantInt(Int32(1), JuliaContext())]) 64 | 65 | ## expected CTA sizes 66 | if job.target.minthreads != nothing 67 | for (dim, name) in enumerate([:x, :y, :z]) 68 | bound = dim <= length(job.target.minthreads) ? job.target.minthreads[dim] : 1 69 | append!(annotations, [MDString("reqntid$name"), 70 | ConstantInt(Int32(bound), JuliaContext())]) 71 | end 72 | end 73 | if job.target.maxthreads != nothing 74 | for (dim, name) in enumerate([:x, :y, :z]) 75 | bound = dim <= length(job.target.maxthreads) ? job.target.maxthreads[dim] : 1 76 | append!(annotations, [MDString("maxntid$name"), 77 | ConstantInt(Int32(bound), JuliaContext())]) 78 | end 79 | end 80 | 81 | if job.target.blocks_per_sm != nothing 82 | append!(annotations, [MDString("minctasm"), 83 | ConstantInt(Int32(job.target.blocks_per_sm), JuliaContext())]) 84 | end 85 | 86 | if job.target.maxregs != nothing 87 | append!(annotations, [MDString("maxnreg"), 88 | ConstantInt(Int32(job.target.maxregs), JuliaContext())]) 89 | end 90 | 91 | push!(metadata(mod), "nvvm.annotations", MDNode(annotations)) 92 | 93 | 94 | if LLVM.version() >= v"8" 95 | # calling convention 96 | for fun in functions(mod) 97 | callconv!(kernel, LLVM.API.LLVMPTXDeviceCallConv) 98 | end 99 | callconv!(kernel, LLVM.API.LLVMPTXKernelCallConv) 100 | end 101 | 102 | return kernel 103 | end 104 | 105 | function add_lowering_passes!(job::CompilerJob{PTXCompilerTarget}, pm::LLVM.PassManager) 106 | add!(pm, FunctionPass("HideUnreachable", hide_unreachable!)) 107 | add!(pm, ModulePass("HideTrap", hide_trap!)) 108 | end 109 | 110 | function add_optimization_passes!(job::CompilerJob{PTXCompilerTarget}, pm::LLVM.PassManager) 111 | # NVPTX's target machine info enables runtime unrolling, 112 | # but Julia's pass sequence only invokes the simple unroller. 
113 | loop_unroll!(pm) 114 | instruction_combining!(pm) # clean-up redundancy 115 | licm!(pm) # the inner runtime check might be outer loop invariant 116 | 117 | # the above loop unroll pass might have unrolled regular, non-runtime nested loops. 118 | # that code still needs to be optimized (arguably, multiple unroll passes should be 119 | # scheduled by the Julia optimizer). do so here, instead of re-optimizing entirely. 120 | early_csemem_ssa!(pm) # TODO: gvn instead? see NVPTXTargetMachine.cpp::addEarlyCSEOrGVNPass 121 | dead_store_elimination!(pm) 122 | 123 | constant_merge!(pm) 124 | 125 | cfgsimplification!(pm) 126 | 127 | # get rid of the internalized functions; now possible unused 128 | global_dce!(pm) 129 | end 130 | 131 | 132 | ## LLVM passes 133 | 134 | # HACK: this pass removes `unreachable` information from LLVM 135 | # 136 | # `ptxas` is buggy and cannot deal with thread-divergent control flow in the presence of 137 | # shared memory (see JuliaGPU/CUDAnative.jl#4). avoid that by rewriting control flow to fall 138 | # through any other block. this is semantically invalid, but the code is unreachable anyhow 139 | # (and we expect it to be preceded by eg. a noreturn function, or a trap). 140 | # 141 | # TODO: can LLVM do this with structured CFGs? It seems to have some support, but seemingly 142 | # only to prevent introducing non-structureness during optimization (ie. the front-end 143 | # is still responsible for generating structured control flow). 144 | function hide_unreachable!(fun::LLVM.Function) 145 | job = current_job::CompilerJob 146 | changed = false 147 | @timeit_debug to "hide unreachable" begin 148 | 149 | # remove `noreturn` attributes 150 | # 151 | # when calling a `noreturn` function, LLVM places an `unreachable` after the call. 152 | # this leads to an early `ret` from the function. 
153 | attrs = function_attributes(fun) 154 | delete!(attrs, EnumAttribute("noreturn", 0, JuliaContext())) 155 | 156 | # build a map of basic block predecessors 157 | predecessors = Dict(bb => Set{LLVM.BasicBlock}() for bb in blocks(fun)) 158 | @timeit_debug to "predecessors" for bb in blocks(fun) 159 | insts = instructions(bb) 160 | if !isempty(insts) 161 | inst = last(insts) 162 | if isterminator(inst) 163 | for bb′ in successors(inst) 164 | push!(predecessors[bb′], bb) 165 | end 166 | end 167 | end 168 | end 169 | 170 | # scan for unreachable terminators and alternative successors 171 | worklist = Pair{LLVM.BasicBlock, Union{Nothing,LLVM.BasicBlock}}[] 172 | @timeit_debug to "find" for bb in blocks(fun) 173 | unreachable = terminator(bb) 174 | if isa(unreachable, LLVM.UnreachableInst) 175 | unsafe_delete!(bb, unreachable) 176 | changed = true 177 | 178 | try 179 | terminator(bb) 180 | # the basic-block is still terminated properly, nothing to do 181 | # (this can happen with `ret; unreachable`) 182 | # TODO: `unreachable; unreachable` 183 | catch ex 184 | isa(ex, UndefRefError) || rethrow(ex) 185 | let builder = Builder(JuliaContext()) 186 | position!(builder, bb) 187 | 188 | # find the strict predecessors to this block 189 | preds = collect(predecessors[bb]) 190 | 191 | # find a fallthrough block: recursively look at predecessors 192 | # and find a successor that branches to any other block 193 | fallthrough = nothing 194 | while !isempty(preds) 195 | # find an alternative successor 196 | for pred in preds, succ in successors(terminator(pred)) 197 | if succ != bb 198 | fallthrough = succ 199 | break 200 | end 201 | end 202 | fallthrough === nothing || break 203 | 204 | # recurse upwards 205 | old_preds = copy(preds) 206 | empty!(preds) 207 | for pred in old_preds 208 | append!(preds, predecessors[pred]) 209 | end 210 | end 211 | push!(worklist, bb => fallthrough) 212 | 213 | dispose(builder) 214 | end 215 | end 216 | end 217 | end 218 | 219 | # apply the pending 
terminator rewrites 220 | @timeit_debug to "replace" if !isempty(worklist) 221 | let builder = Builder(JuliaContext()) 222 | for (bb, fallthrough) in worklist 223 | position!(builder, bb) 224 | if fallthrough !== nothing 225 | br!(builder, fallthrough) 226 | else 227 | # couldn't find any other successor. this happens with functions 228 | # that only contain a single block, or when the block is dead. 229 | ft = eltype(llvmtype(fun)) 230 | if return_type(ft) == LLVM.VoidType(JuliaContext()) 231 | # even though returning can lead to invalid control flow, 232 | # it mostly happens with functions that just throw, 233 | # and leaving the unreachable there would make the optimizer 234 | # place another after the call. 235 | ret!(builder) 236 | else 237 | unreachable!(builder) 238 | end 239 | end 240 | end 241 | end 242 | end 243 | 244 | end 245 | return changed 246 | end 247 | 248 | # HACK: this pass removes calls to `trap` and replaces them with inline assembly 249 | # 250 | # if LLVM knows we're trapping, code is marked `unreachable` (see `hide_unreachable!`). 251 | function hide_trap!(mod::LLVM.Module) 252 | job = current_job::CompilerJob 253 | changed = false 254 | @timeit_debug to "hide trap" begin 255 | 256 | # inline assembly to exit a thread, hiding control flow from LLVM 257 | exit_ft = LLVM.FunctionType(LLVM.VoidType(JuliaContext())) 258 | exit = if job.target.cap < v"7" 259 | # ptxas for old compute capabilities has a bug where it messes up the 260 | # synchronization stack in the presence of shared memory and thread-divergend exit. 
# (tail of the preceding lowering pass — header outside this chunk; reformatted, behavior unchanged)
        InlineAsm(exit_ft, "trap;", "", true)
    else
        InlineAsm(exit_ft, "exit;", "", true)
    end

    if haskey(functions(mod), "llvm.trap")
        trap = functions(mod)["llvm.trap"]

        for use in uses(trap)
            val = user(use)
            if isa(val, LLVM.CallInst)
                let builder = Builder(JuliaContext())
                    position!(builder, val)
                    call!(builder, exit)
                    dispose(builder)
                end
                unsafe_delete!(LLVM.parent(val), val)
                changed = true
            end
        end
    end

    end
    return changed
end
--------------------------------------------------------------------------------
/src/irgen.jl:
--------------------------------------------------------------------------------
# LLVM IR generation


## method compilation tracer

# this functionality is used to detect recursion, and functions that shouldn't be called.
# it is a hack, and should disappear over time. don't add new features to it.

# generate a pseudo-backtrace (a vector of StackFrames, caller last) from a stack of
# methods currently being emitted, for attaching to compiler warnings and errors.
function backtrace(job::CompilerJob, call_stack::Vector{Core.MethodInstance})
    bt = StackTraces.StackFrame[]
    for method_instance in call_stack
        method = method_instance.def
        if method.name === :overdub && isdefined(method, :generator)
            # The inline frames are maintained by the dwarf based backtrace, but here we only have the
            # calls to overdub directly, the backtrace therefore is collapsed and we have to
            # lookup the overdubbed function, but only if we likely are using the generated variant.
            actual_sig = Tuple{method_instance.specTypes.parameters[3:end]...}
            m = ccall(:jl_gf_invoke_lookup, Any, (Any, UInt), actual_sig, typemax(UInt))
            method = m.func::Method
        end
        frame = StackTraces.StackFrame(method.name, method.file, method.line)
        pushfirst!(bt, frame)
    end
    bt
end

# NOTE: we use an exception to be able to display a stack trace using the logging framework
struct MethodSubstitutionWarning <: Exception
    original::Method
    substitute::Method
end
Base.showerror(io::IO, err::MethodSubstitutionWarning) =
    print(io, "You called $(err.original), maybe you intended to call $(err.substitute) instead?")

# Base methods for which no GPU-package substitution warning should be emitted
const method_substitution_whitelist = [:hypot, :exp]

# tracks the stack of method instances being emitted, so that we can detect recursion
# and suspicious calls into Base while the Julia compiler runs our emission hooks
mutable struct MethodCompileTracer
    job::CompilerJob
    call_stack::Vector{Core.MethodInstance}
    # set by pop!; only valid after the first method has finished emitting
    last_method_instance::Union{Nothing,Core.MethodInstance}

    MethodCompileTracer(job, start) = new(job, Core.MethodInstance[start])
    MethodCompileTracer(job) = new(job, Core.MethodInstance[])
end

function Base.push!(tracer::MethodCompileTracer, method_instance)
    push!(tracer.call_stack, method_instance)

    if VERSION < v"1.5.0-DEV.393"
        # check for recursion
        if method_instance in tracer.call_stack[1:end-1]
            throw(KernelError(tracer.job, "recursion is currently not supported";
                              bt=backtrace(tracer.job, tracer.call_stack)))
        end
    end

    # check for Base functions that exist in the GPU package
    # FIXME: this might be too coarse
    method = method_instance.def
    if Base.moduleroot(method.module) == Base &&
       isdefined(runtime_module(tracer.job), method_instance.def.name) &&
       !in(method_instance.def.name, method_substitution_whitelist)
        substitute_function = getfield(runtime_module(tracer.job), method.name)
        tt = Tuple{method_instance.specTypes.parameters[2:end]...}
        if hasmethod(substitute_function, tt)
            method′ = which(substitute_function, tt)
            if method′.module == runtime_module(tracer.job)
                @warn "calls to Base intrinsics might be GPU incompatible" exception=(MethodSubstitutionWarning(method, method′), backtrace(tracer.job, tracer.call_stack))
            end
        end
    end
end

function Base.pop!(tracer::MethodCompileTracer, method_instance)
    @compiler_assert last(tracer.call_stack) == method_instance tracer.job
    tracer.last_method_instance = pop!(tracer.call_stack)
end

Base.last(tracer::MethodCompileTracer) = tracer.last_method_instance


## Julia compiler integration

if VERSION >= v"1.5.0-DEV.393"

# JuliaLang/julia#25984 significantly restructured the compiler

# emit `method_instance` (and everything it calls) to LLVM IR via jl_create_native,
# returning the specialized function and the containing module
function compile_method_instance(job::CompilerJob, method_instance::Core.MethodInstance, world)
    # set-up the compiler interface
    tracer = MethodCompileTracer(job, method_instance)
    hook_emit_function(method_instance, code) = push!(tracer, method_instance)
    hook_emitted_function(method_instance, code) = pop!(tracer, method_instance)
    param_kwargs = [:track_allocations  => false,
                    :code_coverage      => false,
                    :static_alloc       => false,
                    :prefer_specsig     => true,
                    :emit_function      => hook_emit_function,
                    :emitted_function   => hook_emitted_function]
    if LLVM.version() >= v"8.0" && VERSION >= v"1.3.0-DEV.547"
        push!(param_kwargs, :gnu_pubnames => false)

        debug_info_kind = if Base.JLOptions().debug_level == 0
            LLVM.API.LLVMDebugEmissionKindNoDebug
        elseif Base.JLOptions().debug_level == 1
            LLVM.API.LLVMDebugEmissionKindLineTablesOnly
        elseif Base.JLOptions().debug_level >= 2
            LLVM.API.LLVMDebugEmissionKindFullDebug
        end

        # LLVM's debug info crashes older CUDA assemblers
        if job.target isa PTXCompilerTarget # && driver_version(job.target) < v"10.2"
            # FIXME: this was supposed to be fixed on 10.2
            @debug "Incompatibility detected between CUDA and LLVM 8.0+; disabling debug info emission" maxlog=1
            debug_info_kind = LLVM.API.LLVMDebugEmissionKindNoDebug
        end

        push!(param_kwargs, :debug_info_kind => Cint(debug_info_kind))
    end
    params = Base.CodegenParams(;param_kwargs...)

    # generate IR
    if VERSION >= v"1.5.0-DEV.851"
        native_code = ccall(:jl_create_native, Ptr{Cvoid},
                            (Vector{Core.MethodInstance}, Base.CodegenParams, Cint),
                            [method_instance], params, #=extern policy=# 1)
    else
        native_code = ccall(:jl_create_native, Ptr{Cvoid},
                            (Vector{Core.MethodInstance}, Base.CodegenParams),
                            [method_instance], params)
    end
    @assert native_code != C_NULL
    llvm_mod_ref = ccall(:jl_get_llvm_module, LLVM.API.LLVMModuleRef,
                         (Ptr{Cvoid},), native_code)
    @assert llvm_mod_ref != C_NULL
    llvm_mod = LLVM.Module(llvm_mod_ref)

    # get the top-level code
    code = if VERSION >= v"1.6.0-DEV.12"
        # TODO: use our own interpreter
        interpreter = Core.Compiler.NativeInterpreter(world)
        Core.Compiler.inf_for_methodinstance(interpreter, method_instance, world, world)
    else
        Core.Compiler.inf_for_methodinstance(method_instance, world, world)
    end

    # get the top-level function index
    # NOTE: a stray `ccall(:jl_breakpoint, ...)` debugging leftover was removed here
    llvm_func_idx = Ref{Int32}(-1)
    llvm_specfunc_idx = Ref{Int32}(-1)
    ccall(:jl_get_function_id, Nothing,
          (Ptr{Cvoid}, Any, Ptr{Int32}, Ptr{Int32}),
          native_code, code, llvm_func_idx, llvm_specfunc_idx)
    @assert llvm_func_idx[] != -1
    @assert llvm_specfunc_idx[] != -1

    # get the top-level function
    llvm_func_ref = ccall(:jl_get_llvm_function, LLVM.API.LLVMValueRef,
                          (Ptr{Cvoid}, UInt32), native_code, llvm_func_idx[]-1)
    @assert llvm_func_ref != C_NULL
    llvm_func = LLVM.Function(llvm_func_ref)
    llvm_specfunc_ref = ccall(:jl_get_llvm_function, LLVM.API.LLVMValueRef,
                              (Ptr{Cvoid}, UInt32), native_code, llvm_specfunc_idx[]-1)
    @assert llvm_specfunc_ref != C_NULL
    llvm_specfunc = LLVM.Function(llvm_specfunc_ref)

    # configure the module
    triple!(llvm_mod, llvm_triple(job.target))
    if llvm_datalayout(job.target) !== nothing
        datalayout!(llvm_mod, llvm_datalayout(job.target))
    end

    return llvm_specfunc, llvm_mod
end

else

function module_setup(job::CompilerJob, mod::LLVM.Module)
    # configure the module
    triple!(mod, llvm_triple(job.target))
    datalayout!(mod, llvm_datalayout(job.target))

    # add debug info metadata
    if LLVM.version() >= v"8.0"
        # Set Dwarf Version to 2, the DI printer will downgrade to v2 automatically,
        # but this is technically correct and the only version supported by NVPTX
        LLVM.flags(mod)["Dwarf Version", LLVM.API.LLVMModuleFlagBehaviorWarning] =
            Metadata(ConstantInt(Int32(2), JuliaContext()))
        LLVM.flags(mod)["Debug Info Version", LLVM.API.LLVMModuleFlagBehaviorError] =
            Metadata(ConstantInt(DEBUG_METADATA_VERSION(), JuliaContext()))
    else
        push!(metadata(mod), "llvm.module.flags",
              MDNode([ConstantInt(Int32(1), JuliaContext()),    # llvm::Module::Error
                      MDString("Debug Info Version"),
                      ConstantInt(DEBUG_METADATA_VERSION(), JuliaContext())]))
    end
end

# pre-1.5 code path: emit via jl_get_llvmf_defn and manually link dependent modules
function compile_method_instance(job::CompilerJob, method_instance::Core.MethodInstance, world)
    function postprocess(ir)
        # get rid of jfptr wrappers
        for llvmf in functions(ir)
            startswith(LLVM.name(llvmf), "jfptr_") && unsafe_delete!(ir, llvmf)
        end

        return
    end

    # set-up the compiler interface
    tracer = MethodCompileTracer(job)
    hook_emit_function(method_instance, code, world) = push!(tracer, method_instance)
    hook_emitted_function(method_instance, code, world) = pop!(tracer, method_instance)
    dependencies = MultiDict{Core.MethodInstance,LLVM.Function}()
    function hook_module_setup(ref::Ptr{Cvoid})
        ref = convert(LLVM.API.LLVMModuleRef, ref)
        ir = LLVM.Module(ref)
        module_setup(job, ir)
    end
    function hook_module_activation(ref::Ptr{Cvoid})
        ref = convert(LLVM.API.LLVMModuleRef, ref)
        ir = LLVM.Module(ref)
        postprocess(ir)

        # find the function that this module defines
        llvmfs = filter(llvmf -> !isdeclaration(llvmf) &&
                                 linkage(llvmf) == LLVM.API.LLVMExternalLinkage,
                        collect(functions(ir)))

        llvmf = nothing
        if length(llvmfs) == 1
            llvmf = first(llvmfs)
        elseif length(llvmfs) > 1
            llvmfs = filter!(llvmf -> startswith(LLVM.name(llvmf), "julia_"), llvmfs)
            if length(llvmfs) == 1
                llvmf = first(llvmfs)
            end
        end

        @compiler_assert llvmf !== nothing job

        insert!(dependencies, last(tracer), llvmf)
    end
    param_kwargs = [:cached             => false,
                    :track_allocations  => false,
                    :code_coverage      => false,
                    :static_alloc       => false,
                    :prefer_specsig     => true,
                    :module_setup       => hook_module_setup,
                    :module_activation  => hook_module_activation,
                    :emit_function      => hook_emit_function,
                    :emitted_function   => hook_emitted_function]
    if LLVM.version() >= v"8.0" && VERSION >= v"1.3.0-DEV.547"
        push!(param_kwargs, :gnu_pubnames => false)

        debug_info_kind = if Base.JLOptions().debug_level == 0
            LLVM.API.LLVMDebugEmissionKindNoDebug
        elseif Base.JLOptions().debug_level == 1
            LLVM.API.LLVMDebugEmissionKindLineTablesOnly
        elseif Base.JLOptions().debug_level >= 2
            LLVM.API.LLVMDebugEmissionKindFullDebug
        end

        # LLVM's debug info crashes older CUDA assemblers
        if job.target isa PTXCompilerTarget # && driver_version(job.target) < v"10.2"
            # FIXME: this was supposed to be fixed on 10.2
            @debug "Incompatibility detected between CUDA and LLVM 8.0+; disabling debug info emission" maxlog=1
            debug_info_kind = LLVM.API.LLVMDebugEmissionKindNoDebug
        end

        push!(param_kwargs, :debug_info_kind => Cint(debug_info_kind))
    end
    params = Base.CodegenParams(;param_kwargs...)

    # get the code
    ref = ccall(:jl_get_llvmf_defn, LLVM.API.LLVMValueRef,
                (Any, UInt, Bool, Bool, Base.CodegenParams),
                method_instance, world, #=wrapper=#false, #=optimize=#false, params)
    if ref == C_NULL
        throw(InternalCompilerError(job, "the Julia compiler could not generate LLVM IR"))
    end
    llvmf = LLVM.Function(ref)
    ir = LLVM.parent(llvmf)
    postprocess(ir)

    # link in dependent modules
    entry = llvmf
    mod = LLVM.parent(entry)
    @timeit_debug to "linking" begin
        # we disable Julia's compilation cache not to poison it with GPU-specific code.
        # as a result, we might get multiple modules for a single method instance.
        cache = Dict{String,String}()

        for called_method_instance in keys(dependencies)
            llvmfs = dependencies[called_method_instance]

            # link the first module
            llvmf = popfirst!(llvmfs)
            llvmfn = LLVM.name(llvmf)
            link!(mod, LLVM.parent(llvmf))

            # process subsequent duplicate modules
            for dup_llvmf in llvmfs
                if Base.JLOptions().debug_level >= 2
                    # link them too, to ensure accurate backtrace reconstruction
                    link!(mod, LLVM.parent(dup_llvmf))
                else
                    # don't link them, but note the called function name in a cache
                    dup_llvmfn = LLVM.name(dup_llvmf)
                    cache[dup_llvmfn] = llvmfn
                end
            end
        end

        # resolve function declarations with cached entries
        for llvmf in filter(isdeclaration, collect(functions(mod)))
            llvmfn = LLVM.name(llvmf)
            if haskey(cache, llvmfn)
                def_llvmfn = cache[llvmfn]
                replace_uses!(llvmf, functions(mod)[def_llvmfn])

                @compiler_assert isempty(uses(llvmf)) job
                unsafe_delete!(LLVM.parent(llvmf), llvmf)
            end
        end
    end

    return entry, mod
end

end

# main entry point: generate LLVM IR for a method instance, clean it up, rename and
# (for kernels) promote the entry point, and run the minimal lowering passes.
# returns the module and the entry function.
function irgen(job::CompilerJob, method_instance::Core.MethodInstance, world)
    entry, mod = @timeit_debug to "emission" compile_method_instance(job, method_instance, world)

    # clean up incompatibilities
    @timeit_debug to "clean-up" begin
        for llvmf in functions(mod)
            # only occurs in debug builds
            delete!(function_attributes(llvmf), EnumAttribute("sspstrong", 0, JuliaContext()))

            if VERSION < v"1.5.0-DEV.393"
                # make function names safe for ptxas
                llvmfn = LLVM.name(llvmf)
                if !isdeclaration(llvmf)
                    llvmfn′ = safe_name(llvmfn)
                    if llvmfn != llvmfn′
                        LLVM.name!(llvmf, llvmfn′)
                        llvmfn = llvmfn′
                    end
                end
            end

            if Sys.iswindows()
                personality!(llvmf, nothing)
            end
        end

        # remove the exception-handling personality function
        # (haskey, matching the other lookups in this file; `in` iterates Function values
        # and would never match a String)
        if Sys.iswindows() && haskey(functions(mod), "__julia_personality")
            llvmf = functions(mod)["__julia_personality"]
            @compiler_assert isempty(uses(llvmf)) job
            unsafe_delete!(mod, llvmf)
        end
    end

    # target-specific processing
    process_module!(job, mod)

    # rename the entry point
    if job.source.name !== nothing
        llvmfn = safe_name(string("julia_", job.source.name))
    else
        # strip the globalUnique counter
        llvmfn = LLVM.name(entry)
    end
    LLVM.name!(entry, llvmfn)

    # promote entry-points to kernels and mangle their names
    if job.source.kernel
        entry = promote_kernel!(job, mod, entry)
        LLVM.name!(entry, mangle_call(entry, job.source.tt))
    end

    # minimal required optimization
    @timeit_debug to "rewrite" ModulePassManager() do pm
        global current_job
        current_job = job

        linkage!(entry, LLVM.API.LLVMExternalLinkage)
        internalize!(pm, [LLVM.name(entry)])

        can_throw(job) || add!(pm, ModulePass("LowerThrow", lower_throw!))

        add_lowering_passes!(job, pm)

        run!(pm, mod)

        # NOTE: if an optimization is missing, try scheduling an entirely new optimization
        # to see which passes need to be added to the target-specific list
        #LLVM.clopts("-print-after-all", "-filter-print-funcs=$(LLVM.name(entry))")
        #ModulePassManager() do pm
        #    add_library_info!(pm, triple(mod))
        #    add_transform_info!(pm, tm)
        #    PassManagerBuilder() do pmb
        #        populate!(pm, pmb)
        #    end
        #    run!(pm, mod)
        #end
    end

    return mod, entry
end


## name mangling

# we generate function names that look like C++ functions, because many NVIDIA tools
# support them, e.g., grouping different instantiations of the same kernel together.

# mangle a single type parameter in an Itanium-like fashion
function mangle_param(t)
    t == Nothing && return "v"

    if isa(t, DataType) || isa(t, Core.Function)
        tn = safe_name(t)
        str = "$(length(tn))$tn"

        # only DataTypes carry type parameters; function objects have no `parameters`
        # field, so guard the access to avoid erroring on them
        if isa(t, DataType) && !isempty(t.parameters)
            str *= "I"
            for t in t.parameters
                str *= mangle_param(t)
            end
            str *= "E"
        end

        str
    elseif isa(t, Integer)
        # literal integer parameter
        "Li$(t)E"
    else
        tn = safe_name(t)
        "$(length(tn))$tn"
    end
end

# mangle a call to `f` with argument types `tt` into a C++-style symbol name
function mangle_call(f, tt)
    fn = safe_name(f)
    str = "_Z$(length(fn))$fn"

    for t in tt.parameters
        str *= mangle_param(t)
    end

    return str
end

# make names safe for ptxas
safe_name(fn::String) = replace(fn, r"[^A-Za-z0-9_]"=>"_")
safe_name(f::Union{Core.Function,DataType}) = safe_name(String(nameof(f)))
safe_name(f::LLVM.Function) = safe_name(LLVM.name(f))
safe_name(x) = safe_name(repr(x))


## exception handling

# this pass lowers `jl_throw` and friends to GPU-compatible exceptions.
# this isn't strictly necessary, but has a couple of advantages:
# - we can kill off unused exception arguments that otherwise would allocate or invoke
# - we can fake debug information (lacking a stack unwinder)
#
# once we have thorough inference (ie. discarding `@nospecialize` and thus supporting
# exception arguments) and proper debug info to unwind the stack, this pass can go.
function lower_throw!(mod::LLVM.Module)
    job = current_job::CompilerJob
    changed = false
    @timeit_debug to "lower throw" begin

    throw_functions = Dict{String,String}(
        "jl_throw"                      => "exception",
        "jl_error"                      => "error",
        "jl_too_few_args"               => "too few arguments exception",
        "jl_too_many_args"              => "too many arguments exception",
        "jl_type_error"                 => "type error",
        "jl_type_error_rt"              => "type error",
        "jl_undefined_var_error"        => "undefined variable error",
        "jl_bounds_error"               => "bounds error",
        "jl_bounds_error_v"             => "bounds error",
        "jl_bounds_error_int"           => "bounds error",
        "jl_bounds_error_tuple_int"     => "bounds error",
        "jl_bounds_error_unboxed_int"   => "bounds error",
        "jl_bounds_error_ints"          => "bounds error",
        "jl_eof_error"                  => "EOF error"
    )

    for (fn, name) in throw_functions
        if haskey(functions(mod), fn)
            f = functions(mod)[fn]

            for use in uses(f)
                call = user(use)::LLVM.CallInst

                # replace the throw with a PTX-compatible exception
                let builder = Builder(JuliaContext())
                    position!(builder, call)
                    emit_exception!(builder, name, call)
                    dispose(builder)
                end

                # remove the call
                call_args = collect(operands(call))[1:end-1] # last arg is function itself
                unsafe_delete!(LLVM.parent(call), call)

                # HACK: kill the exceptions' unused arguments
                for arg in call_args
                    # peek through casts
                    if isa(arg, LLVM.AddrSpaceCastInst)
                        cast = arg
                        arg = first(operands(cast))
                        isempty(uses(cast)) && unsafe_delete!(LLVM.parent(cast), cast)
                    end

                    if isa(arg, LLVM.Instruction) && isempty(uses(arg))
                        unsafe_delete!(LLVM.parent(arg), arg)
                    end
                end

                changed = true
            end

            @compiler_assert isempty(uses(f)) job
        end
    end

    end
    return changed
end

# report an exception in a GPU-compatible manner
#
# the exact behavior depends on the debug level. in all cases, a `trap` will be emitted. On
# debug level 1, the exception name will be printed, and on debug level 2 the individual
# stack frames (as recovered from the LLVM debug information) will be printed as well.
function emit_exception!(builder, name, inst)
    job = current_job::CompilerJob
    bb = position(builder)
    fun = LLVM.parent(bb)
    mod = LLVM.parent(fun)

    # report the exception
    if Base.JLOptions().debug_level >= 1
        name = globalstring_ptr!(builder, name, "exception")
        if Base.JLOptions().debug_level == 1
            call!(builder, Runtime.get(:report_exception), [name])
        else
            call!(builder, Runtime.get(:report_exception_name), [name])
        end
    end

    # report each frame
    if Base.JLOptions().debug_level >= 2
        rt = Runtime.get(:report_exception_frame)
        bt = backtrace(inst)
        for (i,frame) in enumerate(bt)
            idx = ConstantInt(rt.llvm_types[1], i)
            func = globalstring_ptr!(builder, String(frame.func), "di_func")
            file = globalstring_ptr!(builder, String(frame.file), "di_file")
            line = ConstantInt(rt.llvm_types[4], frame.line)
            call!(builder, rt, [idx, func, file, line])
        end
    end

    # signal the exception
    call!(builder, Runtime.get(:signal_exception))

    emit_trap!(job, builder, mod, inst)
end

# emit a call to `llvm.trap`, declaring the intrinsic if the module lacks it
function emit_trap!(job::CompilerJob, builder, mod, inst)
    trap = if haskey(functions(mod), "llvm.trap")
        functions(mod)["llvm.trap"]
    else
        LLVM.Function(mod, "llvm.trap", LLVM.FunctionType(LLVM.VoidType(JuliaContext())))
    end
    call!(builder, trap)
end


## kernel promotion

# promote a function to a kernel
function promote_kernel!(job::CompilerJob, mod::LLVM.Module, kernel::LLVM.Function)
    # pass non-opaque pointer arguments by value (this improves performance,
    # and is mandated by certain back-ends like SPIR-V). only do so for values
    # that aren't a Julia pointer, so we can still pass those directly.
    kernel_ft = eltype(llvmtype(kernel)::LLVM.PointerType)::LLVM.FunctionType
    kernel_sig = Base.signature_type(job.source.f, job.source.tt)::Type
    kernel_types = filter(dt->!isghosttype(dt) &&
                              (VERSION < v"1.5.0-DEV.581" || !Core.Compiler.isconstType(dt)),
                          [kernel_sig.parameters...])
    @compiler_assert length(kernel_types) == length(parameters(kernel_ft)) job
    for (i, (param_ft,arg_typ)) in enumerate(zip(parameters(kernel_ft), kernel_types))
        if param_ft isa LLVM.PointerType && issized(eltype(param_ft)) &&
           !(arg_typ <: Ptr) && !(VERSION >= v"1.5" && arg_typ <: Core.LLVMPtr)
            push!(parameter_attributes(kernel, i), EnumAttribute("byval"))
        end
    end

    # target-specific processing
    kernel = process_kernel!(job, mod, kernel)

    return kernel
end
--------------------------------------------------------------------------------