├── .gitignore ├── bors.toml ├── REQUIRE ├── docs ├── Project.toml ├── make.jl └── src │ └── index.md ├── test ├── gpuenv │ ├── Manifest.toml │ └── Project.toml ├── testenv │ ├── Manifest.toml │ └── Project.toml ├── runtests.jl ├── examples.jl └── test.jl ├── .travis.yml ├── examples ├── simple.jl ├── contextualize.jl └── shmem.jl ├── Project.toml ├── src ├── shmem.jl ├── scratch.jl ├── loopinfo.jl ├── GPUifyLoops.jl └── context.jl ├── README.md ├── LICENSE.md └── .gitlab-ci.yml /.gitignore: -------------------------------------------------------------------------------- 1 | docs/build/ 2 | -------------------------------------------------------------------------------- /bors.toml: -------------------------------------------------------------------------------- 1 | status = [ 2 | "ci/gitlab/%" 3 | ] 4 | -------------------------------------------------------------------------------- /REQUIRE: -------------------------------------------------------------------------------- 1 | julia 1.1 2 | Requires 3 | Cassette 4 | StaticArrays 5 | -------------------------------------------------------------------------------- /docs/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" 3 | 4 | [compat] 5 | Documenter = "~0.19" 6 | -------------------------------------------------------------------------------- /test/gpuenv/Manifest.toml: -------------------------------------------------------------------------------- 1 | [[GPUifyLoops]] 2 | path = "../.." 3 | uuid = "ba82f77b-6841-5d2e-bd9f-4daf811aec27" 4 | version = "0.1.0" 5 | -------------------------------------------------------------------------------- /test/testenv/Manifest.toml: -------------------------------------------------------------------------------- 1 | [[GPUifyLoops]] 2 | path = "../.." 
3 | uuid = "ba82f77b-6841-5d2e-bd9f-4daf811aec27" 4 | version = "0.1.0" 5 | -------------------------------------------------------------------------------- /test/runtests.jl: -------------------------------------------------------------------------------- 1 | using GPUifyLoops 2 | using Test 3 | 4 | @testset "Unittests" begin 5 | include("test.jl") 6 | end 7 | 8 | include("examples.jl") 9 | 10 | -------------------------------------------------------------------------------- /docs/make.jl: -------------------------------------------------------------------------------- 1 | using Documenter, GPUifyLoops 2 | 3 | makedocs( 4 | modules = [GPUifyLoops], 5 | format = :html, 6 | sitename = "GPUifyLoops.jl", 7 | pages = [ 8 | "Home" => "index.md", 9 | ], 10 | doctest = true 11 | ) 12 | 13 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | ## Documentation: http://docs.travis-ci.com/user/languages/julia/ 2 | language: julia 3 | os: 4 | - linux 5 | - osx 6 | julia: 7 | - 1.0 8 | - 1.1 9 | - 1.2 10 | - 1.3 11 | - nightly 12 | notifications: 13 | email: false 14 | git: 15 | depth: 99999999 16 | -------------------------------------------------------------------------------- /test/testenv/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | GPUifyLoops = "ba82f77b-6841-5d2e-bd9f-4daf811aec27" 3 | InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" 4 | Requires = "ae029012-a4dd-5104-9daa-d747884805df" 5 | StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" 6 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 7 | -------------------------------------------------------------------------------- /test/gpuenv/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" 3 | CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" 4 | GPUifyLoops = "ba82f77b-6841-5d2e-bd9f-4daf811aec27" 5 | InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" 6 | Requires = "ae029012-a4dd-5104-9daa-d747884805df" 7 | StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" 8 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 9 | -------------------------------------------------------------------------------- /examples/simple.jl: -------------------------------------------------------------------------------- 1 | using GPUifyLoops 2 | 3 | function kernel(A) 4 | @loop for i in (1:size(A,1); 5 | threadIdx().x) 6 | A[i] = 2*A[i] 7 | end 8 | @synchronize 9 | end 10 | 11 | data = Array{Float32}(undef, 1024) 12 | kernel(data) 13 | 14 | @static if Base.find_package("CuArrays") !== nothing 15 | using CuArrays 16 | using CUDAnative 17 | 18 | kernel(A::CuArray) = @launch CUDA() kernel(A, threads=length(A)) 19 | 20 | data = CuArray{Float32}(undef, 1024) 21 | kernel(data) 22 | end 23 | 24 | -------------------------------------------------------------------------------- /Project.toml: -------------------------------------------------------------------------------- 1 | name = "GPUifyLoops" 2 | uuid = "ba82f77b-6841-5d2e-bd9f-4daf811aec27" 3 | authors = ["Valentin Churavy "] 4 | version = "0.2.8" 5 | 6 | [deps] 7 | Cassette = "7057c7e9-c182-5462-911a-8362d720325c" 8 | Requires = "ae029012-a4dd-5104-9daa-d747884805df" 9 | StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" 10 | 11 | [compat] 12 | julia = ">= 1.1" 13 | 14 | [extras] 15 | InteractiveUtils = 
"b77e0a4c-d291-57a0-90e8-8db25a27a240" 16 | Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" 17 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 18 | 19 | [targets] 20 | test = ["Test", "Pkg", "InteractiveUtils"] 21 | -------------------------------------------------------------------------------- /examples/contextualize.jl: -------------------------------------------------------------------------------- 1 | using GPUifyLoops 2 | 3 | f1(x) = sin(x) 4 | f(x) = 1 + f1(x) 5 | 6 | function kernel!(A, B, h) 7 | @inbounds @loop for i in (1:size(A,1); threadIdx().x) 8 | A[i] = h(B[i]) 9 | end 10 | nothing 11 | end 12 | 13 | data = rand(Float32, 1024) 14 | fdata = similar(data) 15 | kernel!(fdata, data, f) 16 | 17 | @assert f.(data) ≈ fdata 18 | 19 | @static if Base.find_package("CuArrays") !== nothing 20 | using CuArrays 21 | using CUDAnative 22 | 23 | function kernel!(A::CuArray, B::CuArray) 24 | @launch CUDA() threads=length(A) kernel!(A, B, f) 25 | end 26 | 27 | cudata = CuArray(data) 28 | cufdata = similar(cudata) 29 | kernel!(cufdata, cudata) 30 | 31 | @assert f.(data) ≈ cufdata 32 | end 33 | -------------------------------------------------------------------------------- /examples/shmem.jl: -------------------------------------------------------------------------------- 1 | using GPUifyLoops 2 | 3 | function kernel3!(A) 4 | s1 = @shmem eltype(A) (1024,) 5 | s2 = @shmem eltype(A) (1024,) 6 | 7 | @loop for i in (1:size(A,1); threadIdx().x) 8 | s1[i] = 2*A[i] 9 | s2[i] = 3*A[i] 10 | end 11 | @synchronize 12 | @loop for i in (1:size(A,1); threadIdx().x) 13 | A[i] = s1[i] 14 | end 15 | nothing 16 | end 17 | 18 | data = rand(Float32, 1024) 19 | cpudata = copy(data) 20 | 21 | @launch CPU() kernel3!(cpudata) 22 | @assert cpudata ≈ 2 .* data 23 | 24 | @static if Base.find_package("CuArrays") !== nothing 25 | using CuArrays 26 | using CUDAnative 27 | 28 | cudata = CuArray(data) 29 | @launch CUDA() threads=length(cudata) kernel3!(cudata) 30 | @assert Array(cudata) ≈ 2 .* data 31 | end 32 | -------------------------------------------------------------------------------- /src/shmem.jl: -------------------------------------------------------------------------------- 1 | __size(args::Tuple) = Tuple{args...} 2 | __size(i::Int) = Tuple{i} 3 | 4 | __shmem(D::Device, args...) 
= throw(MethodError(__shmem, (D, args...))) 5 | @inline __shmem(::CPU, ::Type{T}, ::Val{dims}, ::Val) where {T, dims} =MArray{__size(dims), T}(undef) 6 | 7 | @init @require CUDAnative="be33ccc6-a3ff-5ff2-a52e-74243cff1e17" begin 8 | using .CUDAnative 9 | 10 | @inline function __shmem(::CUDA, ::Type{T}, ::Val{dims}, ::Val{id}) where {T, dims, id} 11 | ptr = CUDAnative._shmem(Val(id), T, Val(prod(dims))) 12 | CUDAnative.CuDeviceArray(dims, CUDAnative.DevicePtr{T, CUDAnative.AS.Shared}(ptr)) 13 | end 14 | end 15 | 16 | shmem_id = 0 17 | macro shmem(T, dims) 18 | global shmem_id 19 | id = shmem_id::Int += 1 20 | 21 | quote 22 | $__shmem($backend(), $(esc(T)), Val($(esc(dims))), Val($id)) 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /test/examples.jl: -------------------------------------------------------------------------------- 1 | @testset "examples" begin 2 | 3 | function find_sources(path::String, sources=String[]) 4 | if isdir(path) 5 | for entry in readdir(path) 6 | find_sources(joinpath(path, entry), sources) 7 | end 8 | elseif endswith(path, ".jl") 9 | push!(sources, path) 10 | end 11 | sources 12 | end 13 | 14 | examples_dir = joinpath(@__DIR__, "..", "examples") 15 | examples = find_sources(examples_dir) 16 | filter!(file -> readline(file) != "# EXCLUDE FROM TESTING", examples) 17 | 18 | cd(examples_dir) do 19 | examples = relpath.(examples, Ref(examples_dir)) 20 | @testset for example in examples 21 | cmd = ```$(Base.julia_cmd()) --project=$(Base.current_project()) 22 | -e 'using Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.build()' 23 | -L $example 24 | ``` 25 | @test success(pipeline(cmd, stderr=stderr)) 26 | end 27 | end 28 | 29 | end 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | GPUifyLoops.jl 2 | ============== 3 | *Support for writing loop-based code that executes both on CPU and GPU* 4 | 5 | [![][docs-latest-img]][docs-latest-url] 6 | 7 | [docs-latest-img]: https://img.shields.io/badge/docs-latest-blue.svg 8 | [docs-latest-url]: https://juliagpu.gitlab.io/GPUifyLoops.jl/ 9 | 10 | Installation 11 | ------------ 12 | 13 | GPUifyLoops is a registered package, and can be installed using the Julia package 14 | manager. 15 | 16 | ```julia 17 | julia>] 18 | (v1.1) pkg> add GPUifyLoops 19 | ``` 20 | 21 | **Note**: The current version of this package requires Julia 1.1. 22 | 23 | Development 24 | ----------- 25 | 26 | In order to test this package locally you need to do: 27 | 28 | ``` 29 | julia --project=test/gpuenv 30 | julia> ] 31 | (gpuenv) pkg> resolve 32 | (gpuenv) pkg> instantiate 33 | ``` 34 | 35 | This will resolve the GPU environment, please do not checking changes to `test/gpuenv/`. 36 | Then you can run the tests with `julia --project=test/gpuenv test/runtests.jl` 37 | 38 | License 39 | ------- 40 | 41 | GPUifyLoops.jl is licensed under [MIT license](LICENSE.md). 42 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright © 2015: Simon Kornblith. 
4 | Copyright © 2018-2019: Valentin Churavy, and other contributors 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /src/scratch.jl: -------------------------------------------------------------------------------- 1 | using StaticArrays 2 | 3 | """ 4 | @scratch T Dims M 5 | 6 | Allocates scratch memory. 7 | - `T` type of array 8 | - `Dims` is a tuple of array dimensions 9 | - `M` the number of dimensions at the tail that are implicit on the GPU 10 | """ 11 | macro scratch(T, Dims, M) 12 | @assert Dims.head == :tuple 13 | dims = Dims.args 14 | N = length(dims) - M 15 | gpudims = ntuple(i->dims[i], N) 16 | esc(quote 17 | if !$isdevice() 18 | $MArray{Tuple{$(dims...)}, $T}(undef) 19 | else 20 | data = if $(length(gpudims)) > 0 21 | $ScratchArray{$N}( 22 | $MArray{Tuple{$(gpudims...)}, $T}(undef) 23 | ) 24 | else 25 | $ScratchArray{$N}( 26 | $MArray{Tuple{1}, $T}(undef) 27 | ) 28 | end 29 | end 30 | end) 31 | end 32 | 33 | struct ScratchArray{N, D} 34 | data::D 35 | ScratchArray{N}(data::D) where {N, D} = new{N, D}(data) 36 | ScratchArray{N, T}() where {N, T} = new{N, T}() 37 | end 38 | 39 | 40 | Base.@propagate_inbounds function Base.getindex(A::ScratchArray{N}, I...) where N 41 | nI = ntuple(i->I[i], N) 42 | if nI == () 43 | return A.data[1] 44 | end 45 | return A.data[nI...] 46 | end 47 | 48 | Base.@propagate_inbounds function Base.setindex!(A::ScratchArray{N}, val, I...) where N 49 | nI = ntuple(i->I[i], N) 50 | if nI == () 51 | return A.data .= val 52 | end 53 | A.data[nI...] = val 54 | end 55 | 56 | -------------------------------------------------------------------------------- /src/loopinfo.jl: -------------------------------------------------------------------------------- 1 | module LoopInfo 2 | 3 | const HAS_LOOPINFO_EXPR = VERSION >= v"1.2.0-DEV.462" 4 | export @unroll 5 | 6 | ## 7 | # Uses the loopinfo expr node to attach LLVM loopinfo to loops 8 | # the full list of supported metadata nodes is available at 9 | # https://llvm.org/docs/LangRef.html#llvm-loop 10 | # TODO: Figure out how to deal with compile-time constants in `@unroll(N, expr)` 11 | # so constants that come from `Val{N}` but are not parse time constant. 12 | # Most likely will require changes to base Julia. 
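# Illustration (derived from the helpers below): `@unroll 4 for i in 1:N ... end`
# pushes an `Expr(:loopinfo, (Symbol("llvm.loop.unroll.count"), 4))` node into the
# loop body, which Julia >= v"1.2.0-DEV.462" lowers to `!llvm.loop` metadata that
# the LLVM unroller then honors.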
13 | ## 14 | 15 | module MD 16 | unroll_count(n) = (Symbol("llvm.loop.unroll.count"), convert(Int, n)) 17 | unroll_disable() = (Symbol("llvm.loop.unroll.disable"), 1) 18 | unroll_enable() = (Symbol("llvm.loop.unroll.enable"), 1) 19 | unroll_full() = (Symbol("llvm.loop.unroll.full"), 1) 20 | end 21 | 22 | function loopinfo(expr, nodes...) 23 | if expr.head != :for 24 | error("Syntax error: loopinfo needs a for loop") 25 | end 26 | if HAS_LOOPINFO_EXPR 27 | push!(expr.args[2].args, Expr(:loopinfo, nodes...)) 28 | end 29 | return expr 30 | end 31 | 32 | """ 33 | @unroll expr 34 | 35 | Takes a for loop as `expr` and informs the LLVM unroller to fully unroll it, if 36 | it is safe to do so and the loop count is known. 37 | """ 38 | macro unroll(expr) 39 | expr = loopinfo(expr, MD.unroll_full()) 40 | return esc(expr) 41 | end 42 | 43 | """ 44 | @unroll N expr 45 | 46 | Takes a for loop as `expr` and informs the LLVM unroller to unroll it `N` times, 47 | if it is safe to do so. 48 | """ 49 | macro unroll(N, expr) 50 | if !(N isa Integer) 51 | error("Syntax error: `@unroll N expr` needs a constant integer N") 52 | end 53 | expr = loopinfo(expr, MD.unroll_count(N)) 54 | return esc(expr) 55 | end 56 | 57 | end #module 58 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | include: 2 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v3/common.yml' 3 | 4 | .projecttest: 5 | extends: .test 6 | script: 7 | - julia -e 'using InteractiveUtils; 8 | versioninfo()' 9 | - mkdir $JULIA_DEPOT_PATH 10 | - julia --project=$CI_JULIA_PROJECT -e ' 11 | using Pkg; 12 | Pkg.resolve(); 13 | Pkg.instantiate(); 14 | Pkg.build(); 15 | include("test/runtests.jl");' 16 | .gputest: 17 | extends: .projecttest 18 | variables: 19 | CI_IMAGE_TAG: 'cuda' 20 | CI_JULIA_PROJECT: 'test/gpuenv' 21 | tags: 22 | - cuda 23 | 24 | .cputest: 25 | extends: .projecttest 26 | variables: 27 | CI_IMAGE_TAG: 'plain' 28 | CI_JULIA_PROJECT: 'test/testenv' 29 | 30 | gpu:test:dev: 31 | extends: .gputest 32 | variables: 33 | CI_VERSION_TAG: 'dev' 34 | allow_failure: true 35 | 36 | cpu:test:dev: 37 | extends: .cputest 38 | variables: 39 | CI_VERSION_TAG: 'dev' 40 | allow_failure: true 41 | 42 | gpu:test:v1.1: 43 | extends: .gputest 44 | variables: 45 | CI_VERSION_TAG: 'v1.1' 46 | 47 | cpu:test:v1.1: 48 | extends: .cputest 49 | variables: 50 | CI_VERSION_TAG: 'v1.1' 51 | 52 | gpu:test:v1.2: 53 | extends: .gputest 54 | variables: 55 | CI_VERSION_TAG: 'v1.2' 56 | 57 | cpu:test:v1.2: 58 | extends: .cputest 59 | variables: 60 | CI_VERSION_TAG: 'v1.2' 61 | 62 | documentation: 63 | extends: .documentation 64 | dependencies: 65 | - cpu:test:v1.1 66 | variables: 67 | CI_VERSION_TAG: 'v1.1' 68 | CI_IMAGE_TAG: 'plain' 69 | only: 70 | - master 71 | - staging 72 | - trying 73 | 74 | pages: 75 | dependencies: 76 | - documentation 77 | stage: deploy 78 | script: 79 | - mv docs/build public 80 | artifacts: 81 | paths: 82 | - public 83 | only: 84 | - master 85 | 86 | -------------------------------------------------------------------------------- /docs/src/index.md: -------------------------------------------------------------------------------- 1 | # GPUifyLoops.jl 2 | 3 | GPUifyLoops tries to solve the problem of code-duplication that can occur 4 | when writing performant kernels that target multiple devices. 
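The snippet below is a minimal sketch of that idea (it mirrors `examples/simple.jl`,
which is included in full under Examples): the loop body is written once, iterates
over the full range on the CPU, and over `threadIdx().x` when launched on a GPU.

```julia
using GPUifyLoops

function kernel(A)
    @loop for i in (1:size(A, 1);
                    threadIdx().x)
        A[i] = 2 * A[i]
    end
    @synchronize
end

data = Array{Float32}(undef, 1024)
kernel(data)  # plain call on the CPU; `@launch CUDA() threads=length(A) kernel(A)` targets the GPU
```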
5 | 6 | ## API 7 | 8 | ```@docs 9 | @loop 10 | @setup 11 | @synchronize 12 | ``` 13 | 14 | ## Examples 15 | ### Simple 16 | 17 | ````@eval 18 | using Markdown 19 | Markdown.parse(""" 20 | ```julia 21 | $(read("../../examples/simple.jl", String)) 22 | ``` 23 | """) 24 | ```` 25 | 26 | ## Other useful tools 27 | ### Loop unrolling 28 | 29 | On Julia `v1.2.0-DEV.462` we can pass information to the LLVM loop tooling. 30 | GPUifyLoops contains a macro `@unroll` that can unroll a loop fully if the 31 | trip count is known or partially by a factor. 32 | 33 | ```@docs 34 | @unroll 35 | ``` 36 | #### Example: 37 | 38 | ```julia 39 | @noinline iteration(i) = @show i 40 | # Unknown loop count 41 | f(N) = @unroll 3 for i in 1:N 42 | iteration(i) 43 | end 44 | @code_llvm f(10) 45 | ``` 46 | 47 | This should yield something like: 48 | ```LLVM 49 | %6 = call i64 @julia_iteration_12527(i64 %value_phi3) 50 | %7 = add nuw i64 %value_phi3, 1 51 | %8 = call i64 @julia_iteration_12527(i64 %7) 52 | %9 = add i64 %value_phi3, 2 53 | %10 = call i64 @julia_iteration_12527(i64 %9) 54 | %11 = add i64 %value_phi3, 3 55 | ``` 56 | 57 | You can also unroll a loop fully, but that requires a known/computable 58 | trip-count: 59 | 60 | ```julia 61 | @noinline iteration(i) = @show i 62 | # Unknown loop count 63 | f() = @unroll for i in 1:10 64 | iteration(i) 65 | end 66 | @code_llvm f() 67 | ``` 68 | 69 | Which yields something like: 70 | ```LLVM 71 | %4 = call i64 @julia_iteration_12527(i64 1) 72 | %5 = call i64 @julia_iteration_12527(i64 2) 73 | %6 = call i64 @julia_iteration_12527(i64 3) 74 | %7 = call i64 @julia_iteration_12527(i64 4) 75 | %8 = call i64 @julia_iteration_12527(i64 5) 76 | %9 = call i64 @julia_iteration_12527(i64 6) 77 | %10 = call i64 @julia_iteration_12527(i64 7) 78 | %11 = call i64 @julia_iteration_12527(i64 8) 79 | %12 = call i64 @julia_iteration_12527(i64 9) 80 | %13 = call i64 @julia_iteration_12527(i64 10) 81 | ``` 82 | -------------------------------------------------------------------------------- /src/GPUifyLoops.jl: -------------------------------------------------------------------------------- 1 | module GPUifyLoops 2 | 3 | if VERSION < v"1.1" 4 | @error "GPUifyLoops depends on Julia v1.1" 5 | end 6 | 7 | abstract type Device end 8 | struct CPU <: Device end 9 | 10 | abstract type GPU <: Device end 11 | struct CUDA <: GPU end 12 | 13 | #= 14 | # Hopefully we can eventually support AMDGPUs through ROCm 15 | struct ROCm <: GPU end 16 | =# 17 | 18 | export CPU, CUDA, Device 19 | 20 | using StaticArrays 21 | using Requires 22 | 23 | export @setup, @loop, @synchronize 24 | export @scratch, @shmem 25 | export contextualize 26 | export @unroll 27 | export @launch 28 | 29 | ## 30 | # contextualize 31 | ## 32 | include("context.jl") 33 | 34 | backend() = CPU() 35 | # FIXME: Get backend from Context or have Context per backend 36 | Cassette.overdub(ctx::Ctx, ::typeof(backend)) = CUDA() 37 | 38 | macro launch(ex...) 39 | # destructure the `@launch` expression 40 | call = ex[end] 41 | kwargs = ex[2:end-1] 42 | 43 | device = ex[1] 44 | 45 | # destructure the kernel call 46 | if call.head != :call 47 | throw(ArgumentError("second argument to @launch should be a function call")) 48 | end 49 | 50 | f = call.args[1] 51 | args = call.args[2:end] 52 | 53 | quote 54 | $launch($(esc(device)), $(esc(f)), $(map(esc, args)...); $(map(esc, kwargs)...)) 55 | end 56 | end 57 | 58 | 59 | 60 | """ 61 | launch(::Device, f, args..., kwargs...) 62 | 63 | Launch a kernel on the GPU. 
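On the `CPU()` device the call simply forwards to `f(args...)`.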
`kwargs` are passed to `@cuda` 64 | `kwargs` can be any of the compilation and runtime arguments 65 | normally passed to `@cuda`. 66 | """ 67 | launch(::CPU, f, args...; kwargs...) = f(args...) 68 | 69 | """ 70 | launch_config(::F, maxthreads, args...; kwargs...) 71 | 72 | Calculate a valid launch configuration based on the typeof(F), the 73 | maximum number of threads, the functions arguments and the particular 74 | launch configuration passed to the call. 75 | 76 | Return a NamedTuple that has `blocks`, `threads`, `shmem`, and `stream`. 77 | All arguments are optional, but blocks and threads is recommended. 78 | """ 79 | function launch_config(@nospecialize(f), maxthreads, args...; kwargs...) 80 | return kwargs 81 | end 82 | 83 | function split_kwargs(kwargs) 84 | compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs] 85 | call_kws = [:blocks, :threads, :shmem, :stream, :config] 86 | compiler_kwargs = [] 87 | call_kwargs = [] 88 | for kwarg in kwargs 89 | key, val = kwarg 90 | if isa(key, Symbol) 91 | if key in compiler_kws 92 | push!(compiler_kwargs, kwarg) 93 | elseif key in call_kws 94 | push!(call_kwargs, kwarg) 95 | else 96 | throw(ArgumentError("unknown keyword argument '$key'")) 97 | end 98 | else 99 | throw(ArgumentError("non-symbolic keyword '$key'")) 100 | end 101 | end 102 | return compiler_kwargs, call_kwargs 103 | end 104 | 105 | @init @require CUDAnative="be33ccc6-a3ff-5ff2-a52e-74243cff1e17" begin 106 | using .CUDAnative 107 | 108 | function version_check() 109 | project = joinpath(dirname(pathof(CUDAnative)), "../Project.toml") 110 | let Pkg = Base.require(Base.PkgId(Base.UUID((0x44cfe95a1eb252ea, 0xb672e2afdf69b78f)), "Pkg")) 111 | project = Pkg.TOML.parse(String(read(project))) 112 | return version = VersionNumber(get(project, "version", "0.0.0")) 113 | end 114 | end 115 | 116 | global const CUDANativeVersion = version_check() 117 | 118 | function launch(::CUDA, f::F, args...; kwargs...) where F 119 | compiler_kwargs, call_kwargs = split_kwargs(kwargs) 120 | args = (ctx, f, args...) 121 | GC.@preserve args begin 122 | kernel_args = map(cudaconvert, args) 123 | kernel_tt = Tuple{map(Core.Typeof, kernel_args)...} 124 | if CUDANativeVersion > v"2.1.2" 125 | kernel = cufunction(Cassette.overdub, kernel_tt; name=String(nameof(f)), compiler_kwargs...) 126 | else 127 | kernel = cufunction(Cassette.overdub, kernel_tt; compiler_kwargs...) 128 | end 129 | 130 | maxthreads = CUDAnative.maxthreads(kernel) 131 | config = launch_config(f, maxthreads, args...; call_kwargs...) 132 | 133 | kernel(kernel_args...; config...) 134 | end 135 | return nothing 136 | end 137 | end 138 | 139 | isdevice(::CPU) = false 140 | isdevice(::Device) = true 141 | isdevice() = isdevice(backend()) 142 | 143 | sync(::CPU) = nothing 144 | sync() = sync(backend()) 145 | 146 | @init @require CUDAnative="be33ccc6-a3ff-5ff2-a52e-74243cff1e17" begin 147 | using .CUDAnative 148 | sync(::CUDA) = CUDAnative.sync_threads() 149 | end 150 | 151 | @deprecate iscpu(::Val{:GPU}) isdevice() 152 | @deprecate iscpu(::Val{:CPU}) !isdevice() 153 | @deprecate sync(::Val{:GPU}) sync() 154 | @deprecate sync(::Val{:CPU}) sync() 155 | 156 | 157 | """ 158 | @syncronize 159 | 160 | Calls `sync_threads()` on the GPU and nothing on the CPU. 
161 | """ 162 | macro synchronize() 163 | :($sync()) 164 | end 165 | 166 | """ 167 | @loop for i in (A; B) 168 | # body 169 | end 170 | 171 | Take a `for i in (A; B)` expression and on the CPU lowers it to: 172 | 173 | ```julia 174 | for i in A 175 | # body 176 | end 177 | ``` 178 | 179 | and on the GPU: 180 | ```julia 181 | for i in B 182 | if !(i in A) 183 | continue 184 | end 185 | # body 186 | end 187 | ``` 188 | """ 189 | macro loop(expr) 190 | if expr.head != :for 191 | error("Syntax error: @loop needs a for loop") 192 | end 193 | 194 | induction = expr.args[1] 195 | body = expr.args[2] 196 | 197 | if induction.head != :(=) 198 | error("Syntax error: @loop needs a induction variable") 199 | end 200 | 201 | rhs = induction.args[2] 202 | if rhs.head == :block 203 | @assert length(rhs.args) == 3 204 | # rhs[2] is a linenode 205 | cpuidx = rhs.args[1] 206 | gpuidx = rhs.args[3] 207 | 208 | rhs = Expr(:if, :(!$isdevice()), cpuidx, gpuidx) 209 | induction.args[2] = rhs 210 | 211 | # use cpuidx calculation to check bounds of on GPU. 212 | bounds_chk = quote 213 | if $isdevice() && !($gpuidx in $cpuidx) 214 | continue 215 | end 216 | end 217 | 218 | pushfirst!(body.args, bounds_chk) 219 | end 220 | 221 | return esc(Expr(:for, induction, body)) 222 | end 223 | 224 | ### 225 | # Scratch and shared-memory 226 | ### 227 | include("scratch.jl") 228 | include("shmem.jl") 229 | 230 | ### 231 | # Loopinfo 232 | # - `@unroll` 233 | ### 234 | include("loopinfo.jl") 235 | using .LoopInfo 236 | 237 | end 238 | -------------------------------------------------------------------------------- /src/context.jl: -------------------------------------------------------------------------------- 1 | ## 2 | # Implements contextual dispatch through Cassette.jl 3 | # Goals: 4 | # - Rewrite common CPU functions to appropriate GPU intrinsics 5 | # 6 | # TODO: 7 | # - error (erf, ...) 8 | # - min, max 9 | # - mod, rem 10 | # - gamma 11 | # - bessel 12 | # - distributions 13 | # - unsorted 14 | 15 | using Cassette 16 | 17 | function ir_element(x, code::Vector) 18 | while isa(x, Core.SSAValue) 19 | x = code[x.id] 20 | end 21 | return x 22 | end 23 | 24 | ## 25 | # Forces inlining on everything that is not marked `@noinline` 26 | # avoids overdubbing of pure functions 27 | # avoids overdubbing of IntrinsicFunctions and Builtins 28 | ## 29 | function transform(ctx, ref) 30 | CI = ref.code_info 31 | noinline = any(@nospecialize(x) -> 32 | Core.Compiler.isexpr(x, :meta) && 33 | x.args[1] == :noinline, 34 | CI.code) 35 | CI.inlineable = !noinline 36 | 37 | # don't overdub pure functions 38 | if CI.pure 39 | n_method_args = Int(ref.method.nargs) 40 | if ref.method.isva 41 | Cassette.insert_statements!(CI.code, CI.codelocs, 42 | (x, i) -> i == 1 ? 3 : nothing, 43 | (x, i) -> i == 1 ? [ 44 | # this could run into troubles when the function is @pure f(x...) since then n_method_args==2, but this seems to work sofar. 45 | Expr(:call, Expr(:nooverdub, GlobalRef(Core, :tuple)), (Core.SlotNumber(i) for i in 2:(n_method_args-1))...), 46 | Expr(:call, Expr(:nooverdub, GlobalRef(Core, :_apply)), Core.SlotNumber(1), Core.SSAValue(i), Core.SlotNumber(n_method_args)), 47 | Expr(:return, Core.SSAValue(i+1))] : nothing) 48 | else 49 | Cassette.insert_statements!(CI.code, CI.codelocs, 50 | (x, i) -> i == 1 ? 2 : nothing, 51 | (x, i) -> i == 1 ? [ 52 | Expr(:call, Expr(:nooverdub, Core.SlotNumber(1)), (Core.SlotNumber(i) for i in 2:n_method_args)...) 
53 | Expr(:return, Core.SSAValue(i))] : nothing) 54 | end 55 | CI.ssavaluetypes = length(CI.code) 56 | return CI 57 | end 58 | 59 | # overdubbing IntrinsicFunctions removes our ability to profile code 60 | newstmt = (x, i) -> begin 61 | isassign = Base.Meta.isexpr(x, :(=)) 62 | stmt = isassign ? x.args[2] : x 63 | if Base.Meta.isexpr(stmt, :call) 64 | applycall = Cassette.is_ir_element(stmt.args[1], GlobalRef(Core, :_apply), CI.code) 65 | if applycall 66 | f = stmt.args[2] 67 | else 68 | f = stmt.args[1] 69 | end 70 | f = ir_element(f, CI.code) 71 | if f isa GlobalRef 72 | mod = f.mod 73 | name = f.name 74 | if Base.isbindingresolved(mod, name) && Base.isdefined(mod, name) 75 | ff = getfield(f.mod, f.name) 76 | if ff isa Core.IntrinsicFunction || ff isa Core.Builtin 77 | if applycall 78 | stmt.args[2] = Expr(:nooverdub, f) 79 | else 80 | stmt.args[1] = Expr(:nooverdub, f) 81 | end 82 | end 83 | end 84 | end 85 | end 86 | return [x] 87 | end 88 | 89 | Cassette.insert_statements!(CI.code, CI.codelocs, (x, i) -> 1, newstmt) 90 | CI.ssavaluetypes = length(CI.code) 91 | # Core.Compiler.validate_code(CI) 92 | return CI 93 | end 94 | 95 | const GPUifyPass = Cassette.@pass transform 96 | 97 | Cassette.@context Ctx 98 | const ctx = Cassette.disablehooks(Ctx(pass = GPUifyPass)) 99 | 100 | ### 101 | # Cassette fixes 102 | ### 103 | @inline Cassette.overdub(::Ctx, ::typeof(Core.kwfunc), f) = return Core.kwfunc(f) 104 | @inline Cassette.overdub(::Ctx, ::typeof(Core.apply_type), args...) = return Core.apply_type(args...) 105 | @inline Cassette.overdub(::Ctx, ::typeof(StaticArrays.Size), x::Type{<:AbstractArray{<:Any, N}}) where {N} = return StaticArrays.Size(x) 106 | 107 | # this looks like a recursion detection failure 108 | @inline Cassette.overdub(::Ctx, ::typeof(Base.Broadcast.axes), args...) = return Base.Broadcast.axes(args...) 109 | 110 | 111 | ### 112 | # Rewrite functions 113 | ### 114 | 115 | # define +, -, * as contract 116 | 117 | for (f, T) in Base.Iterators.product((:add, :mul, :sub), (Float32, Float64)) 118 | name = Symbol("$(f)_float_contract") 119 | if T === Float32 120 | llvmt = "float" 121 | elseif T === Float64 122 | llvmt = "double" 123 | end 124 | 125 | #XXX Use LLVM.jl 126 | ir = """ 127 | %x = f$f contract $llvmt %0, %1 128 | ret $llvmt %x 129 | """ 130 | @eval begin 131 | # the @pure is necessary so that we can constant propagate. 
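# For illustration: with `f = :add` and `T = Float32` this `@eval` defines
#   add_float_contract(a::Float32, b::Float32)
# whose body is a single `fadd contract float` emitted via `llvmcall`; the
# `contract` flag is what later lets the backend fuse mul/add pairs into FMA.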
132 | Base.@pure function $name(a::$T, b::$T) 133 | @Base._inline_meta 134 | Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) 135 | end 136 | end 137 | end 138 | @inline Cassette.overdub(ctx::Ctx, ::typeof(+), a::T, b::T) where T<:Union{Float32, Float64} = add_float_contract(a, b) 139 | @inline Cassette.overdub(ctx::Ctx, ::typeof(-), a::T, b::T) where T<:Union{Float32, Float64} = sub_float_contract(a, b) 140 | @inline Cassette.overdub(ctx::Ctx, ::typeof(*), a::T, b::T) where T<:Union{Float32, Float64} = mul_float_contract(a, b) 141 | @inline Cassette.overdub(ctx::Ctx, ::typeof(^), x::Float64, y::Float64) = CUDAnative.pow(x, y) 142 | @inline Cassette.overdub(ctx::Ctx, ::typeof(^), x::Float32, y::Float32) = CUDAnative.pow(x, y) 143 | @inline Cassette.overdub(ctx::Ctx, ::typeof(^), x::Float64, y::Int32) = CUDAnative.pow(x, y) 144 | @inline Cassette.overdub(ctx::Ctx, ::typeof(^), x::Float32, y::Int32) = CUDAnative.pow(x, y) 145 | @inline Cassette.overdub(ctx::Ctx, ::typeof(^), x::Union{Float32, Float64}, y::Int64) = CUDAnative.pow(x, y) 146 | 147 | # libdevice.jl 148 | const cudafuns = (:cos, :cospi, :sin, :sinpi, :tan, 149 | :acos, :asin, :atan, 150 | :cosh, :sinh, :tanh, 151 | :acosh, :asinh, :atanh, 152 | :log, :log10, :log1p, :log2, 153 | :exp, :exp2, :exp10, :expm1, :ldexp, 154 | # :isfinite, :isinf, :isnan, :signbit, 155 | :abs, 156 | :sqrt, :cbrt, 157 | :ceil, :floor,) 158 | for f in cudafuns 159 | @eval function Cassette.overdub(ctx::Ctx, ::typeof(Base.$f), x::Union{Float32, Float64}) 160 | @Base._inline_meta 161 | return CUDAnative.$f(x) 162 | end 163 | end 164 | 165 | function Cassette.overdub(::Ctx, ::typeof(:), start::T, step::T, stop::T) where T<:Union{Float16,Float32,Float64} 166 | lf = (stop-start)/step 167 | if lf < 0 168 | len = 0 169 | elseif lf == 0 170 | len = 1 171 | else 172 | len = round(Int, lf) + 1 173 | stop′ = start + (len-1)*step 174 | # if we've overshot the end, subtract one: 175 | len -= (start < stop < stop′) + (start > stop > stop′) 176 | end 177 | Base.steprangelen_hp(T, start, step, 0, len, 1) 178 | end 179 | 180 | 181 | """ 182 | contextualize(::Dev, f) 183 | 184 | This contexualizes the function `f` for a given device type `Dev`. 185 | 186 | For the device `CUDA()`, `contextualize` replaces calls to math library 187 | functions. For example, `cos`, `sin`, are replaced with `CUDAnative.cos`, 188 | `CUDAnative.sin`, respectively. 189 | 190 | The full list functions that are replaced is $cudafuns. 191 | 192 | # Examples 193 | ```julia 194 | function kernel!(::Dev, A, f) where {Dev} 195 | @setup Dev 196 | @loop for i in (1:size(A,1); threadIdx().x) 197 | A[i] = f(A[i]) 198 | end 199 | end 200 | 201 | g(x) = sin(x) 202 | kernel!(A::Array) = kernel!(CPU(), A, contextualize(CPU(), g)) 203 | kernel!(A::CuArray) = 204 | @cuda threads=length(A) kernel!(CUDA(), A, contextualize(CUDA(), g)) 205 | 206 | a = rand(Float32, 1024) 207 | b, c = copy(a), CuArray(a) 208 | 209 | kernel!(b) 210 | kernel!(c) 211 | 212 | @assert g.(a) ≈ b 213 | @assert g.(a) ≈ c 214 | ``` 215 | """ 216 | contextualize(f::F) where F = (args...) -> Cassette.overdub(ctx, f, args...) 
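# A quick way to observe both rewrites (sketch, adapted from test/test.jl):
#
#   f1(x) = (sin(1.0 + x); return nothing)
#   g1(x) = contextualize(f1)(x)
#   asm = sprint(io -> CUDAnative.code_llvm(io, g1, Tuple{Float64}, kernel=true,
#                                           optimize=false, dump_module=true))
#   occursin("fadd contract double", asm)        # contracted FP arithmetic
#   occursin(r"call .* double @__nv_sin", asm)   # Base.sin mapped to libdevice sin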
217 | -------------------------------------------------------------------------------- /test/test.jl: -------------------------------------------------------------------------------- 1 | using GPUifyLoops 2 | using Test 3 | using InteractiveUtils 4 | 5 | function kernel(A) 6 | @loop for i in (1:size(A,1); 7 | threadIdx().x) 8 | A[i] = 2*A[i] 9 | end 10 | @synchronize 11 | end 12 | 13 | f2(x) = sin(x) 14 | f3(x) = 1 + f2(x) 15 | f4(x) = x^1.2 16 | f5(x) = x^3 17 | 18 | function kernel2!(A, B, h) 19 | @inbounds @loop for i in (1:size(A,1); threadIdx().x) 20 | A[i] = h(B[i]) 21 | end 22 | nothing 23 | end 24 | 25 | @testset "Array" begin 26 | data = Array{Float32}(undef, 1024) 27 | kernel(data) 28 | end 29 | 30 | @static if Base.find_package("CuArrays") !== nothing 31 | using CuArrays 32 | using CUDAnative 33 | 34 | function kernel(A::CuArray) 35 | @launch CUDA() threads=length(A) kernel(A) 36 | end 37 | 38 | @testset "CuArray" begin 39 | data = CuArray{Float32}(undef, 1024) 40 | kernel(data) 41 | end 42 | 43 | @testset "contextualize" begin 44 | f(x) = 2*x 45 | g(x) = GPUifyLoops.contextualize(f)(x) 46 | @test g(3.0) == 6.0 47 | f(x) = 3*x 48 | 49 | # Enable test on v1.3 once fix commit is known 50 | # @test g(3.0) == 9.0 51 | @test_broken g(3.0) == 9.0 52 | f1(x) = (sin(1.0 + x); return nothing) 53 | g1(x) = GPUifyLoops.contextualize(f1)(x) 54 | asm = sprint(io->CUDAnative.code_llvm(io, g1, Tuple{Float64}, kernel=true, 55 | optimize=false, dump_module=true)) 56 | @test occursin(r"call .* double @__nv_sin", asm) 57 | @test occursin("fadd contract double", asm) 58 | 59 | @testset "don't overdub intrinsics" begin 60 | global simple_kernel, kernel 61 | simple_kernel(A, x) = (A[1] = 1 + x; return nothing) 62 | kernel(A, x) = GPUifyLoops.contextualize(simple_kernel)(A, x) 63 | CI, ret = CUDAnative.code_typed(kernel, Tuple{CUDAnative.CuDeviceArray{Int64,1, CUDAnative.AS.Global}, Int64}, debuginfo=:source)[1] 64 | 65 | intrinsics = findall(CI.code) do stmt 66 | if Base.Meta.isexpr(stmt, :call) 67 | f = stmt.args[1] 68 | if f isa GlobalRef 69 | f = getfield(f.mod, f.name) 70 | return f isa Core.IntrinsicFunction || f isa Core.Builtin 71 | end 72 | end 73 | return false 74 | end 75 | 76 | for i in intrinsics 77 | lineinfo = CI.linetable[CI.codelocs[i]] 78 | @test !(lineinfo.method === :call || 79 | lineinfo.file === Symbol("context.jl")) 80 | end 81 | end 82 | 83 | begin 84 | global kernel2! 
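# CPU reference first: plain calls to kernel2! with f3/f4/f5 must match broadcasting.
# Afterwards the same kernel runs on CuArrays through `@launch CUDA()` and is
# checked against the same reference.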
85 | data = rand(Float32, 1024) 86 | fdata = similar(data) 87 | 88 | kernel2!(fdata, data, f3) 89 | @test f3.(data) ≈ fdata 90 | 91 | kernel2!(fdata, data, f4) 92 | @test f4.(data) ≈ fdata 93 | 94 | kernel2!(fdata, data, f5) 95 | @test f5.(data) ≈ fdata 96 | 97 | function kernel2!(A::CuArray, B::CuArray, f) 98 | @launch CUDA() threads=length(A) kernel2!(A, B, f) 99 | end 100 | 101 | cudata = CuArray(data) 102 | cufdata = similar(cudata) 103 | 104 | kernel2!(cufdata, cudata, f3) 105 | @test f3.(data) ≈ cufdata 106 | 107 | kernel2!(cufdata, cudata, f4) 108 | @test f4.(data) ≈ cufdata 109 | 110 | kernel2!(cufdata, cudata, f5) 111 | @test f5.(data) ≈ cufdata 112 | end 113 | end 114 | end 115 | 116 | function kernel3!(A) 117 | s1 = @shmem eltype(A) (1024,) 118 | s2 = @shmem eltype(A) (1024,) 119 | 120 | @loop for i in (1:size(A,1); threadIdx().x) 121 | s1[i] = 2*A[i] 122 | s2[i] = 3*A[i] 123 | end 124 | @synchronize 125 | @loop for i in (1:size(A,1); threadIdx().x) 126 | A[i] = s1[i] 127 | end 128 | nothing 129 | end 130 | 131 | let 132 | function ker1!(::Val{Nq}) where Nq 133 | s_x = @shmem Float32 Nq 134 | end 135 | 136 | CI, rt = @code_typed ker1!(Val(10)) 137 | @test Base.isconcretetype(rt) 138 | end 139 | 140 | @testset "shared memory" begin 141 | data = rand(Float32, 1024) 142 | cpudata = copy(data) 143 | 144 | @launch CPU() kernel3!(cpudata) 145 | @test cpudata ≈ 2 .* data 146 | 147 | @static if Base.find_package("CuArrays") !== nothing 148 | using CuArrays 149 | using CUDAnative 150 | 151 | cudata = CuArray(data) 152 | @launch CUDA() threads=length(cudata) kernel3!(cudata) 153 | @test Array(cudata) ≈ 2 .* data 154 | end 155 | end 156 | 157 | # Scratch arrays 158 | 159 | function kernel_scratch(A, ::Val{N}) where N 160 | a = @scratch eltype(A) (N, N) 2 161 | b = @scratch eltype(A) (2, N, N) 2 162 | @loop for j in (1:size(A,2); threadIdx().y) 163 | @loop for i in (1:size(A,1); threadIdx().x) 164 | a[i, j] = A[i, j] 165 | b[1, i, j] = -A[i, j] 166 | b[2, i, j] = 2a[i, j] 167 | end 168 | end 169 | end 170 | 171 | 172 | function f1() 173 | A = @scratch Int64 (12, 3) 2 174 | @test A.data isa GPUifyLoops.MArray 175 | @test size(A.data) == (1,) 176 | end 177 | 178 | function f2() 179 | A = @scratch Int64 (12, 3) 1 180 | @test A.data isa GPUifyLoops.MArray 181 | end 182 | 183 | function f3() 184 | A = @scratch Int64 (12, 3) 1 185 | @test A isa GPUifyLoops.MArray 186 | end 187 | 188 | @testset "Scratch Arrays" begin 189 | contextualize(f1)() 190 | contextualize(f2)() 191 | f3() 192 | N = 10 193 | A = rand(N, N) 194 | @launch CPU() kernel_scratch(A, Val(N)) 195 | 196 | @static if Base.find_package("CuArrays") !== nothing 197 | using CuArrays 198 | 199 | d_A = CuArray(A) 200 | @launch CUDA() kernel_scratch(d_A, Val(N)) 201 | end 202 | end 203 | 204 | @testset "Loopinfo" begin 205 | # Right now test that we don't break things 206 | # Should probably test that codegen is correct. 
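# A possible codegen check (sketch only, not enabled here): move the loop body into
# a @noinline helper as in docs/src/index.md and inspect the IR, e.g.
#   @noinline iteration(i) = i
#   g(N) = @unroll 2 for i in 1:N; iteration(i); end
#   ir = sprint(io -> InteractiveUtils.code_llvm(io, g, Tuple{Int}))
# and then assert that calls to `iteration` appear twice per unrolled iteration.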
207 | f(N) = @unroll 2 for i in 1:N 208 | @show i 209 | end 210 | f(10) 211 | 212 | f() = @unroll for i in 1:10 213 | @show i 214 | end 215 | f() 216 | end 217 | 218 | using StaticArrays 219 | function kernel_MArray!(A) 220 | l_F = MArray{Tuple{3, 3}, eltype(A)}(undef) 221 | @inbounds for j = 1:3, i = 1:3 222 | l_F[i, j] = A[i, j] 223 | end 224 | nothing 225 | end 226 | function kernel_similar_MArray!(A) 227 | l_F = MArray{Tuple{3, 3}, eltype(A)}(undef) 228 | l_G = similar(l_F, Size(2,2)) 229 | @inbounds for j = 1:2, i = 1:2 230 | l_G[i, j] = A[i, j] 231 | end 232 | nothing 233 | end 234 | 235 | @testset "StaticArrays" begin 236 | @static if Base.find_package("CuArrays") !== nothing 237 | using CuArrays 238 | using CUDAnative 239 | 240 | A = CuArray(rand(3,3)) 241 | @launch CUDA() threads=(3,3) kernel_MArray!(A) 242 | @launch CUDA() threads=(3,3) kernel_similar_MArray!(A) 243 | end 244 | 245 | A = rand(3,3) 246 | @launch CPU() threads=(3,3) kernel_MArray!(A) 247 | @launch CPU() threads=(3,3) kernel_similar_MArray!(A) 248 | end 249 | 250 | # For the next three test we check that the _apply got inlined correctly 251 | Base.@pure pure_f1(x, y) = x + y 252 | let 253 | CI, rt = @code_typed GPUifyLoops.Cassette.overdub(GPUifyLoops.ctx, pure_f1, 1, 2) 254 | expr = CI.code[end-1] 255 | @test expr.head === :call || expr.head === :invoke 256 | @test expr.args[1] === GlobalRef(Base, :add_int) 257 | end 258 | 259 | Base.@pure pure_f2(x, ys...) = x + sum(ys) 260 | let 261 | CI, rt = @code_typed GPUifyLoops.Cassette.overdub(GPUifyLoops.ctx, pure_f2, 1, 2, 3, 4) 262 | expr = CI.code[end-1] 263 | @test expr.head === :call || expr.head === :invoke 264 | @test expr.args[1] === GlobalRef(Base, :add_int) 265 | end 266 | 267 | Base.@pure pure_f3(ys...) = sum(ys) 268 | let 269 | CI, rt = @code_typed GPUifyLoops.Cassette.overdub(GPUifyLoops.ctx, pure_f3, 1, 2, 3, 4) 270 | expr = CI.code[end-1] 271 | @test expr.head === :call || expr.head === :invoke 272 | @test expr.args[1] === GlobalRef(Base, :add_int) 273 | end 274 | 275 | --------------------------------------------------------------------------------