├── .gitignore ├── bors.toml ├── REQUIRE ├── docs ├── Project.toml ├── make.jl └── src │ └── index.md ├── test ├── gpuenv │ ├── Manifest.toml │ └── Project.toml ├── testenv │ ├── Manifest.toml │ └── Project.toml ├── runtests.jl ├── examples.jl └── test.jl ├── .travis.yml ├── examples ├── simple.jl ├── contextualize.jl └── shmem.jl ├── Project.toml ├── src ├── shmem.jl ├── scratch.jl ├── loopinfo.jl ├── GPUifyLoops.jl └── context.jl ├── README.md ├── LICENSE.md └── .gitlab-ci.yml /.gitignore: -------------------------------------------------------------------------------- 1 | docs/build/ 2 | -------------------------------------------------------------------------------- /bors.toml: -------------------------------------------------------------------------------- 1 | status = [ 2 | "ci/gitlab/%" 3 | ] 4 | -------------------------------------------------------------------------------- /REQUIRE: -------------------------------------------------------------------------------- 1 | julia 1.1 2 | Requires 3 | Cassette 4 | StaticArrays 5 | -------------------------------------------------------------------------------- /docs/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" 3 | 4 | [compat] 5 | Documenter = "~0.19" 6 | -------------------------------------------------------------------------------- /test/gpuenv/Manifest.toml: -------------------------------------------------------------------------------- 1 | [[GPUifyLoops]] 2 | path = "../.." 3 | uuid = "ba82f77b-6841-5d2e-bd9f-4daf811aec27" 4 | version = "0.1.0" 5 | -------------------------------------------------------------------------------- /test/testenv/Manifest.toml: -------------------------------------------------------------------------------- 1 | [[GPUifyLoops]] 2 | path = "../.." 
3 | uuid = "ba82f77b-6841-5d2e-bd9f-4daf811aec27" 4 | version = "0.1.0" 5 | -------------------------------------------------------------------------------- /test/runtests.jl: -------------------------------------------------------------------------------- 1 | using GPUifyLoops 2 | using Test 3 | 4 | @testset "Unittests" begin 5 | include("test.jl") 6 | end 7 | 8 | include("examples.jl") 9 | 10 | -------------------------------------------------------------------------------- /docs/make.jl: -------------------------------------------------------------------------------- 1 | using Documenter, GPUifyLoops 2 | 3 | makedocs( 4 | modules = [GPUifyLoops], 5 | format = :html, 6 | sitename = "GPUifyLoops.jl", 7 | pages = [ 8 | "Home" => "index.md", 9 | ], 10 | doctest = true 11 | ) 12 | 13 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | ## Documentation: http://docs.travis-ci.com/user/languages/julia/ 2 | language: julia 3 | os: 4 | - linux 5 | - osx 6 | julia: 7 | - 1.0 8 | - 1.1 9 | - 1.2 10 | - 1.3 11 | - nightly 12 | notifications: 13 | email: false 14 | git: 15 | depth: 99999999 16 | -------------------------------------------------------------------------------- /test/testenv/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | GPUifyLoops = "ba82f77b-6841-5d2e-bd9f-4daf811aec27" 3 | InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" 4 | Requires = "ae029012-a4dd-5104-9daa-d747884805df" 5 | StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" 6 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 7 | -------------------------------------------------------------------------------- /test/gpuenv/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" 3 | CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" 4 | GPUifyLoops = "ba82f77b-6841-5d2e-bd9f-4daf811aec27" 5 | InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" 6 | Requires = "ae029012-a4dd-5104-9daa-d747884805df" 7 | StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" 8 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 9 | -------------------------------------------------------------------------------- /examples/simple.jl: -------------------------------------------------------------------------------- 1 | using GPUifyLoops 2 | 3 | function kernel(A) 4 | @loop for i in (1:size(A,1); 5 | threadIdx().x) 6 | A[i] = 2*A[i] 7 | end 8 | @synchronize 9 | end 10 | 11 | data = Array{Float32}(undef, 1024) 12 | kernel(data) 13 | 14 | @static if Base.find_package("CuArrays") !== nothing 15 | using CuArrays 16 | using CUDAnative 17 | 18 | kernel(A::CuArray) = @launch CUDA() kernel(A, threads=length(A)) 19 | 20 | data = CuArray{Float32}(undef, 1024) 21 | kernel(data) 22 | end 23 | 24 | -------------------------------------------------------------------------------- /Project.toml: -------------------------------------------------------------------------------- 1 | name = "GPUifyLoops" 2 | uuid = "ba82f77b-6841-5d2e-bd9f-4daf811aec27" 3 | authors = ["Valentin Churavy "] 4 | version = "0.2.8" 5 | 6 | [deps] 7 | Cassette = "7057c7e9-c182-5462-911a-8362d720325c" 8 | Requires = "ae029012-a4dd-5104-9daa-d747884805df" 9 | StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" 10 | 11 | [compat] 12 | julia = ">= 1.1" 13 | 14 | [extras] 15 | InteractiveUtils = 
"b77e0a4c-d291-57a0-90e8-8db25a27a240" 16 | Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" 17 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 18 | 19 | [targets] 20 | test = ["Test", "Pkg", "InteractiveUtils"] 21 | -------------------------------------------------------------------------------- /examples/contextualize.jl: -------------------------------------------------------------------------------- 1 | using GPUifyLoops 2 | 3 | f1(x) = sin(x) 4 | f(x) = 1 + f1(x) 5 | 6 | function kernel!(A, B, h) 7 | @inbounds @loop for i in (1:size(A,1); threadIdx().x) 8 | A[i] = h(B[i]) 9 | end 10 | nothing 11 | end 12 | 13 | data = rand(Float32, 1024) 14 | fdata = similar(data) 15 | kernel!(fdata, data, f) 16 | 17 | @assert f.(data) ≈ fdata 18 | 19 | @static if Base.find_package("CuArrays") !== nothing 20 | using CuArrays 21 | using CUDAnative 22 | 23 | function kernel!(A::CuArray, B::CuArray) 24 | @launch CUDA() threads=length(A) kernel!(A, B, f) 25 | end 26 | 27 | cudata = CuArray(data) 28 | cufdata = similar(cudata) 29 | kernel!(cufdata, cudata) 30 | 31 | @assert f.(data) ≈ cufdata 32 | end 33 | -------------------------------------------------------------------------------- /examples/shmem.jl: -------------------------------------------------------------------------------- 1 | using GPUifyLoops 2 | 3 | function kernel3!(A) 4 | s1 = @shmem eltype(A) (1024,) 5 | s2 = @shmem eltype(A) (1024,) 6 | 7 | @loop for i in (1:size(A,1); threadIdx().x) 8 | s1[i] = 2*A[i] 9 | s2[i] = 3*A[i] 10 | end 11 | @synchronize 12 | @loop for i in (1:size(A,1); threadIdx().x) 13 | A[i] = s1[i] 14 | end 15 | nothing 16 | end 17 | 18 | data = rand(Float32, 1024) 19 | cpudata = copy(data) 20 | 21 | @launch CPU() kernel3!(cpudata) 22 | @assert cpudata ≈ 2 .* data 23 | 24 | @static if Base.find_package("CuArrays") !== nothing 25 | using CuArrays 26 | using CUDAnative 27 | 28 | cudata = CuArray(data) 29 | @launch CUDA() threads=length(cudata) kernel3!(cudata) 30 | @assert Array(cudata) ≈ 2 .* data 31 | end 32 | -------------------------------------------------------------------------------- /src/shmem.jl: -------------------------------------------------------------------------------- 1 | __size(args::Tuple) = Tuple{args...} 2 | __size(i::Int) = Tuple{i} 3 | 4 | __shmem(D::Device, args...) 
= throw(MethodError(__shmem, (D, args...))) 5 | @inline __shmem(::CPU, ::Type{T}, ::Val{dims}, ::Val) where {T, dims} =MArray{__size(dims), T}(undef) 6 | 7 | @init @require CUDAnative="be33ccc6-a3ff-5ff2-a52e-74243cff1e17" begin 8 | using .CUDAnative 9 | 10 | @inline function __shmem(::CUDA, ::Type{T}, ::Val{dims}, ::Val{id}) where {T, dims, id} 11 | ptr = CUDAnative._shmem(Val(id), T, Val(prod(dims))) 12 | CUDAnative.CuDeviceArray(dims, CUDAnative.DevicePtr{T, CUDAnative.AS.Shared}(ptr)) 13 | end 14 | end 15 | 16 | shmem_id = 0 17 | macro shmem(T, dims) 18 | global shmem_id 19 | id = shmem_id::Int += 1 20 | 21 | quote 22 | $__shmem($backend(), $(esc(T)), Val($(esc(dims))), Val($id)) 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /test/examples.jl: -------------------------------------------------------------------------------- 1 | @testset "examples" begin 2 | 3 | function find_sources(path::String, sources=String[]) 4 | if isdir(path) 5 | for entry in readdir(path) 6 | find_sources(joinpath(path, entry), sources) 7 | end 8 | elseif endswith(path, ".jl") 9 | push!(sources, path) 10 | end 11 | sources 12 | end 13 | 14 | examples_dir = joinpath(@__DIR__, "..", "examples") 15 | examples = find_sources(examples_dir) 16 | filter!(file -> readline(file) != "# EXCLUDE FROM TESTING", examples) 17 | 18 | cd(examples_dir) do 19 | examples = relpath.(examples, Ref(examples_dir)) 20 | @testset for example in examples 21 | cmd = ```$(Base.julia_cmd()) --project=$(Base.current_project()) 22 | -e 'using Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.build()' 23 | -L $example 24 | ``` 25 | @test success(pipeline(cmd, stderr=stderr)) 26 | end 27 | end 28 | 29 | end 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | GPUifyLoops.jl 2 | ============== 3 | *Support for writing loop-based code that executes both on CPU and GPU* 4 | 5 | [![][docs-latest-img]][docs-latest-url] 6 | 7 | [docs-latest-img]: https://img.shields.io/badge/docs-latest-blue.svg 8 | [docs-latest-url]: https://juliagpu.gitlab.io/GPUifyLoops.jl/ 9 | 10 | Installation 11 | ------------ 12 | 13 | GPUifyLoops is a registered package, and can be installed using the Julia package 14 | manager. 15 | 16 | ```julia 17 | julia>] 18 | (v1.1) pkg> add GPUifyLoops 19 | ``` 20 | 21 | **Note**: The current version of this package requires Julia 1.1. 22 | 23 | Development 24 | ----------- 25 | 26 | In order to test this package locally you need to do: 27 | 28 | ``` 29 | julia --project=test/gpuenv 30 | julia> ] 31 | (gpuenv) pkg> resolve 32 | (gpuenv) pkg> instantiate 33 | ``` 34 | 35 | This will resolve the GPU environment, please do not checking changes to `test/gpuenv/`. 36 | Then you can run the tests with `julia --project=test/gpuenv test/runtests.jl` 37 | 38 | License 39 | ------- 40 | 41 | GPUifyLoops.jl is licensed under [MIT license](LICENSE.md). 42 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright © 2015: Simon Kornblith. 
4 | Copyright © 2018-2019: Valentin Churavy, and other contributors 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /src/scratch.jl: -------------------------------------------------------------------------------- 1 | using StaticArrays 2 | 3 | """ 4 | @scratch T Dims M 5 | 6 | Allocates scratch memory. 7 | - `T` type of array 8 | - `Dims` is a tuple of array dimensions 9 | - `M` the number of dimensions at the tail that are implicit on the GPU 10 | """ 11 | macro scratch(T, Dims, M) 12 | @assert Dims.head == :tuple 13 | dims = Dims.args 14 | N = length(dims) - M 15 | gpudims = ntuple(i->dims[i], N) 16 | esc(quote 17 | if !$isdevice() 18 | $MArray{Tuple{$(dims...)}, $T}(undef) 19 | else 20 | data = if $(length(gpudims)) > 0 21 | $ScratchArray{$N}( 22 | $MArray{Tuple{$(gpudims...)}, $T}(undef) 23 | ) 24 | else 25 | $ScratchArray{$N}( 26 | $MArray{Tuple{1}, $T}(undef) 27 | ) 28 | end 29 | end 30 | end) 31 | end 32 | 33 | struct ScratchArray{N, D} 34 | data::D 35 | ScratchArray{N}(data::D) where {N, D} = new{N, D}(data) 36 | ScratchArray{N, T}() where {N, T} = new{N, T}() 37 | end 38 | 39 | 40 | Base.@propagate_inbounds function Base.getindex(A::ScratchArray{N}, I...) where N 41 | nI = ntuple(i->I[i], N) 42 | if nI == () 43 | return A.data[1] 44 | end 45 | return A.data[nI...] 46 | end 47 | 48 | Base.@propagate_inbounds function Base.setindex!(A::ScratchArray{N}, val, I...) where N 49 | nI = ntuple(i->I[i], N) 50 | if nI == () 51 | return A.data .= val 52 | end 53 | A.data[nI...] = val 54 | end 55 | 56 | -------------------------------------------------------------------------------- /src/loopinfo.jl: -------------------------------------------------------------------------------- 1 | module LoopInfo 2 | 3 | const HAS_LOOPINFO_EXPR = VERSION >= v"1.2.0-DEV.462" 4 | export @unroll 5 | 6 | ## 7 | # Uses the loopinfo expr node to attach LLVM loopinfo to loops 8 | # the full list of supported metadata nodes is available at 9 | # https://llvm.org/docs/LangRef.html#llvm-loop 10 | # TODO: Figure out how to deal with compile-time constants in `@unroll(N, expr)` 11 | # so constants that come from `Val{N}` but are not parse time constant. 12 | # Most likely will require changes to base Julia. 
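# Illustration (derived from the helpers below): `@unroll 4 for i in 1:N ... end`
# pushes an `Expr(:loopinfo, (Symbol("llvm.loop.unroll.count"), 4))` node into the
# loop body, which Julia >= v"1.2.0-DEV.462" lowers to `!llvm.loop` metadata that
# the LLVM unroller then honors.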
13 | ## 14 | 15 | module MD 16 | unroll_count(n) = (Symbol("llvm.loop.unroll.count"), convert(Int, n)) 17 | unroll_disable() = (Symbol("llvm.loop.unroll.disable"), 1) 18 | unroll_enable() = (Symbol("llvm.loop.unroll.enable"), 1) 19 | unroll_full() = (Symbol("llvm.loop.unroll.full"), 1) 20 | end 21 | 22 | function loopinfo(expr, nodes...) 23 | if expr.head != :for 24 | error("Syntax error: loopinfo needs a for loop") 25 | end 26 | if HAS_LOOPINFO_EXPR 27 | push!(expr.args[2].args, Expr(:loopinfo, nodes...)) 28 | end 29 | return expr 30 | end 31 | 32 | """ 33 | @unroll expr 34 | 35 | Takes a for loop as `expr` and informs the LLVM unroller to fully unroll it, if 36 | it is safe to do so and the loop count is known. 37 | """ 38 | macro unroll(expr) 39 | expr = loopinfo(expr, MD.unroll_full()) 40 | return esc(expr) 41 | end 42 | 43 | """ 44 | @unroll N expr 45 | 46 | Takes a for loop as `expr` and informs the LLVM unroller to unroll it `N` times, 47 | if it is safe to do so. 48 | """ 49 | macro unroll(N, expr) 50 | if !(N isa Integer) 51 | error("Syntax error: `@unroll N expr` needs a constant integer N") 52 | end 53 | expr = loopinfo(expr, MD.unroll_count(N)) 54 | return esc(expr) 55 | end 56 | 57 | end #module 58 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | include: 2 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v3/common.yml' 3 | 4 | .projecttest: 5 | extends: .test 6 | script: 7 | - julia -e 'using InteractiveUtils; 8 | versioninfo()' 9 | - mkdir $JULIA_DEPOT_PATH 10 | - julia --project=$CI_JULIA_PROJECT -e ' 11 | using Pkg; 12 | Pkg.resolve(); 13 | Pkg.instantiate(); 14 | Pkg.build(); 15 | include("test/runtests.jl");' 16 | .gputest: 17 | extends: .projecttest 18 | variables: 19 | CI_IMAGE_TAG: 'cuda' 20 | CI_JULIA_PROJECT: 'test/gpuenv' 21 | tags: 22 | - cuda 23 | 24 | .cputest: 25 | extends: .projecttest 26 | variables: 27 | CI_IMAGE_TAG: 'plain' 28 | CI_JULIA_PROJECT: 'test/testenv' 29 | 30 | gpu:test:dev: 31 | extends: .gputest 32 | variables: 33 | CI_VERSION_TAG: 'dev' 34 | allow_failure: true 35 | 36 | cpu:test:dev: 37 | extends: .cputest 38 | variables: 39 | CI_VERSION_TAG: 'dev' 40 | allow_failure: true 41 | 42 | gpu:test:v1.1: 43 | extends: .gputest 44 | variables: 45 | CI_VERSION_TAG: 'v1.1' 46 | 47 | cpu:test:v1.1: 48 | extends: .cputest 49 | variables: 50 | CI_VERSION_TAG: 'v1.1' 51 | 52 | gpu:test:v1.2: 53 | extends: .gputest 54 | variables: 55 | CI_VERSION_TAG: 'v1.2' 56 | 57 | cpu:test:v1.2: 58 | extends: .cputest 59 | variables: 60 | CI_VERSION_TAG: 'v1.2' 61 | 62 | documentation: 63 | extends: .documentation 64 | dependencies: 65 | - cpu:test:v1.1 66 | variables: 67 | CI_VERSION_TAG: 'v1.1' 68 | CI_IMAGE_TAG: 'plain' 69 | only: 70 | - master 71 | - staging 72 | - trying 73 | 74 | pages: 75 | dependencies: 76 | - documentation 77 | stage: deploy 78 | script: 79 | - mv docs/build public 80 | artifacts: 81 | paths: 82 | - public 83 | only: 84 | - master 85 | 86 | -------------------------------------------------------------------------------- /docs/src/index.md: -------------------------------------------------------------------------------- 1 | # GPUifyLoops.jl 2 | 3 | GPUifyLoops tries to solve the problem of code-duplication that can occur 4 | when writing performant kernels that target multiple devices. 
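The snippet below is a minimal sketch of that idea (it mirrors `examples/simple.jl`,
which is included in full under Examples): the loop body is written once, iterates
over the full range on the CPU, and over `threadIdx().x` when launched on a GPU.

```julia
using GPUifyLoops

function kernel(A)
    @loop for i in (1:size(A, 1);
                    threadIdx().x)
        A[i] = 2 * A[i]
    end
    @synchronize
end

data = Array{Float32}(undef, 1024)
kernel(data)  # plain call on the CPU; `@launch CUDA() threads=length(A) kernel(A)` targets the GPU
```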
5 | 6 | ## API 7 | 8 | ```@docs 9 | @loop 10 | @setup 11 | @synchronize 12 | ``` 13 | 14 | ## Examples 15 | ### Simple 16 | 17 | ````@eval 18 | using Markdown 19 | Markdown.parse(""" 20 | ```julia 21 | $(read("../../examples/simple.jl", String)) 22 | ``` 23 | """) 24 | ```` 25 | 26 | ## Other useful tools 27 | ### Loop unrolling 28 | 29 | On Julia `v1.2.0-DEV.462` we can pass information to the LLVM loop tooling. 30 | GPUifyLoops contains a macro `@unroll` that can unroll a loop fully if the 31 | trip count is known or partially by a factor. 32 | 33 | ```@docs 34 | @unroll 35 | ``` 36 | #### Example: 37 | 38 | ```julia 39 | @noinline iteration(i) = @show i 40 | # Unknown loop count 41 | f(N) = @unroll 3 for i in 1:N 42 | iteration(i) 43 | end 44 | @code_llvm f(10) 45 | ``` 46 | 47 | This should yield something like: 48 | ```LLVM 49 | %6 = call i64 @julia_iteration_12527(i64 %value_phi3) 50 | %7 = add nuw i64 %value_phi3, 1 51 | %8 = call i64 @julia_iteration_12527(i64 %7) 52 | %9 = add i64 %value_phi3, 2 53 | %10 = call i64 @julia_iteration_12527(i64 %9) 54 | %11 = add i64 %value_phi3, 3 55 | ``` 56 | 57 | You can also unroll a loop fully, but that requires a known/computable 58 | trip-count: 59 | 60 | ```julia 61 | @noinline iteration(i) = @show i 62 | # Unknown loop count 63 | f() = @unroll for i in 1:10 64 | iteration(i) 65 | end 66 | @code_llvm f() 67 | ``` 68 | 69 | Which yields something like: 70 | ```LLVM 71 | %4 = call i64 @julia_iteration_12527(i64 1) 72 | %5 = call i64 @julia_iteration_12527(i64 2) 73 | %6 = call i64 @julia_iteration_12527(i64 3) 74 | %7 = call i64 @julia_iteration_12527(i64 4) 75 | %8 = call i64 @julia_iteration_12527(i64 5) 76 | %9 = call i64 @julia_iteration_12527(i64 6) 77 | %10 = call i64 @julia_iteration_12527(i64 7) 78 | %11 = call i64 @julia_iteration_12527(i64 8) 79 | %12 = call i64 @julia_iteration_12527(i64 9) 80 | %13 = call i64 @julia_iteration_12527(i64 10) 81 | ``` 82 | -------------------------------------------------------------------------------- /src/GPUifyLoops.jl: -------------------------------------------------------------------------------- 1 | module GPUifyLoops 2 | 3 | if VERSION < v"1.1" 4 | @error "GPUifyLoops depends on Julia v1.1" 5 | end 6 | 7 | abstract type Device end 8 | struct CPU <: Device end 9 | 10 | abstract type GPU <: Device end 11 | struct CUDA <: GPU end 12 | 13 | #= 14 | # Hopefully we can eventually support AMDGPUs through ROCm 15 | struct ROCm <: GPU end 16 | =# 17 | 18 | export CPU, CUDA, Device 19 | 20 | using StaticArrays 21 | using Requires 22 | 23 | export @setup, @loop, @synchronize 24 | export @scratch, @shmem 25 | export contextualize 26 | export @unroll 27 | export @launch 28 | 29 | ## 30 | # contextualize 31 | ## 32 | include("context.jl") 33 | 34 | backend() = CPU() 35 | # FIXME: Get backend from Context or have Context per backend 36 | Cassette.overdub(ctx::Ctx, ::typeof(backend)) = CUDA() 37 | 38 | macro launch(ex...) 39 | # destructure the `@launch` expression 40 | call = ex[end] 41 | kwargs = ex[2:end-1] 42 | 43 | device = ex[1] 44 | 45 | # destructure the kernel call 46 | if call.head != :call 47 | throw(ArgumentError("second argument to @launch should be a function call")) 48 | end 49 | 50 | f = call.args[1] 51 | args = call.args[2:end] 52 | 53 | quote 54 | $launch($(esc(device)), $(esc(f)), $(map(esc, args)...); $(map(esc, kwargs)...)) 55 | end 56 | end 57 | 58 | 59 | 60 | """ 61 | launch(::Device, f, args..., kwargs...) 62 | 63 | Launch a kernel on the GPU. 
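On the `CPU()` device the call simply forwards to `f(args...)`.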
`kwargs` are passed to `@cuda` 64 | `kwargs` can be any of the compilation and runtime arguments 65 | normally passed to `@cuda`. 66 | """ 67 | launch(::CPU, f, args...; kwargs...) = f(args...) 68 | 69 | """ 70 | launch_config(::F, maxthreads, args...; kwargs...) 71 | 72 | Calculate a valid launch configuration based on the typeof(F), the 73 | maximum number of threads, the functions arguments and the particular 74 | launch configuration passed to the call. 75 | 76 | Return a NamedTuple that has `blocks`, `threads`, `shmem`, and `stream`. 77 | All arguments are optional, but blocks and threads is recommended. 78 | """ 79 | function launch_config(@nospecialize(f), maxthreads, args...; kwargs...) 80 | return kwargs 81 | end 82 | 83 | function split_kwargs(kwargs) 84 | compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs] 85 | call_kws = [:blocks, :threads, :shmem, :stream, :config] 86 | compiler_kwargs = [] 87 | call_kwargs = [] 88 | for kwarg in kwargs 89 | key, val = kwarg 90 | if isa(key, Symbol) 91 | if key in compiler_kws 92 | push!(compiler_kwargs, kwarg) 93 | elseif key in call_kws 94 | push!(call_kwargs, kwarg) 95 | else 96 | throw(ArgumentError("unknown keyword argument '$key'")) 97 | end 98 | else 99 | throw(ArgumentError("non-symbolic keyword '$key'")) 100 | end 101 | end 102 | return compiler_kwargs, call_kwargs 103 | end 104 | 105 | @init @require CUDAnative="be33ccc6-a3ff-5ff2-a52e-74243cff1e17" begin 106 | using .CUDAnative 107 | 108 | function version_check() 109 | project = joinpath(dirname(pathof(CUDAnative)), "../Project.toml") 110 | let Pkg = Base.require(Base.PkgId(Base.UUID((0x44cfe95a1eb252ea, 0xb672e2afdf69b78f)), "Pkg")) 111 | project = Pkg.TOML.parse(String(read(project))) 112 | return version = VersionNumber(get(project, "version", "0.0.0")) 113 | end 114 | end 115 | 116 | global const CUDANativeVersion = version_check() 117 | 118 | function launch(::CUDA, f::F, args...; kwargs...) where F 119 | compiler_kwargs, call_kwargs = split_kwargs(kwargs) 120 | args = (ctx, f, args...) 121 | GC.@preserve args begin 122 | kernel_args = map(cudaconvert, args) 123 | kernel_tt = Tuple{map(Core.Typeof, kernel_args)...} 124 | if CUDANativeVersion > v"2.1.2" 125 | kernel = cufunction(Cassette.overdub, kernel_tt; name=String(nameof(f)), compiler_kwargs...) 126 | else 127 | kernel = cufunction(Cassette.overdub, kernel_tt; compiler_kwargs...) 128 | end 129 | 130 | maxthreads = CUDAnative.maxthreads(kernel) 131 | config = launch_config(f, maxthreads, args...; call_kwargs...) 132 | 133 | kernel(kernel_args...; config...) 134 | end 135 | return nothing 136 | end 137 | end 138 | 139 | isdevice(::CPU) = false 140 | isdevice(::Device) = true 141 | isdevice() = isdevice(backend()) 142 | 143 | sync(::CPU) = nothing 144 | sync() = sync(backend()) 145 | 146 | @init @require CUDAnative="be33ccc6-a3ff-5ff2-a52e-74243cff1e17" begin 147 | using .CUDAnative 148 | sync(::CUDA) = CUDAnative.sync_threads() 149 | end 150 | 151 | @deprecate iscpu(::Val{:GPU}) isdevice() 152 | @deprecate iscpu(::Val{:CPU}) !isdevice() 153 | @deprecate sync(::Val{:GPU}) sync() 154 | @deprecate sync(::Val{:CPU}) sync() 155 | 156 | 157 | """ 158 | @syncronize 159 | 160 | Calls `sync_threads()` on the GPU and nothing on the CPU. 
161 | """ 162 | macro synchronize() 163 | :($sync()) 164 | end 165 | 166 | """ 167 | @loop for i in (A; B) 168 | # body 169 | end 170 | 171 | Take a `for i in (A; B)` expression and on the CPU lowers it to: 172 | 173 | ```julia 174 | for i in A 175 | # body 176 | end 177 | ``` 178 | 179 | and on the GPU: 180 | ```julia 181 | for i in B 182 | if !(i in A) 183 | continue 184 | end 185 | # body 186 | end 187 | ``` 188 | """ 189 | macro loop(expr) 190 | if expr.head != :for 191 | error("Syntax error: @loop needs a for loop") 192 | end 193 | 194 | induction = expr.args[1] 195 | body = expr.args[2] 196 | 197 | if induction.head != :(=) 198 | error("Syntax error: @loop needs a induction variable") 199 | end 200 | 201 | rhs = induction.args[2] 202 | if rhs.head == :block 203 | @assert length(rhs.args) == 3 204 | # rhs[2] is a linenode 205 | cpuidx = rhs.args[1] 206 | gpuidx = rhs.args[3] 207 | 208 | rhs = Expr(:if, :(!$isdevice()), cpuidx, gpuidx) 209 | induction.args[2] = rhs 210 | 211 | # use cpuidx calculation to check bounds of on GPU. 212 | bounds_chk = quote 213 | if $isdevice() && !($gpuidx in $cpuidx) 214 | continue 215 | end 216 | end 217 | 218 | pushfirst!(body.args, bounds_chk) 219 | end 220 | 221 | return esc(Expr(:for, induction, body)) 222 | end 223 | 224 | ### 225 | # Scratch and shared-memory 226 | ### 227 | include("scratch.jl") 228 | include("shmem.jl") 229 | 230 | ### 231 | # Loopinfo 232 | # - `@unroll` 233 | ### 234 | include("loopinfo.jl") 235 | using .LoopInfo 236 | 237 | end 238 | -------------------------------------------------------------------------------- /src/context.jl: -------------------------------------------------------------------------------- 1 | ## 2 | # Implements contextual dispatch through Cassette.jl 3 | # Goals: 4 | # - Rewrite common CPU functions to appropriate GPU intrinsics 5 | # 6 | # TODO: 7 | # - error (erf, ...) 8 | # - min, max 9 | # - mod, rem 10 | # - gamma 11 | # - bessel 12 | # - distributions 13 | # - unsorted 14 | 15 | using Cassette 16 | 17 | function ir_element(x, code::Vector) 18 | while isa(x, Core.SSAValue) 19 | x = code[x.id] 20 | end 21 | return x 22 | end 23 | 24 | ## 25 | # Forces inlining on everything that is not marked `@noinline` 26 | # avoids overdubbing of pure functions 27 | # avoids overdubbing of IntrinsicFunctions and Builtins 28 | ## 29 | function transform(ctx, ref) 30 | CI = ref.code_info 31 | noinline = any(@nospecialize(x) -> 32 | Core.Compiler.isexpr(x, :meta) && 33 | x.args[1] == :noinline, 34 | CI.code) 35 | CI.inlineable = !noinline 36 | 37 | # don't overdub pure functions 38 | if CI.pure 39 | n_method_args = Int(ref.method.nargs) 40 | if ref.method.isva 41 | Cassette.insert_statements!(CI.code, CI.codelocs, 42 | (x, i) -> i == 1 ? 3 : nothing, 43 | (x, i) -> i == 1 ? [ 44 | # this could run into troubles when the function is @pure f(x...) since then n_method_args==2, but this seems to work sofar. 45 | Expr(:call, Expr(:nooverdub, GlobalRef(Core, :tuple)), (Core.SlotNumber(i) for i in 2:(n_method_args-1))...), 46 | Expr(:call, Expr(:nooverdub, GlobalRef(Core, :_apply)), Core.SlotNumber(1), Core.SSAValue(i), Core.SlotNumber(n_method_args)), 47 | Expr(:return, Core.SSAValue(i+1))] : nothing) 48 | else 49 | Cassette.insert_statements!(CI.code, CI.codelocs, 50 | (x, i) -> i == 1 ? 2 : nothing, 51 | (x, i) -> i == 1 ? [ 52 | Expr(:call, Expr(:nooverdub, Core.SlotNumber(1)), (Core.SlotNumber(i) for i in 2:n_method_args)...) 
53 | Expr(:return, Core.SSAValue(i))] : nothing) 54 | end 55 | CI.ssavaluetypes = length(CI.code) 56 | return CI 57 | end 58 | 59 | # overdubbing IntrinsicFunctions removes our ability to profile code 60 | newstmt = (x, i) -> begin 61 | isassign = Base.Meta.isexpr(x, :(=)) 62 | stmt = isassign ? x.args[2] : x 63 | if Base.Meta.isexpr(stmt, :call) 64 | applycall = Cassette.is_ir_element(stmt.args[1], GlobalRef(Core, :_apply), CI.code) 65 | if applycall 66 | f = stmt.args[2] 67 | else 68 | f = stmt.args[1] 69 | end 70 | f = ir_element(f, CI.code) 71 | if f isa GlobalRef 72 | mod = f.mod 73 | name = f.name 74 | if Base.isbindingresolved(mod, name) && Base.isdefined(mod, name) 75 | ff = getfield(f.mod, f.name) 76 | if ff isa Core.IntrinsicFunction || ff isa Core.Builtin 77 | if applycall 78 | stmt.args[2] = Expr(:nooverdub, f) 79 | else 80 | stmt.args[1] = Expr(:nooverdub, f) 81 | end 82 | end 83 | end 84 | end 85 | end 86 | return [x] 87 | end 88 | 89 | Cassette.insert_statements!(CI.code, CI.codelocs, (x, i) -> 1, newstmt) 90 | CI.ssavaluetypes = length(CI.code) 91 | # Core.Compiler.validate_code(CI) 92 | return CI 93 | end 94 | 95 | const GPUifyPass = Cassette.@pass transform 96 | 97 | Cassette.@context Ctx 98 | const ctx = Cassette.disablehooks(Ctx(pass = GPUifyPass)) 99 | 100 | ### 101 | # Cassette fixes 102 | ### 103 | @inline Cassette.overdub(::Ctx, ::typeof(Core.kwfunc), f) = return Core.kwfunc(f) 104 | @inline Cassette.overdub(::Ctx, ::typeof(Core.apply_type), args...) = return Core.apply_type(args...) 105 | @inline Cassette.overdub(::Ctx, ::typeof(StaticArrays.Size), x::Type{<:AbstractArray{<:Any, N}}) where {N} = return StaticArrays.Size(x) 106 | 107 | # this looks like a recursion detection failure 108 | @inline Cassette.overdub(::Ctx, ::typeof(Base.Broadcast.axes), args...) = return Base.Broadcast.axes(args...) 109 | 110 | 111 | ### 112 | # Rewrite functions 113 | ### 114 | 115 | # define +, -, * as contract 116 | 117 | for (f, T) in Base.Iterators.product((:add, :mul, :sub), (Float32, Float64)) 118 | name = Symbol("$(f)_float_contract") 119 | if T === Float32 120 | llvmt = "float" 121 | elseif T === Float64 122 | llvmt = "double" 123 | end 124 | 125 | #XXX Use LLVM.jl 126 | ir = """ 127 | %x = f$f contract $llvmt %0, %1 128 | ret $llvmt %x 129 | """ 130 | @eval begin 131 | # the @pure is necessary so that we can constant propagate. 
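# For illustration: with `f = :add` and `T = Float32` this `@eval` defines
#   add_float_contract(a::Float32, b::Float32)
# whose body is a single `fadd contract float` emitted via `llvmcall`; the
# `contract` flag is what later lets the backend fuse mul/add pairs into FMA.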
132 | Base.@pure function $name(a::$T, b::$T) 133 | @Base._inline_meta 134 | Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b) 135 | end 136 | end 137 | end 138 | @inline Cassette.overdub(ctx::Ctx, ::typeof(+), a::T, b::T) where T<:Union{Float32, Float64} = add_float_contract(a, b) 139 | @inline Cassette.overdub(ctx::Ctx, ::typeof(-), a::T, b::T) where T<:Union{Float32, Float64} = sub_float_contract(a, b) 140 | @inline Cassette.overdub(ctx::Ctx, ::typeof(*), a::T, b::T) where T<:Union{Float32, Float64} = mul_float_contract(a, b) 141 | @inline Cassette.overdub(ctx::Ctx, ::typeof(^), x::Float64, y::Float64) = CUDAnative.pow(x, y) 142 | @inline Cassette.overdub(ctx::Ctx, ::typeof(^), x::Float32, y::Float32) = CUDAnative.pow(x, y) 143 | @inline Cassette.overdub(ctx::Ctx, ::typeof(^), x::Float64, y::Int32) = CUDAnative.pow(x, y) 144 | @inline Cassette.overdub(ctx::Ctx, ::typeof(^), x::Float32, y::Int32) = CUDAnative.pow(x, y) 145 | @inline Cassette.overdub(ctx::Ctx, ::typeof(^), x::Union{Float32, Float64}, y::Int64) = CUDAnative.pow(x, y) 146 | 147 | # libdevice.jl 148 | const cudafuns = (:cos, :cospi, :sin, :sinpi, :tan, 149 | :acos, :asin, :atan, 150 | :cosh, :sinh, :tanh, 151 | :acosh, :asinh, :atanh, 152 | :log, :log10, :log1p, :log2, 153 | :exp, :exp2, :exp10, :expm1, :ldexp, 154 | # :isfinite, :isinf, :isnan, :signbit, 155 | :abs, 156 | :sqrt, :cbrt, 157 | :ceil, :floor,) 158 | for f in cudafuns 159 | @eval function Cassette.overdub(ctx::Ctx, ::typeof(Base.$f), x::Union{Float32, Float64}) 160 | @Base._inline_meta 161 | return CUDAnative.$f(x) 162 | end 163 | end 164 | 165 | function Cassette.overdub(::Ctx, ::typeof(:), start::T, step::T, stop::T) where T<:Union{Float16,Float32,Float64} 166 | lf = (stop-start)/step 167 | if lf < 0 168 | len = 0 169 | elseif lf == 0 170 | len = 1 171 | else 172 | len = round(Int, lf) + 1 173 | stop′ = start + (len-1)*step 174 | # if we've overshot the end, subtract one: 175 | len -= (start < stop < stop′) + (start > stop > stop′) 176 | end 177 | Base.steprangelen_hp(T, start, step, 0, len, 1) 178 | end 179 | 180 | 181 | """ 182 | contextualize(::Dev, f) 183 | 184 | This contexualizes the function `f` for a given device type `Dev`. 185 | 186 | For the device `CUDA()`, `contextualize` replaces calls to math library 187 | functions. For example, `cos`, `sin`, are replaced with `CUDAnative.cos`, 188 | `CUDAnative.sin`, respectively. 189 | 190 | The full list functions that are replaced is $cudafuns. 191 | 192 | # Examples 193 | ```julia 194 | function kernel!(::Dev, A, f) where {Dev} 195 | @setup Dev 196 | @loop for i in (1:size(A,1); threadIdx().x) 197 | A[i] = f(A[i]) 198 | end 199 | end 200 | 201 | g(x) = sin(x) 202 | kernel!(A::Array) = kernel!(CPU(), A, contextualize(CPU(), g)) 203 | kernel!(A::CuArray) = 204 | @cuda threads=length(A) kernel!(CUDA(), A, contextualize(CUDA(), g)) 205 | 206 | a = rand(Float32, 1024) 207 | b, c = copy(a), CuArray(a) 208 | 209 | kernel!(b) 210 | kernel!(c) 211 | 212 | @assert g.(a) ≈ b 213 | @assert g.(a) ≈ c 214 | ``` 215 | """ 216 | contextualize(f::F) where F = (args...) -> Cassette.overdub(ctx, f, args...) 
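# A quick way to observe both rewrites (sketch, adapted from test/test.jl):
#
#   f1(x) = (sin(1.0 + x); return nothing)
#   g1(x) = contextualize(f1)(x)
#   asm = sprint(io -> CUDAnative.code_llvm(io, g1, Tuple{Float64}, kernel=true,
#                                           optimize=false, dump_module=true))
#   occursin("fadd contract double", asm)        # contracted FP arithmetic
#   occursin(r"call .* double @__nv_sin", asm)   # Base.sin mapped to libdevice sin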
217 | -------------------------------------------------------------------------------- /test/test.jl: -------------------------------------------------------------------------------- 1 | using GPUifyLoops 2 | using Test 3 | using InteractiveUtils 4 | 5 | function kernel(A) 6 | @loop for i in (1:size(A,1); 7 | threadIdx().x) 8 | A[i] = 2*A[i] 9 | end 10 | @synchronize 11 | end 12 | 13 | f2(x) = sin(x) 14 | f3(x) = 1 + f2(x) 15 | f4(x) = x^1.2 16 | f5(x) = x^3 17 | 18 | function kernel2!(A, B, h) 19 | @inbounds @loop for i in (1:size(A,1); threadIdx().x) 20 | A[i] = h(B[i]) 21 | end 22 | nothing 23 | end 24 | 25 | @testset "Array" begin 26 | data = Array{Float32}(undef, 1024) 27 | kernel(data) 28 | end 29 | 30 | @static if Base.find_package("CuArrays") !== nothing 31 | using CuArrays 32 | using CUDAnative 33 | 34 | function kernel(A::CuArray) 35 | @launch CUDA() threads=length(A) kernel(A) 36 | end 37 | 38 | @testset "CuArray" begin 39 | data = CuArray{Float32}(undef, 1024) 40 | kernel(data) 41 | end 42 | 43 | @testset "contextualize" begin 44 | f(x) = 2*x 45 | g(x) = GPUifyLoops.contextualize(f)(x) 46 | @test g(3.0) == 6.0 47 | f(x) = 3*x 48 | 49 | # Enable test on v1.3 once fix commit is known 50 | # @test g(3.0) == 9.0 51 | @test_broken g(3.0) == 9.0 52 | f1(x) = (sin(1.0 + x); return nothing) 53 | g1(x) = GPUifyLoops.contextualize(f1)(x) 54 | asm = sprint(io->CUDAnative.code_llvm(io, g1, Tuple{Float64}, kernel=true, 55 | optimize=false, dump_module=true)) 56 | @test occursin(r"call .* double @__nv_sin", asm) 57 | @test occursin("fadd contract double", asm) 58 | 59 | @testset "don't overdub intrinsics" begin 60 | global simple_kernel, kernel 61 | simple_kernel(A, x) = (A[1] = 1 + x; return nothing) 62 | kernel(A, x) = GPUifyLoops.contextualize(simple_kernel)(A, x) 63 | CI, ret = CUDAnative.code_typed(kernel, Tuple{CUDAnative.CuDeviceArray{Int64,1, CUDAnative.AS.Global}, Int64}, debuginfo=:source)[1] 64 | 65 | intrinsics = findall(CI.code) do stmt 66 | if Base.Meta.isexpr(stmt, :call) 67 | f = stmt.args[1] 68 | if f isa GlobalRef 69 | f = getfield(f.mod, f.name) 70 | return f isa Core.IntrinsicFunction || f isa Core.Builtin 71 | end 72 | end 73 | return false 74 | end 75 | 76 | for i in intrinsics 77 | lineinfo = CI.linetable[CI.codelocs[i]] 78 | @test !(lineinfo.method === :call || 79 | lineinfo.file === Symbol("context.jl")) 80 | end 81 | end 82 | 83 | begin 84 | global kernel2! 
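# CPU reference first: plain calls to kernel2! with f3/f4/f5 must match broadcasting.
# Afterwards the same kernel runs on CuArrays through `@launch CUDA()` and is
# checked against the same reference.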
85 | data = rand(Float32, 1024) 86 | fdata = similar(data) 87 | 88 | kernel2!(fdata, data, f3) 89 | @test f3.(data) ≈ fdata 90 | 91 | kernel2!(fdata, data, f4) 92 | @test f4.(data) ≈ fdata 93 | 94 | kernel2!(fdata, data, f5) 95 | @test f5.(data) ≈ fdata 96 | 97 | function kernel2!(A::CuArray, B::CuArray, f) 98 | @launch CUDA() threads=length(A) kernel2!(A, B, f) 99 | end 100 | 101 | cudata = CuArray(data) 102 | cufdata = similar(cudata) 103 | 104 | kernel2!(cufdata, cudata, f3) 105 | @test f3.(data) ≈ cufdata 106 | 107 | kernel2!(cufdata, cudata, f4) 108 | @test f4.(data) ≈ cufdata 109 | 110 | kernel2!(cufdata, cudata, f5) 111 | @test f5.(data) ≈ cufdata 112 | end 113 | end 114 | end 115 | 116 | function kernel3!(A) 117 | s1 = @shmem eltype(A) (1024,) 118 | s2 = @shmem eltype(A) (1024,) 119 | 120 | @loop for i in (1:size(A,1); threadIdx().x) 121 | s1[i] = 2*A[i] 122 | s2[i] = 3*A[i] 123 | end 124 | @synchronize 125 | @loop for i in (1:size(A,1); threadIdx().x) 126 | A[i] = s1[i] 127 | end 128 | nothing 129 | end 130 | 131 | let 132 | function ker1!(::Val{Nq}) where Nq 133 | s_x = @shmem Float32 Nq 134 | end 135 | 136 | CI, rt = @code_typed ker1!(Val(10)) 137 | @test Base.isconcretetype(rt) 138 | end 139 | 140 | @testset "shared memory" begin 141 | data = rand(Float32, 1024) 142 | cpudata = copy(data) 143 | 144 | @launch CPU() kernel3!(cpudata) 145 | @test cpudata ≈ 2 .* data 146 | 147 | @static if Base.find_package("CuArrays") !== nothing 148 | using CuArrays 149 | using CUDAnative 150 | 151 | cudata = CuArray(data) 152 | @launch CUDA() threads=length(cudata) kernel3!(cudata) 153 | @test Array(cudata) ≈ 2 .* data 154 | end 155 | end 156 | 157 | # Scratch arrays 158 | 159 | function kernel_scratch(A, ::Val{N}) where N 160 | a = @scratch eltype(A) (N, N) 2 161 | b = @scratch eltype(A) (2, N, N) 2 162 | @loop for j in (1:size(A,2); threadIdx().y) 163 | @loop for i in (1:size(A,1); threadIdx().x) 164 | a[i, j] = A[i, j] 165 | b[1, i, j] = -A[i, j] 166 | b[2, i, j] = 2a[i, j] 167 | end 168 | end 169 | end 170 | 171 | 172 | function f1() 173 | A = @scratch Int64 (12, 3) 2 174 | @test A.data isa GPUifyLoops.MArray 175 | @test size(A.data) == (1,) 176 | end 177 | 178 | function f2() 179 | A = @scratch Int64 (12, 3) 1 180 | @test A.data isa GPUifyLoops.MArray 181 | end 182 | 183 | function f3() 184 | A = @scratch Int64 (12, 3) 1 185 | @test A isa GPUifyLoops.MArray 186 | end 187 | 188 | @testset "Scratch Arrays" begin 189 | contextualize(f1)() 190 | contextualize(f2)() 191 | f3() 192 | N = 10 193 | A = rand(N, N) 194 | @launch CPU() kernel_scratch(A, Val(N)) 195 | 196 | @static if Base.find_package("CuArrays") !== nothing 197 | using CuArrays 198 | 199 | d_A = CuArray(A) 200 | @launch CUDA() kernel_scratch(d_A, Val(N)) 201 | end 202 | end 203 | 204 | @testset "Loopinfo" begin 205 | # Right now test that we don't break things 206 | # Should probably test that codegen is correct. 
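# A possible codegen check (sketch only, not enabled here): move the loop body into
# a @noinline helper as in docs/src/index.md and inspect the IR, e.g.
#   @noinline iteration(i) = i
#   g(N) = @unroll 2 for i in 1:N; iteration(i); end
#   ir = sprint(io -> InteractiveUtils.code_llvm(io, g, Tuple{Int}))
# and then assert that calls to `iteration` appear twice per unrolled iteration.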
207 | f(N) = @unroll 2 for i in 1:N 208 | @show i 209 | end 210 | f(10) 211 | 212 | f() = @unroll for i in 1:10 213 | @show i 214 | end 215 | f() 216 | end 217 | 218 | using StaticArrays 219 | function kernel_MArray!(A) 220 | l_F = MArray{Tuple{3, 3}, eltype(A)}(undef) 221 | @inbounds for j = 1:3, i = 1:3 222 | l_F[i, j] = A[i, j] 223 | end 224 | nothing 225 | end 226 | function kernel_similar_MArray!(A) 227 | l_F = MArray{Tuple{3, 3}, eltype(A)}(undef) 228 | l_G = similar(l_F, Size(2,2)) 229 | @inbounds for j = 1:2, i = 1:2 230 | l_G[i, j] = A[i, j] 231 | end 232 | nothing 233 | end 234 | 235 | @testset "StaticArrays" begin 236 | @static if Base.find_package("CuArrays") !== nothing 237 | using CuArrays 238 | using CUDAnative 239 | 240 | A = CuArray(rand(3,3)) 241 | @launch CUDA() threads=(3,3) kernel_MArray!(A) 242 | @launch CUDA() threads=(3,3) kernel_similar_MArray!(A) 243 | end 244 | 245 | A = rand(3,3) 246 | @launch CPU() threads=(3,3) kernel_MArray!(A) 247 | @launch CPU() threads=(3,3) kernel_similar_MArray!(A) 248 | end 249 | 250 | # For the next three test we check that the _apply got inlined correctly 251 | Base.@pure pure_f1(x, y) = x + y 252 | let 253 | CI, rt = @code_typed GPUifyLoops.Cassette.overdub(GPUifyLoops.ctx, pure_f1, 1, 2) 254 | expr = CI.code[end-1] 255 | @test expr.head === :call || expr.head === :invoke 256 | @test expr.args[1] === GlobalRef(Base, :add_int) 257 | end 258 | 259 | Base.@pure pure_f2(x, ys...) = x + sum(ys) 260 | let 261 | CI, rt = @code_typed GPUifyLoops.Cassette.overdub(GPUifyLoops.ctx, pure_f2, 1, 2, 3, 4) 262 | expr = CI.code[end-1] 263 | @test expr.head === :call || expr.head === :invoke 264 | @test expr.args[1] === GlobalRef(Base, :add_int) 265 | end 266 | 267 | Base.@pure pure_f3(ys...) = sum(ys) 268 | let 269 | CI, rt = @code_typed GPUifyLoops.Cassette.overdub(GPUifyLoops.ctx, pure_f3, 1, 2, 3, 4) 270 | expr = CI.code[end-1] 271 | @test expr.head === :call || expr.head === :invoke 272 | @test expr.args[1] === GlobalRef(Base, :add_int) 273 | end 274 | 275 | --------------------------------------------------------------------------------