├── .gitmodules
├── Tests
    ├── README.md
    └── Test1.hs
├── Setup.hs
├── .gitignore
├── Obsidian
    ├── DimSpec.hs
    ├── Globs.hs
    ├── Data.hs
    ├── Names.hs
    ├── Atomic.hs
    ├── Types.hs
    ├── CodeGen
    │   ├── OpenCLEmbedded.hs
    │   ├── CUDA.hs
    │   ├── Liveness.hs
    │   ├── Reify.hs
    │   ├── Memory.hs
    │   ├── Program.hs
    │   ├── Memory2.hs
    │   ├── CompileIM.hs
    │   └── CompileIMOpenCLEmbedded.hs
    ├── Force.hs
    ├── SeqLoop.hs
    ├── Mutable.hs
    ├── Array.hs
    ├── Memory.hs
    ├── Program.hs
    └── Library.hs
├── README.org
├── .jenkins_script.sh
├── Obsidian.hs
├── LICENSE
├── .jenkins_runBench.sh
└── Obsidian.cabal


/.gitmodules:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/Tests/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Tests directory


--------------------------------------------------------------------------------
/Setup.hs:
--------------------------------------------------------------------------------
1 | import Distribution.Simple
2 | main = defaultMain
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | dist
 2 | cabal-dev
 3 | *.o
 4 | *.hi
 5 | *.chi
 6 | *.chs.h
 7 | .virthualenv
 8 | *.cu
 9 | *.cubin
10 | cabal.sandbox.config
11 | *#
12 | *~
13 | .cabal-sandbox


--------------------------------------------------------------------------------
/Obsidian/DimSpec.hs:
--------------------------------------------------------------------------------
1 | 
2 | {- Joel Svensson 2012 -} 
3 | 
4 | module Obsidian.DimSpec (DimSpec (..)) where 
5 | 
6 | data DimSpec = X | Y | Z
7 |              deriving (Eq,Ord,Show)
8 | 


--------------------------------------------------------------------------------
/Obsidian/Globs.hs:
--------------------------------------------------------------------------------
 1 | 
 2 | {- Joel Svensson 2012 -} 
 3 | 
 4 | module Obsidian.Globs where
 5 | 
 6 | 
 7 | import Data.Word
 8 | 
 9 | ---------------------------------------------------------------------------
10 | -- Aliases
11 | 
12 | type Name = String
13 | 
14 | 
15 | 
16 | type NumThreads = Word32


--------------------------------------------------------------------------------
/Obsidian/Data.hs:
--------------------------------------------------------------------------------
 1 | 
 2 | module Obsidian.Data where
 3 | 
 4 | 
 5 | import Obsidian.Exp
 6 | import Obsidian.Memory
 7 | 
 8 | 
 9 | --------------------------------------------------------------------------- 
10 | -- Data (should match up with storable instances for completeness)
11 | -- Use this to ensure nonnestednes where required 
12 | ---------------------------------------------------------------------------
13 | class (Storable a, Choice a) => Data a
14 | instance Scalar a => Data (Exp a)
15 | instance (Data a, Data b) => Data (a,b)
16 | instance (Data a, Data b, Data c) => Data (a,b,c)
17 | instance (Data a, Data b, Data c, Data d) => Data (a,b,c,d)
18 | 
19 | 


--------------------------------------------------------------------------------
/README.org:
--------------------------------------------------------------------------------
 1 | * Obsidian
 2 |   
 3 |   An embedded language for GPU kernel programming. 
 4 | 
 5 | * News:  
 6 |   + Obsidian does not depend on the "CUDA" package anymore. 
 7 |     If running on a GPU from haskell (by using the CUDA package) 
 8 |     is desired, the Obsidian-Run-CUDA package should be installed as well
 9 | 
10 | * TODO: 
11 |   + "Up" the versions 
12 |   + Remove all example code that relies on CUDA (Move to another repo)  
13 |   
14 | 
15 | * Versioning Policy (from Apr 14 2014 with the release of version 0.1.0.0) 
16 |   + A.B.C.D  
17 |   + A.B changes when API Changes (in a disruptive way) 
18 |   + C   changes with additions that does not break existing programs 
19 |   + D   Bugfixes and internal hacking. 
20 | 


--------------------------------------------------------------------------------
/.jenkins_script.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e 
 4 | set -x 
 5 | 
 6 | source $HOME/rn_jenkins_scripts/acquire_ghc.sh
 7 | 
 8 | which -a cabal
 9 | cabal sandbox init 
10 | cabal sandbox hc-pkg list
11 | 
12 | 
13 | # Need to pull down HsBencher
14 | # ----------------------------------------
15 | # Need to know what dir this is. and if we 
16 | # are using cabal sandbox. 
17 | 
18 | # echo Pull HSBencher from Github
19 | # git clone git@github.com:rrnewton/HsBencher 
20 | # cabal install ./HsBencher/hsbencher/ 
21 | # cabal install ./HsBencher/hsbnecher-fusion/ 
22 | 
23 | 
24 | 
25 | cabal update
26 | cabal install Obsidian.cabal 
27 | 
28 | #cabal install ./ ./Examples/Simple 
29 | 
30 | 
31 | #cd Examples/Simple 
32 | #cabal sandbox init --sandbox=../../.cabal-sandbox
33 | #cabal install .
34 | 
35 | 
36 | # cd Examples/ReductionTutorial
37 | #cabal sandbox init --sandbox=../../.cabal-sandbox 
38 | #cabal install Reduce.cabal
39 | 


--------------------------------------------------------------------------------
/Tests/Test1.hs:
--------------------------------------------------------------------------------
 1 | 
 2 | {-# LANGUAGE ScopedTypeVariables #-}
 3 | module Main where
 4 | 
 5 | 
 6 | import Prelude hiding (replicate)
 7 | import Prelude as P
 8 | 
 9 | 
10 | import Obsidian
11 | import Obsidian.Run.CUDA.Exec
12 | 
13 | 
14 | import qualified Data.Vector.Storable as V
15 | import Control.Monad.State
16 | 
17 | 
18 | import Data.Int
19 | import Data.Word
20 | 
21 | 
22 | test1_local :: SPull EWord32 -> SPush Block EWord32
23 | test1_local arr =  push arr
24 | 
25 | test1 :: Word32 -> DPull EWord32 -> DPush Grid EWord32
26 | test1 n arr = pConcat (fmap test1_local (splitUp n arr))
27 | 
28 | 
29 | runTest1 =
30 |   withCUDA $
31 |   do
32 |     kern <- capture 128 (test1 1024) 
33 | 
34 |     (inputs :: V.Vector Word32) <- lift $ mkRandomVec (1024 * 1024)
35 | 
36 |     useVector inputs $ \i ->
37 |       allocaVector (1024 * 1024) $ \o ->
38 |       do
39 |         o <== (1024,kern) <> i
40 |         r <- peekCUDAVector o
41 |         lift $ putStrLn $ show r 
42 | 


--------------------------------------------------------------------------------
/Obsidian/Names.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE GADTs #-}
 2 | 
 3 | module Obsidian.Names (Names(..), mapNamesM_) where
 4 | 
 5 | import Obsidian.Globs
 6 | 
 7 | 
 8 | data Names a where
 9 |   Single :: Name -> Names a
10 |   Tuple  :: Names a -> Names b -> Names (a,b)
11 |   Triple :: Names a -> Names b -> Names c -> Names (a,b,c)
12 |   Quadruple :: Names a -> Names b -> Names c -> Names d -> Names (a,b,c,d)
13 | 
14 | ---------------------------------------------------------------------------
15 | -- helpers
16 | ---------------------------------------------------------------------------
17 | mapNamesM_ :: Monad m => (Name -> m ()) -> Names a -> m ()
18 | mapNamesM_ f (Single nom)  = f nom
19 | mapNamesM_ f (Tuple n1 n2) = mapNamesM_ f n1 >>
20 |                              mapNamesM_ f n2
21 | mapNamesM_ f (Triple n1 n2 n3) = mapNamesM_ f n1 >>
22 |                                  mapNamesM_ f n2 >>
23 |                                  mapNamesM_ f n3
24 | mapNamesM_ f (Quadruple n1 n2 n3 n4) = mapNamesM_ f n1 >>
25 |                                        mapNamesM_ f n2 >>
26 |                                        mapNamesM_ f n3 >>
27 |                                        mapNamesM_ f n4
28 | 


--------------------------------------------------------------------------------
/Obsidian.hs:
--------------------------------------------------------------------------------
 1 | module Obsidian (module Obsidian.Array,
 2 |                  module Obsidian.Program,
 3 |                  module Obsidian.Exp, 
 4 |                  module Obsidian.Force, 
 5 |                  module Obsidian.Library,
 6 |                  module Obsidian.Atomic, 
 7 |                  module Obsidian.SeqLoop, 
 8 |                  module Obsidian.Memory, 
 9 |                  module Obsidian.Mutable,
10 |                  module Obsidian.Data) where
11 | 
12 | -- These are internal. 
13 | --                 module Obsidian.Types, 
14 | --                 module Obsidian.CodeGen.Reify,
15 | --                 module Obsidian.CodeGen.CompileIM,
16 | --                 module Obsidian.CodeGen.CUDA,
17 | --                 module Obsidian.Names,
18 | 
19 | import Obsidian.Program
20 | import Obsidian.Exp
21 | import Obsidian.Array
22 | import Obsidian.Library
23 | import Obsidian.Force
24 | import Obsidian.Atomic
25 | import Obsidian.SeqLoop
26 | import Obsidian.Memory
27 | import Obsidian.Mutable
28 | import Obsidian.Data 
29 | 
30 | --import Obsidian.Types
31 | --import Obsidian.CodeGen.Reify
32 | --import Obsidian.CodeGen.CompileIM
33 | --import Obsidian.CodeGen.CUDA
34 | --import Obsidian.Names
35 | 


--------------------------------------------------------------------------------
/Obsidian/Atomic.hs:
--------------------------------------------------------------------------------
 1 | 
 2 | {-# LANGUAGE GADTs #-}
 3 | 
 4 | {- Joel Svensson,
 5 |    Josef Svenningsson
 6 |    2012 -} 
 7 | module Obsidian.Atomic where
 8 | 
 9 | import Obsidian.Exp
10 | import Data.Word
11 | import Data.Int
12 | 
13 | 
14 | -- Anyone can extend these with new instances.
15 | -- Not good. (I need to think about how to separate
16 | -- low level CUDA concerns, from programmer level concerns) 
17 | class Scalar a => AtomicInc a
18 | instance AtomicInc Word32
19 | 
20 | class Scalar a => AtomicAdd a
21 | instance AtomicAdd Word32
22 | instance AtomicAdd Int32
23 | instance AtomicAdd Word64
24 | 
25 | class Scalar a => AtomicSub a
26 | instance AtomicSub Word32
27 | instance AtomicSub Int32
28 | 
29 | class Scalar a => AtomicExch a
30 | instance AtomicExch Word32
31 | instance AtomicExch Word64
32 | instance AtomicExch Int32
33 | 
34 | 
35 | 
36 | ---------------------------------------------------------------------------
37 | -- Atomic operations 
38 | ---------------------------------------------------------------------------
39 | data Atomic a where
40 | 
41 |   -- Cuda only allows AtomicInc on the Int type
42 |   --  (todo: figure out if CUDA int is 32 or 64 bit) 
43 |   AtomicInc :: AtomicInc a => Atomic a
44 | 
45 |   AtomicAdd :: AtomicAdd a => Exp a -> Atomic a
46 | 
47 |   AtomicSub :: AtomicSub a => Exp a -> Atomic a 
48 | 
49 |   AtomicExch :: AtomicExch a => Exp a -> Atomic a
50 |   
51 | printAtomic AtomicInc = "atomicInc"
52 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | 
 2 | BSD3 Full Text:
 3 | --------------------------------------------------------------------------------
 4 | 
 5 | Copyright (c) 2011-2014, Joel Svensson 
 6 | All rights reserved.
 7 | 
 8 | Redistribution and use in source and binary forms, with or without
 9 | modification, are permitted provided that the following conditions are met:
10 |     * Redistributions of source code must retain the above copyright
11 |       notice, this list of conditions and the following disclaimer.
12 |     * Redistributions in binary form must reproduce the above copyright
13 |       notice, this list of conditions and the following disclaimer in the
14 |       documentation and/or other materials provided with the distribution.
15 |     * Neither the name of the <organization> nor the
16 |       names of its contributors may be used to endorse or promote products
17 |       derived from this software without specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
23 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 


--------------------------------------------------------------------------------
/Obsidian/Types.hs:
--------------------------------------------------------------------------------
 1 | {-|
 2 | Module      : Types
 3 | Description : Type information, used internally by Obsídian.
 4 | Copyright   : (c) Joel Svensson, 2014, 2015
 5 | License     : BSD
 6 | Maintainer  : bo.joel.svensson@gmail.com
 7 | Stability   : experimental
 8 | 
 9 | -}
10 | 
11 | module Obsidian.Types where
12 | 
13 | ---------------------------------------------------------------------------
14 | -- Types
15 | ---------------------------------------------------------------------------
16 | 
17 | data Type
18 | -- The allowed scalar types
19 |   = Bool
20 |   | Int  | Word        -- A bit problematic since the size of
21 |                        -- of these are platform dependent
22 | 
23 | -- Vector types supported by CUDA (Add more)
24 |   | FloatV2 | FloatV3 | FloatV4
25 |   | DoubleV2
26 | 
27 |   -- | Int8V2 | Int8V3 | Int8V4
28 |   -- | Int16V2 | Int16V3 | Int16V4
29 |   -- | Int32V2 | Int32V3 | Int32V4
30 |   -- | Int64V2
31 | 
32 |   -- | Word8V2 | Word8V3 | Word8V4
33 |   -- | Word16V2 | Word16V3 | Word16V4
34 |   -- | Word32V2 | Word32V3 | Word32V4
35 |   -- | Word64V2
36 | 
37 |   | Int8 | Int16 | Int32 | Int64
38 |   | Word8 | Word16 | Word32 | Word64
39 |   | Float | Double
40 | -- Vector Types
41 |   | Vec2 Type | Vec3 Type | Vec4 Type
42 | 
43 | 
44 | -- Used by CUDA, C And OpenCL generators
45 |   | Shared Type
46 |   | Volatile Type  -- For warp local computations.
47 |   | Pointer Type   -- Pointer to a @type@
48 |   | Global Type    -- OpenCL thing
49 |   | Local Type     -- OpenCL thing
50 |   deriving (Eq, Ord, Show)
51 | 
52 | typeSize :: Num a => Type -> a
53 | typeSize Int8 = 1
54 | typeSize Int16 = 2
55 | typeSize Int32 = 4
56 | typeSize Int64 = 8
57 | typeSize Word8 = 1
58 | typeSize Word16 = 2
59 | typeSize Word32 = 4
60 | typeSize Word64 = 8
61 | typeSize Bool = 4
62 | typeSize Float = 4
63 | typeSize Double = 8
64 | typeSize (Shared t) = typeSize t
65 | typeSize t = error $ "typeSize: this is bad!: " ++ show t
66 | 


--------------------------------------------------------------------------------
/Obsidian/CodeGen/OpenCLEmbedded.hs:
--------------------------------------------------------------------------------
 1 | 
 2 | {- Joel Svensson 2017
 3 | 
 4 |    Notes:
 5 |      2017-04-23: Adding OpenCL module 
 6 |                 
 7 | -} 
 8 | 
 9 | module Obsidian.CodeGen.OpenCLEmbedded (genKernel, genKernelParams, SharedMemConfig(..),ToProgram(..)) where 
10 | 
11 | 
12 | import Obsidian.CodeGen.Reify
13 | import Obsidian.CodeGen.CompileIMOpenCLEmbedded
14 | import Obsidian.CodeGen.Liveness
15 | import Obsidian.CodeGen.Memory2
16 | import Text.PrettyPrint.Mainland
17 | import Text.PrettyPrint.Mainland.Class
18 | 
19 | import qualified Data.Map as M
20 | import Data.Word
21 | ---------------------------------------------------------------------------
22 | -- Generate OpenCL kernels
23 | ---------------------------------------------------------------------------
24 | 
25 | -- | Generates kernel code as a String
26 | --   while assuming there is 48KB of shared mem in 32 banks
27 | genKernel :: ToProgram prg
28 |              => Word32
29 |              -> String
30 |              -> prg
31 |              -> String
32 | genKernel = genKernelParams sm_conf
33 |   where 
34 |     -- pretend we have 32 banks and 48kb shared mem
35 |     sm_conf :: SharedMemConfig 
36 |     sm_conf = SharedMemConfig 49152 32 True
37 | 
38 | 
39 | 
40 | -- | Generates kernel C code as a String,
41 | --   Programmer passes in the Shared memory configuration
42 | genKernelParams :: ToProgram prg
43 |                    => SharedMemConfig
44 |                    -> Word32
45 |                    -> String
46 |                    -> prg
47 |                    -> String
48 | genKernelParams  sm_conf nt kn prg = prgStr
49 |   where
50 |     prgStr = pretty 75
51 |              $ ppr
52 |              $ compileDeclsTop
53 |                 (Config nt bytesShared)
54 |                 name_loc
55 |                 kn (a,im) 
56 |     (a,im) = toProgram_ 0 prg
57 |     iml = computeLiveness im
58 |     (m,mm) = memMapIM sm_conf iml (M.empty)
59 |     bytesShared = size m
60 |     name_loc = M.assocs mm
61 | 


--------------------------------------------------------------------------------
/Obsidian/CodeGen/CUDA.hs:
--------------------------------------------------------------------------------
 1 | 
 2 | {- Joel Svensson 2013..2017
 3 | 
 4 |    Notes:
 5 |     2017-04-23: compileDeclsTop nolonger takes "platform" arg.
 6 |     2014-11-25: Making changes regarding memory allocation.
 7 | 
 8 | -}
 9 | 
10 | module Obsidian.CodeGen.CUDA (genKernel, genKernelParams, SharedMemConfig(..)) where 
11 | 
12 | 
13 | import Obsidian.CodeGen.Reify
14 | import Obsidian.CodeGen.CompileIM
15 | import Obsidian.CodeGen.Liveness
16 | import Obsidian.CodeGen.Memory2
17 | import Text.PrettyPrint.Mainland
18 | import Text.PrettyPrint.Mainland.Class
19 | 
20 | import qualified Data.Map as M
21 | import Data.Word
22 | ---------------------------------------------------------------------------
23 | -- Generate CUDA kernels
24 | ---------------------------------------------------------------------------
25 | 
26 | -- | Generates kernel C code as a String
27 | --   while assuming there is 48KB of shared mem in 32 banks
28 | genKernel :: ToProgram prg
29 |              => Word32
30 |              -> String
31 |              -> prg
32 |              -> String
33 | genKernel = genKernelParams sm_conf
34 |   where
35 |     -- pretend we have 32 banks and 48kb shared mem
36 |     sm_conf :: SharedMemConfig
37 |     sm_conf = SharedMemConfig 49152 32 True
38 | 
39 | 
40 | 
41 | -- | Generates kernel C code as a String,
42 | --   Programmer passes in the Shared memory configuration
43 | genKernelParams :: ToProgram prg
44 |                    => SharedMemConfig
45 |                    -> Word32
46 |                    -> String
47 |                    -> prg
48 |                    -> String
49 | genKernelParams  sm_conf nt kn prg = prgStr
50 |   where
51 |     prgStr = pretty 75
52 |              $ ppr
53 |              $ compileDeclsTop
54 |                 (Config nt bytesShared)
55 |                 name_loc
56 |                 kn (a,im)
57 |     (a,im) = toProgram_ 0 prg
58 |     iml = computeLiveness im
59 |     (m,mm) = memMapIM sm_conf iml (M.empty)
60 |     bytesShared = size m
61 |     name_loc = M.assocs mm
62 | 


--------------------------------------------------------------------------------
/.jenkins_runBench.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo "Begin running jenkins benchmark script for Obsidian..."
 4 | set -x
 5 | 
 6 | # CONVENTION: The working directory is passed as the first argument.
 7 | CHECKOUT=$1
 8 | shift
 9 | 
10 | if [ "$CHECKOUT" == "" ]; then
11 |  echo "Replacing CHECKOUT with pwd" 
12 |  CHECKOUT=`pwd`
13 | fi
14 | 
15 | if [ "$JENKINS_GHC" == "" ]; then
16 |   export JENKINS_GHC=7.6.3
17 | fi
18 | if [ -f "$HOME/continuous_testing_setup/rn_jenkins_scripts/acquire_ghc.sh" ]; then
19 |   source $HOME/continuous_testing_setup/rn_jenkins_scripts/acquire_ghc.sh
20 | fi
21 | if [ -f "$HOME/continuous_testing_setup/rn_jenkins_scripts/acquire_cuda.sh" ]; then
22 |   source $HOME/continuous_testing_setup/rn_jenkins_scripts/acquire_cuda.sh
23 | fi
24 | 
25 | echo "Running benchmarks remotely on server `hostname`"
26 | 
27 | which cabal
28 | cabal --version
29 | 
30 | which c2hs     || echo ok
31 | c2hs --version || echo ok
32 | 
33 | unset GHC
34 | unset GHC_PKG
35 | unset CABAL
36 | 
37 | set -e
38 | 
39 | #sandboxes are created by the Makefile for run_benchmarks.hs
40 | #this makefile also install Obsidian+hsbencher as a single cabal install line
41 | DIR=`pwd`  
42 | echo $DIR 
43 | if [ ! -d "HSBencher" ]; then 
44 |     git clone git@github.com:rrnewton/HSBencher
45 | fi 
46 | 
47 | (cd HSBencher; git submodule init; git submodule update) 
48 | 
49 | # Sandbox for benchmarks ?
50 | # -- Is created in the Makefile
51 | # ----------------------------------------
52 | # cd "$CHECKOUT"/Examples/ReductionTutorial
53 | # cabal sandbox init --sandbox=../.cabal-sandbox
54 | 
55 | 
56 | # Actually install Obsidian into the sandbox 
57 | # cabal install ./ 
58 | 
59 | 
60 | # Switch to where the benchmarks are
61 | # ----------------------------------------
62 | cd "$CHECKOUT"/Benchmarks
63 | rm -f run_benchmarks.exe
64 | make run_benchmarks.exe
65 | 
66 | export TRIALS=1
67 | 
68 | # Parfunc account, registered app in api console:
69 | CID=905767673358.apps.googleusercontent.com
70 | SEC=2a2H57dBggubW1_rqglC7jtK
71 | 
72 | # Obsidian doc ID:  
73 | TABID=1TsG043VYLu9YuU58EaIBdQiqLDUYcAXxBww44EG3
74 | # https://www.google.com/fusiontables/DataSource?docid=1TsG043VYLu9YuU58EaIBdQiqLDUYcAXxBww44EG3
75 | 
76 | # Enable upload of benchmarking data to a Google Fusion Table:
77 | # ./run_benchmarks.exe --keepgoing --trials=$TRIALS --fusion-upload=$TABID --clientid=$CID --clientsecret=$SEC $*
78 | echo "Running Benchmarks"
79 | ./run_benchmarks.exe --keepgoing --trials=$TRIALS --fusion-upload --name=Obsidian_bench_data --clientid=$CID --clientsecret=$SEC $*
80 | 


--------------------------------------------------------------------------------
/Obsidian.cabal:
--------------------------------------------------------------------------------
 1 | Name:           Obsidian
 2 | Version:        0.4.0.0
 3 | 
 4 | 
 5 | License:                BSD3
 6 | License-file:           LICENSE
 7 | Stability:              Beta
 8 | Maintainer:		Joel Svensson<svenssonjoel@yahoo.se>
 9 | Author:			Joel Svensson<svenssonjoel@yahoo.se>
10 | 
11 | Copyright:              Copyright (c) 2011-2017 Joel Svensson 		  			
12 |   
13 | Synopsis:               Embedded language for GPU Programming 
14 | HomePage:               https://github.com/svenssonjoel/Obsidian
15 | Description: 
16 |    Obsidian is an embedded language for general purpose programming targeting 
17 |    GPU's. 
18 | 
19 | Category: Language
20 | Cabal-Version: >=1.8
21 | Tested-With: GHC == 8.0.2
22 | 
23 | build-type: Simple
24 | 
25 | source-repository head
26 |   type:     git
27 |   location: git://github.com/svenssonjoel/Obsidian.git
28 | 
29 | ----------------------------------------------------------------------------------------------------
30 | Library
31 |   -- Stable packages, no upper bounds
32 |   build-depends: base >= 4 && < 5
33 |                , vector           >= 0.10.9.1 
34 |                , mtl              >= 2.0 
35 |                , containers       >= 0.4.2.1
36 |                , text             >= 0.11.3.1
37 |                , process          >= 1.1.0.2
38 |                , rdtsc            == 1.3.0.0  
39 |   -- Less-stable packages, upper bounds on next major version:
40 |   build-depends: value-supply     >= 0.6      
41 |                , language-c-quote >= 0.10.1.3 
42 |                , mainland-pretty  >= 0.2.6    
43 |                , mwc-random       >= 0.13.1.1 
44 |              
45 |   exposed-modules: Obsidian
46 |                  , Obsidian.CodeGen.Reify
47 |                  , Obsidian.CodeGen.CUDA
48 |                  , Obsidian.CodeGen.OpenCLEmbedded
49 |             
50 |   other-modules:  Obsidian.Array
51 |                ,  Obsidian.Atomic
52 |                ,  Obsidian.DimSpec
53 |                ,  Obsidian.Exp
54 |                ,  Obsidian.Force
55 |                ,  Obsidian.Globs
56 |                ,  Obsidian.Library
57 |                ,  Obsidian.Memory
58 |                ,  Obsidian.Mutable
59 |                ,  Obsidian.Names
60 |                ,  Obsidian.Program 
61 |                ,  Obsidian.SeqLoop 
62 |                ,  Obsidian.Types 
63 |                ,  Obsidian.Data
64 |                ,  Obsidian.CodeGen.CompileIM
65 |                ,  Obsidian.CodeGen.CompileIMOpenCLEmbedded              
66 |                ,  Obsidian.CodeGen.Liveness
67 |                ,  Obsidian.CodeGen.Memory2
68 |                ,  Obsidian.CodeGen.Program
69 |               
70 | 
71 |         
72 |   GHC-Options: 
73 | -- -O2 
74 | 
75 | --  include-dirs:
76 | 
77 | 
78 | --  extra-lib-dirs: 
79 | 
80 | --  Includes: 
81 | --  Extra-libraries: 
82 |   
83 | 
84 | 


--------------------------------------------------------------------------------
/Obsidian/Force.hs:
--------------------------------------------------------------------------------
  1 | 
  2 | {-# LANGUAGE ScopedTypeVariables #-}
  3 | {-# LANGUAGE FlexibleInstances #-}
  4 | {-# LANGUAGE FlexibleContexts #-}
  5 | {-# LANGUAGE UndecidableInstances #-}
  6 | {-# LANGUAGE MultiParamTypeClasses #-}
  7 | {-# LANGUAGE TypeOperators #-}
  8 | {-# LANGUAGE GADTs #-}
  9 | ----------------------------------------
 10 | {- LANGUAGE KindSignatures -}
 11 | {-# LANGUAGE TypeFamilies #-}
 12 | 
 13 | {-# LANGUAGE DataKinds #-}
 14 | {-# LANGUAGE ConstraintKinds #-}
 15 | 
 16 | 
 17 | {- Joel Svensson 2012..2017
 18 | 
 19 |    Notes:
 20 |    2014-03-28: Changed API.
 21 |                Not using Obsidian.Mutable currently, it needs more work.
 22 |    2013-06-24: Changed code. uses Obsidian.Mutable now
 23 |    2013-05-02: Removing things to do with forceG
 24 |                Removed the extensions (no longer needed)
 25 |    2013-04-27: Something is broken.
 26 |    2013-04-10: Looking at force and threads
 27 |    2013-01-27: globArrays nolonger exist
 28 |    2013-01-02: Added simple forceG for globArrays
 29 |    2012-12-10: Edited
 30 | 
 31 | -}
 32 | 
 33 | module Obsidian.Force ( unsafeWritePush
 34 |                       , unsafeWritePull
 35 |                       , compute_
 36 |                       , computePull_
 37 |                       , ComputeAs(..)
 38 |                       , Compute) where
 39 | 
 40 | 
 41 | import Obsidian.Program
 42 | import Obsidian.Exp
 43 | import Obsidian.Array
 44 | import Obsidian.Memory
 45 | 
 46 | import Obsidian.Names
 47 | import Obsidian.Data
 48 | 
 49 | import Data.Word
 50 | 
 51 | 
 52 | ---------------------------------------------------------------------------
 53 | --
 54 | -------------------------------------------------------------------------
 55 | 
 56 | -- | Compute constraint.
 57 | type Compute t = (Write t, t *<=* Block)
 58 | 
 59 | -- | Arrays can be computed at level t if level t allows compute.
 60 | class Compute t => ComputeAs t a where
 61 |   compute :: Data e => a Word32 e -> Program t (Pull Word32 e)
 62 | 
 63 | instance Compute t => ComputeAs t Pull where
 64 |   compute = computePull_
 65 | 
 66 | {-
 67 |    The key to this instance is that the typechecker
 68 |    matches only against the head, ignoring the constraint.
 69 |    meaning that all variations of t, t1 is caught by this
 70 |    instance. Though, those where t and t1 are not equal
 71 |    a type error is the result (rather than a missing instance).
 72 | 
 73 |    This means that the constraint "Compute Block (Push Thread)"
 74 |    matches this instance, but is a type error.
 75 | -}
 76 | instance (t ~ t1, Compute t) => ComputeAs t (Push t1) where
 77 |   compute =  compute_
 78 | 
 79 | compute_ :: (Data a, Compute t)
 80 |           => Push t Word32 a -> Program t (Pull Word32 a)
 81 | compute_ arr = do
 82 |   rval <- unsafeWritePush False arr
 83 |   sync
 84 |   return rval
 85 | 
 86 | computePull_ :: (t *<=* Block, Data a, Compute t)
 87 |              => Pull Word32 a -> Program t (Pull Word32 a)
 88 | computePull_ arr =
 89 |   if (len arr <= 32)
 90 |   then do
 91 |     rval <- unsafeWritePush True parr
 92 |     return rval
 93 |   else do
 94 |     rval <- unsafeWritePush False parr
 95 |     sync
 96 |     return rval
 97 |   where parr = push arr
 98 | 
 99 | 
100 | ---------------------------------------------------------------------------
101 | -- Force local (requires static lengths!)
102 | ---------------------------------------------------------------------------
103 | 
104 | class Write t where
105 |   unsafeWritePush :: Storable a => Bool -> Push t Word32 a -> Program t (Pull Word32 a)
106 | 
107 | -- What to do about volatile here?
108 | -- Ignoring that parameter for now.
109 | -- Thought: It does not matter.
110 | -- Thought: Is this function correct at all?
111 | --   What happens if a thread program allocates memory
112 | -- DONE: The above problem has been fixed!
113 | instance  Write Thread where
114 |   unsafeWritePush _ p =
115 |     do
116 |       (snames :: Names a)  <- names "arr"
117 | 
118 |       -- Here I know that this pattern match will succeed
119 |       let n = len p
120 | 
121 |       allocateArray snames  n
122 |       p <: threadAssignArray snames (variable "tid") n
123 | 
124 |       return $ threadPullFrom snames (variable "tid") n
125 | 
126 | instance  Write Warp where
127 |   unsafeWritePush _ p  =
128 |     do
129 |       let n = len p
130 |       noms <- names "arr"
131 |       allocateVolatileArray noms n
132 | 
133 |       p <: warpAssignArray noms (variable "warpID") n
134 |       return $ warpPullFrom noms (variable "warpID") n
135 | 
136 | instance Write Block where
137 |   unsafeWritePush volatile p =
138 |     do
139 |       let  n = len p
140 |       noms <- names "arr"
141 |       if (volatile)
142 |         then allocateVolatileArray noms n
143 |         else allocateArray noms n
144 | 
145 |       p <: assignArray noms
146 |       return $ pullFrom noms n
147 | 
148 | 
149 | 
150 | ---------------------------------------------------------------------------
151 | -- unsafe!
152 | ---------------------------------------------------------------------------
153 | unsafeWritePull :: (t *<=* Block, Write t, Storable a) => Bool -> Pull Word32 a -> Program t (Pull Word32 a)
154 | unsafeWritePull t = unsafeWritePush t . push
155 | 
156 | 


--------------------------------------------------------------------------------
/Obsidian/CodeGen/Liveness.hs:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | {- Joel Svensson 2012, 2013
  4 | 
  5 |    notes:
  6 |      added case for SeqFor  Jan-21-2013
  7 | 
  8 | -} 
  9 | module Obsidian.CodeGen.Liveness where
 10 | 
 11 | import qualified Data.Set as Set
 12 | 
 13 | import Obsidian.Exp
 14 | import Obsidian.Globs
 15 | import Obsidian.CodeGen.Program
 16 | 
 17 | import Control.Monad.State
 18 | 
 19 | 
 20 | ---------------------------------------------------------------------------
 21 | --
 22 | ---------------------------------------------------------------------------
 23 | type Liveness = Set.Set Name
 24 | 
 25 | --------------------------------------------------------------------------- 
 26 | --
 27 | --------------------------------------------------------------------------- 
 28 | type IML = [(Statement Liveness,Liveness)] 
 29 | 
 30 | 
 31 | {- Plan:
 32 |    # Step through program from end to start
 33 |    # as soon as a new name is encountered, add it to the living set
 34 |    # when an "Allocate" is found, delete the name it allocated from the living set.
 35 | 
 36 |    Requirements:
 37 |    # All names are unique! 
 38 | 
 39 |    TODO: Think more carefully about the ForAllBlocks case
 40 |    TODO: Can ixs contain array names ?
 41 |            (Most likely yes! think about the counting sort example)
 42 | 
 43 |    BUG: Arrays from the "outside" needs special
 44 |         treatment within a loop body. 
 45 | 
 46 | -} 
 47 |            
 48 | 
 49 | -- Nice type 
 50 | computeLiveness :: IMList a -> IML
 51 | computeLiveness im = reverse $ evalState (cl (reverse im)) Set.empty
 52 | 
 53 | -- Nice Type 
 54 | computeLiveness1 :: Liveness -> IMList a -> IML
 55 | computeLiveness1 l im = reverse $ evalState (cl (reverse im)) l
 56 | 
 57 | -- cl :: IM -> State Liveness IML 
 58 | cl im = mapM process im
 59 |   where
 60 |     safeHead [] = Set.empty
 61 |     safeHead (x:xs) = snd x
 62 | 
 63 |     -- Horrific type 
 64 |     process :: (Statement a,a) -> State Liveness (Statement Liveness,Liveness)
 65 |     process (SAssign nom ixs e,_) =
 66 |       do
 67 |         s <- get
 68 |         let arrays = collectArraysI "arr" e
 69 |             arrays1 = collectArraysI "arr" nom
 70 |             living = Set.fromList (arrays1++arrays) `Set.union` s
 71 |         
 72 |         put living  -- update state   
 73 |         return (SAssign nom ixs e,living)
 74 |     process (SAtomicOp nom ix atop,_) =
 75 |       do
 76 |         s <- get
 77 |         let arrays =
 78 |               case atop of
 79 |                 AtInc -> []
 80 |                 AtAdd e -> collectArraysI "arr" e
 81 |                 AtSub e -> collectArraysI "arr" e
 82 |                 AtExch e -> collectArraysI "arr" e
 83 |             arrays1 = collectArraysI "arr" nom
 84 |             living = Set.fromList (arrays1++arrays) `Set.union` s
 85 |         put living
 86 |         return (SAtomicOp nom ix atop, living) 
 87 |     
 88 | --    process (SAtomicOp n1 n2 ixs op,_) =
 89 | --      do
 90 | --        s <- get
 91 | --        return (SAtomicOp n1 n2 ixs op,s)
 92 |         
 93 |     process (SAllocate name size t,_) =
 94 |       do
 95 |         modify (name `Set.delete`)
 96 |         s <- get        
 97 |         return (SAllocate name size t,s)  
 98 |     
 99 |     process (SDeclare name t,_) = 
100 |       do 
101 |         s <- get 
102 |         return (SDeclare name t,s)
103 | 
104 |     process (SSynchronize,_) = 
105 |       do 
106 |         s <- get
107 |         return (SSynchronize,s) 
108 | 
109 |     process (SCond bexp im,_) = 
110 |       do
111 |         -- TODO: What should really happen here ?
112 |         s <- get 
113 |         let iml = computeLiveness1 s im 
114 |             l   = safeHead iml 
115 |             ns  =  s `Set.union` l
116 |         put ns 
117 |         -- Is this correct ?  Same question, all below
118 |         return (SCond bexp iml,ns)
119 | 
120 | 
121 | -- This needs to change. 
122 | -- arrays from the "outside" that  
123 | -- are used within the loop needs special treatment. 
124 |     process (SSeqFor nom n im,_) = 
125 |       do 
126 |         -- get names alive after loop
127 |         s <- get
128 |         let iml = computeLiveness1 s im 
129 |             -- l is liveness info "leaving" im
130 |             l   = safeHead iml
131 |             -- alive at these points are those things in l 
132 |             -- plus the things before (s) 
133 |             ns  = s `Set.union` l
134 |         put ns
135 |         return (SSeqFor nom n iml,ns) 
136 |     process (SSeqWhile b im,_) = 
137 |       do 
138 |         s <- get 
139 |         let iml = computeLiveness1 s im 
140 |             l   = safeHead iml 
141 |             ns  = s `Set.union` l
142 |         put ns 
143 |         return (SSeqWhile b iml,ns)
144 |     process (SBreak,_) = 
145 |       do 
146 |         s <- get 
147 |         return (SBreak,s)
148 | 
149 |     process (SForAll lvl n im,_) =  
150 |       do 
151 |         s <- get 
152 |         let iml = computeLiveness1 s im 
153 |             l   = safeHead iml 
154 |             ns  = s `Set.union` l
155 |         put ns
156 |         return (SForAll lvl n iml,ns) 
157 |     process (SDistrPar lvl n im,_) = 
158 |       do 
159 |         s <- get 
160 |         let iml = computeLiveness1 s im 
161 |             l   = safeHead iml 
162 |             ns  = s `Set.union` l 
163 |         put ns 
164 |         return (SDistrPar lvl n iml,ns) 
165 |     -- process (SForAllBlocks n im,_) = 
166 |     --   do 
167 |     --     s <- get 
168 |     --     let iml = computeLiveness1 s im 
169 |     --         l   = safeHead iml 
170 |     --         ns  = s `Set.union` l
171 |     --     put ns
172 |     --     return (SForAllBlocks n iml,ns)
173 |     -- process (SNWarps n im,_) = 
174 |     --   do 
175 |     --     s <- get
176 |     --     let iml = computeLiveness1 s im 
177 |     --         l   = safeHead iml 
178 |     --         ns  = s `Set.union` l
179 |     --     put ns
180 |     --     return (SNWarps n iml,ns)
181 |     -- process (SWarpForAll n im,_) =  
182 |     --   do 
183 |     --     s <- get 
184 |     --     let iml = computeLiveness1 s im 
185 |     --         l   = safeHead iml 
186 |     --         ns  = s `Set.union` l
187 |     --     put ns
188 |     --     return (SWarpForAll n iml,ns) 
189 | 
190 | 
191 |     -- process (SForAllThreads n im,_) = 
192 |     --   do 
193 |     --     s <- get 
194 |     --     let iml = computeLiveness1 s im 
195 |     --         l   = safeHead iml 
196 |     --         ns  = s `Set.union` l
197 |     --     put ns 
198 |     --     return (SForAllThreads n iml,ns)
199 | 
200 | 


--------------------------------------------------------------------------------
/Obsidian/SeqLoop.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE ScopedTypeVariables #-}
  2 | {-# LANGUAGE TypeOperators #-}
  3 | {-# LANGUAGE FlexibleContexts #-}
  4 | {-# LANGUAGE ConstraintKinds #-}
  5 | {-# LANGUAGE GADTs #-}
  6 | 
  7 | {- Joel Svensson 2013..2017
  8 | 
  9 |    sequential loops with state
 10 | 
 11 | -}
 12 | 
 13 | module Obsidian.SeqLoop where
 14 | 
 15 | 
 16 | import Obsidian.Program
 17 | import Obsidian.Exp
 18 | import Obsidian.Array
 19 | import Obsidian.Memory
 20 | import Obsidian.Names
 21 | import Obsidian.Data
 22 | import Obsidian.Force
 23 | import qualified Obsidian.Library as Lib
 24 | 
 25 | -- TODO: Rename module to something better
 26 | --       Or make part of Library.hs
 27 | ---------------------------------------------------------------------------
 28 | -- seqReduce
 29 | ---------------------------------------------------------------------------
 30 | -- | Sequential reduction of Pull array. Results in a for loop in generated code. 
 31 | seqReduce :: Storable a
 32 |            => (a -> a -> a)
 33 |            -> SPull a
 34 |            -> Program Thread a
 35 | seqReduce op arr =
 36 |   do
 37 |     (ns :: Names a)  <- names "v"
 38 |     allocateScalar ns
 39 | 
 40 |     assignScalar ns init
 41 | 
 42 |     SeqFor (n-1) $ \ ix ->
 43 |       do
 44 |         assignScalar ns (readFrom ns `op`  (arr ! (ix + 1)))
 45 | 
 46 |     return $ readFrom ns
 47 |   where
 48 |     n = sizeConv$ len arr
 49 |     init = arr ! 0
 50 | 
 51 | 
 52 | ---------------------------------------------------------------------------
 53 | -- Iterate
 54 | ---------------------------------------------------------------------------
 55 | -- | iterate a function. Results in a for loop in generated code.
 56 | seqIterate :: Storable a
 57 |               => EWord32
 58 |               -> (EWord32 -> a -> a)
 59 |               -> a
 60 |               -> Program Thread a
 61 | seqIterate n f init =
 62 |   do
 63 |     (ns :: Names a)  <- names "v"
 64 |     allocateScalar ns
 65 | 
 66 |     assignScalar ns init
 67 |     SeqFor n $ \ix ->
 68 |       do
 69 |         assignScalar ns $ f ix (readFrom ns)
 70 | 
 71 |     return $ readFrom ns
 72 | 
 73 | ---------------------------------------------------------------------------
 74 | --
 75 | ---------------------------------------------------------------------------
 76 | -- | iterate a function until a condition holds. Results in a while loop
 77 | -- with a break in the generated code.
 78 | seqUntil :: Storable a
 79 |             => (a -> a)
 80 |             -> (a -> EBool)
 81 |             -> a
 82 |             -> Program Thread a
 83 | seqUntil f p init =
 84 |   do
 85 |     (ns :: Names a) <- names "v"
 86 |     allocateScalar ns
 87 | 
 88 |     assignScalar ns init
 89 |     SeqWhile (p (readFrom ns)) $
 90 |       do
 91 |         (tmp :: Names a) <- names "t"
 92 |         allocateScalar tmp
 93 |         assignScalar tmp (readFrom ns)
 94 |         assignScalar ns $ f (readFrom tmp)
 95 |     return $ readFrom ns
 96 | 
 97 | ---------------------------------------------------------------------------
 98 | -- Sequential scan
 99 | ---------------------------------------------------------------------------
100 | -- | Sequential scan over the elements in a pull array. Results in a for loop
101 | -- in the generated code.
102 | seqScan :: Storable a
103 |            => (a -> a -> a)
104 |            -> SPull a
105 |            -> SPush Thread a
106 | seqScan op arr {-(Pull n ixf)-}  =
107 |   mkPush n $ \wf ->  do
108 |     (ns :: Names a) <- names "v" -- (ixf 0)
109 |     allocateScalar ns -- (ixf 0)
110 |     assignScalar ns (arr ! 0)
111 |     wf (readFrom ns) 0
112 |     SeqFor (sizeConv (n-1)) $ \ix -> do
113 |       assignScalar ns  $ readFrom ns `op` (arr ! (ix + 1))
114 |       wf (readFrom ns) (ix+1)
115 |     where
116 |       n = len arr
117 | 
118 | -- | Sequential scan that takes a carry-in.
119 | seqScanCin :: (Storable a, Storable b)
120 |            => (b -> a -> b)
121 |            -> b -- cin
122 |            -> SPull a
123 |            -> SPush Thread b
124 | seqScanCin op a arr =
125 |   mkPush n $ \wf ->  do
126 |     (ns :: Names a) <- names "v"
127 |     allocateScalar ns
128 |     assignScalar ns a
129 |     -- wf (readFrom ns) 0
130 |     SeqFor (sizeConv  n) $ \ix -> do
131 |       assignScalar ns  $ readFrom ns `op` (arr ! ix)
132 |       wf (readFrom ns) ix
133 |   where
134 |     n = len arr
135 | 
136 | -- | Sequential scan with separate types for input, output and accumulator.
137 | mapAccumL :: (ASize s, Storable a, Storable b, Storable acc)
138 |            => (acc -> a -> (acc,b))
139 |            -> acc -- cin
140 |            -> Pull s a
141 |            -> Push Thread s b
142 | mapAccumL op acc arr {-(Pull n ixf)-} =
143 |   mkPush n $ \wf ->  do
144 |     (ns :: Names a) <- names "v" -- (ixf 0)
145 |     allocateScalar ns -- (ixf 0)
146 |     assignScalar ns acc -- (ixf 0)
147 |     -- wf (readFrom ns) 0
148 |     SeqFor (sizeConv  n) $ \ix -> do
149 |       let (newAcc, b) = op (readFrom ns) (arr ! ix)
150 |       -- order of writing matters, because readFrom is evaluated twice
151 |       wf b ix
152 |       assignScalar ns newAcc
153 |   where
154 |     n = len arr
155 | 
156 | mapAccumR :: (ASize s, Storable a, Storable b, Storable acc)
157 |            => (acc -> a -> (acc,b))
158 |            -> acc -- cin
159 |            -> Pull s a
160 |            -> Push Thread s b
161 | mapAccumR op acc =
162 |    Lib.reverse . mapAccumL op acc . Lib.reverse
163 | 
164 | 
165 | ---------------------------------------------------------------------------
166 | -- sMapAccum
167 | -- Generalisation of the old sConcat functionality.
168 | 
169 | sMapAccum :: (Compute t, Data acc, ASize l)
170 |              => (acc -> Pull l a -> Program t (acc,Push t l b))
171 |              -> acc
172 |              -> Pull l (Pull l a)
173 |              -> Push t l b
174 | sMapAccum f acc arr =
175 | 
176 |   mkPush (n * fromIntegral rn) $ \wf ->
177 |   do
178 |     (noms :: Names acc) <- names "v"
179 |     --(noms2 :: Names acc) <- names "APA"
180 | 
181 |     allocateSharedScalar noms
182 |    -- allocateScalar noms2
183 |     -- a single thread in the group, performs an assignment
184 |     -- May need synchronization!
185 |     singleThread $ assignScalar noms acc
186 |     sync
187 |     seqFor (sizeConv n) $ \bix -> do
188 |       --singleThread $ assignScalar noms2 acc
189 |       (newAcc, b) <- f (readFrom noms) (arr ! bix)
190 |       singleThread $ assignScalar noms newAcc
191 |       sync
192 |       let wf' a ix = wf a (bix * sizeConv rn + ix)
193 |       b <: wf'
194 | 
195 |   where
196 |     n  = len arr
197 |     rn = len $ arr ! 0
198 | 


--------------------------------------------------------------------------------
/Obsidian/CodeGen/Reify.hs:
--------------------------------------------------------------------------------
  1 | 
  2 | {-# LANGUAGE FlexibleInstances,
  3 |              OverlappingInstances,
  4 |              UndecidableInstances,
  5 |              FlexibleContexts,
  6 |              MultiParamTypeClasses,
  7 |              TypeOperators,
  8 |              TypeFamilies ,
  9 |              ScopedTypeVariables
 10 |              #-}
 11 | 
 12 | {- Joel Svensson 2012, 2013
 13 |    Niklas Ulvinge 2013
 14 | 
 15 |  
 16 | -} 
 17 | 
 18 | 
 19 | module Obsidian.CodeGen.Reify (ToProgram(..)) where 
 20 | 
 21 | import Obsidian.Exp 
 22 | import Obsidian.Array
 23 | import Obsidian.Mutable 
 24 | 
 25 | import Obsidian.Types
 26 | import Obsidian.Program
 27 | import Obsidian.Library
 28 | import Obsidian.Globs
 29 | import Obsidian.Names
 30 | 
 31 | import qualified Obsidian.CodeGen.Program as CG
 32 | import Obsidian.CodeGen.CompileIM
 33 | 
 34 | import Data.Word
 35 |       
 36 | ---------------------------------------------------------------------------
 37 | -- New approach (hopefully)
 38 | ---------------------------------------------------------------------------
 39 | -- "reify" Haskell functions into CG.Programs
 40 | 
 41 | 
 42 | ---------------------------------------------------------------------------
 43 | --
 44 | --------------------------------------------------------------------------- 
 45 | class ToProgram a where
 46 |   toProgram :: Int -> a -> InputList a -> (Parameters,CG.IM)
 47 |   toProgram_ :: Int -> a -> (Parameters, CG.IM)
 48 | 
 49 | 
 50 | typeOf_ a = typeOf (Literal a)
 51 | 
 52 | 
 53 | ---------------------------------------------------------------------------
 54 | -- Base cases
 55 | ---------------------------------------------------------------------------
 56 | 
 57 | -- This instance is incorrect
 58 | instance ToProgram (GProgram ()) where
 59 |   -- toProgram i prg () = toProgram $ pJoin prg
 60 |   toProgram i prg () = ([],CG.compileStep1 prg) 
 61 |   -- Needs to deal with GProgram () and GProgram (Push a), GProgram (Pull a)
 62 |   -- in different ways.
 63 | 
 64 |   toProgram_ i prg = ([],CG.compileStep1 prg) 
 65 |   
 66 | -- This instance might fix the problem with empty kernels being generated
 67 | instance (ToProgram (Push Grid l a)) => ToProgram (GProgram (Push Grid l a)) where
 68 |   toProgram i p a = toProgram i (runPush p) a
 69 | 
 70 |   toProgram_ i p = toProgram_ i (runPush p) 
 71 | 
 72 | -- No ToProgram (GProgram (Pull a)) instance is needed. These programs
 73 | -- cannot currently be created using the API. The reason is that GProgram (Pull a)
 74 | -- implies a capability that GPUs do not have. The pulling from an array computed globally.
 75 | -- That kind of computation can not be synced and its result would be undefined. 
 76 | 
 77 | 
 78 | instance Scalar a => ToProgram (Push Grid l (Exp a)) where
 79 |   toProgram i p a =
 80 |     let outT = Pointer $ typeOf_ (undefined :: a)
 81 |         outN = "output" ++ show i
 82 |         
 83 |         prg = p <: assignOut outN
 84 |         
 85 |         (inputs,im) = toProgram (i+1) prg a
 86 |         
 87 |     in (inputs++[(outN,outT)],im) 
 88 |      
 89 |     where
 90 |       assignOut out a ix = Assign out [ix] a
 91 |   toProgram_ i p = toProgram i p () 
 92 | 
 93 | instance (Scalar a, Scalar b) => ToProgram (Push Grid l (Exp a,Exp b)) where
 94 |   toProgram i p a =
 95 |     let   outT1 = Pointer $ typeOf_ (undefined :: a)
 96 |           outT2 = Pointer $ typeOf_ (undefined :: b)
 97 |           outN1 = "output" ++ show i
 98 |           outN2 = "output" ++ show (i+1)
 99 |           
100 | 
101 |           prg = p <: assignOut (outN1,outN2) 
102 |             
103 |           (inputs,im) = toProgram (i+2) prg a
104 |           
105 |     in (inputs++[(outN1,outT1),(outN2,outT2)],im)
106 |     where
107 |       assignOut (o1,o2) (a,b) ix =
108 |         do
109 |           Assign o1 [ix] a
110 |           Assign o2 [ix] b
111 |   toProgram_ i p = toProgram i p () 
112 | 
113 | ---------------------------------------------------------------------------
114 | -- Recursive
115 | ---------------------------------------------------------------------------
116 |           
117 | instance (ToProgram b, Scalar t) => ToProgram (Pull EWord32 (Exp t) -> b) where
118 |   toProgram i f (a :- rest) = ((nom,Pointer t):(n,Word32):ins,prg)
119 |     where
120 |       (ins,prg) = toProgram (i+1) (f input) rest
121 |       nom  = "input" ++ show i
122 |       n    = "n" ++ show i
123 |       lengthVar = variable n
124 |       input = namedGlobal nom lengthVar
125 |       t     = typeOf_ (undefined :: t)
126 |   toProgram_ i f = ((nom,Pointer t):(n,Word32):ins,prg)
127 |     where
128 |       (ins,prg) = toProgram_ (i+1) (f input)
129 |       nom  = "input" ++ show i
130 |       n    = "n" ++ show i
131 |       lengthVar = variable n
132 |       input = namedGlobal nom lengthVar
133 |       t     = typeOf_ (undefined :: t)
134 | 
135 | instance (ToProgram b, Scalar t) => ToProgram (Pull Word32 (Exp t) -> b) where
136 |   toProgram i f (a :- rest) = ((nom,Pointer t):ins,prg)
137 |     where
138 |       (ins,prg) = toProgram (i+1) (f input) rest
139 |       nom  = "input" ++ show i
140 |       input = namedGlobal nom  (len a) 
141 |       t     = typeOf_ (undefined :: t)
142 |   toProgram_ _ _ = error "toProgram_: static length" 
143 | 
144 | 
145 | instance (ToProgram b, Scalar t) => ToProgram (Mutable Global EWord32 (Exp t) -> b) where
146 |   toProgram i f (a :- rest) = ((nom,Pointer t):(n,Word32):ins,prg)
147 |     where
148 |       (ins,prg) = toProgram (i+1) (f input) rest
149 |       nom  = "input" ++ show i
150 |       n    = "n" ++ show i
151 |       lengthVar = variable n
152 |       input = namedMutable nom lengthVar
153 |       t     = typeOf_ (undefined :: t)
154 |   toProgram_ i f = ((nom,Pointer t):(n,Word32):ins,prg)
155 |     where
156 |       (ins,prg) = toProgram_ (i+1) (f input)
157 |       nom  = "input" ++ show i
158 |       n    = "n" ++ show i
159 |       lengthVar = variable n
160 |       input = namedMutable nom lengthVar
161 |       t     = typeOf_ (undefined :: t)
162 | 
163 | namedMutable :: Name -> s -> Mutable mloc s a
164 | namedMutable s v = Mutable v (Single s)
165 | undefinedMutable :: s -> Mutable mloc s a 
166 | undefinedMutable v = Mutable v undefined 
167 | 
168 | 
169 | 
170 | instance (ToProgram b, Scalar t) => ToProgram ((Exp t) -> b) where
171 |   toProgram i f (a :- rest) = ((nom,t):ins,prg)
172 |     where
173 |       (ins,prg) = toProgram (i+1) (f input) rest
174 |       nom  = "input" ++ show i
175 |       input = variable nom -- namedGlobal nom (len a) 
176 |       t     = typeOf_ (undefined :: t)
177 |   toProgram_ i f = ((nom,t):ins,prg)
178 |     where
179 |       (ins,prg) = toProgram_ (i+1) (f input) 
180 |       nom  = "input" ++ show i
181 |       input = variable nom -- namedGlobal nom (len a) 
182 |       t     = typeOf_ (undefined :: t)
183 | 
184 | 
185 | 
186 |     
187 | ---------------------------------------------------------------------------
188 | -- heterogeneous lists of inputs 
189 | ---------------------------------------------------------------------------
190 | data head :- tail = head :- tail
191 | 
192 | infixr 5 :-
193 | 
194 | 
195 | ---------------------------------------------------------------------------
196 | -- Function types to input list types. 
197 | ---------------------------------------------------------------------------
198 | 
199 | type family InputList a
200 | 
201 | type instance InputList (a -> b)        = a :- (InputList b)
202 | type instance InputList (Push Grid l b) = ()
203 | type instance InputList (GProgram b)    = () 
204 | 
205 | -- genKernelSM :: ToProgram prg => Word32 -> String -> prg -> (String, Word32)
206 | -- genKernelSM = genKernelSpecsNL
207 | 
208 | -- genKernelSpecsNL :: ToProgram prg => Word32 -> String -> prg -> (String, Word32)
209 | -- genKernelSpecsNL nt kn prg = (prgStr,bytesShared) 
210 | --   where
211 | --     prgStr = pretty 75 $ ppr $ compile PlatformCUDA (Config nt bytesShared) kn (a,rim) 
212 | --     (a,im) = toProgram_ 0 prg
213 | --     iml = computeLiveness im
214 | --     (m,mm) = mmIM iml sharedMem (M.empty)
215 | --     bytesShared = size m 
216 | --     rim = renameIM mm iml
217 | 
218 | 


--------------------------------------------------------------------------------
/Obsidian/Mutable.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE ScopedTypeVariables,
  2 |              TypeFamilies,
  3 |              EmptyDataDecls,
  4 |              FlexibleInstances #-}
  5 | 
  6 | {- Joel Svensson 2013, 2014 -}
  7 | 
  8 | module Obsidian.Mutable ( Mutable(Mutable)
  9 |                         , Shared
 10 |                         , Global
 11 |                         , MShared
 12 |                         , MGlobal
 13 |                         , newSharedMutable
 14 |                         , writeToSync
 15 |                         , writeTo
 16 |                         , assignMutable
 17 |                         , indexMutable
 18 |                         , mutableToPull
 19 |                         , atomicInc
 20 |                         )  where
 21 | 
 22 | 
 23 | 
 24 | import Obsidian.Exp
 25 | import Obsidian.Program
 26 | import Obsidian.Memory
 27 | import Obsidian.Names
 28 | import Obsidian.Array
 29 | import Obsidian.Atomic
 30 | 
 31 | import Data.Word
 32 | 
 33 | {-
 34 |   Todo: Think about Global vs Shared.
 35 |   Todo: Add creation of mutable global arrays.
 36 | 
 37 |   Todo: Make mutable interface (atomic ops) very low-level
 38 | 
 39 | 
 40 |   TODO: Rethink. Have two sepparate types of mutable arrays.
 41 |         Also Skip the Type family magic if possible.
 42 |         Make both kinds of Mutable arrays an instance of Array
 43 | -}
 44 | 
 45 | ---------------------------------------------------------------------------
 46 | -- Mutable arrays
 47 | ---------------------------------------------------------------------------
 48 | --
 49 | -- Global mutable arrays can only be passed as inputs to a function.
 50 | -- Shared mutable arrays may be created using newS
 51 | --
 52 | 
 53 | data Shared
 54 | data Global
 55 | 
 56 | -- A mutable array has an attached location.
 57 | -- Either it recides in Global or in Shared memory.
 58 | data Mutable mloc s a = Mutable s (Names a)
 59 | 
 60 | type MShared a = Mutable Shared Word32 a
 61 | type MGlobal a = Mutable Global EWord32 a
 62 | 
 63 | instance ArrayLength (Mutable Shared) where
 64 |   len (Mutable n _) = n
 65 | 
 66 | 
 67 | ---------------------------------------------------------------------------
 68 | -- Create Mutable Shared memory arrays
 69 | --   # allocates shared memory
 70 | ---------------------------------------------------------------------------
 71 | 
 72 | -- | Create a new Mutable array in shared memory
 73 | newSharedMutable :: Storable a => SPush Block a -> Program Block (Mutable Shared Word32 a)
 74 | newSharedMutable arr = do
 75 |   (snames :: Names a) <- names "arr"
 76 |   allocateArray snames n
 77 |   let mut = Mutable n snames
 78 |   writeTo mut arr
 79 |   return $ mut -- Mutable n snames
 80 |   where
 81 |     n = len arr
 82 | 
 83 | ---------------------------------------------------------------------------
 84 | -- forceTo & writeTo
 85 | ---------------------------------------------------------------------------
 86 | -- Much Hacking here
 87 | 
 88 | -- | Write a Push array into a mutable array.
 89 | --   There is no synchronisation inserted after the write
 90 | writeTo :: Storable a
 91 |            => Mutable Shared Word32 a
 92 |            -> Push Block Word32 a
 93 |            -> Program Block ()
 94 | writeTo (Mutable n snames) p
 95 |   | n <= m =  p <: assignArray snames
 96 |   | otherwise = error "WriteTo: Incompatible sizes"
 97 |   where
 98 |     m = len p
 99 | 
100 | 
101 | -- Add forceTo with offsets (why? just thought it might be useful)
102 | -- | Write  a Push array into a mutable array and sync
103 | writeToSync :: Storable a
104 |                => Mutable Shared Word32 a
105 |                -> Push Block Word32 a
106 |                -> Program Block ()
107 | writeToSync m arr =
108 |   do
109 |     writeTo m arr
110 |     Sync
111 | 
112 | ---------------------------------------------------------------------------
113 | -- Low level operations
114 | ---------------------------------------------------------------------------
115 | 
116 | -- | Write a value into a storable array.
117 | assignMutable :: Storable a => Mutable loc l a -> EWord32 ->  a -> Program Thread ()
118 | assignMutable  (Mutable _ snames) ix a = assignArray snames a ix
119 | 
120 | indexMutable :: (ASize s, Storable a) => Mutable loc s a -> EWord32 -> a
121 | indexMutable mut ix = mutableToPull mut ! ix
122 | 
123 | ---------------------------------------------------------------------------
124 | -- mutable to pull conversion
125 | ---------------------------------------------------------------------------
126 | 
127 | -- | Convert a Mutable array to a Pull array
128 | mutableToPull :: (ASize s, Storable a) => Mutable l s a -> Pull s a
129 | mutableToPull (Mutable n snames) = pullFrom snames n
130 | 
131 | 
132 | 
133 | ---------------------------------------------------------------------------
134 | -- Atomics
135 | ---------------------------------------------------------------------------
136 | -- | Increment atomically
137 | atomicInc :: forall mloc a s t . AtomicInc a
138 |              => EWord32
139 |              -> Mutable mloc s (Exp a)
140 |              -> TProgram ()
141 | atomicInc ix (Mutable n noms) = mapNamesM_ f noms
142 |   where
143 |     f nom = atomicOp nom ix (AtomicInc  :: Atomic a) >> return ()
144 | 
145 | 
146 | -- | Add atomically
147 | atomicAdd :: forall mloc a s. AtomicAdd a
148 |              => EWord32
149 |              -> Exp a
150 |              -> Mutable mloc s  (Exp a)
151 |              -> TProgram ()
152 | atomicAdd ix v (Mutable n noms) = mapNamesM_ f noms
153 |   where
154 |     f nom = atomicOp nom ix (AtomicAdd v) >> return ()
155 | 
156 | 
157 | -- | Subtract atomically
158 | atomicSub :: forall mloc a s. AtomicSub a
159 |              => EWord32
160 |              -> Exp a
161 |              -> Mutable mloc s (Exp a)
162 |              -> TProgram ()
163 | atomicSub ix v (Mutable n noms) = mapNamesM_ f noms
164 | 
165 |   where
166 |     f nom = atomicOp nom ix (AtomicSub v) >> return ()
167 | 
168 | 
169 | -- Special case ? No.
170 | atomicExch :: forall mloc a s. AtomicExch a
171 |               => EWord32
172 |               -> Exp a
173 |               -> Mutable mloc s (Exp a)
174 |               -> TProgram ()
175 | atomicExch ix v  (Mutable n (Single nom)) = f nom
176 |   where
177 |     f nom = atomicOp nom ix (AtomicExch v)
178 | 
179 | 
180 | {-
181 | 
182 | ---------------------------------------------------------------------------
183 | atomicExch()
184 | 
185 | int atomicExch(int* address, int val);
186 | unsigned int atomicExch(unsigned int* address,
187 |                         unsigned int val);
188 | unsigned long long int atomicExch(unsigned long long int* address,
189 |                                   unsigned long long int val);
190 | float atomicExch(float* address, float val);
191 | 
192 | ---------------------------------------------------------------------------
193 | atomicMin()
194 | 
195 | int atomicMin(int* address, int val);
196 | unsigned int atomicMin(unsigned int* address,
197 |                        unsigned int val);
198 | unsigned long long int atomicMin(unsigned long long int* address,
199 |                                  unsigned long long int val);
200 | 
201 | ---------------------------------------------------------------------------
202 | atomicMax()
203 | 
204 | int atomicMax(int* address, int val);
205 | unsigned int atomicMax(unsigned int* address,
206 |                        unsigned int val);
207 | unsigned long long int atomicMax(unsigned long long int* address,
208 |                                  unsigned long long int val);
209 | 
210 | 
211 | ---------------------------------------------------------------------------
212 | atomicInc()
213 | 
214 | unsigned int atomicInc(unsigned int* address,
215 |                        unsigned int val);
216 | 
217 | ---------------------------------------------------------------------------
218 | atomicDec()
219 | 
220 | unsigned int atomicDec(unsigned int* address,
221 |                        unsigned int val);
222 | 
223 | ---------------------------------------------------------------------------
224 | atomicCAS()
225 | 
226 | int atomicCAS(int* address, int compare, int val);
227 | unsigned int atomicCAS(unsigned int* address,
228 |                        unsigned int compare,
229 |                        unsigned int val);
230 | unsigned long long int atomicCAS(unsigned long long int* address,
231 |                                  unsigned long long int compare,
232 |                                  unsigned long long int val);
233 | 
234 | ---------------------------------------------------------------------------
235 | atomicAnd()
236 | 
237 | int atomicAnd(int* address, int val);
238 | unsigned int atomicAnd(unsigned int* address,
239 |                        unsigned int val);
240 | unsigned long long int atomicAnd(unsigned long long int* address,
241 |                                  unsigned long long int val);
242 | 
243 | ---------------------------------------------------------------------------
244 | atomicOr()
245 | 
246 | int atomicOr(int* address, int val);
247 | unsigned int atomicOr(unsigned int* address,
248 |                       unsigned int val);
249 | unsigned long long int atomicOr(unsigned long long int* address,
250 |                                 unsigned long long int val);
251 | ---------------------------------------------------------------------------
252 | atomicXor()
253 | 
254 | int atomicXor(int* address, int val);
255 | unsigned int atomicXor(unsigned int* address,
256 |                        unsigned int val);
257 | unsigned long long int atomicXor(unsigned long long int* address,
258 |                                  unsigned long long int val);
259 | 
260 | -}
261 | 


--------------------------------------------------------------------------------
/Obsidian/Array.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE MultiParamTypeClasses,
  2 |              FlexibleInstances,
  3 |              GADTs  #-}
  4 | {-# LANGUAGE TypeOperators #-}
  5 | {-# LANGUAGE FlexibleContexts #-}
  6 | 
  7 | {- Joel Svensson 2012..2017
  8 | 
  9 |    Notes:
 10 |     2017-04-22: looking over, cleaning.
 11 |     2014-04-08: Experimenting with API
 12 |     ---- OUTDATED ----
 13 |     2013-08-26: Experimenting with warp programs.
 14 |                 These do not fit that well in established Idioms!
 15 |                 TODO: Improve this situation.
 16 |     ---- OUTDATED ----
 17 |     2013-01-08: Removed number-of-blocks field from Distribs
 18 |     2012-12-10: Drastically shortened.
 19 | -}
 20 | 
 21 | module Obsidian.Array (Pull, Push, SPull, DPull, SPush, DPush
 22 |                       , pushApp
 23 |                       , mkPull
 24 |                       , mkPush
 25 |                       , push
 26 |                       , pushThread
 27 |                       , pushWarp
 28 |                       , pushBlock
 29 |                       , setSize
 30 |                       , (!)
 31 |                       , (<:)
 32 |                       , Array(..)
 33 |                       , ArrayLength(..)
 34 |                       , ASize(..)
 35 |                       , namedGlobal
 36 |                       , undefinedGlobal
 37 |                       ) where
 38 | 
 39 | import Obsidian.Exp
 40 | import Obsidian.Program
 41 | import Obsidian.Globs
 42 | 
 43 | import Prelude hiding (replicate, (<*))
 44 | import Data.Word
 45 | 
 46 | ---------------------------------------------------------------------------
 47 | -- Aliases
 48 | ---------------------------------------------------------------------------
 49 | type SPull = Pull Word32
 50 | type DPull = Pull EWord32
 51 | 
 52 | type SPush t a = Push t Word32 a
 53 | type DPush t a = Push t EWord32 a
 54 | ---------------------------------------------------------------------------
 55 | -- Create arrays
 56 | ---------------------------------------------------------------------------
 57 | -- | An undefined array. Use as placeholder when generating code
 58 | undefinedGlobal :: ASize s => s -> Pull s a
 59 | undefinedGlobal n = Pull n $ \_ -> undefined
 60 | -- | A named global array.
 61 | namedGlobal :: (ASize s, Scalar a) => Name -> s -> Pull s (Exp a)
 62 | namedGlobal name n = Pull n $ \gix -> index name gix
 63 | -- namedPull name n = Pull n $ \gix -> index name gix
 64 | 
 65 | ---------------------------------------------------------------------------
 66 | -- Class ArraySize
 67 | ---------------------------------------------------------------------------
 68 | -- | ASize provides conversion to Exp Word32 for array sizes
 69 | class (Integral a, Num a) => ASize a where
 70 |   sizeConv :: a ->  Exp Word32
 71 | 
 72 | instance ASize Word32 where
 73 |   sizeConv = fromIntegral
 74 | 
 75 | instance ASize (Exp Word32) where
 76 |   sizeConv = id
 77 | 
 78 | ---------------------------------------------------------------------------
 79 | -- Push and Pull arrays
 80 | ---------------------------------------------------------------------------
 81 | -- | Push array. Parameterised over Program type and size type.
 82 | data Push t s a =
 83 |   Push s (PushFun t a)
 84 | 
 85 | type PushFun t a = Writer a -> Program t ()
 86 | type Writer    a = a -> EWord32 -> TProgram ()
 87 | 
 88 | -- | Pull array.
 89 | data Pull s a = Pull {pullLen :: s,
 90 |                       pullFun :: EWord32 -> a}
 91 | 
 92 | -- | Create a push array.
 93 | mkPush :: s
 94 |        -> ((a -> EWord32 -> TProgram ()) -> Program t ())
 95 |        -> Push t s a
 96 | mkPush n p = Push n p
 97 | 
 98 | -- | Create a pull array.
 99 | mkPull :: s -> (EWord32 -> a) -> Pull s a
100 | mkPull n p = Pull n p
101 | 
102 | -- Fix this.
103 | --   * you cannot safely resize either push or pull arrays
104 | --   * you can shorten pull arrays safely.
105 | setSize :: ASize l => l -> Pull l a -> Pull l a
106 | setSize n (Pull _ ixf) = mkPull n ixf
107 | 
108 | 
109 | ---------------------------------------------------------------------------
110 | -- Array Class
111 | ---------------------------------------------------------------------------x
112 | class ArrayLength a where
113 |   -- | Get the length of an array.
114 |   len :: a s e -> s
115 | 
116 | instance ArrayLength Pull where
117 |   len    arr  = pullLen arr
118 | 
119 | instance ArrayLength (Push t) where
120 |   len  (Push s _) = s
121 | 
122 | class Array a where
123 |   -- | Array of consecutive integers
124 |   iota      :: ASize s => s -> a s EWord32
125 |   -- | Create an array by replicating an element.
126 |   replicate :: ASize s => s -> e -> a s e
127 | 
128 |   -- | Map a function over an array.
129 |   aMap      :: (e -> e') -> a s e -> a s e'
130 |   -- | Perform arbitrary permutations (dangerous).
131 |   ixMap     :: (EWord32 -> EWord32)
132 |                -> a s e -> a s e
133 |   -- requires Choice !
134 |   -- Simply because the pull array implementation of it does.
135 |   -- | Append two arrays.
136 |   append    :: (ASize s, Choice e) => a s e -> a s e -> a s e
137 | 
138 |   -- technicalities
139 |   -- | Statically sized array to dynamically sized array.
140 |   toDyn     :: a Word32 e -> a EW32 e
141 |   -- | Dynamically sized array to statically sized array.
142 |   fromDyn   :: Word32 -> a EW32 e -> a Word32 e
143 | 
144 | instance Array Pull where
145 |   iota   s = Pull s $ \ix -> ix
146 |   replicate s e = Pull s $ \_ -> e
147 | 
148 |   aMap   f (Pull n ixf) = Pull n (f . ixf)
149 |   ixMap  f (Pull n ixf) = Pull n (ixf . f)
150 | 
151 |   append a1 a2 = Pull (n1+n2)
152 |                $ \ix -> ifThenElse (ix <* sizeConv n1)
153 |                        (a1 ! ix)
154 |                        (a2 ! (ix - sizeConv n1))
155 |     where
156 |       n1 = len a1
157 |       n2 = len a2
158 | 
159 |   -- technicalities
160 |   toDyn (Pull n ixf) = Pull (fromIntegral n) ixf
161 |   fromDyn n (Pull _ ixf) = Pull n ixf
162 | 
163 | 
164 | instance Array (Push Thread) where
165 |   iota s = Push s $ \wf ->
166 |     do
167 |       forAll (sizeConv s) $ \ix -> wf ix ix
168 |   replicate s e = Push s $ \wf ->
169 |     do
170 |       forAll (sizeConv s) $ \ix -> wf e ix
171 |   aMap   f (Push s p) = Push s $ \wf -> p (\e ix -> wf (f e) ix)
172 |   ixMap  f (Push s p) = Push s $ \wf -> p (\e ix -> wf e (f ix))
173 | 
174 |   -- unfortunately a Choice constraint.
175 |   append p1 p2  =
176 |     Push (n1 + n2) $ \wf ->
177 |       do p1 <: wf
178 |          p2 <: \a i -> wf a (sizeConv n1 + i)
179 |            where
180 |              n1 = len p1
181 |              n2 = len p2
182 | 
183 |    -- technicalities
184 |   toDyn (Push n p) = Push (fromIntegral n) p
185 |   fromDyn n (Push _ p) = Push n p
186 | 
187 | instance Array (Push Warp) where
188 |   iota s = Push s $ \wf ->
189 |     do
190 |       forAll (sizeConv s) $ \ix -> wf ix ix
191 |   replicate s e = Push s $ \wf ->
192 |     do
193 |       forAll (sizeConv s) $ \ix -> wf e ix
194 |   aMap   f (Push s p) = Push s $ \wf -> p (\e ix -> wf (f e) ix)
195 |   ixMap  f (Push s p) = Push s $ \wf -> p (\e ix -> wf e (f ix))
196 | 
197 |   -- unfortunately a Choice constraint.
198 |   append p1 p2  =
199 |     Push (n1 + n2) $ \wf ->
200 |       do p1 <: wf
201 |          p2 <: \a i -> wf a (sizeConv n1 + i)
202 |            where
203 |              n1 = len p1
204 |              n2 = len p2
205 | 
206 |    -- technicalities
207 |   toDyn (Push n p) = Push (fromIntegral n) p
208 |   fromDyn n (Push _ p) = Push n p
209 | 
210 | 
211 | instance Array (Push Block) where
212 |   iota s = Push s $ \wf ->
213 |     do
214 |       forAll (sizeConv s) $ \ix -> wf ix ix
215 |   replicate s e = Push s $ \wf ->
216 |     do
217 |       forAll (sizeConv s) $ \ix -> wf e ix
218 |   aMap   f (Push s p) = Push s $ \wf -> p (\e ix -> wf (f e) ix)
219 |   ixMap  f (Push s p) = Push s $ \wf -> p (\e ix -> wf e (f ix))
220 | 
221 |   -- unfortunately a Choice constraint.
222 |   append p1 p2  =
223 |     Push (n1 + n2) $ \wf ->
224 |       do p1 <: wf
225 |          p2 <: \a i -> wf a (sizeConv n1 + i)
226 |            where
227 |              n1 = len p1
228 |              n2 = len p2
229 | 
230 |    -- technicalities
231 |   toDyn (Push n p) = Push (fromIntegral n) p
232 |   fromDyn n (Push _ p) = Push n p
233 | 
234 | instance Array (Push Grid) where
235 |   iota s = error "iota: not supported as Grid"
236 |   replicate s e = error "replicate: not supported as Grid"
237 |   aMap   f (Push s p) = Push s $ \wf -> p (\e ix -> wf (f e) ix)
238 |   ixMap  f (Push s p) = Push s $ \wf -> p (\e ix -> wf e (f ix))
239 | 
240 |   -- unfortunately a Choice constraint.
241 |   append p1 p2  =
242 |     mkPush (n1 + n2) $ \wf ->
243 |       do p1 <: wf
244 |          p2 <: \a i -> wf a (sizeConv n1 + i)
245 |            where
246 |              n1 = len p1
247 |              n2 = len p2
248 | 
249 |    -- technicalities
250 |   toDyn (Push n p) = Push (fromIntegral n) p
251 |   fromDyn n (Push _ p) = Push n p
252 | 
253 | 
254 | 
255 | 
256 | ---------------------------------------------------------------------------
257 | -- Functor instance Pull/Push arrays
258 | ---------------------------------------------------------------------------
259 | instance  Array (Push t) => Functor (Push t s) where
260 |   fmap = aMap
261 | 
262 | instance Functor (Pull s) where
263 |   fmap = aMap
264 | 
265 | 
266 | ---------------------------------------------------------------------------
267 | -- Pushable
268 | ---------------------------------------------------------------------------
269 | -- | Convert a pull array to a push array.
270 | push ::  (t *<=* Block) => ASize s => Pull s e -> Push t s e
271 | push (Pull n ixf) =
272 |   mkPush n $ \wf ->
273 |     forAll (sizeConv n) $ \i -> wf (ixf i) i
274 | 
275 | -- Keep push, user may need to annotate with a type
276 | -- Add specific
277 | --   pushThread
278 | --   pushWarp
279 | --   pushBlock
280 | pushThread :: ASize s => Pull s e -> Push Thread s e
281 | pushThread = push
282 | 
283 | pushWarp :: ASize s => Pull s e ->  Push Warp s e
284 | pushWarp = push
285 | 
286 | pushBlock :: ASize s => Pull s e -> Push Block s e
287 | pushBlock = push
288 | 
289 | --------------------------------------------------------------------------
290 | -- Indexing, array creation.
291 | ---------------------------------------------------------------------------
292 | pushApp :: Push t s a -> (a -> EWord32 -> TProgram ()) -> Program t ()
293 | pushApp (Push _ p) a = p a
294 | 
295 | infixl 9 <:
296 | (<:) :: Push t s a
297 |         -> (a -> EWord32 -> Program Thread ())
298 |         -> Program t ()
299 | (<:) = pushApp
300 | 
301 | infixl 9 !
302 | (!) :: Pull s e -> Exp Word32 -> e
303 | (!) arr = pullFun arr
304 | 
305 | 
306 | ---------------------------------------------------------------------------
307 | --
308 | ---------------------------------------------------------------------------
309 | 


--------------------------------------------------------------------------------
/Obsidian/CodeGen/Memory.hs:
--------------------------------------------------------------------------------
  1 | 
  2 | {- Joel Svensson 2012, 2013
  3 |  
  4 |    notes:
  5 |      Added a SeqFor case Jan-21-2013
  6 | 
  7 |  -} 
  8 | module Obsidian.CodeGen.Memory 
  9 |        (MemMap,
 10 |         Memory,
 11 |         allocate,
 12 |         free,
 13 |         freeAll,
 14 |         size, 
 15 |         sharedMem,  
 16 |         Address,
 17 |         Bytes,
 18 |         mmIM,
 19 |         renameIM ) 
 20 |        where 
 21 | 
 22 | import qualified Data.List as List
 23 | import qualified Data.Set  as Set
 24 | import Data.Word
 25 | 
 26 | import Obsidian.Types
 27 | import Obsidian.Globs
 28 | 
 29 | import Obsidian.Exp 
 30 | import Obsidian.CodeGen.Program
 31 | import Obsidian.CodeGen.Liveness
 32 | 
 33 | import qualified Data.Map as Map 
 34 | 
 35 | ---------------------------------------------------------------------------
 36 | -- Planned improvements
 37 | ---------------------------------------------------------------------------
 38 | -- # DONE: Always start a shared memory array at a "bank-aligned" address
 39 | --      + So that programmer can really effect access patterns. 
 40 | -- # DONE: Do not rename with pointers
 41 | --      instead output a list of (type ,name,address) quadruples
 42 | --      that can be used to create an alias (at top scope of program)
 43 | --      + Prettier code -> easier debugging 
 44 | --      + Potential efficiency issues, from less casting etc
 45 | -- # (LONG-TERM) Clever memory allocation
 46 | --      + The future is known! So encode the optimal memory
 47 | --        allocation schema
 48 | 
 49 | -- Assumptions:
 50 | --   # an array declares as being __shared__ in the CUDA code
 51 | --     will be well aligned with the banks
 52 | 
 53 | ---------------------------------------------------------------------------
 54 | -- Memory layout
 55 | ---------------------------------------------------------------------------
 56 | 
 57 | type MemMap = Map.Map Name (Word32,Type)
 58 | 
 59 | type Address = Word32
 60 | type Bytes   = Word32 
 61 | 
 62 | data Memory = Memory {freeList  :: [(Address,Bytes)] ,
 63 |                       allocated :: [(Address,Bytes)] , 
 64 |                       size      :: Bytes} -- how much used
 65 |             deriving Show 
 66 |               
 67 |               
 68 | -- 48 kilobytes of smem              
 69 | sharedMem = Memory [(0,49152)] [] 0
 70 | 
 71 | 
 72 | updateMax :: Memory -> Memory 
 73 | updateMax mem = let m = maximum [a+b|(a,b) <- allocated mem]
 74 |                     m' = max m (size mem)
 75 |                 in mem {size = m'}
 76 | 
 77 | -- This one needs to check that shared memory is not full.
 78 | allocate :: Memory -> Bytes -> (Memory,Address)
 79 | allocate m b = 
 80 |   let adress = filter (\(x,y) -> y >= b) (freeList m) -- get a list of candidates
 81 |       -- getTop mem = let (a,b)  = case null (allocated m) of 
 82 |       --                    False -> maximum $ List.sort (allocated m) 
 83 |       --                    True  -> (0,0)
 84 |       --              in a+b
 85 |   in case adress of 
 86 |     -- use the first candidate (try better approaches 
 87 |     -- such as searching for best match, so that not to waste memory)
 88 |     ((a,bytes):_)  -> let fl = filter (\(addr,_) -> a /= addr) (freeList m)
 89 |                           fl' = if b < bytes 
 90 |                                 then (a+b,bytes-b):fl
 91 |                                 else fl
 92 |                       in  (updateMax (m {freeList = fl', 
 93 |                                          allocated = (a,b):allocated m}) ,a)
 94 |     [] -> error "out of shared memory"
 95 |    
 96 |                           
 97 | free :: Memory -> Address -> Memory
 98 | free m a = mem 
 99 |     where 
100 |       bytes = lookup a (allocated m)
101 |       al    = filter (\(addr,_) -> a /= addr) (allocated m)
102 | 
103 |       -- TODO: Investigate this much closer.
104 |       --       Is it a bug or is freeing a non allocated memory area
105 |       --       OK?
106 |       
107 |       mem   = case bytes of 
108 |                 Nothing -> m
109 |                 {-
110 |                   error $ "error: Address " ++ show a ++
111 |                           " not found in allocated list" ++
112 |                           "\n" ++ show m
113 |                 -} 
114 |                 Just b -> m {freeList = compress ((a,b):(freeList m)),
115 |                              allocated = al}
116 | 
117 | freeAll :: Memory -> [Address] -> Memory 
118 | freeAll m [] = m
119 | freeAll m (a:as) = freeAll (free m a) as
120 | 
121 | compress = merge . List.sort 
122 | 
123 | merge [] = [] 
124 | merge [x] = [x]
125 | merge ((x,b):(y,b2):xs) = if (x+b == y)
126 |                           then merge ((x,b+b2):xs) 
127 |                           else (x,b):merge((y,b2):xs)
128 | 
129 | ---------------------------------------------------------------------------
130 | -- Memory map the new IM
131 | ---------------------------------------------------------------------------
132 | mmIM :: IML -> Memory -> MemMap -> (Memory, MemMap)
133 | mmIM im memory memmap = r im (memory,memmap)
134 |   where
135 |     r [] m = m
136 |     r (x:xs) (m,mm) =
137 |       let
138 |           (m',mm') = process x m mm
139 |            
140 |           freeable = getFreeableSet x xs
141 |           freeableAddrs = mapM (flip Map.lookup mm') (filter dontMap (Set.toList freeable))
142 |           dontMap name = not ((List.isPrefixOf "input" name) || 
143 |                               (List.isPrefixOf "output" name))
144 |           mNew =
145 |             case freeableAddrs of
146 |               (Just as) -> freeAll m' (map fst as)
147 |               Nothing   -> m'
148 |       in r xs (mNew,mm')
149 | mmIM' :: IML -> Memory -> MemMap -> (Memory, MemMap)
150 | mmIM' im memory memmap = r im (memory,memmap)
151 |   where
152 |     r [] m = m
153 |     r (x:xs) (m,mm) =
154 |       let
155 |           (m',mm') = process x m mm
156 |            
157 |           freeable = getFreeableSet x xs
158 |           freeableAddrs = mapM (flip Map.lookup mm') (filter dontMap (Set.toList freeable))
159 |           dontMap name = not ((List.isPrefixOf "input" name) || 
160 |                               (List.isPrefixOf "output" name))
161 |           mNew =
162 |             case freeableAddrs of
163 |               (Just as) -> m' -- freeAll m' (map fst as)
164 |               Nothing   -> m'
165 |       in r xs (mNew,mm')
166 | 
167 |     
168 | process (SAllocate name size t,_) m mm = (m',mm') 
169 |   where (m',addr) = allocate m size
170 |         mm' = case Map.lookup name mm of
171 |           Nothing -> Map.insert name (addr,t) mm
172 |           (Just (a, t)) -> error $ "mmIm: " ++ name ++ " is already mapped to " ++ show a
173 | 
174 |     -- A tricky case.                      
175 | --    process (SForAllBlocks n im,_) m mm = mmIM im m mm
176 |     -- Another tricky case. 
177 | process (SSeqFor _ n im,_) m mm = mmIM im m mm
178 | process (SSeqWhile b im,_) m mm = mmIM im m mm 
179 |     -- Yet another tricky case.
180 | process (SForAll _ n im,_) m mm = mmIM im m mm
181 | process (SDistrPar Warp n im,_) m mm = mmIM' im m mm -- mmIM im m mm
182 | process (SDistrPar Block n im,_) m mm = mmIM im m mm 
183 |     -- The worst of them all.
184 | --    process (SForAllThreads n im,_) m mm = mmIM im m mm
185 | --    process (SNWarps _ im,_) m mm = mmIM im m mm
186 | --    process (SWarpForAll _ im,_) m mm = mmIM im m mm 
187 | 
188 |   --  process im m mm = error $ printStm im -- "process: WHat!"
189 | process (_,_) m mm = (m,mm) 
190 | 
191 | -- Friday (2013 Mars 29, discovered bug) 
192 | getFreeableSet :: (Statement Liveness,Liveness) -> IML -> Liveness 
193 | getFreeableSet (_,l) [] = Set.empty -- not l ! 
194 | getFreeableSet (_,l) ((_,l1):_) = l Set.\\ l1
195 | 
196 | ---------------------------------------------------------------------------
197 | -- Rename arrays in IM
198 | --------------------------------------------------------------------------- 
199 | 
200 | renameIM :: MemMap -> IML -> IMList ()
201 | renameIM mm im = zip (map (go . fst) im) (repeat ())
202 |   where
203 |     go (SAssign name ix e) = SAssign (renameIVar mm name)
204 |                                      (map (renameIExp mm) ix)
205 |                                      (renameIExp mm e)
206 |     go (SAtomicOp name ix atop) = SAtomicOp (renameIVar mm name)
207 |                                             (renameIExp mm ix)
208 |                                             (renameAtOp mm atop) 
209 |     go (SCond be im) = SCond (renameIExp mm be)
210 |                              (renameIM mm im)
211 |     go (SSeqFor str n im) = SSeqFor str (renameIExp mm n)
212 |                                         (renameIM mm im)
213 |     go SBreak = SBreak
214 |     go (SSeqWhile n im) = SSeqWhile (renameIExp mm n)
215 |                                     (renameIM mm im)
216 |     go (SForAll lvl n im)   = SForAll lvl (renameIExp mm n)
217 |                                           (renameIM mm im)
218 |     go (SDistrPar lvl n im) = SDistrPar lvl (renameIExp mm n)
219 |                                             (renameIM mm im) 
220 | 
221 | --    go (SForAllBlocks n im) = SForAllBlocks (renameIExp mm n)
222 | --                                            (renameIM mm im)
223 | --    go (SNWarps n im) = SNWarps (renameIExp mm n)
224 | --                                (renameIM mm im)
225 | --    go (SWarpForAll n im) = SWarpForAll (renameIExp mm n)
226 | --                                        (renameIM mm im) 
227 |     -- Strip this out earlier. 
228 |     go (SAllocate name n t)  = SAllocate name n t 
229 |     go (SDeclare name t) = SDeclare name t
230 |     go SSynchronize      = SSynchronize 
231 | 
232 | ---------------------------------------------------------------------------
233 | -- Memory map the arrays in an CExpr
234 | ---------------------------------------------------------------------------
235 | renameIExp mm e@(IVar nom t) =  renameIVar mm e 
236 | renameIExp mm (IIndex (e1,es) t) = IIndex (renameIExp mm e1, map (renameIExp mm) es) t
237 | renameIExp mm (IBinOp op e1 e2 t) = IBinOp op (renameIExp mm e1) (renameIExp mm e2) t
238 | renameIExp mm (IUnOp op e t) = IUnOp op (renameIExp mm e) t 
239 | renameIExp mm (IFunCall nom exprs t) = IFunCall nom (map (renameIExp mm) exprs) t
240 | renameIExp mm (ICast e t) = ICast (renameIExp mm e) t
241 | renameIExp mm (ICond e1 e2 e3 t) = ICond (renameIExp mm e1)
242 |                                          (renameIExp mm e2)
243 |                                          (renameIExp mm e3)
244 |                                          t
245 | renameIExp mm a = a
246 | 
247 | 
248 | renameIVar mm (IVar name t) =
249 |     case Map.lookup name mm of 
250 |     Just (addr,t) -> 
251 |       let core = sbaseIExp addr 
252 |           cast c = ICast c t
253 |       in cast core
254 |     
255 |     Nothing -> IVar name t
256 |     where
257 |       sbaseIExp 0    = IVar "sbase" (Pointer Word8) 
258 |       sbaseIExp addr = IBinOp IAdd (IVar "sbase" (Pointer Word8)) 
259 |                                    (IWord32 addr) 
260 |                                    (Pointer Word8) 
261 | 
262 | renameAtOp mm AtInc = AtInc
263 | renameAtOp mm (AtAdd e) = AtAdd (renameIExp mm e)
264 | renameAtOp mm (AtSub e) = AtSub (renameIExp mm e)
265 | renameAtOp mm (AtExch e) = AtExch (renameIExp mm e) 
266 | 
267 | 
268 | 


--------------------------------------------------------------------------------
/Obsidian/Memory.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE ScopedTypeVariables,
  2 |              GADTs #-}
  3 | 
  4 | {- Joel Svensson 2013
  5 | 
  6 |    This Module became quite messy.
  7 |    TODO: CLEAN IT UP!
  8 | 
  9 |    notes: 2013-05-02: Cleaned out inspect.
 10 | 
 11 | -}
 12 | 
 13 | module Obsidian.Memory (Storable(..))  where
 14 | 
 15 | 
 16 | import Obsidian.Program
 17 | import Obsidian.Exp
 18 | import Obsidian.Types
 19 | import Obsidian.Array -- Importing this feels a bit strange.
 20 | import Obsidian.Names
 21 | 
 22 | import Data.Word
 23 | 
 24 | -- class MemoryOps a => Storable a
 25 | 
 26 | ---------------------------------------------------------------------------
 27 | -- Local Memory
 28 | ---------------------------------------------------------------------------
 29 | class Storable a where
 30 |   -- | Obtain new names for variables / arrays
 31 |   names          :: String -> Program t (Names a)
 32 | 
 33 |   -- Array operations
 34 |   assignArray    :: Names a -> a -> Exp Word32 -> Program Thread ()
 35 |   allocateArray  :: Names a -> Word32 -> Program t ()
 36 |   pullFrom       :: ASize s => Names a -> s -> Pull s a
 37 | 
 38 | 
 39 |   -- Scalar operations
 40 |   assignScalar   :: Names a -> a -> Program Thread ()
 41 |   allocateScalar :: Names a ->  Program t ()
 42 |   allocateSharedScalar :: Names a -> Program t ()
 43 |   readFrom       :: Names a -> a
 44 | 
 45 |   -- Warp level operations
 46 |   warpAssignArray   :: Names a
 47 |                       -> EWord32
 48 |                       -> Word32
 49 |                       -> a
 50 |                       -> EWord32
 51 |                       -> Program Thread ()
 52 |   warpPullFrom      :: Names a -> EWord32 -> Word32 -> Pull Word32 a
 53 | 
 54 |   threadAssignArray :: Names a
 55 |                        -> EWord32
 56 |                        -> Word32
 57 |                        -> a
 58 |                        -> EWord32
 59 |                        -> Program Thread ()
 60 |   threadPullFrom :: Names a -> EWord32 -> Word32 -> Pull Word32 a
 61 | 
 62 |   -- Extra
 63 |   allocateVolatileArray :: Names a -> Word32 -> Program t ()
 64 | 
 65 | 
 66 | 
 67 | 
 68 | ---------------------------------------------------------------------------
 69 | -- Instances
 70 | ---------------------------------------------------------------------------
 71 | instance Scalar a => Storable (Exp a) where
 72 | 
 73 |   -- Names
 74 |   names pre = do {i <- uniqueNamed pre; return (Single i)}
 75 | 
 76 |   --Array ops
 77 |   allocateArray (Single name) n =
 78 |     Allocate name (n * fromIntegral (sizeOf (undefined :: Exp a)))
 79 |                   (Pointer (typeOf (undefined :: Exp a)))
 80 |   assignArray  (Single name) a ix = Assign name [ix] a
 81 |   pullFrom (Single name) n = mkPull n (\i -> index name i)
 82 | 
 83 |   -- Scalar ops
 84 |   allocateScalar (Single name) =
 85 |     Declare name (typeOf (undefined :: Exp a))
 86 |   allocateSharedScalar (Single name) =
 87 |     Declare name (Shared $ typeOf (undefined :: Exp a))
 88 | 
 89 |   assignScalar (Single name) a    = Assign name [] a
 90 |   readFrom  (Single name) = variable name
 91 | 
 92 |   -- Warp ops
 93 |   warpAssignArray (Single name) warpId step a ix =
 94 |     Assign name [warpId * fromIntegral step + ix] a
 95 | 
 96 |   warpPullFrom (Single name) warpId n
 97 |     = mkPull n (\i -> index name (warpId * fromIntegral n + i))
 98 | 
 99 |   -- Thread ops
100 |   threadAssignArray (Single name) threadId step a ix =
101 |     Assign name [threadId * fromIntegral step + ix] a
102 | 
103 |   threadPullFrom (Single name) threadId n
104 |     = mkPull n (\i -> index name (threadId * fromIntegral n + i))
105 | 
106 |   -- Extra
107 |   allocateVolatileArray (Single name) n =
108 |     Allocate name (n * fromIntegral (sizeOf (undefined :: Exp a)))
109 |                   (Volatile (Pointer (typeOf (undefined :: Exp a))))
110 | 
111 | 
112 | 
113 | instance (Storable a, Storable b) => Storable (a, b) where
114 |   names pre =
115 |     do
116 |       (a' :: Names a) <- names pre
117 |       (b' :: Names b) <- names pre
118 |       return $ Tuple a' b'
119 |   allocateArray (Tuple ns1 ns2)  n =
120 |       allocateArray ns1 n >>
121 |       allocateArray ns2 n
122 | 
123 |   allocateVolatileArray (Tuple ns1 ns2)  n =
124 |       allocateVolatileArray ns1 n >>
125 |       allocateVolatileArray ns2 n
126 | 
127 | 
128 |   allocateScalar (Tuple ns1 ns2) =
129 |       allocateScalar ns1 >>
130 |       allocateScalar ns2
131 | 
132 |   allocateSharedScalar (Tuple ns1 ns2) =
133 |       allocateSharedScalar ns1 >>
134 |       allocateSharedScalar ns2
135 | 
136 | 
137 |   assignArray (Tuple ns1 ns2) (a,b) ix =
138 |       assignArray ns1 a ix >>
139 |       assignArray ns2 b ix
140 | 
141 |   warpAssignArray (Tuple ns1 ns2) warpID step (a,b) ix =
142 |       warpAssignArray ns1 warpID step a ix >>
143 |       warpAssignArray ns2 warpID step b ix
144 | 
145 |   threadAssignArray (Tuple ns1 ns2) threadId step (a,b) ix =
146 |     threadAssignArray ns1 threadId step a ix >>
147 |     threadAssignArray ns2 threadId step b ix
148 | 
149 |   assignScalar (Tuple ns1 ns2) (a,b) =
150 |       assignScalar ns1 a >>
151 |       assignScalar ns2 b
152 | 
153 |   pullFrom (Tuple ns1 ns2) n =
154 |     let p1 = pullFrom ns1 n
155 |         p2 = pullFrom ns2 n
156 |     in mkPull n (\ix -> (p1 ! ix, p2 ! ix))
157 | 
158 |   warpPullFrom (Tuple ns1 ns2) warpID n
159 |     = let p1 = warpPullFrom ns1 warpID n
160 |           p2 = warpPullFrom ns2 warpID n
161 |       in mkPull n (\ix -> (p1 ! ix, p2 ! ix))
162 | 
163 |   threadPullFrom (Tuple ns1 ns2) threadId n
164 |     = let p1 = threadPullFrom ns1 threadId n
165 |           p2 = threadPullFrom ns2 threadId n
166 |       in mkPull n (\ix -> (p1 ! ix, p2 ! ix))
167 | 
168 |   readFrom (Tuple ns1 ns2)  =
169 |     let p1 = readFrom ns1
170 |         p2 = readFrom ns2
171 |     in (p1,p2)
172 | 
173 | 
174 | 
175 | 
176 | instance (Storable a, Storable b, Storable c) => Storable (a, b, c) where
177 |   names pre =
178 |     do
179 |       (a :: Names a) <- names pre
180 |       (b :: Names b) <- names pre
181 |       (c :: Names c) <- names pre
182 |       return $ Triple a b c
183 | 
184 |   allocateArray (Triple ns1 ns2 ns3) n =
185 |       allocateArray ns1 n >>
186 |       allocateArray ns2 n >>
187 |       allocateArray ns3 n
188 | 
189 |   allocateVolatileArray (Triple ns1 ns2 ns3) n =
190 |       allocateVolatileArray ns1 n >>
191 |       allocateVolatileArray ns2 n >>
192 |       allocateVolatileArray ns3 n
193 | 
194 |   allocateScalar (Triple ns1 ns2 ns3) =
195 |       allocateScalar ns1 >>
196 |       allocateScalar ns2 >>
197 |       allocateScalar ns3
198 | 
199 |   allocateSharedScalar (Triple ns1 ns2 ns3) =
200 |       allocateSharedScalar ns1 >>
201 |       allocateSharedScalar ns2 >>
202 |       allocateSharedScalar ns3
203 | 
204 | 
205 |   assignArray (Triple ns1 ns2 ns3) (a,b,c) ix =
206 |       assignArray ns1 a ix >>
207 |       assignArray ns2 b ix >>
208 |       assignArray ns3 c ix
209 | 
210 |   warpAssignArray (Triple ns1 ns2 ns3) warpID step (a,b,c) ix =
211 |       warpAssignArray ns1 warpID step a ix >>
212 |       warpAssignArray ns2 warpID step b ix >>
213 |       warpAssignArray ns3 warpID step c ix
214 | 
215 |   threadAssignArray (Triple ns1 ns2 ns3) threadID step (a,b,c) ix =
216 |     threadAssignArray ns1 threadID step a ix >>
217 |     threadAssignArray ns2 threadID step b ix >>
218 |     threadAssignArray ns3 threadID step c ix
219 | 
220 | 
221 |   assignScalar (Triple ns1 ns2 ns3) (a,b,c) =
222 |       assignScalar ns1 a >>
223 |       assignScalar ns2 b >>
224 |       assignScalar ns3 c
225 | 
226 |   pullFrom (Triple ns1 ns2 ns3) n =
227 |     let p1 = pullFrom ns1 n
228 |         p2 = pullFrom ns2 n
229 |         p3 = pullFrom ns3 n
230 |     in mkPull n (\ix -> (p1 ! ix, p2 ! ix,p3 ! ix))
231 | 
232 |   warpPullFrom (Triple ns1 ns2 ns3) warpID n
233 |     = let p1 = warpPullFrom ns1 warpID n
234 |           p2 = warpPullFrom ns2 warpID n
235 |           p3 = warpPullFrom ns3 warpID n
236 |       in mkPull n (\ix -> (p1 ! ix, p2 ! ix, p3 ! ix))
237 | 
238 |   threadPullFrom (Triple ns1 ns2 ns3) threadId n
239 |     = let p1 = threadPullFrom ns1 threadId n
240 |           p2 = threadPullFrom ns2 threadId n
241 |           p3 = threadPullFrom ns3 threadId n
242 |       in mkPull n (\ix -> (p1 ! ix, p2 ! ix,p3 ! ix))
243 | 
244 | 
245 |   readFrom (Triple ns1 ns2 ns3)  =
246 |     let p1 = readFrom ns1
247 |         p2 = readFrom ns2
248 |         p3 = readFrom ns3
249 |     in (p1,p2,p3)
250 | 
251 | instance (Storable a, Storable b, Storable c, Storable d) => Storable (a, b, c, d) where
252 |   names pre =
253 |     do
254 |       (a :: Names a) <- names pre
255 |       (b :: Names b) <- names pre
256 |       (c :: Names c) <- names pre
257 |       (d :: Names d) <- names pre
258 |       return $ Quadruple a b c d
259 | 
260 |   allocateArray (Quadruple ns1 ns2 ns3 ns4) n =
261 |       allocateArray ns1 n >>
262 |       allocateArray ns2 n >>
263 |       allocateArray ns3 n >>
264 |       allocateArray ns4 n
265 | 
266 |   allocateVolatileArray (Quadruple ns1 ns2 ns3 ns4) n =
267 |       allocateVolatileArray ns1 n >>
268 |       allocateVolatileArray ns2 n >>
269 |       allocateVolatileArray ns3 n >>
270 |       allocateVolatileArray ns4 n
271 | 
272 |   allocateScalar (Quadruple ns1 ns2 ns3 ns4) =
273 |       allocateScalar ns1 >>
274 |       allocateScalar ns2 >>
275 |       allocateScalar ns3 >>
276 |       allocateScalar ns4
277 | 
278 |   allocateSharedScalar (Quadruple ns1 ns2 ns3 ns4) =
279 |       allocateSharedScalar ns1 >>
280 |       allocateSharedScalar ns2 >>
281 |       allocateSharedScalar ns3 >>
282 |       allocateSharedScalar ns4
283 | 
284 | 
285 |   assignArray (Quadruple ns1 ns2 ns3 ns4) (a,b,c,d) ix =
286 |       assignArray ns1 a ix >>
287 |       assignArray ns2 b ix >>
288 |       assignArray ns3 c ix >>
289 |       assignArray ns4 d ix
290 | 
291 |   warpAssignArray (Quadruple ns1 ns2 ns3 ns4) warpID step (a,b,c,d) ix =
292 |       warpAssignArray ns1 warpID step a ix >>
293 |       warpAssignArray ns2 warpID step b ix >>
294 |       warpAssignArray ns3 warpID step c ix >>
295 |       warpAssignArray ns4 warpID step d ix
296 | 
297 |   threadAssignArray (Quadruple ns1 ns2 ns3 ns4) threadID step (a,b,c,d) ix =
298 |     threadAssignArray ns1 threadID step a ix >>
299 |     threadAssignArray ns2 threadID step b ix >>
300 |     threadAssignArray ns3 threadID step c ix >>
301 |     threadAssignArray ns4 threadID step d ix
302 | 
303 | 
304 |   assignScalar (Quadruple ns1 ns2 ns3 ns4) (a,b,c,d) =
305 |       assignScalar ns1 a >>
306 |       assignScalar ns2 b >>
307 |       assignScalar ns3 c >>
308 |       assignScalar ns4 d
309 | 
310 |   pullFrom (Quadruple ns1 ns2 ns3 ns4) n =
311 |     let p1 = pullFrom ns1 n
312 |         p2 = pullFrom ns2 n
313 |         p3 = pullFrom ns3 n
314 |         p4 = pullFrom ns4 n
315 |     in mkPull n (\ix -> (p1 ! ix, p2 ! ix, p3 ! ix, p4 ! ix))
316 | 
317 |   warpPullFrom (Quadruple ns1 ns2 ns3 ns4) warpID n
318 |    = let p1 = warpPullFrom ns1 warpID n
319 |          p2 = warpPullFrom ns2 warpID n
320 |          p3 = warpPullFrom ns3 warpID n
321 |          p4 = warpPullFrom ns4 warpID n
322 |       in mkPull n (\ix -> (p1 ! ix, p2 ! ix, p3 ! ix, p4 ! ix))
323 | 
324 |   threadPullFrom (Quadruple ns1 ns2 ns3 ns4) threadID n
325 |    = let p1 = threadPullFrom ns1 threadID n
326 |          p2 = threadPullFrom ns2 threadID n
327 |          p3 = threadPullFrom ns3 threadID n
328 |          p4 = threadPullFrom ns4 threadID n
329 |       in mkPull n (\ix -> (p1 ! ix, p2 ! ix, p3 ! ix, p4 ! ix))
330 | 
331 |   readFrom (Quadruple ns1 ns2 ns3 ns4)  =
332 |     let p1 = readFrom ns1
333 |         p2 = readFrom ns2
334 |         p3 = readFrom ns3
335 |         p4 = readFrom ns4
336 |     in (p1,p2,p3,p4)
337 | 


--------------------------------------------------------------------------------
/Obsidian/Program.hs:
--------------------------------------------------------------------------------
  1 | {- Joel Svensson 2012,2013,2014
  2 | 
  3 |    Notes:
  4 |    2014      : starting a big overhauling
  5 |    2013-04-02: Added a Break statement to the language.
  6 |                Use it to break out of sequential loops.
  7 |    2013-01-08: removed number-of-blocks field from ForAllBlocks
  8 | 
  9 | -}
 10 | 
 11 | {-# LANGUAGE GADTs #-}
 12 | {-# LANGUAGE EmptyDataDecls #-}
 13 | {-# LANGUAGE FlexibleInstances #-}
 14 | {-# LANGUAGE MultiParamTypeClasses #-}
 15 | {-# LANGUAGE ScopedTypeVariables #-}
 16 | {-# LANGUAGE TypeOperators #-}
 17 | {-# LANGUAGE FlexibleContexts #-}
 18 | {-# LANGUAGE UndecidableInstances #-}
 19 | {-# LANGUAGE ExplicitNamespaces #-}
 20 | 
 21 | {-# LANGUAGE TypeFamilies #-}
 22 | {-# LANGUAGE DataKinds #-}
 23 | {-# LANGUAGE ConstraintKinds #-}
 24 | ------------------------------------------
 25 | {- LANGUAGE FunctionalDependencies -}
 26 | 
 27 | module Obsidian.Program  (
 28 |   -- Hierarchy
 29 |   Thread, Block, Grid, Warp, Step,
 30 |   --  Step, Zero,
 31 |   -- Program type
 32 |   -- CoreProgram(..),
 33 |   Program(..), -- all exported.. for now
 34 |   TProgram, BProgram, GProgram, WProgram,
 35 | 
 36 |   -- Class
 37 |   type (*<=*),
 38 | 
 39 |   -- helpers
 40 |   printPrg,
 41 |   runPrg,
 42 |   uniqueNamed, uniqueNamed_,
 43 | 
 44 |   allocate, declare,
 45 |   atomicOp,
 46 |   -- Programming interface
 47 |   seqFor, forAll, seqWhile, sync, distrPar, forAll2,
 48 |   singleThread
 49 | 
 50 | 
 51 |   ) where
 52 | 
 53 | import Data.Word
 54 | 
 55 | import Obsidian.Exp
 56 | import Obsidian.Types
 57 | import Obsidian.Globs
 58 | import Obsidian.Atomic
 59 | 
 60 | import Control.Applicative
 61 | 
 62 | 
 63 | ---------------------------------------------------------------------------
 64 | -- Thread/Block/Grid
 65 | ---------------------------------------------------------------------------
 66 | 
 67 | data Thread
 68 | data Step t
 69 | 
 70 | type Warp  = Step Thread
 71 | type Block = Step Warp
 72 | type Grid  = Step Block
 73 | 
 74 | 
 75 | -- | Type level less-than-or-equal test.
 76 | type family LessThanOrEqual a b where
 77 |    LessThanOrEqual Thread   Thread   = True
 78 |    LessThanOrEqual Thread   (Step m) = True
 79 |    LessThanOrEqual (Step n) (Step m) = LessThanOrEqual n m
 80 |    LessThanOrEqual x y               = False
 81 | 
 82 | 
 83 | -- | This constraint is a more succinct way of requiring that @a@ be less than or equal to @b@.
 84 | type a *<=* b = (LessThanOrEqual a b ~ True)
 85 | 
 86 | 
 87 | ---------------------------------------------------------------------------
 88 | 
 89 | type Identifier = Int
 90 | 
 91 | ---------------------------------------------------------------------------
 92 | -- Program datatype
 93 | --------------------------------------------------------------------------
 94 | data Program t a where
 95 | 
 96 |   Identifier :: Program t Identifier
 97 | 
 98 |   Assign :: Scalar a
 99 |             => Name
100 |             -> [Exp Word32]
101 |             -> (Exp a)
102 |             -> Program Thread ()
103 | 
104 |   -- 4 March 2014, Changed so that AtOp does not return a result.
105 |   -- Change this back later if an application requires.
106 |   AtomicOp :: Scalar a
107 |               => Name        -- Array name
108 |               -> Exp Word32  -- Index to operate on
109 |               -> Atomic a    -- Atomic operation to perform
110 |               -> Program Thread ()
111 | 
112 |   Cond :: Exp Bool
113 |           -> Program Thread ()
114 |           -> Program Thread ()
115 | 
116 |   SeqWhile :: Exp Bool ->
117 |               Program Thread () ->
118 |               Program Thread ()
119 | 
120 |   Break  :: Program Thread ()
121 | 
122 |   -- use threads along one level
123 |   -- Thread, Warp, Block.
124 |   -- Make sure Code generation works when t ~ Thread
125 |   ForAll :: (t *<=* Block) => EWord32
126 |             -> (EWord32 -> Program Thread ())
127 |             -> Program t ()
128 | 
129 |   -- Distribute over Warps yielding a Block
130 |   -- Distribute over Blocks yielding a Grid
131 |   DistrPar :: EWord32
132 |            -> (EWord32 -> Program t ())
133 |            -> Program (Step t) ()
134 | 
135 |   -- BUG: I Need to recognize sequential distribution of
136 |   -- work too in order to set up storage correctly for
137 |   -- arrays allocated in within sequentially distributed work.
138 |   DistrSeq :: EWord32
139 |            -> (EWord32 -> Program t ())
140 |            -> Program t ()
141 | 
142 | 
143 |   SeqFor :: EWord32 -> (EWord32 -> Program t ())
144 |             -> Program t ()
145 | 
146 |   -- Allocate shared memory in each MP
147 |   -- Can be done from any program level.
148 |   -- Since the allocation happens block-wise though
149 |   -- it is important to figure out how many instances of
150 |   -- that t level program that needs memory! (messy)
151 |   Allocate :: Name -> Word32 -> Type -> Program t ()
152 | 
153 |   -- Automatic Variables
154 |   Declare :: Name -> Type -> Program t ()
155 | 
156 |   Sync     :: (t *<=* Block) => Program t ()
157 | 
158 |   -- Monad
159 |   Return :: a -> Program t a
160 |   Bind   :: Program t a -> (a -> Program t b) -> Program t b
161 | 
162 | ---------------------------------------------------------------------------
163 | -- Aliases
164 | ---------------------------------------------------------------------------
165 | type TProgram = Program Thread
166 | type WProgram = Program Warp
167 | type BProgram = Program Block
168 | type GProgram = Program Grid
169 | 
170 | ---------------------------------------------------------------------------
171 | -- Helpers
172 | ---------------------------------------------------------------------------
173 | uniqueSM = do
174 |   id <- Identifier
175 |   return $ "arr" ++ show id
176 | 
177 | uniqueNamed pre = do
178 |   id <- Identifier
179 |   return $ pre ++ show id
180 | 
181 | uniqueNamed_ pre = do id <- Identifier
182 |                       return $ pre ++ show id
183 | ---------------------------------------------------------------------------
184 | -- Memory
185 | ---------------------------------------------------------------------------
186 | assign :: Scalar a => Name -> [Exp Word32] -> Exp a -> Program Thread ()
187 | assign nom ix e = Assign nom ix e
188 | 
189 | allocate :: Name -> Word32 -> Type -> Program t ()
190 | allocate nom l t = Allocate nom l t
191 | 
192 | declare :: Name -> Type -> Program t ()
193 | declare nom t = Declare nom t
194 | 
195 | ---------------------------------------------------------------------------
196 | -- atomicOp 
197 | ---------------------------------------------------------------------------
198 | atomicOp :: Scalar a
199 |             => Name        -- Array name
200 |             -> Exp Word32  -- Index to operate on
201 |             -> Atomic a    -- Atomic operation to perform
202 |             -> Program Thread ()
203 | atomicOp nom ix atop = AtomicOp nom ix atop
204 | 
205 | ---------------------------------------------------------------------------
206 | -- forAll
207 | ---------------------------------------------------------------------------
208 | forAll :: (t *<=* Block) => EWord32
209 |           -> (EWord32 -> Program Thread ())
210 |           -> Program t ()
211 | forAll n f = ForAll n f
212 | 
213 | forAll2 :: (t *<=* Block) => EWord32
214 |            -> EWord32
215 |            -> (EWord32 -> EWord32 -> Program Thread ())
216 |            -> Program (Step t) ()
217 | forAll2 b n f =
218 |   DistrPar b $ \bs ->
219 |     ForAll n $ \ix -> f bs ix
220 | 
221 | distrPar :: EWord32
222 |            -> (EWord32 -> Program t ())
223 |            -> Program (Step t) ()
224 | distrPar b f = DistrPar b $ \bs -> f bs
225 | 
226 | ---------------------------------------------------------------------------
227 | -- Let a single thread perform  of a block/Warp perform a given
228 | -- Thread program
229 | ---------------------------------------------------------------------------
230 | singleThread :: (t *<=* Block) => Program Thread () -> Program t ()
231 | singleThread p =
232 |   forAll 1 (\_ -> p)
233 | 
234 | -- seqFor
235 | ---------------------------------------------------------------------------
236 | seqFor :: EWord32 -> (EWord32 -> Program t ()) -> Program t ()
237 | seqFor (Literal 1) f = f 0
238 | seqFor n f = SeqFor n f
239 | 
240 | ---------------------------------------------------------------------------
241 | -- seqWhile
242 | ---------------------------------------------------------------------------
243 | seqWhile :: Exp Bool -> Program Thread () -> Program Thread ()
244 | seqWhile b prg = SeqWhile b prg
245 | 
246 | ---------------------------------------------------------------------------
247 | -- Monad
248 | --------------------------------------------------------------------------
249 | instance Monad (Program t) where
250 |   return = Return
251 |   (>>=) = Bind
252 | 
253 | ---------------------------------------------------------------------------
254 | -- Functor
255 | ---------------------------------------------------------------------------
256 | instance Functor (Program t) where
257 |   fmap g fa = do {a <- fa; return $ g a}
258 | 
259 | ---------------------------------------------------------------------------
260 | -- Applicative
261 | ---------------------------------------------------------------------------
262 | instance Applicative (Program t) where
263 |   pure = return
264 |   ff <*> fa =
265 |     do
266 |       f <- ff
267 |       fmap f fa
268 | 
269 | ---------------------------------------------------------------------------
270 | -- sync function
271 | ---------------------------------------------------------------------------
272 | sync :: (t *<=* Block) => Program t ()
273 | sync = Sync
274 | 
275 | ---------------------------------------------------------------------------
276 | -- runPrg (RETHINK!) (Works for Block programs, but all?)
277 | ---------------------------------------------------------------------------
278 | runPrg :: Int -> Program t a -> (a,Int)
279 | runPrg i Identifier = (i,i+1)
280 | 
281 | -- Maybe these two are the most interesting cases!
282 | -- Return may for example give an array.
283 | runPrg i (Return a) = (a,i)
284 | runPrg i (Bind m f) =
285 |   let (a,i') = runPrg i m
286 |   in runPrg i' (f a)
287 | 
288 | -- All other constructors have () result
289 | 
290 | runPrg i (Sync) = ((),i)
291 | runPrg i (ForAll n ixf) =
292 |   let (p,i') = runPrg i (ixf (variable "tid"))
293 |   in  (p,i')
294 | runPrg i (DistrPar n f) =
295 |   let (p,i') = runPrg i (f (variable "DUMMY"))
296 |   in (p,i')
297 | -- What can this boolean depend upon ? its quite general!
298 | -- p here is a Program Thread ()
299 | runPrg i (Cond b p) = ((),i)
300 | runPrg i (Declare _ _) = ((),i)
301 | runPrg i (Allocate _ _ _ ) = ((),i)
302 | runPrg i (Assign _ _ a) = ((),i)
303 | runPrg i (AtomicOp _ _ _) = ((),i) -- variable ("new"++show i),i+1)
304 | 
305 | {- What do I want from runPrg ?
306 | 
307 |    # I want to it to "work" for all block programs (no exceptions)
308 |    # I want a BProgram (Pull a) to return a Pull array of "correct length)
309 | -}
310 | 
311 | 
312 | ---------------------------------------------------------------------------
313 | -- printPrg (REIMPLEMENT)
314 | ---------------------------------------------------------------------------
315 | printPrg :: Program t a -> String
316 | printPrg prg = (\(_,x,_) -> x) $ printPrg' 0 prg
317 | 
318 | printPrg' :: Int -> Program t a -> (a,String,Int)
319 | printPrg' i Identifier = (i,"getId;\n",i+1)
320 | printPrg' i (Assign n ix e) =
321 |   ((),n ++ "[" ++ show ix ++ "] = " ++ show e ++ ";\n", i)
322 | printPrg' i (AtomicOp n ix e) =
323 |   let newname = "r" ++ show i
324 |   --in (variable newname,
325 |   --    newname ++ " = " ++ printAtomic e ++
326 |   --    "( " ++ n ++ "[" ++ show ix ++ "])\n",i+1)
327 |   in ((), printAtomic e ++
328 |           "( " ++ n ++ "[" ++ show ix ++ "])\n",i+1)
329 | printPrg' i (Allocate id n t) =
330 |   let newname = id -- "arr" ++ show id
331 |   in ((),newname ++ " = malloc(" ++ show n ++ ");\n",i+1)
332 | printPrg' i (Declare id t) =
333 |   let newname = id -- "arr" ++ show id
334 |   in ((),show t ++ " " ++ newname ++ "\n",i+1)
335 | printPrg' i (SeqFor n f) =
336 |   let (a,prg2,i') = printPrg' i (f (variable "i"))
337 | 
338 |   in ( a,
339 |        "for (i in 0.." ++ show n ++ ")" ++
340 |        "{\n" ++ prg2 ++ "\n}",
341 |        i')
342 | 
343 | printPrg' i (ForAll n f) =
344 |   let (a,prg2,i') = printPrg' i (f (variable "i"))
345 | 
346 |   in ( a,
347 |        "par (i in 0.." ++ show n ++ ")" ++
348 |        "{\n" ++ prg2 ++ "\n}",
349 |        i')
350 | --printPrg' i (ForAllBlocks n f) =
351 | --  let (d,prg2,i') = printPrg' i (f (variable "BIX"))
352 | --  in (d,
353 | --      "blocks (i)" ++
354 | --      "{\n" ++ prg2 ++ "\n}",
355 | --      i')
356 | printPrg' i (Return a) = (a,"MonadReturn;\n",i)
357 | printPrg' i (Bind m f) =
358 |   let (a1, str1,i1) = printPrg' i m
359 |       (a2,str2,i2) = printPrg' i1 (f a1)
360 |   in (a2,str1 ++ str2, i2)
361 | printPrg' i Sync = ((),"Sync;\n",i)
362 | 


--------------------------------------------------------------------------------
/Obsidian/CodeGen/Program.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE GADTs,
  2 |              ExistentialQuantification,
  3 |              FlexibleInstances  #-}
  4 | {-# LANGUAGE TypeSynonymInstances #-}
  5 | {-# LANGUAGE GeneralizedNewtypeDeriving #-}
  6 | {-# LANGUAGE ConstraintKinds     #-}
  7 | {-# LANGUAGE DataKinds           #-}
  8 | 
  9 | {- CodeGen.Program.
 10 | 
 11 |    Joel Svensson 2012, 2013
 12 | 
 13 |    Notes:
 14 |      2013-03-17: Codegeneration is changing
 15 | -} 
 16 | 
 17 | 
 18 | module Obsidian.CodeGen.Program where
 19 | 
 20 | import Obsidian.Exp
 21 | import Obsidian.Globs
 22 | import Obsidian.Types
 23 | import Obsidian.Atomic
 24 | 
 25 | import qualified Obsidian.Program as P
 26 | 
 27 | import Data.Word
 28 | import Data.Supply
 29 | import Data.List
 30 | 
 31 | import System.IO.Unsafe
 32 | 
 33 | import Control.Monad.State
 34 | import Control.Applicative 
 35 | 
 36 | 
 37 | ---------------------------------------------------------------------------
 38 | -- New Intermediate representation
 39 | ---------------------------------------------------------------------------
 40 | 
 41 | type IMList a = [(Statement a,a)]
 42 | 
 43 | type IM = IMList ()
 44 | 
 45 | out :: a -> [(a,())] 
 46 | out a = [(a,())]
 47 | 
 48 | -- Atomic operations
 49 | data AtOp = AtInc
 50 |           | AtAdd  IExp 
 51 |           | AtSub  IExp 
 52 |           | AtExch IExp 
 53 | 
 54 | 
 55 | data HLevel = Thread
 56 |             | Warp
 57 |             | Block
 58 |             | Grid
 59 |             
 60 | 
 61 | -- Statements 
 62 | data Statement t = SAssign IExp [IExp] IExp
 63 |                  | SAtomicOp IExp IExp AtOp
 64 |                  | SCond IExp (IMList t) 
 65 |                  | SSeqFor String IExp (IMList t)
 66 |                  | SBreak
 67 |                  | SSeqWhile IExp (IMList t)
 68 | 
 69 |                  --               Iters   Body
 70 |                  | SForAll HLevel IExp    (IMList t)
 71 |                  | SDistrPar HLevel IExp  (IMList t)
 72 |                    
 73 | --                 | SForAllBlocks IExp (IMList t)
 74 | --                 | SNWarps IExp (IMList t)
 75 | --                 | SWarpForAll  IExp (IMList t)                    
 76 | --                 | SWarpForAll String   String IExp (IMList t) 
 77 | 
 78 |     -- Memory Allocation..
 79 |                  | SAllocate Name Word32 Type
 80 |                  | SDeclare  Name Type
 81 | 
 82 |     -- Synchronisation
 83 |                  | SSynchronize
 84 | 
 85 | 
 86 | --------------------------------------------------------------------------- 
 87 | -- Collect and pass around data during first step compilation
 88 | data Context = Context { ctxNWarps :: Maybe Word32,
 89 |                          ctxNThreads :: Maybe Word32, 
 90 |                          ctxGLBUsesTid :: Bool,
 91 |                          ctxGLBUsesWid :: Bool}
 92 | 
 93 | newtype CM a = CM (State Context a)
 94 |    deriving (Monad, MonadState Context, Functor, Applicative)
 95 | 
 96 | runCM :: CM a -> Context -> a 
 97 | --runCM (CM cm) ctx = evalState cm ctx
 98 | runCM (CM cm) = evalState  cm 
 99 | 
100 | evalCM :: CM a -> Context -> (a, Context)
101 | evalCM (CM cm) = runState cm 
102 | 
103 | setUsesTid :: CM ()
104 | setUsesTid = modify $ \ctx -> ctx { ctxGLBUsesTid = True } 
105 | 
106 | setUsesWid :: CM ()
107 | setUsesWid = modify $ \ctx -> ctx { ctxGLBUsesWid = True } 
108 | 
109 | enterWarp :: Word32 -> CM ()
110 | enterWarp  n = modify $ \ctx -> ctx { ctxNWarps = Just n }
111 | 
112 | enterThread :: Word32 -> CM ()
113 | enterThread n = modify $ \ctx -> ctx {ctxNThreads = Just n} 
114 | 
115 | clearWarp :: CM ()
116 | clearWarp = modify $ \ctx -> ctx {ctxNWarps = Nothing}
117 | 
118 | getNWarps :: CM (Maybe Word32)
119 | getNWarps = do
120 |   ctx <- get
121 |   return $ ctxNWarps ctx
122 | 
123 | getNThreads :: CM (Maybe Word32)
124 | getNThreads = do
125 |   ctx <- get
126 |   return $ ctxNThreads ctx 
127 | 
128 | emptyCtx :: Context
129 | emptyCtx = Context Nothing Nothing False False
130 | ---------------------------------------------------------------------------
131 | 
132 | 
133 | -- Sort these out and improve! 
134 | usesWarps :: IMList t -> Bool
135 | usesWarps = any (go . fst)
136 |   where
137 |     go (SDistrPar _ _ im) = usesWarps im 
138 |     go (SForAll Warp _ _) = True
139 |     go _ = False
140 | 
141 | usesTid :: IMList t -> Bool
142 | usesTid = any (go . fst)
143 |   where
144 |     go (SDistrPar _ _ im) = usesTid im 
145 |     go (SForAll Block _ _) = True
146 |     go (SSeqFor _ _ im) = usesTid im 
147 |     go _ = False
148 | usesBid :: IMList t -> Bool
149 | usesBid = any (go . fst)
150 |   where
151 |     go (SDistrPar Block _ _) = True -- usesBid im
152 |     -- go (SForAll Block _ _) = True
153 |     go _ = False
154 | usesGid :: IMList t -> Bool
155 | usesGid = any (go . fst)
156 |   where
157 |     go (SForAll Grid _ _) = True
158 |     go _ = False
159 |  
160 | 
161 | 
162 | ---------------------------------------------------------------------------
163 | -- COmpilation of Program to IM
164 | --------------------------------------------------------------------------- 
165 | 
166 | compileStep1 :: Compile t => P.Program t a -> IM
167 | compileStep1 p = snd $ runCM (compile ns p) emptyCtx
168 |   where
169 |     ns = unsafePerformIO$ newEnumSupply
170 | 
171 | 
172 | class Compile t where
173 |   compile :: Supply Int -> P.Program t a -> CM (a,IM)
174 | 
175 | -- Compile Thread program 
176 | instance Compile P.Thread  where
177 |   -- Can add cases for P.ForAll here.
178 |   -- Turn into sequential loop. Could be important to make push
179 |   -- operate uniformly across entire hierarchy.
180 |   compile s (P.ForAll n f) = 
181 |     do
182 |       let (i1,i2) = split2 s
183 |           nom = "i" ++ show (supplyValue i1)
184 |           v = variable nom
185 |           p = f v
186 |       (a,im) <-  compile i2 p
187 | 
188 |       return ((),out $ SSeqFor nom (expToIExp n) im)
189 |   compile s (P.Allocate nom n t) = do
190 |     (Just nt) <- getNThreads -- must be a Just at this point
191 |     nw' <- getNWarps
192 | 
193 |     let nw = case nw' of
194 |           Nothing -> 1
195 |           Just i  -> i
196 |     
197 |     return ((),out $ SAllocate nom (nt*nw*n) t)
198 |   compile _ (P.Sync) =
199 |     return ((),[])
200 | 
201 |   compile s p = cs s p 
202 | 
203 | -- Compile Warp program
204 | instance Compile P.Warp where
205 |   compile s (P.DistrPar n f) =
206 |     error "Currently not supported to distribute over the threads, use ForAll instead!"
207 |   compile s (P.ForAll n@(Literal n') f) = do
208 | 
209 |     -- setup context to know number of threads
210 |     -- executing 
211 |     enterThread n'
212 |     
213 |     let p = f (variable "warpIx") 
214 |     (a,im) <- compile s p 
215 |     return (a, out $ SForAll Warp (expToIExp n) im)
216 |     --undefined -- compile a warp program that iterates over a space n large
217 |   compile s (P.Allocate nom n t) = do
218 |     (Just nw) <- getNWarps -- Must be a Just here, or something is wrong!
219 |     return ((),out $ SAllocate nom (nw*n) t)
220 |   compile s (P.Bind p f) = do
221 |     let (s1,s2) = split2 s
222 |     (a,im1) <- compile s1 p
223 |     (b,im2) <- compile s2 (f a)
224 |     return (b,(im1 ++ im2))
225 |   compile s (P.Return a) = return (a,[])
226 |   compile s (P.Identifier) = return (supplyValue s, [])
227 |   compile s (P.Sync) = return ((),[])
228 |   -- Why no fallthrough here ?
229 |   -- Adding (must have been a horrible mistake!)
230 |   compile s p = cs s p
231 |   
232 | -- Compile Block program 
233 | instance Compile P.Block where
234 |   compile s (P.ForAll n@(Literal n') f) = do
235 | 
236 |     -- Set up the context to know the number of
237 |     -- concurrent thread programs that are executing. 
238 |     enterThread n'
239 |     setUsesTid
240 |     
241 |     let nom = "tid"
242 |         v   = variable nom
243 |         p   = f v
244 |     
245 |     (a,im) <-  compile s p
246 |     -- in this case a could be () (since it is guaranteed to be anyway). a
247 |     return (a,out (SForAll Block (expToIExp n) im))
248 |     
249 |   compile s (P.DistrPar n'@(Literal n) f) = do
250 |     
251 |     {- Distribute work over warps! -}
252 |     -- Set up the context for the compilation
253 |     -- of the Warp code.
254 |     -- BUG: Something like this is needed for distribution
255 |     -- over threads too!
256 |     -- FIXED: Bug mentioned above should be (at least) partially fixed. 
257 |     enterWarp n
258 |     -- Number of active warps are stored in the context. 
259 |     (a,im) <- compile s (f (variable "warpID"))
260 |     return (a, out (SDistrPar Warp (expToIExp n') im))
261 |   compile s (P.Allocate id n t) = return ((),out (SAllocate id n t))
262 |   compile s (P.Sync) = return ((),out (SSynchronize))
263 |   compile s p = cs s p
264 | 
265 | -- Compile a Grid Program 
266 | instance Compile P.Grid where
267 |   -- compile s (P.ForAll n f) = do
268 |     
269 |   --   -- Incorrect, need to compute global thread ids and apply 
270 |   --   let p = f gid -- (BlockIdx X)
271 |   --       gid = variable "gid" 
272 |   --   (a,im) <- compile s p 
273 |   --   return (a, out (SForAll Grid (expToIExp n) im))
274 | 
275 |   {- Distribute over blocks -}
276 |   compile s (P.DistrPar n f) = do
277 |     -- Need to generate IM here that the backend can read desired number of blocks from
278 |     let p = f (variable "bid") -- (BlockIdx X) 
279 |     
280 |     (a, im) <- compile s p -- (f (BlockIdx X)) 
281 |     return (a, out (SDistrPar Block (expToIExp n) im))
282 |   compile s (P.Allocate _ _ _) = error "Allocate at level Grid" 
283 |   compile s p = cs s p
284 | 
285 | 
286 | 
287 | 
288 | ---------------------------------------------------------------------------
289 | -- General compilation
290 | ---------------------------------------------------------------------------
291 | cs :: forall t a . Compile t => Supply Int -> P.Program t a -> CM (a,IM) 
292 | cs i P.Identifier = return $ (supplyValue i, [])
293 | cs i (P.Assign name ix e) =
294 |   return $ ((),out (SAssign (IVar name (typeOf e)) (map expToIExp ix) (expToIExp e)))
295 | 
296 | cs i (P.AtomicOp name ix atom) =
297 |   case atom of
298 |     AtomicInc -> return $ ((),out (SAtomicOp (IVar name Word32) (expToIExp ix) AtInc))
299 |     AtomicAdd e -> undefined
300 |     AtomicSub e -> undefined
301 |     AtomicExch e -> undefined 
302 |   -- (vres, out im)
303 |   -- where
304 |   --   res = "a" ++ show (supplyValue i)
305 |   --   vres = IVar res (typeOf (undefined :: a))
306 |   --   vname = IVar name (typeOf (undefined :: a))
307 |   --   im = SAtomicOp vres vname ix atom
308 | 
309 | --cs i (P.AtomicOp name ix at) = (v,out im)
310 | --  where 
311 | --    nom = "a" ++ show (supplyValue i)
312 | --    v = variable nom
313 | --    im = SAtomicOp nom name ix at
314 |       
315 | cs i (P.Cond bexp p) = do
316 |   ((),im) <-  compile i p
317 |   return ((),out (SCond (expToIExp bexp) im)) 
318 |  
319 | 
320 | cs i (P.SeqFor n f) = do
321 |   let (i1,i2) = split2 i
322 |       nom = "i" ++ show (supplyValue i1)
323 |       v = variable nom
324 |       p = f v
325 |   (a,im) <-  compile i2 p
326 |   
327 |   return (a,out (SSeqFor nom (expToIExp n) im))
328 | 
329 |     
330 | cs i (P.SeqWhile b p) = do
331 |   (a,im) <-  compile i p
332 |   return (a, out (SSeqWhile (expToIExp b) im))
333 | 
334 |     
335 | 
336 | cs i (P.Break) = return ((), out SBreak)
337 | 
338 | -- This (Allocate) should be covered by the Hierarchy instances
339 | -- above and should be removed from here. 
340 | --cs i (P.Allocate id n t) = return ((),out (SAllocate id n t))
341 | cs i (P.Declare  id t)   = return ((),out (SDeclare id t))
342 | 
343 | -- cs i (P.Sync)            = return ((),out (SSynchronize))
344 | 
345 | 
346 | cs i (P.Bind p f) = do 
347 |   let (s1,s2) = split2 i
348 |   (a,im1) <- compile s1 p
349 |   (b,im2) <- compile s2 (f a)
350 | 
351 |   return (b,im1 ++ im2) 
352 |  
353 |  
354 | cs i (P.Return a) = return (a,[])
355 | 
356 | 
357 | -- Unhandled cases 
358 | cs i p = error $ "#Program.hs# unhandled in cs: " ++ P.printPrg p -- compile i p 
359 | 
360 | ---------------------------------------------------------------------------
361 | -- Turning IM to strings
362 | ---------------------------------------------------------------------------
363 | 
364 | printIM :: Show a => IMList a -> String 
365 | printIM im = concatMap printStm im
366 |   
367 | -- Print a Statement with metadata 
368 | printStm :: Show a => (Statement a,a) -> String
369 | printStm (SAssign name [] e,m) =
370 |   show name ++ " = " ++ show e ++ ";" ++ meta m
371 | printStm (SAssign name ix e,m) =
372 |   show name ++ "[" ++ concat (intersperse "," (map show ix)) ++ "]" ++
373 |   " = " ++ show e ++ ";" ++ meta m
374 | --printStm (SAtomicOp res arr ix op,m) =
375 | --  res ++ " = " ++
376 | --  printAtomic op ++ "(" ++ arr ++ "[" ++ show ix ++ "]);" ++ meta m
377 | printStm (SAllocate name n t,m) =
378 |   name ++ " = malloc(" ++ show n ++ ");" ++ meta m
379 | printStm (SDeclare name t,m) =
380 |   show t ++ " " ++ name ++ ";" ++ meta m
381 | printStm (SCond bexp im,m) =
382 |   "if " ++ show bexp ++ "{\n" ++ 
383 |   concatMap printStm im ++ "\n};" ++ meta m
384 | 
385 | printStm (SSynchronize,m) =
386 |   "sync();" ++ meta m
387 |   
388 | printStm (SSeqFor name n im,m) =
389 |   "for " ++ name  ++ " in [0.." ++ show n ++"] do" ++ meta m ++ 
390 |   concatMap printStm im ++ "\ndone;\n"
391 | 
392 | 
393 | printStm (SForAll Warp n im,m) =
394 |   "forAll wid" ++ "  in [0.." ++ show n ++"] do" ++ meta m ++
395 |   concatMap printStm im ++ "\ndone;\n"
396 | 
397 | printStm (SForAll Block n im,m) =
398 |   "forAll tid" ++ "  in [0.." ++ show n ++"] do" ++ meta m ++
399 |   concatMap printStm im ++ "\ndone;\n"
400 | 
401 | printStm (SForAll Grid n im,m) =
402 |   "forAll gid in [0.." ++ show n ++"] do" ++ meta m ++
403 |   concatMap printStm im ++ "\ndone;\n"
404 | 
405 | printStm (SDistrPar lvl n im,m) = 
406 |   "forAll gid in [0.." ++ show n ++"] do" ++ meta m ++
407 |   concatMap printStm im ++ "\ndone;\n"
408 | 
409 | -- printStm (SWarpForAll n im ,m) =
410 | --   "forAll(InWarp) tid" ++ "  in [0.." ++ show n ++"] do" ++ meta m ++
411 | --   concatMap printStm im ++ "\ndone;\n"
412 | 
413 | -- printStm (SNWarps n im,m) = "Run " ++ show n++ " Warps {\n" ++
414 | --                             printIM im ++ "\n }"
415 | --printStm (SForAllThreads n im,m) =
416 | --  "forAllThreads i in [0.." ++ show n ++"] do" ++ meta m ++ 
417 | --  concatMap printStm im ++ "\ndone;\n"
418 | 
419 | 
420 |   
421 | -- printStm (a,m) = error $ show m 
422 | 
423 | meta :: Show a => a -> String
424 | meta m = "\t//" ++ show m ++ "\n" 
425 | 
426 | 
427 | 
428 |       
429 | 
430 | 


--------------------------------------------------------------------------------
/Obsidian/CodeGen/Memory2.hs:
--------------------------------------------------------------------------------
  1 | 
  2 | {- Joel Svensson 2012, 2013
  3 | 
  4 |    Notes:
  5 |      Jan-27-2015: Bug fix related to arrays alive when
  6 |                   entering into loops.
  7 |                   Fix seems to solve the problem.
  8 |                   Need to investigate that it still frees arrays
  9 |                   as soon as possible. 
 10 |      Nov-25-2014: Changes to memory management
 11 |   
 12 |      Jan-21-2013: Added a SeqFor case 
 13 | 
 14 |  -} 
 15 | module Obsidian.CodeGen.Memory2 
 16 |        (MemMap,
 17 |         Memory,
 18 |         allocate,
 19 |         free,
 20 |         freeAll,
 21 |         size, 
 22 |         Address,
 23 |         Bytes,
 24 |         memMapIM,
 25 |         renameIM,
 26 |         SharedMemConfig(..),
 27 |         createSharedMem
 28 |         ) 
 29 |        where 
 30 | 
 31 | import qualified Data.List as List
 32 | import qualified Data.Set  as Set
 33 | import Data.Word
 34 | import Data.Maybe 
 35 | 
 36 | import Obsidian.Types
 37 | import Obsidian.Globs
 38 | 
 39 | import Obsidian.Exp 
 40 | import Obsidian.CodeGen.Program
 41 | import Obsidian.CodeGen.Liveness
 42 | 
 43 | import Debug.Trace 
 44 | 
 45 | 
 46 | import qualified Data.Map as Map 
 47 | ---------------------------------------------------------------------------
 48 | -- Planned improvements
 49 | ---------------------------------------------------------------------------
 50 | -- # Always start a shared memory array at a "bank-aligned" address
 51 | --      + So that programmer can really effect access patterns. 
 52 | -- # Do not rename with pointers
 53 | --      instead output a list of (type ,name,address) quadruples
 54 | --      that can be used to create an alias (at top scope of program)
 55 | --      + Prettier code -> easier debugging 
 56 | --      + Potential efficiency issues, from less casting etc
 57 | -- # (LONG-TERM) Clever memory allocation
 58 | --      + The future is known! So encode the optimal memory
 59 | --        allocation schema
 60 | 
 61 | ---------------------------------------------------------------------------
 62 | -- Memory layout
 63 | ---------------------------------------------------------------------------
 64 | 
 65 | type MemMap = Map.Map Name (AlignedAddress,Type)
 66 | 
 67 | type AlignedAddress = (Address,Address) 
 68 | 
 69 | type Address = Word32
 70 | type Bytes   = Word32 
 71 | 
 72 | data Memory = Memory {freeList  :: [(Address,Bytes)] ,
 73 |                       allocated :: [(Address,Bytes)] , 
 74 |                       size      :: Bytes} -- how much used
 75 |             deriving Show 
 76 | 
 77 | updateMax :: Memory -> Memory 
 78 | updateMax mem = let m = maximum [a+b|(a,b) <- allocated mem]
 79 |                     m' = max m (size mem)
 80 |                 in mem {size = m'}
 81 |               
 82 | 
 83 | ---------------------------------------------------------------------------
 84 | -- Shared memory configurations
 85 | ---------------------------------------------------------------------------
 86 | data SharedMemConfig =
 87 |   SharedMemConfig { smSize :: Bytes  -- amount of shared mem
 88 |                   , smBanks :: Word32   -- Number of banks 16/32
 89 |                   , smBankAlign :: Bool 
 90 |                   }
 91 | 
 92 | 
 93 | createSharedMem :: SharedMemConfig -> Memory
 94 | createSharedMem conf = Memory [(0,smSize conf)] [] 0 
 95 | 
 96 | -- bank allign an address, returning a new aligned address
 97 | -- and the number of EXTRA bytes that needs to be present
 98 | -- from the old provided address in order to store aligned data
 99 | -- at the new location.
100 | bank_align :: SharedMemConfig -> Address -> (AlignedAddress, Bytes)
101 | bank_align conf address =
102 |   case (how_far_off == 0) of
103 |     True -> ((address,address),0)
104 |     False -> ((address,address + bump),  bump) 
105 |     
106 |   where banks = smBanks conf
107 |         -- if address % bank_alignment == 0
108 |         --   the address is aligned
109 |         bank_alignment = banks * 4 -- number of banks * 4 bytes
110 |         
111 |         how_far_off = address `mod`  bank_alignment
112 |         bump =  bank_alignment - how_far_off 
113 | 
114 | ---------------------------------------------------------------------------
115 | -- Allocate memory
116 | --------------------------------------------------------------------------- 
117 | allocate :: SharedMemConfig -> Memory -> Bytes -> (Memory,AlignedAddress)
118 | allocate conf m b =
119 |   case smBankAlign conf of
120 |     True ->
121 |        -- Does any memory location exist that
122 |        -- allows for the allocation of this array 
123 |        case catMaybes new_candidates of
124 |          [] -> error $ "allocate: out of shared memory:" ++
125 |                        "\n  Allocating: " ++ show b ++ " bytes" ++
126 |                        "\n  Free List: " ++ show (freeList m) ++ 
127 |                        "\n  Potentials: " ++ show address_candidates ++ 
128 |                        "\n  Fit with align: " ++ show new_candidates 
129 |          
130 |          ((aligned_address,free_space,alloc_size):_) ->
131 |            -- update free list
132 |            -- Clear the allocated address from the free list 
133 |            let fl  = filter (\(addr,_) -> (fst aligned_address /= addr)) (freeList m)
134 |                -- if the chosen space is larger than what we need
135 |                -- add the unused chunk to the free list 
136 |                fl' = if alloc_size < free_space
137 |                      then (fst aligned_address + alloc_size,
138 |                            free_space - alloc_size):fl
139 |                      else fl
140 |            -- Update memory and return a result address
141 |            in (updateMax $ m { freeList = fl'
142 |                             , allocated =
143 |                               (fst aligned_address,alloc_size):allocated m}
144 |                , aligned_address)
145 |               
146 |     False ->
147 |       case map (pretend_align b) address_candidates of
148 |         [] -> error "out of shared memory"
149 |          
150 |         ((aligned_address,free_space,alloc_size):_) ->
151 |            -- update free list
152 |            -- Clear the allocated address from the free list 
153 |            let fl  = filter (\(addr,_) -> (fst aligned_address /= addr)) (freeList m)
154 |                -- if the chosen space is larger than what we need
155 |                -- add the unused chunk to the free list 
156 |                fl' = if alloc_size < free_space
157 |                      then (fst aligned_address + alloc_size,
158 |                            free_space - alloc_size):fl
159 |                      else fl
160 |            -- Update memory and return a result address
161 |            in (updateMax $ m { freeList = fl'
162 |                             , allocated =
163 |                               (fst aligned_address,alloc_size):allocated m}
164 |                , aligned_address)
165 | 
166 |       
167 |                     
168 | 
169 |   where
170 |     -- Candidates after aligning
171 |     new_candidates = map (tryCandidate b) address_candidates
172 |     -- Original address canditades 
173 |     address_candidates = filter (\(_,y) -> y >= b) $ freeList m 
174 |     -- Create silly AlignedAddress (that are not really aligned at all)
175 |     pretend_align bytes (addr, free_space) = ((addr,addr),free_space,bytes) 
176 | 
177 |     -- try to align an address
178 |     -- results in an AlignedAdress
179 |     tryCandidate bytes (addr, free_space) =
180 |       let (aligned_addr, extra_bytes) = bank_align conf addr
181 |           alloc_size = bytes + extra_bytes
182 |       in
183 |        case free_space >= alloc_size of 
184 |          True -> Just (aligned_addr,free_space,alloc_size)
185 |          False -> Nothing 
186 | 
187 | ---------------------------------------------------------------------------
188 | -- Free memory 
189 | ---------------------------------------------------------------------------
190 | free :: Memory -> AlignedAddress -> Memory
191 | free m (alloc_addr,_) = mem 
192 |     where 
193 |       bytes = lookup (alloc_addr) (allocated m)
194 |       al    = filter (\(addr,_) -> alloc_addr /= addr) (allocated m)
195 | 
196 |       -- TODO: Investigate this much closer.
197 |       --       Is it a bug or is freeing a non allocated memory area
198 |       --       OK?
199 |       --  2014-Nov-25: I dont remember what this refers to
200 |       --    But, if a problem resurfaces, look here. 
201 |       
202 |       mem   = case bytes of 
203 |                 Nothing -> m
204 |                 {-
205 |                   error $ "error: Address " ++ show a ++
206 |                           " not found in allocated list" ++
207 |                           "\n" ++ show m
208 |                 -} 
209 |                 Just b -> m {freeList = compress ((alloc_addr,b):(freeList m)),
210 |                              allocated = al}
211 | 
212 | freeAll :: Memory -> [AlignedAddress] -> Memory 
213 | freeAll m [] = m
214 | freeAll m (a:as) = freeAll (free m a) as
215 | 
216 | 
217 | compress :: [(Address,Bytes)] -> [(Address,Bytes)]
218 | compress = merge . List.sort
219 |   where
220 |     merge :: [(Address,Bytes)] -> [(Address,Bytes)]
221 |     merge [] = [] 
222 |     merge [x] = [x]
223 |     merge ((x,b):(y,b2):xs) = if (x+b == y)
224 |                               then merge ((x,b+b2):xs) 
225 |                               else (x,b):merge((y,b2):xs)
226 | 
227 | 
228 | ---------------------------------------------------------------------------
229 | -- Memory map the new IM
230 | ---------------------------------------------------------------------------
231 | 
232 | memMapIM :: SharedMemConfig -> IML -> MemMap -> (Memory, MemMap)
233 | memMapIM conf im memmap = mmIM conf im memory memmap
234 |   where
235 |     memory = createSharedMem conf
236 |   
237 | mmIM :: SharedMemConfig -> IML -> Memory -> MemMap -> (Memory, MemMap)
238 | mmIM conf im memory memmap = r im (memory,memmap)
239 |   where 
240 |     r [] m = m
241 |     r (x:xs) (m,mm) =
242 |       let
243 |           (m',mm') = process conf x m mm
244 |            
245 |           freeable = getFreeableSet x xs
246 |           freeableAddrs = mapM (flip Map.lookup mm') (filter dontMap (Set.toList freeable))
247 |           dontMap name = not ((List.isPrefixOf "input" name) || 
248 |                               (List.isPrefixOf "output" name))
249 |           mNew =
250 |             case freeableAddrs of
251 |               (Just as) -> freeAll m' (map fst as)
252 |               Nothing   -> m'
253 |       in -- trace ("freeable: " ++ show freeable   ++ "\n") $ 
254 |          r xs (mNew,mm')
255 |     
256 |     process :: SharedMemConfig -> (Statement Liveness,Liveness) -> Memory -> MemMap -> (Memory,MemMap)
257 |     process conf (SAllocate name size t,_) m mm = (m',mm') 
258 |       where (m',addr) = allocate conf m size
259 |             mm' =
260 |               case Map.lookup name mm of
261 |                 Nothing -> Map.insert name (addr,t) mm
262 |                 (Just (a, t)) -> error $ "mmIm: " ++ name ++ " is already mapped to " ++ show a
263 | 
264 |     -- Boilerplate
265 |     -- BUG: Bug in memory management related to seqloops
266 |     --      It may be better to try to fix this bug here.
267 |     --      A special mmIM for the loop case may be needed. 
268 |     process conf (SSeqFor _ n im,alive) m mm = mmIMLoop conf alive im m mm
269 |     process conf (SSeqWhile b im,_) m mm = mmIM conf im m mm 
270 |     process conf (SForAll _ n im,_) m mm = mmIM conf im m mm
271 |     -- 2014-Nov-25:
272 |     --   This one used mmIM' which was identical to mmIM.
273 |     --   This must have been a leftover from when I thought
274 |     --   warp memory needed some special attention here. 
275 |     process conf (SDistrPar Warp n im,_) m mm =  mmIMDistrWarp conf im m mm
276 |     process conf (SDistrPar Block n im,_) m mm = mmIM conf im m mm 
277 |     process conf (_,_) m mm = (m,mm) 
278 | 
279 | -- Friday (2013 Mars 29, discovered bug)
280 | -- 2014-Nov-25: was the "l" the bug ? (some details help)
281 | getFreeableSet :: (Statement Liveness,Liveness) -> IML -> Liveness 
282 | getFreeableSet (_,l) [] = Set.empty -- not l ! 
283 | getFreeableSet (_,l) ((_,l1):_) = l Set.\\ l1
284 | 
285 | 
286 | ---------------------------------------------------------------------------
287 | -- 
288 | ---------------------------------------------------------------------------
289 | mmIMLoop conf nonfreeable im memory memmap = r im (memory,memmap)
290 |   where 
291 |     r [] m = m
292 |     r (x:xs) (m,mm) =
293 |       let
294 |           (m',mm') = process conf x m mm
295 |            
296 |           freeable' = getFreeableSet x xs
297 |           freeable  = freeable' Set.\\ nonfreeable
298 |           freeableAddrs = mapM (flip Map.lookup mm') (filter dontMap (Set.toList freeable))
299 |           dontMap name = not ((List.isPrefixOf "input" name) || 
300 |                               (List.isPrefixOf "output" name))
301 |           mNew =
302 |             case freeableAddrs of
303 |               (Just as) -> freeAll m' (map fst as)
304 |               Nothing   -> m'
305 |       in --trace ("freeable': " ++ show freeable' ++ "\n" ++
306 |          --       "freeable: " ++ show freeable   ++ "\n" ++ 
307 |          --       "nonfreeable: " ++ show nonfreeable) $
308 |          r xs (mNew,mm')
309 |     
310 |     process :: SharedMemConfig -> (Statement Liveness,Liveness) -> Memory -> MemMap -> (Memory,MemMap)
311 |     process conf (SAllocate name size t,_) m mm = (m',mm') 
312 |       where (m',addr) = allocate conf m size
313 |             mm' =
314 |               case Map.lookup name mm of
315 |                 Nothing -> Map.insert name (addr,t) mm
316 |                 (Just (a, t)) -> error $ "mmIm: " ++ name ++ " is already mapped to " ++ show a
317 | 
318 |     -- Boilerplate
319 |     process conf (SSeqFor _ n im,alive) m mm = mmIMLoop conf (nonfreeable `Set.union` alive) im m mm
320 |     process conf (SSeqWhile b im,_) m mm = mmIMLoop conf nonfreeable im m mm 
321 |     process conf (SForAll _ n im,_) m mm = mmIMLoop conf nonfreeable im m mm
322 |     -- 2014-Nov-25:
323 |     --   This one used mmIM' which was identical to mmIM.
324 |     --   This must have been a leftover from when I thought
325 |     --   warp memory needed some special attention here. 
326 |     process conf (SDistrPar Warp n im,_) m mm = mmIMLoop conf nonfreeable im m mm 
327 |     process conf (SDistrPar Block n im,_) m mm = mmIMLoop conf nonfreeable im m mm 
328 |     process conf (_,_) m mm = (m,mm) 
329 | 
330 | 
331 | 
332 | -- NOTE: This is a hack to make programs distributed
333 | --       over warps not "free" its arrays.
334 | --       Distributing over warps introduces entirely new
335 | --       shared memory behaviour..
336 | --       This needs a review and some real thought!
337 | 
338 | mmIMDistrWarp conf im memory memmap = r im (memory,memmap)
339 |   where 
340 |     r [] m = m
341 |     r (x:xs) (m,mm) =
342 |       let
343 |           (m',mm') = process conf x m mm
344 |            
345 |           freeable = getFreeableSet x xs
346 |           --freeable  = freeable' Set.\\ nonfreeable
347 |           freeableAddrs = mapM (flip Map.lookup mm') (filter dontMap (Set.toList freeable))
348 |           dontMap name = not ((List.isPrefixOf "input" name) || 
349 |                               (List.isPrefixOf "output" name))
350 |           mNew =
351 |             case freeableAddrs of
352 |               (Just as) -> m' -- freeAll m' (map fst as)
353 |               Nothing   -> m'
354 |       in --trace ("freeable': " ++ show freeable' ++ "\n" ++
355 |          --       "freeable: " ++ show freeable   ++ "\n" ++ 
356 |          --       "nonfreeable: " ++ show nonfreeable) $
357 |          r xs (mNew,mm')
358 |     
359 |     process :: SharedMemConfig -> (Statement Liveness,Liveness) -> Memory -> MemMap -> (Memory,MemMap)
360 |     process conf (SAllocate name size t,_) m mm = (m',mm') 
361 |       where (m',addr) = allocate conf m size
362 |             mm' =
363 |               case Map.lookup name mm of
364 |                 Nothing -> Map.insert name (addr,t) mm
365 |                 (Just (a, t)) -> error $ "mmIm: " ++ name ++ " is already mapped to " ++ show a
366 | 
367 |     -- Boilerplate
368 |     process conf (SSeqFor _ n im,alive) m mm = mmIMDistrWarp conf  im m mm
369 |     process conf (SSeqWhile b im,_) m mm = mmIMDistrWarp conf  im m mm 
370 |     process conf (SForAll _ n im,_) m mm = mmIMDistrWarp conf  im m mm
371 |     -- 2014-Nov-25:
372 |     --   This one used mmIM' which was identical to mmIM.
373 |     --   This must have been a leftover from when I thought
374 |     --   warp memory needed some special attention here. 
375 |     process conf (SDistrPar Warp n im,_) m mm = mmIMDistrWarp conf  im m mm 
376 |     process conf (SDistrPar Block n im,_) m mm = mmIMDistrWarp conf im m mm 
377 |     process conf (_,_) m mm = (m,mm) 
378 | 
379 | 
380 | ---------------------------------------------------------------------------
381 | -- Rename arrays in IM
382 | --------------------------------------------------------------------------- 
383 | 
384 | renameIM :: MemMap -> IML -> IMList ()
385 | renameIM mm im = zip (map (go . fst) im) (repeat ())
386 |   where
387 |     go (SAssign name ix e) = SAssign (renameIVar mm name)
388 |                                      (map (renameIExp mm) ix)
389 |                                      (renameIExp mm e)
390 |     go (SAtomicOp name ix atop) = SAtomicOp (renameIVar mm name)
391 |                                             (renameIExp mm ix)
392 |                                             (renameAtOp mm atop) 
393 |     go (SCond be im) = SCond (renameIExp mm be)
394 |                              (renameIM mm im)
395 |     go (SSeqFor str n im) = SSeqFor str (renameIExp mm n)
396 |                                         (renameIM mm im)
397 |     go SBreak = SBreak
398 |     go (SSeqWhile n im) = SSeqWhile (renameIExp mm n)
399 |                                     (renameIM mm im)
400 |     go (SForAll lvl n im)   = SForAll lvl (renameIExp mm n)
401 |                                           (renameIM mm im)
402 |     go (SDistrPar lvl n im) = SDistrPar lvl (renameIExp mm n)
403 |                                             (renameIM mm im) 
404 | 
405 | --    go (SForAllBlocks n im) = SForAllBlocks (renameIExp mm n)
406 | --                                            (renameIM mm im)
407 | --    go (SNWarps n im) = SNWarps (renameIExp mm n)
408 | --                                (renameIM mm im)
409 | --    go (SWarpForAll n im) = SWarpForAll (renameIExp mm n)
410 | --                                        (renameIM mm im) 
411 |     -- Strip this out earlier. 
412 |     go (SAllocate name n t)  = SAllocate name n t 
413 |     go (SDeclare name t) = SDeclare name t
414 |     go SSynchronize      = SSynchronize 
415 | 
416 | ---------------------------------------------------------------------------
417 | -- Memory map the arrays in an CExpr
418 | ---------------------------------------------------------------------------
419 | renameIExp :: MemMap -> IExp -> IExp 
420 | renameIExp mm e@(IVar _ _) =  renameIVar mm e 
421 | renameIExp mm (IIndex (e1,es) t) = IIndex (renameIExp mm e1, map (renameIExp mm) es) t
422 | renameIExp mm (IBinOp op e1 e2 t) = IBinOp op (renameIExp mm e1) (renameIExp mm e2) t
423 | renameIExp mm (IUnOp op e t) = IUnOp op (renameIExp mm e) t 
424 | renameIExp mm (IFunCall nom exprs t) = IFunCall nom (map (renameIExp mm) exprs) t
425 | renameIExp mm (ICast e t) = ICast (renameIExp mm e) t
426 | renameIExp mm (ICond e1 e2 e3 t) = ICond (renameIExp mm e1)
427 |                                          (renameIExp mm e2)
428 |                                          (renameIExp mm e3)
429 |                                          t
430 | renameIExp _ a = a
431 | 
432 | renameIVar :: MemMap -> IExp -> IExp 
433 | renameIVar mm (IVar name t) =
434 |     -- t == t1 should be true 
435 |     case Map.lookup name mm of 
436 |     Just ((_,real_addr),t1) -> 
437 |       let core = sbaseIExp (real_addr)
438 |           cast c = ICast c t1
439 |       in cast core
440 |     
441 |     Nothing -> IVar name t
442 |     where
443 |       sbaseIExp 0    = IVar "sbase" (Pointer Word8) 
444 |       sbaseIExp addr = IBinOp IAdd (IVar "sbase" (Pointer Word8)) 
445 |                                    (IWord32 addr) 
446 |                                    (Pointer Word8)
447 | renameIVar _ _ = error "renameIVar: incorrect expression" 
448 | 
449 | renameAtOp :: MemMap -> AtOp -> AtOp 
450 | renameAtOp _ AtInc = AtInc
451 | renameAtOp mm (AtAdd e) = AtAdd (renameIExp mm e)
452 | renameAtOp mm (AtSub e) = AtSub (renameIExp mm e)
453 | renameAtOp mm (AtExch e) = AtExch (renameIExp mm e) 
454 | 
455 | 
456 | 


--------------------------------------------------------------------------------
/Obsidian/CodeGen/CompileIM.hs:
--------------------------------------------------------------------------------
  1 | 
  2 | {-# LANGUAGE QuasiQuotes #-}
  3 | {-# LANGUAGE PackageImports #-}
  4 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 
  5 | 
  6 | {-
  7 | 
  8 |    Joel Svensson 2013..2017
  9 | 
 10 | -} 
 11 | 
 12 | module Obsidian.CodeGen.CompileIM where 
 13 | import Language.C.Quote.CUDA hiding (Block)
 14 | import qualified Language.C.Quote.OpenCL as CL 
 15 | 
 16 | import qualified "language-c-quote" Language.C.Syntax as C
 17 | 
 18 | import Obsidian.Exp (IExp(..),IBinOp(..),IUnOp(..))
 19 | import Obsidian.Types as T
 20 | import Obsidian.DimSpec 
 21 | import Obsidian.CodeGen.Program
 22 | 
 23 | import Data.Word
 24 | 
 25 | {- Notes:
 26 | 
 27 |    2017-04-22: Generate only CUDA 
 28 | 
 29 | 
 30 | 
 31 |    * TODO: Make sure tid always has correct Value 
 32 | -} 
 33 | 
 34 | ---------------------------------------------------------------------------
 35 | -- Config
 36 | ---------------------------------------------------------------------------
 37 | 
 38 | data Config = Config { configThreadsPerBlock :: Word32,
 39 |                        configSharedMem :: Word32}
 40 | 
 41 | 
 42 | 
 43 | 
 44 | ---------------------------------------------------------------------------
 45 | -- compileExp (maybe a bad name)
 46 | ---------------------------------------------------------------------------
 47 | compileExp :: IExp -> C.Exp 
 48 | compileExp (IVar name t) = [cexp| $id:name |]
 49 | 
 50 | 
 51 | -- TODO: Fix all this! 
 52 | -- compileExp (IBlockIdx X) = [cexp| $id:("bid")|] -- [cexp| $id:("blockIdx.x") |]
 53 | -- compileExp (IBlockIdx Y) = [cexp| $id:("blockIdx.y") |]
 54 | -- compileExp (IBlockIdx Z) = [cexp| $id:("blockIdx.z") |]
 55 | 
 56 | -- compileExp (IThreadIdx X) = [cexp| $id:("threadIdx.x") |]
 57 | -- compileExp (IThreadIdx Y) = [cexp| $id:("threadIdx.y") |]
 58 | -- compileExp (IThreadIdx Z) = [cexp| $id:("threadIdx.z") |]
 59 | 
 60 | -- compileExp (IBlockDim X) = [cexp| $id:("blockDim.x") |]
 61 | -- compileExp (IBlockDim Y) = [cexp| $id:("blockDim.y") |]
 62 | -- compileExp (IBlockDim Z) = [cexp| $id:("blockDim.z") |]
 63 | 
 64 | -- compileExp (IGridDim X) = [cexp| $id:("GridDim.x") |]
 65 | -- compileExp (IGridDim Y) = [cexp| $id:("GridDim.y") |]
 66 | -- compileExp (IGridDim Z) = [cexp| $id:("GridDim.z") |]
 67 | 
 68 | compileExp (IBool True) = [cexp|1|]
 69 | compileExp (IBool False) = [cexp|0|]
 70 | compileExp (IInt8 n) = [cexp| $int:(toInteger n) |]
 71 | compileExp (IInt16 n) = [cexp| $int:(toInteger n) |]
 72 | compileExp (IInt32 n) = [cexp| $int:(toInteger n) |]
 73 | compileExp (IInt64 n) = [cexp| $lint:(toInteger n) |]
 74 | 
 75 | compileExp (IWord8 n) = [cexp| $uint:(toInteger n) |]
 76 | compileExp (IWord16 n) = [cexp| $uint:(toInteger n) |]
 77 | compileExp (IWord32 n) = [cexp| $uint:(toInteger n) |]
 78 | compileExp (IWord64 n) = [cexp| $ulint:(toInteger n) |]
 79 | 
 80 | compileExp (IFloat n) = [cexp| $float:(n) |]
 81 | compileExp (IDouble n) = [cexp| $double:(n) |]
 82 | 
 83 | -- Implementing these may be a bit awkward
 84 | -- given there are no vector literals in cuda. 
 85 | compileExp (IFloat2 n m) = error "IFloat2 unhandled"
 86 | compileExp (IFloat3 n m l) = error "IFloat3 unhandled"
 87 | compileExp (IFloat4 n m l k) = error "IFloat4 unhandled"
 88 | compileExp (IDouble2 n m) = error "IDouble2 unhandled" 
 89 | compileExp (IInt8_2 n m) = error "FIXME"
 90 | compileExp (IInt8_3 n m k) = error "FIXME"
 91 | compileExp (IInt8_4 n m k l) = error "FIXME"
 92 | compileExp (IInt16_2 n m ) = error "FIXME"
 93 | compileExp (IInt16_3 n m k) = error "FIXME"
 94 | compileExp (IInt16_4 n m k l) = error "FIXME"
 95 | compileExp (IInt32_2 n m) = error "FIXME"
 96 | compileExp (IInt32_3 n m k) = error "FIXME"
 97 | compileExp (IInt32_4 n m k l) = error "FIXME"
 98 | compileExp (IInt64_2 n m) = error "FIXME"
 99 | compileExp (IInt64_3 n m k) = error "FIXME"
100 | compileExp (IInt64_4 n m k l) = error "FIXME"
101 | compileExp (IWord8_2 n m) = error "FIXME"
102 | compileExp (IWord8_3 n m k) = error "FIXME"
103 | compileExp (IWord8_4 n m k l) = error "FIXME"
104 | compileExp (IWord16_2 n m ) = error "FIXME"
105 | compileExp (IWord16_3 n m k) = error "FIXME"
106 | compileExp (IWord16_4 n m k l) = error "FIXME"
107 | compileExp (IWord32_2 n m) = error "FIXME"
108 | compileExp (IWord32_3 n m k) = error "FIXME"
109 | compileExp (IWord32_4 n m k l) = error "FIXME"
110 | compileExp (IWord64_2 n m) = error "FIXME"
111 | compileExp (IWord64_3 n m k) = error "FIXME"
112 | compileExp (IWord64_4 n m k l) = error "FIXME"
113 | 
114 | 
115 | compileExp (IIndex (i1,[e]) t) = [cexp| $(compileExp i1)[$(compileExp e)] |]
116 | compileExp a@(IIndex (_,_) _) = error $ "compileExp: Malformed index expression " ++ show a
117 | 
118 | compileExp (ICond e1 e2 e3 t) = [cexp| $(compileExp e1) ? $(compileExp e2) : $(compileExp e3) |]
119 | 
120 | compileExp (IBinOp op e1 e2 t) = go op 
121 |   where
122 |     x = compileExp e1
123 |     y = compileExp e2
124 |     go IAdd = [cexp| $x + $y |]
125 |     go ISub = [cexp| $x - $y |]
126 |     go IMul = [cexp| $x * $y |]
127 |     go IDiv = [cexp| $x / $y |]
128 |     go IFDiv = [cexp| $x / $y |]
129 |     go IMod = [cexp| $x % $y |]
130 |     go IEq = [cexp| $x == $y |]
131 |     go INotEq = [cexp| $x !=  $y |]
132 |     go ILt = [cexp| $x < $y |]
133 |     go IGt = [cexp| $x > $y |]
134 |     go IGEq = [cexp| $x >= $y |]
135 |     go ILEq = [cexp| $x <=  $y |]
136 |     go IAnd = [cexp| $x && $y |]
137 |     go IOr = [cexp| $x  || $y |]
138 |     go IPow = case t of
139 |                 Float ->  [cexp|powf($x,$y) |]
140 |                 Double -> [cexp|pow($x,$y)  |]
141 |                 _      -> error $ "IPow applied at wrong type" 
142 |     go IBitwiseAnd = [cexp| $x & $y |]
143 |     go IBitwiseOr = [cexp| $x | $y |]
144 |     go IBitwiseXor = [cexp| $x ^ $y |]
145 |     go IShiftL = [cexp| $x << $y |]
146 |     go IShiftR = [cexp| $x >> $y |]
147 | compileExp (IUnOp op e t) = go op
148 |   where
149 |     x = compileExp e
150 |     go IBitwiseNeg = [cexp| ~$x|]
151 |     go INot        = [cexp| !$x|]
152 |     go IGetX       = [cexp| $x.x|]
153 |     go IGetY       = [cexp| $x.y|]
154 |     go IGetZ       = [cexp| $x.z|]
155 |     go IGetW       = [cexp| $x.w|]
156 |     
157 | compileExp (IFunCall name es t) = [cexp| $fc |]
158 |   where
159 |     es' = map compileExp es
160 |     fc  = [cexp| $id:(name)($args:(es')) |]
161 | 
162 | compileExp (ICast e t) = [cexp| ($ty:(compileType t)) $e' |]
163 |   where
164 |     e' = compileExp e
165 | 
166 | compileExp any = error $ show any
167 | 
168 | compileType :: T.Type -> C.Type
169 | compileType (Int8) = [cty| typename int8_t |]
170 | compileType (Int16) = [cty| typename int16_t |]
171 | compileType (Int32) = [cty| typename int32_t |]
172 | compileType (Int64) = [cty| typename int64_t |]
173 | compileType (Word8) = [cty| typename uint8_t |]
174 | compileType (Word16) = [cty| typename uint16_t |]
175 | compileType (Word32) = [cty| typename uint32_t |]
176 | compileType (Word64) = [cty| typename uint64_t |]
177 | compileType (Float) = [cty| float |]
178 | compileType (Double) = [cty| double |]
179 | 
180 | compileType (Vec2 Float) = [cty| float4|]
181 | compileType (Vec3 Float) = [cty| float3|]
182 | compileType (Vec4 Float) = [cty| float2|]
183 | 
184 | compileType (Vec2 Double) = [cty| double2|]
185 | 
186 | -- How does this interplay with my use of uint8_t etc. Here it is char!
187 | compileType (Vec2 Int8) = [cty| char2|]
188 | compileType (Vec3 Int8) = [cty| char3|]
189 | compileType (Vec4 Int8) = [cty| char4|]
190 |  
191 | compileType (Vec2 Int16) = [cty| short2|]
192 | compileType (Vec3 Int16) = [cty| short3|]
193 | compileType (Vec4 Int16) = [cty| short4|]
194 | 
195 | compileType (Vec2 Int32) = [cty| int2|]
196 | compileType (Vec3 Int32) = [cty| int3|]
197 | compileType (Vec4 Int32) = [cty| int4|]
198 | 
199 | compileType (Vec2 Word8) = [cty| uchar2|]
200 | compileType (Vec3 Word8) = [cty| uchar3|]
201 | compileType (Vec4 Word8) = [cty| uchar4|]
202 |  
203 | compileType (Vec2 Word16) = [cty| ushort2|]
204 | compileType (Vec3 Word16) = [cty| ushort3|]
205 | compileType (Vec4 Word16) = [cty| ushort4|]
206 | 
207 | compileType (Vec2 Word32) = [cty| uint2|]
208 | compileType (Vec3 Word32) = [cty| uint3|]
209 | compileType (Vec4 Word32) = [cty| uint4|]
210 | 
211 | 
212 | compileType (Shared t) = [cty| __shared__ $ty:(compileType t) |]
213 | compileType (Pointer t) = [cty| $ty:(compileType t)* |]
214 | compileType (Volatile t) =  [cty| volatile $ty:(compileType t)|]
215 | compileType t = error $ "compileType: Not implemented " ++ show t
216 | 
217 | 
218 | ---------------------------------------------------------------------------
219 | -- Statement t to Stm
220 | ---------------------------------------------------------------------------
221 | 
222 | 
223 | compileStm :: Config -> Statement t -> [C.Stm]
224 | compileStm c (SAssign name [] e) =
225 |    [[cstm| $(compileExp name) = $(compileExp e);|]]
226 | compileStm c (SAssign name [ix] e) = 
227 |    [[cstm| $(compileExp name)[$(compileExp ix)] = $(compileExp e); |]]
228 | compileStm c (SAtomicOp name ix atop) = 
229 |   case atop of
230 |     AtInc -> [[cstm| atomicInc(&$(compileExp name)[$(compileExp ix)],0xFFFFFFFF); |]]
231 |     AtAdd e -> [[cstm| atomicAdd(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]]
232 |     AtSub e -> [[cstm| atomicSub(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]]
233 |     AtExch e -> [[cstm| atomicExch(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]]
234 | 
235 | compileStm c (SCond be im) = [[cstm| if ($(compileExp be)) { $stms:body } |]]
236 |   where 
237 |     body = compileIM c im  -- (compileIM p c im)
238 | compileStm c (SSeqFor loopVar n im) = 
239 |     [[cstm| for (int $id:loopVar = 0; $id:loopVar < $(compileExp n); ++$id:loopVar) 
240 |               { $stms:body } |]]
241 | -- end a sequential for loop with a sync (or begin).
242 | -- Maybe only if the loop is on block level (that is across all threads)
243 | --  __syncthreads();} |]]
244 |   where
245 |     body = compileIM c im 
246 | 
247 | 
248 | -- Just relay to specific compileFunction
249 | compileStm c a@(SForAll lvl n im) = compileForAll c a
250 | 
251 | compileStm c a@(SDistrPar lvl n im) = compileDistr c a 
252 | 
253 | compileStm c (SSeqWhile b im) =
254 |   [[cstm| while ($(compileExp b)) { $stms:body}|]]
255 |   where
256 |     body = compileIM c im 
257 | 
258 | compileStm c SSynchronize = [[cstm| __syncthreads(); |]]
259 | 
260 | compileStm _ (SAllocate _ _ _) = []
261 | compileStm _ (SDeclare name t) = []
262 | 
263 | compileStm _ a = error  $ "compileStm: missing case "
264 | 
265 | ---------------------------------------------------------------------------
266 | -- DistrPar 
267 | ---------------------------------------------------------------------------
268 | compileDistr :: Config -> Statement t -> [C.Stm] 
269 | compileDistr c (SDistrPar Block n im) =  codeQ ++ codeR
270 |   -- New here is BLOCK virtualisation
271 |   where
272 |     cim = compileIM c im  -- ++ [[cstm| __syncthreads();|]]
273 |     
274 |     numBlocks = [cexp| $id:("gridDim.x") |]
275 |     
276 |     blocksQ = [cexp| $exp:(compileExp n) / $exp:numBlocks|]
277 |     blocksR = [cexp| $exp:(compileExp n) % $exp:numBlocks|] 
278 |     
279 |     codeQ = [[cstm| for (int b = 0; b < $exp:blocksQ; ++b) { $stms:bodyQ }|]]
280 |                 
281 |     bodyQ = [cstm| $id:("bid") = blockIdx.x * $exp:blocksQ + b;|] : cim  ++  
282 |             [[cstm| bid = blockIdx.x;|],
283 |              [cstm| __syncthreads();|]] -- yes no ? 
284 |          
285 |     codeR = [[cstm| bid = ($exp:numBlocks * $exp:blocksQ) + blockIdx.x;|], 
286 |              [cstm| if (blockIdx.x < $exp:blocksR) { $stms:cim }|],
287 |              [cstm| bid = blockIdx.x;|], 
288 |              [cstm| __syncthreads();|]] -- yes no ? 
289 |                     
290 | -- Can I be absolutely sure that 'n' here is statically known ? 
291 | -- I must look over the functions that can potentially create this IM. 
292 | -- Can make a separate case for unknown 'n' but generate worse code.
293 | -- (That is true for all levels)  
294 | compileDistr c (SDistrPar Warp (IWord32 n) im) = codeQ  ++ codeR 
295 |   -- Here the 'im' should be distributed over 'n'warps.
296 |   -- 'im' uses a warpID variable to identify what warp it is.
297 |   -- 'n' may be higher than the actual number of warps we have!
298 |   -- So GPU warp virtualisation is needed. 
299 |   where
300 |     cim = compileIM c im
301 | 
302 |     nWarps   = fromIntegral $ configThreadsPerBlock c `div` 32
303 |     numWarps = [cexp| $int:nWarps|] 
304 |  
305 |     (wq, wr) = (n `div` nWarps, n `mod` nWarps)
306 |      
307 |     warpsQ   = [cexp| $int:wq|]
308 |     warpsR   = [cexp| $int:wr|]
309 |     
310 |     codeQ = [[cstm| for (int w = 0; w < $exp:warpsQ; ++w) { $stms:bodyQ } |]]
311 |     
312 |     bodyQ = [cstm| warpID = (threadIdx.x / 32) * $exp:warpsQ + w;|] : cim ++
313 |             --[cstm| warpID = w * $exp:warpsQ + (threadIdx.x / 32);|] : cim ++ 
314 |             [[cstm| warpID = threadIdx.x / 32;|]] 
315 | 
316 |     codeR = case (n `mod` nWarps)  of 
317 |              0 -> [] 
318 |              n -> [[cstm| warpID = ($exp:numWarps * $exp:warpsQ)+ (threadIdx.x / 32);|],
319 |                    [cstm| if (threadIdx.x / 32 < $exp:warpsR) { $stms:cim } |], 
320 |                    [cstm| warpID = threadIdx.x / 32; |], 
321 |                    [cstm| __syncthreads();|]]
322 | 
323 | ---------------------------------------------------------------------------
324 | -- ForAll is compiled differently for different platforms
325 | ---------------------------------------------------------------------------
326 | compileForAll :: Config -> Statement t -> [C.Stm]
327 | compileForAll c (SForAll Warp  (IWord32 n) im) = codeQ ++ codeR
328 |   where
329 |     nt = 32
330 | 
331 |     q = n `div` nt
332 |     r = n `mod` nt
333 | 
334 |     cim = compileIM c im 
335 |     
336 |     codeQ =
337 |       case q of
338 |         0 -> []
339 |         1 -> cim
340 |         n -> [[cstm| for ( int vw = 0; vw < $int:q; ++vw) { $stms:body } |], 
341 |               [cstm| $id:("warpIx") = threadIdx.x % 32; |]]
342 |               -- [cstm| __syncthreads();|]]
343 |              where 
344 |                body = [cstm|$id:("warpIx") = vw*$int:nt + (threadIdx.x % 32); |] : cim
345 |                --body = [cstm|$id:("warpIx") = (threadIdx.x % 32) * q + vw; |] : cim
346 | 
347 |     q32 = q * 32 -- break out because: parseExp: cannot parse 'q*32' 
348 |     codeR = 
349 |       case r of 
350 |         0 -> [] 
351 |         n -> [[cstm| if ((threadIdx.x % 32) < $int:r) { 
352 |                             $id:("warpIx") = $int:(q32) + (threadIdx.x % 32);  
353 |                             $stms:cim } |],
354 |                   -- [cstm| __syncthreads();|],
355 |                   [cstm| $id:("warpIx") = threadIdx.x % 32; |]]
356 | 
357 | compileForAll c (SForAll Block (IWord32 n) im) = goQ ++ goR 
358 |   where
359 |     cim = compileIM c im -- ++ [[cstm| __syncthreads();|]]
360 |    
361 |     nt = configThreadsPerBlock c 
362 | 
363 |     q  = n `quot` nt
364 |     r  = n `rem`  nt 
365 | 
366 |     -- q is the number full "passes" needed to cover the iteration
367 |     -- space given we have nt threads. 
368 |     goQ =
369 |       case q of
370 |         0 -> []
371 |         1 -> cim -- [cstm|$id:loopVar = threadIdx.x; |]:cim
372 |             --do
373 |             --  stm <- updateTid [cexp| threadIdx.x |]
374 |             --  return $ [cstm| $id:loopVar = threadIdx.x; |] : cim 
375 |         n -> [[cstm| for ( int i = 0; i < $int:q; ++i) { $stms:body } |], 
376 |                                                      --  __syncthreads(); } |], 
377 |               [cstm| $id:("tid") = threadIdx.x; |]]
378 |           --    [cstm| __syncthreads();|]]
379 |              where 
380 |                body = [cstm|$id:("tid") =  i*$int:nt + threadIdx.x; |] : cim
381 |    
382 |     -- r is the number of elements left. 
383 |     -- This generates code for when fewer threads are 
384 |     -- needed than available. (some threads shut down due to the conditional). 
385 | 
386 |     qnt = q * nt -- break out because: parseExp: cannot parse 'q*nt'
387 |     goR = 
388 |       case (r,q) of 
389 |         (0,_) -> [] 
390 |         --(n,0) -> [[cstm| if (threadIdx.x < $int:n) { 
391 |         --                    $stms:cim } |]] 
392 |         (n,m) -> [[cstm| if (threadIdx.x < $int:n) { 
393 |                             $id:("tid") = $int:(qnt) + threadIdx.x;  
394 |                             $stms:cim } |], 
395 |                   [cstm| $id:("tid") = threadIdx.x; |]]
396 | 
397 | compileForAll c (SForAll Grid n im) = error "compileForAll: Grid" -- cim
398 |   -- The grid case is special. May need more thought
399 |   -- 
400 |   -- The problem with this case is that
401 |   -- I need to come up with a blocksize (but without any guidance)
402 |   -- from the programmer.
403 |   -- Though! There is no way the programmer could provide any
404 |   -- such info ... 
405 |   -- where
406 |   --   cim = compileIM c im
407 | 
408 | --compileForAll PlatformC c (SForAll lvl (IWord32 n) im) = go
409 | --  where
410 | --    body = compileIM PlatformC c im 
411 | --    go  = [ [cstm| for (int i = 0; i <$int:n; ++i) { $stms:body } |] ] 
412 |       
413 | 
414 | --------------------------------------------------------------------------- 
415 | -- CompileIM to list of Stm 
416 | --------------------------------------------------------------------------- 
417 | compileIM :: Config -> IMList a -> [C.Stm]
418 | compileIM conf im = concatMap ((compileStm conf) . fst) im
419 | 
420 | ---------------------------------------------------------------------------
421 | -- Generate entire Kernel 
422 | ---------------------------------------------------------------------------
423 | type Parameters = [(String,T.Type)]
424 | 
425 | compile :: Config -> String -> (Parameters,IMList a) -> C.Definition
426 | compile config kname (params,im)
427 |   = go 
428 |   where
429 |     stms = compileIM config im
430 |     
431 |     ps = compileParams params
432 |     go = [cedecl| extern "C" __global__ void $id:kname($params:ps) {$items:cudabody} |]
433 | 
434 |     cudabody = (if (configSharedMem config > 0)
435 |                 -- then [BlockDecl [cdecl| extern volatile __shared__  typename uint8_t sbase[]; |]] 
436 |                 then [C.BlockDecl [cdecl| __shared__  typename uint8_t  sbase[$uint:(configSharedMem config)] ; |]] 
437 |                 else []) ++
438 |                 --[BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]] ++
439 |                 --[BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |],
440 |                 --       BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] ++
441 | --                [BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]] ++
442 |                (if (usesGid im) 
443 |                 then [C.BlockDecl [cdecl| typename uint32_t gid = blockIdx.x * blockDim.x + threadIdx.x; |]]
444 |                 else []) ++ 
445 |                (if (usesBid im) 
446 |                 then [C.BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]]
447 |                 else []) ++ 
448 |                (if (usesTid im) 
449 |                 then [C.BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]]
450 |                 else []) ++
451 |                (if (usesWarps im) 
452 |                 then  [C.BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |],
453 |                        C.BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] 
454 |                 else []) ++
455 |                 -- All variables used will be unique and can be declared 
456 |                 -- at the top level 
457 |                 concatMap declares im ++ 
458 |                 -- Not sure if I am using language.C correctly. 
459 |                 -- Maybe compileSTM should create BlockStms ?
460 |                 -- TODO: look how Nikola does it. 
461 |                 map C.BlockStm stms
462 | 
463 |     cbody = -- add memory allocation 
464 |             map C.BlockStm stms
465 | 
466 | -- Declare variables. 
467 | declares :: (Statement t,t) -> [C.BlockItem]
468 | declares (SDeclare name t,_) = [C.BlockDecl [cdecl| $ty:(compileType t)  $id:name;|]]
469 | declares (SCond _ im,_) = concatMap declares im 
470 | declares (SSeqWhile _ im,_) = concatMap declares im
471 | declares (SForAll _ _ im,_) = concatMap declares im
472 | declares (SDistrPar _ _ im,_) = concatMap declares im
473 | declares (SSeqFor _ _  im,_) = concatMap declares im
474 | declares _ = []
475 | 
476 | 
477 | --------------------------------------------------------------------------- 
478 | -- Parameter lists for functions  (kernel head) 
479 | ---------------------------------------------------------------------------
480 | compileParams :: Parameters -> [C.Param]
481 | compileParams = map go
482 |   where
483 |     go (name,t) = [cparam| $ty:(compileType t) $id:name |]
484 |  
485 | 
486 | ---------------------------------------------------------------------------
487 | -- Compile with shared memory arrays declared at top
488 | ---------------------------------------------------------------------------
489 | -- CODE DUPLICATION FOR NOW
490 | 
491 | compileDeclsTop :: Config -> [(String,((Word32,Word32),T.Type))] -> String -> (Parameters,IMList a) -> C.Definition
492 | compileDeclsTop config toplevelarrs kname (params,im)
493 |   = go
494 |   where
495 |     stms = compileIM config im
496 |     
497 |     ps = compileParams params
498 |     go = [cedecl| extern "C" __global__ void $id:kname($params:ps) {$items:cudabody} |]
499 | 
500 |     cudabody = (if (configSharedMem config > 0)
501 |                 -- then [BlockDecl [cdecl| extern volatile __shared__  typename uint8_t sbase[]; |]] 
502 |                 then [C.BlockDecl [cdecl| __shared__ typename uint8_t sbase[$uint:(configSharedMem config)]; |]] 
503 |                 else []) ++
504 |                 --[BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]] ++
505 |                 --[BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |],
506 |                 --       BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] ++
507 | --                [BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]] ++
508 |                (if (usesGid im) 
509 |                 then [C.BlockDecl [cdecl| typename uint32_t gid = blockIdx.x * blockDim.x + threadIdx.x; |]]
510 |                 else []) ++ 
511 |                (if (usesBid im) 
512 |                 then [C.BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]]
513 |                 else []) ++ 
514 |                (if (usesTid im) 
515 |                 then [C.BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]]
516 |                 else []) ++
517 |                (if (usesWarps im) 
518 |                 then  [C.BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |],
519 |                        C.BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] 
520 |                 else []) ++
521 |                 -- declare all arrays used
522 |                 concatMap declareArr toplevelarrs ++
523 |                 -- All variables used will be unique and can be declared 
524 |                 -- at the top level 
525 |                 concatMap declares im ++ 
526 |                 -- Not sure if I am using language.C correctly. 
527 |                 -- Maybe compileSTM should create BlockStms ?
528 |                 -- TODO: look how Nikola does it. 
529 |                 map C.BlockStm stms
530 | 
531 |     cbody = -- add memory allocation 
532 |             map C.BlockStm stms
533 | 
534 | 
535 | declareArr :: (String, ((Word32,Word32),T.Type)) -> [C.BlockItem]
536 | declareArr (arr,((_,addr),t)) =
537 |   [C.BlockDecl [cdecl| $ty:(compileType t) $id:arr = ($ty:(compileType t))(sbase + $int:addr);|]]
538 | 


--------------------------------------------------------------------------------
/Obsidian/CodeGen/CompileIMOpenCLEmbedded.hs:
--------------------------------------------------------------------------------
  1 | 
  2 | {-# LANGUAGE QuasiQuotes #-}
  3 | {-# LANGUAGE PackageImports #-}
  4 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 
  5 | 
  6 | {-
  7 | 
  8 |    Joel Svensson 2017
  9 | 
 10 |    * notes:
 11 |       2017-04-23: Initial version. Duplicates a lot of the CUDA generation.
 12 | 
 13 |    * Todo:
 14 | 
 15 |      - Make sure generated code makes sense in Vivado_HLS
 16 | 
 17 |      - See what happens to the Workgroup interface generated
 18 |        by Vivado_HLS when using these:
 19 |        * get_num_groups()
 20 |        * get_local_size()
 21 |        * get_group_id()
 22 |        * get_local_id()
 23 |        * get_global_id()
 24 |        * get_global_size()
 25 | 
 26 |        get_num_groups and get_global_size are interesting as I have
 27 |        not seen these communicated to a Vivado_HLS opencl workgroup.
 28 |        It may be that interface is only generated when these functions
 29 |        are used. 
 30 | 
 31 | 
 32 | -} 
 33 | 
 34 | module Obsidian.CodeGen.CompileIMOpenCLEmbedded where
 35 | 
 36 | import Language.C.Quote.OpenCL
 37 | 
 38 | import qualified "language-c-quote" Language.C.Syntax as C
 39 | 
 40 | import Obsidian.Exp (IExp(..),IBinOp(..),IUnOp(..))
 41 | import Obsidian.Types as T
 42 | import Obsidian.DimSpec 
 43 | import Obsidian.CodeGen.Program
 44 | 
 45 | import Data.Word
 46 | 
 47 | {- Notes:
 48 | 
 49 |    * Generation of OpenCL targeted at embedded profile
 50 |      as it presents itself in VivadoHLS 
 51 | 
 52 | -} 
 53 | 
 54 | ---------------------------------------------------------------------------
 55 | -- Config
 56 | ---------------------------------------------------------------------------
 57 | 
 58 | data Config = Config { configThreadsPerBlock :: Word32,
 59 |                        configSharedMem :: Word32}
 60 | 
 61 | 
 62 | ---------------------------------------------------------------------------
 63 | -- Some strings for OpenCLifying the generator 
 64 | ---------------------------------------------------------------------------
 65 | threadIdx_x = [cexp| get_local_id(0) |]
 66 | blockIdx_x  = [cexp| get_group_id(0) |]
 67 | globalIdx_x = [cexp| get_global_id(0)|]
 68 | blockDim_x  = [cexp| get_local_size(0)|]
 69 | 
 70 | 
 71 | 
 72 | ---------------------------------------------------------------------------
 73 | -- compileExp (maybe a bad name)
 74 | ---------------------------------------------------------------------------
 75 | compileExp :: IExp -> C.Exp 
 76 | compileExp (IVar name t) = [cexp| $id:name |]
 77 | 
 78 | 
 79 | -- TODO: Fix all this! 
 80 | -- compileExp (IBlockIdx X) = [cexp| $id:("bid")|] -- [cexp| $id:("blockIdx.x") |]
 81 | -- compileExp (IBlockIdx Y) = [cexp| $id:("blockIdx.y") |]
 82 | -- compileExp (IBlockIdx Z) = [cexp| $id:("blockIdx.z") |]
 83 | 
 84 | -- compileExp (IThreadIdx X) = [cexp| $id:("threadIdx.x") |]
 85 | -- compileExp (IThreadIdx Y) = [cexp| $id:("threadIdx.y") |]
 86 | -- compileExp (IThreadIdx Z) = [cexp| $id:("threadIdx.z") |]
 87 | 
 88 | -- compileExp (IBlockDim X) = [cexp| $id:("blockDim.x") |]
 89 | -- compileExp (IBlockDim Y) = [cexp| $id:("blockDim.y") |]
 90 | -- compileExp (IBlockDim Z) = [cexp| $id:("blockDim.z") |]
 91 | 
 92 | -- compileExp (IGridDim X) = [cexp| $id:("GridDim.x") |]
 93 | -- compileExp (IGridDim Y) = [cexp| $id:("GridDim.y") |]
 94 | -- compileExp (IGridDim Z) = [cexp| $id:("GridDim.z") |]
 95 | 
 96 | compileExp (IBool True) = [cexp|1|]
 97 | compileExp (IBool False) = [cexp|0|]
 98 | compileExp (IInt8 n) = [cexp| $int:(toInteger n) |]
 99 | compileExp (IInt16 n) = [cexp| $int:(toInteger n) |]
100 | compileExp (IInt32 n) = [cexp| $int:(toInteger n) |]
101 | compileExp (IInt64 n) = [cexp| $lint:(toInteger n) |]
102 | 
103 | compileExp (IWord8 n) = [cexp| $uint:(toInteger n) |]
104 | compileExp (IWord16 n) = [cexp| $uint:(toInteger n) |]
105 | compileExp (IWord32 n) = [cexp| $uint:(toInteger n) |]
106 | compileExp (IWord64 n) = [cexp| $ulint:(toInteger n) |]
107 | 
108 | compileExp (IFloat n) = [cexp| $float:(n) |]
109 | compileExp (IDouble n) = [cexp| $double:(n) |]
110 | 
111 | -- Implementing these may be a bit awkward
112 | -- given there are no vector literals in cuda. 
113 | compileExp (IFloat2 n m) = error "IFloat2 unhandled"
114 | compileExp (IFloat3 n m l) = error "IFloat3 unhandled"
115 | compileExp (IFloat4 n m l k) = error "IFloat4 unhandled"
116 | compileExp (IDouble2 n m) = error "IDouble2 unhandled" 
117 | compileExp (IInt8_2 n m) = error "FIXME"
118 | compileExp (IInt8_3 n m k) = error "FIXME"
119 | compileExp (IInt8_4 n m k l) = error "FIXME"
120 | compileExp (IInt16_2 n m ) = error "FIXME"
121 | compileExp (IInt16_3 n m k) = error "FIXME"
122 | compileExp (IInt16_4 n m k l) = error "FIXME"
123 | compileExp (IInt32_2 n m) = error "FIXME"
124 | compileExp (IInt32_3 n m k) = error "FIXME"
125 | compileExp (IInt32_4 n m k l) = error "FIXME"
126 | compileExp (IInt64_2 n m) = error "FIXME"
127 | compileExp (IInt64_3 n m k) = error "FIXME"
128 | compileExp (IInt64_4 n m k l) = error "FIXME"
129 | compileExp (IWord8_2 n m) = error "FIXME"
130 | compileExp (IWord8_3 n m k) = error "FIXME"
131 | compileExp (IWord8_4 n m k l) = error "FIXME"
132 | compileExp (IWord16_2 n m ) = error "FIXME"
133 | compileExp (IWord16_3 n m k) = error "FIXME"
134 | compileExp (IWord16_4 n m k l) = error "FIXME"
135 | compileExp (IWord32_2 n m) = error "FIXME"
136 | compileExp (IWord32_3 n m k) = error "FIXME"
137 | compileExp (IWord32_4 n m k l) = error "FIXME"
138 | compileExp (IWord64_2 n m) = error "FIXME"
139 | compileExp (IWord64_3 n m k) = error "FIXME"
140 | compileExp (IWord64_4 n m k l) = error "FIXME"
141 | 
142 | 
143 | compileExp (IIndex (i1,[e]) t) = [cexp| $(compileExp i1)[$(compileExp e)] |]
144 | compileExp a@(IIndex (_,_) _) = error $ "compileExp: Malformed index expression " ++ show a
145 | 
146 | compileExp (ICond e1 e2 e3 t) = [cexp| $(compileExp e1) ? $(compileExp e2) : $(compileExp e3) |]
147 | 
148 | compileExp (IBinOp op e1 e2 t) = go op 
149 |   where
150 |     x = compileExp e1
151 |     y = compileExp e2
152 |     go IAdd = [cexp| $x + $y |]
153 |     go ISub = [cexp| $x - $y |]
154 |     go IMul = [cexp| $x * $y |]
155 |     go IDiv = [cexp| $x / $y |]
156 |     go IFDiv = [cexp| $x / $y |]
157 |     go IMod = [cexp| $x % $y |]
158 |     go IEq = [cexp| $x == $y |]
159 |     go INotEq = [cexp| $x !=  $y |]
160 |     go ILt = [cexp| $x < $y |]
161 |     go IGt = [cexp| $x > $y |]
162 |     go IGEq = [cexp| $x >= $y |]
163 |     go ILEq = [cexp| $x <=  $y |]
164 |     go IAnd = [cexp| $x && $y |]
165 |     go IOr = [cexp| $x  || $y |]
166 |     go IPow = case t of
167 |                 Float ->  [cexp|powf($x,$y) |]
168 |                 Double -> [cexp|pow($x,$y)  |]
169 |                 _      -> error $ "IPow applied at wrong type" 
170 |     go IBitwiseAnd = [cexp| $x & $y |]
171 |     go IBitwiseOr = [cexp| $x | $y |]
172 |     go IBitwiseXor = [cexp| $x ^ $y |]
173 |     go IShiftL = [cexp| $x << $y |]
174 |     go IShiftR = [cexp| $x >> $y |]
175 | compileExp (IUnOp op e t) = go op
176 |   where
177 |     x = compileExp e
178 |     go IBitwiseNeg = [cexp| ~$x|]
179 |     go INot        = [cexp| !$x|]
180 |     go IGetX       = [cexp| $x.x|]
181 |     go IGetY       = [cexp| $x.y|]
182 |     go IGetZ       = [cexp| $x.z|]
183 |     go IGetW       = [cexp| $x.w|]
184 |     
185 | compileExp (IFunCall name es t) = [cexp| $fc |]
186 |   where
187 |     es' = map compileExp es
188 |     fc  = [cexp| $id:(name)($args:(es')) |]
189 | 
190 | compileExp (ICast e t) = [cexp| ($ty:(compileType t)) $e' |]
191 |   where
192 |     e' = compileExp e
193 | 
194 | compileType :: T.Type -> C.Type
195 | compileType (Int8) = [cty| typename int8_t |]
196 | compileType (Int16) = [cty| typename int16_t |]
197 | compileType (Int32) = [cty| typename int32_t |]
198 | compileType (Int64) = [cty| typename int64_t |]
199 | compileType (Word8) = [cty| typename uint8_t |]
200 | compileType (Word16) = [cty| typename uint16_t |]
201 | compileType (Word32) = [cty| typename uint32_t |]
202 | compileType (Word64) = [cty| typename uint64_t |]
203 | compileType (Float) = [cty| float |]
204 | compileType (Double) = [cty| double |]
205 | 
206 | compileType (Vec2 Float) = [cty| float4|]
207 | compileType (Vec3 Float) = [cty| float3|]
208 | compileType (Vec4 Float) = [cty| float2|]
209 | 
210 | compileType (Vec2 Double) = [cty| double2|]
211 | 
212 | -- How does this interplay with my use of uint8_t etc. Here it is char!
213 | compileType (Vec2 Int8) = [cty| char2|]
214 | compileType (Vec3 Int8) = [cty| char3|]
215 | compileType (Vec4 Int8) = [cty| char4|]
216 |  
217 | compileType (Vec2 Int16) = [cty| short2|]
218 | compileType (Vec3 Int16) = [cty| short3|]
219 | compileType (Vec4 Int16) = [cty| short4|]
220 | 
221 | compileType (Vec2 Int32) = [cty| int2|]
222 | compileType (Vec3 Int32) = [cty| int3|]
223 | compileType (Vec4 Int32) = [cty| int4|]
224 | 
225 | compileType (Vec2 Word8) = [cty| uchar2|]
226 | compileType (Vec3 Word8) = [cty| uchar3|]
227 | compileType (Vec4 Word8) = [cty| uchar4|]
228 |  
229 | compileType (Vec2 Word16) = [cty| ushort2|]
230 | compileType (Vec3 Word16) = [cty| ushort3|]
231 | compileType (Vec4 Word16) = [cty| ushort4|]
232 | 
233 | compileType (Vec2 Word32) = [cty| uint2|]
234 | compileType (Vec3 Word32) = [cty| uint3|]
235 | compileType (Vec4 Word32) = [cty| uint4|]
236 | 
237 | 
238 | compileType (Shared t) = [cty| __local $ty:(compileType t) |]
239 | compileType (Pointer t) = [cty| $ty:(compileType t)* |]
240 | compileType (Volatile t) =  [cty| volatile $ty:(compileType t)|]
241 | compileType t = error $ "compileType: Not implemented " ++ show t
242 | 
243 | 
244 | ---------------------------------------------------------------------------
245 | -- Statement t to Stm
246 | ---------------------------------------------------------------------------
247 | 
248 | compileStm :: Config -> Statement t -> [C.Stm]
249 | compileStm c (SAssign name [] e) =
250 |    [[cstm| $(compileExp name) = $(compileExp e);|]]
251 | compileStm c (SAssign name [ix] e) = 
252 |    [[cstm| $(compileExp name)[$(compileExp ix)] = $(compileExp e); |]]
253 | compileStm c (SAtomicOp name ix atop) = 
254 |   case atop of
255 |     AtInc -> [[cstm| atomicInc(&$(compileExp name)[$(compileExp ix)],0xFFFFFFFF); |]]
256 |     AtAdd e -> [[cstm| atomicAdd(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]]
257 |     AtSub e -> [[cstm| atomicSub(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]]
258 |     AtExch e -> [[cstm| atomicExch(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]]
259 | 
260 | compileStm c (SCond be im) = [[cstm| if ($(compileExp be)) { $stms:body } |]]
261 |   where 
262 |     body = compileIM c im  -- (compileIM p c im)
263 | compileStm c (SSeqFor loopVar n im) = 
264 |     [[cstm| for (int $id:loopVar = 0; $id:loopVar < $(compileExp n); ++$id:loopVar) 
265 |               { $stms:body } |]]
266 |   where
267 |     body = compileIM c im -- (compileIM p c im)
268 | 
269 | 
270 | -- Just relay to specific compileFunction
271 | compileStm c a@(SForAll lvl n im) = compileForAll c a
272 | 
273 | compileStm c a@(SDistrPar lvl n im) = compileDistr c a 
274 | 
275 | compileStm c (SSeqWhile b im) =
276 |   [[cstm| while ($(compileExp b)) { $stms:body}|]]
277 |   where
278 |     body = compileIM c im 
279 | 
280 | -- compileStm c SSynchronize = [[cstm| __syncthreads(); |]]
281 | compileStm c SSynchronize = [[cstm| barrier(CLK_LOCAL_MEM_FENCE); |]]
282 |     
283 | 
284 | compileStm _ (SAllocate _ _ _) = []
285 | compileStm _ (SDeclare name t) = []
286 | 
287 | compileStm _ a = error  $ "compileStm: missing case "
288 | 
289 | ---------------------------------------------------------------------------
290 | -- DistrPar 
291 | ---------------------------------------------------------------------------
292 | compileDistr :: Config -> Statement t -> [C.Stm] 
293 | compileDistr c (SDistrPar Block n im) =  codeQ ++ codeR
294 |   -- New here is BLOCK virtualisation
295 |   where
296 |     cim = compileIM c im  -- ++ [[cstm| __syncthreads();|]]
297 |     
298 |     numBlocks = [cexp| get_num_groups(0) |]
299 |     
300 |     blocksQ = [cexp| $exp:(compileExp n) / $exp:numBlocks|]
301 |     blocksR = [cexp| $exp:(compileExp n) % $exp:numBlocks|] 
302 |     
303 |     codeQ = [[cstm| for (int b = 0; b < $exp:blocksQ; ++b) { $stms:bodyQ }|]]
304 |                 
305 |     bodyQ = [cstm| $id:("bid") = $exp:blockIdx_x * $exp:blocksQ + b;|] : cim  ++  
306 |             [[cstm| bid = $exp:blockIdx_x;|],
307 |              [cstm| barrier(CLK_LOCAL_MEM_FENCE);|]] -- yes no ? 
308 |          
309 |     codeR = [[cstm| bid = ($exp:numBlocks * $exp:blocksQ) + $exp:blockIdx_x;|], 
310 |              [cstm| if ($exp:blockIdx_x < $exp:blocksR) { $stms:cim }|],
311 |              [cstm| bid = $exp:blockIdx_x;|], 
312 |              [cstm| barrier(CLK_LOCAL_MEM_FENCE);|]] -- yes no ? 
313 |                     
314 | -- Can I be absolutely sure that 'n' here is statically known ? 
315 | -- I must look over the functions that can potentially create this IM. 
316 | -- Can make a separate case for unknown 'n' but generate worse code.
317 | -- (That is true for all levels)  
318 | compileDistr c (SDistrPar Warp (IWord32 n) im) = 
319 |   error "Compiling distribution over OpenCL wavefronts is not supported yet" 
320 |   -- codeQ  ++ codeR 
321 | 
322 |   -- Here the 'im' should be distributed over 'n'warps.
323 |   -- 'im' uses a warpID variable to identify what warp it is.
324 |   -- 'n' may be higher than the actual number of warps we have!
325 |   -- So GPU warp virtualisation is needed. 
326 |   -- where
327 |   --   cim = compileIM c im
328 | 
329 |   --   nWarps   = fromIntegral $ configThreadsPerBlock c `div` 32
330 |   --   numWarps = [cexp| $int:nWarps|] 
331 | 
332 |   --   warpsQ   = [cexp| $int:(n `div` nWarps)|]
333 |   --   warpsR   = [cexp| $int:(n `mod` nWarps)|]
334 |     
335 |   --   codeQ = [[cstm| for (int w = 0; w < $exp:warpsQ; ++w) { $stms:bodyQ } |]]
336 |     
337 |   --   bodyQ = [cstm| warpID = (threadIdx.x / 32) * $exp:warpsQ + w;|] : cim ++
338 |   --           --[cstm| warpID = w * $exp:warpsQ + (threadIdx.x / 32);|] : cim ++ 
339 |   --           [[cstm| warpID = threadIdx.x / 32;|]] 
340 | 
341 |   --   codeR = case (n `mod` nWarps)  of 
342 |   --            0 -> [] 
343 |   --            n -> [[cstm| warpID = ($exp:numWarps * $exp:warpsQ)+ (threadIdx.x / 32);|],
344 |   --                  [cstm| if (threadIdx.x / 32 < $exp:warpsR) { $stms:cim } |], 
345 |   --                  [cstm| warpID = threadIdx.x / 32; |], 
346 |   --                  [cstm| __syncthreads();|]]
347 | 
348 | ---------------------------------------------------------------------------
349 | -- ForAll 
350 | ---------------------------------------------------------------------------
351 | compileForAll :: Config -> Statement t -> [C.Stm]
352 | compileForAll c (SForAll Warp  (IWord32 n) im) =
353 |   error "Compiling distribution over OpenCL wavefronts is not supported yet"
354 |   --  codeQ ++ codeR
355 |   -- where
356 |   --   nt = 32
357 | 
358 |   --   q = n `div` nt
359 |   --   r = n `mod` nt
360 | 
361 |   --   cim = compileIM c im 
362 |     
363 |   --   codeQ =
364 |   --     case q of
365 |   --       0 -> []
366 |   --       1 -> cim
367 |   --       n -> [[cstm| for ( int vw = 0; vw < $int:q; ++vw) { $stms:body } |], 
368 |   --             [cstm| $id:("warpIx") = threadIdx.x % 32; |]]
369 |   --             --[cstm| __syncthreads();|]]
370 |   --            where 
371 |   --              body = [cstm|$id:("warpIx") = vw*$int:nt + (threadIdx.x % 32); |] : cim
372 |   --              --body = [cstm|$id:("warpIx") = (threadIdx.x % 32) * q + vw; |] : cim
373 | 
374 |   --   codeR = 
375 |   --     case r of 
376 |   --       0 -> [] 
377 |   --       n -> [[cstm| if ((threadIdx.x % 32) < $int:r) { 
378 |   --                           $id:("warpIx") = $int:(q*32) + (threadIdx.x % 32);  
379 |   --                           $stms:cim } |],
380 |   --                 -- [cstm| __syncthreads();|],
381 |   --                 [cstm| $id:("warpIx") = threadIdx.x % 32; |]]
382 | 
383 | compileForAll c (SForAll Block (IWord32 n) im) = goQ ++ goR 
384 |   where
385 |     cim = compileIM c im -- ++ [[cstm| __syncthreads();|]]
386 |    
387 |     nt = configThreadsPerBlock c 
388 | 
389 |     q  = n `quot` nt
390 |     r  = n `rem`  nt 
391 | 
392 |     -- q is the number full "passes" needed to cover the iteration
393 |     -- space given we have nt threads. 
394 |     goQ =
395 |       case q of
396 |         0 -> []
397 |         1 -> cim -- [cstm|$id:loopVar = threadIdx.x; |]:cim
398 |             --do
399 |             --  stm <- updateTid [cexp| threadIdx.x |]
400 |             --  return $ [cstm| $id:loopVar = threadIdx.x; |] : cim 
401 |         n -> [[cstm| for ( int i = 0; i < $int:q; ++i) { $stms:body } |], 
402 |                                                      --  __syncthreads(); } |], 
403 |               [cstm| $id:("tid") = $exp:threadIdx_x; |]]
404 |           --    [cstm| __syncthreads();|]]
405 |              where 
406 |                body = [cstm|$id:("tid") =  i*$int:nt + $exp:threadIdx_x; |] : cim
407 |    
408 |     -- r is the number of elements left. 
409 |     -- This generates code for when fewer threads are 
410 |     -- needed than available. (some threads shut down due to the conditional). 
411 |     goR = 
412 |       case (r,q) of 
413 |         (0,_) -> [] 
414 |         --(n,0) -> [[cstm| if (threadIdx.x < $int:n) { 
415 |         --                    $stms:cim } |]] 
416 |         (n,m) -> [[cstm| if ($exp:threadIdx_x < $int:n) { 
417 |                             $id:("tid") = $int:(q*nt) + $exp:threadIdx_x;  
418 |                             $stms:cim } |], 
419 |                   [cstm| $id:("tid") = $exp:threadIdx_x; |]]
420 | 
421 | compileForAll c (SForAll Grid n im) = error "compileForAll: Grid" -- cim
422 |   -- The grid case is special. May need more thought
423 |   -- 
424 |   -- The problem with this case is that
425 |   -- I need to come up with a blocksize (but without any guidance)
426 |   -- from the programmer.
427 |   -- Though! There is no way the programmer could provide any
428 |   -- such info ... 
429 |   where
430 |     cim = compileIM c im
431 | 
432 | -- compileForAll PlatformC c (SForAll lvl (IWord32 n) im) = go
433 | --   where
434 | --     body = compileIM PlatformC c im 
435 | --     go  = [ [cstm| for (int i = 0; i <$int:n; ++i) { $stms:body } |] ] 
436 |       
437 | 
438 | --------------------------------------------------------------------------- 
439 | -- CompileIM to list of Stm 
440 | --------------------------------------------------------------------------- 
441 | compileIM :: Config -> IMList a -> [C.Stm]
442 | compileIM conf im = concatMap ((compileStm conf) . fst) im
443 | 
444 | ---------------------------------------------------------------------------
445 | -- Generate entire Kernel 
446 | ---------------------------------------------------------------------------
447 | type Parameters = [(String,T.Type)]
448 | 
449 | compile :: Config -> String -> (Parameters,IMList a) -> C.Definition
450 | compile config kname (params,im)
451 |   = go  
452 |   where
453 |     stms = compileIM config im
454 |     
455 |     ps = compileParams params
456 |     go = [cedecl| extern "C" __kernel void $id:kname($params:ps) {$items:cudabody} |]
457 | 
458 |     cudabody = (if (configSharedMem config > 0)
459 |                 then [C.BlockDecl
460 |                       [cdecl| __local  typename uint8_t  sbase[$uint:(configSharedMem config)] ; |]] 
461 |                 else []) ++
462 |                (if (usesGid im) 
463 |                 then [C.BlockDecl [cdecl| typename uint32_t gid = $exp:blockIdx_x * $exp:blockDim_x + $exp:threadIdx_x; |]]
464 |                 else []) ++ 
465 |                (if (usesBid im) 
466 |                 then [C.BlockDecl [cdecl| typename uint32_t bid = $exp:blockIdx_x; |]]
467 |                 else []) ++ 
468 |                (if (usesTid im) 
469 |                 then [C.BlockDecl [cdecl| typename uint32_t tid = $exp:threadIdx_x; |]]
470 |                 else []) ++
471 |                (if (usesWarps im) 
472 |                 then error "Warps not supported: OpenCL"
473 | --                    [C.BlockDecl
474 | --                      [cdecl| typename uint32_t warpID = threadIdx.x / 32; |],
475 | --                     C.BlockDecl
476 | --                      [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] 
477 |                 else []) ++
478 |                 -- All variables used will be unique and can be declared 
479 |                 -- at the top level 
480 |                 concatMap declares im ++ 
481 |                 -- Not sure if I am using language.C correctly. 
482 |                 -- Maybe compileSTM should create BlockStms ?
483 |                 -- TODO: look how Nikola does it. 
484 |                 map C.BlockStm stms
485 | 
486 |     cbody = -- add memory allocation 
487 |             map C.BlockStm stms
488 | 
489 | -- Declare variables. 
490 | declares :: (Statement t,t) -> [C.BlockItem]
491 | declares (SDeclare name t,_) = [C.BlockDecl [cdecl| $ty:(compileType t)  $id:name;|]]
492 | declares (SCond _ im,_) = concatMap declares im 
493 | declares (SSeqWhile _ im,_) = concatMap declares im
494 | declares (SForAll _ _ im,_) = concatMap declares im
495 | declares (SDistrPar _ _ im,_) = concatMap declares im
496 | declares (SSeqFor _ _  im,_) = concatMap declares im
497 | declares _ = []
498 | 
499 | 
500 | --------------------------------------------------------------------------- 
501 | -- Parameter lists for functions  (kernel head) 
502 | ---------------------------------------------------------------------------
503 | compileParams :: Parameters -> [C.Param]
504 | -- compileParams PlatformOpenCL = map go
505 | --   where
506 | 
507 | -- C or CUDA 
508 | compileParams = map go
509 |   where
510 |     go (name,Pointer t) = [cparam| global  $ty:(compileType (Pointer t)) $id:name |]
511 |     go (name, t)        = [cparam| $ty:(compileType t) $id:name |]
512 |     
513 | 
514 |  
515 | 
516 | ---------------------------------------------------------------------------
517 | -- Compile with shared memory arrays declared at top
518 | ---------------------------------------------------------------------------
519 | -- CODE DUPLICATION FOR NOW
520 | 
521 | compileDeclsTop :: Config -> [(String,((Word32,Word32),T.Type))] -> String -> (Parameters,IMList a) -> C.Definition
522 | compileDeclsTop config toplevelarrs kname (params,im)
523 |   = go 
524 |   where
525 |     stms = compileIM config im
526 |     
527 |     ps = compileParams params
528 |     go = [cedecl| extern "C" __kernel void $id:kname($params:ps) {$items:cudabody} |]
529 |     cudabody = (if (configSharedMem config > 0)
530 |                 -- then [BlockDecl [cdecl| extern volatile __shared__  typename uint8_t sbase[]; |]] 
531 |                 then [C.BlockDecl [cdecl| __local typename uint8_t sbase[$uint:(configSharedMem config)]; |]] 
532 |                 else []) ++
533 |                 --[BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]] ++
534 |                 --[BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |],
535 |                 --       BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] ++
536 | --                [BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]] ++
537 |                (if (usesGid im) 
538 |                 then [C.BlockDecl [cdecl| typename uint32_t gid = $exp:blockIdx_x * $exp:blockDim_x + $exp:threadIdx_x; |]]
539 |                 else []) ++ 
540 |                (if (usesBid im) 
541 |                 then [C.BlockDecl
542 |                        [cdecl| typename uint32_t bid = $exp:blockIdx_x; |]]
543 |                 else []) ++ 
544 |                (if (usesTid im) 
545 |                 then [C.BlockDecl
546 |                        [cdecl| typename uint32_t tid = $exp:threadIdx_x; |]]
547 |                 else []) ++
548 |                (if (usesWarps im) 
549 |                 then error "\"Warps\" no implemented in the OpenCL code-generator" 
550 |                     --[C.BlockDecl
551 |                     --   [cdecl| typename uint32_t warpID = threadIdx.x / 32; |],
552 |                     -- C.BlockDecl
553 |                     --   [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] 
554 |                 else []) ++
555 |                 -- declare all arrays used
556 |                 concatMap declareArr toplevelarrs ++
557 |                 -- All variables used will be unique and can be declared 
558 |                 -- at the top level 
559 |                 concatMap declares im ++ 
560 |                 -- Not sure if I am using language.C correctly. 
561 |                 -- Maybe compileSTM should create BlockStms ?
562 |                 -- TODO: look how Nikola does it. 
563 |                 map C.BlockStm stms
564 | 
565 |     cbody = -- add memory allocation 
566 |             map C.BlockStm stms
567 | 
568 | 
569 | declareArr :: (String, ((Word32,Word32),T.Type)) -> [C.BlockItem]
570 | declareArr (arr,((_,addr),t)) =
571 |   [C.BlockDecl [cdecl| $ty:(compileType (Shared t)) $id:arr = ( $ty:(compileType (Shared t)))(sbase + $int:addr);|]]
572 | 


--------------------------------------------------------------------------------
/Obsidian/Library.hs:
--------------------------------------------------------------------------------
  1 | {- Joel Svensson 2012..2017
  2 |    Mary Sheeran  2012
  3 | 
  4 |    Notes:
  5 |    2017-04-22: Cleanup
  6 |    2014-03-31: Merged Library and LibraryG
  7 |    2013-01-24: GlobPull nolonger exists
  8 |                GlobPush is Push Grid
  9 | 
 10 |    2013-01-08: Renamed GlobArray to GlobPush
 11 |    2013-01-02: Added toGlobArray and toGlobArrayN
 12 |    2012-12-10: Refactoring
 13 |                (adherence to new Array types and program types)
 14 | -}
 15 | 
 16 | {-# LANGUAGE FlexibleInstances,
 17 |              TypeSynonymInstances,
 18 |              ScopedTypeVariables,
 19 |              TypeFamilies,
 20 |              GADTs #-}
 21 | {-# LANGUAGE TypeOperators #-}
 22 | {-# LANGUAGE FlexibleContexts #-}
 23 | {-# LANGUAGE MultiParamTypeClasses #-}
 24 | {-# LANGUAGE UndecidableInstances #-}
 25 | 
 26 | module Obsidian.Library
 27 |        (logBaseI
 28 |        , reverse
 29 |        , splitAt
 30 |        , splitUp
 31 |        , coalesce
 32 |        , halve
 33 |        , evenOdds
 34 |        , evens
 35 |        , odds
 36 |        , everyNth
 37 |        , singleton
 38 |        , generate
 39 |        , last
 40 |        , first
 41 |        , take
 42 |        , drop
 43 |        , head
 44 |        , tail
 45 |        , fold1
 46 |        , shiftLeft
 47 |        , unzip
 48 |        , zip
 49 |        , unzip3
 50 |        , zip3
 51 |        , zipWith
 52 |        , zipWith3
 53 |        , pair
 54 |        , unpair
 55 |        , unsafeBinSplit
 56 |        , binSplit
 57 |        , concP
 58 |        , unpairP
 59 |        , load   -- RENAME THIS
 60 |        , store  -- RENAME THIS
 61 |        -- Hierarchy programming
 62 |        , asThread
 63 |        , asThreadMap
 64 |        , asGrid
 65 |        , asGridMap
 66 |        , AsWarp(..)
 67 |        , AsBlock(..)
 68 |        , liftPar -- generic hierarchy programming
 69 |        , liftSeq -- generic hierarchy programming
 70 |        , liftIn  -- generic hierarchy programming
 71 |        -- Repeat a program
 72 |        , rep
 73 | 
 74 |        -- Executing programs
 75 |        , ExecProgram(..)
 76 |        , execThread
 77 |        , execThread'
 78 |        , execWarp
 79 |        , execWarp'
 80 |        , execBlock
 81 |        , execBlock'
 82 | 
 83 |        -- Leftovers from past days
 84 |        , singletonPush
 85 |        , runPush
 86 |        )where
 87 | 
 88 | import Obsidian.Array
 89 | import Obsidian.Exp
 90 | import Obsidian.Program
 91 | import Obsidian.Data
 92 | import Obsidian.Mutable
 93 | 
 94 | import Control.Monad
 95 | 
 96 | import Data.Bits
 97 | import Data.Word
 98 | 
 99 | import Prelude hiding (splitAt,zipWith,replicate,reverse,unzip,zip,zip3,unzip3,zipWith3, last, take, drop, head, tail)
100 | 
101 | 
102 | ---------------------------------------------------------------------------
103 | -- Helper
104 | ---------------------------------------------------------------------------
105 | logBaseI :: Integral a => a -> a -> a
106 | logBaseI b x
107 |   = if x < b
108 |     then 0
109 |     else
110 |       let
111 |         l = 2 * logBaseI (b*b) x
112 |         doDiv x l = if x < b then l else doDiv (x`div`b) (l+1)
113 |       in
114 |        doDiv (x`div`(b^l)) l
115 | 
116 | 
117 | ---------------------------------------------------------------------------
118 | -- Reverse an array by indexing in it backwards
119 | ---------------------------------------------------------------------------
120 | 
121 | -- | Reverses a Pull array.
122 | reverse :: (Array array, ArrayLength array, ASize l) => array l a -> array l a
123 | reverse arr = ixMap (\ix -> (sizeConv m) - ix) arr
124 |     where m = n-1
125 |           n = len arr
126 | ---------------------------------------------------------------------------
127 | -- splitAt (name clashes with Prelude.splitAt)
128 | ---------------------------------------------------------------------------
129 | 
130 | -- | Splits a Pull array at a given point. Performs no bounds checks.
131 | splitAt :: (Integral i, ASize l) => i -> Pull l a -> (Pull l a, Pull l a)
132 | splitAt n arr = (mkPull m (\ix -> arr ! ix),
133 |                  mkPull  (len arr - m) (\ix -> arr ! (ix + pos)))
134 |   where pos = fromIntegral n
135 |         m   = fromIntegral n
136 | 
137 | -- | Splits a Pull array in the middle.
138 | halve :: ASize l => Pull l a -> (Pull l a, Pull l a)
139 | halve arr = splitAt n2 arr
140 |   where
141 |     n = len arr
142 |     n2 = n `div` 2
143 | 
144 | -- | Splits a Pull array into chunks of size n. Result is a Pull of Pull arrays.
145 | splitUp :: (ASize l, ASize s, Integral s) => s -> Pull l a -> Pull l (Pull s a)
146 | splitUp n arr {-(Pull m ixf)-} =
147 |   mkPull (len arr `div` fromIntegral n) $ \i ->
148 |     mkPull n $ \j -> arr ! (i * (sizeConv n) + j)                                               
149 | 
150 | -- | Same as @splitUp@ but also performs a permutation of the elements.
151 | coalesce :: ASize l
152 |          => Word32 -> Pull l a -> Pull l (Pull Word32 a)
153 | coalesce n arr =
154 |   mkPull s $ \i ->
155 |     mkPull n $ \j -> arr ! (i + (sizeConv s) * j)
156 |   where s = len arr `div` fromIntegral n
157 | 
158 | 
159 | ---------------------------------------------------------------------------
160 | -- elements at even indices to fst output, odd to snd.
161 | ---------------------------------------------------------------------------
162 | -- | Split a Pull array into its even and odd indexed elements.
163 | evenOdds :: ASize l => Pull l a -> (Pull l a, Pull l a)
164 | evenOdds arr = (mkPull (n-n2) (\ix -> arr ! (2*ix)) ,
165 |                 mkPull n2     (\ix -> arr ! (2*ix + 1)))
166 |   where
167 |     n  = len arr
168 |     n2 = div n 2
169 | -- | Extract the elements at even indices from a Pull array
170 | evens :: ASize l => Pull l a -> Pull l a
171 | evens = fst . evenOdds
172 | 
173 | -- | Extract the elements at odd indices from a Pull array
174 | odds :: ASize l => Pull l a -> Pull l a
175 | odds  = snd . evenOdds
176 | 
177 | ---------------------------------------------------------------------------
178 | -- everyNth
179 | ---------------------------------------------------------------------------
180 | -- | Extract every nth element from a Pull array.
181 | everyNth :: ASize l => Word32 -> Word32 -> Pull l a -> Pull l a
182 | everyNth n m arr = mkPull n' $ \ix -> arr ! (ix * (fromIntegral n) + fromIntegral m)
183 |   where
184 |     n' = len arr `div` (fromIntegral n)
185 | 
186 | 
187 | ---------------------------------------------------------------------------
188 | -- replicate
189 | ---------------------------------------------------------------------------
190 | -- | Generates a Pull array of length one, containing @a@.
191 | singleton :: (Array a, ASize l) => e -> a l e
192 | singleton a = replicate 1 a
193 | 
194 | -- | Generate a pull or push array using a function from Index to element.
195 | generate :: (Functor (a s), Array a, ASize s)
196 |          => s -> (EWord32 -> b) -> a s b
197 | generate n f = fmap f (iota n)
198 | ---------------------------------------------------------------------------
199 | -- last and first
200 | ---------------------------------------------------------------------------
201 | -- | Extract last element from a Pull array.
202 | last :: ASize l => Pull l a -> a
203 | last arr = arr ! fromIntegral ( len arr - 1)
204 | 
205 | -- | Extract the first element from a Pull array.
206 | first :: ASize l => Pull l a -> a
207 | first arr = arr ! 0
208 | 
209 | ---------------------------------------------------------------------------
210 | -- Take and Drop (what about strange sizes ? fix)
211 | ---------------------------------------------------------------------------
212 | -- | Take the first @n@ elements from a Pull array
213 | take :: ASize l => l -> Pull l a -> Pull l a
214 | take n arr = setSize n arr
215 | 
216 | -- | Drop the first @n@ elements from a Pull array
217 | drop :: ASize l => l -> Pull l a -> Pull l a
218 | drop n arr = setSize (len arr - n) $ ixMap (\ix -> ix + sizeConv n) arr
219 | 
220 | ---------------------------------------------------------------------------
221 | -- Head and Tail  on pull arrays
222 | ---------------------------------------------------------------------------
223 | 
224 | head :: ASize l =>  Pull l a -> a
225 | head arr = arr ! 0
226 | 
227 | tail :: ASize l => Pull l a -> Pull l a
228 | tail = drop 1
229 | 
230 | 
231 | ---------------------------------------------------------------------------
232 | -- fold (sequential , unrolled)
233 | ---------------------------------------------------------------------------
234 | -- | Fold a nonempty pull array using a given operator. The result a singleton array (push or pull). 
235 | fold1 :: Array a => (e -> e -> e) -> Pull Word32 e -> a Word32 e
236 | fold1  f arr = replicate 1
237 |                $ foldl1 f [arr ! (fromIntegral i) | i <- [0..(n-1)]]
238 |   where n = len arr
239 | 
240 | 
241 | ---------------------------------------------------------------------------
242 | -- Shift arrays
243 | ---------------------------------------------------------------------------
244 | shiftLeft :: ASize l =>  Word32 -> Pull l a -> Pull l a
245 | shiftLeft dist arr = setSize (len arr - (fromIntegral dist))
246 |                       $ ixMap (\ix -> ix + (fromIntegral dist)) arr
247 | 
248 | ---------------------------------------------------------------------------
249 | -- zipp unzipp
250 | ---------------------------------------------------------------------------
251 | -- | Unzip implemented on Pull arrays
252 | unzip :: ASize l =>  Pull l (a,b) -> (Pull l a, Pull l b)
253 | unzip arr = (mkPull (len arr) (\ix -> fst (arr ! ix)) ,
254 |              mkPull (len arr) (\ix -> snd (arr ! ix)) )
255 | 
256 | -- | Zip implemented on Pull arrays
257 | zip :: ASize l => Pull l a -> Pull l b -> Pull l (a, b)
258 | zip arr1 arr2 = mkPull (min (len arr1) (len arr2))
259 |                      $ \ix -> (arr1 ! ix, arr2 ! ix)
260 | 
261 | -- | Unzip tripples.
262 | unzip3 :: ASize l => Pull l (a,b,c)
263 |            -> (Pull l a, Pull l b, Pull l c)
264 | unzip3 arr = (fmap (\(x,_,_) -> x) arr,
265 |                fmap (\(_,y,_) -> y) arr,
266 |                fmap (\(_,_,z) -> z)  arr)
267 | 
268 | -- | Zip three arrays
269 | zip3 :: ASize l
270 |         => Pull l a
271 |         -> Pull l b
272 |         -> Pull l c
273 |         -> Pull l (a,b,c)
274 | zip3 arr1 arr2 arr3 =
275 |   mkPull (minimum [len arr1, len arr2, len arr3])
276 |   (\ix -> (arr1 ! ix, arr2 ! ix, arr3 ! ix))
277 | 
278 | 
279 | -- | Perform elementwise operation.
280 | zipWith :: ASize l => (a -> b -> c) -> Pull l a -> Pull l b -> Pull l c
281 | zipWith op a1 a2 =
282 |   mkPull (min (len a1) (len a2))
283 |   (\ix -> (a1 ! ix) `op` (a2 ! ix))
284 | 
285 | -- | Perform elementwise operation.
286 | zipWith3 :: ASize l => (a -> b -> c-> d) -> Pull l a -> Pull l b -> Pull l c -> Pull l d
287 | zipWith3 f a1 a2 a3 =
288 |   mkPull (minimum [len a1,len a2,len a3]) $
289 |     \ix -> f (a1 ! ix)  (a2 ! ix) (a3 ! ix)
290 | 
291 | 
292 | ---------------------------------------------------------------------------
293 | -- pair
294 | ---------------------------------------------------------------------------
295 | -- | Pair up consecutive elements in a Pull array.
296 | pair :: ASize l => Pull l a -> Pull l (a,a)
297 | pair arr =
298 |   mkPull n' (\ix -> (arr ! (ix*2),arr ! (ix*2+1)))
299 |   where
300 |     n' = len arr `div` 2
301 | 
302 | 
303 | -- | Flatten a Pull array of pairs.
304 | unpair :: ASize l => Choice a => Pull l (a,a) -> Pull l a
305 | unpair arr = 
306 |     let n = len arr
307 |     in  mkPull (2*n) (\ix -> ifThenElse ((mod ix 2) ==* 0)
308 |                                   (fst (arr ! (ix `shiftR` 1)))
309 |                                   (snd (arr ! (ix `shiftR` 1))))
310 | 
311 | -- | Triple up consecutive elements in a Pull array.
312 | triple :: ASize l => Pull l a -> Pull l (a,a,a)
313 | triple arr =
314 |   mkPull (len arr `div` 3) $ \ix ->
315 |     (arr ! (ix*3), arr ! (ix*3+1), arr ! (ix*3+2))
316 | 
317 | -- | Flatten a Pull array of triples.
318 | untriple :: ASize l => Choice a => Pull l (a,a,a) -> Pull l a
319 | untriple arr =
320 |   mkPull (3*len arr) $ \ix ->
321 |     let (k,j) = divMod ix 3
322 |         (a0,a1,a2) = arr ! k
323 |     in  ifThenElse (j ==* 0) a0 $
324 |         ifThenElse (j ==* 1) a1 a2
325 | 
326 | 
327 | -- | Quadruple up consecutive elements in a Pull array.
328 | quadruple :: ASize l => Pull l a -> Pull l (a,a,a,a)
329 | quadruple arr =
330 |   mkPull (len arr `div` 4) $ \ix ->
331 |     (arr ! (ix*4), arr ! (ix*4+1), arr ! (ix*4+2), arr ! (ix*4+3))
332 | 
333 | -- | Flatten a Pull array of triples.
334 | unquadruple :: ASize l => Choice a => Pull l (a,a,a,a) -> Pull l a
335 | unquadruple = unpair . unpair . fmap (\(a0,a1,a2,a3) -> ((a0,a1), (a2,a3)))
336 | 
337 | 
338 | 
339 | ---------------------------------------------------------------------------
340 | -- twoK (untested for proper functionality)
341 | ---------------------------------------------------------------------------
342 | 
343 | -- | Recursively split an array in the middle. Apply an array to array computation
344 | --   on each part. @binSplit 3@ divides the array into 8 pieces.
345 | --   UNSAFE
346 | unsafeBinSplit :: Int
347 |            -> (Pull Word32 a -> Pull Word32 b)
348 |            -> Pull Word32 a
349 |            -> Pull Word32 b
350 | unsafeBinSplit = twoK
351 | 
352 | binSplit :: Data a
353 |             => Int
354 |             -> (Pull Word32 a -> Pull Word32 b)
355 |             -> Mutable Shared Word32 a
356 |             -> Pull Word32 b
357 | binSplit n f = unsafeBinSplit n f . mutableToPull
358 | 
359 | -- See if this should be specifically for Static size pull arrays
360 | twoK :: Int -> (Pull Word32 a -> Pull Word32 b) -> Pull Word32 a -> Pull Word32 b 
361 | twoK 0 f = f  -- divide 0 times and apply f
362 | twoK n f = \arr ->
363 |               let arr' = mkPull lt (\i -> (f (mkPull  m (\j -> (arr ! (g i j)))) ! (h i))) 
364 |                   m    = (len arr `shiftR` n)   --pow of two
365 |                   g i j = i .&. (fromIntegral (complement (m-1))) .|. j
366 |                   h i   = i .&. (fromIntegral (nl2-1))   -- optimize
367 | 
368 |                   nl2   = len (f (mkPull  m (\j -> arr ! variable "X")))
369 |                   lt    = nl2 `shiftL` n
370 |               in arr'
371 | 
372 | 
373 | 
374 | ---------------------------------------------------------------------------
375 | -- ***                          PUSHY LIBRARY                       *** ---
376 | ---------------------------------------------------------------------------
377 | 
378 | ---------------------------------------------------------------------------
379 | -- Concatenate on Push arrays
380 | ---------------------------------------------------------------------------
381 | 
382 | -- | Concatenate two push arrays.
383 | concP :: ASize l
384 |       => Push t l a -> Push t l a -> Push t l a
385 | concP p1 p2  =
386 |   mkPush (n1 + n2) $ \wf ->
387 |   do
388 |     p1 <: wf
389 |     p2 <: \a i -> wf a (sizeConv n1 + i)
390 |  where
391 |    n1 = len p1
392 |    n2 = len p2
393 | 
394 | -- | Flatten a Pull array of pairs. Result is a push array
395 | unpairP :: ASize l => Choice a => Push t l (a,a) -> Push t l a
396 | unpairP arr =
397 |   mkPush (2 * len arr) $ \ wf ->
398 |   do
399 |     -- even iterations
400 |     arr <: \ (a,_) i -> wf a ((sizeConv i) `shiftL` 1)
401 |     -- Odd iterations
402 |     arr <: \ (_,b) i -> wf b ((sizeConv i) `shiftL` 1 + 1)
403 | 
404 | 
405 | ---------------------------------------------------------------------------
406 | -- load / Store
407 | ---------------------------------------------------------------------------
408 | load :: Word32 -> Pull Word32 a -> Push Block Word32 a
409 | load n arr =
410 |   mkPush m (\wf ->
411 |   forAll (fromIntegral n') (\tid ->
412 |   do
413 |     seqFor (fromIntegral n) (\ix ->
414 |       wf (arr ! (tid + (ix*fromIntegral n'))) (tid + (ix*fromIntegral n')))))
415 | 
416 |   where
417 |     m = len arr
418 |     n' = m `div` n
419 | 
420 | store :: Word32 -> SPull a -> SPush Block a
421 | store = load
422 | 
423 | 
424 | -- ########################################################################
425 | --
426 | -- Programming the Hierarchy
427 | --
428 | -- ########################################################################
429 | 
430 | -- "Compute as" family of functions
431 | 
432 | liftPar :: ASize l => Pull l (SPush t a) -> Push (Step t) l a
433 | liftPar = pConcat
434 | 
435 | liftSeq :: ASize l => Pull l (SPush t a) -> Push t l a
436 | liftSeq = sConcat
437 | 
438 | liftIn :: (t *<=* Block, ASize l)
439 |       => Pull l (SPush Thread b)
440 |       -> Push t l b
441 | liftIn = tConcat
442 | 
443 | ---------------------------------------------------------------------------
444 | -- AsBlock
445 | ---------------------------------------------------------------------------
446 | -- Rename this to asBlock, asGrid, asThread
447 | class (t *<=* Block) => AsBlock t where
448 |   asBlock :: SPull (SPush t a) ->
449 |              SPush Block a
450 |   asBlockMap :: (a -> SPush t b)
451 |               -> SPull a
452 |               -> SPush Block b
453 | 
454 | instance AsBlock Thread where
455 |   asBlock = tConcat
456 |   asBlockMap f = tConcat . fmap f
457 | 
458 | instance AsBlock Warp where
459 |   asBlock = pConcat
460 |   asBlockMap f = pConcat . fmap f
461 | 
462 | instance AsBlock Block where
463 |   asBlock = sConcat
464 |   asBlockMap f = sConcat . fmap f
465 | 
466 | ---------------------------------------------------------------------------
467 | -- AsWarp
468 | ---------------------------------------------------------------------------
469 | class (t *<=* Warp) => AsWarp t where
470 |   asWarp :: SPull (SPush t a) ->
471 |             SPush Warp a
472 |   asWarpMap :: (a -> SPush t b)
473 |                -> SPull a
474 |                -> SPush Warp b
475 | 
476 | 
477 | instance AsWarp Thread where
478 |   asWarp = tConcat
479 |   asWarpMap f = tConcat . fmap f
480 | 
481 | instance AsWarp Warp where
482 |   asWarp = sConcat
483 |   asWarpMap f = sConcat . fmap f
484 | 
485 | ---------------------------------------------------------------------------
486 | -- LiftThread
487 | ---------------------------------------------------------------------------
488 | 
489 | asThread :: ASize l
490 |            => Pull l (SPush Thread b)
491 |            -> Push Thread l b
492 | asThread = tConcat
493 | 
494 | asThreadMap :: (a -> SPush Thread b)
495 |                -> SPull a
496 |                -> SPush Thread b
497 | asThreadMap f = tConcat . fmap f
498 | 
499 | 
500 | ---------------------------------------------------------------------------
501 | -- AsGrid
502 | ---------------------------------------------------------------------------
503 | 
504 | asGrid :: ASize l => Pull l (SPush Block a)
505 |          -> Push Grid l a
506 | asGrid = pConcat
507 | 
508 | asGridMap :: ASize l => (a -> SPush Block b)
509 |             -> Pull l a
510 |             -> Push Grid l b
511 | asGridMap f = pConcat . fmap f
512 | 
513 | 
514 | ---------------------------------------------------------------------------
515 | -- Repeat a program
516 | ---------------------------------------------------------------------------
517 | 
518 | -- | Repeat a program (iterate it)
519 | rep :: Word32 -> (a -> Program t a) -> a -> Program t a
520 | rep 0 _ a = return a
521 | rep n prg a = do
522 |   b <- rep (n-1) prg a
523 |   prg b
524 | 
525 | ---------------------------------------------------------------------------
526 | -- RunPush
527 | ---------------------------------------------------------------------------
528 | 
529 | class ExecProgram t a  where
530 |   exec :: Data e
531 |         => Program t (a Word32 e)
532 |         -> Push t Word32 e
533 | 
534 | instance (t *<=* Block) => ExecProgram t Pull where
535 |   exec = runPush . liftM push
536 | 
537 | -- Here we also want the type error behaviour.
538 | -- It is a type error to try to "execute" a push t at any level different from t
539 | instance (t ~ t1) => ExecProgram t (Push t1) where
540 |   exec = runPush
541 | 
542 | execThread :: (ExecProgram Thread a, Data e)
543 |         => Program Thread (a Word32 e)
544 |         -> Push Thread Word32 e
545 | execThread = exec
546 | 
547 | execThread' :: Data a => Program Thread a -> SPush Thread a
548 | execThread' = singletonPush
549 | 
550 | execBlock :: (ExecProgram Block a, Data e)
551 |           => Program Block (a Word32 e)
552 |           -> Push Block Word32 e
553 | execBlock = exec
554 | 
555 | execBlock' :: Data a => Program Block a -> SPush Block a
556 | execBlock' = singletonPush
557 | 
558 | execWarp :: (ExecProgram Warp a, Data e)
559 |          => Program Warp (a Word32 e)
560 |          -> Push Warp Word32 e
561 | execWarp = exec
562 | 
563 | execWarp' :: Data a => Program Warp a -> SPush Warp a
564 | execWarp' = singletonPush
565 | 
566 | 
567 | -- | Fuses the program that computes a Push array into the Push array.
568 | runPush :: Program t (Push t s a) -> Push t s a
569 | runPush prg =
570 |   mkPush n $ \wf -> do
571 |     parr <- prg
572 |     parr <: wf
573 |     -- It is a bit scary that I need to "evaluate" programs here.
574 |   where n = len $ fst $ runPrg 0 prg
575 | 
576 | -- | Lifts @runPush@ to one input functions.
577 | runPush1 :: (a -> Program t (Push t s b)) -> a -> Push t s b
578 | runPush1 f a = runPush (f a)
579 | 
580 | -- | Lifts @runPush@ to two input functions.
581 | runPush2 :: (a -> b -> Program t (Push t s c)) -> a -> b -> Push t s c
582 | runPush2 f a b = runPush (f a b)
583 | 
584 | -- | Converts a program computing a pull Array to a Push array
585 | runPull :: (t *<=* Block, ASize s) => Program t (Pull s a) -> Push t s a
586 | runPull = runPush . liftM push
587 | 
588 | -- | Lifts @runPull@ to one input functions.
589 | runPull1 :: (t *<=* Block, ASize s) => (a -> Program t (Pull s b)) -> a -> Push t s b
590 | runPull1 f a = runPull (f a)
591 | 
592 | -- | Lifts @runPull@ to two input functions.
593 | runPull2 :: (t *<=* Block, ASize s) => (a -> b -> Program t (Pull s c)) -> a -> b -> Push t s c
594 | runPull2 f a b = runPull (f a b)
595 | 
596 | ---------------------------------------------------------------------------
597 | --
598 | ---------------------------------------------------------------------------
599 | pushPrg :: (t *<=* Block) => Program t a -> SPush t a
600 | pushPrg = singletonPush
601 | 
602 | 
603 | ---------------------------------------------------------------------------
604 | -- Singleton push arrays
605 | ---------------------------------------------------------------------------
606 | 
607 | -- Danger! use only with Scalar a's
608 | -- -- | Create a singleton Push array.
609 | --singletonPush :: a -> SPush t a
610 | --singletonPush = singletonPushP . return
611 | 
612 | -- | Monadic version of @singleton@.
613 | singletonPush :: (t *<=* Block) => Program t a -> SPush t a
614 | singletonPush prg =
615 |   mkPush 1 $ \wf -> do
616 |     a <- prg
617 |     forAll 1 $ \_ ->
618 |       wf a 0
619 | 
620 | ---------------------------------------------------------------------------
621 | -- Old stuff that should nolonger be exported!
622 | --  * It is still used internally
623 | ---------------------------------------------------------------------------
624 | 
625 | -- | A way to enter into the hierarchy
626 | -- A bunch of Thread computations, spread across the threads of either
627 | -- a Warp, block or grid. (or performed sequentially in a single thread)
628 | tConcat :: (t *<=* Block, ASize l)
629 |            => Pull l (SPush Thread b)
630 |            -> Push t l b
631 | tConcat arr =
632 |   mkPush (n * fromIntegral s) $ \wf -> do
633 |     forAll (sizeConv n) $ \tid ->
634 |        let wf' a ix = wf a (tid * sizeConv s + ix)
635 |            p = arr ! tid -- f tid
636 |        in p <: wf'
637 |   where
638 |     n = len arr
639 |     s  = len (arr ! 0) --(f (variable "tid")) -- arr
640 | 
641 | -- | Variant of @tConcat@.
642 | tDistribute :: (t *<=* Block, ASize l)
643 |                => l
644 |                -> (EWord32 -> SPush Thread b)
645 |                -> Push t l b
646 | tDistribute n f = tConcat (mkPull n f)
647 | 
648 | 
649 | -- | Distribute work across the parallel resources at a given level of the GPU hiearchy
650 | pConcat :: ASize l => Pull l (SPush t a) -> Push (Step t) l a
651 | pConcat arr =
652 |   mkPush (n * fromIntegral rn) $ \wf ->
653 |     distrPar (sizeConv n) $ \bix ->
654 |       let p = arr ! bix
655 |           wf' a ix = wf a (bix * sizeConv rn + ix)
656 | 
657 |       in p <: wf'
658 |   where
659 |     n  = len arr
660 |     rn = len (arr ! 0) -- All arrays are same length
661 | 
662 | -- | Distribute work across the parallel resources at a given level of the GPU hierarchy
663 | pDistribute :: ASize l
664 |                => l
665 |                -> (EWord32 -> SPush t a)
666 |                -> Push (Step t) l a
667 | pDistribute n f = pConcat (mkPull n f)
668 | 
669 | -- | Sequential concatenation of a Pull of Push.
670 | sConcat :: ASize l => Pull l (SPush t a) -> Push t l a
671 | sConcat arr =
672 |   mkPush (n * fromIntegral rn) $ \wf ->
673 |   do
674 |     seqFor (sizeConv n) $ \bix ->
675 |       let p = arr ! bix
676 |           wf' a ix = wf a (bix * sizeConv rn + ix)
677 |       in p <: wf'
678 |   where
679 |     n  = len arr
680 |     rn = len $ arr ! 0
681 | 
682 | 
683 | -- | Variant of sConcat.
684 | sDistribute :: ASize l => l -> (EWord32 -> SPush t a) -> Push t l a
685 | sDistribute n f = sConcat (mkPull n f)
686 | 
687 | -- pUnCoalesce adapted from Niklas branch.
688 | -- | Combines work that was distributed in a Coalesced way.
689 | -- | Applies a permutation on stores.
690 | pUnCoalesce :: ASize l
691 |                => Pull l (SPush t a)
692 |                -> Push (Step t) l a
693 | pUnCoalesce arr =
694 |   mkPush (n * fromIntegral rn) $ \wf ->
695 |   distrPar (sizeConv n) $ \bix ->
696 |     let p = arr ! bix
697 |         wf' a ix = wf a (bix * sizeConv rn + ix)
698 |     in p <: (g wf')
699 |   where
700 |     n  = len arr
701 |     rn = len $ arr ! 0
702 |     s  = sizeConv rn
703 |     g wf a i = wf a (i `div` s + (i`mod`s)*(sizeConv n))
704 | 


--------------------------------------------------------------------------------