├── .gitmodules ├── Tests ├── README.md └── Test1.hs ├── Setup.hs ├── .gitignore ├── Obsidian ├── DimSpec.hs ├── Globs.hs ├── Data.hs ├── Names.hs ├── Atomic.hs ├── Types.hs ├── CodeGen │ ├── OpenCLEmbedded.hs │ ├── CUDA.hs │ ├── Liveness.hs │ ├── Reify.hs │ ├── Memory.hs │ ├── Program.hs │ ├── Memory2.hs │ ├── CompileIM.hs │ └── CompileIMOpenCLEmbedded.hs ├── Force.hs ├── SeqLoop.hs ├── Mutable.hs ├── Array.hs ├── Memory.hs ├── Program.hs └── Library.hs ├── README.org ├── .jenkins_script.sh ├── Obsidian.hs ├── LICENSE ├── .jenkins_runBench.sh └── Obsidian.cabal /.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Tests/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Tests directory -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | cabal-dev 3 | *.o 4 | *.hi 5 | *.chi 6 | *.chs.h 7 | .virthualenv 8 | *.cu 9 | *.cubin 10 | cabal.sandbox.config 11 | *# 12 | *~ 13 | .cabal-sandbox -------------------------------------------------------------------------------- /Obsidian/DimSpec.hs: -------------------------------------------------------------------------------- 1 | 2 | {- Joel Svensson 2012 -} 3 | 4 | module Obsidian.DimSpec (DimSpec (..)) where 5 | 6 | data DimSpec = X | Y | Z 7 | deriving (Eq,Ord,Show) 8 | -------------------------------------------------------------------------------- /Obsidian/Globs.hs: -------------------------------------------------------------------------------- 1 | 2 | {- Joel Svensson 2012 -} 3 | 4 | module Obsidian.Globs where 5 | 6 | 7 | import Data.Word 8 | 9 | --------------------------------------------------------------------------- 10 | -- Aliases 11 | 12 | type Name = String 13 | 14 | 15 | 16 | type NumThreads = Word32 -------------------------------------------------------------------------------- /Obsidian/Data.hs: -------------------------------------------------------------------------------- 1 | 2 | module Obsidian.Data where 3 | 4 | 5 | import Obsidian.Exp 6 | import Obsidian.Memory 7 | 8 | 9 | --------------------------------------------------------------------------- 10 | -- Data (should match up with storable instances for completeness) 11 | -- Use this to ensure nonnestednes where required 12 | --------------------------------------------------------------------------- 13 | class (Storable a, Choice a) => Data a 14 | instance Scalar a => Data (Exp a) 15 | instance (Data a, Data b) => Data (a,b) 16 | instance (Data a, Data b, Data c) => Data (a,b,c) 17 | instance (Data a, Data b, Data c, Data d) => Data (a,b,c,d) 18 | 19 | -------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | * Obsidian 2 | 3 | An embedded language for GPU kernel programming. 4 | 5 | * News: 6 | + Obsidian does not depend on the "CUDA" package anymore. 7 | If running on a GPU from haskell (by using the CUDA package) 8 | is desired, the Obsidian-Run-CUDA package should be installed as well 9 | 10 | * TODO: 11 | + "Up" the versions 12 | + Remove all example code that relies on CUDA (Move to another repo) 13 | 14 | 15 | * Versioning Policy (from Apr 14 2014 with the release of version 0.1.0.0) 16 | + A.B.C.D 17 | + A.B changes when API Changes (in a disruptive way) 18 | + C changes with additions that does not break existing programs 19 | + D Bugfixes and internal hacking. 20 | -------------------------------------------------------------------------------- /.jenkins_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -x 5 | 6 | source $HOME/rn_jenkins_scripts/acquire_ghc.sh 7 | 8 | which -a cabal 9 | cabal sandbox init 10 | cabal sandbox hc-pkg list 11 | 12 | 13 | # Need to pull down HsBencher 14 | # ---------------------------------------- 15 | # Need to know what dir this is. and if we 16 | # are using cabal sandbox. 17 | 18 | # echo Pull HSBencher from Github 19 | # git clone git@github.com:rrnewton/HsBencher 20 | # cabal install ./HsBencher/hsbencher/ 21 | # cabal install ./HsBencher/hsbnecher-fusion/ 22 | 23 | 24 | 25 | cabal update 26 | cabal install Obsidian.cabal 27 | 28 | #cabal install ./ ./Examples/Simple 29 | 30 | 31 | #cd Examples/Simple 32 | #cabal sandbox init --sandbox=../../.cabal-sandbox 33 | #cabal install . 34 | 35 | 36 | # cd Examples/ReductionTutorial 37 | #cabal sandbox init --sandbox=../../.cabal-sandbox 38 | #cabal install Reduce.cabal 39 | -------------------------------------------------------------------------------- /Tests/Test1.hs: -------------------------------------------------------------------------------- 1 | 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | module Main where 4 | 5 | 6 | import Prelude hiding (replicate) 7 | import Prelude as P 8 | 9 | 10 | import Obsidian 11 | import Obsidian.Run.CUDA.Exec 12 | 13 | 14 | import qualified Data.Vector.Storable as V 15 | import Control.Monad.State 16 | 17 | 18 | import Data.Int 19 | import Data.Word 20 | 21 | 22 | test1_local :: SPull EWord32 -> SPush Block EWord32 23 | test1_local arr = push arr 24 | 25 | test1 :: Word32 -> DPull EWord32 -> DPush Grid EWord32 26 | test1 n arr = pConcat (fmap test1_local (splitUp n arr)) 27 | 28 | 29 | runTest1 = 30 | withCUDA $ 31 | do 32 | kern <- capture 128 (test1 1024) 33 | 34 | (inputs :: V.Vector Word32) <- lift $ mkRandomVec (1024 * 1024) 35 | 36 | useVector inputs $ \i -> 37 | allocaVector (1024 * 1024) $ \o -> 38 | do 39 | o <== (1024,kern) <> i 40 | r <- peekCUDAVector o 41 | lift $ putStrLn $ show r 42 | -------------------------------------------------------------------------------- /Obsidian/Names.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | 3 | module Obsidian.Names (Names(..), mapNamesM_) where 4 | 5 | import Obsidian.Globs 6 | 7 | 8 | data Names a where 9 | Single :: Name -> Names a 10 | Tuple :: Names a -> Names b -> Names (a,b) 11 | Triple :: Names a -> Names b -> Names c -> Names (a,b,c) 12 | Quadruple :: Names a -> Names b -> Names c -> Names d -> Names (a,b,c,d) 13 | 14 | --------------------------------------------------------------------------- 15 | -- helpers 16 | --------------------------------------------------------------------------- 17 | mapNamesM_ :: Monad m => (Name -> m ()) -> Names a -> m () 18 | mapNamesM_ f (Single nom) = f nom 19 | mapNamesM_ f (Tuple n1 n2) = mapNamesM_ f n1 >> 20 | mapNamesM_ f n2 21 | mapNamesM_ f (Triple n1 n2 n3) = mapNamesM_ f n1 >> 22 | mapNamesM_ f n2 >> 23 | mapNamesM_ f n3 24 | mapNamesM_ f (Quadruple n1 n2 n3 n4) = mapNamesM_ f n1 >> 25 | mapNamesM_ f n2 >> 26 | mapNamesM_ f n3 >> 27 | mapNamesM_ f n4 28 | -------------------------------------------------------------------------------- /Obsidian.hs: -------------------------------------------------------------------------------- 1 | module Obsidian (module Obsidian.Array, 2 | module Obsidian.Program, 3 | module Obsidian.Exp, 4 | module Obsidian.Force, 5 | module Obsidian.Library, 6 | module Obsidian.Atomic, 7 | module Obsidian.SeqLoop, 8 | module Obsidian.Memory, 9 | module Obsidian.Mutable, 10 | module Obsidian.Data) where 11 | 12 | -- These are internal. 13 | -- module Obsidian.Types, 14 | -- module Obsidian.CodeGen.Reify, 15 | -- module Obsidian.CodeGen.CompileIM, 16 | -- module Obsidian.CodeGen.CUDA, 17 | -- module Obsidian.Names, 18 | 19 | import Obsidian.Program 20 | import Obsidian.Exp 21 | import Obsidian.Array 22 | import Obsidian.Library 23 | import Obsidian.Force 24 | import Obsidian.Atomic 25 | import Obsidian.SeqLoop 26 | import Obsidian.Memory 27 | import Obsidian.Mutable 28 | import Obsidian.Data 29 | 30 | --import Obsidian.Types 31 | --import Obsidian.CodeGen.Reify 32 | --import Obsidian.CodeGen.CompileIM 33 | --import Obsidian.CodeGen.CUDA 34 | --import Obsidian.Names 35 | -------------------------------------------------------------------------------- /Obsidian/Atomic.hs: -------------------------------------------------------------------------------- 1 | 2 | {-# LANGUAGE GADTs #-} 3 | 4 | {- Joel Svensson, 5 | Josef Svenningsson 6 | 2012 -} 7 | module Obsidian.Atomic where 8 | 9 | import Obsidian.Exp 10 | import Data.Word 11 | import Data.Int 12 | 13 | 14 | -- Anyone can extend these with new instances. 15 | -- Not good. (I need to think about how to separate 16 | -- low level CUDA concerns, from programmer level concerns) 17 | class Scalar a => AtomicInc a 18 | instance AtomicInc Word32 19 | 20 | class Scalar a => AtomicAdd a 21 | instance AtomicAdd Word32 22 | instance AtomicAdd Int32 23 | instance AtomicAdd Word64 24 | 25 | class Scalar a => AtomicSub a 26 | instance AtomicSub Word32 27 | instance AtomicSub Int32 28 | 29 | class Scalar a => AtomicExch a 30 | instance AtomicExch Word32 31 | instance AtomicExch Word64 32 | instance AtomicExch Int32 33 | 34 | 35 | 36 | --------------------------------------------------------------------------- 37 | -- Atomic operations 38 | --------------------------------------------------------------------------- 39 | data Atomic a where 40 | 41 | -- Cuda only allows AtomicInc on the Int type 42 | -- (todo: figure out if CUDA int is 32 or 64 bit) 43 | AtomicInc :: AtomicInc a => Atomic a 44 | 45 | AtomicAdd :: AtomicAdd a => Exp a -> Atomic a 46 | 47 | AtomicSub :: AtomicSub a => Exp a -> Atomic a 48 | 49 | AtomicExch :: AtomicExch a => Exp a -> Atomic a 50 | 51 | printAtomic AtomicInc = "atomicInc" 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | BSD3 Full Text: 3 | -------------------------------------------------------------------------------- 4 | 5 | Copyright (c) 2011-2014, Joel Svensson 6 | All rights reserved. 7 | 8 | Redistribution and use in source and binary forms, with or without 9 | modification, are permitted provided that the following conditions are met: 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | * Neither the name of the nor the 16 | names of its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 23 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 26 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /Obsidian/Types.hs: -------------------------------------------------------------------------------- 1 | {-| 2 | Module : Types 3 | Description : Type information, used internally by Obsídian. 4 | Copyright : (c) Joel Svensson, 2014, 2015 5 | License : BSD 6 | Maintainer : bo.joel.svensson@gmail.com 7 | Stability : experimental 8 | 9 | -} 10 | 11 | module Obsidian.Types where 12 | 13 | --------------------------------------------------------------------------- 14 | -- Types 15 | --------------------------------------------------------------------------- 16 | 17 | data Type 18 | -- The allowed scalar types 19 | = Bool 20 | | Int | Word -- A bit problematic since the size of 21 | -- of these are platform dependent 22 | 23 | -- Vector types supported by CUDA (Add more) 24 | | FloatV2 | FloatV3 | FloatV4 25 | | DoubleV2 26 | 27 | -- | Int8V2 | Int8V3 | Int8V4 28 | -- | Int16V2 | Int16V3 | Int16V4 29 | -- | Int32V2 | Int32V3 | Int32V4 30 | -- | Int64V2 31 | 32 | -- | Word8V2 | Word8V3 | Word8V4 33 | -- | Word16V2 | Word16V3 | Word16V4 34 | -- | Word32V2 | Word32V3 | Word32V4 35 | -- | Word64V2 36 | 37 | | Int8 | Int16 | Int32 | Int64 38 | | Word8 | Word16 | Word32 | Word64 39 | | Float | Double 40 | -- Vector Types 41 | | Vec2 Type | Vec3 Type | Vec4 Type 42 | 43 | 44 | -- Used by CUDA, C And OpenCL generators 45 | | Shared Type 46 | | Volatile Type -- For warp local computations. 47 | | Pointer Type -- Pointer to a @type@ 48 | | Global Type -- OpenCL thing 49 | | Local Type -- OpenCL thing 50 | deriving (Eq, Ord, Show) 51 | 52 | typeSize :: Num a => Type -> a 53 | typeSize Int8 = 1 54 | typeSize Int16 = 2 55 | typeSize Int32 = 4 56 | typeSize Int64 = 8 57 | typeSize Word8 = 1 58 | typeSize Word16 = 2 59 | typeSize Word32 = 4 60 | typeSize Word64 = 8 61 | typeSize Bool = 4 62 | typeSize Float = 4 63 | typeSize Double = 8 64 | typeSize (Shared t) = typeSize t 65 | typeSize t = error $ "typeSize: this is bad!: " ++ show t 66 | -------------------------------------------------------------------------------- /Obsidian/CodeGen/OpenCLEmbedded.hs: -------------------------------------------------------------------------------- 1 | 2 | {- Joel Svensson 2017 3 | 4 | Notes: 5 | 2017-04-23: Adding OpenCL module 6 | 7 | -} 8 | 9 | module Obsidian.CodeGen.OpenCLEmbedded (genKernel, genKernelParams, SharedMemConfig(..),ToProgram(..)) where 10 | 11 | 12 | import Obsidian.CodeGen.Reify 13 | import Obsidian.CodeGen.CompileIMOpenCLEmbedded 14 | import Obsidian.CodeGen.Liveness 15 | import Obsidian.CodeGen.Memory2 16 | import Text.PrettyPrint.Mainland 17 | import Text.PrettyPrint.Mainland.Class 18 | 19 | import qualified Data.Map as M 20 | import Data.Word 21 | --------------------------------------------------------------------------- 22 | -- Generate OpenCL kernels 23 | --------------------------------------------------------------------------- 24 | 25 | -- | Generates kernel code as a String 26 | -- while assuming there is 48KB of shared mem in 32 banks 27 | genKernel :: ToProgram prg 28 | => Word32 29 | -> String 30 | -> prg 31 | -> String 32 | genKernel = genKernelParams sm_conf 33 | where 34 | -- pretend we have 32 banks and 48kb shared mem 35 | sm_conf :: SharedMemConfig 36 | sm_conf = SharedMemConfig 49152 32 True 37 | 38 | 39 | 40 | -- | Generates kernel C code as a String, 41 | -- Programmer passes in the Shared memory configuration 42 | genKernelParams :: ToProgram prg 43 | => SharedMemConfig 44 | -> Word32 45 | -> String 46 | -> prg 47 | -> String 48 | genKernelParams sm_conf nt kn prg = prgStr 49 | where 50 | prgStr = pretty 75 51 | $ ppr 52 | $ compileDeclsTop 53 | (Config nt bytesShared) 54 | name_loc 55 | kn (a,im) 56 | (a,im) = toProgram_ 0 prg 57 | iml = computeLiveness im 58 | (m,mm) = memMapIM sm_conf iml (M.empty) 59 | bytesShared = size m 60 | name_loc = M.assocs mm 61 | -------------------------------------------------------------------------------- /Obsidian/CodeGen/CUDA.hs: -------------------------------------------------------------------------------- 1 | 2 | {- Joel Svensson 2013..2017 3 | 4 | Notes: 5 | 2017-04-23: compileDeclsTop nolonger takes "platform" arg. 6 | 2014-11-25: Making changes regarding memory allocation. 7 | 8 | -} 9 | 10 | module Obsidian.CodeGen.CUDA (genKernel, genKernelParams, SharedMemConfig(..)) where 11 | 12 | 13 | import Obsidian.CodeGen.Reify 14 | import Obsidian.CodeGen.CompileIM 15 | import Obsidian.CodeGen.Liveness 16 | import Obsidian.CodeGen.Memory2 17 | import Text.PrettyPrint.Mainland 18 | import Text.PrettyPrint.Mainland.Class 19 | 20 | import qualified Data.Map as M 21 | import Data.Word 22 | --------------------------------------------------------------------------- 23 | -- Generate CUDA kernels 24 | --------------------------------------------------------------------------- 25 | 26 | -- | Generates kernel C code as a String 27 | -- while assuming there is 48KB of shared mem in 32 banks 28 | genKernel :: ToProgram prg 29 | => Word32 30 | -> String 31 | -> prg 32 | -> String 33 | genKernel = genKernelParams sm_conf 34 | where 35 | -- pretend we have 32 banks and 48kb shared mem 36 | sm_conf :: SharedMemConfig 37 | sm_conf = SharedMemConfig 49152 32 True 38 | 39 | 40 | 41 | -- | Generates kernel C code as a String, 42 | -- Programmer passes in the Shared memory configuration 43 | genKernelParams :: ToProgram prg 44 | => SharedMemConfig 45 | -> Word32 46 | -> String 47 | -> prg 48 | -> String 49 | genKernelParams sm_conf nt kn prg = prgStr 50 | where 51 | prgStr = pretty 75 52 | $ ppr 53 | $ compileDeclsTop 54 | (Config nt bytesShared) 55 | name_loc 56 | kn (a,im) 57 | (a,im) = toProgram_ 0 prg 58 | iml = computeLiveness im 59 | (m,mm) = memMapIM sm_conf iml (M.empty) 60 | bytesShared = size m 61 | name_loc = M.assocs mm 62 | -------------------------------------------------------------------------------- /.jenkins_runBench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Begin running jenkins benchmark script for Obsidian..." 4 | set -x 5 | 6 | # CONVENTION: The working directory is passed as the first argument. 7 | CHECKOUT=$1 8 | shift 9 | 10 | if [ "$CHECKOUT" == "" ]; then 11 | echo "Replacing CHECKOUT with pwd" 12 | CHECKOUT=`pwd` 13 | fi 14 | 15 | if [ "$JENKINS_GHC" == "" ]; then 16 | export JENKINS_GHC=7.6.3 17 | fi 18 | if [ -f "$HOME/continuous_testing_setup/rn_jenkins_scripts/acquire_ghc.sh" ]; then 19 | source $HOME/continuous_testing_setup/rn_jenkins_scripts/acquire_ghc.sh 20 | fi 21 | if [ -f "$HOME/continuous_testing_setup/rn_jenkins_scripts/acquire_cuda.sh" ]; then 22 | source $HOME/continuous_testing_setup/rn_jenkins_scripts/acquire_cuda.sh 23 | fi 24 | 25 | echo "Running benchmarks remotely on server `hostname`" 26 | 27 | which cabal 28 | cabal --version 29 | 30 | which c2hs || echo ok 31 | c2hs --version || echo ok 32 | 33 | unset GHC 34 | unset GHC_PKG 35 | unset CABAL 36 | 37 | set -e 38 | 39 | #sandboxes are created by the Makefile for run_benchmarks.hs 40 | #this makefile also install Obsidian+hsbencher as a single cabal install line 41 | DIR=`pwd` 42 | echo $DIR 43 | if [ ! -d "HSBencher" ]; then 44 | git clone git@github.com:rrnewton/HSBencher 45 | fi 46 | 47 | (cd HSBencher; git submodule init; git submodule update) 48 | 49 | # Sandbox for benchmarks ? 50 | # -- Is created in the Makefile 51 | # ---------------------------------------- 52 | # cd "$CHECKOUT"/Examples/ReductionTutorial 53 | # cabal sandbox init --sandbox=../.cabal-sandbox 54 | 55 | 56 | # Actually install Obsidian into the sandbox 57 | # cabal install ./ 58 | 59 | 60 | # Switch to where the benchmarks are 61 | # ---------------------------------------- 62 | cd "$CHECKOUT"/Benchmarks 63 | rm -f run_benchmarks.exe 64 | make run_benchmarks.exe 65 | 66 | export TRIALS=1 67 | 68 | # Parfunc account, registered app in api console: 69 | CID=905767673358.apps.googleusercontent.com 70 | SEC=2a2H57dBggubW1_rqglC7jtK 71 | 72 | # Obsidian doc ID: 73 | TABID=1TsG043VYLu9YuU58EaIBdQiqLDUYcAXxBww44EG3 74 | # https://www.google.com/fusiontables/DataSource?docid=1TsG043VYLu9YuU58EaIBdQiqLDUYcAXxBww44EG3 75 | 76 | # Enable upload of benchmarking data to a Google Fusion Table: 77 | # ./run_benchmarks.exe --keepgoing --trials=$TRIALS --fusion-upload=$TABID --clientid=$CID --clientsecret=$SEC $* 78 | echo "Running Benchmarks" 79 | ./run_benchmarks.exe --keepgoing --trials=$TRIALS --fusion-upload --name=Obsidian_bench_data --clientid=$CID --clientsecret=$SEC $* 80 | -------------------------------------------------------------------------------- /Obsidian.cabal: -------------------------------------------------------------------------------- 1 | Name: Obsidian 2 | Version: 0.4.0.0 3 | 4 | 5 | License: BSD3 6 | License-file: LICENSE 7 | Stability: Beta 8 | Maintainer: Joel Svensson 9 | Author: Joel Svensson 10 | 11 | Copyright: Copyright (c) 2011-2017 Joel Svensson 12 | 13 | Synopsis: Embedded language for GPU Programming 14 | HomePage: https://github.com/svenssonjoel/Obsidian 15 | Description: 16 | Obsidian is an embedded language for general purpose programming targeting 17 | GPU's. 18 | 19 | Category: Language 20 | Cabal-Version: >=1.8 21 | Tested-With: GHC == 8.0.2 22 | 23 | build-type: Simple 24 | 25 | source-repository head 26 | type: git 27 | location: git://github.com/svenssonjoel/Obsidian.git 28 | 29 | ---------------------------------------------------------------------------------------------------- 30 | Library 31 | -- Stable packages, no upper bounds 32 | build-depends: base >= 4 && < 5 33 | , vector >= 0.10.9.1 34 | , mtl >= 2.0 35 | , containers >= 0.4.2.1 36 | , text >= 0.11.3.1 37 | , process >= 1.1.0.2 38 | , rdtsc == 1.3.0.0 39 | -- Less-stable packages, upper bounds on next major version: 40 | build-depends: value-supply >= 0.6 41 | , language-c-quote >= 0.10.1.3 42 | , mainland-pretty >= 0.2.6 43 | , mwc-random >= 0.13.1.1 44 | 45 | exposed-modules: Obsidian 46 | , Obsidian.CodeGen.Reify 47 | , Obsidian.CodeGen.CUDA 48 | , Obsidian.CodeGen.OpenCLEmbedded 49 | 50 | other-modules: Obsidian.Array 51 | , Obsidian.Atomic 52 | , Obsidian.DimSpec 53 | , Obsidian.Exp 54 | , Obsidian.Force 55 | , Obsidian.Globs 56 | , Obsidian.Library 57 | , Obsidian.Memory 58 | , Obsidian.Mutable 59 | , Obsidian.Names 60 | , Obsidian.Program 61 | , Obsidian.SeqLoop 62 | , Obsidian.Types 63 | , Obsidian.Data 64 | , Obsidian.CodeGen.CompileIM 65 | , Obsidian.CodeGen.CompileIMOpenCLEmbedded 66 | , Obsidian.CodeGen.Liveness 67 | , Obsidian.CodeGen.Memory2 68 | , Obsidian.CodeGen.Program 69 | 70 | 71 | 72 | GHC-Options: 73 | -- -O2 74 | 75 | -- include-dirs: 76 | 77 | 78 | -- extra-lib-dirs: 79 | 80 | -- Includes: 81 | -- Extra-libraries: 82 | 83 | 84 | -------------------------------------------------------------------------------- /Obsidian/Force.hs: -------------------------------------------------------------------------------- 1 | 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE FlexibleContexts #-} 5 | {-# LANGUAGE UndecidableInstances #-} 6 | {-# LANGUAGE MultiParamTypeClasses #-} 7 | {-# LANGUAGE TypeOperators #-} 8 | {-# LANGUAGE GADTs #-} 9 | ---------------------------------------- 10 | {- LANGUAGE KindSignatures -} 11 | {-# LANGUAGE TypeFamilies #-} 12 | 13 | {-# LANGUAGE DataKinds #-} 14 | {-# LANGUAGE ConstraintKinds #-} 15 | 16 | 17 | {- Joel Svensson 2012..2017 18 | 19 | Notes: 20 | 2014-03-28: Changed API. 21 | Not using Obsidian.Mutable currently, it needs more work. 22 | 2013-06-24: Changed code. uses Obsidian.Mutable now 23 | 2013-05-02: Removing things to do with forceG 24 | Removed the extensions (no longer needed) 25 | 2013-04-27: Something is broken. 26 | 2013-04-10: Looking at force and threads 27 | 2013-01-27: globArrays nolonger exist 28 | 2013-01-02: Added simple forceG for globArrays 29 | 2012-12-10: Edited 30 | 31 | -} 32 | 33 | module Obsidian.Force ( unsafeWritePush 34 | , unsafeWritePull 35 | , compute_ 36 | , computePull_ 37 | , ComputeAs(..) 38 | , Compute) where 39 | 40 | 41 | import Obsidian.Program 42 | import Obsidian.Exp 43 | import Obsidian.Array 44 | import Obsidian.Memory 45 | 46 | import Obsidian.Names 47 | import Obsidian.Data 48 | 49 | import Data.Word 50 | 51 | 52 | --------------------------------------------------------------------------- 53 | -- 54 | ------------------------------------------------------------------------- 55 | 56 | -- | Compute constraint. 57 | type Compute t = (Write t, t *<=* Block) 58 | 59 | -- | Arrays can be computed at level t if level t allows compute. 60 | class Compute t => ComputeAs t a where 61 | compute :: Data e => a Word32 e -> Program t (Pull Word32 e) 62 | 63 | instance Compute t => ComputeAs t Pull where 64 | compute = computePull_ 65 | 66 | {- 67 | The key to this instance is that the typechecker 68 | matches only against the head, ignoring the constraint. 69 | meaning that all variations of t, t1 is caught by this 70 | instance. Though, those where t and t1 are not equal 71 | a type error is the result (rather than a missing instance). 72 | 73 | This means that the constraint "Compute Block (Push Thread)" 74 | matches this instance, but is a type error. 75 | -} 76 | instance (t ~ t1, Compute t) => ComputeAs t (Push t1) where 77 | compute = compute_ 78 | 79 | compute_ :: (Data a, Compute t) 80 | => Push t Word32 a -> Program t (Pull Word32 a) 81 | compute_ arr = do 82 | rval <- unsafeWritePush False arr 83 | sync 84 | return rval 85 | 86 | computePull_ :: (t *<=* Block, Data a, Compute t) 87 | => Pull Word32 a -> Program t (Pull Word32 a) 88 | computePull_ arr = 89 | if (len arr <= 32) 90 | then do 91 | rval <- unsafeWritePush True parr 92 | return rval 93 | else do 94 | rval <- unsafeWritePush False parr 95 | sync 96 | return rval 97 | where parr = push arr 98 | 99 | 100 | --------------------------------------------------------------------------- 101 | -- Force local (requires static lengths!) 102 | --------------------------------------------------------------------------- 103 | 104 | class Write t where 105 | unsafeWritePush :: Storable a => Bool -> Push t Word32 a -> Program t (Pull Word32 a) 106 | 107 | -- What to do about volatile here? 108 | -- Ignoring that parameter for now. 109 | -- Thought: It does not matter. 110 | -- Thought: Is this function correct at all? 111 | -- What happens if a thread program allocates memory 112 | -- DONE: The above problem has been fixed! 113 | instance Write Thread where 114 | unsafeWritePush _ p = 115 | do 116 | (snames :: Names a) <- names "arr" 117 | 118 | -- Here I know that this pattern match will succeed 119 | let n = len p 120 | 121 | allocateArray snames n 122 | p <: threadAssignArray snames (variable "tid") n 123 | 124 | return $ threadPullFrom snames (variable "tid") n 125 | 126 | instance Write Warp where 127 | unsafeWritePush _ p = 128 | do 129 | let n = len p 130 | noms <- names "arr" 131 | allocateVolatileArray noms n 132 | 133 | p <: warpAssignArray noms (variable "warpID") n 134 | return $ warpPullFrom noms (variable "warpID") n 135 | 136 | instance Write Block where 137 | unsafeWritePush volatile p = 138 | do 139 | let n = len p 140 | noms <- names "arr" 141 | if (volatile) 142 | then allocateVolatileArray noms n 143 | else allocateArray noms n 144 | 145 | p <: assignArray noms 146 | return $ pullFrom noms n 147 | 148 | 149 | 150 | --------------------------------------------------------------------------- 151 | -- unsafe! 152 | --------------------------------------------------------------------------- 153 | unsafeWritePull :: (t *<=* Block, Write t, Storable a) => Bool -> Pull Word32 a -> Program t (Pull Word32 a) 154 | unsafeWritePull t = unsafeWritePush t . push 155 | 156 | -------------------------------------------------------------------------------- /Obsidian/CodeGen/Liveness.hs: -------------------------------------------------------------------------------- 1 | 2 | 3 | {- Joel Svensson 2012, 2013 4 | 5 | notes: 6 | added case for SeqFor Jan-21-2013 7 | 8 | -} 9 | module Obsidian.CodeGen.Liveness where 10 | 11 | import qualified Data.Set as Set 12 | 13 | import Obsidian.Exp 14 | import Obsidian.Globs 15 | import Obsidian.CodeGen.Program 16 | 17 | import Control.Monad.State 18 | 19 | 20 | --------------------------------------------------------------------------- 21 | -- 22 | --------------------------------------------------------------------------- 23 | type Liveness = Set.Set Name 24 | 25 | --------------------------------------------------------------------------- 26 | -- 27 | --------------------------------------------------------------------------- 28 | type IML = [(Statement Liveness,Liveness)] 29 | 30 | 31 | {- Plan: 32 | # Step through program from end to start 33 | # as soon as a new name is encountered, add it to the living set 34 | # when an "Allocate" is found, delete the name it allocated from the living set. 35 | 36 | Requirements: 37 | # All names are unique! 38 | 39 | TODO: Think more carefully about the ForAllBlocks case 40 | TODO: Can ixs contain array names ? 41 | (Most likely yes! think about the counting sort example) 42 | 43 | BUG: Arrays from the "outside" needs special 44 | treatment within a loop body. 45 | 46 | -} 47 | 48 | 49 | -- Nice type 50 | computeLiveness :: IMList a -> IML 51 | computeLiveness im = reverse $ evalState (cl (reverse im)) Set.empty 52 | 53 | -- Nice Type 54 | computeLiveness1 :: Liveness -> IMList a -> IML 55 | computeLiveness1 l im = reverse $ evalState (cl (reverse im)) l 56 | 57 | -- cl :: IM -> State Liveness IML 58 | cl im = mapM process im 59 | where 60 | safeHead [] = Set.empty 61 | safeHead (x:xs) = snd x 62 | 63 | -- Horrific type 64 | process :: (Statement a,a) -> State Liveness (Statement Liveness,Liveness) 65 | process (SAssign nom ixs e,_) = 66 | do 67 | s <- get 68 | let arrays = collectArraysI "arr" e 69 | arrays1 = collectArraysI "arr" nom 70 | living = Set.fromList (arrays1++arrays) `Set.union` s 71 | 72 | put living -- update state 73 | return (SAssign nom ixs e,living) 74 | process (SAtomicOp nom ix atop,_) = 75 | do 76 | s <- get 77 | let arrays = 78 | case atop of 79 | AtInc -> [] 80 | AtAdd e -> collectArraysI "arr" e 81 | AtSub e -> collectArraysI "arr" e 82 | AtExch e -> collectArraysI "arr" e 83 | arrays1 = collectArraysI "arr" nom 84 | living = Set.fromList (arrays1++arrays) `Set.union` s 85 | put living 86 | return (SAtomicOp nom ix atop, living) 87 | 88 | -- process (SAtomicOp n1 n2 ixs op,_) = 89 | -- do 90 | -- s <- get 91 | -- return (SAtomicOp n1 n2 ixs op,s) 92 | 93 | process (SAllocate name size t,_) = 94 | do 95 | modify (name `Set.delete`) 96 | s <- get 97 | return (SAllocate name size t,s) 98 | 99 | process (SDeclare name t,_) = 100 | do 101 | s <- get 102 | return (SDeclare name t,s) 103 | 104 | process (SSynchronize,_) = 105 | do 106 | s <- get 107 | return (SSynchronize,s) 108 | 109 | process (SCond bexp im,_) = 110 | do 111 | -- TODO: What should really happen here ? 112 | s <- get 113 | let iml = computeLiveness1 s im 114 | l = safeHead iml 115 | ns = s `Set.union` l 116 | put ns 117 | -- Is this correct ? Same question, all below 118 | return (SCond bexp iml,ns) 119 | 120 | 121 | -- This needs to change. 122 | -- arrays from the "outside" that 123 | -- are used within the loop needs special treatment. 124 | process (SSeqFor nom n im,_) = 125 | do 126 | -- get names alive after loop 127 | s <- get 128 | let iml = computeLiveness1 s im 129 | -- l is liveness info "leaving" im 130 | l = safeHead iml 131 | -- alive at these points are those things in l 132 | -- plus the things before (s) 133 | ns = s `Set.union` l 134 | put ns 135 | return (SSeqFor nom n iml,ns) 136 | process (SSeqWhile b im,_) = 137 | do 138 | s <- get 139 | let iml = computeLiveness1 s im 140 | l = safeHead iml 141 | ns = s `Set.union` l 142 | put ns 143 | return (SSeqWhile b iml,ns) 144 | process (SBreak,_) = 145 | do 146 | s <- get 147 | return (SBreak,s) 148 | 149 | process (SForAll lvl n im,_) = 150 | do 151 | s <- get 152 | let iml = computeLiveness1 s im 153 | l = safeHead iml 154 | ns = s `Set.union` l 155 | put ns 156 | return (SForAll lvl n iml,ns) 157 | process (SDistrPar lvl n im,_) = 158 | do 159 | s <- get 160 | let iml = computeLiveness1 s im 161 | l = safeHead iml 162 | ns = s `Set.union` l 163 | put ns 164 | return (SDistrPar lvl n iml,ns) 165 | -- process (SForAllBlocks n im,_) = 166 | -- do 167 | -- s <- get 168 | -- let iml = computeLiveness1 s im 169 | -- l = safeHead iml 170 | -- ns = s `Set.union` l 171 | -- put ns 172 | -- return (SForAllBlocks n iml,ns) 173 | -- process (SNWarps n im,_) = 174 | -- do 175 | -- s <- get 176 | -- let iml = computeLiveness1 s im 177 | -- l = safeHead iml 178 | -- ns = s `Set.union` l 179 | -- put ns 180 | -- return (SNWarps n iml,ns) 181 | -- process (SWarpForAll n im,_) = 182 | -- do 183 | -- s <- get 184 | -- let iml = computeLiveness1 s im 185 | -- l = safeHead iml 186 | -- ns = s `Set.union` l 187 | -- put ns 188 | -- return (SWarpForAll n iml,ns) 189 | 190 | 191 | -- process (SForAllThreads n im,_) = 192 | -- do 193 | -- s <- get 194 | -- let iml = computeLiveness1 s im 195 | -- l = safeHead iml 196 | -- ns = s `Set.union` l 197 | -- put ns 198 | -- return (SForAllThreads n iml,ns) 199 | 200 | -------------------------------------------------------------------------------- /Obsidian/SeqLoop.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | {-# LANGUAGE TypeOperators #-} 3 | {-# LANGUAGE FlexibleContexts #-} 4 | {-# LANGUAGE ConstraintKinds #-} 5 | {-# LANGUAGE GADTs #-} 6 | 7 | {- Joel Svensson 2013..2017 8 | 9 | sequential loops with state 10 | 11 | -} 12 | 13 | module Obsidian.SeqLoop where 14 | 15 | 16 | import Obsidian.Program 17 | import Obsidian.Exp 18 | import Obsidian.Array 19 | import Obsidian.Memory 20 | import Obsidian.Names 21 | import Obsidian.Data 22 | import Obsidian.Force 23 | import qualified Obsidian.Library as Lib 24 | 25 | -- TODO: Rename module to something better 26 | -- Or make part of Library.hs 27 | --------------------------------------------------------------------------- 28 | -- seqReduce 29 | --------------------------------------------------------------------------- 30 | -- | Sequential reduction of Pull array. Results in a for loop in generated code. 31 | seqReduce :: Storable a 32 | => (a -> a -> a) 33 | -> SPull a 34 | -> Program Thread a 35 | seqReduce op arr = 36 | do 37 | (ns :: Names a) <- names "v" 38 | allocateScalar ns 39 | 40 | assignScalar ns init 41 | 42 | SeqFor (n-1) $ \ ix -> 43 | do 44 | assignScalar ns (readFrom ns `op` (arr ! (ix + 1))) 45 | 46 | return $ readFrom ns 47 | where 48 | n = sizeConv$ len arr 49 | init = arr ! 0 50 | 51 | 52 | --------------------------------------------------------------------------- 53 | -- Iterate 54 | --------------------------------------------------------------------------- 55 | -- | iterate a function. Results in a for loop in generated code. 56 | seqIterate :: Storable a 57 | => EWord32 58 | -> (EWord32 -> a -> a) 59 | -> a 60 | -> Program Thread a 61 | seqIterate n f init = 62 | do 63 | (ns :: Names a) <- names "v" 64 | allocateScalar ns 65 | 66 | assignScalar ns init 67 | SeqFor n $ \ix -> 68 | do 69 | assignScalar ns $ f ix (readFrom ns) 70 | 71 | return $ readFrom ns 72 | 73 | --------------------------------------------------------------------------- 74 | -- 75 | --------------------------------------------------------------------------- 76 | -- | iterate a function until a condition holds. Results in a while loop 77 | -- with a break in the generated code. 78 | seqUntil :: Storable a 79 | => (a -> a) 80 | -> (a -> EBool) 81 | -> a 82 | -> Program Thread a 83 | seqUntil f p init = 84 | do 85 | (ns :: Names a) <- names "v" 86 | allocateScalar ns 87 | 88 | assignScalar ns init 89 | SeqWhile (p (readFrom ns)) $ 90 | do 91 | (tmp :: Names a) <- names "t" 92 | allocateScalar tmp 93 | assignScalar tmp (readFrom ns) 94 | assignScalar ns $ f (readFrom tmp) 95 | return $ readFrom ns 96 | 97 | --------------------------------------------------------------------------- 98 | -- Sequential scan 99 | --------------------------------------------------------------------------- 100 | -- | Sequential scan over the elements in a pull array. Results in a for loop 101 | -- in the generated code. 102 | seqScan :: Storable a 103 | => (a -> a -> a) 104 | -> SPull a 105 | -> SPush Thread a 106 | seqScan op arr {-(Pull n ixf)-} = 107 | mkPush n $ \wf -> do 108 | (ns :: Names a) <- names "v" -- (ixf 0) 109 | allocateScalar ns -- (ixf 0) 110 | assignScalar ns (arr ! 0) 111 | wf (readFrom ns) 0 112 | SeqFor (sizeConv (n-1)) $ \ix -> do 113 | assignScalar ns $ readFrom ns `op` (arr ! (ix + 1)) 114 | wf (readFrom ns) (ix+1) 115 | where 116 | n = len arr 117 | 118 | -- | Sequential scan that takes a carry-in. 119 | seqScanCin :: (Storable a, Storable b) 120 | => (b -> a -> b) 121 | -> b -- cin 122 | -> SPull a 123 | -> SPush Thread b 124 | seqScanCin op a arr = 125 | mkPush n $ \wf -> do 126 | (ns :: Names a) <- names "v" 127 | allocateScalar ns 128 | assignScalar ns a 129 | -- wf (readFrom ns) 0 130 | SeqFor (sizeConv n) $ \ix -> do 131 | assignScalar ns $ readFrom ns `op` (arr ! ix) 132 | wf (readFrom ns) ix 133 | where 134 | n = len arr 135 | 136 | -- | Sequential scan with separate types for input, output and accumulator. 137 | mapAccumL :: (ASize s, Storable a, Storable b, Storable acc) 138 | => (acc -> a -> (acc,b)) 139 | -> acc -- cin 140 | -> Pull s a 141 | -> Push Thread s b 142 | mapAccumL op acc arr {-(Pull n ixf)-} = 143 | mkPush n $ \wf -> do 144 | (ns :: Names a) <- names "v" -- (ixf 0) 145 | allocateScalar ns -- (ixf 0) 146 | assignScalar ns acc -- (ixf 0) 147 | -- wf (readFrom ns) 0 148 | SeqFor (sizeConv n) $ \ix -> do 149 | let (newAcc, b) = op (readFrom ns) (arr ! ix) 150 | -- order of writing matters, because readFrom is evaluated twice 151 | wf b ix 152 | assignScalar ns newAcc 153 | where 154 | n = len arr 155 | 156 | mapAccumR :: (ASize s, Storable a, Storable b, Storable acc) 157 | => (acc -> a -> (acc,b)) 158 | -> acc -- cin 159 | -> Pull s a 160 | -> Push Thread s b 161 | mapAccumR op acc = 162 | Lib.reverse . mapAccumL op acc . Lib.reverse 163 | 164 | 165 | --------------------------------------------------------------------------- 166 | -- sMapAccum 167 | -- Generalisation of the old sConcat functionality. 168 | 169 | sMapAccum :: (Compute t, Data acc, ASize l) 170 | => (acc -> Pull l a -> Program t (acc,Push t l b)) 171 | -> acc 172 | -> Pull l (Pull l a) 173 | -> Push t l b 174 | sMapAccum f acc arr = 175 | 176 | mkPush (n * fromIntegral rn) $ \wf -> 177 | do 178 | (noms :: Names acc) <- names "v" 179 | --(noms2 :: Names acc) <- names "APA" 180 | 181 | allocateSharedScalar noms 182 | -- allocateScalar noms2 183 | -- a single thread in the group, performs an assignment 184 | -- May need synchronization! 185 | singleThread $ assignScalar noms acc 186 | sync 187 | seqFor (sizeConv n) $ \bix -> do 188 | --singleThread $ assignScalar noms2 acc 189 | (newAcc, b) <- f (readFrom noms) (arr ! bix) 190 | singleThread $ assignScalar noms newAcc 191 | sync 192 | let wf' a ix = wf a (bix * sizeConv rn + ix) 193 | b <: wf' 194 | 195 | where 196 | n = len arr 197 | rn = len $ arr ! 0 198 | -------------------------------------------------------------------------------- /Obsidian/CodeGen/Reify.hs: -------------------------------------------------------------------------------- 1 | 2 | {-# LANGUAGE FlexibleInstances, 3 | OverlappingInstances, 4 | UndecidableInstances, 5 | FlexibleContexts, 6 | MultiParamTypeClasses, 7 | TypeOperators, 8 | TypeFamilies , 9 | ScopedTypeVariables 10 | #-} 11 | 12 | {- Joel Svensson 2012, 2013 13 | Niklas Ulvinge 2013 14 | 15 | 16 | -} 17 | 18 | 19 | module Obsidian.CodeGen.Reify (ToProgram(..)) where 20 | 21 | import Obsidian.Exp 22 | import Obsidian.Array 23 | import Obsidian.Mutable 24 | 25 | import Obsidian.Types 26 | import Obsidian.Program 27 | import Obsidian.Library 28 | import Obsidian.Globs 29 | import Obsidian.Names 30 | 31 | import qualified Obsidian.CodeGen.Program as CG 32 | import Obsidian.CodeGen.CompileIM 33 | 34 | import Data.Word 35 | 36 | --------------------------------------------------------------------------- 37 | -- New approach (hopefully) 38 | --------------------------------------------------------------------------- 39 | -- "reify" Haskell functions into CG.Programs 40 | 41 | 42 | --------------------------------------------------------------------------- 43 | -- 44 | --------------------------------------------------------------------------- 45 | class ToProgram a where 46 | toProgram :: Int -> a -> InputList a -> (Parameters,CG.IM) 47 | toProgram_ :: Int -> a -> (Parameters, CG.IM) 48 | 49 | 50 | typeOf_ a = typeOf (Literal a) 51 | 52 | 53 | --------------------------------------------------------------------------- 54 | -- Base cases 55 | --------------------------------------------------------------------------- 56 | 57 | -- This instance is incorrect 58 | instance ToProgram (GProgram ()) where 59 | -- toProgram i prg () = toProgram $ pJoin prg 60 | toProgram i prg () = ([],CG.compileStep1 prg) 61 | -- Needs to deal with GProgram () and GProgram (Push a), GProgram (Pull a) 62 | -- in different ways. 63 | 64 | toProgram_ i prg = ([],CG.compileStep1 prg) 65 | 66 | -- This instance might fix the problem with empty kernels being generated 67 | instance (ToProgram (Push Grid l a)) => ToProgram (GProgram (Push Grid l a)) where 68 | toProgram i p a = toProgram i (runPush p) a 69 | 70 | toProgram_ i p = toProgram_ i (runPush p) 71 | 72 | -- No ToProgram (GProgram (Pull a)) instance is needed. These programs 73 | -- cannot currently be created using the API. The reason is that GProgram (Pull a) 74 | -- implies a capability that GPUs do not have. The pulling from an array computed globally. 75 | -- That kind of computation can not be synced and its result would be undefined. 76 | 77 | 78 | instance Scalar a => ToProgram (Push Grid l (Exp a)) where 79 | toProgram i p a = 80 | let outT = Pointer $ typeOf_ (undefined :: a) 81 | outN = "output" ++ show i 82 | 83 | prg = p <: assignOut outN 84 | 85 | (inputs,im) = toProgram (i+1) prg a 86 | 87 | in (inputs++[(outN,outT)],im) 88 | 89 | where 90 | assignOut out a ix = Assign out [ix] a 91 | toProgram_ i p = toProgram i p () 92 | 93 | instance (Scalar a, Scalar b) => ToProgram (Push Grid l (Exp a,Exp b)) where 94 | toProgram i p a = 95 | let outT1 = Pointer $ typeOf_ (undefined :: a) 96 | outT2 = Pointer $ typeOf_ (undefined :: b) 97 | outN1 = "output" ++ show i 98 | outN2 = "output" ++ show (i+1) 99 | 100 | 101 | prg = p <: assignOut (outN1,outN2) 102 | 103 | (inputs,im) = toProgram (i+2) prg a 104 | 105 | in (inputs++[(outN1,outT1),(outN2,outT2)],im) 106 | where 107 | assignOut (o1,o2) (a,b) ix = 108 | do 109 | Assign o1 [ix] a 110 | Assign o2 [ix] b 111 | toProgram_ i p = toProgram i p () 112 | 113 | --------------------------------------------------------------------------- 114 | -- Recursive 115 | --------------------------------------------------------------------------- 116 | 117 | instance (ToProgram b, Scalar t) => ToProgram (Pull EWord32 (Exp t) -> b) where 118 | toProgram i f (a :- rest) = ((nom,Pointer t):(n,Word32):ins,prg) 119 | where 120 | (ins,prg) = toProgram (i+1) (f input) rest 121 | nom = "input" ++ show i 122 | n = "n" ++ show i 123 | lengthVar = variable n 124 | input = namedGlobal nom lengthVar 125 | t = typeOf_ (undefined :: t) 126 | toProgram_ i f = ((nom,Pointer t):(n,Word32):ins,prg) 127 | where 128 | (ins,prg) = toProgram_ (i+1) (f input) 129 | nom = "input" ++ show i 130 | n = "n" ++ show i 131 | lengthVar = variable n 132 | input = namedGlobal nom lengthVar 133 | t = typeOf_ (undefined :: t) 134 | 135 | instance (ToProgram b, Scalar t) => ToProgram (Pull Word32 (Exp t) -> b) where 136 | toProgram i f (a :- rest) = ((nom,Pointer t):ins,prg) 137 | where 138 | (ins,prg) = toProgram (i+1) (f input) rest 139 | nom = "input" ++ show i 140 | input = namedGlobal nom (len a) 141 | t = typeOf_ (undefined :: t) 142 | toProgram_ _ _ = error "toProgram_: static length" 143 | 144 | 145 | instance (ToProgram b, Scalar t) => ToProgram (Mutable Global EWord32 (Exp t) -> b) where 146 | toProgram i f (a :- rest) = ((nom,Pointer t):(n,Word32):ins,prg) 147 | where 148 | (ins,prg) = toProgram (i+1) (f input) rest 149 | nom = "input" ++ show i 150 | n = "n" ++ show i 151 | lengthVar = variable n 152 | input = namedMutable nom lengthVar 153 | t = typeOf_ (undefined :: t) 154 | toProgram_ i f = ((nom,Pointer t):(n,Word32):ins,prg) 155 | where 156 | (ins,prg) = toProgram_ (i+1) (f input) 157 | nom = "input" ++ show i 158 | n = "n" ++ show i 159 | lengthVar = variable n 160 | input = namedMutable nom lengthVar 161 | t = typeOf_ (undefined :: t) 162 | 163 | namedMutable :: Name -> s -> Mutable mloc s a 164 | namedMutable s v = Mutable v (Single s) 165 | undefinedMutable :: s -> Mutable mloc s a 166 | undefinedMutable v = Mutable v undefined 167 | 168 | 169 | 170 | instance (ToProgram b, Scalar t) => ToProgram ((Exp t) -> b) where 171 | toProgram i f (a :- rest) = ((nom,t):ins,prg) 172 | where 173 | (ins,prg) = toProgram (i+1) (f input) rest 174 | nom = "input" ++ show i 175 | input = variable nom -- namedGlobal nom (len a) 176 | t = typeOf_ (undefined :: t) 177 | toProgram_ i f = ((nom,t):ins,prg) 178 | where 179 | (ins,prg) = toProgram_ (i+1) (f input) 180 | nom = "input" ++ show i 181 | input = variable nom -- namedGlobal nom (len a) 182 | t = typeOf_ (undefined :: t) 183 | 184 | 185 | 186 | 187 | --------------------------------------------------------------------------- 188 | -- heterogeneous lists of inputs 189 | --------------------------------------------------------------------------- 190 | data head :- tail = head :- tail 191 | 192 | infixr 5 :- 193 | 194 | 195 | --------------------------------------------------------------------------- 196 | -- Function types to input list types. 197 | --------------------------------------------------------------------------- 198 | 199 | type family InputList a 200 | 201 | type instance InputList (a -> b) = a :- (InputList b) 202 | type instance InputList (Push Grid l b) = () 203 | type instance InputList (GProgram b) = () 204 | 205 | -- genKernelSM :: ToProgram prg => Word32 -> String -> prg -> (String, Word32) 206 | -- genKernelSM = genKernelSpecsNL 207 | 208 | -- genKernelSpecsNL :: ToProgram prg => Word32 -> String -> prg -> (String, Word32) 209 | -- genKernelSpecsNL nt kn prg = (prgStr,bytesShared) 210 | -- where 211 | -- prgStr = pretty 75 $ ppr $ compile PlatformCUDA (Config nt bytesShared) kn (a,rim) 212 | -- (a,im) = toProgram_ 0 prg 213 | -- iml = computeLiveness im 214 | -- (m,mm) = mmIM iml sharedMem (M.empty) 215 | -- bytesShared = size m 216 | -- rim = renameIM mm iml 217 | 218 | -------------------------------------------------------------------------------- /Obsidian/Mutable.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables, 2 | TypeFamilies, 3 | EmptyDataDecls, 4 | FlexibleInstances #-} 5 | 6 | {- Joel Svensson 2013, 2014 -} 7 | 8 | module Obsidian.Mutable ( Mutable(Mutable) 9 | , Shared 10 | , Global 11 | , MShared 12 | , MGlobal 13 | , newSharedMutable 14 | , writeToSync 15 | , writeTo 16 | , assignMutable 17 | , indexMutable 18 | , mutableToPull 19 | , atomicInc 20 | ) where 21 | 22 | 23 | 24 | import Obsidian.Exp 25 | import Obsidian.Program 26 | import Obsidian.Memory 27 | import Obsidian.Names 28 | import Obsidian.Array 29 | import Obsidian.Atomic 30 | 31 | import Data.Word 32 | 33 | {- 34 | Todo: Think about Global vs Shared. 35 | Todo: Add creation of mutable global arrays. 36 | 37 | Todo: Make mutable interface (atomic ops) very low-level 38 | 39 | 40 | TODO: Rethink. Have two sepparate types of mutable arrays. 41 | Also Skip the Type family magic if possible. 42 | Make both kinds of Mutable arrays an instance of Array 43 | -} 44 | 45 | --------------------------------------------------------------------------- 46 | -- Mutable arrays 47 | --------------------------------------------------------------------------- 48 | -- 49 | -- Global mutable arrays can only be passed as inputs to a function. 50 | -- Shared mutable arrays may be created using newS 51 | -- 52 | 53 | data Shared 54 | data Global 55 | 56 | -- A mutable array has an attached location. 57 | -- Either it recides in Global or in Shared memory. 58 | data Mutable mloc s a = Mutable s (Names a) 59 | 60 | type MShared a = Mutable Shared Word32 a 61 | type MGlobal a = Mutable Global EWord32 a 62 | 63 | instance ArrayLength (Mutable Shared) where 64 | len (Mutable n _) = n 65 | 66 | 67 | --------------------------------------------------------------------------- 68 | -- Create Mutable Shared memory arrays 69 | -- # allocates shared memory 70 | --------------------------------------------------------------------------- 71 | 72 | -- | Create a new Mutable array in shared memory 73 | newSharedMutable :: Storable a => SPush Block a -> Program Block (Mutable Shared Word32 a) 74 | newSharedMutable arr = do 75 | (snames :: Names a) <- names "arr" 76 | allocateArray snames n 77 | let mut = Mutable n snames 78 | writeTo mut arr 79 | return $ mut -- Mutable n snames 80 | where 81 | n = len arr 82 | 83 | --------------------------------------------------------------------------- 84 | -- forceTo & writeTo 85 | --------------------------------------------------------------------------- 86 | -- Much Hacking here 87 | 88 | -- | Write a Push array into a mutable array. 89 | -- There is no synchronisation inserted after the write 90 | writeTo :: Storable a 91 | => Mutable Shared Word32 a 92 | -> Push Block Word32 a 93 | -> Program Block () 94 | writeTo (Mutable n snames) p 95 | | n <= m = p <: assignArray snames 96 | | otherwise = error "WriteTo: Incompatible sizes" 97 | where 98 | m = len p 99 | 100 | 101 | -- Add forceTo with offsets (why? just thought it might be useful) 102 | -- | Write a Push array into a mutable array and sync 103 | writeToSync :: Storable a 104 | => Mutable Shared Word32 a 105 | -> Push Block Word32 a 106 | -> Program Block () 107 | writeToSync m arr = 108 | do 109 | writeTo m arr 110 | Sync 111 | 112 | --------------------------------------------------------------------------- 113 | -- Low level operations 114 | --------------------------------------------------------------------------- 115 | 116 | -- | Write a value into a storable array. 117 | assignMutable :: Storable a => Mutable loc l a -> EWord32 -> a -> Program Thread () 118 | assignMutable (Mutable _ snames) ix a = assignArray snames a ix 119 | 120 | indexMutable :: (ASize s, Storable a) => Mutable loc s a -> EWord32 -> a 121 | indexMutable mut ix = mutableToPull mut ! ix 122 | 123 | --------------------------------------------------------------------------- 124 | -- mutable to pull conversion 125 | --------------------------------------------------------------------------- 126 | 127 | -- | Convert a Mutable array to a Pull array 128 | mutableToPull :: (ASize s, Storable a) => Mutable l s a -> Pull s a 129 | mutableToPull (Mutable n snames) = pullFrom snames n 130 | 131 | 132 | 133 | --------------------------------------------------------------------------- 134 | -- Atomics 135 | --------------------------------------------------------------------------- 136 | -- | Increment atomically 137 | atomicInc :: forall mloc a s t . AtomicInc a 138 | => EWord32 139 | -> Mutable mloc s (Exp a) 140 | -> TProgram () 141 | atomicInc ix (Mutable n noms) = mapNamesM_ f noms 142 | where 143 | f nom = atomicOp nom ix (AtomicInc :: Atomic a) >> return () 144 | 145 | 146 | -- | Add atomically 147 | atomicAdd :: forall mloc a s. AtomicAdd a 148 | => EWord32 149 | -> Exp a 150 | -> Mutable mloc s (Exp a) 151 | -> TProgram () 152 | atomicAdd ix v (Mutable n noms) = mapNamesM_ f noms 153 | where 154 | f nom = atomicOp nom ix (AtomicAdd v) >> return () 155 | 156 | 157 | -- | Subtract atomically 158 | atomicSub :: forall mloc a s. AtomicSub a 159 | => EWord32 160 | -> Exp a 161 | -> Mutable mloc s (Exp a) 162 | -> TProgram () 163 | atomicSub ix v (Mutable n noms) = mapNamesM_ f noms 164 | 165 | where 166 | f nom = atomicOp nom ix (AtomicSub v) >> return () 167 | 168 | 169 | -- Special case ? No. 170 | atomicExch :: forall mloc a s. AtomicExch a 171 | => EWord32 172 | -> Exp a 173 | -> Mutable mloc s (Exp a) 174 | -> TProgram () 175 | atomicExch ix v (Mutable n (Single nom)) = f nom 176 | where 177 | f nom = atomicOp nom ix (AtomicExch v) 178 | 179 | 180 | {- 181 | 182 | --------------------------------------------------------------------------- 183 | atomicExch() 184 | 185 | int atomicExch(int* address, int val); 186 | unsigned int atomicExch(unsigned int* address, 187 | unsigned int val); 188 | unsigned long long int atomicExch(unsigned long long int* address, 189 | unsigned long long int val); 190 | float atomicExch(float* address, float val); 191 | 192 | --------------------------------------------------------------------------- 193 | atomicMin() 194 | 195 | int atomicMin(int* address, int val); 196 | unsigned int atomicMin(unsigned int* address, 197 | unsigned int val); 198 | unsigned long long int atomicMin(unsigned long long int* address, 199 | unsigned long long int val); 200 | 201 | --------------------------------------------------------------------------- 202 | atomicMax() 203 | 204 | int atomicMax(int* address, int val); 205 | unsigned int atomicMax(unsigned int* address, 206 | unsigned int val); 207 | unsigned long long int atomicMax(unsigned long long int* address, 208 | unsigned long long int val); 209 | 210 | 211 | --------------------------------------------------------------------------- 212 | atomicInc() 213 | 214 | unsigned int atomicInc(unsigned int* address, 215 | unsigned int val); 216 | 217 | --------------------------------------------------------------------------- 218 | atomicDec() 219 | 220 | unsigned int atomicDec(unsigned int* address, 221 | unsigned int val); 222 | 223 | --------------------------------------------------------------------------- 224 | atomicCAS() 225 | 226 | int atomicCAS(int* address, int compare, int val); 227 | unsigned int atomicCAS(unsigned int* address, 228 | unsigned int compare, 229 | unsigned int val); 230 | unsigned long long int atomicCAS(unsigned long long int* address, 231 | unsigned long long int compare, 232 | unsigned long long int val); 233 | 234 | --------------------------------------------------------------------------- 235 | atomicAnd() 236 | 237 | int atomicAnd(int* address, int val); 238 | unsigned int atomicAnd(unsigned int* address, 239 | unsigned int val); 240 | unsigned long long int atomicAnd(unsigned long long int* address, 241 | unsigned long long int val); 242 | 243 | --------------------------------------------------------------------------- 244 | atomicOr() 245 | 246 | int atomicOr(int* address, int val); 247 | unsigned int atomicOr(unsigned int* address, 248 | unsigned int val); 249 | unsigned long long int atomicOr(unsigned long long int* address, 250 | unsigned long long int val); 251 | --------------------------------------------------------------------------- 252 | atomicXor() 253 | 254 | int atomicXor(int* address, int val); 255 | unsigned int atomicXor(unsigned int* address, 256 | unsigned int val); 257 | unsigned long long int atomicXor(unsigned long long int* address, 258 | unsigned long long int val); 259 | 260 | -} 261 | -------------------------------------------------------------------------------- /Obsidian/Array.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiParamTypeClasses, 2 | FlexibleInstances, 3 | GADTs #-} 4 | {-# LANGUAGE TypeOperators #-} 5 | {-# LANGUAGE FlexibleContexts #-} 6 | 7 | {- Joel Svensson 2012..2017 8 | 9 | Notes: 10 | 2017-04-22: looking over, cleaning. 11 | 2014-04-08: Experimenting with API 12 | ---- OUTDATED ---- 13 | 2013-08-26: Experimenting with warp programs. 14 | These do not fit that well in established Idioms! 15 | TODO: Improve this situation. 16 | ---- OUTDATED ---- 17 | 2013-01-08: Removed number-of-blocks field from Distribs 18 | 2012-12-10: Drastically shortened. 19 | -} 20 | 21 | module Obsidian.Array (Pull, Push, SPull, DPull, SPush, DPush 22 | , pushApp 23 | , mkPull 24 | , mkPush 25 | , push 26 | , pushThread 27 | , pushWarp 28 | , pushBlock 29 | , setSize 30 | , (!) 31 | , (<:) 32 | , Array(..) 33 | , ArrayLength(..) 34 | , ASize(..) 35 | , namedGlobal 36 | , undefinedGlobal 37 | ) where 38 | 39 | import Obsidian.Exp 40 | import Obsidian.Program 41 | import Obsidian.Globs 42 | 43 | import Prelude hiding (replicate, (<*)) 44 | import Data.Word 45 | 46 | --------------------------------------------------------------------------- 47 | -- Aliases 48 | --------------------------------------------------------------------------- 49 | type SPull = Pull Word32 50 | type DPull = Pull EWord32 51 | 52 | type SPush t a = Push t Word32 a 53 | type DPush t a = Push t EWord32 a 54 | --------------------------------------------------------------------------- 55 | -- Create arrays 56 | --------------------------------------------------------------------------- 57 | -- | An undefined array. Use as placeholder when generating code 58 | undefinedGlobal :: ASize s => s -> Pull s a 59 | undefinedGlobal n = Pull n $ \_ -> undefined 60 | -- | A named global array. 61 | namedGlobal :: (ASize s, Scalar a) => Name -> s -> Pull s (Exp a) 62 | namedGlobal name n = Pull n $ \gix -> index name gix 63 | -- namedPull name n = Pull n $ \gix -> index name gix 64 | 65 | --------------------------------------------------------------------------- 66 | -- Class ArraySize 67 | --------------------------------------------------------------------------- 68 | -- | ASize provides conversion to Exp Word32 for array sizes 69 | class (Integral a, Num a) => ASize a where 70 | sizeConv :: a -> Exp Word32 71 | 72 | instance ASize Word32 where 73 | sizeConv = fromIntegral 74 | 75 | instance ASize (Exp Word32) where 76 | sizeConv = id 77 | 78 | --------------------------------------------------------------------------- 79 | -- Push and Pull arrays 80 | --------------------------------------------------------------------------- 81 | -- | Push array. Parameterised over Program type and size type. 82 | data Push t s a = 83 | Push s (PushFun t a) 84 | 85 | type PushFun t a = Writer a -> Program t () 86 | type Writer a = a -> EWord32 -> TProgram () 87 | 88 | -- | Pull array. 89 | data Pull s a = Pull {pullLen :: s, 90 | pullFun :: EWord32 -> a} 91 | 92 | -- | Create a push array. 93 | mkPush :: s 94 | -> ((a -> EWord32 -> TProgram ()) -> Program t ()) 95 | -> Push t s a 96 | mkPush n p = Push n p 97 | 98 | -- | Create a pull array. 99 | mkPull :: s -> (EWord32 -> a) -> Pull s a 100 | mkPull n p = Pull n p 101 | 102 | -- Fix this. 103 | -- * you cannot safely resize either push or pull arrays 104 | -- * you can shorten pull arrays safely. 105 | setSize :: ASize l => l -> Pull l a -> Pull l a 106 | setSize n (Pull _ ixf) = mkPull n ixf 107 | 108 | 109 | --------------------------------------------------------------------------- 110 | -- Array Class 111 | ---------------------------------------------------------------------------x 112 | class ArrayLength a where 113 | -- | Get the length of an array. 114 | len :: a s e -> s 115 | 116 | instance ArrayLength Pull where 117 | len arr = pullLen arr 118 | 119 | instance ArrayLength (Push t) where 120 | len (Push s _) = s 121 | 122 | class Array a where 123 | -- | Array of consecutive integers 124 | iota :: ASize s => s -> a s EWord32 125 | -- | Create an array by replicating an element. 126 | replicate :: ASize s => s -> e -> a s e 127 | 128 | -- | Map a function over an array. 129 | aMap :: (e -> e') -> a s e -> a s e' 130 | -- | Perform arbitrary permutations (dangerous). 131 | ixMap :: (EWord32 -> EWord32) 132 | -> a s e -> a s e 133 | -- requires Choice ! 134 | -- Simply because the pull array implementation of it does. 135 | -- | Append two arrays. 136 | append :: (ASize s, Choice e) => a s e -> a s e -> a s e 137 | 138 | -- technicalities 139 | -- | Statically sized array to dynamically sized array. 140 | toDyn :: a Word32 e -> a EW32 e 141 | -- | Dynamically sized array to statically sized array. 142 | fromDyn :: Word32 -> a EW32 e -> a Word32 e 143 | 144 | instance Array Pull where 145 | iota s = Pull s $ \ix -> ix 146 | replicate s e = Pull s $ \_ -> e 147 | 148 | aMap f (Pull n ixf) = Pull n (f . ixf) 149 | ixMap f (Pull n ixf) = Pull n (ixf . f) 150 | 151 | append a1 a2 = Pull (n1+n2) 152 | $ \ix -> ifThenElse (ix <* sizeConv n1) 153 | (a1 ! ix) 154 | (a2 ! (ix - sizeConv n1)) 155 | where 156 | n1 = len a1 157 | n2 = len a2 158 | 159 | -- technicalities 160 | toDyn (Pull n ixf) = Pull (fromIntegral n) ixf 161 | fromDyn n (Pull _ ixf) = Pull n ixf 162 | 163 | 164 | instance Array (Push Thread) where 165 | iota s = Push s $ \wf -> 166 | do 167 | forAll (sizeConv s) $ \ix -> wf ix ix 168 | replicate s e = Push s $ \wf -> 169 | do 170 | forAll (sizeConv s) $ \ix -> wf e ix 171 | aMap f (Push s p) = Push s $ \wf -> p (\e ix -> wf (f e) ix) 172 | ixMap f (Push s p) = Push s $ \wf -> p (\e ix -> wf e (f ix)) 173 | 174 | -- unfortunately a Choice constraint. 175 | append p1 p2 = 176 | Push (n1 + n2) $ \wf -> 177 | do p1 <: wf 178 | p2 <: \a i -> wf a (sizeConv n1 + i) 179 | where 180 | n1 = len p1 181 | n2 = len p2 182 | 183 | -- technicalities 184 | toDyn (Push n p) = Push (fromIntegral n) p 185 | fromDyn n (Push _ p) = Push n p 186 | 187 | instance Array (Push Warp) where 188 | iota s = Push s $ \wf -> 189 | do 190 | forAll (sizeConv s) $ \ix -> wf ix ix 191 | replicate s e = Push s $ \wf -> 192 | do 193 | forAll (sizeConv s) $ \ix -> wf e ix 194 | aMap f (Push s p) = Push s $ \wf -> p (\e ix -> wf (f e) ix) 195 | ixMap f (Push s p) = Push s $ \wf -> p (\e ix -> wf e (f ix)) 196 | 197 | -- unfortunately a Choice constraint. 198 | append p1 p2 = 199 | Push (n1 + n2) $ \wf -> 200 | do p1 <: wf 201 | p2 <: \a i -> wf a (sizeConv n1 + i) 202 | where 203 | n1 = len p1 204 | n2 = len p2 205 | 206 | -- technicalities 207 | toDyn (Push n p) = Push (fromIntegral n) p 208 | fromDyn n (Push _ p) = Push n p 209 | 210 | 211 | instance Array (Push Block) where 212 | iota s = Push s $ \wf -> 213 | do 214 | forAll (sizeConv s) $ \ix -> wf ix ix 215 | replicate s e = Push s $ \wf -> 216 | do 217 | forAll (sizeConv s) $ \ix -> wf e ix 218 | aMap f (Push s p) = Push s $ \wf -> p (\e ix -> wf (f e) ix) 219 | ixMap f (Push s p) = Push s $ \wf -> p (\e ix -> wf e (f ix)) 220 | 221 | -- unfortunately a Choice constraint. 222 | append p1 p2 = 223 | Push (n1 + n2) $ \wf -> 224 | do p1 <: wf 225 | p2 <: \a i -> wf a (sizeConv n1 + i) 226 | where 227 | n1 = len p1 228 | n2 = len p2 229 | 230 | -- technicalities 231 | toDyn (Push n p) = Push (fromIntegral n) p 232 | fromDyn n (Push _ p) = Push n p 233 | 234 | instance Array (Push Grid) where 235 | iota s = error "iota: not supported as Grid" 236 | replicate s e = error "replicate: not supported as Grid" 237 | aMap f (Push s p) = Push s $ \wf -> p (\e ix -> wf (f e) ix) 238 | ixMap f (Push s p) = Push s $ \wf -> p (\e ix -> wf e (f ix)) 239 | 240 | -- unfortunately a Choice constraint. 241 | append p1 p2 = 242 | mkPush (n1 + n2) $ \wf -> 243 | do p1 <: wf 244 | p2 <: \a i -> wf a (sizeConv n1 + i) 245 | where 246 | n1 = len p1 247 | n2 = len p2 248 | 249 | -- technicalities 250 | toDyn (Push n p) = Push (fromIntegral n) p 251 | fromDyn n (Push _ p) = Push n p 252 | 253 | 254 | 255 | 256 | --------------------------------------------------------------------------- 257 | -- Functor instance Pull/Push arrays 258 | --------------------------------------------------------------------------- 259 | instance Array (Push t) => Functor (Push t s) where 260 | fmap = aMap 261 | 262 | instance Functor (Pull s) where 263 | fmap = aMap 264 | 265 | 266 | --------------------------------------------------------------------------- 267 | -- Pushable 268 | --------------------------------------------------------------------------- 269 | -- | Convert a pull array to a push array. 270 | push :: (t *<=* Block) => ASize s => Pull s e -> Push t s e 271 | push (Pull n ixf) = 272 | mkPush n $ \wf -> 273 | forAll (sizeConv n) $ \i -> wf (ixf i) i 274 | 275 | -- Keep push, user may need to annotate with a type 276 | -- Add specific 277 | -- pushThread 278 | -- pushWarp 279 | -- pushBlock 280 | pushThread :: ASize s => Pull s e -> Push Thread s e 281 | pushThread = push 282 | 283 | pushWarp :: ASize s => Pull s e -> Push Warp s e 284 | pushWarp = push 285 | 286 | pushBlock :: ASize s => Pull s e -> Push Block s e 287 | pushBlock = push 288 | 289 | -------------------------------------------------------------------------- 290 | -- Indexing, array creation. 291 | --------------------------------------------------------------------------- 292 | pushApp :: Push t s a -> (a -> EWord32 -> TProgram ()) -> Program t () 293 | pushApp (Push _ p) a = p a 294 | 295 | infixl 9 <: 296 | (<:) :: Push t s a 297 | -> (a -> EWord32 -> Program Thread ()) 298 | -> Program t () 299 | (<:) = pushApp 300 | 301 | infixl 9 ! 302 | (!) :: Pull s e -> Exp Word32 -> e 303 | (!) arr = pullFun arr 304 | 305 | 306 | --------------------------------------------------------------------------- 307 | -- 308 | --------------------------------------------------------------------------- 309 | -------------------------------------------------------------------------------- /Obsidian/CodeGen/Memory.hs: -------------------------------------------------------------------------------- 1 | 2 | {- Joel Svensson 2012, 2013 3 | 4 | notes: 5 | Added a SeqFor case Jan-21-2013 6 | 7 | -} 8 | module Obsidian.CodeGen.Memory 9 | (MemMap, 10 | Memory, 11 | allocate, 12 | free, 13 | freeAll, 14 | size, 15 | sharedMem, 16 | Address, 17 | Bytes, 18 | mmIM, 19 | renameIM ) 20 | where 21 | 22 | import qualified Data.List as List 23 | import qualified Data.Set as Set 24 | import Data.Word 25 | 26 | import Obsidian.Types 27 | import Obsidian.Globs 28 | 29 | import Obsidian.Exp 30 | import Obsidian.CodeGen.Program 31 | import Obsidian.CodeGen.Liveness 32 | 33 | import qualified Data.Map as Map 34 | 35 | --------------------------------------------------------------------------- 36 | -- Planned improvements 37 | --------------------------------------------------------------------------- 38 | -- # DONE: Always start a shared memory array at a "bank-aligned" address 39 | -- + So that programmer can really effect access patterns. 40 | -- # DONE: Do not rename with pointers 41 | -- instead output a list of (type ,name,address) quadruples 42 | -- that can be used to create an alias (at top scope of program) 43 | -- + Prettier code -> easier debugging 44 | -- + Potential efficiency issues, from less casting etc 45 | -- # (LONG-TERM) Clever memory allocation 46 | -- + The future is known! So encode the optimal memory 47 | -- allocation schema 48 | 49 | -- Assumptions: 50 | -- # an array declares as being __shared__ in the CUDA code 51 | -- will be well aligned with the banks 52 | 53 | --------------------------------------------------------------------------- 54 | -- Memory layout 55 | --------------------------------------------------------------------------- 56 | 57 | type MemMap = Map.Map Name (Word32,Type) 58 | 59 | type Address = Word32 60 | type Bytes = Word32 61 | 62 | data Memory = Memory {freeList :: [(Address,Bytes)] , 63 | allocated :: [(Address,Bytes)] , 64 | size :: Bytes} -- how much used 65 | deriving Show 66 | 67 | 68 | -- 48 kilobytes of smem 69 | sharedMem = Memory [(0,49152)] [] 0 70 | 71 | 72 | updateMax :: Memory -> Memory 73 | updateMax mem = let m = maximum [a+b|(a,b) <- allocated mem] 74 | m' = max m (size mem) 75 | in mem {size = m'} 76 | 77 | -- This one needs to check that shared memory is not full. 78 | allocate :: Memory -> Bytes -> (Memory,Address) 79 | allocate m b = 80 | let adress = filter (\(x,y) -> y >= b) (freeList m) -- get a list of candidates 81 | -- getTop mem = let (a,b) = case null (allocated m) of 82 | -- False -> maximum $ List.sort (allocated m) 83 | -- True -> (0,0) 84 | -- in a+b 85 | in case adress of 86 | -- use the first candidate (try better approaches 87 | -- such as searching for best match, so that not to waste memory) 88 | ((a,bytes):_) -> let fl = filter (\(addr,_) -> a /= addr) (freeList m) 89 | fl' = if b < bytes 90 | then (a+b,bytes-b):fl 91 | else fl 92 | in (updateMax (m {freeList = fl', 93 | allocated = (a,b):allocated m}) ,a) 94 | [] -> error "out of shared memory" 95 | 96 | 97 | free :: Memory -> Address -> Memory 98 | free m a = mem 99 | where 100 | bytes = lookup a (allocated m) 101 | al = filter (\(addr,_) -> a /= addr) (allocated m) 102 | 103 | -- TODO: Investigate this much closer. 104 | -- Is it a bug or is freeing a non allocated memory area 105 | -- OK? 106 | 107 | mem = case bytes of 108 | Nothing -> m 109 | {- 110 | error $ "error: Address " ++ show a ++ 111 | " not found in allocated list" ++ 112 | "\n" ++ show m 113 | -} 114 | Just b -> m {freeList = compress ((a,b):(freeList m)), 115 | allocated = al} 116 | 117 | freeAll :: Memory -> [Address] -> Memory 118 | freeAll m [] = m 119 | freeAll m (a:as) = freeAll (free m a) as 120 | 121 | compress = merge . List.sort 122 | 123 | merge [] = [] 124 | merge [x] = [x] 125 | merge ((x,b):(y,b2):xs) = if (x+b == y) 126 | then merge ((x,b+b2):xs) 127 | else (x,b):merge((y,b2):xs) 128 | 129 | --------------------------------------------------------------------------- 130 | -- Memory map the new IM 131 | --------------------------------------------------------------------------- 132 | mmIM :: IML -> Memory -> MemMap -> (Memory, MemMap) 133 | mmIM im memory memmap = r im (memory,memmap) 134 | where 135 | r [] m = m 136 | r (x:xs) (m,mm) = 137 | let 138 | (m',mm') = process x m mm 139 | 140 | freeable = getFreeableSet x xs 141 | freeableAddrs = mapM (flip Map.lookup mm') (filter dontMap (Set.toList freeable)) 142 | dontMap name = not ((List.isPrefixOf "input" name) || 143 | (List.isPrefixOf "output" name)) 144 | mNew = 145 | case freeableAddrs of 146 | (Just as) -> freeAll m' (map fst as) 147 | Nothing -> m' 148 | in r xs (mNew,mm') 149 | mmIM' :: IML -> Memory -> MemMap -> (Memory, MemMap) 150 | mmIM' im memory memmap = r im (memory,memmap) 151 | where 152 | r [] m = m 153 | r (x:xs) (m,mm) = 154 | let 155 | (m',mm') = process x m mm 156 | 157 | freeable = getFreeableSet x xs 158 | freeableAddrs = mapM (flip Map.lookup mm') (filter dontMap (Set.toList freeable)) 159 | dontMap name = not ((List.isPrefixOf "input" name) || 160 | (List.isPrefixOf "output" name)) 161 | mNew = 162 | case freeableAddrs of 163 | (Just as) -> m' -- freeAll m' (map fst as) 164 | Nothing -> m' 165 | in r xs (mNew,mm') 166 | 167 | 168 | process (SAllocate name size t,_) m mm = (m',mm') 169 | where (m',addr) = allocate m size 170 | mm' = case Map.lookup name mm of 171 | Nothing -> Map.insert name (addr,t) mm 172 | (Just (a, t)) -> error $ "mmIm: " ++ name ++ " is already mapped to " ++ show a 173 | 174 | -- A tricky case. 175 | -- process (SForAllBlocks n im,_) m mm = mmIM im m mm 176 | -- Another tricky case. 177 | process (SSeqFor _ n im,_) m mm = mmIM im m mm 178 | process (SSeqWhile b im,_) m mm = mmIM im m mm 179 | -- Yet another tricky case. 180 | process (SForAll _ n im,_) m mm = mmIM im m mm 181 | process (SDistrPar Warp n im,_) m mm = mmIM' im m mm -- mmIM im m mm 182 | process (SDistrPar Block n im,_) m mm = mmIM im m mm 183 | -- The worst of them all. 184 | -- process (SForAllThreads n im,_) m mm = mmIM im m mm 185 | -- process (SNWarps _ im,_) m mm = mmIM im m mm 186 | -- process (SWarpForAll _ im,_) m mm = mmIM im m mm 187 | 188 | -- process im m mm = error $ printStm im -- "process: WHat!" 189 | process (_,_) m mm = (m,mm) 190 | 191 | -- Friday (2013 Mars 29, discovered bug) 192 | getFreeableSet :: (Statement Liveness,Liveness) -> IML -> Liveness 193 | getFreeableSet (_,l) [] = Set.empty -- not l ! 194 | getFreeableSet (_,l) ((_,l1):_) = l Set.\\ l1 195 | 196 | --------------------------------------------------------------------------- 197 | -- Rename arrays in IM 198 | --------------------------------------------------------------------------- 199 | 200 | renameIM :: MemMap -> IML -> IMList () 201 | renameIM mm im = zip (map (go . fst) im) (repeat ()) 202 | where 203 | go (SAssign name ix e) = SAssign (renameIVar mm name) 204 | (map (renameIExp mm) ix) 205 | (renameIExp mm e) 206 | go (SAtomicOp name ix atop) = SAtomicOp (renameIVar mm name) 207 | (renameIExp mm ix) 208 | (renameAtOp mm atop) 209 | go (SCond be im) = SCond (renameIExp mm be) 210 | (renameIM mm im) 211 | go (SSeqFor str n im) = SSeqFor str (renameIExp mm n) 212 | (renameIM mm im) 213 | go SBreak = SBreak 214 | go (SSeqWhile n im) = SSeqWhile (renameIExp mm n) 215 | (renameIM mm im) 216 | go (SForAll lvl n im) = SForAll lvl (renameIExp mm n) 217 | (renameIM mm im) 218 | go (SDistrPar lvl n im) = SDistrPar lvl (renameIExp mm n) 219 | (renameIM mm im) 220 | 221 | -- go (SForAllBlocks n im) = SForAllBlocks (renameIExp mm n) 222 | -- (renameIM mm im) 223 | -- go (SNWarps n im) = SNWarps (renameIExp mm n) 224 | -- (renameIM mm im) 225 | -- go (SWarpForAll n im) = SWarpForAll (renameIExp mm n) 226 | -- (renameIM mm im) 227 | -- Strip this out earlier. 228 | go (SAllocate name n t) = SAllocate name n t 229 | go (SDeclare name t) = SDeclare name t 230 | go SSynchronize = SSynchronize 231 | 232 | --------------------------------------------------------------------------- 233 | -- Memory map the arrays in an CExpr 234 | --------------------------------------------------------------------------- 235 | renameIExp mm e@(IVar nom t) = renameIVar mm e 236 | renameIExp mm (IIndex (e1,es) t) = IIndex (renameIExp mm e1, map (renameIExp mm) es) t 237 | renameIExp mm (IBinOp op e1 e2 t) = IBinOp op (renameIExp mm e1) (renameIExp mm e2) t 238 | renameIExp mm (IUnOp op e t) = IUnOp op (renameIExp mm e) t 239 | renameIExp mm (IFunCall nom exprs t) = IFunCall nom (map (renameIExp mm) exprs) t 240 | renameIExp mm (ICast e t) = ICast (renameIExp mm e) t 241 | renameIExp mm (ICond e1 e2 e3 t) = ICond (renameIExp mm e1) 242 | (renameIExp mm e2) 243 | (renameIExp mm e3) 244 | t 245 | renameIExp mm a = a 246 | 247 | 248 | renameIVar mm (IVar name t) = 249 | case Map.lookup name mm of 250 | Just (addr,t) -> 251 | let core = sbaseIExp addr 252 | cast c = ICast c t 253 | in cast core 254 | 255 | Nothing -> IVar name t 256 | where 257 | sbaseIExp 0 = IVar "sbase" (Pointer Word8) 258 | sbaseIExp addr = IBinOp IAdd (IVar "sbase" (Pointer Word8)) 259 | (IWord32 addr) 260 | (Pointer Word8) 261 | 262 | renameAtOp mm AtInc = AtInc 263 | renameAtOp mm (AtAdd e) = AtAdd (renameIExp mm e) 264 | renameAtOp mm (AtSub e) = AtSub (renameIExp mm e) 265 | renameAtOp mm (AtExch e) = AtExch (renameIExp mm e) 266 | 267 | 268 | -------------------------------------------------------------------------------- /Obsidian/Memory.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables, 2 | GADTs #-} 3 | 4 | {- Joel Svensson 2013 5 | 6 | This Module became quite messy. 7 | TODO: CLEAN IT UP! 8 | 9 | notes: 2013-05-02: Cleaned out inspect. 10 | 11 | -} 12 | 13 | module Obsidian.Memory (Storable(..)) where 14 | 15 | 16 | import Obsidian.Program 17 | import Obsidian.Exp 18 | import Obsidian.Types 19 | import Obsidian.Array -- Importing this feels a bit strange. 20 | import Obsidian.Names 21 | 22 | import Data.Word 23 | 24 | -- class MemoryOps a => Storable a 25 | 26 | --------------------------------------------------------------------------- 27 | -- Local Memory 28 | --------------------------------------------------------------------------- 29 | class Storable a where 30 | -- | Obtain new names for variables / arrays 31 | names :: String -> Program t (Names a) 32 | 33 | -- Array operations 34 | assignArray :: Names a -> a -> Exp Word32 -> Program Thread () 35 | allocateArray :: Names a -> Word32 -> Program t () 36 | pullFrom :: ASize s => Names a -> s -> Pull s a 37 | 38 | 39 | -- Scalar operations 40 | assignScalar :: Names a -> a -> Program Thread () 41 | allocateScalar :: Names a -> Program t () 42 | allocateSharedScalar :: Names a -> Program t () 43 | readFrom :: Names a -> a 44 | 45 | -- Warp level operations 46 | warpAssignArray :: Names a 47 | -> EWord32 48 | -> Word32 49 | -> a 50 | -> EWord32 51 | -> Program Thread () 52 | warpPullFrom :: Names a -> EWord32 -> Word32 -> Pull Word32 a 53 | 54 | threadAssignArray :: Names a 55 | -> EWord32 56 | -> Word32 57 | -> a 58 | -> EWord32 59 | -> Program Thread () 60 | threadPullFrom :: Names a -> EWord32 -> Word32 -> Pull Word32 a 61 | 62 | -- Extra 63 | allocateVolatileArray :: Names a -> Word32 -> Program t () 64 | 65 | 66 | 67 | 68 | --------------------------------------------------------------------------- 69 | -- Instances 70 | --------------------------------------------------------------------------- 71 | instance Scalar a => Storable (Exp a) where 72 | 73 | -- Names 74 | names pre = do {i <- uniqueNamed pre; return (Single i)} 75 | 76 | --Array ops 77 | allocateArray (Single name) n = 78 | Allocate name (n * fromIntegral (sizeOf (undefined :: Exp a))) 79 | (Pointer (typeOf (undefined :: Exp a))) 80 | assignArray (Single name) a ix = Assign name [ix] a 81 | pullFrom (Single name) n = mkPull n (\i -> index name i) 82 | 83 | -- Scalar ops 84 | allocateScalar (Single name) = 85 | Declare name (typeOf (undefined :: Exp a)) 86 | allocateSharedScalar (Single name) = 87 | Declare name (Shared $ typeOf (undefined :: Exp a)) 88 | 89 | assignScalar (Single name) a = Assign name [] a 90 | readFrom (Single name) = variable name 91 | 92 | -- Warp ops 93 | warpAssignArray (Single name) warpId step a ix = 94 | Assign name [warpId * fromIntegral step + ix] a 95 | 96 | warpPullFrom (Single name) warpId n 97 | = mkPull n (\i -> index name (warpId * fromIntegral n + i)) 98 | 99 | -- Thread ops 100 | threadAssignArray (Single name) threadId step a ix = 101 | Assign name [threadId * fromIntegral step + ix] a 102 | 103 | threadPullFrom (Single name) threadId n 104 | = mkPull n (\i -> index name (threadId * fromIntegral n + i)) 105 | 106 | -- Extra 107 | allocateVolatileArray (Single name) n = 108 | Allocate name (n * fromIntegral (sizeOf (undefined :: Exp a))) 109 | (Volatile (Pointer (typeOf (undefined :: Exp a)))) 110 | 111 | 112 | 113 | instance (Storable a, Storable b) => Storable (a, b) where 114 | names pre = 115 | do 116 | (a' :: Names a) <- names pre 117 | (b' :: Names b) <- names pre 118 | return $ Tuple a' b' 119 | allocateArray (Tuple ns1 ns2) n = 120 | allocateArray ns1 n >> 121 | allocateArray ns2 n 122 | 123 | allocateVolatileArray (Tuple ns1 ns2) n = 124 | allocateVolatileArray ns1 n >> 125 | allocateVolatileArray ns2 n 126 | 127 | 128 | allocateScalar (Tuple ns1 ns2) = 129 | allocateScalar ns1 >> 130 | allocateScalar ns2 131 | 132 | allocateSharedScalar (Tuple ns1 ns2) = 133 | allocateSharedScalar ns1 >> 134 | allocateSharedScalar ns2 135 | 136 | 137 | assignArray (Tuple ns1 ns2) (a,b) ix = 138 | assignArray ns1 a ix >> 139 | assignArray ns2 b ix 140 | 141 | warpAssignArray (Tuple ns1 ns2) warpID step (a,b) ix = 142 | warpAssignArray ns1 warpID step a ix >> 143 | warpAssignArray ns2 warpID step b ix 144 | 145 | threadAssignArray (Tuple ns1 ns2) threadId step (a,b) ix = 146 | threadAssignArray ns1 threadId step a ix >> 147 | threadAssignArray ns2 threadId step b ix 148 | 149 | assignScalar (Tuple ns1 ns2) (a,b) = 150 | assignScalar ns1 a >> 151 | assignScalar ns2 b 152 | 153 | pullFrom (Tuple ns1 ns2) n = 154 | let p1 = pullFrom ns1 n 155 | p2 = pullFrom ns2 n 156 | in mkPull n (\ix -> (p1 ! ix, p2 ! ix)) 157 | 158 | warpPullFrom (Tuple ns1 ns2) warpID n 159 | = let p1 = warpPullFrom ns1 warpID n 160 | p2 = warpPullFrom ns2 warpID n 161 | in mkPull n (\ix -> (p1 ! ix, p2 ! ix)) 162 | 163 | threadPullFrom (Tuple ns1 ns2) threadId n 164 | = let p1 = threadPullFrom ns1 threadId n 165 | p2 = threadPullFrom ns2 threadId n 166 | in mkPull n (\ix -> (p1 ! ix, p2 ! ix)) 167 | 168 | readFrom (Tuple ns1 ns2) = 169 | let p1 = readFrom ns1 170 | p2 = readFrom ns2 171 | in (p1,p2) 172 | 173 | 174 | 175 | 176 | instance (Storable a, Storable b, Storable c) => Storable (a, b, c) where 177 | names pre = 178 | do 179 | (a :: Names a) <- names pre 180 | (b :: Names b) <- names pre 181 | (c :: Names c) <- names pre 182 | return $ Triple a b c 183 | 184 | allocateArray (Triple ns1 ns2 ns3) n = 185 | allocateArray ns1 n >> 186 | allocateArray ns2 n >> 187 | allocateArray ns3 n 188 | 189 | allocateVolatileArray (Triple ns1 ns2 ns3) n = 190 | allocateVolatileArray ns1 n >> 191 | allocateVolatileArray ns2 n >> 192 | allocateVolatileArray ns3 n 193 | 194 | allocateScalar (Triple ns1 ns2 ns3) = 195 | allocateScalar ns1 >> 196 | allocateScalar ns2 >> 197 | allocateScalar ns3 198 | 199 | allocateSharedScalar (Triple ns1 ns2 ns3) = 200 | allocateSharedScalar ns1 >> 201 | allocateSharedScalar ns2 >> 202 | allocateSharedScalar ns3 203 | 204 | 205 | assignArray (Triple ns1 ns2 ns3) (a,b,c) ix = 206 | assignArray ns1 a ix >> 207 | assignArray ns2 b ix >> 208 | assignArray ns3 c ix 209 | 210 | warpAssignArray (Triple ns1 ns2 ns3) warpID step (a,b,c) ix = 211 | warpAssignArray ns1 warpID step a ix >> 212 | warpAssignArray ns2 warpID step b ix >> 213 | warpAssignArray ns3 warpID step c ix 214 | 215 | threadAssignArray (Triple ns1 ns2 ns3) threadID step (a,b,c) ix = 216 | threadAssignArray ns1 threadID step a ix >> 217 | threadAssignArray ns2 threadID step b ix >> 218 | threadAssignArray ns3 threadID step c ix 219 | 220 | 221 | assignScalar (Triple ns1 ns2 ns3) (a,b,c) = 222 | assignScalar ns1 a >> 223 | assignScalar ns2 b >> 224 | assignScalar ns3 c 225 | 226 | pullFrom (Triple ns1 ns2 ns3) n = 227 | let p1 = pullFrom ns1 n 228 | p2 = pullFrom ns2 n 229 | p3 = pullFrom ns3 n 230 | in mkPull n (\ix -> (p1 ! ix, p2 ! ix,p3 ! ix)) 231 | 232 | warpPullFrom (Triple ns1 ns2 ns3) warpID n 233 | = let p1 = warpPullFrom ns1 warpID n 234 | p2 = warpPullFrom ns2 warpID n 235 | p3 = warpPullFrom ns3 warpID n 236 | in mkPull n (\ix -> (p1 ! ix, p2 ! ix, p3 ! ix)) 237 | 238 | threadPullFrom (Triple ns1 ns2 ns3) threadId n 239 | = let p1 = threadPullFrom ns1 threadId n 240 | p2 = threadPullFrom ns2 threadId n 241 | p3 = threadPullFrom ns3 threadId n 242 | in mkPull n (\ix -> (p1 ! ix, p2 ! ix,p3 ! ix)) 243 | 244 | 245 | readFrom (Triple ns1 ns2 ns3) = 246 | let p1 = readFrom ns1 247 | p2 = readFrom ns2 248 | p3 = readFrom ns3 249 | in (p1,p2,p3) 250 | 251 | instance (Storable a, Storable b, Storable c, Storable d) => Storable (a, b, c, d) where 252 | names pre = 253 | do 254 | (a :: Names a) <- names pre 255 | (b :: Names b) <- names pre 256 | (c :: Names c) <- names pre 257 | (d :: Names d) <- names pre 258 | return $ Quadruple a b c d 259 | 260 | allocateArray (Quadruple ns1 ns2 ns3 ns4) n = 261 | allocateArray ns1 n >> 262 | allocateArray ns2 n >> 263 | allocateArray ns3 n >> 264 | allocateArray ns4 n 265 | 266 | allocateVolatileArray (Quadruple ns1 ns2 ns3 ns4) n = 267 | allocateVolatileArray ns1 n >> 268 | allocateVolatileArray ns2 n >> 269 | allocateVolatileArray ns3 n >> 270 | allocateVolatileArray ns4 n 271 | 272 | allocateScalar (Quadruple ns1 ns2 ns3 ns4) = 273 | allocateScalar ns1 >> 274 | allocateScalar ns2 >> 275 | allocateScalar ns3 >> 276 | allocateScalar ns4 277 | 278 | allocateSharedScalar (Quadruple ns1 ns2 ns3 ns4) = 279 | allocateSharedScalar ns1 >> 280 | allocateSharedScalar ns2 >> 281 | allocateSharedScalar ns3 >> 282 | allocateSharedScalar ns4 283 | 284 | 285 | assignArray (Quadruple ns1 ns2 ns3 ns4) (a,b,c,d) ix = 286 | assignArray ns1 a ix >> 287 | assignArray ns2 b ix >> 288 | assignArray ns3 c ix >> 289 | assignArray ns4 d ix 290 | 291 | warpAssignArray (Quadruple ns1 ns2 ns3 ns4) warpID step (a,b,c,d) ix = 292 | warpAssignArray ns1 warpID step a ix >> 293 | warpAssignArray ns2 warpID step b ix >> 294 | warpAssignArray ns3 warpID step c ix >> 295 | warpAssignArray ns4 warpID step d ix 296 | 297 | threadAssignArray (Quadruple ns1 ns2 ns3 ns4) threadID step (a,b,c,d) ix = 298 | threadAssignArray ns1 threadID step a ix >> 299 | threadAssignArray ns2 threadID step b ix >> 300 | threadAssignArray ns3 threadID step c ix >> 301 | threadAssignArray ns4 threadID step d ix 302 | 303 | 304 | assignScalar (Quadruple ns1 ns2 ns3 ns4) (a,b,c,d) = 305 | assignScalar ns1 a >> 306 | assignScalar ns2 b >> 307 | assignScalar ns3 c >> 308 | assignScalar ns4 d 309 | 310 | pullFrom (Quadruple ns1 ns2 ns3 ns4) n = 311 | let p1 = pullFrom ns1 n 312 | p2 = pullFrom ns2 n 313 | p3 = pullFrom ns3 n 314 | p4 = pullFrom ns4 n 315 | in mkPull n (\ix -> (p1 ! ix, p2 ! ix, p3 ! ix, p4 ! ix)) 316 | 317 | warpPullFrom (Quadruple ns1 ns2 ns3 ns4) warpID n 318 | = let p1 = warpPullFrom ns1 warpID n 319 | p2 = warpPullFrom ns2 warpID n 320 | p3 = warpPullFrom ns3 warpID n 321 | p4 = warpPullFrom ns4 warpID n 322 | in mkPull n (\ix -> (p1 ! ix, p2 ! ix, p3 ! ix, p4 ! ix)) 323 | 324 | threadPullFrom (Quadruple ns1 ns2 ns3 ns4) threadID n 325 | = let p1 = threadPullFrom ns1 threadID n 326 | p2 = threadPullFrom ns2 threadID n 327 | p3 = threadPullFrom ns3 threadID n 328 | p4 = threadPullFrom ns4 threadID n 329 | in mkPull n (\ix -> (p1 ! ix, p2 ! ix, p3 ! ix, p4 ! ix)) 330 | 331 | readFrom (Quadruple ns1 ns2 ns3 ns4) = 332 | let p1 = readFrom ns1 333 | p2 = readFrom ns2 334 | p3 = readFrom ns3 335 | p4 = readFrom ns4 336 | in (p1,p2,p3,p4) 337 | -------------------------------------------------------------------------------- /Obsidian/Program.hs: -------------------------------------------------------------------------------- 1 | {- Joel Svensson 2012,2013,2014 2 | 3 | Notes: 4 | 2014 : starting a big overhauling 5 | 2013-04-02: Added a Break statement to the language. 6 | Use it to break out of sequential loops. 7 | 2013-01-08: removed number-of-blocks field from ForAllBlocks 8 | 9 | -} 10 | 11 | {-# LANGUAGE GADTs #-} 12 | {-# LANGUAGE EmptyDataDecls #-} 13 | {-# LANGUAGE FlexibleInstances #-} 14 | {-# LANGUAGE MultiParamTypeClasses #-} 15 | {-# LANGUAGE ScopedTypeVariables #-} 16 | {-# LANGUAGE TypeOperators #-} 17 | {-# LANGUAGE FlexibleContexts #-} 18 | {-# LANGUAGE UndecidableInstances #-} 19 | {-# LANGUAGE ExplicitNamespaces #-} 20 | 21 | {-# LANGUAGE TypeFamilies #-} 22 | {-# LANGUAGE DataKinds #-} 23 | {-# LANGUAGE ConstraintKinds #-} 24 | ------------------------------------------ 25 | {- LANGUAGE FunctionalDependencies -} 26 | 27 | module Obsidian.Program ( 28 | -- Hierarchy 29 | Thread, Block, Grid, Warp, Step, 30 | -- Step, Zero, 31 | -- Program type 32 | -- CoreProgram(..), 33 | Program(..), -- all exported.. for now 34 | TProgram, BProgram, GProgram, WProgram, 35 | 36 | -- Class 37 | type (*<=*), 38 | 39 | -- helpers 40 | printPrg, 41 | runPrg, 42 | uniqueNamed, uniqueNamed_, 43 | 44 | allocate, declare, 45 | atomicOp, 46 | -- Programming interface 47 | seqFor, forAll, seqWhile, sync, distrPar, forAll2, 48 | singleThread 49 | 50 | 51 | ) where 52 | 53 | import Data.Word 54 | 55 | import Obsidian.Exp 56 | import Obsidian.Types 57 | import Obsidian.Globs 58 | import Obsidian.Atomic 59 | 60 | import Control.Applicative 61 | 62 | 63 | --------------------------------------------------------------------------- 64 | -- Thread/Block/Grid 65 | --------------------------------------------------------------------------- 66 | 67 | data Thread 68 | data Step t 69 | 70 | type Warp = Step Thread 71 | type Block = Step Warp 72 | type Grid = Step Block 73 | 74 | 75 | -- | Type level less-than-or-equal test. 76 | type family LessThanOrEqual a b where 77 | LessThanOrEqual Thread Thread = True 78 | LessThanOrEqual Thread (Step m) = True 79 | LessThanOrEqual (Step n) (Step m) = LessThanOrEqual n m 80 | LessThanOrEqual x y = False 81 | 82 | 83 | -- | This constraint is a more succinct way of requiring that @a@ be less than or equal to @b@. 84 | type a *<=* b = (LessThanOrEqual a b ~ True) 85 | 86 | 87 | --------------------------------------------------------------------------- 88 | 89 | type Identifier = Int 90 | 91 | --------------------------------------------------------------------------- 92 | -- Program datatype 93 | -------------------------------------------------------------------------- 94 | data Program t a where 95 | 96 | Identifier :: Program t Identifier 97 | 98 | Assign :: Scalar a 99 | => Name 100 | -> [Exp Word32] 101 | -> (Exp a) 102 | -> Program Thread () 103 | 104 | -- 4 March 2014, Changed so that AtOp does not return a result. 105 | -- Change this back later if an application requires. 106 | AtomicOp :: Scalar a 107 | => Name -- Array name 108 | -> Exp Word32 -- Index to operate on 109 | -> Atomic a -- Atomic operation to perform 110 | -> Program Thread () 111 | 112 | Cond :: Exp Bool 113 | -> Program Thread () 114 | -> Program Thread () 115 | 116 | SeqWhile :: Exp Bool -> 117 | Program Thread () -> 118 | Program Thread () 119 | 120 | Break :: Program Thread () 121 | 122 | -- use threads along one level 123 | -- Thread, Warp, Block. 124 | -- Make sure Code generation works when t ~ Thread 125 | ForAll :: (t *<=* Block) => EWord32 126 | -> (EWord32 -> Program Thread ()) 127 | -> Program t () 128 | 129 | -- Distribute over Warps yielding a Block 130 | -- Distribute over Blocks yielding a Grid 131 | DistrPar :: EWord32 132 | -> (EWord32 -> Program t ()) 133 | -> Program (Step t) () 134 | 135 | -- BUG: I Need to recognize sequential distribution of 136 | -- work too in order to set up storage correctly for 137 | -- arrays allocated in within sequentially distributed work. 138 | DistrSeq :: EWord32 139 | -> (EWord32 -> Program t ()) 140 | -> Program t () 141 | 142 | 143 | SeqFor :: EWord32 -> (EWord32 -> Program t ()) 144 | -> Program t () 145 | 146 | -- Allocate shared memory in each MP 147 | -- Can be done from any program level. 148 | -- Since the allocation happens block-wise though 149 | -- it is important to figure out how many instances of 150 | -- that t level program that needs memory! (messy) 151 | Allocate :: Name -> Word32 -> Type -> Program t () 152 | 153 | -- Automatic Variables 154 | Declare :: Name -> Type -> Program t () 155 | 156 | Sync :: (t *<=* Block) => Program t () 157 | 158 | -- Monad 159 | Return :: a -> Program t a 160 | Bind :: Program t a -> (a -> Program t b) -> Program t b 161 | 162 | --------------------------------------------------------------------------- 163 | -- Aliases 164 | --------------------------------------------------------------------------- 165 | type TProgram = Program Thread 166 | type WProgram = Program Warp 167 | type BProgram = Program Block 168 | type GProgram = Program Grid 169 | 170 | --------------------------------------------------------------------------- 171 | -- Helpers 172 | --------------------------------------------------------------------------- 173 | uniqueSM = do 174 | id <- Identifier 175 | return $ "arr" ++ show id 176 | 177 | uniqueNamed pre = do 178 | id <- Identifier 179 | return $ pre ++ show id 180 | 181 | uniqueNamed_ pre = do id <- Identifier 182 | return $ pre ++ show id 183 | --------------------------------------------------------------------------- 184 | -- Memory 185 | --------------------------------------------------------------------------- 186 | assign :: Scalar a => Name -> [Exp Word32] -> Exp a -> Program Thread () 187 | assign nom ix e = Assign nom ix e 188 | 189 | allocate :: Name -> Word32 -> Type -> Program t () 190 | allocate nom l t = Allocate nom l t 191 | 192 | declare :: Name -> Type -> Program t () 193 | declare nom t = Declare nom t 194 | 195 | --------------------------------------------------------------------------- 196 | -- atomicOp 197 | --------------------------------------------------------------------------- 198 | atomicOp :: Scalar a 199 | => Name -- Array name 200 | -> Exp Word32 -- Index to operate on 201 | -> Atomic a -- Atomic operation to perform 202 | -> Program Thread () 203 | atomicOp nom ix atop = AtomicOp nom ix atop 204 | 205 | --------------------------------------------------------------------------- 206 | -- forAll 207 | --------------------------------------------------------------------------- 208 | forAll :: (t *<=* Block) => EWord32 209 | -> (EWord32 -> Program Thread ()) 210 | -> Program t () 211 | forAll n f = ForAll n f 212 | 213 | forAll2 :: (t *<=* Block) => EWord32 214 | -> EWord32 215 | -> (EWord32 -> EWord32 -> Program Thread ()) 216 | -> Program (Step t) () 217 | forAll2 b n f = 218 | DistrPar b $ \bs -> 219 | ForAll n $ \ix -> f bs ix 220 | 221 | distrPar :: EWord32 222 | -> (EWord32 -> Program t ()) 223 | -> Program (Step t) () 224 | distrPar b f = DistrPar b $ \bs -> f bs 225 | 226 | --------------------------------------------------------------------------- 227 | -- Let a single thread perform of a block/Warp perform a given 228 | -- Thread program 229 | --------------------------------------------------------------------------- 230 | singleThread :: (t *<=* Block) => Program Thread () -> Program t () 231 | singleThread p = 232 | forAll 1 (\_ -> p) 233 | 234 | -- seqFor 235 | --------------------------------------------------------------------------- 236 | seqFor :: EWord32 -> (EWord32 -> Program t ()) -> Program t () 237 | seqFor (Literal 1) f = f 0 238 | seqFor n f = SeqFor n f 239 | 240 | --------------------------------------------------------------------------- 241 | -- seqWhile 242 | --------------------------------------------------------------------------- 243 | seqWhile :: Exp Bool -> Program Thread () -> Program Thread () 244 | seqWhile b prg = SeqWhile b prg 245 | 246 | --------------------------------------------------------------------------- 247 | -- Monad 248 | -------------------------------------------------------------------------- 249 | instance Monad (Program t) where 250 | return = Return 251 | (>>=) = Bind 252 | 253 | --------------------------------------------------------------------------- 254 | -- Functor 255 | --------------------------------------------------------------------------- 256 | instance Functor (Program t) where 257 | fmap g fa = do {a <- fa; return $ g a} 258 | 259 | --------------------------------------------------------------------------- 260 | -- Applicative 261 | --------------------------------------------------------------------------- 262 | instance Applicative (Program t) where 263 | pure = return 264 | ff <*> fa = 265 | do 266 | f <- ff 267 | fmap f fa 268 | 269 | --------------------------------------------------------------------------- 270 | -- sync function 271 | --------------------------------------------------------------------------- 272 | sync :: (t *<=* Block) => Program t () 273 | sync = Sync 274 | 275 | --------------------------------------------------------------------------- 276 | -- runPrg (RETHINK!) (Works for Block programs, but all?) 277 | --------------------------------------------------------------------------- 278 | runPrg :: Int -> Program t a -> (a,Int) 279 | runPrg i Identifier = (i,i+1) 280 | 281 | -- Maybe these two are the most interesting cases! 282 | -- Return may for example give an array. 283 | runPrg i (Return a) = (a,i) 284 | runPrg i (Bind m f) = 285 | let (a,i') = runPrg i m 286 | in runPrg i' (f a) 287 | 288 | -- All other constructors have () result 289 | 290 | runPrg i (Sync) = ((),i) 291 | runPrg i (ForAll n ixf) = 292 | let (p,i') = runPrg i (ixf (variable "tid")) 293 | in (p,i') 294 | runPrg i (DistrPar n f) = 295 | let (p,i') = runPrg i (f (variable "DUMMY")) 296 | in (p,i') 297 | -- What can this boolean depend upon ? its quite general! 298 | -- p here is a Program Thread () 299 | runPrg i (Cond b p) = ((),i) 300 | runPrg i (Declare _ _) = ((),i) 301 | runPrg i (Allocate _ _ _ ) = ((),i) 302 | runPrg i (Assign _ _ a) = ((),i) 303 | runPrg i (AtomicOp _ _ _) = ((),i) -- variable ("new"++show i),i+1) 304 | 305 | {- What do I want from runPrg ? 306 | 307 | # I want to it to "work" for all block programs (no exceptions) 308 | # I want a BProgram (Pull a) to return a Pull array of "correct length) 309 | -} 310 | 311 | 312 | --------------------------------------------------------------------------- 313 | -- printPrg (REIMPLEMENT) 314 | --------------------------------------------------------------------------- 315 | printPrg :: Program t a -> String 316 | printPrg prg = (\(_,x,_) -> x) $ printPrg' 0 prg 317 | 318 | printPrg' :: Int -> Program t a -> (a,String,Int) 319 | printPrg' i Identifier = (i,"getId;\n",i+1) 320 | printPrg' i (Assign n ix e) = 321 | ((),n ++ "[" ++ show ix ++ "] = " ++ show e ++ ";\n", i) 322 | printPrg' i (AtomicOp n ix e) = 323 | let newname = "r" ++ show i 324 | --in (variable newname, 325 | -- newname ++ " = " ++ printAtomic e ++ 326 | -- "( " ++ n ++ "[" ++ show ix ++ "])\n",i+1) 327 | in ((), printAtomic e ++ 328 | "( " ++ n ++ "[" ++ show ix ++ "])\n",i+1) 329 | printPrg' i (Allocate id n t) = 330 | let newname = id -- "arr" ++ show id 331 | in ((),newname ++ " = malloc(" ++ show n ++ ");\n",i+1) 332 | printPrg' i (Declare id t) = 333 | let newname = id -- "arr" ++ show id 334 | in ((),show t ++ " " ++ newname ++ "\n",i+1) 335 | printPrg' i (SeqFor n f) = 336 | let (a,prg2,i') = printPrg' i (f (variable "i")) 337 | 338 | in ( a, 339 | "for (i in 0.." ++ show n ++ ")" ++ 340 | "{\n" ++ prg2 ++ "\n}", 341 | i') 342 | 343 | printPrg' i (ForAll n f) = 344 | let (a,prg2,i') = printPrg' i (f (variable "i")) 345 | 346 | in ( a, 347 | "par (i in 0.." ++ show n ++ ")" ++ 348 | "{\n" ++ prg2 ++ "\n}", 349 | i') 350 | --printPrg' i (ForAllBlocks n f) = 351 | -- let (d,prg2,i') = printPrg' i (f (variable "BIX")) 352 | -- in (d, 353 | -- "blocks (i)" ++ 354 | -- "{\n" ++ prg2 ++ "\n}", 355 | -- i') 356 | printPrg' i (Return a) = (a,"MonadReturn;\n",i) 357 | printPrg' i (Bind m f) = 358 | let (a1, str1,i1) = printPrg' i m 359 | (a2,str2,i2) = printPrg' i1 (f a1) 360 | in (a2,str1 ++ str2, i2) 361 | printPrg' i Sync = ((),"Sync;\n",i) 362 | -------------------------------------------------------------------------------- /Obsidian/CodeGen/Program.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs, 2 | ExistentialQuantification, 3 | FlexibleInstances #-} 4 | {-# LANGUAGE TypeSynonymInstances #-} 5 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 6 | {-# LANGUAGE ConstraintKinds #-} 7 | {-# LANGUAGE DataKinds #-} 8 | 9 | {- CodeGen.Program. 10 | 11 | Joel Svensson 2012, 2013 12 | 13 | Notes: 14 | 2013-03-17: Codegeneration is changing 15 | -} 16 | 17 | 18 | module Obsidian.CodeGen.Program where 19 | 20 | import Obsidian.Exp 21 | import Obsidian.Globs 22 | import Obsidian.Types 23 | import Obsidian.Atomic 24 | 25 | import qualified Obsidian.Program as P 26 | 27 | import Data.Word 28 | import Data.Supply 29 | import Data.List 30 | 31 | import System.IO.Unsafe 32 | 33 | import Control.Monad.State 34 | import Control.Applicative 35 | 36 | 37 | --------------------------------------------------------------------------- 38 | -- New Intermediate representation 39 | --------------------------------------------------------------------------- 40 | 41 | type IMList a = [(Statement a,a)] 42 | 43 | type IM = IMList () 44 | 45 | out :: a -> [(a,())] 46 | out a = [(a,())] 47 | 48 | -- Atomic operations 49 | data AtOp = AtInc 50 | | AtAdd IExp 51 | | AtSub IExp 52 | | AtExch IExp 53 | 54 | 55 | data HLevel = Thread 56 | | Warp 57 | | Block 58 | | Grid 59 | 60 | 61 | -- Statements 62 | data Statement t = SAssign IExp [IExp] IExp 63 | | SAtomicOp IExp IExp AtOp 64 | | SCond IExp (IMList t) 65 | | SSeqFor String IExp (IMList t) 66 | | SBreak 67 | | SSeqWhile IExp (IMList t) 68 | 69 | -- Iters Body 70 | | SForAll HLevel IExp (IMList t) 71 | | SDistrPar HLevel IExp (IMList t) 72 | 73 | -- | SForAllBlocks IExp (IMList t) 74 | -- | SNWarps IExp (IMList t) 75 | -- | SWarpForAll IExp (IMList t) 76 | -- | SWarpForAll String String IExp (IMList t) 77 | 78 | -- Memory Allocation.. 79 | | SAllocate Name Word32 Type 80 | | SDeclare Name Type 81 | 82 | -- Synchronisation 83 | | SSynchronize 84 | 85 | 86 | --------------------------------------------------------------------------- 87 | -- Collect and pass around data during first step compilation 88 | data Context = Context { ctxNWarps :: Maybe Word32, 89 | ctxNThreads :: Maybe Word32, 90 | ctxGLBUsesTid :: Bool, 91 | ctxGLBUsesWid :: Bool} 92 | 93 | newtype CM a = CM (State Context a) 94 | deriving (Monad, MonadState Context, Functor, Applicative) 95 | 96 | runCM :: CM a -> Context -> a 97 | --runCM (CM cm) ctx = evalState cm ctx 98 | runCM (CM cm) = evalState cm 99 | 100 | evalCM :: CM a -> Context -> (a, Context) 101 | evalCM (CM cm) = runState cm 102 | 103 | setUsesTid :: CM () 104 | setUsesTid = modify $ \ctx -> ctx { ctxGLBUsesTid = True } 105 | 106 | setUsesWid :: CM () 107 | setUsesWid = modify $ \ctx -> ctx { ctxGLBUsesWid = True } 108 | 109 | enterWarp :: Word32 -> CM () 110 | enterWarp n = modify $ \ctx -> ctx { ctxNWarps = Just n } 111 | 112 | enterThread :: Word32 -> CM () 113 | enterThread n = modify $ \ctx -> ctx {ctxNThreads = Just n} 114 | 115 | clearWarp :: CM () 116 | clearWarp = modify $ \ctx -> ctx {ctxNWarps = Nothing} 117 | 118 | getNWarps :: CM (Maybe Word32) 119 | getNWarps = do 120 | ctx <- get 121 | return $ ctxNWarps ctx 122 | 123 | getNThreads :: CM (Maybe Word32) 124 | getNThreads = do 125 | ctx <- get 126 | return $ ctxNThreads ctx 127 | 128 | emptyCtx :: Context 129 | emptyCtx = Context Nothing Nothing False False 130 | --------------------------------------------------------------------------- 131 | 132 | 133 | -- Sort these out and improve! 134 | usesWarps :: IMList t -> Bool 135 | usesWarps = any (go . fst) 136 | where 137 | go (SDistrPar _ _ im) = usesWarps im 138 | go (SForAll Warp _ _) = True 139 | go _ = False 140 | 141 | usesTid :: IMList t -> Bool 142 | usesTid = any (go . fst) 143 | where 144 | go (SDistrPar _ _ im) = usesTid im 145 | go (SForAll Block _ _) = True 146 | go (SSeqFor _ _ im) = usesTid im 147 | go _ = False 148 | usesBid :: IMList t -> Bool 149 | usesBid = any (go . fst) 150 | where 151 | go (SDistrPar Block _ _) = True -- usesBid im 152 | -- go (SForAll Block _ _) = True 153 | go _ = False 154 | usesGid :: IMList t -> Bool 155 | usesGid = any (go . fst) 156 | where 157 | go (SForAll Grid _ _) = True 158 | go _ = False 159 | 160 | 161 | 162 | --------------------------------------------------------------------------- 163 | -- COmpilation of Program to IM 164 | --------------------------------------------------------------------------- 165 | 166 | compileStep1 :: Compile t => P.Program t a -> IM 167 | compileStep1 p = snd $ runCM (compile ns p) emptyCtx 168 | where 169 | ns = unsafePerformIO$ newEnumSupply 170 | 171 | 172 | class Compile t where 173 | compile :: Supply Int -> P.Program t a -> CM (a,IM) 174 | 175 | -- Compile Thread program 176 | instance Compile P.Thread where 177 | -- Can add cases for P.ForAll here. 178 | -- Turn into sequential loop. Could be important to make push 179 | -- operate uniformly across entire hierarchy. 180 | compile s (P.ForAll n f) = 181 | do 182 | let (i1,i2) = split2 s 183 | nom = "i" ++ show (supplyValue i1) 184 | v = variable nom 185 | p = f v 186 | (a,im) <- compile i2 p 187 | 188 | return ((),out $ SSeqFor nom (expToIExp n) im) 189 | compile s (P.Allocate nom n t) = do 190 | (Just nt) <- getNThreads -- must be a Just at this point 191 | nw' <- getNWarps 192 | 193 | let nw = case nw' of 194 | Nothing -> 1 195 | Just i -> i 196 | 197 | return ((),out $ SAllocate nom (nt*nw*n) t) 198 | compile _ (P.Sync) = 199 | return ((),[]) 200 | 201 | compile s p = cs s p 202 | 203 | -- Compile Warp program 204 | instance Compile P.Warp where 205 | compile s (P.DistrPar n f) = 206 | error "Currently not supported to distribute over the threads, use ForAll instead!" 207 | compile s (P.ForAll n@(Literal n') f) = do 208 | 209 | -- setup context to know number of threads 210 | -- executing 211 | enterThread n' 212 | 213 | let p = f (variable "warpIx") 214 | (a,im) <- compile s p 215 | return (a, out $ SForAll Warp (expToIExp n) im) 216 | --undefined -- compile a warp program that iterates over a space n large 217 | compile s (P.Allocate nom n t) = do 218 | (Just nw) <- getNWarps -- Must be a Just here, or something is wrong! 219 | return ((),out $ SAllocate nom (nw*n) t) 220 | compile s (P.Bind p f) = do 221 | let (s1,s2) = split2 s 222 | (a,im1) <- compile s1 p 223 | (b,im2) <- compile s2 (f a) 224 | return (b,(im1 ++ im2)) 225 | compile s (P.Return a) = return (a,[]) 226 | compile s (P.Identifier) = return (supplyValue s, []) 227 | compile s (P.Sync) = return ((),[]) 228 | -- Why no fallthrough here ? 229 | -- Adding (must have been a horrible mistake!) 230 | compile s p = cs s p 231 | 232 | -- Compile Block program 233 | instance Compile P.Block where 234 | compile s (P.ForAll n@(Literal n') f) = do 235 | 236 | -- Set up the context to know the number of 237 | -- concurrent thread programs that are executing. 238 | enterThread n' 239 | setUsesTid 240 | 241 | let nom = "tid" 242 | v = variable nom 243 | p = f v 244 | 245 | (a,im) <- compile s p 246 | -- in this case a could be () (since it is guaranteed to be anyway). a 247 | return (a,out (SForAll Block (expToIExp n) im)) 248 | 249 | compile s (P.DistrPar n'@(Literal n) f) = do 250 | 251 | {- Distribute work over warps! -} 252 | -- Set up the context for the compilation 253 | -- of the Warp code. 254 | -- BUG: Something like this is needed for distribution 255 | -- over threads too! 256 | -- FIXED: Bug mentioned above should be (at least) partially fixed. 257 | enterWarp n 258 | -- Number of active warps are stored in the context. 259 | (a,im) <- compile s (f (variable "warpID")) 260 | return (a, out (SDistrPar Warp (expToIExp n') im)) 261 | compile s (P.Allocate id n t) = return ((),out (SAllocate id n t)) 262 | compile s (P.Sync) = return ((),out (SSynchronize)) 263 | compile s p = cs s p 264 | 265 | -- Compile a Grid Program 266 | instance Compile P.Grid where 267 | -- compile s (P.ForAll n f) = do 268 | 269 | -- -- Incorrect, need to compute global thread ids and apply 270 | -- let p = f gid -- (BlockIdx X) 271 | -- gid = variable "gid" 272 | -- (a,im) <- compile s p 273 | -- return (a, out (SForAll Grid (expToIExp n) im)) 274 | 275 | {- Distribute over blocks -} 276 | compile s (P.DistrPar n f) = do 277 | -- Need to generate IM here that the backend can read desired number of blocks from 278 | let p = f (variable "bid") -- (BlockIdx X) 279 | 280 | (a, im) <- compile s p -- (f (BlockIdx X)) 281 | return (a, out (SDistrPar Block (expToIExp n) im)) 282 | compile s (P.Allocate _ _ _) = error "Allocate at level Grid" 283 | compile s p = cs s p 284 | 285 | 286 | 287 | 288 | --------------------------------------------------------------------------- 289 | -- General compilation 290 | --------------------------------------------------------------------------- 291 | cs :: forall t a . Compile t => Supply Int -> P.Program t a -> CM (a,IM) 292 | cs i P.Identifier = return $ (supplyValue i, []) 293 | cs i (P.Assign name ix e) = 294 | return $ ((),out (SAssign (IVar name (typeOf e)) (map expToIExp ix) (expToIExp e))) 295 | 296 | cs i (P.AtomicOp name ix atom) = 297 | case atom of 298 | AtomicInc -> return $ ((),out (SAtomicOp (IVar name Word32) (expToIExp ix) AtInc)) 299 | AtomicAdd e -> undefined 300 | AtomicSub e -> undefined 301 | AtomicExch e -> undefined 302 | -- (vres, out im) 303 | -- where 304 | -- res = "a" ++ show (supplyValue i) 305 | -- vres = IVar res (typeOf (undefined :: a)) 306 | -- vname = IVar name (typeOf (undefined :: a)) 307 | -- im = SAtomicOp vres vname ix atom 308 | 309 | --cs i (P.AtomicOp name ix at) = (v,out im) 310 | -- where 311 | -- nom = "a" ++ show (supplyValue i) 312 | -- v = variable nom 313 | -- im = SAtomicOp nom name ix at 314 | 315 | cs i (P.Cond bexp p) = do 316 | ((),im) <- compile i p 317 | return ((),out (SCond (expToIExp bexp) im)) 318 | 319 | 320 | cs i (P.SeqFor n f) = do 321 | let (i1,i2) = split2 i 322 | nom = "i" ++ show (supplyValue i1) 323 | v = variable nom 324 | p = f v 325 | (a,im) <- compile i2 p 326 | 327 | return (a,out (SSeqFor nom (expToIExp n) im)) 328 | 329 | 330 | cs i (P.SeqWhile b p) = do 331 | (a,im) <- compile i p 332 | return (a, out (SSeqWhile (expToIExp b) im)) 333 | 334 | 335 | 336 | cs i (P.Break) = return ((), out SBreak) 337 | 338 | -- This (Allocate) should be covered by the Hierarchy instances 339 | -- above and should be removed from here. 340 | --cs i (P.Allocate id n t) = return ((),out (SAllocate id n t)) 341 | cs i (P.Declare id t) = return ((),out (SDeclare id t)) 342 | 343 | -- cs i (P.Sync) = return ((),out (SSynchronize)) 344 | 345 | 346 | cs i (P.Bind p f) = do 347 | let (s1,s2) = split2 i 348 | (a,im1) <- compile s1 p 349 | (b,im2) <- compile s2 (f a) 350 | 351 | return (b,im1 ++ im2) 352 | 353 | 354 | cs i (P.Return a) = return (a,[]) 355 | 356 | 357 | -- Unhandled cases 358 | cs i p = error $ "#Program.hs# unhandled in cs: " ++ P.printPrg p -- compile i p 359 | 360 | --------------------------------------------------------------------------- 361 | -- Turning IM to strings 362 | --------------------------------------------------------------------------- 363 | 364 | printIM :: Show a => IMList a -> String 365 | printIM im = concatMap printStm im 366 | 367 | -- Print a Statement with metadata 368 | printStm :: Show a => (Statement a,a) -> String 369 | printStm (SAssign name [] e,m) = 370 | show name ++ " = " ++ show e ++ ";" ++ meta m 371 | printStm (SAssign name ix e,m) = 372 | show name ++ "[" ++ concat (intersperse "," (map show ix)) ++ "]" ++ 373 | " = " ++ show e ++ ";" ++ meta m 374 | --printStm (SAtomicOp res arr ix op,m) = 375 | -- res ++ " = " ++ 376 | -- printAtomic op ++ "(" ++ arr ++ "[" ++ show ix ++ "]);" ++ meta m 377 | printStm (SAllocate name n t,m) = 378 | name ++ " = malloc(" ++ show n ++ ");" ++ meta m 379 | printStm (SDeclare name t,m) = 380 | show t ++ " " ++ name ++ ";" ++ meta m 381 | printStm (SCond bexp im,m) = 382 | "if " ++ show bexp ++ "{\n" ++ 383 | concatMap printStm im ++ "\n};" ++ meta m 384 | 385 | printStm (SSynchronize,m) = 386 | "sync();" ++ meta m 387 | 388 | printStm (SSeqFor name n im,m) = 389 | "for " ++ name ++ " in [0.." ++ show n ++"] do" ++ meta m ++ 390 | concatMap printStm im ++ "\ndone;\n" 391 | 392 | 393 | printStm (SForAll Warp n im,m) = 394 | "forAll wid" ++ " in [0.." ++ show n ++"] do" ++ meta m ++ 395 | concatMap printStm im ++ "\ndone;\n" 396 | 397 | printStm (SForAll Block n im,m) = 398 | "forAll tid" ++ " in [0.." ++ show n ++"] do" ++ meta m ++ 399 | concatMap printStm im ++ "\ndone;\n" 400 | 401 | printStm (SForAll Grid n im,m) = 402 | "forAll gid in [0.." ++ show n ++"] do" ++ meta m ++ 403 | concatMap printStm im ++ "\ndone;\n" 404 | 405 | printStm (SDistrPar lvl n im,m) = 406 | "forAll gid in [0.." ++ show n ++"] do" ++ meta m ++ 407 | concatMap printStm im ++ "\ndone;\n" 408 | 409 | -- printStm (SWarpForAll n im ,m) = 410 | -- "forAll(InWarp) tid" ++ " in [0.." ++ show n ++"] do" ++ meta m ++ 411 | -- concatMap printStm im ++ "\ndone;\n" 412 | 413 | -- printStm (SNWarps n im,m) = "Run " ++ show n++ " Warps {\n" ++ 414 | -- printIM im ++ "\n }" 415 | --printStm (SForAllThreads n im,m) = 416 | -- "forAllThreads i in [0.." ++ show n ++"] do" ++ meta m ++ 417 | -- concatMap printStm im ++ "\ndone;\n" 418 | 419 | 420 | 421 | -- printStm (a,m) = error $ show m 422 | 423 | meta :: Show a => a -> String 424 | meta m = "\t//" ++ show m ++ "\n" 425 | 426 | 427 | 428 | 429 | 430 | -------------------------------------------------------------------------------- /Obsidian/CodeGen/Memory2.hs: -------------------------------------------------------------------------------- 1 | 2 | {- Joel Svensson 2012, 2013 3 | 4 | Notes: 5 | Jan-27-2015: Bug fix related to arrays alive when 6 | entering into loops. 7 | Fix seems to solve the problem. 8 | Need to investigate that it still frees arrays 9 | as soon as possible. 10 | Nov-25-2014: Changes to memory management 11 | 12 | Jan-21-2013: Added a SeqFor case 13 | 14 | -} 15 | module Obsidian.CodeGen.Memory2 16 | (MemMap, 17 | Memory, 18 | allocate, 19 | free, 20 | freeAll, 21 | size, 22 | Address, 23 | Bytes, 24 | memMapIM, 25 | renameIM, 26 | SharedMemConfig(..), 27 | createSharedMem 28 | ) 29 | where 30 | 31 | import qualified Data.List as List 32 | import qualified Data.Set as Set 33 | import Data.Word 34 | import Data.Maybe 35 | 36 | import Obsidian.Types 37 | import Obsidian.Globs 38 | 39 | import Obsidian.Exp 40 | import Obsidian.CodeGen.Program 41 | import Obsidian.CodeGen.Liveness 42 | 43 | import Debug.Trace 44 | 45 | 46 | import qualified Data.Map as Map 47 | --------------------------------------------------------------------------- 48 | -- Planned improvements 49 | --------------------------------------------------------------------------- 50 | -- # Always start a shared memory array at a "bank-aligned" address 51 | -- + So that programmer can really effect access patterns. 52 | -- # Do not rename with pointers 53 | -- instead output a list of (type ,name,address) quadruples 54 | -- that can be used to create an alias (at top scope of program) 55 | -- + Prettier code -> easier debugging 56 | -- + Potential efficiency issues, from less casting etc 57 | -- # (LONG-TERM) Clever memory allocation 58 | -- + The future is known! So encode the optimal memory 59 | -- allocation schema 60 | 61 | --------------------------------------------------------------------------- 62 | -- Memory layout 63 | --------------------------------------------------------------------------- 64 | 65 | type MemMap = Map.Map Name (AlignedAddress,Type) 66 | 67 | type AlignedAddress = (Address,Address) 68 | 69 | type Address = Word32 70 | type Bytes = Word32 71 | 72 | data Memory = Memory {freeList :: [(Address,Bytes)] , 73 | allocated :: [(Address,Bytes)] , 74 | size :: Bytes} -- how much used 75 | deriving Show 76 | 77 | updateMax :: Memory -> Memory 78 | updateMax mem = let m = maximum [a+b|(a,b) <- allocated mem] 79 | m' = max m (size mem) 80 | in mem {size = m'} 81 | 82 | 83 | --------------------------------------------------------------------------- 84 | -- Shared memory configurations 85 | --------------------------------------------------------------------------- 86 | data SharedMemConfig = 87 | SharedMemConfig { smSize :: Bytes -- amount of shared mem 88 | , smBanks :: Word32 -- Number of banks 16/32 89 | , smBankAlign :: Bool 90 | } 91 | 92 | 93 | createSharedMem :: SharedMemConfig -> Memory 94 | createSharedMem conf = Memory [(0,smSize conf)] [] 0 95 | 96 | -- bank allign an address, returning a new aligned address 97 | -- and the number of EXTRA bytes that needs to be present 98 | -- from the old provided address in order to store aligned data 99 | -- at the new location. 100 | bank_align :: SharedMemConfig -> Address -> (AlignedAddress, Bytes) 101 | bank_align conf address = 102 | case (how_far_off == 0) of 103 | True -> ((address,address),0) 104 | False -> ((address,address + bump), bump) 105 | 106 | where banks = smBanks conf 107 | -- if address % bank_alignment == 0 108 | -- the address is aligned 109 | bank_alignment = banks * 4 -- number of banks * 4 bytes 110 | 111 | how_far_off = address `mod` bank_alignment 112 | bump = bank_alignment - how_far_off 113 | 114 | --------------------------------------------------------------------------- 115 | -- Allocate memory 116 | --------------------------------------------------------------------------- 117 | allocate :: SharedMemConfig -> Memory -> Bytes -> (Memory,AlignedAddress) 118 | allocate conf m b = 119 | case smBankAlign conf of 120 | True -> 121 | -- Does any memory location exist that 122 | -- allows for the allocation of this array 123 | case catMaybes new_candidates of 124 | [] -> error $ "allocate: out of shared memory:" ++ 125 | "\n Allocating: " ++ show b ++ " bytes" ++ 126 | "\n Free List: " ++ show (freeList m) ++ 127 | "\n Potentials: " ++ show address_candidates ++ 128 | "\n Fit with align: " ++ show new_candidates 129 | 130 | ((aligned_address,free_space,alloc_size):_) -> 131 | -- update free list 132 | -- Clear the allocated address from the free list 133 | let fl = filter (\(addr,_) -> (fst aligned_address /= addr)) (freeList m) 134 | -- if the chosen space is larger than what we need 135 | -- add the unused chunk to the free list 136 | fl' = if alloc_size < free_space 137 | then (fst aligned_address + alloc_size, 138 | free_space - alloc_size):fl 139 | else fl 140 | -- Update memory and return a result address 141 | in (updateMax $ m { freeList = fl' 142 | , allocated = 143 | (fst aligned_address,alloc_size):allocated m} 144 | , aligned_address) 145 | 146 | False -> 147 | case map (pretend_align b) address_candidates of 148 | [] -> error "out of shared memory" 149 | 150 | ((aligned_address,free_space,alloc_size):_) -> 151 | -- update free list 152 | -- Clear the allocated address from the free list 153 | let fl = filter (\(addr,_) -> (fst aligned_address /= addr)) (freeList m) 154 | -- if the chosen space is larger than what we need 155 | -- add the unused chunk to the free list 156 | fl' = if alloc_size < free_space 157 | then (fst aligned_address + alloc_size, 158 | free_space - alloc_size):fl 159 | else fl 160 | -- Update memory and return a result address 161 | in (updateMax $ m { freeList = fl' 162 | , allocated = 163 | (fst aligned_address,alloc_size):allocated m} 164 | , aligned_address) 165 | 166 | 167 | 168 | 169 | where 170 | -- Candidates after aligning 171 | new_candidates = map (tryCandidate b) address_candidates 172 | -- Original address canditades 173 | address_candidates = filter (\(_,y) -> y >= b) $ freeList m 174 | -- Create silly AlignedAddress (that are not really aligned at all) 175 | pretend_align bytes (addr, free_space) = ((addr,addr),free_space,bytes) 176 | 177 | -- try to align an address 178 | -- results in an AlignedAdress 179 | tryCandidate bytes (addr, free_space) = 180 | let (aligned_addr, extra_bytes) = bank_align conf addr 181 | alloc_size = bytes + extra_bytes 182 | in 183 | case free_space >= alloc_size of 184 | True -> Just (aligned_addr,free_space,alloc_size) 185 | False -> Nothing 186 | 187 | --------------------------------------------------------------------------- 188 | -- Free memory 189 | --------------------------------------------------------------------------- 190 | free :: Memory -> AlignedAddress -> Memory 191 | free m (alloc_addr,_) = mem 192 | where 193 | bytes = lookup (alloc_addr) (allocated m) 194 | al = filter (\(addr,_) -> alloc_addr /= addr) (allocated m) 195 | 196 | -- TODO: Investigate this much closer. 197 | -- Is it a bug or is freeing a non allocated memory area 198 | -- OK? 199 | -- 2014-Nov-25: I dont remember what this refers to 200 | -- But, if a problem resurfaces, look here. 201 | 202 | mem = case bytes of 203 | Nothing -> m 204 | {- 205 | error $ "error: Address " ++ show a ++ 206 | " not found in allocated list" ++ 207 | "\n" ++ show m 208 | -} 209 | Just b -> m {freeList = compress ((alloc_addr,b):(freeList m)), 210 | allocated = al} 211 | 212 | freeAll :: Memory -> [AlignedAddress] -> Memory 213 | freeAll m [] = m 214 | freeAll m (a:as) = freeAll (free m a) as 215 | 216 | 217 | compress :: [(Address,Bytes)] -> [(Address,Bytes)] 218 | compress = merge . List.sort 219 | where 220 | merge :: [(Address,Bytes)] -> [(Address,Bytes)] 221 | merge [] = [] 222 | merge [x] = [x] 223 | merge ((x,b):(y,b2):xs) = if (x+b == y) 224 | then merge ((x,b+b2):xs) 225 | else (x,b):merge((y,b2):xs) 226 | 227 | 228 | --------------------------------------------------------------------------- 229 | -- Memory map the new IM 230 | --------------------------------------------------------------------------- 231 | 232 | memMapIM :: SharedMemConfig -> IML -> MemMap -> (Memory, MemMap) 233 | memMapIM conf im memmap = mmIM conf im memory memmap 234 | where 235 | memory = createSharedMem conf 236 | 237 | mmIM :: SharedMemConfig -> IML -> Memory -> MemMap -> (Memory, MemMap) 238 | mmIM conf im memory memmap = r im (memory,memmap) 239 | where 240 | r [] m = m 241 | r (x:xs) (m,mm) = 242 | let 243 | (m',mm') = process conf x m mm 244 | 245 | freeable = getFreeableSet x xs 246 | freeableAddrs = mapM (flip Map.lookup mm') (filter dontMap (Set.toList freeable)) 247 | dontMap name = not ((List.isPrefixOf "input" name) || 248 | (List.isPrefixOf "output" name)) 249 | mNew = 250 | case freeableAddrs of 251 | (Just as) -> freeAll m' (map fst as) 252 | Nothing -> m' 253 | in -- trace ("freeable: " ++ show freeable ++ "\n") $ 254 | r xs (mNew,mm') 255 | 256 | process :: SharedMemConfig -> (Statement Liveness,Liveness) -> Memory -> MemMap -> (Memory,MemMap) 257 | process conf (SAllocate name size t,_) m mm = (m',mm') 258 | where (m',addr) = allocate conf m size 259 | mm' = 260 | case Map.lookup name mm of 261 | Nothing -> Map.insert name (addr,t) mm 262 | (Just (a, t)) -> error $ "mmIm: " ++ name ++ " is already mapped to " ++ show a 263 | 264 | -- Boilerplate 265 | -- BUG: Bug in memory management related to seqloops 266 | -- It may be better to try to fix this bug here. 267 | -- A special mmIM for the loop case may be needed. 268 | process conf (SSeqFor _ n im,alive) m mm = mmIMLoop conf alive im m mm 269 | process conf (SSeqWhile b im,_) m mm = mmIM conf im m mm 270 | process conf (SForAll _ n im,_) m mm = mmIM conf im m mm 271 | -- 2014-Nov-25: 272 | -- This one used mmIM' which was identical to mmIM. 273 | -- This must have been a leftover from when I thought 274 | -- warp memory needed some special attention here. 275 | process conf (SDistrPar Warp n im,_) m mm = mmIMDistrWarp conf im m mm 276 | process conf (SDistrPar Block n im,_) m mm = mmIM conf im m mm 277 | process conf (_,_) m mm = (m,mm) 278 | 279 | -- Friday (2013 Mars 29, discovered bug) 280 | -- 2014-Nov-25: was the "l" the bug ? (some details help) 281 | getFreeableSet :: (Statement Liveness,Liveness) -> IML -> Liveness 282 | getFreeableSet (_,l) [] = Set.empty -- not l ! 283 | getFreeableSet (_,l) ((_,l1):_) = l Set.\\ l1 284 | 285 | 286 | --------------------------------------------------------------------------- 287 | -- 288 | --------------------------------------------------------------------------- 289 | mmIMLoop conf nonfreeable im memory memmap = r im (memory,memmap) 290 | where 291 | r [] m = m 292 | r (x:xs) (m,mm) = 293 | let 294 | (m',mm') = process conf x m mm 295 | 296 | freeable' = getFreeableSet x xs 297 | freeable = freeable' Set.\\ nonfreeable 298 | freeableAddrs = mapM (flip Map.lookup mm') (filter dontMap (Set.toList freeable)) 299 | dontMap name = not ((List.isPrefixOf "input" name) || 300 | (List.isPrefixOf "output" name)) 301 | mNew = 302 | case freeableAddrs of 303 | (Just as) -> freeAll m' (map fst as) 304 | Nothing -> m' 305 | in --trace ("freeable': " ++ show freeable' ++ "\n" ++ 306 | -- "freeable: " ++ show freeable ++ "\n" ++ 307 | -- "nonfreeable: " ++ show nonfreeable) $ 308 | r xs (mNew,mm') 309 | 310 | process :: SharedMemConfig -> (Statement Liveness,Liveness) -> Memory -> MemMap -> (Memory,MemMap) 311 | process conf (SAllocate name size t,_) m mm = (m',mm') 312 | where (m',addr) = allocate conf m size 313 | mm' = 314 | case Map.lookup name mm of 315 | Nothing -> Map.insert name (addr,t) mm 316 | (Just (a, t)) -> error $ "mmIm: " ++ name ++ " is already mapped to " ++ show a 317 | 318 | -- Boilerplate 319 | process conf (SSeqFor _ n im,alive) m mm = mmIMLoop conf (nonfreeable `Set.union` alive) im m mm 320 | process conf (SSeqWhile b im,_) m mm = mmIMLoop conf nonfreeable im m mm 321 | process conf (SForAll _ n im,_) m mm = mmIMLoop conf nonfreeable im m mm 322 | -- 2014-Nov-25: 323 | -- This one used mmIM' which was identical to mmIM. 324 | -- This must have been a leftover from when I thought 325 | -- warp memory needed some special attention here. 326 | process conf (SDistrPar Warp n im,_) m mm = mmIMLoop conf nonfreeable im m mm 327 | process conf (SDistrPar Block n im,_) m mm = mmIMLoop conf nonfreeable im m mm 328 | process conf (_,_) m mm = (m,mm) 329 | 330 | 331 | 332 | -- NOTE: This is a hack to make programs distributed 333 | -- over warps not "free" its arrays. 334 | -- Distributing over warps introduces entirely new 335 | -- shared memory behaviour.. 336 | -- This needs a review and some real thought! 337 | 338 | mmIMDistrWarp conf im memory memmap = r im (memory,memmap) 339 | where 340 | r [] m = m 341 | r (x:xs) (m,mm) = 342 | let 343 | (m',mm') = process conf x m mm 344 | 345 | freeable = getFreeableSet x xs 346 | --freeable = freeable' Set.\\ nonfreeable 347 | freeableAddrs = mapM (flip Map.lookup mm') (filter dontMap (Set.toList freeable)) 348 | dontMap name = not ((List.isPrefixOf "input" name) || 349 | (List.isPrefixOf "output" name)) 350 | mNew = 351 | case freeableAddrs of 352 | (Just as) -> m' -- freeAll m' (map fst as) 353 | Nothing -> m' 354 | in --trace ("freeable': " ++ show freeable' ++ "\n" ++ 355 | -- "freeable: " ++ show freeable ++ "\n" ++ 356 | -- "nonfreeable: " ++ show nonfreeable) $ 357 | r xs (mNew,mm') 358 | 359 | process :: SharedMemConfig -> (Statement Liveness,Liveness) -> Memory -> MemMap -> (Memory,MemMap) 360 | process conf (SAllocate name size t,_) m mm = (m',mm') 361 | where (m',addr) = allocate conf m size 362 | mm' = 363 | case Map.lookup name mm of 364 | Nothing -> Map.insert name (addr,t) mm 365 | (Just (a, t)) -> error $ "mmIm: " ++ name ++ " is already mapped to " ++ show a 366 | 367 | -- Boilerplate 368 | process conf (SSeqFor _ n im,alive) m mm = mmIMDistrWarp conf im m mm 369 | process conf (SSeqWhile b im,_) m mm = mmIMDistrWarp conf im m mm 370 | process conf (SForAll _ n im,_) m mm = mmIMDistrWarp conf im m mm 371 | -- 2014-Nov-25: 372 | -- This one used mmIM' which was identical to mmIM. 373 | -- This must have been a leftover from when I thought 374 | -- warp memory needed some special attention here. 375 | process conf (SDistrPar Warp n im,_) m mm = mmIMDistrWarp conf im m mm 376 | process conf (SDistrPar Block n im,_) m mm = mmIMDistrWarp conf im m mm 377 | process conf (_,_) m mm = (m,mm) 378 | 379 | 380 | --------------------------------------------------------------------------- 381 | -- Rename arrays in IM 382 | --------------------------------------------------------------------------- 383 | 384 | renameIM :: MemMap -> IML -> IMList () 385 | renameIM mm im = zip (map (go . fst) im) (repeat ()) 386 | where 387 | go (SAssign name ix e) = SAssign (renameIVar mm name) 388 | (map (renameIExp mm) ix) 389 | (renameIExp mm e) 390 | go (SAtomicOp name ix atop) = SAtomicOp (renameIVar mm name) 391 | (renameIExp mm ix) 392 | (renameAtOp mm atop) 393 | go (SCond be im) = SCond (renameIExp mm be) 394 | (renameIM mm im) 395 | go (SSeqFor str n im) = SSeqFor str (renameIExp mm n) 396 | (renameIM mm im) 397 | go SBreak = SBreak 398 | go (SSeqWhile n im) = SSeqWhile (renameIExp mm n) 399 | (renameIM mm im) 400 | go (SForAll lvl n im) = SForAll lvl (renameIExp mm n) 401 | (renameIM mm im) 402 | go (SDistrPar lvl n im) = SDistrPar lvl (renameIExp mm n) 403 | (renameIM mm im) 404 | 405 | -- go (SForAllBlocks n im) = SForAllBlocks (renameIExp mm n) 406 | -- (renameIM mm im) 407 | -- go (SNWarps n im) = SNWarps (renameIExp mm n) 408 | -- (renameIM mm im) 409 | -- go (SWarpForAll n im) = SWarpForAll (renameIExp mm n) 410 | -- (renameIM mm im) 411 | -- Strip this out earlier. 412 | go (SAllocate name n t) = SAllocate name n t 413 | go (SDeclare name t) = SDeclare name t 414 | go SSynchronize = SSynchronize 415 | 416 | --------------------------------------------------------------------------- 417 | -- Memory map the arrays in an CExpr 418 | --------------------------------------------------------------------------- 419 | renameIExp :: MemMap -> IExp -> IExp 420 | renameIExp mm e@(IVar _ _) = renameIVar mm e 421 | renameIExp mm (IIndex (e1,es) t) = IIndex (renameIExp mm e1, map (renameIExp mm) es) t 422 | renameIExp mm (IBinOp op e1 e2 t) = IBinOp op (renameIExp mm e1) (renameIExp mm e2) t 423 | renameIExp mm (IUnOp op e t) = IUnOp op (renameIExp mm e) t 424 | renameIExp mm (IFunCall nom exprs t) = IFunCall nom (map (renameIExp mm) exprs) t 425 | renameIExp mm (ICast e t) = ICast (renameIExp mm e) t 426 | renameIExp mm (ICond e1 e2 e3 t) = ICond (renameIExp mm e1) 427 | (renameIExp mm e2) 428 | (renameIExp mm e3) 429 | t 430 | renameIExp _ a = a 431 | 432 | renameIVar :: MemMap -> IExp -> IExp 433 | renameIVar mm (IVar name t) = 434 | -- t == t1 should be true 435 | case Map.lookup name mm of 436 | Just ((_,real_addr),t1) -> 437 | let core = sbaseIExp (real_addr) 438 | cast c = ICast c t1 439 | in cast core 440 | 441 | Nothing -> IVar name t 442 | where 443 | sbaseIExp 0 = IVar "sbase" (Pointer Word8) 444 | sbaseIExp addr = IBinOp IAdd (IVar "sbase" (Pointer Word8)) 445 | (IWord32 addr) 446 | (Pointer Word8) 447 | renameIVar _ _ = error "renameIVar: incorrect expression" 448 | 449 | renameAtOp :: MemMap -> AtOp -> AtOp 450 | renameAtOp _ AtInc = AtInc 451 | renameAtOp mm (AtAdd e) = AtAdd (renameIExp mm e) 452 | renameAtOp mm (AtSub e) = AtSub (renameIExp mm e) 453 | renameAtOp mm (AtExch e) = AtExch (renameIExp mm e) 454 | 455 | 456 | -------------------------------------------------------------------------------- /Obsidian/CodeGen/CompileIM.hs: -------------------------------------------------------------------------------- 1 | 2 | {-# LANGUAGE QuasiQuotes #-} 3 | {-# LANGUAGE PackageImports #-} 4 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 5 | 6 | {- 7 | 8 | Joel Svensson 2013..2017 9 | 10 | -} 11 | 12 | module Obsidian.CodeGen.CompileIM where 13 | import Language.C.Quote.CUDA hiding (Block) 14 | import qualified Language.C.Quote.OpenCL as CL 15 | 16 | import qualified "language-c-quote" Language.C.Syntax as C 17 | 18 | import Obsidian.Exp (IExp(..),IBinOp(..),IUnOp(..)) 19 | import Obsidian.Types as T 20 | import Obsidian.DimSpec 21 | import Obsidian.CodeGen.Program 22 | 23 | import Data.Word 24 | 25 | {- Notes: 26 | 27 | 2017-04-22: Generate only CUDA 28 | 29 | 30 | 31 | * TODO: Make sure tid always has correct Value 32 | -} 33 | 34 | --------------------------------------------------------------------------- 35 | -- Config 36 | --------------------------------------------------------------------------- 37 | 38 | data Config = Config { configThreadsPerBlock :: Word32, 39 | configSharedMem :: Word32} 40 | 41 | 42 | 43 | 44 | --------------------------------------------------------------------------- 45 | -- compileExp (maybe a bad name) 46 | --------------------------------------------------------------------------- 47 | compileExp :: IExp -> C.Exp 48 | compileExp (IVar name t) = [cexp| $id:name |] 49 | 50 | 51 | -- TODO: Fix all this! 52 | -- compileExp (IBlockIdx X) = [cexp| $id:("bid")|] -- [cexp| $id:("blockIdx.x") |] 53 | -- compileExp (IBlockIdx Y) = [cexp| $id:("blockIdx.y") |] 54 | -- compileExp (IBlockIdx Z) = [cexp| $id:("blockIdx.z") |] 55 | 56 | -- compileExp (IThreadIdx X) = [cexp| $id:("threadIdx.x") |] 57 | -- compileExp (IThreadIdx Y) = [cexp| $id:("threadIdx.y") |] 58 | -- compileExp (IThreadIdx Z) = [cexp| $id:("threadIdx.z") |] 59 | 60 | -- compileExp (IBlockDim X) = [cexp| $id:("blockDim.x") |] 61 | -- compileExp (IBlockDim Y) = [cexp| $id:("blockDim.y") |] 62 | -- compileExp (IBlockDim Z) = [cexp| $id:("blockDim.z") |] 63 | 64 | -- compileExp (IGridDim X) = [cexp| $id:("GridDim.x") |] 65 | -- compileExp (IGridDim Y) = [cexp| $id:("GridDim.y") |] 66 | -- compileExp (IGridDim Z) = [cexp| $id:("GridDim.z") |] 67 | 68 | compileExp (IBool True) = [cexp|1|] 69 | compileExp (IBool False) = [cexp|0|] 70 | compileExp (IInt8 n) = [cexp| $int:(toInteger n) |] 71 | compileExp (IInt16 n) = [cexp| $int:(toInteger n) |] 72 | compileExp (IInt32 n) = [cexp| $int:(toInteger n) |] 73 | compileExp (IInt64 n) = [cexp| $lint:(toInteger n) |] 74 | 75 | compileExp (IWord8 n) = [cexp| $uint:(toInteger n) |] 76 | compileExp (IWord16 n) = [cexp| $uint:(toInteger n) |] 77 | compileExp (IWord32 n) = [cexp| $uint:(toInteger n) |] 78 | compileExp (IWord64 n) = [cexp| $ulint:(toInteger n) |] 79 | 80 | compileExp (IFloat n) = [cexp| $float:(n) |] 81 | compileExp (IDouble n) = [cexp| $double:(n) |] 82 | 83 | -- Implementing these may be a bit awkward 84 | -- given there are no vector literals in cuda. 85 | compileExp (IFloat2 n m) = error "IFloat2 unhandled" 86 | compileExp (IFloat3 n m l) = error "IFloat3 unhandled" 87 | compileExp (IFloat4 n m l k) = error "IFloat4 unhandled" 88 | compileExp (IDouble2 n m) = error "IDouble2 unhandled" 89 | compileExp (IInt8_2 n m) = error "FIXME" 90 | compileExp (IInt8_3 n m k) = error "FIXME" 91 | compileExp (IInt8_4 n m k l) = error "FIXME" 92 | compileExp (IInt16_2 n m ) = error "FIXME" 93 | compileExp (IInt16_3 n m k) = error "FIXME" 94 | compileExp (IInt16_4 n m k l) = error "FIXME" 95 | compileExp (IInt32_2 n m) = error "FIXME" 96 | compileExp (IInt32_3 n m k) = error "FIXME" 97 | compileExp (IInt32_4 n m k l) = error "FIXME" 98 | compileExp (IInt64_2 n m) = error "FIXME" 99 | compileExp (IInt64_3 n m k) = error "FIXME" 100 | compileExp (IInt64_4 n m k l) = error "FIXME" 101 | compileExp (IWord8_2 n m) = error "FIXME" 102 | compileExp (IWord8_3 n m k) = error "FIXME" 103 | compileExp (IWord8_4 n m k l) = error "FIXME" 104 | compileExp (IWord16_2 n m ) = error "FIXME" 105 | compileExp (IWord16_3 n m k) = error "FIXME" 106 | compileExp (IWord16_4 n m k l) = error "FIXME" 107 | compileExp (IWord32_2 n m) = error "FIXME" 108 | compileExp (IWord32_3 n m k) = error "FIXME" 109 | compileExp (IWord32_4 n m k l) = error "FIXME" 110 | compileExp (IWord64_2 n m) = error "FIXME" 111 | compileExp (IWord64_3 n m k) = error "FIXME" 112 | compileExp (IWord64_4 n m k l) = error "FIXME" 113 | 114 | 115 | compileExp (IIndex (i1,[e]) t) = [cexp| $(compileExp i1)[$(compileExp e)] |] 116 | compileExp a@(IIndex (_,_) _) = error $ "compileExp: Malformed index expression " ++ show a 117 | 118 | compileExp (ICond e1 e2 e3 t) = [cexp| $(compileExp e1) ? $(compileExp e2) : $(compileExp e3) |] 119 | 120 | compileExp (IBinOp op e1 e2 t) = go op 121 | where 122 | x = compileExp e1 123 | y = compileExp e2 124 | go IAdd = [cexp| $x + $y |] 125 | go ISub = [cexp| $x - $y |] 126 | go IMul = [cexp| $x * $y |] 127 | go IDiv = [cexp| $x / $y |] 128 | go IFDiv = [cexp| $x / $y |] 129 | go IMod = [cexp| $x % $y |] 130 | go IEq = [cexp| $x == $y |] 131 | go INotEq = [cexp| $x != $y |] 132 | go ILt = [cexp| $x < $y |] 133 | go IGt = [cexp| $x > $y |] 134 | go IGEq = [cexp| $x >= $y |] 135 | go ILEq = [cexp| $x <= $y |] 136 | go IAnd = [cexp| $x && $y |] 137 | go IOr = [cexp| $x || $y |] 138 | go IPow = case t of 139 | Float -> [cexp|powf($x,$y) |] 140 | Double -> [cexp|pow($x,$y) |] 141 | _ -> error $ "IPow applied at wrong type" 142 | go IBitwiseAnd = [cexp| $x & $y |] 143 | go IBitwiseOr = [cexp| $x | $y |] 144 | go IBitwiseXor = [cexp| $x ^ $y |] 145 | go IShiftL = [cexp| $x << $y |] 146 | go IShiftR = [cexp| $x >> $y |] 147 | compileExp (IUnOp op e t) = go op 148 | where 149 | x = compileExp e 150 | go IBitwiseNeg = [cexp| ~$x|] 151 | go INot = [cexp| !$x|] 152 | go IGetX = [cexp| $x.x|] 153 | go IGetY = [cexp| $x.y|] 154 | go IGetZ = [cexp| $x.z|] 155 | go IGetW = [cexp| $x.w|] 156 | 157 | compileExp (IFunCall name es t) = [cexp| $fc |] 158 | where 159 | es' = map compileExp es 160 | fc = [cexp| $id:(name)($args:(es')) |] 161 | 162 | compileExp (ICast e t) = [cexp| ($ty:(compileType t)) $e' |] 163 | where 164 | e' = compileExp e 165 | 166 | compileExp any = error $ show any 167 | 168 | compileType :: T.Type -> C.Type 169 | compileType (Int8) = [cty| typename int8_t |] 170 | compileType (Int16) = [cty| typename int16_t |] 171 | compileType (Int32) = [cty| typename int32_t |] 172 | compileType (Int64) = [cty| typename int64_t |] 173 | compileType (Word8) = [cty| typename uint8_t |] 174 | compileType (Word16) = [cty| typename uint16_t |] 175 | compileType (Word32) = [cty| typename uint32_t |] 176 | compileType (Word64) = [cty| typename uint64_t |] 177 | compileType (Float) = [cty| float |] 178 | compileType (Double) = [cty| double |] 179 | 180 | compileType (Vec2 Float) = [cty| float4|] 181 | compileType (Vec3 Float) = [cty| float3|] 182 | compileType (Vec4 Float) = [cty| float2|] 183 | 184 | compileType (Vec2 Double) = [cty| double2|] 185 | 186 | -- How does this interplay with my use of uint8_t etc. Here it is char! 187 | compileType (Vec2 Int8) = [cty| char2|] 188 | compileType (Vec3 Int8) = [cty| char3|] 189 | compileType (Vec4 Int8) = [cty| char4|] 190 | 191 | compileType (Vec2 Int16) = [cty| short2|] 192 | compileType (Vec3 Int16) = [cty| short3|] 193 | compileType (Vec4 Int16) = [cty| short4|] 194 | 195 | compileType (Vec2 Int32) = [cty| int2|] 196 | compileType (Vec3 Int32) = [cty| int3|] 197 | compileType (Vec4 Int32) = [cty| int4|] 198 | 199 | compileType (Vec2 Word8) = [cty| uchar2|] 200 | compileType (Vec3 Word8) = [cty| uchar3|] 201 | compileType (Vec4 Word8) = [cty| uchar4|] 202 | 203 | compileType (Vec2 Word16) = [cty| ushort2|] 204 | compileType (Vec3 Word16) = [cty| ushort3|] 205 | compileType (Vec4 Word16) = [cty| ushort4|] 206 | 207 | compileType (Vec2 Word32) = [cty| uint2|] 208 | compileType (Vec3 Word32) = [cty| uint3|] 209 | compileType (Vec4 Word32) = [cty| uint4|] 210 | 211 | 212 | compileType (Shared t) = [cty| __shared__ $ty:(compileType t) |] 213 | compileType (Pointer t) = [cty| $ty:(compileType t)* |] 214 | compileType (Volatile t) = [cty| volatile $ty:(compileType t)|] 215 | compileType t = error $ "compileType: Not implemented " ++ show t 216 | 217 | 218 | --------------------------------------------------------------------------- 219 | -- Statement t to Stm 220 | --------------------------------------------------------------------------- 221 | 222 | 223 | compileStm :: Config -> Statement t -> [C.Stm] 224 | compileStm c (SAssign name [] e) = 225 | [[cstm| $(compileExp name) = $(compileExp e);|]] 226 | compileStm c (SAssign name [ix] e) = 227 | [[cstm| $(compileExp name)[$(compileExp ix)] = $(compileExp e); |]] 228 | compileStm c (SAtomicOp name ix atop) = 229 | case atop of 230 | AtInc -> [[cstm| atomicInc(&$(compileExp name)[$(compileExp ix)],0xFFFFFFFF); |]] 231 | AtAdd e -> [[cstm| atomicAdd(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]] 232 | AtSub e -> [[cstm| atomicSub(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]] 233 | AtExch e -> [[cstm| atomicExch(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]] 234 | 235 | compileStm c (SCond be im) = [[cstm| if ($(compileExp be)) { $stms:body } |]] 236 | where 237 | body = compileIM c im -- (compileIM p c im) 238 | compileStm c (SSeqFor loopVar n im) = 239 | [[cstm| for (int $id:loopVar = 0; $id:loopVar < $(compileExp n); ++$id:loopVar) 240 | { $stms:body } |]] 241 | -- end a sequential for loop with a sync (or begin). 242 | -- Maybe only if the loop is on block level (that is across all threads) 243 | -- __syncthreads();} |]] 244 | where 245 | body = compileIM c im 246 | 247 | 248 | -- Just relay to specific compileFunction 249 | compileStm c a@(SForAll lvl n im) = compileForAll c a 250 | 251 | compileStm c a@(SDistrPar lvl n im) = compileDistr c a 252 | 253 | compileStm c (SSeqWhile b im) = 254 | [[cstm| while ($(compileExp b)) { $stms:body}|]] 255 | where 256 | body = compileIM c im 257 | 258 | compileStm c SSynchronize = [[cstm| __syncthreads(); |]] 259 | 260 | compileStm _ (SAllocate _ _ _) = [] 261 | compileStm _ (SDeclare name t) = [] 262 | 263 | compileStm _ a = error $ "compileStm: missing case " 264 | 265 | --------------------------------------------------------------------------- 266 | -- DistrPar 267 | --------------------------------------------------------------------------- 268 | compileDistr :: Config -> Statement t -> [C.Stm] 269 | compileDistr c (SDistrPar Block n im) = codeQ ++ codeR 270 | -- New here is BLOCK virtualisation 271 | where 272 | cim = compileIM c im -- ++ [[cstm| __syncthreads();|]] 273 | 274 | numBlocks = [cexp| $id:("gridDim.x") |] 275 | 276 | blocksQ = [cexp| $exp:(compileExp n) / $exp:numBlocks|] 277 | blocksR = [cexp| $exp:(compileExp n) % $exp:numBlocks|] 278 | 279 | codeQ = [[cstm| for (int b = 0; b < $exp:blocksQ; ++b) { $stms:bodyQ }|]] 280 | 281 | bodyQ = [cstm| $id:("bid") = blockIdx.x * $exp:blocksQ + b;|] : cim ++ 282 | [[cstm| bid = blockIdx.x;|], 283 | [cstm| __syncthreads();|]] -- yes no ? 284 | 285 | codeR = [[cstm| bid = ($exp:numBlocks * $exp:blocksQ) + blockIdx.x;|], 286 | [cstm| if (blockIdx.x < $exp:blocksR) { $stms:cim }|], 287 | [cstm| bid = blockIdx.x;|], 288 | [cstm| __syncthreads();|]] -- yes no ? 289 | 290 | -- Can I be absolutely sure that 'n' here is statically known ? 291 | -- I must look over the functions that can potentially create this IM. 292 | -- Can make a separate case for unknown 'n' but generate worse code. 293 | -- (That is true for all levels) 294 | compileDistr c (SDistrPar Warp (IWord32 n) im) = codeQ ++ codeR 295 | -- Here the 'im' should be distributed over 'n'warps. 296 | -- 'im' uses a warpID variable to identify what warp it is. 297 | -- 'n' may be higher than the actual number of warps we have! 298 | -- So GPU warp virtualisation is needed. 299 | where 300 | cim = compileIM c im 301 | 302 | nWarps = fromIntegral $ configThreadsPerBlock c `div` 32 303 | numWarps = [cexp| $int:nWarps|] 304 | 305 | (wq, wr) = (n `div` nWarps, n `mod` nWarps) 306 | 307 | warpsQ = [cexp| $int:wq|] 308 | warpsR = [cexp| $int:wr|] 309 | 310 | codeQ = [[cstm| for (int w = 0; w < $exp:warpsQ; ++w) { $stms:bodyQ } |]] 311 | 312 | bodyQ = [cstm| warpID = (threadIdx.x / 32) * $exp:warpsQ + w;|] : cim ++ 313 | --[cstm| warpID = w * $exp:warpsQ + (threadIdx.x / 32);|] : cim ++ 314 | [[cstm| warpID = threadIdx.x / 32;|]] 315 | 316 | codeR = case (n `mod` nWarps) of 317 | 0 -> [] 318 | n -> [[cstm| warpID = ($exp:numWarps * $exp:warpsQ)+ (threadIdx.x / 32);|], 319 | [cstm| if (threadIdx.x / 32 < $exp:warpsR) { $stms:cim } |], 320 | [cstm| warpID = threadIdx.x / 32; |], 321 | [cstm| __syncthreads();|]] 322 | 323 | --------------------------------------------------------------------------- 324 | -- ForAll is compiled differently for different platforms 325 | --------------------------------------------------------------------------- 326 | compileForAll :: Config -> Statement t -> [C.Stm] 327 | compileForAll c (SForAll Warp (IWord32 n) im) = codeQ ++ codeR 328 | where 329 | nt = 32 330 | 331 | q = n `div` nt 332 | r = n `mod` nt 333 | 334 | cim = compileIM c im 335 | 336 | codeQ = 337 | case q of 338 | 0 -> [] 339 | 1 -> cim 340 | n -> [[cstm| for ( int vw = 0; vw < $int:q; ++vw) { $stms:body } |], 341 | [cstm| $id:("warpIx") = threadIdx.x % 32; |]] 342 | -- [cstm| __syncthreads();|]] 343 | where 344 | body = [cstm|$id:("warpIx") = vw*$int:nt + (threadIdx.x % 32); |] : cim 345 | --body = [cstm|$id:("warpIx") = (threadIdx.x % 32) * q + vw; |] : cim 346 | 347 | q32 = q * 32 -- break out because: parseExp: cannot parse 'q*32' 348 | codeR = 349 | case r of 350 | 0 -> [] 351 | n -> [[cstm| if ((threadIdx.x % 32) < $int:r) { 352 | $id:("warpIx") = $int:(q32) + (threadIdx.x % 32); 353 | $stms:cim } |], 354 | -- [cstm| __syncthreads();|], 355 | [cstm| $id:("warpIx") = threadIdx.x % 32; |]] 356 | 357 | compileForAll c (SForAll Block (IWord32 n) im) = goQ ++ goR 358 | where 359 | cim = compileIM c im -- ++ [[cstm| __syncthreads();|]] 360 | 361 | nt = configThreadsPerBlock c 362 | 363 | q = n `quot` nt 364 | r = n `rem` nt 365 | 366 | -- q is the number full "passes" needed to cover the iteration 367 | -- space given we have nt threads. 368 | goQ = 369 | case q of 370 | 0 -> [] 371 | 1 -> cim -- [cstm|$id:loopVar = threadIdx.x; |]:cim 372 | --do 373 | -- stm <- updateTid [cexp| threadIdx.x |] 374 | -- return $ [cstm| $id:loopVar = threadIdx.x; |] : cim 375 | n -> [[cstm| for ( int i = 0; i < $int:q; ++i) { $stms:body } |], 376 | -- __syncthreads(); } |], 377 | [cstm| $id:("tid") = threadIdx.x; |]] 378 | -- [cstm| __syncthreads();|]] 379 | where 380 | body = [cstm|$id:("tid") = i*$int:nt + threadIdx.x; |] : cim 381 | 382 | -- r is the number of elements left. 383 | -- This generates code for when fewer threads are 384 | -- needed than available. (some threads shut down due to the conditional). 385 | 386 | qnt = q * nt -- break out because: parseExp: cannot parse 'q*nt' 387 | goR = 388 | case (r,q) of 389 | (0,_) -> [] 390 | --(n,0) -> [[cstm| if (threadIdx.x < $int:n) { 391 | -- $stms:cim } |]] 392 | (n,m) -> [[cstm| if (threadIdx.x < $int:n) { 393 | $id:("tid") = $int:(qnt) + threadIdx.x; 394 | $stms:cim } |], 395 | [cstm| $id:("tid") = threadIdx.x; |]] 396 | 397 | compileForAll c (SForAll Grid n im) = error "compileForAll: Grid" -- cim 398 | -- The grid case is special. May need more thought 399 | -- 400 | -- The problem with this case is that 401 | -- I need to come up with a blocksize (but without any guidance) 402 | -- from the programmer. 403 | -- Though! There is no way the programmer could provide any 404 | -- such info ... 405 | -- where 406 | -- cim = compileIM c im 407 | 408 | --compileForAll PlatformC c (SForAll lvl (IWord32 n) im) = go 409 | -- where 410 | -- body = compileIM PlatformC c im 411 | -- go = [ [cstm| for (int i = 0; i <$int:n; ++i) { $stms:body } |] ] 412 | 413 | 414 | --------------------------------------------------------------------------- 415 | -- CompileIM to list of Stm 416 | --------------------------------------------------------------------------- 417 | compileIM :: Config -> IMList a -> [C.Stm] 418 | compileIM conf im = concatMap ((compileStm conf) . fst) im 419 | 420 | --------------------------------------------------------------------------- 421 | -- Generate entire Kernel 422 | --------------------------------------------------------------------------- 423 | type Parameters = [(String,T.Type)] 424 | 425 | compile :: Config -> String -> (Parameters,IMList a) -> C.Definition 426 | compile config kname (params,im) 427 | = go 428 | where 429 | stms = compileIM config im 430 | 431 | ps = compileParams params 432 | go = [cedecl| extern "C" __global__ void $id:kname($params:ps) {$items:cudabody} |] 433 | 434 | cudabody = (if (configSharedMem config > 0) 435 | -- then [BlockDecl [cdecl| extern volatile __shared__ typename uint8_t sbase[]; |]] 436 | then [C.BlockDecl [cdecl| __shared__ typename uint8_t sbase[$uint:(configSharedMem config)] ; |]] 437 | else []) ++ 438 | --[BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]] ++ 439 | --[BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |], 440 | -- BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] ++ 441 | -- [BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]] ++ 442 | (if (usesGid im) 443 | then [C.BlockDecl [cdecl| typename uint32_t gid = blockIdx.x * blockDim.x + threadIdx.x; |]] 444 | else []) ++ 445 | (if (usesBid im) 446 | then [C.BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]] 447 | else []) ++ 448 | (if (usesTid im) 449 | then [C.BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]] 450 | else []) ++ 451 | (if (usesWarps im) 452 | then [C.BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |], 453 | C.BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] 454 | else []) ++ 455 | -- All variables used will be unique and can be declared 456 | -- at the top level 457 | concatMap declares im ++ 458 | -- Not sure if I am using language.C correctly. 459 | -- Maybe compileSTM should create BlockStms ? 460 | -- TODO: look how Nikola does it. 461 | map C.BlockStm stms 462 | 463 | cbody = -- add memory allocation 464 | map C.BlockStm stms 465 | 466 | -- Declare variables. 467 | declares :: (Statement t,t) -> [C.BlockItem] 468 | declares (SDeclare name t,_) = [C.BlockDecl [cdecl| $ty:(compileType t) $id:name;|]] 469 | declares (SCond _ im,_) = concatMap declares im 470 | declares (SSeqWhile _ im,_) = concatMap declares im 471 | declares (SForAll _ _ im,_) = concatMap declares im 472 | declares (SDistrPar _ _ im,_) = concatMap declares im 473 | declares (SSeqFor _ _ im,_) = concatMap declares im 474 | declares _ = [] 475 | 476 | 477 | --------------------------------------------------------------------------- 478 | -- Parameter lists for functions (kernel head) 479 | --------------------------------------------------------------------------- 480 | compileParams :: Parameters -> [C.Param] 481 | compileParams = map go 482 | where 483 | go (name,t) = [cparam| $ty:(compileType t) $id:name |] 484 | 485 | 486 | --------------------------------------------------------------------------- 487 | -- Compile with shared memory arrays declared at top 488 | --------------------------------------------------------------------------- 489 | -- CODE DUPLICATION FOR NOW 490 | 491 | compileDeclsTop :: Config -> [(String,((Word32,Word32),T.Type))] -> String -> (Parameters,IMList a) -> C.Definition 492 | compileDeclsTop config toplevelarrs kname (params,im) 493 | = go 494 | where 495 | stms = compileIM config im 496 | 497 | ps = compileParams params 498 | go = [cedecl| extern "C" __global__ void $id:kname($params:ps) {$items:cudabody} |] 499 | 500 | cudabody = (if (configSharedMem config > 0) 501 | -- then [BlockDecl [cdecl| extern volatile __shared__ typename uint8_t sbase[]; |]] 502 | then [C.BlockDecl [cdecl| __shared__ typename uint8_t sbase[$uint:(configSharedMem config)]; |]] 503 | else []) ++ 504 | --[BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]] ++ 505 | --[BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |], 506 | -- BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] ++ 507 | -- [BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]] ++ 508 | (if (usesGid im) 509 | then [C.BlockDecl [cdecl| typename uint32_t gid = blockIdx.x * blockDim.x + threadIdx.x; |]] 510 | else []) ++ 511 | (if (usesBid im) 512 | then [C.BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]] 513 | else []) ++ 514 | (if (usesTid im) 515 | then [C.BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]] 516 | else []) ++ 517 | (if (usesWarps im) 518 | then [C.BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |], 519 | C.BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] 520 | else []) ++ 521 | -- declare all arrays used 522 | concatMap declareArr toplevelarrs ++ 523 | -- All variables used will be unique and can be declared 524 | -- at the top level 525 | concatMap declares im ++ 526 | -- Not sure if I am using language.C correctly. 527 | -- Maybe compileSTM should create BlockStms ? 528 | -- TODO: look how Nikola does it. 529 | map C.BlockStm stms 530 | 531 | cbody = -- add memory allocation 532 | map C.BlockStm stms 533 | 534 | 535 | declareArr :: (String, ((Word32,Word32),T.Type)) -> [C.BlockItem] 536 | declareArr (arr,((_,addr),t)) = 537 | [C.BlockDecl [cdecl| $ty:(compileType t) $id:arr = ($ty:(compileType t))(sbase + $int:addr);|]] 538 | -------------------------------------------------------------------------------- /Obsidian/CodeGen/CompileIMOpenCLEmbedded.hs: -------------------------------------------------------------------------------- 1 | 2 | {-# LANGUAGE QuasiQuotes #-} 3 | {-# LANGUAGE PackageImports #-} 4 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 5 | 6 | {- 7 | 8 | Joel Svensson 2017 9 | 10 | * notes: 11 | 2017-04-23: Initial version. Duplicates a lot of the CUDA generation. 12 | 13 | * Todo: 14 | 15 | - Make sure generated code makes sense in Vivado_HLS 16 | 17 | - See what happens to the Workgroup interface generated 18 | by Vivado_HLS when using these: 19 | * get_num_groups() 20 | * get_local_size() 21 | * get_group_id() 22 | * get_local_id() 23 | * get_global_id() 24 | * get_global_size() 25 | 26 | get_num_groups and get_global_size are interesting as I have 27 | not seen these communicated to a Vivado_HLS opencl workgroup. 28 | It may be that interface is only generated when these functions 29 | are used. 30 | 31 | 32 | -} 33 | 34 | module Obsidian.CodeGen.CompileIMOpenCLEmbedded where 35 | 36 | import Language.C.Quote.OpenCL 37 | 38 | import qualified "language-c-quote" Language.C.Syntax as C 39 | 40 | import Obsidian.Exp (IExp(..),IBinOp(..),IUnOp(..)) 41 | import Obsidian.Types as T 42 | import Obsidian.DimSpec 43 | import Obsidian.CodeGen.Program 44 | 45 | import Data.Word 46 | 47 | {- Notes: 48 | 49 | * Generation of OpenCL targeted at embedded profile 50 | as it presents itself in VivadoHLS 51 | 52 | -} 53 | 54 | --------------------------------------------------------------------------- 55 | -- Config 56 | --------------------------------------------------------------------------- 57 | 58 | data Config = Config { configThreadsPerBlock :: Word32, 59 | configSharedMem :: Word32} 60 | 61 | 62 | --------------------------------------------------------------------------- 63 | -- Some strings for OpenCLifying the generator 64 | --------------------------------------------------------------------------- 65 | threadIdx_x = [cexp| get_local_id(0) |] 66 | blockIdx_x = [cexp| get_group_id(0) |] 67 | globalIdx_x = [cexp| get_global_id(0)|] 68 | blockDim_x = [cexp| get_local_size(0)|] 69 | 70 | 71 | 72 | --------------------------------------------------------------------------- 73 | -- compileExp (maybe a bad name) 74 | --------------------------------------------------------------------------- 75 | compileExp :: IExp -> C.Exp 76 | compileExp (IVar name t) = [cexp| $id:name |] 77 | 78 | 79 | -- TODO: Fix all this! 80 | -- compileExp (IBlockIdx X) = [cexp| $id:("bid")|] -- [cexp| $id:("blockIdx.x") |] 81 | -- compileExp (IBlockIdx Y) = [cexp| $id:("blockIdx.y") |] 82 | -- compileExp (IBlockIdx Z) = [cexp| $id:("blockIdx.z") |] 83 | 84 | -- compileExp (IThreadIdx X) = [cexp| $id:("threadIdx.x") |] 85 | -- compileExp (IThreadIdx Y) = [cexp| $id:("threadIdx.y") |] 86 | -- compileExp (IThreadIdx Z) = [cexp| $id:("threadIdx.z") |] 87 | 88 | -- compileExp (IBlockDim X) = [cexp| $id:("blockDim.x") |] 89 | -- compileExp (IBlockDim Y) = [cexp| $id:("blockDim.y") |] 90 | -- compileExp (IBlockDim Z) = [cexp| $id:("blockDim.z") |] 91 | 92 | -- compileExp (IGridDim X) = [cexp| $id:("GridDim.x") |] 93 | -- compileExp (IGridDim Y) = [cexp| $id:("GridDim.y") |] 94 | -- compileExp (IGridDim Z) = [cexp| $id:("GridDim.z") |] 95 | 96 | compileExp (IBool True) = [cexp|1|] 97 | compileExp (IBool False) = [cexp|0|] 98 | compileExp (IInt8 n) = [cexp| $int:(toInteger n) |] 99 | compileExp (IInt16 n) = [cexp| $int:(toInteger n) |] 100 | compileExp (IInt32 n) = [cexp| $int:(toInteger n) |] 101 | compileExp (IInt64 n) = [cexp| $lint:(toInteger n) |] 102 | 103 | compileExp (IWord8 n) = [cexp| $uint:(toInteger n) |] 104 | compileExp (IWord16 n) = [cexp| $uint:(toInteger n) |] 105 | compileExp (IWord32 n) = [cexp| $uint:(toInteger n) |] 106 | compileExp (IWord64 n) = [cexp| $ulint:(toInteger n) |] 107 | 108 | compileExp (IFloat n) = [cexp| $float:(n) |] 109 | compileExp (IDouble n) = [cexp| $double:(n) |] 110 | 111 | -- Implementing these may be a bit awkward 112 | -- given there are no vector literals in cuda. 113 | compileExp (IFloat2 n m) = error "IFloat2 unhandled" 114 | compileExp (IFloat3 n m l) = error "IFloat3 unhandled" 115 | compileExp (IFloat4 n m l k) = error "IFloat4 unhandled" 116 | compileExp (IDouble2 n m) = error "IDouble2 unhandled" 117 | compileExp (IInt8_2 n m) = error "FIXME" 118 | compileExp (IInt8_3 n m k) = error "FIXME" 119 | compileExp (IInt8_4 n m k l) = error "FIXME" 120 | compileExp (IInt16_2 n m ) = error "FIXME" 121 | compileExp (IInt16_3 n m k) = error "FIXME" 122 | compileExp (IInt16_4 n m k l) = error "FIXME" 123 | compileExp (IInt32_2 n m) = error "FIXME" 124 | compileExp (IInt32_3 n m k) = error "FIXME" 125 | compileExp (IInt32_4 n m k l) = error "FIXME" 126 | compileExp (IInt64_2 n m) = error "FIXME" 127 | compileExp (IInt64_3 n m k) = error "FIXME" 128 | compileExp (IInt64_4 n m k l) = error "FIXME" 129 | compileExp (IWord8_2 n m) = error "FIXME" 130 | compileExp (IWord8_3 n m k) = error "FIXME" 131 | compileExp (IWord8_4 n m k l) = error "FIXME" 132 | compileExp (IWord16_2 n m ) = error "FIXME" 133 | compileExp (IWord16_3 n m k) = error "FIXME" 134 | compileExp (IWord16_4 n m k l) = error "FIXME" 135 | compileExp (IWord32_2 n m) = error "FIXME" 136 | compileExp (IWord32_3 n m k) = error "FIXME" 137 | compileExp (IWord32_4 n m k l) = error "FIXME" 138 | compileExp (IWord64_2 n m) = error "FIXME" 139 | compileExp (IWord64_3 n m k) = error "FIXME" 140 | compileExp (IWord64_4 n m k l) = error "FIXME" 141 | 142 | 143 | compileExp (IIndex (i1,[e]) t) = [cexp| $(compileExp i1)[$(compileExp e)] |] 144 | compileExp a@(IIndex (_,_) _) = error $ "compileExp: Malformed index expression " ++ show a 145 | 146 | compileExp (ICond e1 e2 e3 t) = [cexp| $(compileExp e1) ? $(compileExp e2) : $(compileExp e3) |] 147 | 148 | compileExp (IBinOp op e1 e2 t) = go op 149 | where 150 | x = compileExp e1 151 | y = compileExp e2 152 | go IAdd = [cexp| $x + $y |] 153 | go ISub = [cexp| $x - $y |] 154 | go IMul = [cexp| $x * $y |] 155 | go IDiv = [cexp| $x / $y |] 156 | go IFDiv = [cexp| $x / $y |] 157 | go IMod = [cexp| $x % $y |] 158 | go IEq = [cexp| $x == $y |] 159 | go INotEq = [cexp| $x != $y |] 160 | go ILt = [cexp| $x < $y |] 161 | go IGt = [cexp| $x > $y |] 162 | go IGEq = [cexp| $x >= $y |] 163 | go ILEq = [cexp| $x <= $y |] 164 | go IAnd = [cexp| $x && $y |] 165 | go IOr = [cexp| $x || $y |] 166 | go IPow = case t of 167 | Float -> [cexp|powf($x,$y) |] 168 | Double -> [cexp|pow($x,$y) |] 169 | _ -> error $ "IPow applied at wrong type" 170 | go IBitwiseAnd = [cexp| $x & $y |] 171 | go IBitwiseOr = [cexp| $x | $y |] 172 | go IBitwiseXor = [cexp| $x ^ $y |] 173 | go IShiftL = [cexp| $x << $y |] 174 | go IShiftR = [cexp| $x >> $y |] 175 | compileExp (IUnOp op e t) = go op 176 | where 177 | x = compileExp e 178 | go IBitwiseNeg = [cexp| ~$x|] 179 | go INot = [cexp| !$x|] 180 | go IGetX = [cexp| $x.x|] 181 | go IGetY = [cexp| $x.y|] 182 | go IGetZ = [cexp| $x.z|] 183 | go IGetW = [cexp| $x.w|] 184 | 185 | compileExp (IFunCall name es t) = [cexp| $fc |] 186 | where 187 | es' = map compileExp es 188 | fc = [cexp| $id:(name)($args:(es')) |] 189 | 190 | compileExp (ICast e t) = [cexp| ($ty:(compileType t)) $e' |] 191 | where 192 | e' = compileExp e 193 | 194 | compileType :: T.Type -> C.Type 195 | compileType (Int8) = [cty| typename int8_t |] 196 | compileType (Int16) = [cty| typename int16_t |] 197 | compileType (Int32) = [cty| typename int32_t |] 198 | compileType (Int64) = [cty| typename int64_t |] 199 | compileType (Word8) = [cty| typename uint8_t |] 200 | compileType (Word16) = [cty| typename uint16_t |] 201 | compileType (Word32) = [cty| typename uint32_t |] 202 | compileType (Word64) = [cty| typename uint64_t |] 203 | compileType (Float) = [cty| float |] 204 | compileType (Double) = [cty| double |] 205 | 206 | compileType (Vec2 Float) = [cty| float4|] 207 | compileType (Vec3 Float) = [cty| float3|] 208 | compileType (Vec4 Float) = [cty| float2|] 209 | 210 | compileType (Vec2 Double) = [cty| double2|] 211 | 212 | -- How does this interplay with my use of uint8_t etc. Here it is char! 213 | compileType (Vec2 Int8) = [cty| char2|] 214 | compileType (Vec3 Int8) = [cty| char3|] 215 | compileType (Vec4 Int8) = [cty| char4|] 216 | 217 | compileType (Vec2 Int16) = [cty| short2|] 218 | compileType (Vec3 Int16) = [cty| short3|] 219 | compileType (Vec4 Int16) = [cty| short4|] 220 | 221 | compileType (Vec2 Int32) = [cty| int2|] 222 | compileType (Vec3 Int32) = [cty| int3|] 223 | compileType (Vec4 Int32) = [cty| int4|] 224 | 225 | compileType (Vec2 Word8) = [cty| uchar2|] 226 | compileType (Vec3 Word8) = [cty| uchar3|] 227 | compileType (Vec4 Word8) = [cty| uchar4|] 228 | 229 | compileType (Vec2 Word16) = [cty| ushort2|] 230 | compileType (Vec3 Word16) = [cty| ushort3|] 231 | compileType (Vec4 Word16) = [cty| ushort4|] 232 | 233 | compileType (Vec2 Word32) = [cty| uint2|] 234 | compileType (Vec3 Word32) = [cty| uint3|] 235 | compileType (Vec4 Word32) = [cty| uint4|] 236 | 237 | 238 | compileType (Shared t) = [cty| __local $ty:(compileType t) |] 239 | compileType (Pointer t) = [cty| $ty:(compileType t)* |] 240 | compileType (Volatile t) = [cty| volatile $ty:(compileType t)|] 241 | compileType t = error $ "compileType: Not implemented " ++ show t 242 | 243 | 244 | --------------------------------------------------------------------------- 245 | -- Statement t to Stm 246 | --------------------------------------------------------------------------- 247 | 248 | compileStm :: Config -> Statement t -> [C.Stm] 249 | compileStm c (SAssign name [] e) = 250 | [[cstm| $(compileExp name) = $(compileExp e);|]] 251 | compileStm c (SAssign name [ix] e) = 252 | [[cstm| $(compileExp name)[$(compileExp ix)] = $(compileExp e); |]] 253 | compileStm c (SAtomicOp name ix atop) = 254 | case atop of 255 | AtInc -> [[cstm| atomicInc(&$(compileExp name)[$(compileExp ix)],0xFFFFFFFF); |]] 256 | AtAdd e -> [[cstm| atomicAdd(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]] 257 | AtSub e -> [[cstm| atomicSub(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]] 258 | AtExch e -> [[cstm| atomicExch(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]] 259 | 260 | compileStm c (SCond be im) = [[cstm| if ($(compileExp be)) { $stms:body } |]] 261 | where 262 | body = compileIM c im -- (compileIM p c im) 263 | compileStm c (SSeqFor loopVar n im) = 264 | [[cstm| for (int $id:loopVar = 0; $id:loopVar < $(compileExp n); ++$id:loopVar) 265 | { $stms:body } |]] 266 | where 267 | body = compileIM c im -- (compileIM p c im) 268 | 269 | 270 | -- Just relay to specific compileFunction 271 | compileStm c a@(SForAll lvl n im) = compileForAll c a 272 | 273 | compileStm c a@(SDistrPar lvl n im) = compileDistr c a 274 | 275 | compileStm c (SSeqWhile b im) = 276 | [[cstm| while ($(compileExp b)) { $stms:body}|]] 277 | where 278 | body = compileIM c im 279 | 280 | -- compileStm c SSynchronize = [[cstm| __syncthreads(); |]] 281 | compileStm c SSynchronize = [[cstm| barrier(CLK_LOCAL_MEM_FENCE); |]] 282 | 283 | 284 | compileStm _ (SAllocate _ _ _) = [] 285 | compileStm _ (SDeclare name t) = [] 286 | 287 | compileStm _ a = error $ "compileStm: missing case " 288 | 289 | --------------------------------------------------------------------------- 290 | -- DistrPar 291 | --------------------------------------------------------------------------- 292 | compileDistr :: Config -> Statement t -> [C.Stm] 293 | compileDistr c (SDistrPar Block n im) = codeQ ++ codeR 294 | -- New here is BLOCK virtualisation 295 | where 296 | cim = compileIM c im -- ++ [[cstm| __syncthreads();|]] 297 | 298 | numBlocks = [cexp| get_num_groups(0) |] 299 | 300 | blocksQ = [cexp| $exp:(compileExp n) / $exp:numBlocks|] 301 | blocksR = [cexp| $exp:(compileExp n) % $exp:numBlocks|] 302 | 303 | codeQ = [[cstm| for (int b = 0; b < $exp:blocksQ; ++b) { $stms:bodyQ }|]] 304 | 305 | bodyQ = [cstm| $id:("bid") = $exp:blockIdx_x * $exp:blocksQ + b;|] : cim ++ 306 | [[cstm| bid = $exp:blockIdx_x;|], 307 | [cstm| barrier(CLK_LOCAL_MEM_FENCE);|]] -- yes no ? 308 | 309 | codeR = [[cstm| bid = ($exp:numBlocks * $exp:blocksQ) + $exp:blockIdx_x;|], 310 | [cstm| if ($exp:blockIdx_x < $exp:blocksR) { $stms:cim }|], 311 | [cstm| bid = $exp:blockIdx_x;|], 312 | [cstm| barrier(CLK_LOCAL_MEM_FENCE);|]] -- yes no ? 313 | 314 | -- Can I be absolutely sure that 'n' here is statically known ? 315 | -- I must look over the functions that can potentially create this IM. 316 | -- Can make a separate case for unknown 'n' but generate worse code. 317 | -- (That is true for all levels) 318 | compileDistr c (SDistrPar Warp (IWord32 n) im) = 319 | error "Compiling distribution over OpenCL wavefronts is not supported yet" 320 | -- codeQ ++ codeR 321 | 322 | -- Here the 'im' should be distributed over 'n'warps. 323 | -- 'im' uses a warpID variable to identify what warp it is. 324 | -- 'n' may be higher than the actual number of warps we have! 325 | -- So GPU warp virtualisation is needed. 326 | -- where 327 | -- cim = compileIM c im 328 | 329 | -- nWarps = fromIntegral $ configThreadsPerBlock c `div` 32 330 | -- numWarps = [cexp| $int:nWarps|] 331 | 332 | -- warpsQ = [cexp| $int:(n `div` nWarps)|] 333 | -- warpsR = [cexp| $int:(n `mod` nWarps)|] 334 | 335 | -- codeQ = [[cstm| for (int w = 0; w < $exp:warpsQ; ++w) { $stms:bodyQ } |]] 336 | 337 | -- bodyQ = [cstm| warpID = (threadIdx.x / 32) * $exp:warpsQ + w;|] : cim ++ 338 | -- --[cstm| warpID = w * $exp:warpsQ + (threadIdx.x / 32);|] : cim ++ 339 | -- [[cstm| warpID = threadIdx.x / 32;|]] 340 | 341 | -- codeR = case (n `mod` nWarps) of 342 | -- 0 -> [] 343 | -- n -> [[cstm| warpID = ($exp:numWarps * $exp:warpsQ)+ (threadIdx.x / 32);|], 344 | -- [cstm| if (threadIdx.x / 32 < $exp:warpsR) { $stms:cim } |], 345 | -- [cstm| warpID = threadIdx.x / 32; |], 346 | -- [cstm| __syncthreads();|]] 347 | 348 | --------------------------------------------------------------------------- 349 | -- ForAll 350 | --------------------------------------------------------------------------- 351 | compileForAll :: Config -> Statement t -> [C.Stm] 352 | compileForAll c (SForAll Warp (IWord32 n) im) = 353 | error "Compiling distribution over OpenCL wavefronts is not supported yet" 354 | -- codeQ ++ codeR 355 | -- where 356 | -- nt = 32 357 | 358 | -- q = n `div` nt 359 | -- r = n `mod` nt 360 | 361 | -- cim = compileIM c im 362 | 363 | -- codeQ = 364 | -- case q of 365 | -- 0 -> [] 366 | -- 1 -> cim 367 | -- n -> [[cstm| for ( int vw = 0; vw < $int:q; ++vw) { $stms:body } |], 368 | -- [cstm| $id:("warpIx") = threadIdx.x % 32; |]] 369 | -- --[cstm| __syncthreads();|]] 370 | -- where 371 | -- body = [cstm|$id:("warpIx") = vw*$int:nt + (threadIdx.x % 32); |] : cim 372 | -- --body = [cstm|$id:("warpIx") = (threadIdx.x % 32) * q + vw; |] : cim 373 | 374 | -- codeR = 375 | -- case r of 376 | -- 0 -> [] 377 | -- n -> [[cstm| if ((threadIdx.x % 32) < $int:r) { 378 | -- $id:("warpIx") = $int:(q*32) + (threadIdx.x % 32); 379 | -- $stms:cim } |], 380 | -- -- [cstm| __syncthreads();|], 381 | -- [cstm| $id:("warpIx") = threadIdx.x % 32; |]] 382 | 383 | compileForAll c (SForAll Block (IWord32 n) im) = goQ ++ goR 384 | where 385 | cim = compileIM c im -- ++ [[cstm| __syncthreads();|]] 386 | 387 | nt = configThreadsPerBlock c 388 | 389 | q = n `quot` nt 390 | r = n `rem` nt 391 | 392 | -- q is the number full "passes" needed to cover the iteration 393 | -- space given we have nt threads. 394 | goQ = 395 | case q of 396 | 0 -> [] 397 | 1 -> cim -- [cstm|$id:loopVar = threadIdx.x; |]:cim 398 | --do 399 | -- stm <- updateTid [cexp| threadIdx.x |] 400 | -- return $ [cstm| $id:loopVar = threadIdx.x; |] : cim 401 | n -> [[cstm| for ( int i = 0; i < $int:q; ++i) { $stms:body } |], 402 | -- __syncthreads(); } |], 403 | [cstm| $id:("tid") = $exp:threadIdx_x; |]] 404 | -- [cstm| __syncthreads();|]] 405 | where 406 | body = [cstm|$id:("tid") = i*$int:nt + $exp:threadIdx_x; |] : cim 407 | 408 | -- r is the number of elements left. 409 | -- This generates code for when fewer threads are 410 | -- needed than available. (some threads shut down due to the conditional). 411 | goR = 412 | case (r,q) of 413 | (0,_) -> [] 414 | --(n,0) -> [[cstm| if (threadIdx.x < $int:n) { 415 | -- $stms:cim } |]] 416 | (n,m) -> [[cstm| if ($exp:threadIdx_x < $int:n) { 417 | $id:("tid") = $int:(q*nt) + $exp:threadIdx_x; 418 | $stms:cim } |], 419 | [cstm| $id:("tid") = $exp:threadIdx_x; |]] 420 | 421 | compileForAll c (SForAll Grid n im) = error "compileForAll: Grid" -- cim 422 | -- The grid case is special. May need more thought 423 | -- 424 | -- The problem with this case is that 425 | -- I need to come up with a blocksize (but without any guidance) 426 | -- from the programmer. 427 | -- Though! There is no way the programmer could provide any 428 | -- such info ... 429 | where 430 | cim = compileIM c im 431 | 432 | -- compileForAll PlatformC c (SForAll lvl (IWord32 n) im) = go 433 | -- where 434 | -- body = compileIM PlatformC c im 435 | -- go = [ [cstm| for (int i = 0; i <$int:n; ++i) { $stms:body } |] ] 436 | 437 | 438 | --------------------------------------------------------------------------- 439 | -- CompileIM to list of Stm 440 | --------------------------------------------------------------------------- 441 | compileIM :: Config -> IMList a -> [C.Stm] 442 | compileIM conf im = concatMap ((compileStm conf) . fst) im 443 | 444 | --------------------------------------------------------------------------- 445 | -- Generate entire Kernel 446 | --------------------------------------------------------------------------- 447 | type Parameters = [(String,T.Type)] 448 | 449 | compile :: Config -> String -> (Parameters,IMList a) -> C.Definition 450 | compile config kname (params,im) 451 | = go 452 | where 453 | stms = compileIM config im 454 | 455 | ps = compileParams params 456 | go = [cedecl| extern "C" __kernel void $id:kname($params:ps) {$items:cudabody} |] 457 | 458 | cudabody = (if (configSharedMem config > 0) 459 | then [C.BlockDecl 460 | [cdecl| __local typename uint8_t sbase[$uint:(configSharedMem config)] ; |]] 461 | else []) ++ 462 | (if (usesGid im) 463 | then [C.BlockDecl [cdecl| typename uint32_t gid = $exp:blockIdx_x * $exp:blockDim_x + $exp:threadIdx_x; |]] 464 | else []) ++ 465 | (if (usesBid im) 466 | then [C.BlockDecl [cdecl| typename uint32_t bid = $exp:blockIdx_x; |]] 467 | else []) ++ 468 | (if (usesTid im) 469 | then [C.BlockDecl [cdecl| typename uint32_t tid = $exp:threadIdx_x; |]] 470 | else []) ++ 471 | (if (usesWarps im) 472 | then error "Warps not supported: OpenCL" 473 | -- [C.BlockDecl 474 | -- [cdecl| typename uint32_t warpID = threadIdx.x / 32; |], 475 | -- C.BlockDecl 476 | -- [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] 477 | else []) ++ 478 | -- All variables used will be unique and can be declared 479 | -- at the top level 480 | concatMap declares im ++ 481 | -- Not sure if I am using language.C correctly. 482 | -- Maybe compileSTM should create BlockStms ? 483 | -- TODO: look how Nikola does it. 484 | map C.BlockStm stms 485 | 486 | cbody = -- add memory allocation 487 | map C.BlockStm stms 488 | 489 | -- Declare variables. 490 | declares :: (Statement t,t) -> [C.BlockItem] 491 | declares (SDeclare name t,_) = [C.BlockDecl [cdecl| $ty:(compileType t) $id:name;|]] 492 | declares (SCond _ im,_) = concatMap declares im 493 | declares (SSeqWhile _ im,_) = concatMap declares im 494 | declares (SForAll _ _ im,_) = concatMap declares im 495 | declares (SDistrPar _ _ im,_) = concatMap declares im 496 | declares (SSeqFor _ _ im,_) = concatMap declares im 497 | declares _ = [] 498 | 499 | 500 | --------------------------------------------------------------------------- 501 | -- Parameter lists for functions (kernel head) 502 | --------------------------------------------------------------------------- 503 | compileParams :: Parameters -> [C.Param] 504 | -- compileParams PlatformOpenCL = map go 505 | -- where 506 | 507 | -- C or CUDA 508 | compileParams = map go 509 | where 510 | go (name,Pointer t) = [cparam| global $ty:(compileType (Pointer t)) $id:name |] 511 | go (name, t) = [cparam| $ty:(compileType t) $id:name |] 512 | 513 | 514 | 515 | 516 | --------------------------------------------------------------------------- 517 | -- Compile with shared memory arrays declared at top 518 | --------------------------------------------------------------------------- 519 | -- CODE DUPLICATION FOR NOW 520 | 521 | compileDeclsTop :: Config -> [(String,((Word32,Word32),T.Type))] -> String -> (Parameters,IMList a) -> C.Definition 522 | compileDeclsTop config toplevelarrs kname (params,im) 523 | = go 524 | where 525 | stms = compileIM config im 526 | 527 | ps = compileParams params 528 | go = [cedecl| extern "C" __kernel void $id:kname($params:ps) {$items:cudabody} |] 529 | cudabody = (if (configSharedMem config > 0) 530 | -- then [BlockDecl [cdecl| extern volatile __shared__ typename uint8_t sbase[]; |]] 531 | then [C.BlockDecl [cdecl| __local typename uint8_t sbase[$uint:(configSharedMem config)]; |]] 532 | else []) ++ 533 | --[BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]] ++ 534 | --[BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |], 535 | -- BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] ++ 536 | -- [BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]] ++ 537 | (if (usesGid im) 538 | then [C.BlockDecl [cdecl| typename uint32_t gid = $exp:blockIdx_x * $exp:blockDim_x + $exp:threadIdx_x; |]] 539 | else []) ++ 540 | (if (usesBid im) 541 | then [C.BlockDecl 542 | [cdecl| typename uint32_t bid = $exp:blockIdx_x; |]] 543 | else []) ++ 544 | (if (usesTid im) 545 | then [C.BlockDecl 546 | [cdecl| typename uint32_t tid = $exp:threadIdx_x; |]] 547 | else []) ++ 548 | (if (usesWarps im) 549 | then error "\"Warps\" no implemented in the OpenCL code-generator" 550 | --[C.BlockDecl 551 | -- [cdecl| typename uint32_t warpID = threadIdx.x / 32; |], 552 | -- C.BlockDecl 553 | -- [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] 554 | else []) ++ 555 | -- declare all arrays used 556 | concatMap declareArr toplevelarrs ++ 557 | -- All variables used will be unique and can be declared 558 | -- at the top level 559 | concatMap declares im ++ 560 | -- Not sure if I am using language.C correctly. 561 | -- Maybe compileSTM should create BlockStms ? 562 | -- TODO: look how Nikola does it. 563 | map C.BlockStm stms 564 | 565 | cbody = -- add memory allocation 566 | map C.BlockStm stms 567 | 568 | 569 | declareArr :: (String, ((Word32,Word32),T.Type)) -> [C.BlockItem] 570 | declareArr (arr,((_,addr),t)) = 571 | [C.BlockDecl [cdecl| $ty:(compileType (Shared t)) $id:arr = ( $ty:(compileType (Shared t)))(sbase + $int:addr);|]] 572 | -------------------------------------------------------------------------------- /Obsidian/Library.hs: -------------------------------------------------------------------------------- 1 | {- Joel Svensson 2012..2017 2 | Mary Sheeran 2012 3 | 4 | Notes: 5 | 2017-04-22: Cleanup 6 | 2014-03-31: Merged Library and LibraryG 7 | 2013-01-24: GlobPull nolonger exists 8 | GlobPush is Push Grid 9 | 10 | 2013-01-08: Renamed GlobArray to GlobPush 11 | 2013-01-02: Added toGlobArray and toGlobArrayN 12 | 2012-12-10: Refactoring 13 | (adherence to new Array types and program types) 14 | -} 15 | 16 | {-# LANGUAGE FlexibleInstances, 17 | TypeSynonymInstances, 18 | ScopedTypeVariables, 19 | TypeFamilies, 20 | GADTs #-} 21 | {-# LANGUAGE TypeOperators #-} 22 | {-# LANGUAGE FlexibleContexts #-} 23 | {-# LANGUAGE MultiParamTypeClasses #-} 24 | {-# LANGUAGE UndecidableInstances #-} 25 | 26 | module Obsidian.Library 27 | (logBaseI 28 | , reverse 29 | , splitAt 30 | , splitUp 31 | , coalesce 32 | , halve 33 | , evenOdds 34 | , evens 35 | , odds 36 | , everyNth 37 | , singleton 38 | , generate 39 | , last 40 | , first 41 | , take 42 | , drop 43 | , head 44 | , tail 45 | , fold1 46 | , shiftLeft 47 | , unzip 48 | , zip 49 | , unzip3 50 | , zip3 51 | , zipWith 52 | , zipWith3 53 | , pair 54 | , unpair 55 | , unsafeBinSplit 56 | , binSplit 57 | , concP 58 | , unpairP 59 | , load -- RENAME THIS 60 | , store -- RENAME THIS 61 | -- Hierarchy programming 62 | , asThread 63 | , asThreadMap 64 | , asGrid 65 | , asGridMap 66 | , AsWarp(..) 67 | , AsBlock(..) 68 | , liftPar -- generic hierarchy programming 69 | , liftSeq -- generic hierarchy programming 70 | , liftIn -- generic hierarchy programming 71 | -- Repeat a program 72 | , rep 73 | 74 | -- Executing programs 75 | , ExecProgram(..) 76 | , execThread 77 | , execThread' 78 | , execWarp 79 | , execWarp' 80 | , execBlock 81 | , execBlock' 82 | 83 | -- Leftovers from past days 84 | , singletonPush 85 | , runPush 86 | )where 87 | 88 | import Obsidian.Array 89 | import Obsidian.Exp 90 | import Obsidian.Program 91 | import Obsidian.Data 92 | import Obsidian.Mutable 93 | 94 | import Control.Monad 95 | 96 | import Data.Bits 97 | import Data.Word 98 | 99 | import Prelude hiding (splitAt,zipWith,replicate,reverse,unzip,zip,zip3,unzip3,zipWith3, last, take, drop, head, tail) 100 | 101 | 102 | --------------------------------------------------------------------------- 103 | -- Helper 104 | --------------------------------------------------------------------------- 105 | logBaseI :: Integral a => a -> a -> a 106 | logBaseI b x 107 | = if x < b 108 | then 0 109 | else 110 | let 111 | l = 2 * logBaseI (b*b) x 112 | doDiv x l = if x < b then l else doDiv (x`div`b) (l+1) 113 | in 114 | doDiv (x`div`(b^l)) l 115 | 116 | 117 | --------------------------------------------------------------------------- 118 | -- Reverse an array by indexing in it backwards 119 | --------------------------------------------------------------------------- 120 | 121 | -- | Reverses a Pull array. 122 | reverse :: (Array array, ArrayLength array, ASize l) => array l a -> array l a 123 | reverse arr = ixMap (\ix -> (sizeConv m) - ix) arr 124 | where m = n-1 125 | n = len arr 126 | --------------------------------------------------------------------------- 127 | -- splitAt (name clashes with Prelude.splitAt) 128 | --------------------------------------------------------------------------- 129 | 130 | -- | Splits a Pull array at a given point. Performs no bounds checks. 131 | splitAt :: (Integral i, ASize l) => i -> Pull l a -> (Pull l a, Pull l a) 132 | splitAt n arr = (mkPull m (\ix -> arr ! ix), 133 | mkPull (len arr - m) (\ix -> arr ! (ix + pos))) 134 | where pos = fromIntegral n 135 | m = fromIntegral n 136 | 137 | -- | Splits a Pull array in the middle. 138 | halve :: ASize l => Pull l a -> (Pull l a, Pull l a) 139 | halve arr = splitAt n2 arr 140 | where 141 | n = len arr 142 | n2 = n `div` 2 143 | 144 | -- | Splits a Pull array into chunks of size n. Result is a Pull of Pull arrays. 145 | splitUp :: (ASize l, ASize s, Integral s) => s -> Pull l a -> Pull l (Pull s a) 146 | splitUp n arr {-(Pull m ixf)-} = 147 | mkPull (len arr `div` fromIntegral n) $ \i -> 148 | mkPull n $ \j -> arr ! (i * (sizeConv n) + j) 149 | 150 | -- | Same as @splitUp@ but also performs a permutation of the elements. 151 | coalesce :: ASize l 152 | => Word32 -> Pull l a -> Pull l (Pull Word32 a) 153 | coalesce n arr = 154 | mkPull s $ \i -> 155 | mkPull n $ \j -> arr ! (i + (sizeConv s) * j) 156 | where s = len arr `div` fromIntegral n 157 | 158 | 159 | --------------------------------------------------------------------------- 160 | -- elements at even indices to fst output, odd to snd. 161 | --------------------------------------------------------------------------- 162 | -- | Split a Pull array into its even and odd indexed elements. 163 | evenOdds :: ASize l => Pull l a -> (Pull l a, Pull l a) 164 | evenOdds arr = (mkPull (n-n2) (\ix -> arr ! (2*ix)) , 165 | mkPull n2 (\ix -> arr ! (2*ix + 1))) 166 | where 167 | n = len arr 168 | n2 = div n 2 169 | -- | Extract the elements at even indices from a Pull array 170 | evens :: ASize l => Pull l a -> Pull l a 171 | evens = fst . evenOdds 172 | 173 | -- | Extract the elements at odd indices from a Pull array 174 | odds :: ASize l => Pull l a -> Pull l a 175 | odds = snd . evenOdds 176 | 177 | --------------------------------------------------------------------------- 178 | -- everyNth 179 | --------------------------------------------------------------------------- 180 | -- | Extract every nth element from a Pull array. 181 | everyNth :: ASize l => Word32 -> Word32 -> Pull l a -> Pull l a 182 | everyNth n m arr = mkPull n' $ \ix -> arr ! (ix * (fromIntegral n) + fromIntegral m) 183 | where 184 | n' = len arr `div` (fromIntegral n) 185 | 186 | 187 | --------------------------------------------------------------------------- 188 | -- replicate 189 | --------------------------------------------------------------------------- 190 | -- | Generates a Pull array of length one, containing @a@. 191 | singleton :: (Array a, ASize l) => e -> a l e 192 | singleton a = replicate 1 a 193 | 194 | -- | Generate a pull or push array using a function from Index to element. 195 | generate :: (Functor (a s), Array a, ASize s) 196 | => s -> (EWord32 -> b) -> a s b 197 | generate n f = fmap f (iota n) 198 | --------------------------------------------------------------------------- 199 | -- last and first 200 | --------------------------------------------------------------------------- 201 | -- | Extract last element from a Pull array. 202 | last :: ASize l => Pull l a -> a 203 | last arr = arr ! fromIntegral ( len arr - 1) 204 | 205 | -- | Extract the first element from a Pull array. 206 | first :: ASize l => Pull l a -> a 207 | first arr = arr ! 0 208 | 209 | --------------------------------------------------------------------------- 210 | -- Take and Drop (what about strange sizes ? fix) 211 | --------------------------------------------------------------------------- 212 | -- | Take the first @n@ elements from a Pull array 213 | take :: ASize l => l -> Pull l a -> Pull l a 214 | take n arr = setSize n arr 215 | 216 | -- | Drop the first @n@ elements from a Pull array 217 | drop :: ASize l => l -> Pull l a -> Pull l a 218 | drop n arr = setSize (len arr - n) $ ixMap (\ix -> ix + sizeConv n) arr 219 | 220 | --------------------------------------------------------------------------- 221 | -- Head and Tail on pull arrays 222 | --------------------------------------------------------------------------- 223 | 224 | head :: ASize l => Pull l a -> a 225 | head arr = arr ! 0 226 | 227 | tail :: ASize l => Pull l a -> Pull l a 228 | tail = drop 1 229 | 230 | 231 | --------------------------------------------------------------------------- 232 | -- fold (sequential , unrolled) 233 | --------------------------------------------------------------------------- 234 | -- | Fold a nonempty pull array using a given operator. The result a singleton array (push or pull). 235 | fold1 :: Array a => (e -> e -> e) -> Pull Word32 e -> a Word32 e 236 | fold1 f arr = replicate 1 237 | $ foldl1 f [arr ! (fromIntegral i) | i <- [0..(n-1)]] 238 | where n = len arr 239 | 240 | 241 | --------------------------------------------------------------------------- 242 | -- Shift arrays 243 | --------------------------------------------------------------------------- 244 | shiftLeft :: ASize l => Word32 -> Pull l a -> Pull l a 245 | shiftLeft dist arr = setSize (len arr - (fromIntegral dist)) 246 | $ ixMap (\ix -> ix + (fromIntegral dist)) arr 247 | 248 | --------------------------------------------------------------------------- 249 | -- zipp unzipp 250 | --------------------------------------------------------------------------- 251 | -- | Unzip implemented on Pull arrays 252 | unzip :: ASize l => Pull l (a,b) -> (Pull l a, Pull l b) 253 | unzip arr = (mkPull (len arr) (\ix -> fst (arr ! ix)) , 254 | mkPull (len arr) (\ix -> snd (arr ! ix)) ) 255 | 256 | -- | Zip implemented on Pull arrays 257 | zip :: ASize l => Pull l a -> Pull l b -> Pull l (a, b) 258 | zip arr1 arr2 = mkPull (min (len arr1) (len arr2)) 259 | $ \ix -> (arr1 ! ix, arr2 ! ix) 260 | 261 | -- | Unzip tripples. 262 | unzip3 :: ASize l => Pull l (a,b,c) 263 | -> (Pull l a, Pull l b, Pull l c) 264 | unzip3 arr = (fmap (\(x,_,_) -> x) arr, 265 | fmap (\(_,y,_) -> y) arr, 266 | fmap (\(_,_,z) -> z) arr) 267 | 268 | -- | Zip three arrays 269 | zip3 :: ASize l 270 | => Pull l a 271 | -> Pull l b 272 | -> Pull l c 273 | -> Pull l (a,b,c) 274 | zip3 arr1 arr2 arr3 = 275 | mkPull (minimum [len arr1, len arr2, len arr3]) 276 | (\ix -> (arr1 ! ix, arr2 ! ix, arr3 ! ix)) 277 | 278 | 279 | -- | Perform elementwise operation. 280 | zipWith :: ASize l => (a -> b -> c) -> Pull l a -> Pull l b -> Pull l c 281 | zipWith op a1 a2 = 282 | mkPull (min (len a1) (len a2)) 283 | (\ix -> (a1 ! ix) `op` (a2 ! ix)) 284 | 285 | -- | Perform elementwise operation. 286 | zipWith3 :: ASize l => (a -> b -> c-> d) -> Pull l a -> Pull l b -> Pull l c -> Pull l d 287 | zipWith3 f a1 a2 a3 = 288 | mkPull (minimum [len a1,len a2,len a3]) $ 289 | \ix -> f (a1 ! ix) (a2 ! ix) (a3 ! ix) 290 | 291 | 292 | --------------------------------------------------------------------------- 293 | -- pair 294 | --------------------------------------------------------------------------- 295 | -- | Pair up consecutive elements in a Pull array. 296 | pair :: ASize l => Pull l a -> Pull l (a,a) 297 | pair arr = 298 | mkPull n' (\ix -> (arr ! (ix*2),arr ! (ix*2+1))) 299 | where 300 | n' = len arr `div` 2 301 | 302 | 303 | -- | Flatten a Pull array of pairs. 304 | unpair :: ASize l => Choice a => Pull l (a,a) -> Pull l a 305 | unpair arr = 306 | let n = len arr 307 | in mkPull (2*n) (\ix -> ifThenElse ((mod ix 2) ==* 0) 308 | (fst (arr ! (ix `shiftR` 1))) 309 | (snd (arr ! (ix `shiftR` 1)))) 310 | 311 | -- | Triple up consecutive elements in a Pull array. 312 | triple :: ASize l => Pull l a -> Pull l (a,a,a) 313 | triple arr = 314 | mkPull (len arr `div` 3) $ \ix -> 315 | (arr ! (ix*3), arr ! (ix*3+1), arr ! (ix*3+2)) 316 | 317 | -- | Flatten a Pull array of triples. 318 | untriple :: ASize l => Choice a => Pull l (a,a,a) -> Pull l a 319 | untriple arr = 320 | mkPull (3*len arr) $ \ix -> 321 | let (k,j) = divMod ix 3 322 | (a0,a1,a2) = arr ! k 323 | in ifThenElse (j ==* 0) a0 $ 324 | ifThenElse (j ==* 1) a1 a2 325 | 326 | 327 | -- | Quadruple up consecutive elements in a Pull array. 328 | quadruple :: ASize l => Pull l a -> Pull l (a,a,a,a) 329 | quadruple arr = 330 | mkPull (len arr `div` 4) $ \ix -> 331 | (arr ! (ix*4), arr ! (ix*4+1), arr ! (ix*4+2), arr ! (ix*4+3)) 332 | 333 | -- | Flatten a Pull array of triples. 334 | unquadruple :: ASize l => Choice a => Pull l (a,a,a,a) -> Pull l a 335 | unquadruple = unpair . unpair . fmap (\(a0,a1,a2,a3) -> ((a0,a1), (a2,a3))) 336 | 337 | 338 | 339 | --------------------------------------------------------------------------- 340 | -- twoK (untested for proper functionality) 341 | --------------------------------------------------------------------------- 342 | 343 | -- | Recursively split an array in the middle. Apply an array to array computation 344 | -- on each part. @binSplit 3@ divides the array into 8 pieces. 345 | -- UNSAFE 346 | unsafeBinSplit :: Int 347 | -> (Pull Word32 a -> Pull Word32 b) 348 | -> Pull Word32 a 349 | -> Pull Word32 b 350 | unsafeBinSplit = twoK 351 | 352 | binSplit :: Data a 353 | => Int 354 | -> (Pull Word32 a -> Pull Word32 b) 355 | -> Mutable Shared Word32 a 356 | -> Pull Word32 b 357 | binSplit n f = unsafeBinSplit n f . mutableToPull 358 | 359 | -- See if this should be specifically for Static size pull arrays 360 | twoK :: Int -> (Pull Word32 a -> Pull Word32 b) -> Pull Word32 a -> Pull Word32 b 361 | twoK 0 f = f -- divide 0 times and apply f 362 | twoK n f = \arr -> 363 | let arr' = mkPull lt (\i -> (f (mkPull m (\j -> (arr ! (g i j)))) ! (h i))) 364 | m = (len arr `shiftR` n) --pow of two 365 | g i j = i .&. (fromIntegral (complement (m-1))) .|. j 366 | h i = i .&. (fromIntegral (nl2-1)) -- optimize 367 | 368 | nl2 = len (f (mkPull m (\j -> arr ! variable "X"))) 369 | lt = nl2 `shiftL` n 370 | in arr' 371 | 372 | 373 | 374 | --------------------------------------------------------------------------- 375 | -- *** PUSHY LIBRARY *** --- 376 | --------------------------------------------------------------------------- 377 | 378 | --------------------------------------------------------------------------- 379 | -- Concatenate on Push arrays 380 | --------------------------------------------------------------------------- 381 | 382 | -- | Concatenate two push arrays. 383 | concP :: ASize l 384 | => Push t l a -> Push t l a -> Push t l a 385 | concP p1 p2 = 386 | mkPush (n1 + n2) $ \wf -> 387 | do 388 | p1 <: wf 389 | p2 <: \a i -> wf a (sizeConv n1 + i) 390 | where 391 | n1 = len p1 392 | n2 = len p2 393 | 394 | -- | Flatten a Pull array of pairs. Result is a push array 395 | unpairP :: ASize l => Choice a => Push t l (a,a) -> Push t l a 396 | unpairP arr = 397 | mkPush (2 * len arr) $ \ wf -> 398 | do 399 | -- even iterations 400 | arr <: \ (a,_) i -> wf a ((sizeConv i) `shiftL` 1) 401 | -- Odd iterations 402 | arr <: \ (_,b) i -> wf b ((sizeConv i) `shiftL` 1 + 1) 403 | 404 | 405 | --------------------------------------------------------------------------- 406 | -- load / Store 407 | --------------------------------------------------------------------------- 408 | load :: Word32 -> Pull Word32 a -> Push Block Word32 a 409 | load n arr = 410 | mkPush m (\wf -> 411 | forAll (fromIntegral n') (\tid -> 412 | do 413 | seqFor (fromIntegral n) (\ix -> 414 | wf (arr ! (tid + (ix*fromIntegral n'))) (tid + (ix*fromIntegral n'))))) 415 | 416 | where 417 | m = len arr 418 | n' = m `div` n 419 | 420 | store :: Word32 -> SPull a -> SPush Block a 421 | store = load 422 | 423 | 424 | -- ######################################################################## 425 | -- 426 | -- Programming the Hierarchy 427 | -- 428 | -- ######################################################################## 429 | 430 | -- "Compute as" family of functions 431 | 432 | liftPar :: ASize l => Pull l (SPush t a) -> Push (Step t) l a 433 | liftPar = pConcat 434 | 435 | liftSeq :: ASize l => Pull l (SPush t a) -> Push t l a 436 | liftSeq = sConcat 437 | 438 | liftIn :: (t *<=* Block, ASize l) 439 | => Pull l (SPush Thread b) 440 | -> Push t l b 441 | liftIn = tConcat 442 | 443 | --------------------------------------------------------------------------- 444 | -- AsBlock 445 | --------------------------------------------------------------------------- 446 | -- Rename this to asBlock, asGrid, asThread 447 | class (t *<=* Block) => AsBlock t where 448 | asBlock :: SPull (SPush t a) -> 449 | SPush Block a 450 | asBlockMap :: (a -> SPush t b) 451 | -> SPull a 452 | -> SPush Block b 453 | 454 | instance AsBlock Thread where 455 | asBlock = tConcat 456 | asBlockMap f = tConcat . fmap f 457 | 458 | instance AsBlock Warp where 459 | asBlock = pConcat 460 | asBlockMap f = pConcat . fmap f 461 | 462 | instance AsBlock Block where 463 | asBlock = sConcat 464 | asBlockMap f = sConcat . fmap f 465 | 466 | --------------------------------------------------------------------------- 467 | -- AsWarp 468 | --------------------------------------------------------------------------- 469 | class (t *<=* Warp) => AsWarp t where 470 | asWarp :: SPull (SPush t a) -> 471 | SPush Warp a 472 | asWarpMap :: (a -> SPush t b) 473 | -> SPull a 474 | -> SPush Warp b 475 | 476 | 477 | instance AsWarp Thread where 478 | asWarp = tConcat 479 | asWarpMap f = tConcat . fmap f 480 | 481 | instance AsWarp Warp where 482 | asWarp = sConcat 483 | asWarpMap f = sConcat . fmap f 484 | 485 | --------------------------------------------------------------------------- 486 | -- LiftThread 487 | --------------------------------------------------------------------------- 488 | 489 | asThread :: ASize l 490 | => Pull l (SPush Thread b) 491 | -> Push Thread l b 492 | asThread = tConcat 493 | 494 | asThreadMap :: (a -> SPush Thread b) 495 | -> SPull a 496 | -> SPush Thread b 497 | asThreadMap f = tConcat . fmap f 498 | 499 | 500 | --------------------------------------------------------------------------- 501 | -- AsGrid 502 | --------------------------------------------------------------------------- 503 | 504 | asGrid :: ASize l => Pull l (SPush Block a) 505 | -> Push Grid l a 506 | asGrid = pConcat 507 | 508 | asGridMap :: ASize l => (a -> SPush Block b) 509 | -> Pull l a 510 | -> Push Grid l b 511 | asGridMap f = pConcat . fmap f 512 | 513 | 514 | --------------------------------------------------------------------------- 515 | -- Repeat a program 516 | --------------------------------------------------------------------------- 517 | 518 | -- | Repeat a program (iterate it) 519 | rep :: Word32 -> (a -> Program t a) -> a -> Program t a 520 | rep 0 _ a = return a 521 | rep n prg a = do 522 | b <- rep (n-1) prg a 523 | prg b 524 | 525 | --------------------------------------------------------------------------- 526 | -- RunPush 527 | --------------------------------------------------------------------------- 528 | 529 | class ExecProgram t a where 530 | exec :: Data e 531 | => Program t (a Word32 e) 532 | -> Push t Word32 e 533 | 534 | instance (t *<=* Block) => ExecProgram t Pull where 535 | exec = runPush . liftM push 536 | 537 | -- Here we also want the type error behaviour. 538 | -- It is a type error to try to "execute" a push t at any level different from t 539 | instance (t ~ t1) => ExecProgram t (Push t1) where 540 | exec = runPush 541 | 542 | execThread :: (ExecProgram Thread a, Data e) 543 | => Program Thread (a Word32 e) 544 | -> Push Thread Word32 e 545 | execThread = exec 546 | 547 | execThread' :: Data a => Program Thread a -> SPush Thread a 548 | execThread' = singletonPush 549 | 550 | execBlock :: (ExecProgram Block a, Data e) 551 | => Program Block (a Word32 e) 552 | -> Push Block Word32 e 553 | execBlock = exec 554 | 555 | execBlock' :: Data a => Program Block a -> SPush Block a 556 | execBlock' = singletonPush 557 | 558 | execWarp :: (ExecProgram Warp a, Data e) 559 | => Program Warp (a Word32 e) 560 | -> Push Warp Word32 e 561 | execWarp = exec 562 | 563 | execWarp' :: Data a => Program Warp a -> SPush Warp a 564 | execWarp' = singletonPush 565 | 566 | 567 | -- | Fuses the program that computes a Push array into the Push array. 568 | runPush :: Program t (Push t s a) -> Push t s a 569 | runPush prg = 570 | mkPush n $ \wf -> do 571 | parr <- prg 572 | parr <: wf 573 | -- It is a bit scary that I need to "evaluate" programs here. 574 | where n = len $ fst $ runPrg 0 prg 575 | 576 | -- | Lifts @runPush@ to one input functions. 577 | runPush1 :: (a -> Program t (Push t s b)) -> a -> Push t s b 578 | runPush1 f a = runPush (f a) 579 | 580 | -- | Lifts @runPush@ to two input functions. 581 | runPush2 :: (a -> b -> Program t (Push t s c)) -> a -> b -> Push t s c 582 | runPush2 f a b = runPush (f a b) 583 | 584 | -- | Converts a program computing a pull Array to a Push array 585 | runPull :: (t *<=* Block, ASize s) => Program t (Pull s a) -> Push t s a 586 | runPull = runPush . liftM push 587 | 588 | -- | Lifts @runPull@ to one input functions. 589 | runPull1 :: (t *<=* Block, ASize s) => (a -> Program t (Pull s b)) -> a -> Push t s b 590 | runPull1 f a = runPull (f a) 591 | 592 | -- | Lifts @runPull@ to two input functions. 593 | runPull2 :: (t *<=* Block, ASize s) => (a -> b -> Program t (Pull s c)) -> a -> b -> Push t s c 594 | runPull2 f a b = runPull (f a b) 595 | 596 | --------------------------------------------------------------------------- 597 | -- 598 | --------------------------------------------------------------------------- 599 | pushPrg :: (t *<=* Block) => Program t a -> SPush t a 600 | pushPrg = singletonPush 601 | 602 | 603 | --------------------------------------------------------------------------- 604 | -- Singleton push arrays 605 | --------------------------------------------------------------------------- 606 | 607 | -- Danger! use only with Scalar a's 608 | -- -- | Create a singleton Push array. 609 | --singletonPush :: a -> SPush t a 610 | --singletonPush = singletonPushP . return 611 | 612 | -- | Monadic version of @singleton@. 613 | singletonPush :: (t *<=* Block) => Program t a -> SPush t a 614 | singletonPush prg = 615 | mkPush 1 $ \wf -> do 616 | a <- prg 617 | forAll 1 $ \_ -> 618 | wf a 0 619 | 620 | --------------------------------------------------------------------------- 621 | -- Old stuff that should nolonger be exported! 622 | -- * It is still used internally 623 | --------------------------------------------------------------------------- 624 | 625 | -- | A way to enter into the hierarchy 626 | -- A bunch of Thread computations, spread across the threads of either 627 | -- a Warp, block or grid. (or performed sequentially in a single thread) 628 | tConcat :: (t *<=* Block, ASize l) 629 | => Pull l (SPush Thread b) 630 | -> Push t l b 631 | tConcat arr = 632 | mkPush (n * fromIntegral s) $ \wf -> do 633 | forAll (sizeConv n) $ \tid -> 634 | let wf' a ix = wf a (tid * sizeConv s + ix) 635 | p = arr ! tid -- f tid 636 | in p <: wf' 637 | where 638 | n = len arr 639 | s = len (arr ! 0) --(f (variable "tid")) -- arr 640 | 641 | -- | Variant of @tConcat@. 642 | tDistribute :: (t *<=* Block, ASize l) 643 | => l 644 | -> (EWord32 -> SPush Thread b) 645 | -> Push t l b 646 | tDistribute n f = tConcat (mkPull n f) 647 | 648 | 649 | -- | Distribute work across the parallel resources at a given level of the GPU hiearchy 650 | pConcat :: ASize l => Pull l (SPush t a) -> Push (Step t) l a 651 | pConcat arr = 652 | mkPush (n * fromIntegral rn) $ \wf -> 653 | distrPar (sizeConv n) $ \bix -> 654 | let p = arr ! bix 655 | wf' a ix = wf a (bix * sizeConv rn + ix) 656 | 657 | in p <: wf' 658 | where 659 | n = len arr 660 | rn = len (arr ! 0) -- All arrays are same length 661 | 662 | -- | Distribute work across the parallel resources at a given level of the GPU hierarchy 663 | pDistribute :: ASize l 664 | => l 665 | -> (EWord32 -> SPush t a) 666 | -> Push (Step t) l a 667 | pDistribute n f = pConcat (mkPull n f) 668 | 669 | -- | Sequential concatenation of a Pull of Push. 670 | sConcat :: ASize l => Pull l (SPush t a) -> Push t l a 671 | sConcat arr = 672 | mkPush (n * fromIntegral rn) $ \wf -> 673 | do 674 | seqFor (sizeConv n) $ \bix -> 675 | let p = arr ! bix 676 | wf' a ix = wf a (bix * sizeConv rn + ix) 677 | in p <: wf' 678 | where 679 | n = len arr 680 | rn = len $ arr ! 0 681 | 682 | 683 | -- | Variant of sConcat. 684 | sDistribute :: ASize l => l -> (EWord32 -> SPush t a) -> Push t l a 685 | sDistribute n f = sConcat (mkPull n f) 686 | 687 | -- pUnCoalesce adapted from Niklas branch. 688 | -- | Combines work that was distributed in a Coalesced way. 689 | -- | Applies a permutation on stores. 690 | pUnCoalesce :: ASize l 691 | => Pull l (SPush t a) 692 | -> Push (Step t) l a 693 | pUnCoalesce arr = 694 | mkPush (n * fromIntegral rn) $ \wf -> 695 | distrPar (sizeConv n) $ \bix -> 696 | let p = arr ! bix 697 | wf' a ix = wf a (bix * sizeConv rn + ix) 698 | in p <: (g wf') 699 | where 700 | n = len arr 701 | rn = len $ arr ! 0 702 | s = sizeConv rn 703 | g wf a i = wf a (i `div` s + (i`mod`s)*(sizeConv n)) 704 | --------------------------------------------------------------------------------