├── .appveyor.yml ├── .github └── workflows │ └── ci-linux.yml ├── .gitignore ├── .gitmodules ├── .mailmap ├── .travis.yml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── Setup.hs ├── WINDOWS.md ├── cbits ├── init.c ├── stubs.c └── stubs.h ├── cuda.cabal ├── examples ├── Makefile ├── common │ ├── common.mk │ ├── include │ │ ├── cudpp │ │ │ ├── LICENSE │ │ │ ├── cudpp_globals.h │ │ │ ├── shared_mem.h │ │ │ └── type_vector.h │ │ ├── operator.h │ │ └── utils.h │ └── src │ │ ├── C2HS.hs │ │ ├── PrettyPrint.hs │ │ ├── RandomVector.hs │ │ ├── Time.hs │ │ ├── control.cpp │ │ └── cudpp │ │ ├── LICENSE │ │ ├── scan_cta.cu │ │ ├── scan_kernel.cu │ │ ├── segmented_scan_cta.cu │ │ ├── segmented_scan_kernel.cu │ │ └── vector_kernel.cu └── src │ ├── bandwidthTest │ ├── BandwidthTest.hs │ ├── Makefile │ └── results │ │ ├── Bandwidth.numbers │ │ ├── GT120.pdf │ │ └── Tesla.pdf │ ├── deviceQueryDrv │ └── DeviceQuery.hs │ ├── enum │ ├── Makefile │ ├── enum.cu │ └── enum.h │ ├── fold │ ├── Fold.chs │ ├── Makefile │ ├── fold.cu │ └── fold.h │ ├── foldSeg │ ├── foldSeg.cu │ └── foldSeg.h │ ├── map2 │ ├── Makefile │ ├── Map.hs │ └── map.cu │ ├── matrixMul │ ├── LICENSE │ ├── Makefile │ ├── MatrixMul.hs │ ├── matrix_mul.cu │ └── matrix_mul.h │ ├── matrixMulDrv │ ├── LICENSE │ ├── Makefile │ ├── MatrixMul.hs │ ├── matrix_mul.cu │ └── matrix_mul.h │ ├── multiGPU │ ├── Makefile │ ├── multiGPU.hs │ ├── simpleMultiGPU.h │ └── simpleMultiGPU_kernel.cu │ ├── permute │ └── permute.cu │ ├── replicate │ ├── Makefile │ ├── replicate.cu │ └── replicate.h │ ├── scan │ ├── Makefile │ ├── Scan.chs │ ├── scan.cu │ └── scan.h │ ├── simpleTexture │ ├── Makefile │ ├── SimpleTexture.hs │ ├── data │ │ ├── lena_bw.pgm │ │ └── ref_rotated.pgm │ └── simpleTexture.cu │ ├── smvm │ ├── Makefile │ ├── SMVM.chs │ ├── smvm-csr.cu │ ├── smvm-cudpp.cu │ ├── smvm.h │ └── texture.h │ ├── sort │ ├── Makefile │ ├── Sort.chs │ ├── radix_sort.cu │ └── sort.h │ ├── vectorAddDrv │ ├── Makefile │ ├── VectorAdd.hs │ └── 
vector_add.cu │ └── zipWith │ └── zipWith.cu ├── src ├── Foreign │ ├── C │ │ └── Extra.hs │ ├── CUDA.hs │ └── CUDA │ │ ├── Analysis.hs │ │ ├── Analysis │ │ ├── Device.chs │ │ └── Occupancy.hs │ │ ├── Driver.hs │ │ ├── Driver │ │ ├── Context.hs │ │ ├── Context │ │ │ ├── Base.chs │ │ │ ├── Config.chs │ │ │ ├── Peer.chs │ │ │ └── Primary.chs │ │ ├── Device.chs │ │ ├── Error.chs │ │ ├── Event.chs │ │ ├── Exec.chs │ │ ├── Graph │ │ │ ├── Base.chs │ │ │ ├── Build.chs │ │ │ ├── Capture.chs │ │ │ └── Exec.chs │ │ ├── IPC │ │ │ ├── Event.chs │ │ │ └── Marshal.chs │ │ ├── Marshal.chs │ │ ├── Module.hs │ │ ├── Module │ │ │ ├── Base.chs │ │ │ ├── Link.chs │ │ │ └── Query.chs │ │ ├── Profiler.chs │ │ ├── Stream.chs │ │ ├── Texture.chs │ │ ├── Unified.chs │ │ └── Utils.chs │ │ ├── Internal │ │ └── C2HS.hs │ │ ├── Path.chs │ │ ├── Ptr.hs │ │ ├── Runtime.hs │ │ └── Runtime │ │ ├── Device.chs │ │ ├── Error.chs │ │ ├── Event.chs │ │ ├── Exec.chs │ │ ├── Marshal.chs │ │ ├── Stream.chs │ │ ├── Texture.chs │ │ └── Utils.chs └── Text │ └── Show │ └── Describe.hs ├── stack-7.10.yaml ├── stack-7.8.yaml ├── stack-8.0.yaml ├── stack-8.10.yaml ├── stack-8.2.yaml ├── stack-8.4.yaml ├── stack-8.6.yaml ├── stack-8.8.yaml └── stack-9.0.yaml /.appveyor.yml: -------------------------------------------------------------------------------- 1 | # vim: nospell 2 | 3 | clone_folder: "c:\\cuda" 4 | 5 | skip_commits: 6 | message: /\[ci skip\]/ 7 | 8 | environment: 9 | global: 10 | STACK_ROOT: "c:\\sr" 11 | matrix: 12 | - GHC: "8.10" 13 | - GHC: "8.8" 14 | - GHC: "8.6" 15 | - GHC: "8.4" 16 | - GHC: "8.2" 17 | - GHC: "8.0" 18 | # - GHC: "7.10" 19 | # - GHC: "7.8" # failed to install ghc: https://ci.appveyor.com/project/tmcdonell/cuda/build/1.0.4/job/ufhtj0klyq73psas#L149 20 | 21 | before_build: 22 | # http://help.appveyor.com/discussions/problems/6312-curl-command-not-found 23 | - set PATH=C:\Program Files\Git\mingw64\bin;%PATH% 24 | - set PATH=C:\Users\appveyor\AppData\Roaming\local\bin;%PATH% 25 | 26 | # 
install CUDA-9.0 27 | - appveyor DownloadFile "https://developer.nvidia.com/compute/cuda/9.0/Prod/network_installers/cuda_9.0.176_windows_network-exe" -FileName install_cuda.exe 28 | - install_cuda.exe -s compiler_9.0 cudart_9.0 cublas_9.0 cublas_dev_9.0 cufft_9.0 cufft_dev_9.0 cusolver_9.0 cusolver_dev_9.0 cusparse_9.0 cusparse_dev_9.0 29 | - set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v9.0\nvvm\bin;%PATH% 30 | - set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v9.0\bin;%PATH% 31 | - nvcc --version 32 | 33 | # CUDA refuses to install the driver if no compatible GPU can be found, so 34 | # copy these .dll files manually 35 | - appveyor DownloadFile "https://drive.google.com/uc?export=download&id=14x0RX8QlHQ6vKhimbR4FDRgfP7EoHfgc" -FileName nvdriver-9.0.176.7z 36 | - 7z x nvdriver-9.0.176.7z -oC:\Windows\System32 37 | 38 | # install stack 39 | - appveyor DownloadFile "https://www.stackage.org/stack/windows-x86_64" -FileName stack.zip 40 | - 7z x stack.zip stack.exe 41 | - stack --version 42 | 43 | - ln -s stack-%GHC%.yaml stack.yaml 44 | - stack setup --no-terminal > NUL 45 | - stack build --no-terminal --no-copy-bins --fast --jobs=1 --only-dependencies --test --no-run-tests --haddock --no-haddock-deps 46 | 47 | build_script: 48 | - stack build --no-terminal --fast --test --no-run-tests 49 | 50 | test_script: 51 | - stack test 52 | 53 | -------------------------------------------------------------------------------- /.github/workflows/ci-linux.yml: -------------------------------------------------------------------------------- 1 | name: ci-linux 2 | 3 | # Trigger the workflow on push or pull request 4 | on: 5 | pull_request: 6 | # branches: [master] 7 | push: 8 | paths: 9 | - '.github/workflows/ci-linux.yml' 10 | - 'stack*.yaml' 11 | - '*.cabal' 12 | - '*/src/**' 13 | - '*/cbits/**' 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | matrix: 20 | ghc: 21 | - "8.10" 22 | - "8.8" 23 | - "8.6" 24 | - "8.4" 25 | - "8.2" 26 
| - "8.0" 27 | - "7.8" 28 | cuda: 29 | - "10.2" 30 | - "10.1" 31 | - "10.0" 32 | - "9.2" 33 | - "9.1" 34 | - "9.0" 35 | 36 | # include: 37 | # - os: windows-latest 38 | # ghc: "8.10" 39 | # cuda: "10.2.89.20191206" 40 | 41 | env: 42 | STACK_FLAGS: "--fast" 43 | HADDOCK_FLAGS: "--haddock --no-haddock-deps --no-haddock-hyperlink-source --haddock-arguments=\"--no-print-missing-docs\"" 44 | 45 | steps: 46 | - uses: actions/checkout@v2 47 | 48 | - run: ln -s stack-${{ matrix.ghc }}.yaml stack.yaml 49 | 50 | - uses: actions/cache@v2 51 | with: 52 | path: snapshot.pkgdb 53 | key: ${{ runner.os }}-${{ matrix.ghc }}-${{ matrix.cuda }}-snapshot.pkgdb 54 | 55 | - uses: actions/cache@v2 56 | with: 57 | path: | 58 | ~/.local/bin 59 | ~/.stack/programs 60 | ~/.stack/snapshots 61 | .stack-work 62 | key: ${{ runner.os }}-${{ matrix.ghc }}-${{ matrix.cuda }}-${{ hashFiles('stack.yaml') }}-${{ hashFiles('snapshot.pkgdb') }} 63 | restore-keys: | 64 | ${{ runner.os }}-${{ matrix.ghc }}-${{ matrix.cuda }}-${{ hashFiles('stack.yaml') }}-${{ hashFiles('snapshot.pkgdb') }} 65 | ${{ runner.os }}-${{ matrix.ghc }}-${{ matrix.cuda }}-${{ hashFiles('stack.yaml') }}- 66 | ${{ runner.os }}-${{ matrix.ghc }}-${{ matrix.cuda }}- 67 | 68 | - name: Install stack 69 | run: | 70 | mkdir -p ~/.local/bin 71 | if [[ ! 
-x ~/.local/bin/stack ]]; then 72 | curl -sL https://get.haskellstack.org/stable/linux-x86_64.tar.gz | tar xz --wildcards --strip-components=1 -C ~/.local/bin '*/stack' 73 | chmod a+x ~/.local/bin/stack 74 | fi 75 | echo "~/.local/bin" >> $GITHUB_PATH 76 | 77 | - name: Install GHC 78 | run: stack setup --install-ghc 79 | 80 | - name: Install CUDA 81 | run: | 82 | MATRIX_CUDA=${{ matrix.cuda }} 83 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin 84 | sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600 85 | sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub 86 | sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" 87 | sudo apt-get update 88 | sudo apt-get -y install cuda-${MATRIX_CUDA/./-} 89 | echo "CUDA_HOME=/usr/local/cuda-${MATRIX_CUDA}" >> $GITHUB_ENV 90 | echo "LD_LIBRARY_PATH=/usr/local/cuda-${MATRIX_CUDA}/lib64:$(stack exec ghc -- --print-libdir)/rts:/usr/local/cuda-${MATRIX_CUDA}/nvvm/lib64:${LD_LIBRARY_PATH}" >> $GITHUB_ENV 91 | echo "/usr/local/cuda-${MATRIX_CUDA}/bin" >> $GITHUB_PATH 92 | 93 | - name: Build dependencies 94 | run: stack build $STACK_FLAGS --only-dependencies 95 | 96 | - name: Build 97 | run: stack build $STACK_FLAGS $HADDOCK_FLAGS 98 | 99 | - name: Save snapshot.pkgdb 100 | run: stack exec ghc-pkg -- --package-db=$(stack path --snapshot-pkg-db) list > snapshot.pkgdb 101 | 102 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /cuda.buildinfo 2 | /cuda.buildinfo.generated 3 | *.i 4 | dist/ 5 | /stack.yaml 6 | /.stack-work 7 | /stack.yaml.lock 8 | dist-newstyle 9 | -------------------------------------------------------------------------------- /.gitmodules: 
-------------------------------------------------------------------------------- 1 | [submodule ".travis"] 2 | path = .travis 3 | url = https://github.com/tmcdonell/travis-scripts.git 4 | -------------------------------------------------------------------------------- /.mailmap: -------------------------------------------------------------------------------- 1 | Trevor L. McDonell 2 | Trevor L. McDonell 3 | Sean Lee 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # vim: nospell 2 | # 3 | language: c 4 | dist: trusty 5 | 6 | cache: 7 | timeout: 600 8 | directories: 9 | - $HOME/.stack 10 | - $HOME/.local/bin 11 | - $TRAVIS_BUILD_DIR/.stack-work 12 | 13 | addons: 14 | apt: 15 | sources: 16 | - hvr-ghc 17 | packages: 18 | - alex-3.1.7 19 | - happy-1.19.5 20 | 21 | compiler: 22 | - GHC-8.10.1 23 | - GHC-8.8.3 24 | - GHC-8.6.5 25 | - GHC-8.4.4 26 | - GHC-8.2.2 27 | - GHC-8.0.2 28 | # - GHC-7.10.3 29 | # - GHC-7.8.4 30 | 31 | env: 32 | # global: 33 | matrix: 34 | - CUDA=10.1.105-1 35 | - CUDA=10.0.130-1 36 | - CUDA=8.0.61-1 37 | - CUDA=7.5-18 38 | - CUDA=7.0-28 39 | - CUDA=6.5-14 40 | 41 | matrix: 42 | fast_finish: true 43 | # allow_failures: 44 | 45 | before_install: 46 | - export GHC=${CC:4} 47 | - unset CC 48 | - export PATH=/opt/alex/3.1.7/bin:/opt/happy/1.19.5/bin:$PATH 49 | - export CUDA_INSTALL_EXTRA_LIBS=0 50 | - source .travis/install-cuda-trusty.sh 51 | - source .travis/install-stack.sh 52 | 53 | # # ghc-8.6 builds a broken version of c2hs 54 | # - | 55 | # if [ ${GHC%.*} == "8.6" -a ! 
-x "$(which c2hs)" ]; then 56 | # travis_retry stack --stack-yaml=stack-8.4.yaml setup --no-terminal --no-system-ghc 57 | # travis_retry stack --stack-yaml=stack-8.4.yaml install c2hs --fast --no-terminal 58 | # fi 59 | 60 | # build environment 61 | - echo "$(stack exec ghc -- --version) [$(stack exec ghc -- --print-project-git-commit-id 2> /dev/null || echo '?')]" 62 | - stack --version 63 | - nvcc --version 64 | 65 | install: 66 | - export FLAGS="--fast --jobs=2 --no-terminal --no-copy-bins --no-interleaved-output" 67 | - travis_retry stack build --only-dependencies 68 | 69 | script: 70 | - travis_retry stack build --haddock --no-haddock-deps 71 | 72 | after_success: 73 | - source .travis/update-accelerate-buildbot.sh 74 | 75 | after_failure: 76 | - dmesg 77 | - ls -R /usr/local/cuda* 78 | 79 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) [2009..2014] Trevor L. McDonell, University of New South Wales. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of the University of New South Wales nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 
14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY 16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Haskell FFI Bindings to CUDA 2 | ============================ 3 | 4 | [![CI-Linux](https://github.com/tmcdonell/cuda/actions/workflows/ci-linux.yml/badge.svg)](https://github.com/tmcdonell/cuda/actions/workflows/ci-linux.yml) 5 | [![CI-Windows](https://github.com/tmcdonell/cuda/actions/workflows/ci-windows.yml/badge.svg)](https://github.com/tmcdonell/cuda/actions/workflows/ci-windows.yml) 6 | [![Stackage LTS](https://stackage.org/package/cuda/badge/lts)](https://stackage.org/lts/package/cuda) 7 | [![Stackage Nightly](https://stackage.org/package/cuda/badge/nightly)](https://stackage.org/nightly/package/cuda) 8 | [![Hackage](https://img.shields.io/hackage/v/cuda.svg)](https://hackage.haskell.org/package/cuda) 9 | 10 | The CUDA library provides a direct, general purpose C-like SPMD programming 11 | model for NVIDIA graphics cards (G8x series onwards). This is a collection of 12 | bindings to allow you to call and control, although not write, such functions 13 | from Haskell-land. 
You will need to install the CUDA driver and developer 14 | toolkit. 15 | 16 | 17 | 18 | The configure step will look for your CUDA installation in the standard places, 19 | and if the `nvcc` compiler is found in your `PATH`, relative to that. 20 | 21 | For important information on installing on Windows, see: 22 | 23 | 24 | 25 | 26 | ## Missing functionality 27 | 28 | An incomplete list of missing bindings. Pull requests welcome! 29 | 30 | ### CUDA-9 31 | 32 | - cuLaunchCooperativeKernelMultiDevice 33 | 34 | ### CUDA-10.0 35 | 36 | - cuDeviceGetLuid (windows only?) 37 | - cuLaunchHostFunc 38 | - cuGraphHostNode[Get/Set]Params 39 | - cuGraphKernelNode[Get/Set]Params 40 | - cuGraphMemcpyNode[Get/Set]Params 41 | - cuGraphMemsetNode[Get/Set]Params 42 | 43 | ### CUDA-10.2 44 | 45 | - cuDeviceGetNvSciSyncAttributes 46 | - cuMemAddressFree 47 | - cuMemAddressReserve 48 | - cuMemCreate 49 | - cuMemExportToShareableHandle 50 | - cuMemGetAccess 51 | - cuMemGetAllocationGranularity 52 | - cuMemGetAllocationPrepertiesFromHandle 53 | - cuMemImportFromShareableHandle 54 | - cuMemMap 55 | - cuMemRelease 56 | - cuMemSetAccess 57 | - cuMemUnmap 58 | - cuGraphExecHostNodeSetParams 59 | - cuGraphExecMemcpyNodeSetParams 60 | - cuGraphExecMemsetNodeSetParams 61 | - cuGraphExecUpdate 62 | 63 | ### CUDA-11.0 64 | 65 | - cuCtxResetPersistentingL2Cache 66 | - cuMemRetainAllocationHandle 67 | - cuStreamCopyAttributes 68 | - cuStreamGetAttribute 69 | - cuStreamSetAttribute 70 | - cuGraphKernelNodeCopyAttributes 71 | - cuGraphKernelNodeGetAttribute 72 | - cuGraphKernelNodeSetAttribute 73 | - cuOccupancyAvailableDynamicSMemPerBlock 74 | 75 | ### CUDA-11.1 76 | 77 | - cuDeviceGetTexture1DLinearMaxWidth 78 | - cuArrayGetSparseProperties 79 | - cuMipmappedArrayGetSparseProperties 80 | - cuMemMapArrayAsync 81 | - cuEventRecordWithFlags 82 | - cuGraphAddEventRecordNode 83 | - cuGraphAddEventWaitNode 84 | - cuGraphEventRecordNodeGetEvent 85 | - cuGraphEventRecordNodeSetEvent 86 | - 
cuGraphEventWaitNodeGetEvent 87 | - cuGraphEventWaitNodeSetEvent 88 | - cuGraphExecChildGraphNodeSetParams 89 | - cuGraphExecEventRecordNodeSetEvent 90 | - cuGraphExecEventWaitNodeSetEvent 91 | - cuGraphUpload 92 | 93 | ### CUDA-11.2 94 | 95 | - cuDeviceGetDefaultMemPool 96 | - cuDeviceGetMemPool 97 | - cuDeviceSetMemPool 98 | - cuArrayGetPlane 99 | - cuMemAllocAsync 100 | - cuMemAllocFromPoolAsync 101 | - cuMemFreeAsync 102 | - cuMemPoolCreate 103 | - cuMemPoolDestroy 104 | - cuMemPoolExportPointer 105 | - cuMemPoolExportToShareableHandle 106 | - cuMemPoolGetAccess 107 | - cuMemPoolGetAttribute 108 | - cuMemPoolImportFromShareableHandle 109 | - cuMemPoolImportPointer 110 | - cuMemPoolSetAccess 111 | - cuMemPoolSetAttribute 112 | - cuMemPoolTrimTo 113 | - cuGraphAddExternalSemaphoresSignalNode 114 | - cuGraphAddExternalSemaphoresWaitNode 115 | - cuGraphExecExternalSemaphoresSignalNodeSetParams 116 | - cuGraphExecExternalSemaphoresWaitNodeSetParams 117 | - cuGraphExternalSemaphoresSignalNodeGetParams 118 | - cuGraphExternalSemaphoresSignalNodeSetParams 119 | - cuGraphExternalSemaphoresWaitNodeGetParams 120 | - cuGraphExternalSemaphoresWaitNodeSetParams 121 | 122 | ### CUDA-11.3 123 | 124 | - cuStreamGetCaptureInfo_v2 125 | - cuFuncGetModule 126 | - cuGraphDebugDotPrint 127 | - cuGraphReleaseUserObject 128 | - cuGraphRetainUserObject 129 | - cuUserObjectCreate 130 | - cuUserObjectRelease 131 | - cuUserObjectRetain 132 | - cuGetProcAddress 133 | 134 | ### CUDA-11.4 135 | 136 | - cuDeviceGetUuid_v2 137 | - cuCtxCreate_v3 138 | - cuCtxGetExecAffinity 139 | - cuDeviceGetGraphMemAttribute 140 | - cuDeviceGraphMemTrim 141 | - cuDeviceSetGraphMemAttribute 142 | - cuGraphAddMemAllocNode 143 | - cuGraphAddMemFreeNode 144 | - cuGraphInstantiateWithFlags 145 | - cuGraphMemAllocNodeGetParams 146 | - cuGraphMemFreeNodeGetParams 147 | 148 | -------------------------------------------------------------------------------- /WINDOWS.md: 
-------------------------------------------------------------------------------- 1 | Using the CUDA package on Windows 2 | ================================= 3 | 4 | The CUDA package works on Windows and is actively maintained. If you encounter 5 | any other issues, please report them. 6 | 7 | Note that if you build your applications for the Windows 64-bit architecture, 8 | you'll need to update your `ld.exe` as described below. 9 | 10 | 11 | Windows 64-bit 12 | -------------- 13 | 14 | There is a known issue with the version of `ld.exe` that ships with the 64-bit 15 | versions of (at least) GHC-7.8.4 and GHC-7.10.2. The version of `ld.exe` that 16 | ships with these GHC distributions does not properly link against MS-style 17 | dynamic libraries (such as those that ship with the CUDA toolkit), causing the 18 | application to crash at runtime once those library routines are called. The 19 | configure step will fail if it detects an old version of `ld.exe` (< 2.25.1), 20 | which are known to be broken. 21 | 22 | If you are using the 64-bit GHC distributions mentioned above, you will need to 23 | apply the following steps. This bug does not affect 32-bit GHC distributions. 24 | The bug has been fixed in MinGW binutils `ld.exe` >= 2.25.1, so it is expected 25 | that newer releases of GHC will not have this issue. 26 | 27 | The problem is fixed by replacing the linker binary `ld.exe` with the newer 28 | (patched) version, available as part of the MSys2 binutils package here: 29 | 30 | > 31 | 32 | The updated `ld.exe` binary must replace the version at the path: 33 | 34 | > `GHC_PATH\mingw\x86_64-w64-mingw32\bin\` 35 | 36 | Note that there is another copy of `ld.exe` located at `GHC_PATH\mingw\bin\`, 37 | but this version does not seem to be used, so replacing it as well is not 38 | necessary. It is not sufficient to replace whatever version of `ld.exe` appears 39 | first in your `PATH`. 
40 | 41 | Please note that having another MinGW installation in `PATH` before the one 42 | shipped with GHC may break things, particularly if you mix 32/64-bit 43 | distributions of MinGW and GHC. 44 | 45 | For further discussion of the bug, see: 46 | 47 | * [CUDA package issue][cuda31] 48 | * [GHC issue][ghc10885] 49 | * [binutils issue][binutils16598] 50 | 51 | [cuda31]: https://github.com/tmcdonell/cuda/issues/31 52 | [ghc10885]: https://ghc.haskell.org/trac/ghc/ticket/10885 53 | [binutils16598]: https://sourceware.org/bugzilla/show_bug.cgi?id=16598 54 | 55 | -------------------------------------------------------------------------------- /cbits/init.c: -------------------------------------------------------------------------------- 1 | #include "cbits/stubs.h" 2 | #include 3 | 4 | /* 5 | * Make sure that the linker always touches this module so that it notices the 6 | * below constructor function. Calling this empty function as part of 7 | * 'Foreign.CUDA.Driver.initialise' should be sufficient to prevent it from ever 8 | * being stripped. 9 | */ 10 | void enable_constructors() { } 11 | 12 | /* 13 | * GHC-8 introduced a new (simpler) 64-bit allocator, which on startup 'mmap's 14 | * 1TB of address space and then commits sub-portions of that memory as needed. 15 | * 16 | * The CUDA driver also appears to 'mmap' a large chunk of address space on 17 | * 'cuInit', probably as the arena for shuffling memory to and from the device, 18 | * but attempts to do so at a _fixed_ address. If the GHC RTS has already taken 19 | * that address at the time we call 'cuInit', driver initialisation will fail 20 | * with an "out of memory" error. 21 | * 22 | * The workaround is to call 'cuInit' before initialising the RTS. Then the 23 | * RTS's allocation will avoid CUDA's allocation, since the RTS doesn't care 24 | * where in the address space it gets that memory. 
Embedding the following 25 | * __attribute__((constructor)) function in the library does the trick nicely, 26 | * and the linker will ensure that this gets executed when the shared library is 27 | * loaded (during program startup). 28 | * 29 | * Another way around this, without actually calling 'cuInit', would be to just 30 | * reserve the regions that 'cuInit' requires in the constructor function so 31 | * that the RTS avoids them, then release them before calling 'cuInit'. However, 32 | * since the CUDA driver is closed and we don't know exactly which regions to 33 | * reserve, that approach would be fragile. 34 | * 35 | * See: https://github.com/tmcdonell/cuda/issues/39 36 | */ 37 | #ifdef CUDA_PRELOAD 38 | __attribute__((constructor)) void preinitialise_cuda() 39 | { 40 | CUresult status = cuInit (0); 41 | 42 | if ( status != CUDA_SUCCESS ) { 43 | #if CUDA_VERSION >= 6000 44 | const char* str = NULL; 45 | 46 | cuGetErrorString(status, &str); 47 | fprintf(stderr, "Failed to pre-initialise CUDA: %s\n", str); 48 | #else 49 | fprintf(stderr, "Failed to pre-initialise CUDA (%d)\n", status); 50 | #endif 51 | } 52 | } 53 | #endif 54 | 55 | -------------------------------------------------------------------------------- /examples/Makefile: -------------------------------------------------------------------------------- 1 | ifneq ($(emu),1) 2 | PROJECTS := $(shell find src -name Makefile) 3 | else 4 | PROJECTS := $(shell find src -name Makefile | xargs grep -L 'USEDRVAPI') 5 | endif 6 | 7 | 8 | %.do : 9 | $(MAKE) -C $(dir $*) $(MAKECMDGOALS) 10 | 11 | all : $(addsuffix .do,$(PROJECTS)) 12 | @echo "Finished building all" 13 | 14 | clean : $(addsuffix .do,$(PROJECTS)) 15 | @echo "Finished cleaning all" 16 | 17 | clobber : $(addsuffix .do,$(PROJECTS)) 18 | @echo "Finished cleaning all" 19 | -------------------------------------------------------------------------------- /examples/common/include/cudpp/LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2007-2009 The Regents of the University of California, Davis 2 | campus ("The Regents") and NVIDIA Corporation ("NVIDIA"). All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | * Neither the name of the The Regents, nor NVIDIA, nor the names of its 13 | contributors may be used to endorse or promote products derived from this 14 | software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 20 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 24 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 25 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | -------------------------------------------------------------------------------- /examples/common/include/cudpp/cudpp_globals.h: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------- 2 | // cuDPP -- CUDA Data Parallel Primitives library 3 | // ------------------------------------------------------------- 4 | // $Revision: 5632 $ 5 | // $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $ 6 | // ------------------------------------------------------------- 7 | // This source code is distributed under the terms of license.txt in 8 | // the root directory of this source distribution. 9 | // ------------------------------------------------------------- 10 | 11 | /** 12 | * @file 13 | * cudpp_globals.h 14 | * 15 | * @brief Global declarations defining machine characteristics of GPU target 16 | * These are currently set for best performance on G8X GPUs. The optimal 17 | * parameters may change on future GPUs. In the future, we hope to make 18 | * CUDPP a self-tuning library. 
19 | */ 20 | 21 | #ifndef __CUDPP_GLOBALS_H__ 22 | #define __CUDPP_GLOBALS_H__ 23 | 24 | const int NUM_BANKS = 16; /**< Number of shared memory banks */ 25 | const int LOG_NUM_BANKS = 4; /**< log_2(NUM_BANKS) */ 26 | const int CTA_SIZE = 128; /**< Number of threads in a CTA */ 27 | const int WARP_SIZE = 32; /**< Number of threads in a warp */ 28 | const int LOG_CTA_SIZE = 7; /**< log_2(CTA_SIZE) */ 29 | const int LOG_WARP_SIZE = 5; /**< log_2(WARP_SIZE) */ 30 | const int LOG_SIZEOF_FLOAT = 2; /**< log_2(sizeof(float)) */ 31 | const int SCAN_ELTS_PER_THREAD = 8; /**< Number of elements per scan thread */ 32 | const int SEGSCAN_ELTS_PER_THREAD = 8; /**< Number of elements per segmented scan thread */ 33 | 34 | const int maxSharedMemoryPerBlock = 16384; /**< Number of bytes of shared 35 | memory in each block */ 36 | const int maxThreadsPerBlock = CTA_SIZE; /**< Maximum number of 37 | * threads in a CTA */ 38 | 39 | #define AVOID_BANK_CONFLICTS /**< Set if by default, we want our 40 | * shared memory allocation to perform 41 | * additional computation to avoid bank 42 | * conflicts */ 43 | 44 | #ifdef AVOID_BANK_CONFLICTS 45 | #define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS) 46 | #else 47 | #define CONFLICT_FREE_OFFSET(index) (0) 48 | #endif 49 | 50 | #endif // __CUDPP_GLOBALS_H__ 51 | 52 | // Leave this at the end of the file 53 | // Local Variables: 54 | // mode:c++ 55 | // c-file-style: "NVIDIA" 56 | // End: 57 | -------------------------------------------------------------------------------- /examples/common/include/cudpp/shared_mem.h: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------- 2 | // cuDPP -- CUDA Data Parallel Primitives library 3 | // ------------------------------------------------------------- 4 | // $Revision: 5633 $ 5 | // $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $ 6 | // 
------------------------------------------------------------- 7 | // This source code is distributed under the terms of license.txt 8 | // in the root directory of this source distribution. 9 | // ------------------------------------------------------------- 10 | 11 | /** 12 | * @file 13 | * sharedmem.h 14 | * 15 | * @brief Shared memory declaration struct for templatized types. 16 | * 17 | * Because dynamically sized shared memory arrays are declared "extern" in CUDA, 18 | * we can't templatize their types directly. To get around this, we declare a 19 | * simple wrapper struct that will declare the extern array with a different 20 | * name depending on the type. This avoids linker errors about multiple 21 | * definitions. 22 | * 23 | * To use dynamically allocated shared memory in a templatized __global__ or 24 | * __device__ function, just replace code like this: 25 | * 26 | *
 27 |  *  template <class T>
 28 |  *  __global__ void
 29 |  *  foo( T* d_out, T* d_in)
 30 |  *  {
 31 |  *      // Shared mem size is determined by the host app at run time
 32 |  *      extern __shared__  T sdata[];
 33 |  *      ...
 34 |  *      doStuff(sdata);
 35 |  *      ...
 36 |  *  }
 37 |  * 
38 | * 39 | * With this 40 | *
 41 |  *  template <class T>
 42 |  *  __global__ void
 43 |  *  foo( T* d_out, T* d_in)
 44 |  *  {
 45 |  *      // Shared mem size is determined by the host app at run time
 46 |  *      SharedMemory<T> smem;
 47 |  *      T* sdata = smem.getPointer();
 48 |  *      ...
 49 |  *      doStuff(sdata);
 50 |  *      ...
 51 |  *  }
 52 |  * 
53 | */ 54 | 55 | #ifndef __SHARED_MEM_H__ 56 | #define __SHARED_MEM_H__ 57 | 58 | 59 | /** @brief Wrapper class for templatized dynamic shared memory arrays. 60 | * 61 | * This struct uses template specialization on the type \a T to declare 62 | * a differently named dynamic shared memory array for each type 63 | * (\code extern __shared__ T s_type[] \endcode). 64 | * 65 | * Currently there are specializations for the following types: 66 | * \c int, \c uint, \c char, \c uchar, \c short, \c ushort, \c long, 67 | * \c unsigned long, \c bool, \c float, and \c double. One can also specialize it 68 | * for user defined types. 69 | */ 70 | template 71 | struct SharedMemory 72 | { 73 | /** Return a pointer to the runtime-sized shared memory array. **/ 74 | __device__ T* getPointer() 75 | { 76 | extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types 77 | Error_UnsupportedType(); 78 | return (T*)0; 79 | } 80 | // TODO: Use operator overloading to make this class look like a regular array 81 | }; 82 | 83 | // Following are the specializations for the following types. 84 | // int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double 85 | // One could also specialize it for user-defined types. 
86 | 87 | #define SPEC_SHAREDMEM(T, name) \ 88 | template <> struct SharedMemory \ 89 | { \ 90 | __device__ T* getPointer() { extern __shared__ T s_##name[]; return s_##name; } \ 91 | } 92 | 93 | SPEC_SHAREDMEM(int, int); 94 | SPEC_SHAREDMEM(char, char); 95 | SPEC_SHAREDMEM(long, long); 96 | SPEC_SHAREDMEM(short, short); 97 | SPEC_SHAREDMEM(bool, bool); 98 | SPEC_SHAREDMEM(float, float); 99 | SPEC_SHAREDMEM(double, double); 100 | 101 | SPEC_SHAREDMEM(unsigned int, uint); 102 | SPEC_SHAREDMEM(unsigned char, uchar); 103 | SPEC_SHAREDMEM(unsigned long, ulong); 104 | SPEC_SHAREDMEM(unsigned short, ushort); 105 | 106 | SPEC_SHAREDMEM(uchar4, uchar4); 107 | 108 | #undef SPEC_SHAREDMEM 109 | #endif // __SHARED_MEM_H__ 110 | 111 | // Leave this at the end of the file 112 | // Local Variables: 113 | // mode:c++ 114 | // c-file-style: "NVIDIA" 115 | // End: 116 | -------------------------------------------------------------------------------- /examples/common/include/cudpp/type_vector.h: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------- 2 | // cuDPP -- CUDA Data Parallel Primitives library 3 | // ------------------------------------------------------------- 4 | // $Revision: 5632 $ 5 | // $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $ 6 | // ------------------------------------------------------------- 7 | // This source code is distributed under the terms of license.txt in 8 | // the root directory of this source distribution. 9 | // ------------------------------------------------------------- 10 | 11 | #ifndef __TYPE_VECTOR_H__ 12 | #define __TYPE_VECTOR_H__ 13 | 14 | /** @brief Utility template struct for generating small vector types from scalar types 15 | * 16 | * Given a base scalar type (\c int, \c float, etc.) and a vector length (1 through 4) as 17 | * template parameters, this struct defines a vector type (\c float3, \c int4, etc.) 
of the 18 | * specified length and base type. For example: 19 | * \code 20 | * template 21 | * __device__ void myKernel(T *data) 22 | * { 23 | * typeToVector::Result myVec4; // create a vec4 of type T 24 | * myVec4 = (typeToVector::Result*)data[0]; // load first element of data as a vec4 25 | * } 26 | * \endcode 27 | * 28 | * This functionality is implemented using template specialization. Currently specializations 29 | * for int, float, and unsigned int vectors of lengths 2-4 are defined. Note that this results 30 | * in types being generated at compile time -- there is no runtime cost. typeToVector is used by 31 | * the optimized scan \c __device__ functions in scan_cta.cu. 32 | */ 33 | template 34 | struct typeToVector 35 | { 36 | typedef T Result; 37 | }; 38 | 39 | #define TYPE_VECTOR(type, name) \ 40 | template <> struct typeToVector { typedef name##2 Result; }; \ 41 | template <> struct typeToVector { typedef name##3 Result; }; \ 42 | template <> struct typeToVector { typedef name##4 Result; } 43 | 44 | TYPE_VECTOR(int, int); 45 | TYPE_VECTOR(float, float); 46 | TYPE_VECTOR(unsigned int, uint); 47 | 48 | 49 | #undef TYPE_VECTOR 50 | #endif 51 | 52 | -------------------------------------------------------------------------------- /examples/common/include/operator.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Operator 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * A template class for unary/binary kernel operations 8 | * 9 | * ---------------------------------------------------------------------------*/ 10 | 11 | 12 | #ifndef __OPERATOR_H__ 13 | #define __OPERATOR_H__ 14 | 15 | #include 16 | #include 17 | 18 | 19 | /* 20 | * Template class for an operation that can be mapped over an array. 
21 | */ 22 | template 23 | class Functor 24 | { 25 | public: 26 | /* 27 | * Apply the operation to the given operand. 28 | */ 29 | static __device__ Tb apply(const Ta &x); 30 | }; 31 | 32 | template 33 | class fromIntegral : Functor 34 | { 35 | public: 36 | static __device__ Tb apply(const Ta &x) { return (Tb) x; } 37 | }; 38 | 39 | 40 | /* 41 | * Template class for binary operators. Certain algorithms may require the 42 | * operator to be associative (that is, Ta == Tb), such as parallel scan and 43 | * reduction. 44 | * 45 | * As this is template code, it should compile down to something efficient... 46 | */ 47 | template 48 | class BinaryOp 49 | { 50 | public: 51 | /* 52 | * Apply the operation to the given operands. 53 | */ 54 | static __device__ Tc apply(const Ta &a, const Tb &b); 55 | 56 | /* 57 | * Return an identity element for the type Tc. 58 | * 59 | * This may have special meaning for a given implementation, for example a 60 | * `max' operation over integers may want to return INT_MIN. 61 | */ 62 | static __device__ Tc identity(); 63 | }; 64 | 65 | /* 66 | * Return the minimum or maximum value of a type 67 | */ 68 | template inline __device__ T getMin(); 69 | template inline __device__ T getMax(); 70 | 71 | #define SPEC_MINMAX(type,vmin,vmax) \ 72 | template <> inline __device__ type getMin() { return vmin; }; \ 73 | template <> inline __device__ type getMax() { return vmax; } \ 74 | 75 | SPEC_MINMAX(float, -FLT_MAX, FLT_MAX); 76 | SPEC_MINMAX(int, INT_MIN, INT_MAX); 77 | SPEC_MINMAX(char, CHAR_MIN, CHAR_MAX); 78 | SPEC_MINMAX(unsigned int, 0, UINT_MAX); 79 | SPEC_MINMAX(unsigned char, 0, UCHAR_MAX); 80 | 81 | 82 | /* 83 | * Basic binary arithmetic operations. We take advantage of automatic type 84 | * promotion to keep the parameters general. 
85 | */ 86 | #define BASIC_OP(name,expr,id) \ 87 | template \ 88 | class name : BinaryOp \ 89 | { \ 90 | public: \ 91 | static __device__ Tc apply(const Ta &a, const Tb &b) { return expr; } \ 92 | static __device__ Tc identity() { return id; } \ 93 | } 94 | 95 | #define LOGICAL_OP(name,expr,id) \ 96 | template \ 97 | class name : BinaryOp \ 98 | { \ 99 | public: \ 100 | static __device__ bool apply(const Ta &a, const Tb &b) { return expr; }\ 101 | static __device__ bool identity() { return id; } \ 102 | } 103 | 104 | BASIC_OP(Plus, a + b, 0); 105 | BASIC_OP(Times, a * b, 1); 106 | BASIC_OP(Min, min(a,b), getMax()); 107 | BASIC_OP(Max, max(a,b), getMin()); 108 | 109 | LOGICAL_OP(Eq, a == b, false); 110 | 111 | #undef SPEC_MINMAX 112 | #undef BASIC_OP 113 | #endif 114 | 115 | -------------------------------------------------------------------------------- /examples/common/include/utils.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Utils 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | 10 | #ifndef __UTILS_H__ 11 | #define __UTILS_H__ 12 | 13 | #include 14 | #include 15 | 16 | 17 | /* 18 | * Core assert function. Don't let this escape... 19 | */ 20 | #if defined(__CUDACC__) || !defined(__DEVICE_EMULATION__) 21 | #define __assert(e, file, line) ((void)0) 22 | #else 23 | #define __assert(e, file, line) \ 24 | ((void) fprintf (stderr, "%s:%u: failed assertion `%s'\n", file, line, e), abort()) 25 | #endif 26 | 27 | /* 28 | * Test the given expression, and abort the program if it evaluates to false. 29 | * Only available in debug mode. 30 | */ 31 | #ifndef _DEBUG 32 | #define assert(e) ((void)0) 33 | #else 34 | #define assert(e) \ 35 | ((void) ((e) ? 
(void(0)) : __assert (#e, __FILE__, __LINE__))) 36 | #endif 37 | 38 | 39 | /* 40 | * Macro to insert __syncthreads() in device emulation mode 41 | */ 42 | #ifdef __DEVICE_EMULATION__ 43 | #define __EMUSYNC __syncthreads() 44 | #else 45 | #define __EMUSYNC 46 | #endif 47 | 48 | 49 | /* 50 | * Check the return status of CUDA API calls, and abort with an appropriate 51 | * error string on failure. 52 | */ 53 | #define CUDA_SAFE_CALL_NO_SYNC(call) \ 54 | do { \ 55 | cudaError err = call; \ 56 | if(cudaSuccess != err) { \ 57 | const char *str = cudaGetErrorString(err); \ 58 | __assert(str, __FILE__, __LINE__); \ 59 | } \ 60 | } while (0) 61 | 62 | #define CUDA_SAFE_CALL(call) \ 63 | do { \ 64 | CUDA_SAFE_CALL_NO_SYNC(call); \ 65 | CUDA_SAFE_CALL_NO_SYNC(cudaThreadSynchronize()); \ 66 | } while (0) 67 | 68 | 69 | #ifdef __cplusplus 70 | extern "C" { 71 | #endif 72 | 73 | /* 74 | * Determine if the input is a power of two 75 | */ 76 | inline bool 77 | isPow2(unsigned int x) 78 | { 79 | return ((x&(x-1)) == 0); 80 | } 81 | 82 | 83 | /* 84 | * Compute the next highest power of two 85 | */ 86 | inline unsigned int 87 | ceilPow2(unsigned int x) 88 | { 89 | #if 0 90 | --x; 91 | x |= x >> 1; 92 | x |= x >> 2; 93 | x |= x >> 4; 94 | x |= x >> 8; 95 | x |= x >> 16; 96 | return ++x; 97 | #endif 98 | 99 | return (isPow2(x)) ? x : 1u << (int) ceil(log2((double)x)); 100 | } 101 | 102 | 103 | /* 104 | * Compute the next lowest power of two 105 | */ 106 | inline unsigned int 107 | floorPow2(unsigned int x) 108 | { 109 | #if 0 110 | float nf = (float) n; 111 | return 1 << (((*(int*)&nf) >> 23) - 127); 112 | #endif 113 | 114 | int exp; 115 | frexp(x, &exp); 116 | return 1 << (exp - 1); 117 | } 118 | 119 | 120 | /* 121 | * computes next highest multiple of f from x 122 | */ 123 | inline unsigned int 124 | multiple(unsigned int x, unsigned int f) 125 | { 126 | return ((x + (f-1)) / f); 127 | } 128 | 129 | 130 | /* 131 | * MS Excel-style CEIL() function. 
Rounds x up to nearest multiple of f 132 | */ 133 | inline unsigned int 134 | ceiling(unsigned int x, unsigned int f) 135 | { 136 | return multiple(x, f) * f; 137 | } 138 | 139 | 140 | #undef __asert 141 | 142 | #ifdef __cplusplus 143 | } 144 | #endif 145 | #endif 146 | 147 | -------------------------------------------------------------------------------- /examples/common/src/PrettyPrint.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- | 3 | -- Module : PrettyPrint 4 | -- Copyright : (c) 2009 Trevor L. McDonell 5 | -- License : BSD 6 | -- 7 | -- Simple layout and pretty printing 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | module PrettyPrint where 12 | 13 | import Data.List 14 | import Text.PrettyPrint 15 | import System.IO 16 | 17 | 18 | -------------------------------------------------------------------------------- 19 | -- Printing 20 | -------------------------------------------------------------------------------- 21 | 22 | printDoc :: Doc -> IO () 23 | printDoc = putStrLn . flip (++) "\n" . 
render 24 | 25 | -- 26 | -- stolen from $fptools/ghc/compiler/utils/Pretty.lhs 27 | -- 28 | -- This code has a BSD-style license 29 | -- 30 | printDocFull :: Mode -> Handle -> Doc -> IO () 31 | printDocFull m hdl doc = do 32 | fullRender m cols 1.5 put done doc 33 | hFlush hdl 34 | where 35 | put (Chr c) next = hPutChar hdl c >> next 36 | put (Str s) next = hPutStr hdl s >> next 37 | put (PStr s) next = hPutStr hdl s >> next 38 | 39 | done = hPutChar hdl '\n' 40 | cols = 80 41 | 42 | 43 | -------------------------------------------------------------------------------- 44 | -- Layout 45 | -------------------------------------------------------------------------------- 46 | 47 | -- 48 | -- Display the given grid of renderable data, given as either a list of rows or 49 | -- columns, using the minimum size required for each column. An additional 50 | -- parameter specifies extra space to be inserted between each column. 51 | -- 52 | ppAsRows :: Int -> [[Doc]] -> Doc 53 | ppAsRows q = ppAsColumns q . transpose 54 | 55 | ppAsColumns :: Int -> [[Doc]] -> Doc 56 | ppAsColumns q = vcat . map hsep . transpose . map (\col -> pad (width col) col) 57 | where 58 | len = length . render 59 | width = maximum . map len 60 | pad w = map (\x -> x <> (hcat $ replicate (w - (len x) + q) space)) 61 | 62 | -------------------------------------------------------------------------------- /examples/common/src/RandomVector.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts, ParallelListComp #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : RandomVector 5 | -- Copyright : (c) 2009 Trevor L. 
McDonell 6 | -- License : BSD 7 | -- 8 | -- Storable multi-dimensional arrays and lists of random numbers 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module RandomVector 13 | ( 14 | Storable, 15 | module RandomVector, 16 | module Data.Array.Storable 17 | ) 18 | where 19 | 20 | import Foreign (Ptr, Storable) 21 | import Control.Monad (join) 22 | import Control.Exception (evaluate) 23 | import Data.Array.Storable 24 | import System.Random 25 | 26 | 27 | -------------------------------------------------------------------------------- 28 | -- Arrays 29 | -------------------------------------------------------------------------------- 30 | 31 | type Vector e = StorableArray Int e 32 | type Matrix e = StorableArray (Int,Int) e 33 | 34 | withVector :: Vector e -> (Ptr e -> IO a) -> IO a 35 | withVector = withStorableArray 36 | 37 | withMatrix :: Matrix e -> (Ptr e -> IO a) -> IO a 38 | withMatrix = withStorableArray 39 | 40 | 41 | -- 42 | -- To ensure the array is fully evaluated, force one element 43 | -- 44 | evaluateArr :: (Ix i, MArray StorableArray e IO) 45 | => i -> StorableArray i e -> IO (StorableArray i e) 46 | evaluateArr l arr = (join $ evaluate (arr `readArray` l)) >> return arr 47 | 48 | 49 | -- 50 | -- Generate a new random array 51 | -- 52 | randomArrR :: (Ix i, Num e, Storable e, Random e, MArray StorableArray e IO) 53 | => (i,i) -> (e,e) -> IO (StorableArray i e) 54 | randomArrR (l,u) bnds = do 55 | rg <- newStdGen 56 | let -- The standard random number generator is too slow to generate really 57 | -- large vectors. Instead, we generate a short vector and repeat that. 58 | k = 1000 59 | rands = take k (randomRs bnds rg) 60 | 61 | newListArray (l,u) [rands !! 
(index (l,u) i`mod`k) | i <- range (l,u)] >>= evaluateArr l 62 | 63 | 64 | randomArr :: (Ix i, Num e, Storable e, Random e, MArray StorableArray e IO) 65 | => (i,i) -> IO (StorableArray i e) 66 | randomArr (l,u) = randomArrR (l,u) (-1,1) 67 | 68 | 69 | -- 70 | -- Verify similarity of two arrays 71 | -- 72 | verify :: (Ix i, Ord e, Fractional e, Storable e) 73 | => StorableArray i e -> StorableArray i e -> IO (Bool) 74 | verify ref arr = do 75 | as <- getElems arr 76 | bs <- getElems ref 77 | return (verifyList as bs) 78 | 79 | 80 | -------------------------------------------------------------------------------- 81 | -- Lists 82 | -------------------------------------------------------------------------------- 83 | 84 | randomListR :: (Num e, Random e, Storable e) => Int -> (e,e) -> IO [e] 85 | randomListR len bnds = do 86 | rg <- newStdGen 87 | let -- The standard random number generator is too slow to generate really 88 | -- large vectors. Instead, we generate a short vector and repeat that. 89 | k = 1000 90 | rands = take k (randomRs bnds rg) 91 | 92 | evaluate [rands !! (i`mod`k) | i <- [0..len-1]] 93 | 94 | randomList :: (Num e, Random e, Storable e) => Int -> IO [e] 95 | randomList len = randomListR len (-1,1) 96 | 97 | 98 | verifyList :: (Ord e, Fractional e) => [e] -> [e] -> Bool 99 | verifyList xs ys = all (< epsilon) [abs ((x-y)/(x+y+epsilon)) | x <- xs | y <- ys] 100 | where epsilon = 0.0005 101 | 102 | -------------------------------------------------------------------------------- /examples/common/src/Time.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- 3 | -- Module : Time 4 | -- Copyright : (c) 2009 Trevor L. 
McDonell 5 | -- License : BSD 6 | -- 7 | -- Simple timing benchmarks 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | module Time where 12 | 13 | import System.CPUTime 14 | import Control.Monad 15 | 16 | 17 | -- Timing 18 | -- 19 | data Time = Time { cpu_time :: Integer } 20 | 21 | type TimeUnit = Integer -> Integer 22 | 23 | picosecond, millisecond, second :: TimeUnit 24 | picosecond n = n 25 | millisecond n = n `div` 1000000000 26 | second n = n `div` 1000000000000 27 | 28 | getTime :: IO Time 29 | getTime = Time `fmap` getCPUTime 30 | 31 | timeIn :: TimeUnit -> Time -> Integer 32 | timeIn u (Time t) = u t 33 | 34 | elapsedTime :: Time -> Time -> Time 35 | elapsedTime (Time t1) (Time t2) = Time (t2 - t1) 36 | 37 | 38 | -- Simple benchmarking 39 | -- 40 | {-# NOINLINE benchmark #-} 41 | benchmark 42 | :: Int -- Number of times to repeat test 43 | -> IO a -- Test to run 44 | -> IO b -- Finaliser to before measuring elapsed time 45 | -> IO (Time,a) 46 | benchmark n testee finaliser = do 47 | t1 <- getTime 48 | (r:_) <- replicateM n testee 49 | _ <- finaliser 50 | t2 <- getTime 51 | return (elapsedTime t1 t2, r) 52 | 53 | -------------------------------------------------------------------------------- /examples/common/src/control.cpp: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Control 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #include "utils.h" 10 | #include 11 | 12 | typedef void* KernelPointer; 13 | 14 | /* 15 | * Stolen from the RadixSort example from the CUDA SDK. 
16 | */ 17 | 18 | void 19 | computeNumCTAs(KernelPointer kernel, int smemDynamicBytes, bool bManualCoalesce) 20 | { 21 | int deviceID = -1; 22 | cudaDeviceProp devprop; 23 | cudaFuncAttributes attr; 24 | 25 | assert(cudaSuccess == cudaGetDevice(&deviceID)); 26 | assert(cudaSuccess == cudaGetDeviceProperties(&devprop, deviceID)); 27 | assert(cudaSuccess == cudaFuncGetAttributes(&attr, (const char*)kernel)); 28 | 29 | /* 30 | * Determine the maximum number of CTAs that can be run simultaneously for 31 | * each kernel. This is equivalent to the calculation done in the CUDA 32 | * Occupancy Calculator spreadsheet 33 | */ 34 | const unsigned int regAllocationUnit = (devprop.major < 2 && devprop.minor < 2) ? 256 : 512; // in registers 35 | const unsigned int warpAllocationMultiple = 2; 36 | const unsigned int smemAllocationUnit = 512; // in bytes 37 | const unsigned int maxThreadsPerSM = bManualCoalesce ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024 38 | const unsigned int maxBlocksPerSM = 8; 39 | 40 | size_t numWarps; 41 | numWarps = multiple(RadixSort::CTA_SIZE, devprop.warpSize); // Number of warps (round up to nearest whole multiple of warp size) 42 | numWarps = ceiling(numWarps, warpAllocationMultiple); // Round up to warp allocation multiple 43 | 44 | size_t regsPerCTA; 45 | regsPerCTA = attr.numRegs * devprop.warpSize * numWarps; // Number of regs is (regs per thread * number of warps * warp size) 46 | regsPerCTA = ceiling(regsPerCTA, regAllocationUnit); // Round up to multiple of register allocation unit size 47 | 48 | size_t smemBytes = attr.sharedSizeBytes + smemDynamicBytes; 49 | size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit); 50 | 51 | size_t ctaLimitRegs = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM; 52 | size_t ctaLimitSMem = smemPerCTA > 0 ? 
devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM; 53 | size_t ctaLimitThreads = maxThreadsPerSM / RadixSort::CTA_SIZE; 54 | 55 | unsigned int numSMs = devprop.multiProcessorCount; 56 | unsigned int maxCTAs = numSMs * std::min(ctaLimitRegs, std::min(ctaLimitSMem, std::min(ctaLimitThreads, maxBlocksPerSM))); 57 | // setNumCTAs(kernel, maxCTAs); 58 | } 59 | 60 | 61 | -------------------------------------------------------------------------------- /examples/common/src/cudpp/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2007-2009 The Regents of the University of California, Davis 2 | campus ("The Regents") and NVIDIA Corporation ("NVIDIA"). All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | * Neither the name of the The Regents, nor NVIDIA, nor the names of its 13 | contributors may be used to endorse or promote products derived from this 14 | software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
19 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 20 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 24 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 25 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /examples/common/src/cudpp/scan_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------- 2 | // cuDPP -- CUDA Data Parallel Primitives library 3 | // ------------------------------------------------------------- 4 | // $Revision: 5633 $ 5 | // $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $ 6 | // ------------------------------------------------------------- 7 | // This source code is distributed under the terms of license.txt 8 | // in the root directory of this source distribution. 9 | // ------------------------------------------------------------- 10 | 11 | /** 12 | * @file 13 | * scan_kernel.cu 14 | * 15 | * @brief CUDPP kernel-level scan routines 16 | */ 17 | 18 | /** \defgroup cudpp_kernel CUDPP Kernel-Level API 19 | * The CUDPP Kernel-Level API contains functions that run on the GPU 20 | * device across a grid of Cooperative Thread Array (CTA, aka Thread 21 | * Block). These kernels are declared \c __global__ so that they 22 | * must be invoked from host (CPU) code. They generally invoke GPU 23 | * \c __device__ routines in the CUDPP \link cudpp_cta CTA-Level API\endlink. 24 | * Kernel-Level API functions are used by CUDPP 25 | * \link cudpp_app Application-Level\endlink functions to implement their 26 | * functionality. 
27 | * @{ 28 | */ 29 | 30 | /** @name Scan Functions 31 | * @{ 32 | */ 33 | 34 | #include "cudpp/cudpp_globals.h" 35 | #include "cudpp/scan_cta.cu" 36 | #include "cudpp/shared_mem.h" 37 | 38 | /** 39 | * @brief Main scan kernel 40 | * 41 | * This __global__ device function performs one level of a multiblock scan on 42 | * an arbitrary-dimensioned array in \a d_in, returning the result in \a d_out 43 | * (which may point to the same array). The same function may be used for 44 | * single or multi-row scans. To perform a multirow scan, pass the width of 45 | * each row of the input row (in elements) in \a dataRowPitch, and the width of 46 | * the rows of \a d_blockSums (in elements) in \a blockSumRowPitch, and invoke 47 | * with a thread block grid with height greater than 1. 48 | * 49 | * This function peforms one level of a recursive, multiblock scan. At the 50 | * app level, this function is called by cudppScan and cudppMultiScan and used 51 | * in combination with vectorAddUniform4() to produce a complete scan. 52 | * 53 | * Template parameter \a T is the datatype of the array to be scanned. 54 | * Template parameter \a traits is the ScanTraits struct containing 55 | * compile-time options for the scan, such as whether it is forward or 56 | * backward, exclusive or inclusive, multi- or single-row, etc. 
57 | * 58 | * @param[out] d_out The output (scanned) array 59 | * @param[in] d_in The input array to be scanned 60 | * @param[out] d_blockSums The array of per-block sums 61 | * @param[in] numElements The number of elements to scan 62 | * @param[in] dataRowPitch The width of each row of \a d_in in elements 63 | * (for multi-row scans) 64 | * @param[in] blockSumRowPitch The with of each row of \a d_blockSums in elements 65 | * (for multi-row scans) 66 | */ 67 | template 68 | __global__ void scan4(T *d_out, 69 | const T *d_in, 70 | T *d_blockSums, 71 | int numElements, 72 | unsigned int dataRowPitch, 73 | unsigned int blockSumRowPitch) 74 | { 75 | SharedMemory smem; 76 | T* temp = smem.getPointer(); 77 | 78 | int devOffset, ai, bi, aiDev, biDev; 79 | T threadScan0[4], threadScan1[4]; 80 | 81 | unsigned int blockN = numElements; 82 | unsigned int blockSumIndex = blockIdx.x; 83 | 84 | if (traits::isMultiRow()) 85 | { 86 | //int width = __mul24(gridDim.x, blockDim.x) << 1; 87 | int yIndex = __umul24(blockDim.y, blockIdx.y) + threadIdx.y; 88 | devOffset = __umul24(dataRowPitch, yIndex); 89 | blockN += (devOffset << 2); 90 | devOffset += __umul24(blockIdx.x, blockDim.x << 1); 91 | blockSumIndex += __umul24(blockSumRowPitch << 2, yIndex) ; 92 | } 93 | else 94 | { 95 | devOffset = __umul24(blockIdx.x, (blockDim.x << 1)); 96 | } 97 | 98 | // load data into shared memory 99 | loadSharedChunkFromMem4 100 | (temp, threadScan0, threadScan1, d_in, 101 | blockN, devOffset, ai, bi, aiDev, biDev); 102 | 103 | scanCTA(temp, d_blockSums, blockSumIndex); 104 | 105 | // write results to device memory 106 | storeSharedChunkToMem4 107 | (d_out, threadScan0, threadScan1, temp, 108 | blockN, devOffset, ai, bi, aiDev, biDev); 109 | 110 | } 111 | 112 | /** @} */ // end scan functions 113 | /** @} */ // end cudpp_kernel 114 | -------------------------------------------------------------------------------- /examples/common/src/cudpp/segmented_scan_kernel.cu: 
-------------------------------------------------------------------------------- 1 | // *************************************************************** 2 | // cuDPP -- CUDA Data Parallel Primitives library 3 | // ------------------------------------------------------------- 4 | // $Revision: 3505 $ 5 | // $Date: 2007-07-06 09:26:06 -0700 (Fri, 06 Jul 2007) $ 6 | // ------------------------------------------------------------- 7 | // This source code is distributed under the terms of license.txt in 8 | // the root directory of this source distribution. 9 | // ------------------------------------------------------------- 10 | 11 | /** 12 | * @file 13 | * segmented_scan_kernel.cu 14 | * 15 | * @brief CUDPP kernel-level scan routines 16 | */ 17 | 18 | /** \defgroup cudpp_kernel CUDPP Kernel-Level API 19 | * The CUDPP Kernel-Level API contains functions that run on the GPU 20 | * device across a grid of Cooperative Thread Array (CTA, aka Thread 21 | * Block). These kernels are declared \c __global__ so that they 22 | * must be invoked from host (CPU) code. They generally invoke GPU 23 | * \c __device__ routines in the CUDPP \link cudpp_cta CTA-Level API\endlink. 24 | * Kernel-Level API functions are used by CUDPP 25 | * \link cudpp_app Application-Level\endlink functions to implement their 26 | * functionality. 27 | * @{ 28 | */ 29 | 30 | /** @name Segmented scan Functions 31 | * @{ 32 | */ 33 | 34 | #include "cudpp/cudpp_globals.h" 35 | #include "cudpp/segmented_scan_cta.cu" 36 | #include "shared_mem.h" 37 | 38 | /** 39 | * @brief Main segmented scan kernel 40 | * 41 | * This __global__ device function performs one level of a multiblock 42 | * segmented scan on an one-dimensioned array in \a d_idata, returning the 43 | * result in \a d_odata (which may point to the same array). 44 | * 45 | * This function performs one level of a recursive, multiblock scan. 
At the 46 | * app level, this function is called by cudppSegmentedScan and used in combination 47 | * with either vectorSegmentedAddUniform4() (forward) or 48 | * vectorSegmentedAddUniformToRight4() (backward) to produce a complete segmented scan. 49 | * 50 | * Template parameter \a T is the datatype of the array to be scanned. 51 | * Template parameter \a traits is the SegmentedScanTraits struct containing 52 | * compile-time options for the segmented scan, such as whether it is forward 53 | * or backward, inclusive or exclusive, etc. 54 | * 55 | * @param[out] d_odata The output (scanned) array 56 | * @param[in] d_idata The input array to be scanned 57 | * @param[in] d_iflags The input array of flags 58 | * @param[out] d_blockSums The array of per-block sums 59 | * @param[out] d_blockFlags The array of per-block OR-reduction of flags 60 | * @param[out] d_blockIndices The array of per-block min-reduction of indices 61 | * @param[in] numElements The number of elements to scan 62 | */ 63 | template 64 | __global__ 65 | void segmentedScan4(T *d_odata, 66 | const T *d_idata, 67 | const unsigned int *d_iflags, 68 | unsigned int numElements, 69 | T *d_blockSums=0, 70 | unsigned int *d_blockFlags=0, 71 | unsigned int *d_blockIndices=0 72 | ) 73 | { 74 | SharedMemory smem; 75 | T* temp = smem.getPointer(); 76 | 77 | int ai, bi, aiDev, biDev; 78 | 79 | // Last index in shared memory which contains data 80 | unsigned int lastIdx = ((blockDim.x << 1)-1); 81 | 82 | // Chop up the shared memory into 4 contiguous spaces - the first 83 | // for the data, the second for the indices, the third for the 84 | // read-only version of the flags and the last for the read-write 85 | // version of the flags 86 | unsigned int* indices = (unsigned int *)(&(temp[lastIdx + 1])); 87 | unsigned int* flags = (unsigned int *)(&(temp[2*(lastIdx + 1)])); 88 | 89 | T threadScan0[4]; 90 | T threadScan1[4]; 91 | unsigned int threadFlag = 0; 92 | 93 | int devOffset = blockIdx.x * (blockDim.x << 1); 94 | 
95 | // load data into shared memory 96 | loadForSegmentedScanSharedChunkFromMem4( 97 | temp, threadScan0, threadScan1, threadFlag, 98 | flags, indices, d_idata, d_iflags, 99 | numElements, devOffset, ai, bi, aiDev, biDev); 100 | 101 | segmentedScanCTA( 102 | temp, flags, indices, 103 | d_blockSums, d_blockFlags, d_blockIndices); 104 | 105 | // write results to device memory 106 | storeForSegmentedScanSharedChunkToMem4( 107 | d_odata, threadScan0, threadScan1, threadFlag, 108 | temp, numElements, devOffset, ai, bi, aiDev, biDev); 109 | } 110 | 111 | /** @} */ // end scan functions 112 | /** @} */ // end cudpp_kernel 113 | -------------------------------------------------------------------------------- /examples/src/bandwidthTest/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := bandwidthTest 9 | 10 | HSMAIN := BandwidthTest.hs 11 | CUFILES := 12 | 13 | EXTRALIBS := 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/bandwidthTest/results/Bandwidth.numbers: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmcdonell/cuda/5bc08e3101946d1c9f7a35d5d57426b59b840479/examples/src/bandwidthTest/results/Bandwidth.numbers -------------------------------------------------------------------------------- /examples/src/bandwidthTest/results/GT120.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tmcdonell/cuda/5bc08e3101946d1c9f7a35d5d57426b59b840479/examples/src/bandwidthTest/results/GT120.pdf -------------------------------------------------------------------------------- /examples/src/bandwidthTest/results/Tesla.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmcdonell/cuda/5bc08e3101946d1c9f7a35d5d57426b59b840479/examples/src/bandwidthTest/results/Tesla.pdf -------------------------------------------------------------------------------- /examples/src/enum/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | STATIC_LIB := libenum.a 9 | 10 | CUFILES := enum.cu 11 | 12 | EXTRALIBS := stdc++ 13 | 14 | # ------------------------------------------------------------------------------ 15 | # Haskell/CUDA build system 16 | # ------------------------------------------------------------------------------ 17 | include ../../common/common.mk 18 | -------------------------------------------------------------------------------- /examples/src/enum/enum.cu: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Enum 4 | * Copyright : (c) 2009 Trevor L. 
/* -----------------------------------------------------------------------------
 *
 * Module    : Enum
 * Copyright : (c) 2009 Trevor L. McDonell
 * License   : BSD
 *
 * ---------------------------------------------------------------------------*/

#include "enum.h"

#include "algorithms.h"
#include "utils.h"
#include <stdint.h>


/*
 * Choose a launch configuration for `n` elements: one element per thread,
 * block size capped at MAX_THREADS, and enough blocks to cover the input.
 */
static void
enum_control(int32_t n, uint32_t &blocks, uint32_t &threads)
{
    threads = min(ceilPow2(n), MAX_THREADS);
    blocks  = (n + threads - 1) / threads;
}


/*
 * Write the arithmetic sequence [from, from+delta, ..] into `out`. The
 * comparison against `to` doubles as the bounds check for the spare threads
 * of the final block, whose `val` has already stepped past the limit.
 *
 * BUG FIX: the original body read
 *
 *     if (increasing) if (val <= to) out[idx] = val;
 *     else            if (val >= to) out[idx] = val;
 *
 * C++ attaches a dangling `else` to the NEAREST unmatched `if`, so the
 * second test became the else-branch of the inner conditional and a
 * decreasing sequence never wrote anything. Braces restore the intended
 * pairing.
 *
 * NOTE(review): the template parameter list was lost in transcription;
 * <typename T, bool increasing> reconstructed from the call sites — confirm
 * against the original file.
 */
template <typename T, bool increasing>
__global__ static void
enum_core(T *out, const T from, const T then, const T to)
{
    uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const T  val = from + (then - from) * idx;

    if (increasing) {
        if (val <= to) out[idx] = val;
    } else {
        if (val >= to) out[idx] = val;
    }
}

/*
 * Host-side driver: work out the sequence length and direction, pick a
 * launch configuration, and dispatch the statically-specialised kernel.
 */
template <typename T>
static void
enumFromThenTo(T *out, const T from, const T then, const T to)
{
    uint32_t threads;
    uint32_t blocks;
    int32_t  n          = 1 + (to - from) / (then - from);
    bool     increasing = then - from > 0;

    if (n <= 0) return;    // empty sequence: step moves away from `to`

    enum_control(n, blocks, threads);
    if (increasing) enum_core<T,true> <<<blocks,threads>>>(out, from, then, to);
    else            enum_core<T,false><<<blocks,threads>>>(out, from, then, to);
}


// -----------------------------------------------------------------------------
// Instances
// -----------------------------------------------------------------------------

/* enumFromTo over int32: unit stride, i.e. [l, l+1, .., u] */
void enumFromTo_i(int32_t *out, int32_t l, int32_t u)
{
    enumFromThenTo(out, l, l+1, u);
}
McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | 10 | #ifndef __ENUM_H__ 11 | #define __ENUM_H__ 12 | 13 | #define MAX_THREADS 128 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /examples/src/fold/Fold.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : Fold 5 | -- Copyright : (c) 2009 Trevor L. McDonell 6 | -- License : BSD 7 | -- 8 | -- Reduce a vector to a single value 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module Main where 13 | 14 | #include "fold.h" 15 | 16 | -- Friends 17 | import C2HS 18 | import Time 19 | import RandomVector 20 | 21 | -- System 22 | import Control.Exception 23 | import qualified Foreign.CUDA.Runtime as CUDA 24 | 25 | 26 | -------------------------------------------------------------------------------- 27 | -- Reference 28 | -------------------------------------------------------------------------------- 29 | 30 | foldRef :: Num e => [e] -> IO e 31 | foldRef xs = do 32 | (t,r) <- benchmark 100 (evaluate (foldl (+) 0 xs)) (return ()) 33 | putStrLn $ "== Reference: " ++ shows (fromInteger (timeIn millisecond t)/100::Float) " ms" 34 | return r 35 | 36 | -------------------------------------------------------------------------------- 37 | -- CUDA 38 | -------------------------------------------------------------------------------- 39 | 40 | -- 41 | -- Note that this requires two memory copies: once from a Haskell list to the C 42 | -- heap, and from there into the graphics card memory. See the `bandwidthTest' 43 | -- example for the atrocious performance of this operation. 44 | -- 45 | -- For this test, cheat a little and just time the pure computation. 
--------------------------------------------------------------------------------
-- CUDA
--------------------------------------------------------------------------------

--
-- Note that this requires two memory copies: once from a Haskell list to the C
-- heap, and from there into the graphics card memory. See the `bandwidthTest'
-- example for the atrocious performance of this operation.
--
-- For this test, cheat a little and just time the pure computation.
--
foldCUDA :: [Float] -> IO Float
foldCUDA xs = do
  let len = length xs
  CUDA.withListArray xs $ \d_xs -> do
    (t,r) <- benchmark 100 (fold_plusf d_xs len) CUDA.sync
    putStrLn $ "== CUDA: " ++ shows (fromInteger (timeIn millisecond t)/100::Float) " ms"
    return r

{# fun unsafe fold_plusf
  { withDP* `CUDA.DevicePtr Float'
  ,         `Int'
  }
  -> `Float' cFloatConv #}
  where
    withDP p a = CUDA.withDevicePtr p $ \p' -> a (castPtr p')


--------------------------------------------------------------------------------
-- Main
--------------------------------------------------------------------------------

main :: IO ()
main = do
  dev   <- CUDA.get
  props <- CUDA.props dev
  putStrLn $ "Using device " ++ show dev ++ ": " ++ CUDA.deviceName props

  xs   <- randomList 30000
  ref  <- foldRef  xs
  cuda <- foldCUDA xs

  -- BUG FIX: the original compared the *signed* relative error against the
  -- tolerance, so any negative deviation (cuda > ref, for positive ref)
  -- validated as "Ok!" no matter how large. Take the absolute value.
  putStrLn $ "== Validating: " ++ if abs ((ref-cuda)/ref) < 0.0001 then "Ok!" else "INVALID!"
3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := fold 9 | 10 | HSMAIN := Fold.chs 11 | CUFILES := fold.cu 12 | 13 | EXTRALIBS := stdc++ 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/fold/fold.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Fold 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #ifndef __FOLD_H__ 10 | #define __FOLD_H__ 11 | 12 | /* 13 | * Optimised for Tesla. 14 | * Maximum thread occupancy for your card may be achieved with different values. 15 | */ 16 | #define MAX_THREADS 128 17 | #define MAX_BLOCKS 64 18 | 19 | #ifdef __cplusplus 20 | extern "C" { 21 | #endif 22 | 23 | /* 24 | * Instances 25 | */ 26 | float fold_plusf(float *xs, int N); 27 | 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | #endif 33 | -------------------------------------------------------------------------------- /examples/src/foldSeg/foldSeg.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : FoldSeg 4 | * Copyright : (c) 2009 Trevor L. 
McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #ifndef __FOLD_H__ 10 | #define __FOLD_H__ 11 | 12 | /* 13 | * Optimised for Tesla. 14 | * Maximum thread occupancy for your card may be achieved with different values. 15 | */ 16 | #define MAX_THREADS 128 17 | #define MAX_BLOCKS 64 18 | #define WARP_SIZE 32 19 | #define ELTS_PER_THREAD 8 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | /* 26 | * Instances 27 | */ 28 | float foldSeg_plusf(float *xs, int N); 29 | 30 | 31 | #ifdef __cplusplus 32 | } 33 | #endif 34 | #endif 35 | 36 | -------------------------------------------------------------------------------- /examples/src/map2/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := map 9 | 10 | HSMAIN := Map.hs 11 | CUFILES := map.cu 12 | 13 | EXTRALIBS := stdc++ 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/map2/Map.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : Map 5 | -- Copyright : (c) 2010 Trevor L. 
McDonell 6 | -- License : BSD 7 | -- 8 | -------------------------------------------------------------------------------- 9 | 10 | module Main where 11 | 12 | import qualified Foreign.CUDA as CUDA 13 | 14 | import Foreign.Storable 15 | import Foreign.Storable.Tuple () 16 | import qualified Foreign.Storable.Traversable as Store 17 | 18 | instance Storable a => Storable [a] where 19 | sizeOf = Store.sizeOf 20 | alignment = Store.alignment 21 | peek = Store.peek (error "instance Traversable [a] is lazy, so we do not provide a real value here") 22 | poke = Store.poke 23 | 24 | main :: IO () 25 | main = 26 | let len = 16 27 | xs = take len [0..] :: [Int] 28 | ys = take len [0..] :: [Int] 29 | in 30 | CUDA.allocaArray len $ \d_out -> 31 | CUDA.withListArray xs $ \d_in0 -> 32 | CUDA.withListArray ys $ \d_in1 -> do 33 | CUDA.setConfig (1,1) (len,1,1) 0 Nothing 34 | CUDA.setParams [CUDA.VArg d_out, CUDA.VArg (d_in0, d_in1), CUDA.IArg len] 35 | CUDA.launch "map" 36 | 37 | out :: [Int] <- CUDA.peekListArray len d_out 38 | print out 39 | 40 | -------------------------------------------------------------------------------- /examples/src/map2/map.cu: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Map 4 | * Copyright : (c) 2010 Trevor L. 
/* -----------------------------------------------------------------------------
 *
 * Module    : Map
 * Copyright : (c) 2010 Trevor L. McDonell
 * License   : BSD
 *
 * ---------------------------------------------------------------------------*/


typedef unsigned int Ix;

typedef int TyOut;
typedef int TyIn0;
typedef int TyIn1;

typedef TyOut* ArrOut;

/* One logical element, gathered from the pair of input arrays */
typedef struct {
    TyIn0 a;
    TyIn1 b;
} ArrElem0;

/* Structure-of-arrays input: two parallel device arrays */
typedef struct {
    const TyIn0 *a;
    const TyIn1 *b;
} ArrIn0;


/* Gather the idx'th element of both component arrays into a single value */
static inline __device__
ArrElem0 indexArray0(const ArrIn0 arr, const Ix idx)
{
    ArrElem0 elem = { arr.a[idx], arr.b[idx] };
    return elem;
}

/* The function being mapped: pairwise addition of the two components */
__device__ static TyOut
apply(const ArrElem0 in0)
{
    return in0.a + in0.b;
}

/*
 * Apply the function to each element of the array. Each thread processes
 * multiple elements, striding the array by the grid size.
 */
extern "C"
__global__ void
map
(
    ArrOut              d_out,
    const ArrIn0        d_in0,
    const unsigned int  length
)
{
    const unsigned int gridSize = __umul24(blockDim.x, gridDim.x);
    unsigned int       idx      = __umul24(blockDim.x, blockIdx.x) + threadIdx.x;

    while (idx < length)
    {
        d_out[idx] = apply(indexArray0(d_in0, idx));
        idx       += gridSize;
    }
}

// vim:filetype=cuda.c
3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := matrixMul 9 | 10 | HSMAIN := MatrixMul.hs 11 | CUFILES := matrix_mul.cu 12 | 13 | EXTRALIBS := stdc++ 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/matrixMul/MatrixMul.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : MatrixMul 5 | -- Copyright : (c) 2009 Trevor L. McDonell 6 | -- License : BSD 7 | -- 8 | -- Matrix multiplication using runtime interface and execution control instead 9 | -- of calling C functions via the FFI. 
10 | -- 11 | -------------------------------------------------------------------------------- 12 | 13 | module Main where 14 | 15 | #include "matrix_mul.h" 16 | 17 | -- Friends 18 | import Time 19 | import RandomVector 20 | 21 | -- System 22 | import Data.Array 23 | import System.IO 24 | import Foreign 25 | import qualified Foreign.CUDA as CUDA 26 | 27 | 28 | -------------------------------------------------------------------------------- 29 | -- Reference 30 | -------------------------------------------------------------------------------- 31 | 32 | matMult :: (Num e, Storable e) => Matrix e -> Matrix e -> IO (Matrix e) 33 | matMult mx my = do 34 | x <- unsafeFreeze mx 35 | y <- unsafeFreeze my 36 | let ((li, lj), (ui, uj)) = bounds x 37 | ((li',lj'),(ui',uj')) = bounds y 38 | resBnds | (lj,uj) == (li',ui') = ((li,lj'),(ui,uj')) 39 | | otherwise = error "matrix dimensions must agree" 40 | 41 | newListArray resBnds [sum [x!(i,k) * y!(k,j) | k <- range (lj,uj)] 42 | | i <- range (li,ui) 43 | , j <- range (lj',uj') ] 44 | 45 | 46 | -------------------------------------------------------------------------------- 47 | -- CUDA 48 | -------------------------------------------------------------------------------- 49 | 50 | matMultCUDA :: (Num e, Storable e) => Matrix e -> Matrix e -> IO (Matrix e) 51 | matMultCUDA xs' ys' = doMult undefined xs' ys' 52 | where 53 | doMult :: (Num e', Storable e') => e' -> Matrix e' -> Matrix e' -> IO (Matrix e') 54 | doMult dummy xs ys = do 55 | 56 | -- Setup matrix parameters 57 | -- 58 | ((li, lj), (ui, uj)) <- getBounds xs 59 | ((li',lj'),(ui',uj')) <- getBounds ys 60 | let wx = rangeSize (lj,uj) 61 | hx = rangeSize (li,ui) 62 | wy = rangeSize (lj',uj') 63 | hy = rangeSize (li',ui') 64 | resBnds | wx == hy = ((li,lj'),(ui,uj')) 65 | | otherwise = error "matrix dimensions must agree" 66 | 67 | -- Allocate memory and copy test data 68 | -- 69 | CUDA.allocaArray (wx*hx) $ \d_xs -> do 70 | CUDA.allocaArray (wy*hy) $ \d_ys -> do 71 | 
CUDA.allocaArray (wy*hx) $ \d_zs -> do 72 | withMatrix xs $ \p -> CUDA.pokeArray (wx*hx) p d_xs 73 | withMatrix ys $ \p -> CUDA.pokeArray (wy*hy) p d_ys 74 | 75 | -- Launch the kernel 76 | -- 77 | let gridDim = (wy`div`BLOCK_SIZE, hx`div`BLOCK_SIZE) 78 | blockDim = (BLOCK_SIZE,BLOCK_SIZE,1) 79 | sharedMem = 2 * BLOCK_SIZE * BLOCK_SIZE * fromIntegral (sizeOf dummy) 80 | 81 | CUDA.setConfig gridDim blockDim sharedMem Nothing 82 | CUDA.setParams [CUDA.VArg d_xs, CUDA.VArg d_ys, CUDA.VArg d_zs, CUDA.IArg wx, CUDA.IArg wy] 83 | CUDA.launch "matrixMul" 84 | 85 | -- Copy back result 86 | zs <- newArray_ resBnds 87 | withMatrix zs $ \p -> CUDA.peekArray (wy*hx) d_zs p 88 | return zs 89 | 90 | 91 | -------------------------------------------------------------------------------- 92 | -- Main 93 | -------------------------------------------------------------------------------- 94 | 95 | main :: IO () 96 | main = do 97 | dev <- CUDA.get 98 | props <- CUDA.props dev 99 | putStrLn $ "Using device " ++ show dev ++ ": " ++ CUDA.deviceName props 100 | 101 | xs <- randomArr ((1,1),(8*BLOCK_SIZE, 4*BLOCK_SIZE)) :: IO (Matrix Float) 102 | ys <- randomArr ((1,1),(4*BLOCK_SIZE,12*BLOCK_SIZE)) :: IO (Matrix Float) 103 | 104 | putStr "== Reference: " >> hFlush stdout 105 | (tr,ref) <- benchmark 100 (matMult xs ys) (return ()) 106 | putStrLn $ shows (fromInteger (timeIn millisecond tr) / 100::Float) " ms" 107 | 108 | putStr "== CUDA: " >> hFlush stdout 109 | (tc,mat) <- benchmark 100 (matMultCUDA xs ys) (CUDA.sync) 110 | putStrLn $ shows (fromInteger (timeIn millisecond tc) / 100::Float) " ms" 111 | 112 | putStr "== Validating: " 113 | verify ref mat >>= \rv -> putStrLn $ if rv then "Ok!" else "INVALID!" 114 | 115 | -------------------------------------------------------------------------------- /examples/src/matrixMul/matrix_mul.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. 
3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property and 5 | * proprietary rights in and to this software and related documentation. 6 | * Any use, reproduction, disclosure, or distribution of this software 7 | * and related documentation without an express license agreement from 8 | * NVIDIA Corporation is strictly prohibited. 9 | * 10 | * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 | * associated with this source code for terms and conditions that govern 12 | * your use of this NVIDIA software. 13 | * 14 | */ 15 | 16 | /* Matrix multiplication: C = A * B. 17 | * Device code. 18 | */ 19 | 20 | #ifndef _MATRIXMUL_KERNEL_H_ 21 | #define _MATRIXMUL_KERNEL_H_ 22 | 23 | #include 24 | #include "matrix_mul.h" 25 | 26 | #define CHECK_BANK_CONFLICTS 0 27 | #if CHECK_BANK_CONFLICTS 28 | #define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j)) 29 | #define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j)) 30 | #else 31 | #define AS(i, j) As[i][j] 32 | #define BS(i, j) Bs[i][j] 33 | #endif 34 | 35 | //////////////////////////////////////////////////////////////////////////////// 36 | //! Matrix multiplication on the device: C = A * B 37 | //! 
////////////////////////////////////////////////////////////////////////////////
//! Tiled matrix multiplication on the device: C = A * B.
//! wA is A's width and wB is B's width; all dimensions are assumed to be
//! multiples of BLOCK_SIZE.
////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void
matrixMul(float* A, float* B, float* C, int wA, int wB)
{
    // Shared-memory tiles holding the current sub-matrices of A and B.
    // (AS/BS index through these so the CHECK_BANK_CONFLICTS hook still works.)
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // First and last tile of A for this block row, plus the strides used to
    // walk A along its row and B down its column, one tile per iteration.
    const int aBegin = wA * BLOCK_SIZE * blockIdx.y;
    const int aEnd   = aBegin + wA - 1;
    const int aStep  = BLOCK_SIZE;
    const int bBegin = BLOCK_SIZE * blockIdx.x;
    const int bStep  = BLOCK_SIZE * wB;

    // Running dot product for the single output element owned by this thread.
    float acc = 0;

    // March the paired tiles of A and B across the shared dimension.
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
    {
        // Stage one element of each input tile per thread.
        AS(ty, tx) = A[a + wA * ty + tx];
        BS(ty, tx) = B[b + wB * ty + tx];

        // Wait until both tiles are fully populated before reading them.
        __syncthreads();

        // Accumulate the partial dot product contributed by this tile.
        for (int k = 0; k < BLOCK_SIZE; ++k)
            acc += AS(ty, k) * BS(k, tx);

        // All threads must be done reading before the tiles are overwritten.
        __syncthreads();
    }

    // Write back: one element of the output block per thread.
    const int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x;
    C[c + wB * ty + tx] = acc;
}

#endif // #ifndef _MATRIXMUL_KERNEL_H_
13 | * 14 | */ 15 | 16 | #ifndef _MATRIXMUL_H_ 17 | #define _MATRIXMUL_H_ 18 | 19 | /* Thread block size */ 20 | #define BLOCK_SIZE 16 21 | 22 | /* Matrix dimensions 23 | * (chosen as multiples of the thread block size for simplicity) 24 | */ 25 | #define WA (3 * BLOCK_SIZE) /* Matrix A width */ 26 | #define HA (5 * BLOCK_SIZE) /* Matrix A height */ 27 | #define WB (8 * BLOCK_SIZE) /* Matrix B width */ 28 | #define HB WA /* Matrix B height */ 29 | #define WC WB /* Matrix C width */ 30 | #define HC HA /* Matrix C height */ 31 | 32 | #endif /* _MATRIXMUL_H_ */ 33 | 34 | -------------------------------------------------------------------------------- /examples/src/matrixMulDrv/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := matrixMulDrv 9 | 10 | HSMAIN := MatrixMul.hs 11 | PTXFILES := matrix_mul.cu 12 | 13 | USEDRVAPI := 1 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/matrixMulDrv/MatrixMul.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : MatrixMul 5 | -- Copyright : (c) 2009 Trevor L. 
McDonell 6 | -- License : BSD 7 | -- 8 | -- Matrix multiplication using driver interface 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module Main where 13 | 14 | #include "matrix_mul.h" 15 | 16 | -- Friends 17 | import RandomVector 18 | 19 | -- System 20 | import Numeric 21 | import Data.Array 22 | import Control.Exception 23 | import Data.Array.Storable 24 | import Foreign.Storable 25 | import qualified Data.ByteString.Char8 as B 26 | import qualified Foreign.CUDA.Driver as CUDA 27 | 28 | 29 | -- Return the (width,height) of a matrix 30 | -- 31 | getSize :: Storable e => Matrix e -> IO (Int,Int) 32 | getSize mat = do 33 | ((li,lj),(ui,uj)) <- getBounds mat 34 | return (rangeSize (lj,uj), rangeSize (li,ui)) 35 | 36 | -------------------------------------------------------------------------------- 37 | -- Reference implementation 38 | -------------------------------------------------------------------------------- 39 | 40 | matMult :: (Num e, Storable e) => Matrix e -> Matrix e -> IO (Matrix e) 41 | matMult mx my = do 42 | x <- unsafeFreeze mx 43 | y <- unsafeFreeze my 44 | let ((li, lj), (ui, uj)) = bounds x 45 | ((li',lj'),(ui',uj')) = bounds y 46 | resBnds | (lj,uj) == (li',ui') = ((li,lj'),(ui,uj')) 47 | | otherwise = error "matrix dimensions must agree" 48 | 49 | newListArray resBnds [sum [x!(i,k) * y!(k,j) | k <- range (lj,uj)] 50 | | i <- range (li,ui) 51 | , j <- range (lj',uj') ] 52 | 53 | 54 | -------------------------------------------------------------------------------- 55 | -- CUDA 56 | -------------------------------------------------------------------------------- 57 | 58 | -- 59 | -- Initialise the device and context. Load the PTX source code, and return a 60 | -- reference to the kernel function. 
61 | -- 62 | initCUDA :: IO (CUDA.Context, CUDA.Fun) 63 | initCUDA = do 64 | CUDA.initialise [] 65 | dev <- CUDA.device 0 66 | ctx <- CUDA.create dev [] 67 | ptx <- B.readFile "data/matrix_mul.ptx" 68 | (mdl,r) <- CUDA.loadDataEx ptx [CUDA.ThreadsPerBlock (BLOCK_SIZE*BLOCK_SIZE)] 69 | fun <- CUDA.getFun mdl "matrixMul" 70 | 71 | putStrLn $ ">> PTX JIT compilation (" ++ showFFloat (Just 2) (CUDA.jitTime r) " ms)" 72 | B.putStrLn (CUDA.jitInfoLog r) 73 | return (ctx,fun) 74 | 75 | 76 | -- 77 | -- Allocate some memory, and copy over the input data to the device. Should 78 | -- probably catch allocation exceptions individually... 79 | -- 80 | initData :: (Num e, Storable e) 81 | => Matrix e -> Matrix e -> IO (CUDA.DevicePtr e, CUDA.DevicePtr e, CUDA.DevicePtr e) 82 | initData xs ys = do 83 | (wx,hx) <- getSize xs 84 | (wy,hy) <- getSize ys 85 | dxs <- CUDA.mallocArray (wx*hx) 86 | dys <- CUDA.mallocArray (wy*hy) 87 | res <- CUDA.mallocArray (wy*hx) 88 | 89 | flip onException (mapM_ CUDA.free [dxs,dys,res]) $ do 90 | withMatrix xs $ \p -> CUDA.pokeArray (wx*hx) p dxs 91 | withMatrix ys $ \p -> CUDA.pokeArray (wy*hy) p dys 92 | return (dxs, dys, res) 93 | 94 | 95 | -- 96 | -- Run the test 97 | -- 98 | testCUDA :: (Num e, Storable e) => Matrix e -> Matrix e -> IO (Matrix e) 99 | testCUDA xs' ys' = doTest undefined xs' ys' 100 | where 101 | doTest :: (Num e', Storable e') => e' -> Matrix e' -> Matrix e' -> IO (Matrix e') 102 | doTest dummy xs ys = do 103 | (widthX,heightX) <- getSize xs 104 | (widthY,_) <- getSize ys 105 | ((li, lj), (ui, uj)) <- getBounds xs 106 | ((li',lj'),(ui',uj')) <- getBounds ys 107 | let resBnds | (lj,uj) == (li',ui') = ((li,lj'),(ui,uj')) 108 | | otherwise = error "matrix dimensions must agree" 109 | 110 | -- Initialise environment and copy over test data 111 | -- 112 | putStrLn ">> Initialising" 113 | bracket initCUDA (\(ctx,_) -> CUDA.destroy ctx) $ \(_,matMul) -> do 114 | 115 | -- Ensure we release the memory, even if there was an error 116 | 
-- 117 | putStrLn ">> Executing" 118 | bracket 119 | (initData xs ys) 120 | (\(dx,dy,dz) -> mapM_ CUDA.free [dx,dy,dz]) $ 121 | \(dx,dy,dz) -> do 122 | -- Repeat test many times... 123 | -- 124 | CUDA.setParams matMul [CUDA.VArg dx, CUDA.VArg dy, CUDA.VArg dz, CUDA.IArg widthX, CUDA.IArg widthY] 125 | CUDA.setBlockShape matMul (BLOCK_SIZE,BLOCK_SIZE,1) 126 | CUDA.setSharedSize matMul (fromIntegral (2 * BLOCK_SIZE * BLOCK_SIZE * sizeOf dummy)) 127 | CUDA.launch matMul (widthY `div` BLOCK_SIZE, heightX `div` BLOCK_SIZE) Nothing 128 | CUDA.sync 129 | 130 | -- Copy back result 131 | -- 132 | zs <- newArray_ resBnds 133 | withMatrix zs $ \p -> CUDA.peekArray (widthY*heightX) dz p 134 | return zs 135 | 136 | 137 | -------------------------------------------------------------------------------- 138 | -- Test & Verify 139 | -------------------------------------------------------------------------------- 140 | 141 | main :: IO () 142 | main = do 143 | putStrLn "== Generating random matrices" 144 | xs <- randomArr ((1,1),(8*BLOCK_SIZE, 4*BLOCK_SIZE)) :: IO (Matrix Float) 145 | ys <- randomArr ((1,1),(4*BLOCK_SIZE,12*BLOCK_SIZE)) :: IO (Matrix Float) 146 | 147 | putStrLn "== Generating reference solution" 148 | ref <- matMult xs ys 149 | 150 | putStrLn "== Testing CUDA" 151 | mat <- testCUDA xs ys 152 | 153 | putStr "== Validating: " 154 | verify ref mat >>= \rv -> putStrLn $ if rv then "Ok!" else "INVALID!" 155 | 156 | -------------------------------------------------------------------------------- /examples/src/matrixMulDrv/matrix_mul.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property and 5 | * proprietary rights in and to this software and related documentation. 
6 | * Any use, reproduction, disclosure, or distribution of this software 7 | * and related documentation without an express license agreement from 8 | * NVIDIA Corporation is strictly prohibited. 9 | * 10 | * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 | * associated with this source code for terms and conditions that govern 12 | * your use of this NVIDIA software. 13 | * 14 | */ 15 | 16 | /* Matrix multiplication: C = A * B. 17 | * Device code. 18 | */ 19 | 20 | #ifndef _MATRIXMUL_KERNEL_H_ 21 | #define _MATRIXMUL_KERNEL_H_ 22 | 23 | #include 24 | #include "matrix_mul.h" 25 | 26 | #define CHECK_BANK_CONFLICTS 0 27 | #if CHECK_BANK_CONFLICTS 28 | #define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j)) 29 | #define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j)) 30 | #else 31 | #define AS(i, j) As[i][j] 32 | #define BS(i, j) Bs[i][j] 33 | #endif 34 | 35 | //////////////////////////////////////////////////////////////////////////////// 36 | //! Matrix multiplication on the device: C = A * B 37 | //! 
wA is A's width and wB is B's width 38 | //////////////////////////////////////////////////////////////////////////////// 39 | extern "C" __global__ void 40 | matrixMul(float* A, float* B, float* C, int wA, int wB) 41 | { 42 | // Block index 43 | int bx = blockIdx.x; 44 | int by = blockIdx.y; 45 | 46 | // Thread index 47 | int tx = threadIdx.x; 48 | int ty = threadIdx.y; 49 | 50 | // Index of the first sub-matrix of A processed by the block 51 | int aBegin = wA * BLOCK_SIZE * by; 52 | 53 | // Index of the last sub-matrix of A processed by the block 54 | int aEnd = aBegin + wA - 1; 55 | 56 | // Step size used to iterate through the sub-matrices of A 57 | int aStep = BLOCK_SIZE; 58 | 59 | // Index of the first sub-matrix of B processed by the block 60 | int bBegin = BLOCK_SIZE * bx; 61 | 62 | // Step size used to iterate through the sub-matrices of B 63 | int bStep = BLOCK_SIZE * wB; 64 | 65 | // Csub is used to store the element of the block sub-matrix 66 | // that is computed by the thread 67 | float Csub = 0; 68 | 69 | // Loop over all the sub-matrices of A and B 70 | // required to compute the block sub-matrix 71 | for (int a = aBegin, b = bBegin; 72 | a <= aEnd; 73 | a += aStep, b += bStep) { 74 | 75 | // Declaration of the shared memory array As used to 76 | // store the sub-matrix of A 77 | __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; 78 | 79 | // Declaration of the shared memory array Bs used to 80 | // store the sub-matrix of B 81 | __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; 82 | 83 | // Load the matrices from device memory 84 | // to shared memory; each thread loads 85 | // one element of each matrix 86 | AS(ty, tx) = A[a + wA * ty + tx]; 87 | BS(ty, tx) = B[b + wB * ty + tx]; 88 | 89 | // Synchronize to make sure the matrices are loaded 90 | __syncthreads(); 91 | 92 | // Multiply the two matrices together; 93 | // each thread computes one element 94 | // of the block sub-matrix 95 | for (int k = 0; k < BLOCK_SIZE; ++k) 96 | Csub += AS(ty, k) * BS(k, tx); 
97 | 98 | // Synchronize to make sure that the preceding 99 | // computation is done before loading two new 100 | // sub-matrices of A and B in the next iteration 101 | __syncthreads(); 102 | } 103 | 104 | // Write the block sub-matrix to device memory; 105 | // each thread writes one element 106 | int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; 107 | C[c + wB * ty + tx] = Csub; 108 | } 109 | 110 | #endif // #ifndef _MATRIXMUL_KERNEL_H_ 111 | -------------------------------------------------------------------------------- /examples/src/matrixMulDrv/matrix_mul.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property and 5 | * proprietary rights in and to this software and related documentation. 6 | * Any use, reproduction, disclosure, or distribution of this software 7 | * and related documentation without an express license agreement from 8 | * NVIDIA Corporation is strictly prohibited. 9 | * 10 | * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 | * associated with this source code for terms and conditions that govern 12 | * your use of this NVIDIA software. 
13 | * 14 | */ 15 | 16 | #ifndef _MATRIXMUL_H_ 17 | #define _MATRIXMUL_H_ 18 | 19 | /* Thread block size */ 20 | #define BLOCK_SIZE 16 21 | 22 | /* Matrix dimensions 23 | * (chosen as multiples of the thread block size for simplicity) 24 | */ 25 | #define WA (3 * BLOCK_SIZE) /* Matrix A width */ 26 | #define HA (5 * BLOCK_SIZE) /* Matrix A height */ 27 | #define WB (8 * BLOCK_SIZE) /* Matrix B width */ 28 | #define HB WA /* Matrix B height */ 29 | #define WC WB /* Matrix C width */ 30 | #define HC HA /* Matrix C height */ 31 | 32 | #endif /* _MATRIXMUL_H_ */ 33 | 34 | -------------------------------------------------------------------------------- /examples/src/multiGPU/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := multiGPU 9 | 10 | HSMAIN := multiGPU.hs 11 | CUFILES := simpleMultiGPU_kernel.cu 12 | 13 | EXTRALIBS := stdc++ 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | 20 | -------------------------------------------------------------------------------- /examples/src/multiGPU/multiGPU.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ParallelListComp, CPP, BangPatterns, ForeignFunctionInterface #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : MultiGPU 5 | -- Copyright : (c) 2012 Kevin Ying 6 | -- License : BSD 7 | -- 8 | -- An implemenations of Nvidia's simpleMultiGPU example 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 
| module Main where 13 | 14 | import Time 15 | 16 | -- System 17 | import System.Random.MWC 18 | import Data.List 19 | import Control.Monad 20 | import Foreign 21 | import Foreign.CUDA (HostPtr, DevicePtr, withDevicePtr, withHostPtr) 22 | import Foreign.CUDA.Runtime.Stream as CR 23 | import qualified Foreign.CUDA.Runtime as CR 24 | 25 | import qualified Data.Vector.Unboxed as U 26 | 27 | -- Settings 28 | data_n = 1048576 * 32 29 | block_n = 32 30 | thread_n = 256 31 | accum_n = block_n * thread_n 32 | 33 | -- 34 | -- |A set of data prepared for execution on a GPU 35 | -- 36 | data Plan = Plan 37 | { 38 | device :: Int, -- ^ device number 39 | stream :: Stream, -- ^ stream on the device 40 | dataN :: Int, -- ^ number of elements in data 41 | v_data :: U.Vector Float, -- ^ host data 42 | h_data :: HostPtr Float, -- ^ host data in pagelocked memory 43 | 44 | d_data :: DevicePtr Float, -- ^ device data 45 | d_sum :: DevicePtr Float, -- ^ device results 46 | 47 | h_sum_from_device :: HostPtr Float -- ^ results from device 48 | } 49 | deriving Show 50 | 51 | 52 | main :: IO () 53 | main = do 54 | gpu_n <- CR.count 55 | putStrLn $ "Number of GPUs found: " ++ show gpu_n 56 | 57 | -- Execute on all GPUs 58 | putStrLn $ "Executing " ++ (show gpu_n) ++ " sets of data over " ++ (show gpu_n) ++ " GPU(s)" 59 | withPlans gpu_n gpu_n executePlans 60 | 61 | -- Use only first GPU 62 | putStrLn $ "Executing " ++ (show gpu_n) ++ " sets of data over 1 GPU" 63 | withPlans gpu_n 1 executePlans 64 | 65 | putStrLn "Finished" 66 | 67 | 68 | -- |Launches kernels on the specified gpu and compares with the CPU 69 | executePlans :: [Plan] -> IO () 70 | executePlans plans = do 71 | --Copy data to GPU, launch Kernel, copy data back all asynchronously 72 | (t,_) <- flip (benchmark 100) CR.sync $ forM_ plans $ \plan -> do 73 | CR.set (device plan) 74 | --Copy input data from CPU 75 | CR.pokeArrayAsync (dataN plan) (h_data plan) (d_data plan) (Just $ stream plan) 76 | --Perform GPU computations 77 | 
reduceKernel (d_sum plan) (d_data plan) (dataN plan) block_n thread_n (stream plan)
        -- Read back GPU results
        CR.peekArrayAsync accum_n (d_sum plan) (h_sum_from_device plan) (Just $ stream plan)

  -- Process results: wait for each device's stream, then reduce the partial
  -- sums the kernel left in page-locked host memory
  h_sumGPU <- forM plans $ \plan -> do
    CR.set (device plan)
    CR.block (stream plan)
    hs_sum <- withHostPtr (h_sum_from_device plan) $ \ptr -> peekArray accum_n ptr
    return $ sum hs_sum
  let sumGPU = sum h_sumGPU

  putStrLn $ "  GPU processing time: " ++ (show $ timeIn millisecond t) ++ " ms"

  -- Reference result computed on the CPU from the original host vectors
  let sumCPU = sum $ map (\p -> U.sum $ v_data p) plans

  let diff = (abs $ sumCPU - sumGPU) / (abs sumCPU)
  putStrLn $ "  GPU sum: " ++ show sumGPU
  putStrLn $ "  CPU sum: " ++ show sumCPU
  putStrLn $ "  Relative difference: " ++ show diff

  if (diff < 1e-4) then
      putStrLn $ "Passed!"
    else
      putStrLn $ "Failed!"

-- |Makes n plans spread over ndevices
withPlans :: Int -> Int -> ([Plan] -> IO ()) -> IO ()
withPlans n ndevices action = do
  devStrms' <- forM [0..(ndevices-1)] $ \dev -> do
    CR.set dev
    strm <- CR.create
    return (dev, strm)

  -- Round-robin the n data sets over the available devices/streams
  let devStrms = take n $ cycle devStrms'

  plans <- forM devStrms $ \(i, stream) -> do
    -- Distribute any remainder of data_n over the first few plans
    let dataN = if (i < data_n `mod` n) then (data_n `div` n) + 1 else data_n `div` n
    CR.set i
    -- Allocate memory
    -- Host: page-locked arrays so the async copies can overlap
    v_data <- randomList dataN
    h_data <- CR.mallocHostArray [] dataN
    withHostPtr h_data $ \ptr -> pokeArray ptr (U.toList v_data)

    h_sum_from_device <- CR.mallocHostArray [] accum_n

    -- Device
    -- FIX: mallocArray takes a count of *elements* (the Storable instance
    -- supplies the element size); the original multiplied by
    -- sizeOf (undefined :: DevicePtr Float), over-allocating each buffer by
    -- a factor of the pointer size.
    d_data <- CR.mallocArray dataN
    d_sum  <- CR.mallocArray accum_n

    return $ Plan i stream dataN v_data h_data d_data d_sum h_sum_from_device

  action plans

  -- clean up
  forM_ plans $ \plan
-> do 136 | CR.set (device plan) 137 | CR.freeHost (h_data plan) 138 | CR.freeHost (h_sum_from_device plan) 139 | 140 | CR.free (d_data plan) 141 | CR.free (d_sum plan) 142 | --CR.destroy (stream plan) 143 | forM_ devStrms' $ \(_,stream) -> CR.destroy stream 144 | 145 | 146 | randomList :: Int -> IO (U.Vector Float) 147 | randomList n = withSystemRandom $ \gen -> U.replicateM n (uniform gen :: IO Float) 148 | 149 | -- |Binding for the reduce kernel 150 | reduceKernel :: DevicePtr Float -> DevicePtr Float -> Int -> Int -> Int -> Stream -> IO () 151 | reduceKernel a1 a2 a3 a4 a5 a6 = 152 | withDevicePtr a1 $ \a1' -> 153 | withDevicePtr a2 $ \a2' -> 154 | reduceKernel'_ a1' a2' a3 a4 a5 a6 155 | 156 | foreign import ccall unsafe "simpleMultiGPU.h launch_reduceKernel" 157 | reduceKernel'_ :: Ptr Float -> Ptr Float -> Int -> Int -> Int -> Stream -> IO () 158 | -------------------------------------------------------------------------------- /examples/src/multiGPU/simpleMultiGPU.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | /* 13 | * This application demonstrates how to use the CUDA API to use multiple GPUs. 14 | * 15 | * Note that in order to detect multiple GPUs in your system you have to disable 16 | * SLI in the nvidia control panel. Otherwise only one GPU is visible to the 17 | * application. On the other side, you can still extend your desktop to screens 18 | * attached to both GPUs. 
19 | */ 20 | 21 | #ifndef SIMPLEMULTIGPU_H 22 | #define SIMPLEMULTIGPU_H 23 | 24 | typedef struct { 25 | //Host-side input data 26 | int dataN; 27 | float *h_Data; 28 | 29 | //Partial sum for this GPU 30 | float *h_Sum; 31 | 32 | //Device buffers 33 | float *d_Data,*d_Sum; 34 | 35 | //Reduction copied back from GPU 36 | float *h_Sum_from_device; 37 | 38 | //Stream for asynchronous command execution 39 | cudaStream_t stream; 40 | 41 | } TGPUplan; 42 | 43 | extern "C" 44 | void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N, cudaStream_t s); 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /examples/src/multiGPU/simpleMultiGPU_kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | /* 13 | * This application demonstrates how to use the CUDA API to use multiple GPUs. 14 | * 15 | * Note that in order to detect multiple GPUs in your system you have to disable 16 | * SLI in the nvidia control panel. Otherwise only one GPU is visible to the 17 | * application. On the other side, you can still extend your desktop to screens 18 | * attached to both GPUs. 19 | */ 20 | 21 | #include 22 | 23 | //////////////////////////////////////////////////////////////////////////////// 24 | // Simple reduction kernel. 
25 | // Refer to the 'reduction' CUDA SDK sample describing 26 | // reduction optimization strategies 27 | //////////////////////////////////////////////////////////////////////////////// 28 | __global__ static void reduceKernel(float *d_Result, float *d_Input, int N){ 29 | const int tid = blockIdx.x * blockDim.x + threadIdx.x; 30 | const int threadN = gridDim.x * blockDim.x; 31 | float sum = 0; 32 | for(int pos = tid; pos < N; pos += threadN) 33 | sum += d_Input[pos]; 34 | 35 | d_Result[tid] = sum; 36 | } 37 | 38 | extern "C" 39 | void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N, cudaStream_t s) 40 | { 41 | reduceKernel<<>>(d_Result, d_Input, N); 42 | } 43 | 44 | -------------------------------------------------------------------------------- /examples/src/permute/permute.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Module : Prelude 3 | * Copyright : (c) 2009 Trevor L. McDonell 4 | * License : BSD 5 | */ 6 | 7 | #include "utils.h" 8 | 9 | 10 | static void 11 | permute_control 12 | ( 13 | unsigned int n, 14 | unsigned int &blocks, 15 | unsigned int &threads, 16 | unsigned int maxThreads = MAX_THREADS, 17 | unsigned int maxBlocks = MAX_BLOCKS 18 | ) 19 | { 20 | threads = min(ceilPow2(n), MAX_THREADS); 21 | blocks = (n + threads - 1) / threads; 22 | // blocks = min(blocks, maxBlocks); 23 | } 24 | 25 | 26 | /* 27 | * Permute an array according to the permutation indices. This handles both 28 | * forward and backward permutation, where: 29 | * 30 | * bpermute :: [a] -> [Int] -> [a] 31 | * bpermute v is = [ v!i | i <- is ] 32 | * 33 | * In this case, `length' specifies the number of elements in the `indices' and 34 | * `out' arrays. 35 | * 36 | * An alternative to back permute, where we do not explicitly know the offset 37 | * indices, is to use an array of [0,1] flags specifying valid elements, which 38 | * we can exclusive-sum-scan to get the offsets. 
In this case, `length' 39 | * specifies the number of elements in the `in' array. The template parameter 40 | * `backward' specifies whether the offsets were calculated with a left or right 41 | * scan. 42 | * 43 | * We return the number of valid elements found via `num_valid', but blunder on 44 | * ahead regardless of whether the `out' array is large enough or not. 45 | */ 46 | template 47 | __global__ static void 48 | permute_core 49 | ( 50 | const T *in, 51 | T *out, 52 | unsigned int *indices, 53 | unsigned int length, 54 | unsigned int *valid = NULL, 55 | unsigned int *num_valid = NULL 56 | ) 57 | { 58 | unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; 59 | 60 | /* 61 | * Return the number of valid entries found 62 | */ 63 | if (compact && threadIdx.x == 0) 64 | { 65 | if (backward) 66 | num_valid[0] = valid[0] + indices[0]; 67 | else 68 | num_valid[0] = valid[length-1] + indices[length-1]; 69 | } 70 | 71 | if (idx < length) 72 | { 73 | if (compact && valid[idx]) 74 | out[indices[idx]] = in[idx]; 75 | else if (backward) 76 | out[idx] = in[indices[idx]]; 77 | else 78 | out[indices[idx]] = in[idx]; 79 | } 80 | } 81 | 82 | 83 | template 84 | static void 85 | permute 86 | ( 87 | const T *in, 88 | T *out, 89 | unsigned int *indices, 90 | unsigned int length, 91 | unsigned int *valid = NULL, 92 | unsigned int *num_valid = NULL 93 | ) 94 | { 95 | unsigned int threads; 96 | unsigned int blocks; 97 | 98 | permute_control(length, blocks, threads); 99 | permute_core< T,backward,compact > 100 | <<>>(in, out, indices, length, valid, num_valid); 101 | } 102 | 103 | 104 | template 105 | static int 106 | compact 107 | ( 108 | const T *in, 109 | T *out, 110 | unsigned int *flags, 111 | unsigned int length 112 | ) 113 | { 114 | unsigned int N; 115 | unsigned int *num; 116 | unsigned int *indices; 117 | 118 | cudaMalloc((void**) &num, sizeof(unsigned int)); 119 | cudaMalloc((void**) &indices, length * sizeof(unsigned int)); 120 | 121 | if (backward) scanr_plusui(flags, 
indices, length); 122 | else scanl_plusui(flags, indices, length); 123 | 124 | /* 125 | * At this point, we know exactly how many elements will be required for the 126 | * `out' array. Maybe we should allocate that array here? 127 | */ 128 | permute(in, out, indices, length, flags, num); 129 | 130 | cudaMemcpy(&N, num, sizeof(unsigned int), cudaMemcpyDeviceToHost); 131 | cudaFree(num); 132 | cudaFree(indices); 133 | 134 | return N; 135 | } 136 | 137 | 138 | // ----------------------------------------------------------------------------- 139 | // Instances 140 | // ----------------------------------------------------------------------------- 141 | 142 | void permute_ui(unsigned int *in, unsigned int *out, unsigned int *indices, int length) 143 | { 144 | permute(in, out, indices, length); 145 | } 146 | 147 | 148 | void bpermute_ui(unsigned int *in, unsigned int *out, unsigned int *indices, int length) 149 | { 150 | permute(in, out, indices, length); 151 | } 152 | 153 | void bpermute_f(float *in, float *out, unsigned int *indices, int length) 154 | { 155 | permute(in, out, indices, length); 156 | } 157 | 158 | 159 | int compact_f(float *in, float *out, unsigned int *flags, int length) 160 | { 161 | int N = compact(in, out, flags, length); 162 | return N; 163 | } 164 | 165 | -------------------------------------------------------------------------------- /examples/src/replicate/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 
3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | STATIC_LIB := libreplicate.a 9 | 10 | CUFILES := replicate.cu 11 | 12 | EXTRALIBS := stdc++ 13 | 14 | # ------------------------------------------------------------------------------ 15 | # Haskell/CUDA build system 16 | # ------------------------------------------------------------------------------ 17 | include ../../common/common.mk 18 | -------------------------------------------------------------------------------- /examples/src/replicate/replicate.cu: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Replicate 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #include "utils.h" 10 | #include "replicate.h" 11 | #include "algorithms.h" 12 | 13 | #include 14 | 15 | static void 16 | replicate_control(uint32_t n, uint32_t &blocks, uint32_t &threads) 17 | { 18 | threads = min(ceilPow2(n), MAX_THREADS); 19 | blocks = (n + threads - 1) / threads; 20 | } 21 | 22 | 23 | /* 24 | * Apply a function each element of an array. A single thread is used to compute 25 | * each result. 26 | */ 27 | template 28 | __global__ static void 29 | replicate_core 30 | ( 31 | uint32_t *d_out, 32 | const uint32_t symbol, 33 | const uint32_t length 34 | ) 35 | { 36 | uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; 37 | 38 | if (lengthIsPow2 || idx < length) 39 | d_out[idx] = symbol; 40 | } 41 | 42 | 43 | /* 44 | * TODO: Generalise to 8- and 16-bit values. 
45 | */ 46 | void 47 | replicate 48 | ( 49 | void *d_out, 50 | const uint32_t symbol, 51 | const uint32_t length 52 | ) 53 | { 54 | uint32_t threads; 55 | uint32_t blocks; 56 | 57 | replicate_control(length, blocks, threads); 58 | 59 | if (isPow2(length)) replicate_core <<>>((uint32_t*) d_out, symbol, length); 60 | else replicate_core<<>>((uint32_t*) d_out, symbol, length); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /examples/src/replicate/replicate.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Replicate 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | 10 | #ifndef __REPLICATE_H__ 11 | #define __REPLICATE_H__ 12 | 13 | #define MAX_THREADS 128 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /examples/src/scan/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 
3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := scan 9 | 10 | HSMAIN := Scan.chs 11 | CUFILES := scan.cu 12 | 13 | EXTRALIBS := stdc++ 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/scan/Scan.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : Scan 5 | -- Copyright : (c) 2009 Trevor L. McDonell 6 | -- License : BSD 7 | -- 8 | -- Apply a binary operator to an array similar to 'fold', but return a 9 | -- successive list of values reduced from the left (or right). 
10 | -- 11 | -------------------------------------------------------------------------------- 12 | 13 | module Main where 14 | 15 | #include "scan.h" 16 | 17 | -- Friends 18 | import C2HS hiding (newArray) 19 | import Time 20 | import RandomVector 21 | 22 | -- System 23 | import Control.Monad 24 | import Control.Exception 25 | import qualified Foreign.CUDA as CUDA 26 | 27 | 28 | -------------------------------------------------------------------------------- 29 | -- Reference 30 | -------------------------------------------------------------------------------- 31 | 32 | scanList :: (Num e, Storable e) => Vector e -> IO (Vector e) 33 | scanList xs = do 34 | bnds <- getBounds xs 35 | xs' <- getElems xs 36 | (t,zs') <- benchmark 100 (return (scanl1 (+) xs')) (return ()) 37 | putStrLn $ "List: " ++ shows (fromInteger (timeIn millisecond t`div`100)::Float) " ms" 38 | newListArray bnds zs' 39 | 40 | 41 | scanArr :: (Num e, Storable e) => Vector e -> IO (Vector e) 42 | scanArr xs = do 43 | bnds <- getBounds xs 44 | zs <- newArray_ bnds 45 | let idx = range bnds 46 | (t,_) <- benchmark 100 (foldM_ (k zs) 0 idx) (return ()) 47 | putStrLn $ "Array: " ++ shows (fromInteger (timeIn millisecond t)/100::Float) " ms" 48 | return zs 49 | where 50 | k zs a i = do 51 | x <- readArray xs i 52 | let z = x+a 53 | writeArray zs i z 54 | return z 55 | 56 | 57 | -------------------------------------------------------------------------------- 58 | -- CUDA 59 | -------------------------------------------------------------------------------- 60 | 61 | -- 62 | -- Include the time to copy the data to/from the storable array (significantly 63 | -- faster than from a Haskell list) 64 | -- 65 | scanCUDA :: Vector Float -> IO (Vector Float) 66 | scanCUDA xs = do 67 | bnds <- getBounds xs 68 | zs <- newArray_ bnds 69 | let len = rangeSize bnds 70 | CUDA.allocaArray len $ \d_xs -> do 71 | CUDA.allocaArray len $ \d_zs -> do 72 | (t,_) <- flip (benchmark 100) CUDA.sync $ do 73 | withVector xs $ \p -> 
CUDA.pokeArray len p d_xs 74 | scanl1_plusf d_xs d_zs len 75 | withVector zs $ \p -> CUDA.peekArray len d_zs p 76 | putStrLn $ "CUDA: " ++ shows (fromInteger (timeIn millisecond t)/100::Float) " ms (with copy)" 77 | 78 | (t',_) <- benchmark 100 (scanl1_plusf d_xs d_zs len) CUDA.sync 79 | putStrLn $ "CUDA: " ++ shows (fromInteger (timeIn millisecond t')/100::Float) " ms (compute only)" 80 | 81 | return zs 82 | 83 | {# fun unsafe scanl1_plusf 84 | { withDP* `CUDA.DevicePtr Float' 85 | , withDP* `CUDA.DevicePtr Float' 86 | , `Int' 87 | } -> `()' #} 88 | where 89 | withDP p a = CUDA.withDevicePtr p $ \p' -> a (castPtr p') 90 | 91 | 92 | -------------------------------------------------------------------------------- 93 | -- Main 94 | -------------------------------------------------------------------------------- 95 | 96 | main :: IO () 97 | main = do 98 | dev <- CUDA.get 99 | props <- CUDA.props dev 100 | putStrLn $ "Using device " ++ show dev ++ ": " ++ CUDA.deviceName props 101 | 102 | arr <- randomArr (1,100000) :: IO (Vector Float) 103 | ref <- scanList arr 104 | ref' <- scanArr arr 105 | cuda <- scanCUDA arr 106 | 107 | return () 108 | 109 | putStr "== Validating: " 110 | verify ref ref' >>= \rv -> assert rv (return ()) 111 | verify ref cuda >>= \rv -> putStrLn $ if rv then "Ok!" else "INVALID!" 112 | 113 | -------------------------------------------------------------------------------- /examples/src/scan/scan.cu: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Scan 4 | * Copyright : (c) 2009 Trevor L. 
McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #include "scan.h" 10 | 11 | #include "utils.h" 12 | #include "operator.h" 13 | #include "cudpp/cudpp_globals.h" 14 | #include "cudpp/scan_kernel.cu" 15 | #include "cudpp/vector_kernel.cu" 16 | 17 | template 18 | struct scan_plan 19 | { 20 | T **block_sums; 21 | size_t num_levels; 22 | }; 23 | 24 | static inline unsigned int 25 | calc_num_blocks(unsigned int N) 26 | { 27 | return max(1u, (unsigned int)ceil((double)N / (SCAN_ELTS_PER_THREAD * CTA_SIZE))); 28 | } 29 | 30 | 31 | /* 32 | * This is the CPU-side workhorse of the scan operation, invoking the kernel on 33 | * each of the reduction blocks. 34 | */ 35 | template 36 | static void 37 | scan_recursive 38 | ( 39 | const T *in, 40 | T *out, 41 | scan_plan *plan, 42 | int N, 43 | int level 44 | ) 45 | { 46 | size_t num_blocks = calc_num_blocks(N); 47 | bool is_full = N == num_blocks * SCAN_ELTS_PER_THREAD * CTA_SIZE; 48 | 49 | dim3 grid(num_blocks, 1, 1); 50 | dim3 block(CTA_SIZE, 1, 1); 51 | size_t smem = sizeof(T) * CTA_SIZE * 2; 52 | 53 | #define MULTIBLOCK 0x01 54 | #define FULLBLOCK 0x04 55 | int traits = 0; 56 | if (num_blocks > 1) traits |= MULTIBLOCK; 57 | if (is_full) traits |= FULLBLOCK; 58 | 59 | /* 60 | * Set up execution parameters, and execute the scan 61 | */ 62 | switch (traits) 63 | { 64 | case 0: 65 | scan4 66 | < T, ScanTraits > 67 | <<>>(out, in, NULL, N, 1, 1); 68 | break; 69 | 70 | case MULTIBLOCK: 71 | scan4 72 | < T, ScanTraits > 73 | <<>>(out, in, plan->block_sums[level], N, 1, 1); 74 | break; 75 | 76 | case FULLBLOCK: 77 | scan4 78 | < T, ScanTraits > 79 | <<>>(out, in, NULL, N, 1, 1); 80 | break; 81 | 82 | case MULTIBLOCK | FULLBLOCK: 83 | scan4 84 | < T, ScanTraits > 85 | <<>>(out, in, plan->block_sums[level], N, 1, 1); 86 | break; 87 | 88 | default: 89 | assert(!"Non-exhaustive patterns in match"); 90 | } 91 | 92 | /* 93 | * After scanning the sub-blocks, we now 
need to combine those results by 94 | * taking the last value from each sub-block, and adding that to each of the 95 | * successive blocks (i.e. scan across the sub-computations) 96 | */ 97 | if (num_blocks > 1) 98 | { 99 | T *sums = plan->block_sums[level]; 100 | 101 | scan_recursive 102 | 103 | (sums, sums, plan, num_blocks, level+1); 104 | 105 | vectorAddUniform4 106 | 107 | <<>> 108 | (out, sums, N, 4, 4, 0, 0); 109 | } 110 | 111 | #undef MULTIBLOCK 112 | #undef FULLBLOCK 113 | } 114 | 115 | 116 | /* 117 | * Allocate temporary memory used by the scan. 118 | */ 119 | template 120 | static void 121 | scan_init(int N, scan_plan *plan) 122 | { 123 | size_t level = 0; 124 | size_t elements = N; 125 | size_t num_blocks; 126 | 127 | /* 128 | * Determine how many intermediate block-level summations will be required 129 | */ 130 | for (elements = N; elements > 1; elements = num_blocks) 131 | { 132 | num_blocks = calc_num_blocks(elements); 133 | 134 | if (num_blocks > 1) 135 | ++level; 136 | } 137 | 138 | plan->block_sums = (T**) malloc(level * sizeof(T*)); 139 | plan->num_levels = level; 140 | 141 | /* 142 | * Now, allocate the necessary storage at each level 143 | */ 144 | for (elements = N, level = 0; elements > 1; elements = num_blocks, level++) 145 | { 146 | num_blocks = calc_num_blocks(elements); 147 | 148 | if (num_blocks > 1) 149 | cudaMalloc((void**) &plan->block_sums[level], num_blocks * sizeof(T)); 150 | } 151 | } 152 | 153 | 154 | /* 155 | * Clean up temporary memory used by the scan 156 | */ 157 | template 158 | static void 159 | scan_finalise(scan_plan *p) 160 | { 161 | for (size_t l = 0; l < p->num_levels; ++l) 162 | cudaFree(p->block_sums[l]); 163 | 164 | free(p->block_sums); 165 | } 166 | 167 | 168 | /* 169 | * Apply a binary operator to an array similar to `fold', but return a 170 | * successive list of values reduced from the left. The reduction will take 171 | * place in parallel, so the operator must be associative. 
172 | */ 173 | template 174 | void 175 | scan 176 | ( 177 | const T *in, 178 | T *out, 179 | int length 180 | ) 181 | { 182 | scan_plan plan; 183 | scan_init(length, &plan); 184 | 185 | scan_recursive(in, out, &plan, length, 0); 186 | 187 | scan_finalise(&plan); 188 | } 189 | 190 | 191 | // ----------------------------------------------------------------------------- 192 | // Instances 193 | // ----------------------------------------------------------------------------- 194 | 195 | void scanl_plusf(float *in, float *out, int N) 196 | { 197 | scan< Plus, float, false, true >(in, out, N); 198 | } 199 | 200 | void scanl1_plusf(float *in, float *out, int N) 201 | { 202 | scan< Plus, float, false, false >(in, out, N); 203 | } 204 | 205 | -------------------------------------------------------------------------------- /examples/src/scan/scan.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Scan 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #ifndef __SCAN_H__ 10 | #define __SCAN_H__ 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | /* 17 | * Instances 18 | */ 19 | void scanl_plusf(float *in, float *out, int N); 20 | void scanl1_plusf(float *in, float *out, int N); 21 | 22 | 23 | #ifdef __cplusplus 24 | } 25 | #endif 26 | #endif 27 | -------------------------------------------------------------------------------- /examples/src/simpleTexture/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 
3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := simpleTexture 9 | 10 | HSMAIN := SimpleTexture.hs 11 | CUFILES := simpleTexture.cu 12 | 13 | EXTRALIBS := stdc++ 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/simpleTexture/SimpleTexture.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ParallelListComp #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : SimpleTexture 5 | -- Copyright : (c) 2009 Trevor L. McDonell 6 | -- License : BSD 7 | -- 8 | -- Demonstrates texture references. Takes an input PGM file and generates an 9 | -- output that has been rotated. The kernel performs a simple 2D transformation 10 | -- on the texture coordinates (u,v). 11 | -- 12 | -------------------------------------------------------------------------------- 13 | 14 | module Main where 15 | 16 | import Data.Array.Unboxed 17 | import Graphics.Pgm 18 | import qualified Foreign.CUDA as CUDA 19 | import qualified Foreign.CUDA.Runtime.Texture as CUDA 20 | 21 | 22 | imageFile, outputFile, referenceFile :: FilePath 23 | imageFile = "data/lena_bw.pgm" 24 | outputFile = "data/lena_bw_out.pgm" 25 | referenceFile = "data/ref_rotated.pgm" 26 | 27 | rotation :: Float 28 | rotation = 0.5 -- radians 29 | 30 | 31 | main :: IO () 32 | main = do 33 | img <- either (error . show) head `fmap` pgmsFromFile imageFile 34 | ref <- either (error . 
show) head `fmap` pgmsFromFile referenceFile 35 | runTest img ref 36 | 37 | 38 | runTest :: UArray (Int,Int) Int -> UArray (Int,Int) Int -> IO () 39 | runTest img ref' = do 40 | rotated <- transform (amap (\x-> fromIntegral x/255) img) 41 | arrayToFile outputFile (amap (\x -> round (x*255)) rotated) 42 | if verify (elems ref) (elems rotated) 43 | then putStrLn "PASSED" 44 | else putStrLn "FAILED!" 45 | 46 | where 47 | ref = amap (\x -> fromIntegral x / 255) ref' 48 | verify xs ys = all (< epsilon) [abs ((x-y)/(x+y+epsilon)) | x <- xs | y <- ys] 49 | epsilon = 0.05 50 | 51 | 52 | transform :: UArray (Int,Int) Float -> IO (UArray (Int,Int) Float) 53 | transform pgm = 54 | let bnds@((x0,y0),(x1,y1)) = bounds pgm 55 | width = x1-x0+1 56 | height = y1-y0+1 57 | desc = CUDA.FormatDesc (32,0,0,0) CUDA.Float 58 | texture = CUDA.Texture True CUDA.Linear (CUDA.Wrap,CUDA.Wrap,CUDA.Wrap) desc 59 | in 60 | CUDA.allocaArray (width * height) $ \d_out -> 61 | CUDA.withListArray (elems pgm) $ \d_tex -> do 62 | CUDA.bind2D "tex" texture d_tex (width,height) (fromIntegral $ height * 4) 63 | CUDA.setConfig (width `div` 8,height `div` 8) (8,8,1) 0 Nothing 64 | CUDA.setParams [CUDA.VArg d_out, CUDA.IArg width, CUDA.IArg height, CUDA.FArg rotation] 65 | CUDA.launch "transformKernel" 66 | 67 | listArray bnds `fmap` CUDA.peekListArray (width*height) d_out 68 | 69 | -------------------------------------------------------------------------------- /examples/src/simpleTexture/data/lena_bw.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmcdonell/cuda/5bc08e3101946d1c9f7a35d5d57426b59b840479/examples/src/simpleTexture/data/lena_bw.pgm -------------------------------------------------------------------------------- /examples/src/simpleTexture/data/ref_rotated.pgm: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tmcdonell/cuda/5bc08e3101946d1c9f7a35d5d57426b59b840479/examples/src/simpleTexture/data/ref_rotated.pgm -------------------------------------------------------------------------------- /examples/src/simpleTexture/simpleTexture.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property and 5 | * proprietary rights in and to this software and related documentation. 6 | * Any use, reproduction, disclosure, or distribution of this software 7 | * and related documentation without an express license agreement from 8 | * NVIDIA Corporation is strictly prohibited. 9 | * 10 | * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 | * associated with this source code for terms and conditions that govern 12 | * your use of this NVIDIA software. 13 | * 14 | */ 15 | 16 | #ifndef _SIMPLETEXTURE_KERNEL_H_ 17 | #define _SIMPLETEXTURE_KERNEL_H_ 18 | 19 | // declare texture reference for 2D float texture 20 | texture tex; 21 | 22 | //////////////////////////////////////////////////////////////////////////////// 23 | //! Transform an image using texture lookups 24 | //! 
@param g_odata output data in global memory 25 | //////////////////////////////////////////////////////////////////////////////// 26 | extern "C" 27 | __global__ void 28 | transformKernel( float* g_odata, int width, int height, float theta) 29 | { 30 | // calculate normalized texture coordinates 31 | unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; 32 | unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; 33 | 34 | float u = x / (float) width; 35 | float v = y / (float) height; 36 | 37 | // transform coordinates 38 | u -= 0.5f; 39 | v -= 0.5f; 40 | float tu = u*cosf(theta) - v*sinf(theta) + 0.5f; 41 | float tv = v*cosf(theta) + u*sinf(theta) + 0.5f; 42 | 43 | // read from texture and write to global memory 44 | g_odata[y*width + x] = tex2D(tex, tu, tv); 45 | } 46 | 47 | #endif // #ifndef _SIMPLETEXTURE_KERNEL_H_ 48 | -------------------------------------------------------------------------------- /examples/src/smvm/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := smvm 9 | 10 | HSMAIN := SMVM.chs 11 | CUFILES := smvm-csr.cu \ 12 | smvm-cudpp.cu 13 | 14 | USECUDPP := 1 15 | EXTRALIBS := stdc++ 16 | 17 | # ------------------------------------------------------------------------------ 18 | # Haskell/CUDA build system 19 | # ------------------------------------------------------------------------------ 20 | include ../../common/common.mk 21 | -------------------------------------------------------------------------------- /examples/src/smvm/smvm-cudpp.cu: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : SMVM 4 | * Copyright : (c) 2009 Trevor L. 
McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | 10 | #include "smvm.h" 11 | #include 12 | 13 | template CUDPPDatatype getType(); 14 | template <> CUDPPDatatype getType() { return CUDPP_FLOAT; } 15 | template <> CUDPPDatatype getType() { return CUDPP_UINT; } 16 | 17 | 18 | /* 19 | * Sparse matrix-dense vector multiply. Hook directly into the CUDPP 20 | * implementation. 21 | */ 22 | template 23 | void smvm_cudpp 24 | ( 25 | float *d_y, 26 | const float *d_x, 27 | const float *h_data, 28 | const unsigned int *h_rowPtr, 29 | const unsigned int *h_colIdx, 30 | const unsigned int num_rows, 31 | const unsigned int num_nonzeros 32 | ) 33 | { 34 | CUDPPConfiguration cp; 35 | CUDPPHandle sm; 36 | 37 | cp.datatype = getType(); 38 | cp.options = 0; 39 | cp.algorithm = CUDPP_SPMVMULT; 40 | 41 | cudppSparseMatrix(&sm, cp, num_nonzeros, num_rows, h_data, h_rowPtr, h_colIdx); 42 | cudppSparseMatrixVectorMultiply(sm, d_y, d_x); 43 | 44 | cudppDestroySparseMatrix(sm); 45 | } 46 | 47 | 48 | // ----------------------------------------------------------------------------- 49 | // Instances 50 | // ----------------------------------------------------------------------------- 51 | 52 | void smvm_cudpp_f(float *d_y, float *d_x, float *h_data, unsigned int *h_rowPtr, unsigned int *h_colIdx, unsigned int num_rows, unsigned int num_nonzeros) 53 | { 54 | smvm_cudpp(d_y, d_x, h_data, h_rowPtr, h_colIdx, num_rows, num_nonzeros); 55 | } 56 | 57 | -------------------------------------------------------------------------------- /examples/src/smvm/smvm.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : SMVM 4 | * Copyright : (c) 2009 Trevor L. 
McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #ifndef __SMVM_H__ 10 | #define __SMVM_H__ 11 | 12 | /* 13 | * Optimised for Tesla C1060 (compute 1.3) 14 | * Maximum performance for your card may be achieved with different values. 15 | * 16 | * http://developer.download.nvidia.com/compute/cuda/CUDA_Occupancy_calculator.xls 17 | */ 18 | #define MAX_THREADS 128 19 | #define MAX_BLOCKS_PER_SM 8 20 | #define MAX_BLOCKS (MAX_BLOCKS_PER_SM * 30) 21 | #define WARP_SIZE 32 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | void smvm_csr_f(float *d_y, float *d_x, float *d_data, unsigned int *d_rowPtr, unsigned int *d_colIdx, unsigned int num_rows); 28 | void smvm_cudpp_f(float *d_y, float *d_x, float *h_data, unsigned int *h_rowPtr, unsigned int *h_colIdx, unsigned int num_rows, unsigned int num_nonzeros); 29 | 30 | #ifdef __cplusplus 31 | } 32 | #endif 33 | #endif 34 | -------------------------------------------------------------------------------- /examples/src/smvm/texture.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2009 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | 18 | #ifndef __TEXTURE_H__ 19 | #define __TEXTURE_H__ 20 | 21 | #include "utils.h" 22 | #include 23 | 24 | /* 25 | * These textures are (optionally) used to cache the 'x' vector in y += A*x 26 | * Use int2 to pull doubles through texture cache. 27 | */ 28 | texture tex_x_float; 29 | texture tex_x_double; 30 | 31 | inline void 32 | bind_x(const float * x) 33 | { 34 | size_t offset = size_t(-1); 35 | 36 | CUDA_SAFE_CALL(cudaBindTexture(&offset, tex_x_float, x)); 37 | if (offset != 0) 38 | assert(!"memory is not aligned, refusing to use texture cache"); 39 | } 40 | 41 | inline void 42 | bind_x(const double * x) 43 | { 44 | size_t offset = size_t(-1); 45 | 46 | CUDA_SAFE_CALL(cudaBindTexture(&offset, tex_x_double, x)); 47 | if (offset != 0) 48 | assert(!"memory is not aligned, refusing to use texture cache"); 49 | } 50 | 51 | /* 52 | * NOTE: the parameter is unused only to distinguish the two unbind functions 53 | */ 54 | inline void 55 | unbind_x(const float *) 56 | { 57 | CUDA_SAFE_CALL(cudaUnbindTexture(tex_x_float)); 58 | } 59 | 60 | inline void 61 | unbind_x(const double *) 62 | { 63 | CUDA_SAFE_CALL(cudaUnbindTexture(tex_x_double)); 64 | } 65 | 66 | template 67 | __inline__ __device__ float 68 | fetch_x(const int& i, const float * x) 69 | { 70 | if (UseCache) return tex1Dfetch(tex_x_float, i); 71 | else return x[i]; 72 | } 73 | 74 | #ifndef CUDA_NO_SM_13_DOUBLE_INTRINSICS 75 | template 76 | __inline__ __device__ double fetch_x(const int& i, const double * x) 77 | { 78 | #if __CUDA_ARCH__ < 130 79 | #error "double precision require Compute Compatibility 1.3 or greater" 80 | #endif 81 | if (UseCache) 82 | { 83 | int2 v = tex1Dfetch(tex_x_double, i); 84 | return __hiloint2double(v.y, v.x); 85 | } 86 | else 87 | { 88 | return x[i]; 89 | } 90 | } 91 | #endif 92 | 93 | #endif // __TEXTURE_H__ 94 | 95 | -------------------------------------------------------------------------------- /examples/src/sort/Makefile: 
-------------------------------------------------------------------------------- 1 | # 2 | # Baking! 3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := sort 9 | 10 | HSMAIN := Sort.chs 11 | CUFILES := radix_sort.cu 12 | 13 | USECUDPP := 1 14 | EXTRALIBS := stdc++ 15 | 16 | # ------------------------------------------------------------------------------ 17 | # Haskell/CUDA build system 18 | # ------------------------------------------------------------------------------ 19 | include ../../common/common.mk 20 | -------------------------------------------------------------------------------- /examples/src/sort/Sort.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : Sort 5 | -- Copyright : (c) 2009 Trevor L. 
McDonell 6 | -- License : BSD 7 | -- 8 | -- Reduce a vector of (key,value) pairs 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module Main where 13 | 14 | #include "sort.h" 15 | 16 | import C2HS 17 | import RandomVector 18 | 19 | import Data.Ord 20 | import Data.List 21 | import Control.Monad 22 | import qualified Foreign.CUDA as C 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- 27 | -- CUDA 28 | 29 | test_f :: (Storable a, Eq a) => [(Float,a)] -> IO Bool 30 | test_f kv = 31 | let l = length kv 32 | (k,v) = unzip kv 33 | in 34 | C.withListArray k $ \d_k -> 35 | C.withListArray v $ \d_v -> do 36 | 37 | sort_f d_k d_v (length kv) 38 | res <- liftM2 zip (C.peekListArray l d_k) (C.peekListArray l d_v) 39 | return (res == sortBy (comparing fst) kv) 40 | 41 | 42 | test_i :: (Storable a, Eq a) => [(Int,a)] -> IO Bool 43 | test_i kv = 44 | let l = length kv 45 | (k,v) = unzip kv 46 | in 47 | C.withListArray k $ \d_k -> 48 | C.withListArray v $ \d_v -> do 49 | 50 | sort_ui d_k d_v (length kv) 51 | res <- liftM2 zip (C.peekListArray l d_k) (C.peekListArray l d_v) 52 | return (res == sortBy (comparing fst) kv) 53 | 54 | 55 | {# fun unsafe sort_f 56 | { withDP* `C.DevicePtr Float' 57 | , withDP* `C.DevicePtr a' 58 | , `Int' 59 | } -> `()' #} 60 | where 61 | withDP p a = C.withDevicePtr p $ \p' -> a (castPtr p') 62 | 63 | {# fun unsafe sort_ui 64 | { withDP* `C.DevicePtr Int' 65 | , withDP* `C.DevicePtr a' 66 | , `Int' 67 | } -> `()' #} 68 | where 69 | withDP p a = C.withDevicePtr p $ \p' -> a (castPtr p') 70 | 71 | -- 72 | -- I don't need to learn template haskell or quick check... nah, not at all... 73 | -- 74 | main :: IO () 75 | main = do 76 | f <- randomList 10000 77 | i <- randomListR 10000 (0,1000) 78 | 79 | putStr "Test (float,int): " 80 | test_f (zip f i) >>= \r -> case r of 81 | True -> putStrLn "Ok!" 82 | _ -> putStrLn "INVALID!" 
83 | 84 | putStr "Test (float,float): " 85 | test_f (zip f f) >>= \r -> case r of 86 | True -> putStrLn "Ok!" 87 | _ -> putStrLn "INVALID!" 88 | 89 | putStr "Test (int,int): " 90 | test_i (zip i i) >>= \r -> case r of 91 | True -> putStrLn "Ok!" 92 | _ -> putStrLn "INVALID!" 93 | 94 | putStr "Test (int,float): " 95 | test_i (zip i f) >>= \r -> case r of 96 | True -> putStrLn "Ok!" 97 | _ -> putStrLn "INVALID!" 98 | 99 | -------------------------------------------------------------------------------- /examples/src/sort/radix_sort.cu: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Sort 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | *----------------------------------------------------------------------------*/ 8 | 9 | #include 10 | 11 | #include "sort.h" 12 | 13 | 14 | template CUDPPDatatype static getType(); 15 | template <> CUDPPDatatype getType() { return CUDPP_FLOAT; } 16 | template <> CUDPPDatatype getType() { return CUDPP_UINT; } 17 | 18 | 19 | /* 20 | * In-place radix sort of values or key-value pairs. Values can be any 32-bit 21 | * type, as their payload is never inspected or manipulated. 22 | */ 23 | template 24 | static void 25 | radix_sort 26 | ( 27 | unsigned int length, 28 | T *d_keys, 29 | void *d_vals = NULL, 30 | int bits = 8 * sizeof(T) 31 | ) 32 | { 33 | CUDPPHandle plan; 34 | CUDPPConfiguration cp; 35 | 36 | cp.datatype = getType(); 37 | cp.algorithm = CUDPP_SORT_RADIX; 38 | cp.options = (d_vals != NULL) ? 
CUDPP_OPTION_KEY_VALUE_PAIRS 39 | : CUDPP_OPTION_KEYS_ONLY; 40 | 41 | cudppPlan(&plan, cp, length, 1, 0); 42 | cudppSort(plan, d_keys, d_vals, bits, length); 43 | 44 | cudppDestroyPlan(plan); 45 | } 46 | 47 | 48 | /* ----------------------------------------------------------------------------- 49 | * Instances 50 | * ---------------------------------------------------------------------------*/ 51 | 52 | void sort_f(float *d_keys, void *d_vals, unsigned int length) 53 | { 54 | radix_sort(length, d_keys, d_vals); 55 | } 56 | 57 | void sort_ui(unsigned int *d_keys, void *d_vals, unsigned int length) 58 | { 59 | radix_sort(length, d_keys, d_vals); 60 | } 61 | 62 | -------------------------------------------------------------------------------- /examples/src/sort/sort.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Sort 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #ifndef __SORT_PRIV_H__ 10 | #define __SORT_PRIV_H__ 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | void sort_f(float *d_keys, void *d_vals, unsigned int length); 17 | void sort_ui(unsigned int *d_keys, void *d_vals, unsigned int length); 18 | 19 | #ifdef __cplusplus 20 | } 21 | #endif 22 | #endif 23 | 24 | -------------------------------------------------------------------------------- /examples/src/vectorAddDrv/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 
3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := vectorAddDrv 9 | 10 | HSMAIN := VectorAdd.hs 11 | PTXFILES := vector_add.cu 12 | 13 | USEDRVAPI := 1 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/vectorAddDrv/VectorAdd.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- 3 | -- Module : VectorAdd 4 | -- Copyright : (c) 2009 Trevor L. McDonell 5 | -- License : BSD 6 | -- 7 | -- Element-wise addition of two vectors 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | module Main where 12 | 13 | -- Friends 14 | import RandomVector 15 | 16 | -- System 17 | import Numeric 18 | import Control.Monad 19 | import Control.Exception 20 | import Data.Array.Storable 21 | import qualified Data.ByteString.Char8 as B 22 | import qualified Foreign.CUDA.Driver as CUDA 23 | 24 | 25 | -------------------------------------------------------------------------------- 26 | -- Reference implementation 27 | -------------------------------------------------------------------------------- 28 | 29 | testRef :: (Num e, Storable e) => Vector e -> Vector e -> IO (Vector e) 30 | testRef xs ys = do 31 | (i,j) <- getBounds xs 32 | res <- newArray_ (i,j) 33 | forM_ [i..j] (add res) 34 | return res 35 | where 36 | add res idx = do 37 | a <- readArray xs idx 38 | b <- readArray ys idx 39 | writeArray res idx (a+b) 40 | 41 | 
-------------------------------------------------------------------------------- 42 | -- CUDA 43 | -------------------------------------------------------------------------------- 44 | 45 | -- 46 | -- Initialise the device and context. Load the PTX source code, and return a 47 | -- reference to the kernel function. 48 | -- 49 | initCUDA :: IO (CUDA.Context, CUDA.Fun) 50 | initCUDA = do 51 | CUDA.initialise [] 52 | dev <- CUDA.device 0 53 | ctx <- CUDA.create dev [] 54 | ptx <- B.readFile "data/vector_add.ptx" 55 | (mdl,r) <- CUDA.loadDataEx ptx [CUDA.MaxRegisters 32] 56 | fun <- CUDA.getFun mdl "VecAdd" 57 | 58 | putStrLn $ ">> PTX JIT compilation (" ++ showFFloat (Just 2) (CUDA.jitTime r) " ms)" 59 | B.putStrLn (CUDA.jitInfoLog r) 60 | return (ctx,fun) 61 | 62 | 63 | -- 64 | -- Run the test 65 | -- 66 | testCUDA :: (Num e, Storable e) => Vector e -> Vector e -> IO (Vector e) 67 | testCUDA xs ys = do 68 | (m,n) <- getBounds xs 69 | let len = (n-m+1) 70 | 71 | -- Initialise environment and copy over test data 72 | -- 73 | putStrLn ">> Initialising" 74 | bracket initCUDA (\(ctx,_) -> CUDA.destroy ctx) $ \(_,addVec) -> do 75 | 76 | -- Allocate some device memory. This will be freed once the computation 77 | -- terminates, either normally or by exception. 78 | -- 79 | putStrLn ">> Executing" 80 | CUDA.allocaArray len $ \dx -> do 81 | CUDA.allocaArray len $ \dy -> do 82 | CUDA.allocaArray len $ \dz -> do 83 | 84 | -- Copy over the data 85 | -- 86 | withVector xs $ \p -> CUDA.pokeArray len p dx 87 | withVector ys $ \p -> CUDA.pokeArray len p dy 88 | 89 | -- Setup and execute the kernel (repeat test many times...) 
90 | -- 91 | CUDA.setParams addVec [CUDA.VArg dx, CUDA.VArg dy, CUDA.VArg dz, CUDA.IArg len] 92 | CUDA.setBlockShape addVec (128,1,1) 93 | CUDA.launch addVec ((len+128-1) `div` 128, 1) Nothing 94 | CUDA.sync 95 | 96 | -- Copy back result 97 | -- 98 | zs <- newArray_ (m,n) 99 | withVector zs $ \p -> CUDA.peekArray len dz p 100 | return zs 101 | 102 | 103 | -------------------------------------------------------------------------------- 104 | -- Test & Verify 105 | -------------------------------------------------------------------------------- 106 | 107 | main :: IO () 108 | main = do 109 | putStrLn "== Generating random vectors" 110 | xs <- randomArr (1,10000) :: IO (Vector Float) 111 | ys <- randomArr (1,10000) :: IO (Vector Float) 112 | 113 | putStrLn "== Generating reference solution" 114 | ref <- testRef xs ys 115 | 116 | putStrLn "== Testing CUDA" 117 | arr <- testCUDA xs ys 118 | 119 | putStr "== Validating: " 120 | verify ref arr >>= \rv -> putStrLn $ if rv then "Ok!" else "INVALID!" 121 | 122 | -------------------------------------------------------------------------------- /examples/src/vectorAddDrv/vector_add.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Name : VectorAdd 3 | * Copyright : (c) 2009 Trevor L. McDonell 4 | * License : BSD 5 | * 6 | * Element-wise addition of two (floating-point) vectors 7 | */ 8 | 9 | 10 | extern "C" 11 | __global__ void VecAdd(const float *xs, const float *ys, float *out, const unsigned int N) 12 | { 13 | unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; 14 | 15 | if (idx < N) 16 | out[idx] = xs[idx] + ys[idx]; 17 | } 18 | 19 | -------------------------------------------------------------------------------- /examples/src/zipWith/zipWith.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Module : Prelude 3 | * Copyright : (c) 2009 Trevor L. 
McDonell 4 | * License : BSD 5 | */ 6 | 7 | #include "utils.h" 8 | #include "kernels.h" 9 | #include "operator.h" 10 | #include "cudpp/cudpp_globals.h" 11 | 12 | 13 | /* 14 | * Combine two arrays using the given binary operator function. A single thread 15 | * is used to compute each result pair. 16 | */ 17 | template 18 | __global__ static void 19 | zipWith_core 20 | ( 21 | Ta *xs, 22 | Tb *ys, 23 | Tc *out, 24 | int length 25 | ) 26 | { 27 | unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; 28 | 29 | if (lengthIsPow2 || idx < length) 30 | out[idx] = op::apply(xs[idx], ys[idx]); 31 | } 32 | 33 | 34 | template 35 | void 36 | zipWith 37 | ( 38 | Ta *xs, 39 | Tb *ys, 40 | Tc *zs, 41 | int length 42 | ) 43 | { 44 | unsigned int threads = min(ceilPow2(length), CTA_SIZE); 45 | unsigned int blocks = (length + threads - 1) / threads; 46 | 47 | if (isPow2(length)) 48 | zipWith_core< op,true ><<>>(xs, ys, zs, length); 49 | else 50 | zipWith_core< op,false ><<>>(xs, ys, zs, length); 51 | } 52 | 53 | 54 | // ----------------------------------------------------------------------------- 55 | // Instances 56 | // ----------------------------------------------------------------------------- 57 | 58 | #if 0 59 | void zipWith_plusf(float *xs, float *ys, float *zs, int N) 60 | { 61 | zipWith< Plus >(xs, ys, zs, N); 62 | } 63 | 64 | void zipWith_timesf(float *xs, float *ys, float *zs, int N) 65 | { 66 | zipWith< Times >(xs, ys, zs, N); 67 | } 68 | #endif 69 | 70 | void zipWith_timesif(int *xs, float *ys, float *zs, int N) 71 | { 72 | zipWith< Times >(xs, ys, zs, N); 73 | } 74 | 75 | -------------------------------------------------------------------------------- /src/Foreign/C/Extra.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | {-# LANGUAGE ForeignFunctionInterface #-} 3 | -------------------------------------------------------------------------------- 4 | -- | 5 | -- Module : Foreign.C.Extra 6 | -- Copyright 
: [2018..2023] Trevor L. McDonell 7 | -- License : BSD 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | module Foreign.C.Extra ( 12 | 13 | c_strnlen, 14 | 15 | ) where 16 | 17 | import Foreign.C 18 | 19 | 20 | #if defined(WIN32) 21 | {-# INLINE c_strnlen' #-} 22 | c_strnlen' :: CString -> CSize -> IO CSize 23 | c_strnlen' str size = do 24 | str' <- peekCStringLen (str, fromIntegral size) 25 | return $ stringLen 0 str' 26 | where 27 | stringLen acc [] = acc 28 | stringLen acc ('\0':_) = acc 29 | stringLen acc (_:xs) = stringLen (acc+1) xs 30 | #else 31 | foreign import ccall unsafe "string.h strnlen" c_strnlen' 32 | :: CString -> CSize -> IO CSize 33 | #endif 34 | 35 | {-# INLINE c_strnlen #-} 36 | c_strnlen :: CString -> Int -> IO Int 37 | c_strnlen str maxlen = fromIntegral `fmap` c_strnlen' str (fromIntegral maxlen) 38 | 39 | -------------------------------------------------------------------------------- /src/Foreign/CUDA.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- | 3 | -- Module : Foreign.CUDA 4 | -- Copyright : [2009..2023] Trevor L. McDonell 5 | -- License : BSD 6 | -- 7 | -- Top level bindings. By default, expose the C-for-CUDA runtime API bindings, 8 | -- as they are slightly more user friendly. 
9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module Foreign.CUDA ( 13 | 14 | module Foreign.CUDA.Runtime 15 | 16 | ) where 17 | 18 | import Foreign.CUDA.Runtime 19 | 20 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Analysis.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- | 3 | -- Module : Foreign.CUDA.Analysis 4 | -- Copyright : [2009..2023] Trevor L. McDonell 5 | -- License : BSD 6 | -- 7 | -- Meta-module exporting CUDA analysis routines 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | module Foreign.CUDA.Analysis ( 12 | 13 | module Foreign.CUDA.Analysis.Device, 14 | module Foreign.CUDA.Analysis.Occupancy 15 | 16 | ) where 17 | 18 | import Foreign.CUDA.Analysis.Device 19 | import Foreign.CUDA.Analysis.Occupancy 20 | 21 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Context.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- | 3 | -- Module : Foreign.CUDA.Driver.Context 4 | -- Copyright : [2009..2023] Trevor L. 
McDonell 5 | -- License : BSD 6 | -- 7 | -- Context management for low-level driver interface 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | 12 | module Foreign.CUDA.Driver.Context ( 13 | 14 | module Foreign.CUDA.Driver.Context.Base, 15 | module Foreign.CUDA.Driver.Context.Config, 16 | module Foreign.CUDA.Driver.Context.Peer, 17 | 18 | ) where 19 | 20 | import Foreign.CUDA.Driver.Context.Base 21 | import Foreign.CUDA.Driver.Context.Config 22 | import Foreign.CUDA.Driver.Context.Peer 23 | 24 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Context/Peer.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE EmptyDataDecls #-} 4 | {-# LANGUAGE ForeignFunctionInterface #-} 5 | {-# LANGUAGE TemplateHaskell #-} 6 | #ifdef USE_EMPTY_CASE 7 | {-# LANGUAGE EmptyCase #-} 8 | #endif 9 | -------------------------------------------------------------------------------- 10 | -- | 11 | -- Module : Foreign.CUDA.Driver.Context.Peer 12 | -- Copyright : [2009..2023] Trevor L. McDonell 13 | -- License : BSD 14 | -- 15 | -- Direct peer context access functions for the low-level driver interface. 16 | -- 17 | -- Since: CUDA-4.0 18 | -- 19 | -------------------------------------------------------------------------------- 20 | 21 | module Foreign.CUDA.Driver.Context.Peer ( 22 | 23 | -- * Peer Access 24 | PeerFlag, PeerAttribute(..), 25 | accessible, add, remove, getAttribute, 26 | 27 | ) where 28 | 29 | #include "cbits/stubs.h" 30 | {# context lib="cuda" #} 31 | 32 | -- Friends 33 | import Foreign.CUDA.Driver.Context.Base ( Context(..) ) 34 | import Foreign.CUDA.Driver.Device ( Device(..) 
) 35 | import Foreign.CUDA.Driver.Error 36 | import Foreign.CUDA.Internal.C2HS 37 | 38 | -- System 39 | import Foreign 40 | import Foreign.C 41 | 42 | 43 | -------------------------------------------------------------------------------- 44 | -- Data Types 45 | -------------------------------------------------------------------------------- 46 | 47 | -- | 48 | -- Possible option values for direct peer memory access 49 | -- 50 | data PeerFlag 51 | instance Enum PeerFlag where 52 | #ifdef USE_EMPTY_CASE 53 | toEnum x = error ("PeerFlag.toEnum: Cannot match " ++ show x) 54 | fromEnum x = case x of {} 55 | #endif 56 | 57 | 58 | -- | Peer-to-peer attributes 59 | -- 60 | #if CUDA_VERSION < 8000 61 | data PeerAttribute 62 | instance Enum PeerAttribute where 63 | #ifdef USE_EMPTY_CASE 64 | toEnum x = error ("PeerAttribute.toEnum: Cannot match " ++ show x) 65 | fromEnum x = case x of {} 66 | #endif 67 | #else 68 | {# enum CUdevice_P2PAttribute as PeerAttribute 69 | { underscoreToCase } 70 | with prefix="CU_DEVICE_P2P_ATTRIBUTE" deriving (Eq, Show) #} 71 | #endif 72 | 73 | 74 | -------------------------------------------------------------------------------- 75 | -- Peer access 76 | -------------------------------------------------------------------------------- 77 | 78 | -- | 79 | -- Queries if the first device can directly access the memory of the second. If 80 | -- direct access is possible, it can then be enabled with 'add'. 81 | -- 82 | -- Requires CUDA-4.0. 
83 | -- 84 | -- 85 | -- 86 | {-# INLINEABLE accessible #-} 87 | accessible :: Device -> Device -> IO Bool 88 | #if CUDA_VERSION < 4000 89 | accessible _ _ = requireSDK 'accessible 4.0 90 | #else 91 | accessible !dev !peer = resultIfOk =<< cuDeviceCanAccessPeer dev peer 92 | 93 | {-# INLINE cuDeviceCanAccessPeer #-} 94 | {# fun unsafe cuDeviceCanAccessPeer 95 | { alloca- `Bool' peekBool* 96 | , useDevice `Device' 97 | , useDevice `Device' } -> `Status' cToEnum #} 98 | #endif 99 | 100 | 101 | -- | 102 | -- If the devices of both the current and supplied contexts support unified 103 | -- addressing, then enable allocations in the supplied context to be accessible 104 | -- by the current context. 105 | -- 106 | -- Note that access is unidirectional, and in order to access memory in the 107 | -- current context from the peer context, a separate symmetric call to 108 | -- 'add' is required. 109 | -- 110 | -- Requires CUDA-4.0. 111 | -- 112 | -- 113 | -- 114 | {-# INLINEABLE add #-} 115 | add :: Context -> [PeerFlag] -> IO () 116 | #if CUDA_VERSION < 4000 117 | add _ _ = requireSDK 'add 4.0 118 | #else 119 | add !ctx !flags = nothingIfOk =<< cuCtxEnablePeerAccess ctx flags 120 | 121 | {-# INLINE cuCtxEnablePeerAccess #-} 122 | {# fun unsafe cuCtxEnablePeerAccess 123 | { useContext `Context' 124 | , combineBitMasks `[PeerFlag]' } -> `Status' cToEnum #} 125 | #endif 126 | 127 | 128 | -- | 129 | -- Disable direct memory access from the current context to the supplied 130 | -- peer context, and unregisters any registered allocations. 131 | -- 132 | -- Requires CUDA-4.0. 
133 | -- 134 | -- 135 | -- 136 | {-# INLINEABLE remove #-} 137 | remove :: Context -> IO () 138 | #if CUDA_VERSION < 4000 139 | remove _ = requireSDK 'remove 4.0 140 | #else 141 | remove !ctx = nothingIfOk =<< cuCtxDisablePeerAccess ctx 142 | 143 | {-# INLINE cuCtxDisablePeerAccess #-} 144 | {# fun unsafe cuCtxDisablePeerAccess 145 | { useContext `Context' } -> `Status' cToEnum #} 146 | #endif 147 | 148 | 149 | -- | 150 | -- Queries attributes of the link between two devices 151 | -- 152 | -- 153 | -- 154 | -- Requires CUDA-8.0 155 | -- 156 | -- @since 0.9.0.0@ 157 | -- 158 | {-# INLINEABLE getAttribute #-} 159 | getAttribute :: PeerAttribute -> Device -> Device -> IO Int 160 | #if CUDA_VERSION < 8000 161 | getAttribute _ _ _ = requireSDK 'getAttribute 8.0 162 | #else 163 | getAttribute attrib src dst = resultIfOk =<< cuDeviceGetP2PAttribute attrib src dst 164 | 165 | {-# INLINE cuDeviceGetP2PAttribute #-} 166 | {# fun unsafe cuDeviceGetP2PAttribute 167 | { alloca- `Int' peekIntConv* 168 | , cFromEnum `PeerAttribute' 169 | , useDevice `Device' 170 | , useDevice `Device' 171 | } -> `Status' cToEnum #} 172 | #endif 173 | 174 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Context/Primary.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE ForeignFunctionInterface #-} 4 | {-# LANGUAGE TemplateHaskell #-} 5 | -------------------------------------------------------------------------------- 6 | -- | 7 | -- Module : Foreign.CUDA.Driver.Context.Primary 8 | -- Copyright : [2009..2023] Trevor L. McDonell 9 | -- License : BSD 10 | -- 11 | -- Primary context management for low-level driver interface. The primary 12 | -- context is unique per device and shared with the Runtime API. This 13 | -- allows integration with other libraries using CUDA.
14 | -- 15 | -- Since: CUDA-7.0 16 | -- 17 | -------------------------------------------------------------------------------- 18 | 19 | module Foreign.CUDA.Driver.Context.Primary ( 20 | 21 | status, setup, reset, retain, release, 22 | 23 | ) where 24 | 25 | #include "cbits/stubs.h" 26 | {# context lib="cuda" #} 27 | 28 | -- Friends 29 | import Foreign.CUDA.Driver.Context.Base 30 | import Foreign.CUDA.Driver.Device 31 | import Foreign.CUDA.Driver.Error 32 | import Foreign.CUDA.Internal.C2HS 33 | 34 | -- System 35 | import Control.Exception 36 | import Control.Monad 37 | import Foreign 38 | import Foreign.C 39 | 40 | 41 | -------------------------------------------------------------------------------- 42 | -- Primary context management 43 | -------------------------------------------------------------------------------- 44 | 45 | 46 | -- | 47 | -- Get the status of the primary context. Returns whether the current 48 | -- context is active, and the flags it was (or will be) created with. 49 | -- 50 | -- Requires CUDA-7.0. 51 | -- 52 | -- 53 | -- 54 | {-# INLINEABLE status #-} 55 | status :: Device -> IO (Bool, [ContextFlag]) 56 | #if CUDA_VERSION < 7000 57 | status _ = requireSDK 'status 7.0 58 | #else 59 | status !dev = 60 | cuDevicePrimaryCtxGetState dev >>= \(rv, !flags, !active) -> 61 | case rv of 62 | Success -> return (active, flags) 63 | _ -> throwIO (ExitCode rv) 64 | 65 | {# fun unsafe cuDevicePrimaryCtxGetState 66 | { useDevice `Device' 67 | , alloca- `[ContextFlag]' peekFlags* 68 | , alloca- `Bool' peekBool* 69 | } -> `Status' cToEnum #} 70 | where 71 | peekFlags = liftM extractBitMasks . peek 72 | #endif 73 | 74 | 75 | -- | 76 | -- Specify the flags that the primary context should be created with. Note 77 | -- that this is an error if the primary context is already active. 78 | -- 79 | -- Requires CUDA-7.0. 
80 | -- 81 | -- 82 | -- 83 | {-# INLINEABLE setup #-} 84 | setup :: Device -> [ContextFlag] -> IO () 85 | #if CUDA_VERSION < 7000 86 | setup _ _ = requireSDK 'setup 7.0 87 | #else 88 | setup !dev !flags = nothingIfOk =<< cuDevicePrimaryCtxSetFlags dev flags 89 | 90 | {-# INLINE cuDevicePrimaryCtxSetFlags #-} 91 | {# fun unsafe cuDevicePrimaryCtxSetFlags 92 | { useDevice `Device' 93 | , combineBitMasks `[ContextFlag]' 94 | } -> `Status' cToEnum #} 95 | #endif 96 | 97 | 98 | -- | 99 | -- Destroy all allocations and reset all state on the primary context of 100 | -- the given device in the current process. Requires cuda-7.0 101 | -- 102 | -- Requires CUDA-7.0. 103 | -- 104 | -- 105 | -- 106 | {-# INLINEABLE reset #-} 107 | reset :: Device -> IO () 108 | #if CUDA_VERSION < 7000 109 | reset _ = requireSDK 'reset 7.0 110 | #else 111 | reset !dev = nothingIfOk =<< cuDevicePrimaryCtxReset dev 112 | 113 | {-# INLINE cuDevicePrimaryCtxReset #-} 114 | {# fun unsafe cuDevicePrimaryCtxReset 115 | { useDevice `Device' } -> `Status' cToEnum #} 116 | #endif 117 | 118 | 119 | -- | 120 | -- Release the primary context on the given device. If there are no more 121 | -- references to the primary context it will be destroyed, regardless of 122 | -- how many threads it is current to. 123 | -- 124 | -- Unlike 'Foreign.CUDA.Driver.Context.Base.pop' this does not pop the 125 | -- context from the stack in any circumstances. 126 | -- 127 | -- Requires CUDA-7.0. 
128 | -- 129 | -- 130 | -- 131 | {-# INLINEABLE release #-} 132 | release :: Device -> IO () 133 | #if CUDA_VERSION < 7000 134 | release _ = requireSDK 'release 7.0 135 | #else 136 | release !dev = nothingIfOk =<< cuDevicePrimaryCtxRelease dev 137 | 138 | {-# INLINE cuDevicePrimaryCtxRelease #-} 139 | {# fun unsafe cuDevicePrimaryCtxRelease 140 | { useDevice `Device' } -> `Status' cToEnum #} 141 | #endif 142 | 143 | 144 | -- | 145 | -- Retain the primary context for the given device, creating it if 146 | -- necessary, and increasing its usage count. The caller must call 147 | -- 'release' when done using the context. Unlike 148 | -- 'Foreign.CUDA.Driver.Context.Base.create' the newly retained context is 149 | -- not pushed onto the stack. 150 | -- 151 | -- Requires CUDA-7.0. 152 | -- 153 | -- 154 | -- 155 | {-# INLINEABLE retain #-} 156 | retain :: Device -> IO Context 157 | #if CUDA_VERSION < 7000 158 | retain _ = requireSDK 'retain 7.0 159 | #else 160 | retain !dev = resultIfOk =<< cuDevicePrimaryCtxRetain dev 161 | 162 | {-# INLINE cuDevicePrimaryCtxRetain #-} 163 | {# fun unsafe cuDevicePrimaryCtxRetain 164 | { alloca- `Context' peekCtx* 165 | , useDevice `Device' 166 | } -> `Status' cToEnum #} 167 | where 168 | peekCtx = liftM Context . peek 169 | #endif 170 | 171 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Graph/Base.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE EmptyDataDecls #-} 4 | {-# LANGUAGE ForeignFunctionInterface #-} 5 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 6 | {-# LANGUAGE TemplateHaskell #-} 7 | #ifdef USE_EMPTY_CASE 8 | {-# LANGUAGE EmptyCase #-} 9 | #endif 10 | -------------------------------------------------------------------------------- 11 | -- | 12 | -- Module : Foreign.CUDA.Driver.Graph.Base 13 | -- Copyright : [2018..2023] Trevor L. 
McDonell 14 | -- License : BSD 15 | -- 16 | -- Graph execution functions for the low-level driver interface 17 | -- 18 | -- Requires CUDA-10 19 | -- 20 | -------------------------------------------------------------------------------- 21 | 22 | module Foreign.CUDA.Driver.Graph.Base 23 | where 24 | 25 | #include "cbits/stubs.h" 26 | {# context lib="cuda" #} 27 | 28 | import Foreign.Storable 29 | import Foreign.Ptr 30 | 31 | 32 | -------------------------------------------------------------------------------- 33 | -- Data Types 34 | -------------------------------------------------------------------------------- 35 | 36 | #if CUDA_VERSION < 10000 37 | data Graph 38 | #else 39 | newtype Graph = Graph { useGraph :: {# type CUgraph #}} 40 | deriving (Eq, Show) 41 | #endif 42 | 43 | data GraphFlag 44 | instance Enum GraphFlag where 45 | #ifdef USE_EMPTY_CASE 46 | toEnum x = error ("GraphFlag.toEnum: Cannot match " ++ show x) 47 | fromEnum x = case x of {} 48 | #endif 49 | 50 | #if CUDA_VERSION < 10000 51 | data Node 52 | data NodeType 53 | 54 | instance Enum NodeType where 55 | #ifdef USE_EMPTY_CASE 56 | toEnum x = error ("NodeType.toEnum: Cannot match " ++ show x) 57 | fromEnum x = case x of {} 58 | #endif 59 | #else 60 | newtype Node = Node { useNode :: {# type CUgraphNode #}} 61 | deriving (Eq, Show, Storable) 62 | 63 | {# enum CUgraphNodeType as NodeType 64 | { underscoreToCase 65 | , CU_GRAPH_NODE_TYPE_GRAPH as Subgraph 66 | } 67 | with prefix="CU_GRAPH_NODE_TYPE" deriving (Eq, Show, Bounded) #} 68 | #endif 69 | 70 | 71 | #if CUDA_VERSION < 10000 72 | data Executable 73 | #else 74 | newtype Executable = Executable { useExecutable :: {# type CUgraphExec #}} 75 | deriving (Eq, Show) 76 | #endif 77 | 78 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Graph/Capture.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | 
{-# LANGUAGE EmptyDataDecls #-} 4 | {-# LANGUAGE ForeignFunctionInterface #-} 5 | {-# LANGUAGE TemplateHaskell #-} 6 | #ifdef USE_EMPTY_CASE 7 | {-# LANGUAGE EmptyCase #-} 8 | #endif 9 | -------------------------------------------------------------------------------- 10 | -- | 11 | -- Module : Foreign.CUDA.Driver.Graph.Capture 12 | -- Copyright : [2018..2023] Trevor L. McDonell 13 | -- License : BSD 14 | -- 15 | -- Requires CUDA-10 16 | -- 17 | -------------------------------------------------------------------------------- 18 | 19 | module Foreign.CUDA.Driver.Graph.Capture ( 20 | 21 | Status(..), Mode(..), 22 | start, stop, status, info, mode, 23 | 24 | ) where 25 | 26 | #include "cbits/stubs.h" 27 | {# context lib="cuda" #} 28 | 29 | import Foreign.CUDA.Driver.Error hiding ( Status ) 30 | import Foreign.CUDA.Driver.Graph.Base 31 | import Foreign.CUDA.Driver.Stream 32 | import Foreign.CUDA.Internal.C2HS 33 | 34 | import Control.Monad ( liftM ) 35 | 36 | import Foreign 37 | import Foreign.C 38 | 39 | 40 | #if CUDA_VERSION < 10000 41 | data Status 42 | 43 | instance Enum Status where 44 | #ifdef USE_EMPTY_CASE 45 | toEnum x = error ("Status.toEnum: Cannot match " ++ show x) 46 | fromEnum x = case x of {} 47 | #endif 48 | 49 | #else 50 | {# enum CUstreamCaptureStatus as Status 51 | { underscoreToCase } 52 | with prefix="CU_STREAM_CAPTURE_STATUS" deriving (Eq, Show, Bounded) #} 53 | #endif 54 | 55 | #if CUDA_VERSION < 10010 56 | data Mode 57 | 58 | instance Enum Mode where 59 | #ifdef USE_EMPTY_CASE 60 | toEnum x = error ("Mode.toEnum: Cannot match " ++ show x) 61 | fromEnum x = case x of {} 62 | #endif 63 | 64 | #else 65 | {# enum CUstreamCaptureMode as Mode 66 | { underscoreToCase } 67 | with prefix="CU_STREAM_CAPTURE_MODE" deriving (Eq, Show, Bounded) #} 68 | #endif 69 | 70 | 71 | -------------------------------------------------------------------------------- 72 | -- Graph capture 73 | 
-------------------------------------------------------------------------------- 73 | 74 | 75 | -- | Begin graph capture on a stream 76 | -- 77 | -- Requires CUDA-10.0 78 | -- 79 | -- 80 | -- 81 | -- @since 0.10.0.0 82 | -- 83 | #if CUDA_VERSION < 10000 84 | start :: Stream -> Mode -> IO () 85 | start _ _ = requireSDK 'start 10.0 86 | #elif CUDA_VERSION < 10010 87 | start :: Stream -> Mode -> IO () 88 | start s _ = cuStreamBeginCapture s 89 | where 90 | {# fun unsafe cuStreamBeginCapture 91 | { useStream `Stream' 92 | } 93 | -> `()' checkStatus*- #} 94 | #else 95 | {# fun unsafe cuStreamBeginCapture as start 96 | { useStream `Stream' 97 | , `Mode' 98 | } 99 | -> `()' checkStatus*- #} 100 | #endif 101 | 102 | 103 | -- | End graph capture on a stream 104 | -- 105 | -- Requires CUDA-10.0 106 | -- 107 | -- 108 | -- 109 | -- @since 0.10.0.0 110 | -- 111 | #if CUDA_VERSION < 10000 112 | stop :: Stream -> IO Graph 113 | stop _ = requireSDK 'stop 10.0 114 | #else 115 | {# fun unsafe cuStreamEndCapture as stop 116 | { useStream `Stream' 117 | , alloca- `Graph' peekGraph* 118 | } 119 | -> `()' checkStatus*- #} 120 | #endif 121 | 122 | 123 | -- | Return a stream's capture status 124 | -- 125 | -- Requires CUDA-10.0 126 | -- 127 | -- 128 | -- 129 | -- @since 0.10.0.0 130 | -- 131 | #if CUDA_VERSION < 10000 132 | status :: Stream -> IO Status 133 | status _ = requireSDK 'status 10.0 134 | #else 135 | {# fun unsafe cuStreamIsCapturing as status 136 | { useStream `Stream' 137 | , alloca- `Status' peekEnum* 138 | } 139 | -> `()' checkStatus*- #} 140 | #endif 141 | 142 | 143 | -- | Query the capture status of a stream and get an id for the capture 144 | -- sequence, which is unique over the lifetime of the process.
145 | -- 146 | -- Requires CUDA-10.1 147 | -- 148 | -- 149 | -- 150 | -- @since 0.10.1.0 151 | -- 152 | #if CUDA_VERSION < 10010 153 | info :: Stream -> IO (Status, Int64) 154 | info _ = requireSDK 'info 10.1 155 | #else 156 | {# fun unsafe cuStreamGetCaptureInfo as info 157 | { useStream `Stream' 158 | , alloca- `Status' peekEnum* 159 | , alloca- `Int64' peekIntConv* 160 | } 161 | -> `()' checkStatus*- #} 162 | #endif 163 | 164 | 165 | -- | Set the stream capture interaction mode for this thread. Return the previous value. 166 | -- 167 | -- Requires CUDA-10.1 168 | -- 169 | -- 170 | -- 171 | -- @since 0.10.1.0 172 | -- 173 | #if CUDA_VERSION < 10010 174 | mode :: Mode -> IO Mode 175 | mode _ = requireSDK 'mode 10.1 176 | #else 177 | {# fun unsafe cuThreadExchangeStreamCaptureMode as mode 178 | { withEnum* `Mode' peekEnum* 179 | } 180 | -> `()' checkStatus*- #} 181 | #endif 182 | 183 | 184 | -------------------------------------------------------------------------------- 185 | -- Internal 186 | -------------------------------------------------------------------------------- 187 | 188 | #if CUDA_VERSION >= 10000 189 | {-# INLINE peekGraph #-} 190 | peekGraph :: Ptr {# type CUgraph #} -> IO Graph 191 | peekGraph = liftM Graph . peek 192 | #endif 193 | 194 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/IPC/Event.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE ForeignFunctionInterface #-} 4 | {-# LANGUAGE TemplateHaskell #-} 5 | -------------------------------------------------------------------------------- 6 | -- | 7 | -- Module : Foreign.CUDA.Driver.IPC.Event 8 | -- Copyright : [2009..2023] Trevor L. McDonell 9 | -- License : BSD 10 | -- 11 | -- IPC event management for low-level driver interface.
12 | -- 13 | -- Restricted to devices which support unified addressing on Linux 14 | -- operating systems. 15 | -- 16 | -- Since CUDA-4.1. 17 | -- 18 | -------------------------------------------------------------------------------- 19 | 20 | module Foreign.CUDA.Driver.IPC.Event ( 21 | 22 | IPCEvent, 23 | export, open, 24 | 25 | ) where 26 | 27 | #include "cbits/stubs.h" 28 | {# context lib="cuda" #} 29 | 30 | -- Friends 31 | import Foreign.CUDA.Driver.Error 32 | import Foreign.CUDA.Driver.Event 33 | import Foreign.CUDA.Internal.C2HS 34 | 35 | -- System 36 | import Control.Monad 37 | import Prelude 38 | 39 | import Foreign.C 40 | import Foreign.Ptr 41 | import Foreign.ForeignPtr 42 | import Foreign.Marshal 43 | import Foreign.Storable 44 | 45 | 46 | -------------------------------------------------------------------------------- 47 | -- Data Types 48 | -------------------------------------------------------------------------------- 49 | 50 | -- | 51 | -- A CUDA inter-process event handle. 52 | -- 53 | newtype IPCEvent = IPCEvent { useIPCEvent :: IPCEventHandle } 54 | deriving (Eq, Show) 55 | 56 | 57 | -------------------------------------------------------------------------------- 58 | -- IPC event management 59 | -------------------------------------------------------------------------------- 60 | 61 | -- | 62 | -- Create an inter-process event handle for a previously allocated event. 63 | -- The event must be created with the 'Interprocess' and 'DisableTiming' 64 | -- event flags. The returned handle may then be sent to another process and 65 | -- 'open'ed to allow efficient hardware synchronisation between GPU work in 66 | -- other processes. 67 | -- 68 | -- After the event has been opened in the importing process, 'record', 69 | -- 'block', 'wait', 'query' may be used in either process. 70 | -- 71 | -- Performing operations on the imported event after the event has been 72 | -- 'destroy'ed in the exporting process is undefined. 
73 | -- 74 | -- Requires CUDA-4.1. 75 | -- 76 | -- 77 | -- 78 | {-# INLINEABLE export #-} 79 | export :: Event -> IO IPCEvent 80 | #if CUDA_VERSION < 4010 81 | export _ = requireSDK 'export 4.1 82 | #else 83 | export !ev = do 84 | h <- newIPCEventHandle 85 | r <- cuIpcGetEventHandle h ev 86 | resultIfOk (r, IPCEvent h) 87 | 88 | {-# INLINE cuIpcGetEventHandle #-} 89 | {# fun unsafe cuIpcGetEventHandle 90 | { withForeignPtr* `IPCEventHandle' 91 | , useEvent `Event' 92 | } 93 | -> `Status' cToEnum #} 94 | #endif 95 | 96 | 97 | -- | 98 | -- Open an inter-process event handle for use in the current process, 99 | -- returning an event that can be used in the current process and behaving 100 | -- as a locally created event with the 'DisableTiming' flag specified. 101 | -- 102 | -- The event must be freed with 'destroy'. Performing operations on the 103 | -- imported event after the exported event has been 'destroy'ed in the 104 | -- exporting process is undefined. 105 | -- 106 | -- Requires CUDA-4.1. 107 | -- 108 | -- 109 | -- 110 | {-# INLINEABLE open #-} 111 | open :: IPCEvent -> IO Event 112 | #if CUDA_VERSION < 4010 113 | open _ = requireSDK 'open 4.1 114 | #else 115 | open !ev = resultIfOk =<< cuIpcOpenEventHandle (useIPCEvent ev) 116 | 117 | {-# INLINE cuIpcOpenEventHandle #-} 118 | {# fun unsafe cuIpcOpenEventHandle 119 | { alloca- `Event' peekEvent* 120 | , withForeignPtr* `IPCEventHandle' 121 | } 122 | -> `Status' cToEnum #} 123 | where 124 | peekEvent = liftM Event .
peek 125 | #endif 126 | 127 | 128 | -------------------------------------------------------------------------------- 129 | -- Internal 130 | -------------------------------------------------------------------------------- 131 | 132 | type IPCEventHandle = ForeignPtr () 133 | 134 | newIPCEventHandle :: IO IPCEventHandle 135 | #if CUDA_VERSION < 4010 136 | newIPCEventHandle = requireSDK 'newIPCEventHandle 4.1 137 | #else 138 | newIPCEventHandle = mallocForeignPtrBytes {#sizeof CUipcEventHandle#} 139 | #endif 140 | 141 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/IPC/Marshal.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE EmptyDataDecls #-} 4 | {-# LANGUAGE ForeignFunctionInterface #-} 5 | {-# LANGUAGE TemplateHaskell #-} 6 | -------------------------------------------------------------------------------- 7 | -- | 8 | -- Module : Foreign.CUDA.Driver.IPC.Marshal 9 | -- Copyright : [2009..2023] Trevor L. McDonell 10 | -- License : BSD 11 | -- 12 | -- IPC memory management for low-level driver interface. 13 | -- 14 | -- Restricted to devices which support unified addressing on Linux 15 | -- operating systems. 16 | -- 17 | -- Since CUDA-4.0. 
18 | -- 19 | -------------------------------------------------------------------------------- 20 | 21 | module Foreign.CUDA.Driver.IPC.Marshal ( 22 | 23 | -- ** IPC memory management 24 | IPCDevicePtr, IPCFlag(..), 25 | export, open, close, 26 | 27 | ) where 28 | 29 | #include "cbits/stubs.h" 30 | {# context lib="cuda" #} 31 | 32 | -- Friends 33 | import Foreign.CUDA.Ptr 34 | import Foreign.CUDA.Driver.Error 35 | import Foreign.CUDA.Internal.C2HS 36 | import Foreign.CUDA.Driver.Marshal 37 | 38 | -- System 39 | import Control.Monad 40 | import Prelude 41 | 42 | import Foreign.C 43 | import Foreign.Ptr 44 | import Foreign.ForeignPtr 45 | import Foreign.Marshal 46 | 47 | 48 | -------------------------------------------------------------------------------- 49 | -- Data Types 50 | -------------------------------------------------------------------------------- 51 | 52 | -- | 53 | -- A CUDA memory handle used for inter-process communication. 54 | -- 55 | newtype IPCDevicePtr a = IPCDevicePtr { useIPCDevicePtr :: IPCMemHandle } 56 | deriving (Eq, Show) 57 | 58 | 59 | -- | 60 | -- Flags for controlling IPC memory access 61 | -- 62 | #if CUDA_VERSION < 4010 63 | data IPCFlag 64 | #else 65 | {# enum CUipcMem_flags as IPCFlag 66 | { underscoreToCase } 67 | with prefix="CU_IPC_MEM" deriving (Eq, Show, Bounded) #} 68 | #endif 69 | 70 | 71 | -------------------------------------------------------------------------------- 72 | -- IPC memory management 73 | -------------------------------------------------------------------------------- 74 | 75 | -- | 76 | -- Create an inter-process memory handle for an existing device memory 77 | -- allocation. The handle can then be sent to another process and made 78 | -- available to that process via 'open'. 79 | -- 80 | -- Requires CUDA-4.1. 
81 | -- 82 | -- 83 | -- 84 | {-# INLINEABLE export #-} 85 | export :: DevicePtr a -> IO (IPCDevicePtr a) 86 | #if CUDA_VERSION < 4010 87 | export _ = requireSDK 'export 4.1 88 | #else 89 | export !dptr = do 90 | h <- newIPCMemHandle 91 | r <- cuIpcGetMemHandle h dptr 92 | resultIfOk (r, IPCDevicePtr h) 93 | 94 | {-# INLINE cuIpcGetMemHandle #-} 95 | {# fun unsafe cuIpcGetMemHandle 96 | { withForeignPtr* `IPCMemHandle' 97 | , useDeviceHandle `DevicePtr a' 98 | } 99 | -> `Status' cToEnum #} 100 | #endif 101 | 102 | 103 | -- | 104 | -- Open an inter-process memory handle exported from another process, 105 | -- returning a device pointer usable in the current process. 106 | -- 107 | -- Maps memory exported by another process with 'export' into the current 108 | -- device address space. For contexts on different devices, 'open' can 109 | -- attempt to enable peer access if the user called 110 | -- 'Foreign.CUDA.Driver.Context.Peer.add', and is controlled by the 111 | -- 'LazyEnablePeerAccess' flag. 112 | -- 113 | -- Each handle from a given device and context may only be 'open'ed by one 114 | -- context per device per other process. Memory returned by 'open' must be 115 | -- freed via 'close'. 116 | -- 117 | -- Requires CUDA-4.1. 118 | -- 119 | -- 120 | -- 121 | {-# INLINEABLE open #-} 122 | open :: IPCDevicePtr a -> [IPCFlag] -> IO (DevicePtr a) 123 | #if CUDA_VERSION < 4010 124 | open _ _ = requireSDK 'open 4.1 125 | #else 126 | open !hdl !flags = resultIfOk =<< cuIpcOpenMemHandle (useIPCDevicePtr hdl) flags 127 | 128 | {-# INLINE cuIpcOpenMemHandle #-} 129 | {# fun unsafe cuIpcOpenMemHandle 130 | { alloca- `DevicePtr a' peekDeviceHandle* 131 | , withForeignPtr* `IPCMemHandle' 132 | , combineBitMasks `[IPCFlag]' 133 | } 134 | -> `Status' cToEnum #} 135 | #endif 136 | 137 | 138 | -- | 139 | -- Close and unmap memory returned by 'open'. The original allocation in 140 | -- the exporting process as well as imported mappings in other processes 141 | -- are unaffected.
142 | -- 143 | -- Any resources used to enable peer access will be freed if this is the 144 | -- last mapping using them. 145 | -- 146 | -- Requires CUDA-4.1. 147 | -- 148 | -- 149 | -- 150 | {-# INLINEABLE close #-} 151 | close :: DevicePtr a -> IO () 152 | #if CUDA_VERSION < 4010 153 | close _ = requireSDK 'close 4.1 154 | #else 155 | close !dptr = nothingIfOk =<< cuIpcCloseMemHandle dptr 156 | 157 | {-# INLINE cuIpcCloseMemHandle #-} 158 | {# fun unsafe cuIpcCloseMemHandle 159 | { useDeviceHandle `DevicePtr a' 160 | } 161 | -> `Status' cToEnum #} 162 | #endif 163 | 164 | 165 | -------------------------------------------------------------------------------- 166 | -- Internal 167 | -------------------------------------------------------------------------------- 168 | 169 | type IPCMemHandle = ForeignPtr () 170 | 171 | newIPCMemHandle :: IO IPCMemHandle 172 | #if CUDA_VERSION < 4010 173 | newIPCMemHandle = requireSDK 'newIPCMemHandle 4.1 174 | #else 175 | newIPCMemHandle = mallocForeignPtrBytes {#sizeof CUipcMemHandle#} 176 | #endif 177 | 178 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Module.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- | 3 | -- Module : Foreign.CUDA.Driver.Module 4 | -- Copyright : [2009..2023] Trevor L. 
McDonell 5 | -- License : BSD 6 | -- 7 | -- Module management for low-level driver interface 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | module Foreign.CUDA.Driver.Module ( 12 | 13 | module Foreign.CUDA.Driver.Module.Base, 14 | module Foreign.CUDA.Driver.Module.Query, 15 | 16 | ) where 17 | 18 | import Foreign.CUDA.Driver.Module.Base hiding ( JITOptionInternal(..), useModule, jitOptionUnpack, jitTargetOfCompute ) 19 | import Foreign.CUDA.Driver.Module.Query 20 | 21 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Module/Query.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE ForeignFunctionInterface #-} 4 | {-# LANGUAGE MagicHash #-} 5 | {-# LANGUAGE UnboxedTuples #-} 6 | -------------------------------------------------------------------------------- 7 | -- | 8 | -- Module : Foreign.CUDA.Driver.Module.Query 9 | -- Copyright : [2009..2023] Trevor L. 
McDonell 10 | -- License : BSD 11 | -- 12 | -- Querying module attributes for low-level driver interface 13 | -- 14 | -------------------------------------------------------------------------------- 15 | 16 | module Foreign.CUDA.Driver.Module.Query ( 17 | 18 | -- ** Querying module inhabitants 19 | getFun, getPtr, getTex, 20 | 21 | ) where 22 | 23 | #include "cbits/stubs.h" 24 | {# context lib="cuda" #} 25 | 26 | -- Friends 27 | import Foreign.CUDA.Driver.Error 28 | import Foreign.CUDA.Driver.Exec 29 | import Foreign.CUDA.Driver.Marshal ( peekDeviceHandle ) 30 | import Foreign.CUDA.Driver.Module.Base 31 | import Foreign.CUDA.Driver.Texture 32 | import Foreign.CUDA.Internal.C2HS 33 | import Foreign.CUDA.Ptr 34 | 35 | -- System 36 | import Foreign 37 | import Foreign.C 38 | import Control.Exception ( throwIO ) 39 | import Control.Monad ( liftM ) 40 | import Data.ByteString.Short ( ShortByteString ) 41 | import qualified Data.ByteString.Short as BS 42 | import qualified Data.ByteString.Short.Internal as BI 43 | import qualified Data.ByteString.Internal as BI 44 | import Prelude as P 45 | 46 | import GHC.Exts 47 | import GHC.Base ( IO(..) ) 48 | 49 | 50 | -------------------------------------------------------------------------------- 51 | -- Querying module attributes 52 | -------------------------------------------------------------------------------- 53 | 54 | -- | 55 | -- Returns a function handle. 56 | -- 57 | -- 58 | -- 59 | {-# INLINEABLE getFun #-} 60 | getFun :: Module -> ShortByteString -> IO Fun 61 | getFun !mdl !fn = resultIfFound "function" fn =<< cuModuleGetFunction mdl fn 62 | 63 | {-# INLINE cuModuleGetFunction #-} 64 | {# fun unsafe cuModuleGetFunction 65 | { alloca- `Fun' peekFun* 66 | , useModule `Module' 67 | , useAsCString* `ShortByteString' 68 | } 69 | -> `Status' cToEnum #} 70 | where 71 | peekFun = liftM Fun . peek 72 | 73 | 74 | -- | 75 | -- Return a global pointer, and size of the global (in bytes). 
76 | -- 77 | -- 78 | -- 79 | {-# INLINEABLE getPtr #-} 80 | getPtr :: Module -> ShortByteString -> IO (DevicePtr a, Int) 81 | getPtr !mdl !name = do 82 | (!status,!dptr,!bytes) <- cuModuleGetGlobal mdl name 83 | resultIfFound "global" name (status,(dptr,bytes)) 84 | 85 | {-# INLINE cuModuleGetGlobal #-} 86 | {# fun unsafe cuModuleGetGlobal 87 | { alloca- `DevicePtr a' peekDeviceHandle* 88 | , alloca- `Int' peekIntConv* 89 | , useModule `Module' 90 | , useAsCString* `ShortByteString' 91 | } 92 | -> `Status' cToEnum #} 93 | 94 | 95 | -- | 96 | -- Return a handle to a texture reference. This texture reference handle 97 | -- should not be destroyed, as the texture will be destroyed automatically 98 | -- when the module is unloaded. 99 | -- 100 | -- 101 | -- 102 | {-# INLINEABLE getTex #-} 103 | getTex :: Module -> ShortByteString -> IO Texture 104 | getTex !mdl !name = resultIfFound "texture" name =<< cuModuleGetTexRef mdl name 105 | 106 | {-# INLINE cuModuleGetTexRef #-} 107 | {# fun unsafe cuModuleGetTexRef 108 | { alloca- `Texture' peekTex* 109 | , useModule `Module' 110 | , useAsCString* `ShortByteString' 111 | } 112 | -> `Status' cToEnum #} 113 | 114 | 115 | -------------------------------------------------------------------------------- 116 | -- Internal 117 | -------------------------------------------------------------------------------- 118 | 119 | {-# INLINE resultIfFound #-} 120 | resultIfFound :: String -> ShortByteString -> (Status, a) -> IO a 121 | resultIfFound kind name (!status,!result) = 122 | case status of 123 | Success -> return result 124 | NotFound -> cudaErrorIO (kind ++ ' ' : describe status ++ ": " ++ unpack name) 125 | _ -> throwIO (ExitCode status) 126 | 127 | 128 | -- Utilities 129 | -- --------- 130 | 131 | -- [Short]ByteStrings are not null-terminated, so can't be passed directly to C. 
132 | -- 133 | -- unsafeUseAsCString :: ShortByteString -> CString 134 | -- unsafeUseAsCString (BI.SBS ba#) = Ptr (byteArrayContents# ba#) 135 | 136 | {-# INLINE useAsCString #-} 137 | useAsCString :: ShortByteString -> (CString -> IO a) -> IO a 138 | useAsCString (BI.SBS ba#) action = IO $ \s0 -> 139 | case sizeofByteArray# ba# of { n# -> 140 | case newPinnedByteArray# (n# +# 1#) s0 of { (# s1, mba# #) -> 141 | case byteArrayContents# (unsafeCoerce# mba#) of { addr# -> 142 | case copyByteArrayToAddr# ba# 0# addr# n# s1 of { s2 -> 143 | case writeWord8OffAddr# addr# n# (wordToWord8# 0##) s2 of { s3 -> 144 | case action (Ptr addr#) of { IO action' -> 145 | case action' s3 of { (# s4, r #) -> 146 | case touch# mba# s4 of { s5 -> 147 | (# s5, r #) 148 | }}}}}}}} 149 | 150 | 151 | {-# INLINE unpack #-} 152 | unpack :: ShortByteString -> [Char] 153 | unpack = P.map BI.w2c . BS.unpack 154 | 155 | #if __GLASGOW_HASKELL__ < 902 156 | {-# INLINE wordToWord8# #-} 157 | wordToWord8# :: Word# -> Word# 158 | wordToWord8# x = x 159 | #endif 160 | 161 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Profiler.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | -------------------------------------------------------------------------------- 3 | -- | 4 | -- Module : Foreign.CUDA.Driver.Profiler 5 | -- Copyright : [2009..2023] Trevor L. 
McDonell 6 | -- License : BSD 7 | -- 8 | -- Profiler control for low-level driver interface 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module Foreign.CUDA.Driver.Profiler ( 13 | 14 | OutputMode(..), 15 | initialise, 16 | start, stop, 17 | 18 | ) where 19 | 20 | #include "cbits/stubs.h" 21 | {# context lib="cuda" #} 22 | 23 | -- friends 24 | import Foreign.CUDA.Driver.Error 25 | import Foreign.CUDA.Internal.C2HS 26 | 27 | -- system 28 | import Foreign 29 | import Foreign.C 30 | 31 | 32 | -- | Profiler output mode 33 | -- 34 | {# enum CUoutput_mode as OutputMode 35 | { underscoreToCase 36 | , CU_OUT_CSV as CSV } 37 | with prefix="CU_OUT" deriving (Eq, Show) #} 38 | 39 | 40 | -- | Initialise the CUDA profiler. 41 | -- 42 | -- The configuration file is used to specify profiling options and profiling 43 | -- counters. Refer to the "Compute Command Line Profiler User Guide" for 44 | -- supported profiler options and counters. 45 | -- 46 | -- Note that the CUDA profiler can not be initialised with this function if 47 | -- another profiling tool is already active. 48 | -- 49 | -- 50 | -- 51 | {-# INLINEABLE initialise #-} 52 | initialise 53 | :: FilePath -- ^ configuration file that itemises which counters and/or options to profile 54 | -> FilePath -- ^ output file where profiling results will be stored 55 | -> OutputMode 56 | -> IO () 57 | initialise config output mode 58 | = nothingIfOk =<< cuProfilerInitialize config output mode 59 | 60 | {-# INLINE cuProfilerInitialize #-} 61 | {# fun unsafe cuProfilerInitialize 62 | { `String' 63 | , `String' 64 | , cFromEnum `OutputMode' 65 | } 66 | -> `Status' cToEnum #} 67 | 68 | 69 | -- | Begin profiling collection by the active profiling tool for the current 70 | -- context. If profiling is already enabled, then this has no effect. 
71 | -- 72 | -- 'start' and 'stop' can be used to programatically control profiling 73 | -- granularity, by allowing profiling to be done only on selected pieces of 74 | -- code. 75 | -- 76 | -- 77 | -- 78 | {-# INLINEABLE start #-} 79 | start :: IO () 80 | start = nothingIfOk =<< cuProfilerStart 81 | 82 | {-# INLINE cuProfilerStart #-} 83 | {# fun unsafe cuProfilerStart 84 | { } -> `Status' cToEnum #} 85 | 86 | 87 | -- | Stop profiling collection by the active profiling tool for the current 88 | -- context, and force all pending profiler events to be written to the output 89 | -- file. If profiling is already inactive, this has no effect. 90 | -- 91 | -- 92 | -- 93 | {-# INLINEABLE stop #-} 94 | stop :: IO () 95 | stop = nothingIfOk =<< cuProfilerStop 96 | 97 | {-# INLINE cuProfilerStop #-} 98 | {# fun unsafe cuProfilerStop 99 | { } -> `Status' cToEnum #} 100 | 101 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Utils.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | -------------------------------------------------------------------------------- 3 | -- | 4 | -- Module : Foreign.CUDA.Driver.Utils 5 | -- Copyright : [2009..2023] Trevor L. McDonell 6 | -- License : BSD 7 | -- 8 | -- Utility functions 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module Foreign.CUDA.Driver.Utils ( 13 | 14 | driverVersion, 15 | libraryVersion, 16 | 17 | ) where 18 | 19 | #include "cbits/stubs.h" 20 | {# context lib="cuda" #} 21 | 22 | -- Friends 23 | import Foreign.CUDA.Driver.Error 24 | import Foreign.CUDA.Internal.C2HS 25 | 26 | -- System 27 | import Foreign 28 | import Foreign.C 29 | 30 | 31 | -- | 32 | -- Return the version number of the installed CUDA driver. 
33 | -- 34 | {-# INLINEABLE driverVersion #-} 35 | driverVersion :: IO Int 36 | driverVersion = resultIfOk =<< cuDriverGetVersion 37 | 38 | {-# INLINE cuDriverGetVersion #-} 39 | {# fun unsafe cuDriverGetVersion 40 | { alloca- `Int' peekIntConv* } -> `Status' cToEnum #} 41 | 42 | 43 | -- | 44 | -- Return the version number of the CUDA library (API) that this package was 45 | -- compiled against. 46 | -- 47 | {-# INLINEABLE libraryVersion #-} 48 | libraryVersion :: Int 49 | libraryVersion = {#const CUDA_VERSION #} 50 | 51 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Path.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | -------------------------------------------------------------------------------- 3 | -- | 4 | -- Module : Foreign.CUDA.Path 5 | -- Copyright : [2017..2023] Trevor L. McDonell 6 | -- License : BSD 7 | -- 8 | -------------------------------------------------------------------------------- 9 | 10 | module Foreign.CUDA.Path ( 11 | 12 | cudaInstallPath, 13 | cudaBinPath, cudaLibraryPath, cudaIncludePath, 14 | 15 | ) where 16 | 17 | import System.FilePath 18 | 19 | -- | The base path to the CUDA toolkit installation that this package was 20 | -- compiled against. 21 | -- 22 | cudaInstallPath :: FilePath 23 | cudaInstallPath = {#const CUDA_INSTALL_PATH#} 24 | 25 | -- | The path where the CUDA toolkit executables, such as @nvcc@ and @ptxas@, 26 | -- can be found. 
27 | -- 28 | cudaBinPath :: FilePath 29 | cudaBinPath = cudaInstallPath </> "bin" 30 | 31 | -- | The path where the CUDA libraries this package was linked against are 32 | -- located 33 | -- 34 | cudaLibraryPath :: FilePath 35 | cudaLibraryPath = {#const CUDA_LIBRARY_PATH#} 36 | 37 | -- | The path where the CUDA headers this package was built against are located 38 | -- 39 | cudaIncludePath :: FilePath 40 | cudaIncludePath = cudaInstallPath </> "include" 41 | 42 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Runtime.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- | 3 | -- Module    : Foreign.CUDA.Runtime 4 | -- Copyright : [2009..2023] Trevor L. McDonell 5 | -- License   : BSD 6 | -- 7 | -- Top level bindings to the C-for-CUDA runtime API 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | module Foreign.CUDA.Runtime ( 12 | 13 |   module Foreign.CUDA.Ptr, 14 |   module Foreign.CUDA.Runtime.Device, 15 |   module Foreign.CUDA.Runtime.Error, 16 |   module Foreign.CUDA.Runtime.Exec, 17 |   module Foreign.CUDA.Runtime.Marshal, 18 |   module Foreign.CUDA.Runtime.Utils 19 | 20 | ) where 21 | 22 | import Foreign.CUDA.Ptr 23 | import Foreign.CUDA.Runtime.Device 24 | import Foreign.CUDA.Runtime.Error 25 | import Foreign.CUDA.Runtime.Exec 26 | import Foreign.CUDA.Runtime.Marshal 27 | import Foreign.CUDA.Runtime.Utils 28 | 29 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Runtime/Error.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns             #-} 2 | {-# LANGUAGE DeriveDataTypeable       #-} 3 | {-# LANGUAGE ForeignFunctionInterface #-} 4 | -------------------------------------------------------------------------------- 5 | -- | 6 | -- Module    : 
Foreign.CUDA.Runtime.Error 7 | -- Copyright : [2009..2023] Trevor L. McDonell 8 | -- License : BSD 9 | -- 10 | -- Error handling functions 11 | -- 12 | -------------------------------------------------------------------------------- 13 | 14 | module Foreign.CUDA.Runtime.Error ( 15 | 16 | Status(..), CUDAException(..), 17 | 18 | cudaError, describe, requireSDK, 19 | resultIfOk, nothingIfOk, checkStatus, 20 | 21 | ) where 22 | 23 | -- Friends 24 | import Foreign.CUDA.Internal.C2HS 25 | import Text.Show.Describe 26 | 27 | -- System 28 | import Control.Exception 29 | import Data.Typeable 30 | import Foreign.C 31 | import Foreign.Ptr 32 | import Language.Haskell.TH 33 | import System.IO.Unsafe 34 | import Text.Printf 35 | 36 | #include "cbits/stubs.h" 37 | {# context lib="cudart" #} 38 | 39 | 40 | -------------------------------------------------------------------------------- 41 | -- Return Status 42 | -------------------------------------------------------------------------------- 43 | 44 | -- | 45 | -- Return codes from API functions 46 | -- 47 | {# enum cudaError as Status 48 | { cudaSuccess as Success } 49 | with prefix="cudaError" deriving (Eq, Show) #} 50 | 51 | -------------------------------------------------------------------------------- 52 | -- Exceptions 53 | -------------------------------------------------------------------------------- 54 | 55 | data CUDAException 56 | = ExitCode Status 57 | | UserError String 58 | deriving Typeable 59 | 60 | instance Exception CUDAException 61 | 62 | instance Show CUDAException where 63 | showsPrec _ (ExitCode s) = showString ("CUDA Exception: " ++ describe s) 64 | showsPrec _ (UserError s) = showString ("CUDA Exception: " ++ s) 65 | 66 | 67 | -- | 68 | -- Raise a 'CUDAException' in the IO Monad 69 | -- 70 | cudaError :: String -> IO a 71 | cudaError s = throwIO (UserError s) 72 | 73 | -- | 74 | -- A specially formatted error message 75 | -- 76 | requireSDK :: Name -> Double -> IO a 77 | requireSDK n v = cudaError $ 
printf "'%s' requires at least cuda-%3.1f\n" (show n) v 78 | 79 | 80 | -------------------------------------------------------------------------------- 81 | -- Helper Functions 82 | -------------------------------------------------------------------------------- 83 | 84 | -- | 85 | -- Return the descriptive string associated with a particular error code 86 | -- 87 | instance Describe Status where 88 | describe = cudaGetErrorString 89 | 90 | -- Logically, this must be a pure function, returning a pointer to a statically 91 | -- defined string constant. 92 | -- 93 | {# fun pure unsafe cudaGetErrorString 94 | { cFromEnum `Status' } -> `String' #} 95 | 96 | 97 | -- | 98 | -- Return the results of a function on successful execution, otherwise return 99 | -- the error string associated with the return code 100 | -- 101 | {-# INLINE resultIfOk #-} 102 | resultIfOk :: (Status, a) -> IO a 103 | resultIfOk (status, !result) = 104 | case status of 105 | Success -> return result 106 | _ -> throwIO (ExitCode status) 107 | 108 | 109 | -- | 110 | -- Return the error string associated with an unsuccessful return code, 111 | -- otherwise Nothing 112 | -- 113 | {-# INLINE nothingIfOk #-} 114 | nothingIfOk :: Status -> IO () 115 | nothingIfOk status = 116 | case status of 117 | Success -> return () 118 | _ -> throwIO (ExitCode status) 119 | 120 | {-# INLINE checkStatus #-} 121 | checkStatus :: CInt -> IO () 122 | checkStatus = nothingIfOk . 
cToEnum 123 | 124 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Runtime/Event.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE EmptyDataDecls #-} 4 | {-# LANGUAGE ForeignFunctionInterface #-} 5 | {-# LANGUAGE TemplateHaskell #-} 6 | -------------------------------------------------------------------------------- 7 | -- | 8 | -- Module : Foreign.CUDA.Driver.Event 9 | -- Copyright : [2009..2023] Trevor L. McDonell 10 | -- License : BSD 11 | -- 12 | -- Event management for C-for-CUDA runtime environment 13 | -- 14 | -------------------------------------------------------------------------------- 15 | 16 | module Foreign.CUDA.Runtime.Event ( 17 | 18 | -- * Event Management 19 | Event, EventFlag(..), WaitFlag, 20 | create, destroy, elapsedTime, query, record, wait, block 21 | 22 | ) where 23 | 24 | #include "cbits/stubs.h" 25 | {# context lib="cudart" #} 26 | 27 | -- Friends 28 | import Foreign.CUDA.Driver.Event ( Event(..), EventFlag(..), WaitFlag ) 29 | import Foreign.CUDA.Driver.Stream ( Stream(..), defaultStream ) 30 | import Foreign.CUDA.Internal.C2HS 31 | import Foreign.CUDA.Runtime.Error 32 | 33 | -- System 34 | import Foreign 35 | import Foreign.C 36 | import Control.Monad ( liftM ) 37 | import Control.Exception ( throwIO ) 38 | import Data.Maybe ( fromMaybe ) 39 | 40 | 41 | -------------------------------------------------------------------------------- 42 | -- Event management 43 | -------------------------------------------------------------------------------- 44 | 45 | -- | 46 | -- Create a new event 47 | -- 48 | {-# INLINEABLE create #-} 49 | create :: [EventFlag] -> IO Event 50 | create !flags = resultIfOk =<< cudaEventCreateWithFlags flags 51 | 52 | {-# INLINE cudaEventCreateWithFlags #-} 53 | {# fun unsafe cudaEventCreateWithFlags 54 | { alloca- `Event' peekEvt* 55 | , combineBitMasks 
`[EventFlag]' } -> `Status' cToEnum #} 56 | where peekEvt = liftM Event . peek 57 | 58 | 59 | -- | 60 | -- Destroy an event 61 | -- 62 | {-# INLINEABLE destroy #-} 63 | destroy :: Event -> IO () 64 | destroy !ev = nothingIfOk =<< cudaEventDestroy ev 65 | 66 | {-# INLINE cudaEventDestroy #-} 67 | {# fun unsafe cudaEventDestroy 68 | { useEvent `Event' } -> `Status' cToEnum #} 69 | 70 | 71 | -- | 72 | -- Determine the elapsed time (in milliseconds) between two events 73 | -- 74 | {-# INLINEABLE elapsedTime #-} 75 | elapsedTime :: Event -> Event -> IO Float 76 | elapsedTime !ev1 !ev2 = resultIfOk =<< cudaEventElapsedTime ev1 ev2 77 | 78 | {-# INLINE cudaEventElapsedTime #-} 79 | {# fun unsafe cudaEventElapsedTime 80 | { alloca- `Float' peekFloatConv* 81 | , useEvent `Event' 82 | , useEvent `Event' } -> `Status' cToEnum #} 83 | 84 | 85 | -- | 86 | -- Determines if a event has actually been recorded 87 | -- 88 | {-# INLINEABLE query #-} 89 | query :: Event -> IO Bool 90 | query !ev = 91 | cudaEventQuery ev >>= \rv -> 92 | case rv of 93 | Success -> return True 94 | NotReady -> return False 95 | _ -> throwIO (ExitCode rv) 96 | 97 | {-# INLINE cudaEventQuery #-} 98 | {# fun unsafe cudaEventQuery 99 | { useEvent `Event' } -> `Status' cToEnum #} 100 | 101 | 102 | -- | 103 | -- Record an event once all operations in the current context (or optionally 104 | -- specified stream) have completed. This operation is asynchronous. 105 | -- 106 | {-# INLINEABLE record #-} 107 | record :: Event -> Maybe Stream -> IO () 108 | record !ev !mst = 109 | nothingIfOk =<< cudaEventRecord ev (maybe defaultStream id mst) 110 | 111 | {-# INLINE cudaEventRecord #-} 112 | {# fun unsafe cudaEventRecord 113 | { useEvent `Event' 114 | , useStream `Stream' } -> `Status' cToEnum #} 115 | 116 | 117 | -- | 118 | -- Makes all future work submitted to the (optional) stream wait until the given 119 | -- event reports completion before beginning execution. 
Synchronisation is 120 | -- performed on the device, including when the event and stream are from 121 | -- different device contexts. Requires cuda-3.2. 122 | -- 123 | {-# INLINEABLE wait #-} 124 | wait :: Event -> Maybe Stream -> [WaitFlag] -> IO () 125 | #if CUDART_VERSION < 3020 126 | wait _ _ _ = requireSDK 'wait 3.2 127 | #else 128 | wait !ev !mst !flags = 129 | let st = fromMaybe defaultStream mst 130 | in nothingIfOk =<< cudaStreamWaitEvent st ev flags 131 | 132 | {-# INLINE cudaStreamWaitEvent #-} 133 | {# fun unsafe cudaStreamWaitEvent 134 | { useStream `Stream' 135 | , useEvent `Event' 136 | , combineBitMasks `[WaitFlag]' } -> `Status' cToEnum #} 137 | #endif 138 | 139 | -- | 140 | -- Wait until the event has been recorded 141 | -- 142 | {-# INLINEABLE block #-} 143 | block :: Event -> IO () 144 | block !ev = nothingIfOk =<< cudaEventSynchronize ev 145 | 146 | {-# INLINE cudaEventSynchronize #-} 147 | {# fun cudaEventSynchronize 148 | { useEvent `Event' } -> `Status' cToEnum #} 149 | 150 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Runtime/Stream.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE ForeignFunctionInterface #-} 3 | -------------------------------------------------------------------------------- 4 | -- | 5 | -- Module : Foreign.CUDA.Runtime.Stream 6 | -- Copyright : [2009..2023] Trevor L. 
McDonell 7 | -- License : BSD 8 | -- 9 | -- Stream management routines 10 | -- 11 | -------------------------------------------------------------------------------- 12 | 13 | module Foreign.CUDA.Runtime.Stream ( 14 | 15 | -- * Stream Management 16 | Stream(..), 17 | create, destroy, finished, block, 18 | 19 | defaultStream, defaultStreamLegacy, defaultStreamPerThread, 20 | 21 | ) where 22 | 23 | #include "cbits/stubs.h" 24 | {# context lib="cudart" #} 25 | 26 | -- Friends 27 | import Foreign.CUDA.Driver.Stream ( Stream(..), defaultStream, defaultStreamLegacy, defaultStreamPerThread ) 28 | import Foreign.CUDA.Internal.C2HS 29 | import Foreign.CUDA.Runtime.Error 30 | 31 | -- System 32 | import Foreign 33 | import Foreign.C 34 | import Control.Monad ( liftM ) 35 | import Control.Exception ( throwIO ) 36 | 37 | 38 | -------------------------------------------------------------------------------- 39 | -- Functions 40 | -------------------------------------------------------------------------------- 41 | 42 | -- | 43 | -- Create a new asynchronous stream 44 | -- 45 | {-# INLINEABLE create #-} 46 | create :: IO Stream 47 | create = resultIfOk =<< cudaStreamCreate 48 | 49 | {-# INLINE cudaStreamCreate #-} 50 | {# fun unsafe cudaStreamCreate 51 | { alloca- `Stream' peekStream* } -> `Status' cToEnum #} 52 | 53 | 54 | -- | 55 | -- Destroy and clean up an asynchronous stream 56 | -- 57 | {-# INLINEABLE destroy #-} 58 | destroy :: Stream -> IO () 59 | destroy !s = nothingIfOk =<< cudaStreamDestroy s 60 | 61 | {-# INLINE cudaStreamDestroy #-} 62 | {# fun unsafe cudaStreamDestroy 63 | { useStream `Stream' } -> `Status' cToEnum #} 64 | 65 | 66 | -- | 67 | -- Determine if all operations in a stream have completed 68 | -- 69 | {-# INLINEABLE finished #-} 70 | finished :: Stream -> IO Bool 71 | finished !s = 72 | cudaStreamQuery s >>= \rv -> do 73 | case rv of 74 | Success -> return True 75 | NotReady -> return False 76 | _ -> throwIO (ExitCode rv) 77 | 78 | {-# INLINE 
cudaStreamQuery #-} 79 | {# fun unsafe cudaStreamQuery 80 | { useStream `Stream' } -> `Status' cToEnum #} 81 | 82 | 83 | -- | 84 | -- Block until all operations in a Stream have been completed 85 | -- 86 | {-# INLINEABLE block #-} 87 | block :: Stream -> IO () 88 | block !s = nothingIfOk =<< cudaStreamSynchronize s 89 | 90 | {-# INLINE cudaStreamSynchronize #-} 91 | {# fun cudaStreamSynchronize 92 | { useStream `Stream' } -> `Status' cToEnum #} 93 | 94 | 95 | -- | 96 | -- The main execution stream (0) 97 | -- 98 | -- {-# INLINE defaultStream #-} 99 | -- defaultStream :: Stream 100 | -- #if CUDART_VERSION < 3010 101 | -- defaultStream = Stream 0 102 | -- #else 103 | -- defaultStream = Stream nullPtr 104 | -- #endif 105 | 106 | -------------------------------------------------------------------------------- 107 | -- Internal 108 | -------------------------------------------------------------------------------- 109 | 110 | {-# INLINE peekStream #-} 111 | peekStream :: Ptr {#type cudaStream_t#} -> IO Stream 112 | #if CUDART_VERSION < 3010 113 | peekStream = liftM Stream . peekIntConv 114 | #else 115 | peekStream = liftM Stream . peek 116 | #endif 117 | 118 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Runtime/Utils.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | -------------------------------------------------------------------------------- 3 | -- | 4 | -- Module : Foreign.CUDA.Runtime.Utils 5 | -- Copyright : [2009..2023] Trevor L. 
McDonell 6 | -- License   : BSD 7 | -- 8 | -- Utility functions 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module Foreign.CUDA.Runtime.Utils ( 13 | 14 |   runtimeVersion, 15 |   driverVersion, 16 |   libraryVersion, 17 | 18 | ) where 19 | 20 | #include "cbits/stubs.h" 21 | {# context lib="cudart" #} 22 | 23 | -- Friends 24 | import Foreign.CUDA.Runtime.Error 25 | import Foreign.CUDA.Internal.C2HS 26 | 27 | -- System 28 | import Foreign 29 | import Foreign.C 30 | 31 | 32 | -- | 33 | -- Return the version number of the installed CUDA runtime 34 | -- 35 | {-# INLINEABLE runtimeVersion #-} 36 | runtimeVersion :: IO Int 37 | runtimeVersion = resultIfOk =<< cudaRuntimeGetVersion 38 | 39 | {-# INLINE cudaRuntimeGetVersion #-} 40 | {# fun unsafe cudaRuntimeGetVersion 41 |   { alloca- `Int' peekIntConv* } -> `Status' cToEnum #} 42 | 43 | 44 | -- | 45 | -- Return the version number of the installed CUDA driver 46 | -- 47 | {-# INLINEABLE driverVersion #-} 48 | driverVersion :: IO Int 49 | driverVersion = resultIfOk =<< cudaDriverGetVersion 50 | 51 | {-# INLINE cudaDriverGetVersion #-} 52 | {# fun unsafe cudaDriverGetVersion 53 |   { alloca- `Int' peekIntConv* } -> `Status' cToEnum #} 54 | 55 | 56 | -- | 57 | -- Return the version number of the CUDA library (API) that this package was 58 | -- compiled against. 59 | -- 60 | {-# INLINEABLE libraryVersion #-} 61 | libraryVersion :: Int 62 | libraryVersion = {#const CUDART_VERSION #} 63 | 64 | -------------------------------------------------------------------------------- /src/Text/Show/Describe.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- | 3 | -- Module    : Text.Show.Describe 4 | -- Copyright : [2016..2023] Trevor L. 
McDonell 5 | -- License : BSD 6 | -- 7 | -------------------------------------------------------------------------------- 8 | 9 | module Text.Show.Describe 10 | where 11 | 12 | 13 | -- | Like 'Text.Show.Show', but focuses on providing a more detailed description 14 | -- of the value rather than a 'Text.Read.read'able representation. 15 | -- 16 | class Describe a where 17 | describe :: a -> String 18 | 19 | -------------------------------------------------------------------------------- /stack-7.10.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | # vim: nospell 3 | 4 | resolver: lts-6.35 5 | 6 | packages: 7 | - . 8 | 9 | extra-deps: 10 | - Cabal-1.24.2.0 11 | 12 | # Override default flag values for local packages and extra-deps 13 | # flags: {} 14 | 15 | # Extra global and per-package GHC options 16 | # ghc-options: {} 17 | 18 | # Control whether we use the GHC we find on the path 19 | # system-ghc: true 20 | 21 | # Require a specific version of stack, using version ranges 22 | # require-stack-version: -any # Default 23 | # require-stack-version: >= 0.1.4.0 24 | 25 | # Override the architecture used by stack, especially useful on Windows 26 | # arch: i386 27 | # arch: x86_64 28 | 29 | # Extra directories used by stack for building 30 | # extra-include-dirs: [/path/to/dir] 31 | # extra-lib-dirs: [/path/to/dir] 32 | 33 | -------------------------------------------------------------------------------- /stack-7.8.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | # vim: nospell 3 | 4 | resolver: lts-2.22 5 | 6 | packages: 7 | - . 
8 | 9 | extra-deps: 10 | - Cabal-1.24.2.0 11 | 12 | # Override default flag values for local packages and extra-deps 13 | # flags: {} 14 | 15 | # Extra global and per-package GHC options 16 | # ghc-options: {} 17 | 18 | # Control whether we use the GHC we find on the path 19 | # system-ghc: true 20 | 21 | # Require a specific version of stack, using version ranges 22 | # require-stack-version: -any # Default 23 | # require-stack-version: >= 0.1.4.0 24 | 25 | # Override the architecture used by stack, especially useful on Windows 26 | # arch: i386 27 | # arch: x86_64 28 | 29 | # Extra directories used by stack for building 30 | # extra-include-dirs: [/path/to/dir] 31 | # extra-lib-dirs: [/path/to/dir] 32 | 33 | -------------------------------------------------------------------------------- /stack-8.0.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | # vim: nospell 3 | 4 | resolver: lts-9.21 5 | 6 | packages: 7 | - . 
8 | 9 | # extra-deps: [] 10 | 11 | # Override default flag values for local packages and extra-deps 12 | # flags: {} 13 | 14 | # Extra global and per-package GHC options 15 | # ghc-options: {} 16 | 17 | # Control whether we use the GHC we find on the path 18 | # system-ghc: true 19 | 20 | # Require a specific version of stack, using version ranges 21 | # require-stack-version: -any # Default 22 | # require-stack-version: >= 0.1.4.0 23 | 24 | # Override the architecture used by stack, especially useful on Windows 25 | # arch: i386 26 | # arch: x86_64 27 | 28 | # Extra directories used by stack for building 29 | # extra-include-dirs: [/path/to/dir] 30 | # extra-lib-dirs: [/path/to/dir] 31 | 32 | -------------------------------------------------------------------------------- /stack-8.10.yaml: -------------------------------------------------------------------------------- 1 | # For advanced use and comprehensive documentation of the format, please see: 2 | # https://docs.haskellstack.org/en/stable/yaml_configuration/ 3 | 4 | resolver: lts-18.28 5 | 6 | packages: 7 | - . 
8 | 9 | # extra-deps: [] 10 | 11 | # Override default flag values for local packages and extra-deps 12 | # flags: {} 13 | 14 | # Extra package databases containing global packages 15 | # extra-package-dbs: [] 16 | 17 | # Control whether we use the GHC we find on the path 18 | # system-ghc: true 19 | # 20 | # Require a specific version of stack, using version ranges 21 | # require-stack-version: -any # Default 22 | # require-stack-version: ">=2.1" 23 | # 24 | # Override the architecture used by stack, especially useful on Windows 25 | # arch: i386 26 | # arch: x86_64 27 | # 28 | # Extra directories used by stack for building 29 | # extra-include-dirs: [/path/to/dir] 30 | # extra-lib-dirs: [/path/to/dir] 31 | # 32 | # Allow a newer minor version of GHC than the snapshot specifies 33 | # compiler-check: newer-minor 34 | 35 | # vim: nospell 36 | -------------------------------------------------------------------------------- /stack-8.2.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | # vim: nospell 3 | 4 | resolver: lts-11.22 5 | 6 | packages: 7 | - . 
8 | 9 | # extra-deps: [] 10 | 11 | # Override default flag values for local packages and extra-deps 12 | # flags: {} 13 | 14 | # Extra global and per-package GHC options 15 | # ghc-options: {} 16 | 17 | # Control whether we use the GHC we find on the path 18 | # system-ghc: true 19 | 20 | # Require a specific version of stack, using version ranges 21 | # require-stack-version: -any # Default 22 | # require-stack-version: >= 0.1.4.0 23 | 24 | # Override the architecture used by stack, especially useful on Windows 25 | # arch: i386 26 | # arch: x86_64 27 | 28 | # Extra directories used by stack for building 29 | # extra-include-dirs: [/path/to/dir] 30 | # extra-lib-dirs: [/path/to/dir] 31 | 32 | -------------------------------------------------------------------------------- /stack-8.4.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | # vim: nospell 3 | 4 | resolver: lts-12.26 5 | 6 | packages: 7 | - . 
8 | 9 | # extra-deps: [] 10 | 11 | # Override default flag values for local packages and extra-deps 12 | # flags: {} 13 | 14 | # Extra global and per-package GHC options 15 | # ghc-options: {} 16 | 17 | # Extra package databases containing global packages 18 | # extra-package-dbs: [] 19 | 20 | # Control whether we use the GHC we find on the path 21 | # system-ghc: true 22 | 23 | # Require a specific version of stack, using version ranges 24 | # require-stack-version: -any # Default 25 | # require-stack-version: >= 0.1.4.0 26 | 27 | # Override the architecture used by stack, especially useful on Windows 28 | # arch: i386 29 | # arch: x86_64 30 | 31 | # Extra directories used by stack for building 32 | # extra-include-dirs: [/path/to/dir] 33 | # extra-lib-dirs: [/path/to/dir] 34 | 35 | -------------------------------------------------------------------------------- /stack-8.6.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | # vim: nospell 3 | 4 | resolver: lts-14.27 5 | 6 | packages: 7 | - . 
8 | 9 | # extra-deps: [] 10 | 11 | # Override default flag values for local packages and extra-deps 12 | # flags: {} 13 | 14 | # Extra global and per-package GHC options 15 | # ghc-options: {} 16 | 17 | # Extra package databases containing global packages 18 | # extra-package-dbs: [] 19 | 20 | # Control whether we use the GHC we find on the path 21 | # system-ghc: true 22 | 23 | # Require a specific version of stack, using version ranges 24 | # require-stack-version: -any # Default 25 | # require-stack-version: >= 0.1.4.0 26 | 27 | # Override the architecture used by stack, especially useful on Windows 28 | # arch: i386 29 | # arch: x86_64 30 | 31 | # Extra directories used by stack for building 32 | # extra-include-dirs: [/path/to/dir] 33 | # extra-lib-dirs: [/path/to/dir] 34 | 35 | -------------------------------------------------------------------------------- /stack-8.8.yaml: -------------------------------------------------------------------------------- 1 | # For advanced use and comprehensive documentation of the format, please see: 2 | # https://docs.haskellstack.org/en/stable/yaml_configuration/ 3 | 4 | resolver: lts-16.31 5 | 6 | packages: 7 | - . 
8 | 9 | # extra-deps: [] 10 | 11 | # Override default flag values for local packages and extra-deps 12 | # flags: {} 13 | 14 | # Extra package databases containing global packages 15 | # extra-package-dbs: [] 16 | 17 | # Control whether we use the GHC we find on the path 18 | # system-ghc: true 19 | # 20 | # Require a specific version of stack, using version ranges 21 | # require-stack-version: -any # Default 22 | # require-stack-version: ">=2.1" 23 | # 24 | # Override the architecture used by stack, especially useful on Windows 25 | # arch: i386 26 | # arch: x86_64 27 | # 28 | # Extra directories used by stack for building 29 | # extra-include-dirs: [/path/to/dir] 30 | # extra-lib-dirs: [/path/to/dir] 31 | # 32 | # Allow a newer minor version of GHC than the snapshot specifies 33 | # compiler-check: newer-minor 34 | 35 | # vim: nospell 36 | -------------------------------------------------------------------------------- /stack-9.0.yaml: -------------------------------------------------------------------------------- 1 | # Some commonly used options have been documented as comments in this file. 2 | # For advanced use and comprehensive documentation of the format, please see: 3 | # https://docs.haskellstack.org/en/stable/yaml_configuration/ 4 | 5 | resolver: lts-19.33 6 | 7 | packages: 8 | - . 
9 | 10 | # extra-deps: [] 11 | 12 | # Override default flag values for local packages and extra-deps 13 | # flags: {} 14 | 15 | # Extra package databases containing global packages 16 | # extra-package-dbs: [] 17 | 18 | # Control whether we use the GHC we find on the path 19 | # system-ghc: true 20 | # 21 | # Require a specific version of stack, using version ranges 22 | # require-stack-version: -any # Default 23 | # require-stack-version: ">=1.9" 24 | # 25 | # Override the architecture used by stack, especially useful on Windows 26 | # arch: i386 27 | # arch: x86_64 28 | # 29 | # Extra directories used by stack for building 30 | # extra-include-dirs: [/path/to/dir] 31 | # extra-lib-dirs: [/path/to/dir] 32 | # 33 | # Allow a newer minor version of GHC than the snapshot specifies 34 | # compiler-check: newer-minor 35 | # 36 | # vim: nospell 37 | --------------------------------------------------------------------------------