├── .appveyor.yml ├── .github └── workflows │ └── ci-linux.yml ├── .gitignore ├── .gitmodules ├── .mailmap ├── .travis.yml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── Setup.hs ├── WINDOWS.md ├── cbits ├── init.c ├── stubs.c └── stubs.h ├── cuda.cabal ├── examples ├── Makefile ├── common │ ├── common.mk │ ├── include │ │ ├── cudpp │ │ │ ├── LICENSE │ │ │ ├── cudpp_globals.h │ │ │ ├── shared_mem.h │ │ │ └── type_vector.h │ │ ├── operator.h │ │ └── utils.h │ └── src │ │ ├── C2HS.hs │ │ ├── PrettyPrint.hs │ │ ├── RandomVector.hs │ │ ├── Time.hs │ │ ├── control.cpp │ │ └── cudpp │ │ ├── LICENSE │ │ ├── scan_cta.cu │ │ ├── scan_kernel.cu │ │ ├── segmented_scan_cta.cu │ │ ├── segmented_scan_kernel.cu │ │ └── vector_kernel.cu └── src │ ├── bandwidthTest │ ├── BandwidthTest.hs │ ├── Makefile │ └── results │ │ ├── Bandwidth.numbers │ │ ├── GT120.pdf │ │ └── Tesla.pdf │ ├── deviceQueryDrv │ └── DeviceQuery.hs │ ├── enum │ ├── Makefile │ ├── enum.cu │ └── enum.h │ ├── fold │ ├── Fold.chs │ ├── Makefile │ ├── fold.cu │ └── fold.h │ ├── foldSeg │ ├── foldSeg.cu │ └── foldSeg.h │ ├── map2 │ ├── Makefile │ ├── Map.hs │ └── map.cu │ ├── matrixMul │ ├── LICENSE │ ├── Makefile │ ├── MatrixMul.hs │ ├── matrix_mul.cu │ └── matrix_mul.h │ ├── matrixMulDrv │ ├── LICENSE │ ├── Makefile │ ├── MatrixMul.hs │ ├── matrix_mul.cu │ └── matrix_mul.h │ ├── multiGPU │ ├── Makefile │ ├── multiGPU.hs │ ├── simpleMultiGPU.h │ └── simpleMultiGPU_kernel.cu │ ├── permute │ └── permute.cu │ ├── replicate │ ├── Makefile │ ├── replicate.cu │ └── replicate.h │ ├── scan │ ├── Makefile │ ├── Scan.chs │ ├── scan.cu │ └── scan.h │ ├── simpleTexture │ ├── Makefile │ ├── SimpleTexture.hs │ ├── data │ │ ├── lena_bw.pgm │ │ └── ref_rotated.pgm │ └── simpleTexture.cu │ ├── smvm │ ├── Makefile │ ├── SMVM.chs │ ├── smvm-csr.cu │ ├── smvm-cudpp.cu │ ├── smvm.h │ └── texture.h │ ├── sort │ ├── Makefile │ ├── Sort.chs │ ├── radix_sort.cu │ └── sort.h │ ├── vectorAddDrv │ ├── Makefile │ ├── VectorAdd.hs │ └── 
vector_add.cu │ └── zipWith │ └── zipWith.cu ├── src ├── Foreign │ ├── C │ │ └── Extra.hs │ ├── CUDA.hs │ └── CUDA │ │ ├── Analysis.hs │ │ ├── Analysis │ │ ├── Device.chs │ │ └── Occupancy.hs │ │ ├── Driver.hs │ │ ├── Driver │ │ ├── Context.hs │ │ ├── Context │ │ │ ├── Base.chs │ │ │ ├── Config.chs │ │ │ ├── Peer.chs │ │ │ └── Primary.chs │ │ ├── Device.chs │ │ ├── Error.chs │ │ ├── Event.chs │ │ ├── Exec.chs │ │ ├── Graph │ │ │ ├── Base.chs │ │ │ ├── Build.chs │ │ │ ├── Capture.chs │ │ │ └── Exec.chs │ │ ├── IPC │ │ │ ├── Event.chs │ │ │ └── Marshal.chs │ │ ├── Marshal.chs │ │ ├── Module.hs │ │ ├── Module │ │ │ ├── Base.chs │ │ │ ├── Link.chs │ │ │ └── Query.chs │ │ ├── Profiler.chs │ │ ├── Stream.chs │ │ ├── Texture.chs │ │ ├── Unified.chs │ │ └── Utils.chs │ │ ├── Internal │ │ └── C2HS.hs │ │ ├── Path.chs │ │ ├── Ptr.hs │ │ ├── Runtime.hs │ │ └── Runtime │ │ ├── Device.chs │ │ ├── Error.chs │ │ ├── Event.chs │ │ ├── Exec.chs │ │ ├── Marshal.chs │ │ ├── Stream.chs │ │ ├── Texture.chs │ │ └── Utils.chs └── Text │ └── Show │ └── Describe.hs ├── stack-7.10.yaml ├── stack-7.8.yaml ├── stack-8.0.yaml ├── stack-8.10.yaml ├── stack-8.2.yaml ├── stack-8.4.yaml ├── stack-8.6.yaml ├── stack-8.8.yaml └── stack-9.0.yaml /.appveyor.yml: -------------------------------------------------------------------------------- 1 | # vim: nospell 2 | 3 | clone_folder: "c:\\cuda" 4 | 5 | skip_commits: 6 | message: /\[ci skip\]/ 7 | 8 | environment: 9 | global: 10 | STACK_ROOT: "c:\\sr" 11 | matrix: 12 | - GHC: "8.10" 13 | - GHC: "8.8" 14 | - GHC: "8.6" 15 | - GHC: "8.4" 16 | - GHC: "8.2" 17 | - GHC: "8.0" 18 | # - GHC: "7.10" 19 | # - GHC: "7.8" # failed to install ghc: https://ci.appveyor.com/project/tmcdonell/cuda/build/1.0.4/job/ufhtj0klyq73psas#L149 20 | 21 | before_build: 22 | # http://help.appveyor.com/discussions/problems/6312-curl-command-not-found 23 | - set PATH=C:\Program Files\Git\mingw64\bin;%PATH% 24 | - set PATH=C:\Users\appveyor\AppData\Roaming\local\bin;%PATH% 25 | 26 | # 
install CUDA-9.0 27 | - appveyor DownloadFile "https://developer.nvidia.com/compute/cuda/9.0/Prod/network_installers/cuda_9.0.176_windows_network-exe" -FileName install_cuda.exe 28 | - install_cuda.exe -s compiler_9.0 cudart_9.0 cublas_9.0 cublas_dev_9.0 cufft_9.0 cufft_dev_9.0 cusolver_9.0 cusolver_dev_9.0 cusparse_9.0 cusparse_dev_9.0 29 | - set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v9.0\nvvm\bin;%PATH% 30 | - set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v9.0\bin;%PATH% 31 | - nvcc --version 32 | 33 | # CUDA refuses to install the driver if no compatible GPU can be found, so 34 | # copy these .dll files manually 35 | - appveyor DownloadFile "https://drive.google.com/uc?export=download&id=14x0RX8QlHQ6vKhimbR4FDRgfP7EoHfgc" -FileName nvdriver-9.0.176.7z 36 | - 7z x nvdriver-9.0.176.7z -oC:\Windows\System32 37 | 38 | # install stack 39 | - appveyor DownloadFile "https://www.stackage.org/stack/windows-x86_64" -FileName stack.zip 40 | - 7z x stack.zip stack.exe 41 | - stack --version 42 | 43 | - ln -s stack-%GHC%.yaml stack.yaml 44 | - stack setup --no-terminal > NUL 45 | - stack build --no-terminal --no-copy-bins --fast --jobs=1 --only-dependencies --test --no-run-tests --haddock --no-haddock-deps 46 | 47 | build_script: 48 | - stack build --no-terminal --fast --test --no-run-tests 49 | 50 | test_script: 51 | - stack test 52 | 53 | -------------------------------------------------------------------------------- /.github/workflows/ci-linux.yml: -------------------------------------------------------------------------------- 1 | name: ci-linux 2 | 3 | # Trigger the workflow on push or pull request 4 | on: 5 | pull_request: 6 | # branches: [master] 7 | push: 8 | paths: 9 | - '.github/workflows/ci-linux.yml' 10 | - 'stack*.yaml' 11 | - '*.cabal' 12 | - '*/src/**' 13 | - '*/cbits/**' 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | matrix: 20 | ghc: 21 | - "8.10" 22 | - "8.8" 23 | - "8.6" 24 | - "8.4" 25 | - "8.2" 26 
| - "8.0" 27 | - "7.8" 28 | cuda: 29 | - "10.2" 30 | - "10.1" 31 | - "10.0" 32 | - "9.2" 33 | - "9.1" 34 | - "9.0" 35 | 36 | # include: 37 | # - os: windows-latest 38 | # ghc: "8.10" 39 | # cuda: "10.2.89.20191206" 40 | 41 | env: 42 | STACK_FLAGS: "--fast" 43 | HADDOCK_FLAGS: "--haddock --no-haddock-deps --no-haddock-hyperlink-source --haddock-arguments=\"--no-print-missing-docs\"" 44 | 45 | steps: 46 | - uses: actions/checkout@v2 47 | 48 | - run: ln -s stack-${{ matrix.ghc }}.yaml stack.yaml 49 | 50 | - uses: actions/cache@v2 51 | with: 52 | path: snapshot.pkgdb 53 | key: ${{ runner.os }}-${{ matrix.ghc }}-${{ matrix.cuda }}-snapshot.pkgdb 54 | 55 | - uses: actions/cache@v2 56 | with: 57 | path: | 58 | ~/.local/bin 59 | ~/.stack/programs 60 | ~/.stack/snapshots 61 | .stack-work 62 | key: ${{ runner.os }}-${{ matrix.ghc }}-${{ matrix.cuda }}-${{ hashFiles('stack.yaml') }}-${{ hashFiles('snapshot.pkgdb') }} 63 | restore-keys: | 64 | ${{ runner.os }}-${{ matrix.ghc }}-${{ matrix.cuda }}-${{ hashFiles('stack.yaml') }}-${{ hashFiles('snapshot.pkgdb') }} 65 | ${{ runner.os }}-${{ matrix.ghc }}-${{ matrix.cuda }}-${{ hashFiles('stack.yaml') }}- 66 | ${{ runner.os }}-${{ matrix.ghc }}-${{ matrix.cuda }}- 67 | 68 | - name: Install stack 69 | run: | 70 | mkdir -p ~/.local/bin 71 | if [[ ! 
-x ~/.local/bin/stack ]]; then 72 | curl -sL https://get.haskellstack.org/stable/linux-x86_64.tar.gz | tar xz --wildcards --strip-components=1 -C ~/.local/bin '*/stack' 73 | chmod a+x ~/.local/bin/stack 74 | fi 75 | echo "~/.local/bin" >> $GITHUB_PATH 76 | 77 | - name: Install GHC 78 | run: stack setup --install-ghc 79 | 80 | - name: Install CUDA 81 | run: | 82 | MATRIX_CUDA=${{ matrix.cuda }} 83 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin 84 | sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600 85 | sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub 86 | sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" 87 | sudo apt-get update 88 | sudo apt-get -y install cuda-${MATRIX_CUDA/./-} 89 | echo "CUDA_HOME=/usr/local/cuda-${MATRIX_CUDA}" >> $GITHUB_ENV 90 | echo "LD_LIBRARY_PATH=/usr/local/cuda-${MATRIX_CUDA}/lib64:$(stack exec ghc -- --print-libdir)/rts:/usr/local/cuda-${MATRIX_CUDA}/nvvm/lib64:${LD_LIBRARY_PATH}" >> $GITHUB_ENV 91 | echo "/usr/local/cuda-${MATRIX_CUDA}/bin" >> $GITHUB_PATH 92 | 93 | - name: Build dependencies 94 | run: stack build $STACK_FLAGS --only-dependencies 95 | 96 | - name: Build 97 | run: stack build $STACK_FLAGS $HADDOCK_FLAGS 98 | 99 | - name: Save snapshot.pkgdb 100 | run: stack exec ghc-pkg -- --package-db=$(stack path --snapshot-pkg-db) list > snapshot.pkgdb 101 | 102 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /cuda.buildinfo 2 | /cuda.buildinfo.generated 3 | *.i 4 | dist/ 5 | /stack.yaml 6 | /.stack-work 7 | /stack.yaml.lock 8 | dist-newstyle 9 | -------------------------------------------------------------------------------- /.gitmodules: 
-------------------------------------------------------------------------------- 1 | [submodule ".travis"] 2 | path = .travis 3 | url = https://github.com/tmcdonell/travis-scripts.git 4 | -------------------------------------------------------------------------------- /.mailmap: -------------------------------------------------------------------------------- 1 | Trevor L. McDonell 2 | Trevor L. McDonell 3 | Sean Lee 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # vim: nospell 2 | # 3 | language: c 4 | dist: trusty 5 | 6 | cache: 7 | timeout: 600 8 | directories: 9 | - $HOME/.stack 10 | - $HOME/.local/bin 11 | - $TRAVIS_BUILD_DIR/.stack-work 12 | 13 | addons: 14 | apt: 15 | sources: 16 | - hvr-ghc 17 | packages: 18 | - alex-3.1.7 19 | - happy-1.19.5 20 | 21 | compiler: 22 | - GHC-8.10.1 23 | - GHC-8.8.3 24 | - GHC-8.6.5 25 | - GHC-8.4.4 26 | - GHC-8.2.2 27 | - GHC-8.0.2 28 | # - GHC-7.10.3 29 | # - GHC-7.8.4 30 | 31 | env: 32 | # global: 33 | matrix: 34 | - CUDA=10.1.105-1 35 | - CUDA=10.0.130-1 36 | - CUDA=8.0.61-1 37 | - CUDA=7.5-18 38 | - CUDA=7.0-28 39 | - CUDA=6.5-14 40 | 41 | matrix: 42 | fast_finish: true 43 | # allow_failures: 44 | 45 | before_install: 46 | - export GHC=${CC:4} 47 | - unset CC 48 | - export PATH=/opt/alex/3.1.7/bin:/opt/happy/1.19.5/bin:$PATH 49 | - export CUDA_INSTALL_EXTRA_LIBS=0 50 | - source .travis/install-cuda-trusty.sh 51 | - source .travis/install-stack.sh 52 | 53 | # # ghc-8.6 builds a broken version of c2hs 54 | # - | 55 | # if [ ${GHC%.*} == "8.6" -a ! 
-x "$(which c2hs)" ]; then 56 | # travis_retry stack --stack-yaml=stack-8.4.yaml setup --no-terminal --no-system-ghc 57 | # travis_retry stack --stack-yaml=stack-8.4.yaml install c2hs --fast --no-terminal 58 | # fi 59 | 60 | # build environment 61 | - echo "$(stack exec ghc -- --version) [$(stack exec ghc -- --print-project-git-commit-id 2> /dev/null || echo '?')]" 62 | - stack --version 63 | - nvcc --version 64 | 65 | install: 66 | - export FLAGS="--fast --jobs=2 --no-terminal --no-copy-bins --no-interleaved-output" 67 | - travis_retry stack build --only-dependencies 68 | 69 | script: 70 | - travis_retry stack build --haddock --no-haddock-deps 71 | 72 | after_success: 73 | - source .travis/update-accelerate-buildbot.sh 74 | 75 | after_failure: 76 | - dmesg 77 | - ls -R /usr/local/cuda* 78 | 79 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) [2009..2014] Trevor L. McDonell, University of New South Wales. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of the University of New South Wales nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 
14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY 16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Haskell FFI Bindings to CUDA 2 | ============================ 3 | 4 | [![CI-Linux](https://github.com/tmcdonell/cuda/actions/workflows/ci-linux.yml/badge.svg)](https://github.com/tmcdonell/cuda/actions/workflows/ci-linux.yml) 5 | [![CI-Windows](https://github.com/tmcdonell/cuda/actions/workflows/ci-windows.yml/badge.svg)](https://github.com/tmcdonell/cuda/actions/workflows/ci-windows.yml) 6 | [![Stackage LTS](https://stackage.org/package/cuda/badge/lts)](https://stackage.org/lts/package/cuda) 7 | [![Stackage Nightly](https://stackage.org/package/cuda/badge/nightly)](https://stackage.org/nightly/package/cuda) 8 | [![Hackage](https://img.shields.io/hackage/v/cuda.svg)](https://hackage.haskell.org/package/cuda) 9 | 10 | The CUDA library provides a direct, general purpose C-like SPMD programming 11 | model for NVIDIA graphics cards (G8x series onwards). This is a collection of 12 | bindings to allow you to call and control, although not write, such functions 13 | from Haskell-land. 
You will need to install the CUDA driver and developer 14 | toolkit. 15 | 16 | 17 | 18 | The configure step will look for your CUDA installation in the standard places, 19 | and if the `nvcc` compiler is found in your `PATH`, relative to that. 20 | 21 | For important information on installing on Windows, see: 22 | 23 | 24 | 25 | 26 | ## Missing functionality 27 | 28 | An incomplete list of missing bindings. Pull requests welcome! 29 | 30 | ### CUDA-9 31 | 32 | - cuLaunchCooperativeKernelMultiDevice 33 | 34 | ### CUDA-10.0 35 | 36 | - cuDeviceGetLuid (windows only?) 37 | - cuLaunchHostFunc 38 | - cuGraphHostNode[Get/Set]Params 39 | - cuGraphKernelNode[Get/Set]Params 40 | - cuGraphMemcpyNode[Get/Set]Params 41 | - cuGraphMemsetNode[Get/Set]Params 42 | 43 | ### CUDA-10.2 44 | 45 | - cuDeviceGetNvSciSyncAttributes 46 | - cuMemAddressFree 47 | - cuMemAddressReserve 48 | - cuMemCreate 49 | - cuMemExportToShareableHandle 50 | - cuMemGetAccess 51 | - cuMemGetAllocationGranularity 52 | - cuMemGetAllocationPrepertiesFromHandle 53 | - cuMemImportFromShareableHandle 54 | - cuMemMap 55 | - cuMemRelease 56 | - cuMemSetAccess 57 | - cuMemUnmap 58 | - cuGraphExecHostNodeSetParams 59 | - cuGraphExecMemcpyNodeSetParams 60 | - cuGraphExecMemsetNodeSetParams 61 | - cuGraphExecUpdate 62 | 63 | ### CUDA-11.0 64 | 65 | - cuCtxResetPersistentingL2Cache 66 | - cuMemRetainAllocationHandle 67 | - cuStreamCopyAttributes 68 | - cuStreamGetAttribute 69 | - cuStreamSetAttribute 70 | - cuGraphKernelNodeCopyAttributes 71 | - cuGraphKernelNodeGetAttribute 72 | - cuGraphKernelNodeSetAttribute 73 | - cuOccupancyAvailableDynamicSMemPerBlock 74 | 75 | ### CUDA-11.1 76 | 77 | - cuDeviceGetTexture1DLinearMaxWidth 78 | - cuArrayGetSparseProperties 79 | - cuMipmappedArrayGetSparseProperties 80 | - cuMemMapArrayAsync 81 | - cuEventRecordWithFlags 82 | - cuGraphAddEventRecordNode 83 | - cuGraphAddEventWaitNode 84 | - cuGraphEventRecordNodeGetEvent 85 | - cuGraphEventRecordNodeSetEvent 86 | - 
cuGraphEventWaitNodeGetEvent 87 | - cuGraphEventWaitNodeSetEvent 88 | - cuGraphExecChildGraphNodeSetParams 89 | - cuGraphExecEventRecordNodeSetEvent 90 | - cuGraphExecEventWaitNodeSetEvent 91 | - cuGraphUpload 92 | 93 | ### CUDA-11.2 94 | 95 | - cuDeviceGetDefaultMemPool 96 | - cuDeviceGetMemPool 97 | - cuDeviceSetMemPool 98 | - cuArrayGetPlane 99 | - cuMemAllocAsync 100 | - cuMemAllocFromPoolAsync 101 | - cuMemFreeAsync 102 | - cuMemPoolCreate 103 | - cuMemPoolDestroy 104 | - cuMemPoolExportPointer 105 | - cuMemPoolExportToShareableHandle 106 | - cuMemPoolGetAccess 107 | - cuMemPoolGetAttribute 108 | - cuMemPoolImportFromShareableHandle 109 | - cuMemPoolImportPointer 110 | - cuMemPoolSetAccess 111 | - cuMemPoolSetAttribute 112 | - cuMemPoolTrimTo 113 | - cuGraphAddExternalSemaphoresSignalNode 114 | - cuGraphAddExternalSemaphoresWaitNode 115 | - cuGraphExecExternalSemaphoresSignalNodeSetParams 116 | - cuGraphExecExternalSemaphoresWaitNodeSetParams 117 | - cuGraphExternalSemaphoresSignalNodeGetParams 118 | - cuGraphExternalSemaphoresSignalNodeSetParams 119 | - cuGraphExternalSemaphoresWaitNodeGetParams 120 | - cuGraphExternalSemaphoresWaitNodeSetParams 121 | 122 | ### CUDA-11.3 123 | 124 | - cuStreamGetCaptureInfo_v2 125 | - cuFuncGetModule 126 | - cuGraphDebugDotPrint 127 | - cuGraphReleaseUserObject 128 | - cuGraphRetainUserObject 129 | - cuUserObjectCreate 130 | - cuUserObjectRelease 131 | - cuUserObjectRetain 132 | - cuGetProcAddress 133 | 134 | ### CUDA-11.4 135 | 136 | - cuDeviceGetUuid_v2 137 | - cuCtxCreate_v3 138 | - cuCtxGetExecAffinity 139 | - cuDeviceGetGraphMemAttribute 140 | - cuDeviceGraphMemTrim 141 | - cuDeviceSetGraphMemAttribute 142 | - cuGraphAddMemAllocNode 143 | - cuGraphAddMemFreeNode 144 | - cuGraphInstantiateWithFlags 145 | - cuGraphMemAllocNodeGetParams 146 | - cuGraphMemFreeNodeGetParams 147 | 148 | -------------------------------------------------------------------------------- /WINDOWS.md: 
-------------------------------------------------------------------------------- 1 | Using the CUDA package on Windows 2 | ================================= 3 | 4 | The CUDA package works on Windows and is actively maintained. If you encounter 5 | any other issues, please report them. 6 | 7 | Note that if you build your applications for the Windows 64-bit architecture, 8 | you'll need to update your `ld.exe` as described below. 9 | 10 | 11 | Windows 64-bit 12 | -------------- 13 | 14 | There is a known issue with the version of `ld.exe` that ships with the 64-bit 15 | versions of (at least) GHC-7.8.4 and GHC-7.10.2. The version of `ld.exe` that 16 | ships with these GHC distributions does not properly link against MS-style 17 | dynamic libraries (such as those that ship with the CUDA toolkit), causing the 18 | application to crash at runtime once those library routines are called. The 19 | configure step will fail if it detects an old version of `ld.exe` (< 2.25.1), 20 | which are known to be broken. 21 | 22 | If you are using the 64-bit GHC distributions mentioned above, you will need to 23 | apply the following steps. This bug does not affect 32-bit GHC distributions. 24 | The bug has been fixed in MinGW binutils `ld.exe` >= 2.25.1, so it is expected 25 | that newer releases of GHC will not have this issue. 26 | 27 | The problem is fixed by replacing the linker binary `ld.exe` with the newer 28 | (patched) version, available as part of the MSys2 binutils package here: 29 | 30 | > 31 | 32 | The updated `ld.exe` binary must replace the version at the path: 33 | 34 | > `GHC_PATH\mingw\x86_64-w64-mingw32\bin\` 35 | 36 | Note that there is another copy of `ld.exe` located at `GHC_PATH\mingw\bin\`, 37 | but this version does not seem to be used, so replacing it as well is not 38 | necessary. It is not sufficient to replace whatever version of `ld.exe` appears 39 | first in your `PATH`. 
40 | 41 | Please note that having another MinGW installation in `PATH` before the one 42 | shipped with GHC may break things, particularly if you mix 32/64-bit 43 | distributions of MinGW and GHC. 44 | 45 | For further discussion of the bug, see: 46 | 47 | * [CUDA package issue][cuda31] 48 | * [GHC issue][ghc10885] 49 | * [binutils issue][binutils16598] 50 | 51 | [cuda31]: https://github.com/tmcdonell/cuda/issues/31 52 | [ghc10885]: https://ghc.haskell.org/trac/ghc/ticket/10885 53 | [binutils16598]: https://sourceware.org/bugzilla/show_bug.cgi?id=16598 54 | 55 | -------------------------------------------------------------------------------- /cbits/init.c: -------------------------------------------------------------------------------- 1 | #include "cbits/stubs.h" 2 | #include 3 | 4 | /* 5 | * Make sure that the linker always touches this module so that it notices the 6 | * below constructor function. Calling this empty function as part of 7 | * 'Foreign.CUDA.Driver.initialise' should be sufficient to prevent it from ever 8 | * being stripped. 9 | */ 10 | void enable_constructors() { } 11 | 12 | /* 13 | * GHC-8 introduced a new (simpler) 64-bit allocator, which on startup 'mmap's 14 | * 1TB of address space and then commits sub-portions of that memory as needed. 15 | * 16 | * The CUDA driver also appears to 'mmap' a large chunk of address space on 17 | * 'cuInit', probably as the arena for shuffling memory to and from the device, 18 | * but attempts to do so at a _fixed_ address. If the GHC RTS has already taken 19 | * that address at the time we call 'cuInit', driver initialisation will fail 20 | * with an "out of memory" error. 21 | * 22 | * The workaround is to call 'cuInit' before initialising the RTS. Then the 23 | * RTS's allocation will avoid CUDA's allocation, since the RTS doesn't care 24 | * where in the address space it gets that memory. 
Embedding the following 25 | * __attribute__((constructor)) function in the library does the trick nicely, 26 | * and the linker will ensure that this gets executed when the shared library is 27 | * loaded (during program startup). 28 | * 29 | * Another way around this, without actually calling 'cuInit', would be to just 30 | * reserve the regions that 'cuInit' requires in the constructor function so 31 | * that the RTS avoids them, then release them before calling 'cuInit'. However, 32 | * since the CUDA driver is closed and we don't know exactly which regions to 33 | * reserve, that approach would be fragile. 34 | * 35 | * See: https://github.com/tmcdonell/cuda/issues/39 36 | */ 37 | #ifdef CUDA_PRELOAD 38 | __attribute__((constructor)) void preinitialise_cuda() 39 | { 40 | CUresult status = cuInit (0); 41 | 42 | if ( status != CUDA_SUCCESS ) { 43 | #if CUDA_VERSION >= 6000 44 | const char* str = NULL; 45 | 46 | cuGetErrorString(status, &str); 47 | fprintf(stderr, "Failed to pre-initialise CUDA: %s\n", str); 48 | #else 49 | fprintf(stderr, "Failed to pre-initialise CUDA (%d)\n", status); 50 | #endif 51 | } 52 | } 53 | #endif 54 | 55 | -------------------------------------------------------------------------------- /examples/Makefile: -------------------------------------------------------------------------------- 1 | ifneq ($(emu),1) 2 | PROJECTS := $(shell find src -name Makefile) 3 | else 4 | PROJECTS := $(shell find src -name Makefile | xargs grep -L 'USEDRVAPI') 5 | endif 6 | 7 | 8 | %.do : 9 | $(MAKE) -C $(dir $*) $(MAKECMDGOALS) 10 | 11 | all : $(addsuffix .do,$(PROJECTS)) 12 | @echo "Finished building all" 13 | 14 | clean : $(addsuffix .do,$(PROJECTS)) 15 | @echo "Finished cleaning all" 16 | 17 | clobber : $(addsuffix .do,$(PROJECTS)) 18 | @echo "Finished cleaning all" 19 | -------------------------------------------------------------------------------- /examples/common/include/cudpp/LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2007-2009 The Regents of the University of California, Davis 2 | campus ("The Regents") and NVIDIA Corporation ("NVIDIA"). All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | * Neither the name of the The Regents, nor NVIDIA, nor the names of its 13 | contributors may be used to endorse or promote products derived from this 14 | software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 20 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 24 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 25 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | -------------------------------------------------------------------------------- /examples/common/include/cudpp/cudpp_globals.h: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------- 2 | // cuDPP -- CUDA Data Parallel Primitives library 3 | // ------------------------------------------------------------- 4 | // $Revision: 5632 $ 5 | // $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $ 6 | // ------------------------------------------------------------- 7 | // This source code is distributed under the terms of license.txt in 8 | // the root directory of this source distribution. 9 | // ------------------------------------------------------------- 10 | 11 | /** 12 | * @file 13 | * cudpp_globals.h 14 | * 15 | * @brief Global declarations defining machine characteristics of GPU target 16 | * These are currently set for best performance on G8X GPUs. The optimal 17 | * parameters may change on future GPUs. In the future, we hope to make 18 | * CUDPP a self-tuning library. 
19 | */ 20 | 21 | #ifndef __CUDPP_GLOBALS_H__ 22 | #define __CUDPP_GLOBALS_H__ 23 | 24 | const int NUM_BANKS = 16; /**< Number of shared memory banks */ 25 | const int LOG_NUM_BANKS = 4; /**< log_2(NUM_BANKS) */ 26 | const int CTA_SIZE = 128; /**< Number of threads in a CTA */ 27 | const int WARP_SIZE = 32; /**< Number of threads in a warp */ 28 | const int LOG_CTA_SIZE = 7; /**< log_2(CTA_SIZE) */ 29 | const int LOG_WARP_SIZE = 5; /**< log_2(WARP_SIZE) */ 30 | const int LOG_SIZEOF_FLOAT = 2; /**< log_2(sizeof(float)) */ 31 | const int SCAN_ELTS_PER_THREAD = 8; /**< Number of elements per scan thread */ 32 | const int SEGSCAN_ELTS_PER_THREAD = 8; /**< Number of elements per segmented scan thread */ 33 | 34 | const int maxSharedMemoryPerBlock = 16384; /**< Number of bytes of shared 35 | memory in each block */ 36 | const int maxThreadsPerBlock = CTA_SIZE; /**< Maximum number of 37 | * threads in a CTA */ 38 | 39 | #define AVOID_BANK_CONFLICTS /**< Set if by default, we want our 40 | * shared memory allocation to perform 41 | * additional computation to avoid bank 42 | * conflicts */ 43 | 44 | #ifdef AVOID_BANK_CONFLICTS 45 | #define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS) 46 | #else 47 | #define CONFLICT_FREE_OFFSET(index) (0) 48 | #endif 49 | 50 | #endif // __CUDPP_GLOBALS_H__ 51 | 52 | // Leave this at the end of the file 53 | // Local Variables: 54 | // mode:c++ 55 | // c-file-style: "NVIDIA" 56 | // End: 57 | -------------------------------------------------------------------------------- /examples/common/include/cudpp/shared_mem.h: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------- 2 | // cuDPP -- CUDA Data Parallel Primitives library 3 | // ------------------------------------------------------------- 4 | // $Revision: 5633 $ 5 | // $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $ 6 | // 
------------------------------------------------------------- 7 | // This source code is distributed under the terms of license.txt 8 | // in the root directory of this source distribution. 9 | // ------------------------------------------------------------- 10 | 11 | /** 12 | * @file 13 | * sharedmem.h 14 | * 15 | * @brief Shared memory declaration struct for templatized types. 16 | * 17 | * Because dynamically sized shared memory arrays are declared "extern" in CUDA, 18 | * we can't templatize their types directly. To get around this, we declare a 19 | * simple wrapper struct that will declare the extern array with a different 20 | * name depending on the type. This avoids linker errors about multiple 21 | * definitions. 22 | * 23 | * To use dynamically allocated shared memory in a templatized __global__ or 24 | * __device__ function, just replace code like this: 25 | * 26 | *
 27 |  *  template <class T>
 28 |  *  __global__ void
 29 |  *  foo( T* d_out, T* d_in)
 30 |  *  {
 31 |  *      // Shared mem size is determined by the host app at run time
 32 |  *      extern __shared__  T sdata[];
 33 |  *      ...
 34 |  *      doStuff(sdata);
 35 |  *      ...
 36 |  *  }
 37 |  * 
38 | * 39 | * With this 40 | *
 41 |  *  template <class T>
 42 |  *  __global__ void
 43 |  *  foo( T* d_out, T* d_in)
 44 |  *  {
 45 |  *      // Shared mem size is determined by the host app at run time
 46 |  *      SharedMemory<T> smem;
 47 |  *      T* sdata = smem.getPointer();
 48 |  *      ...
 49 |  *      doStuff(sdata);
 50 |  *      ...
 51 |  *  }
 52 |  * 
53 | */ 54 | 55 | #ifndef __SHARED_MEM_H__ 56 | #define __SHARED_MEM_H__ 57 | 58 | 59 | /** @brief Wrapper class for templatized dynamic shared memory arrays. 60 | * 61 | * This struct uses template specialization on the type \a T to declare 62 | * a differently named dynamic shared memory array for each type 63 | * (\code extern __shared__ T s_type[] \endcode). 64 | * 65 | * Currently there are specializations for the following types: 66 | * \c int, \c uint, \c char, \c uchar, \c short, \c ushort, \c long, 67 | * \c unsigned long, \c bool, \c float, and \c double. One can also specialize it 68 | * for user defined types. 69 | */ 70 | template 71 | struct SharedMemory 72 | { 73 | /** Return a pointer to the runtime-sized shared memory array. **/ 74 | __device__ T* getPointer() 75 | { 76 | extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types 77 | Error_UnsupportedType(); 78 | return (T*)0; 79 | } 80 | // TODO: Use operator overloading to make this class look like a regular array 81 | }; 82 | 83 | // Following are the specializations for the following types. 84 | // int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double 85 | // One could also specialize it for user-defined types. 
86 | 87 | #define SPEC_SHAREDMEM(T, name) \ 88 | template <> struct SharedMemory \ 89 | { \ 90 | __device__ T* getPointer() { extern __shared__ T s_##name[]; return s_##name; } \ 91 | } 92 | 93 | SPEC_SHAREDMEM(int, int); 94 | SPEC_SHAREDMEM(char, char); 95 | SPEC_SHAREDMEM(long, long); 96 | SPEC_SHAREDMEM(short, short); 97 | SPEC_SHAREDMEM(bool, bool); 98 | SPEC_SHAREDMEM(float, float); 99 | SPEC_SHAREDMEM(double, double); 100 | 101 | SPEC_SHAREDMEM(unsigned int, uint); 102 | SPEC_SHAREDMEM(unsigned char, uchar); 103 | SPEC_SHAREDMEM(unsigned long, ulong); 104 | SPEC_SHAREDMEM(unsigned short, ushort); 105 | 106 | SPEC_SHAREDMEM(uchar4, uchar4); 107 | 108 | #undef SPEC_SHAREDMEM 109 | #endif // __SHARED_MEM_H__ 110 | 111 | // Leave this at the end of the file 112 | // Local Variables: 113 | // mode:c++ 114 | // c-file-style: "NVIDIA" 115 | // End: 116 | -------------------------------------------------------------------------------- /examples/common/include/cudpp/type_vector.h: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------- 2 | // cuDPP -- CUDA Data Parallel Primitives library 3 | // ------------------------------------------------------------- 4 | // $Revision: 5632 $ 5 | // $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $ 6 | // ------------------------------------------------------------- 7 | // This source code is distributed under the terms of license.txt in 8 | // the root directory of this source distribution. 9 | // ------------------------------------------------------------- 10 | 11 | #ifndef __TYPE_VECTOR_H__ 12 | #define __TYPE_VECTOR_H__ 13 | 14 | /** @brief Utility template struct for generating small vector types from scalar types 15 | * 16 | * Given a base scalar type (\c int, \c float, etc.) and a vector length (1 through 4) as 17 | * template parameters, this struct defines a vector type (\c float3, \c int4, etc.) 
of the 18 | * specified length and base type. For example: 19 | * \code 20 | * template 21 | * __device__ void myKernel(T *data) 22 | * { 23 | * typeToVector::Result myVec4; // create a vec4 of type T 24 | * myVec4 = (typeToVector::Result*)data[0]; // load first element of data as a vec4 25 | * } 26 | * \endcode 27 | * 28 | * This functionality is implemented using template specialization. Currently specializations 29 | * for int, float, and unsigned int vectors of lengths 2-4 are defined. Note that this results 30 | * in types being generated at compile time -- there is no runtime cost. typeToVector is used by 31 | * the optimized scan \c __device__ functions in scan_cta.cu. 32 | */ 33 | template 34 | struct typeToVector 35 | { 36 | typedef T Result; 37 | }; 38 | 39 | #define TYPE_VECTOR(type, name) \ 40 | template <> struct typeToVector { typedef name##2 Result; }; \ 41 | template <> struct typeToVector { typedef name##3 Result; }; \ 42 | template <> struct typeToVector { typedef name##4 Result; } 43 | 44 | TYPE_VECTOR(int, int); 45 | TYPE_VECTOR(float, float); 46 | TYPE_VECTOR(unsigned int, uint); 47 | 48 | 49 | #undef TYPE_VECTOR 50 | #endif 51 | 52 | -------------------------------------------------------------------------------- /examples/common/include/operator.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Operator 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * A template class for unary/binary kernel operations 8 | * 9 | * ---------------------------------------------------------------------------*/ 10 | 11 | 12 | #ifndef __OPERATOR_H__ 13 | #define __OPERATOR_H__ 14 | 15 | #include 16 | #include 17 | 18 | 19 | /* 20 | * Template class for an operation that can be mapped over an array. 
21 | */ 22 | template 23 | class Functor 24 | { 25 | public: 26 | /* 27 | * Apply the operation to the given operand. 28 | */ 29 | static __device__ Tb apply(const Ta &x); 30 | }; 31 | 32 | template 33 | class fromIntegral : Functor 34 | { 35 | public: 36 | static __device__ Tb apply(const Ta &x) { return (Tb) x; } 37 | }; 38 | 39 | 40 | /* 41 | * Template class for binary operators. Certain algorithms may require the 42 | * operator to be associative (that is, Ta == Tb), such as parallel scan and 43 | * reduction. 44 | * 45 | * As this is template code, it should compile down to something efficient... 46 | */ 47 | template 48 | class BinaryOp 49 | { 50 | public: 51 | /* 52 | * Apply the operation to the given operands. 53 | */ 54 | static __device__ Tc apply(const Ta &a, const Tb &b); 55 | 56 | /* 57 | * Return an identity element for the type Tc. 58 | * 59 | * This may have special meaning for a given implementation, for example a 60 | * `max' operation over integers may want to return INT_MIN. 61 | */ 62 | static __device__ Tc identity(); 63 | }; 64 | 65 | /* 66 | * Return the minimum or maximum value of a type 67 | */ 68 | template inline __device__ T getMin(); 69 | template inline __device__ T getMax(); 70 | 71 | #define SPEC_MINMAX(type,vmin,vmax) \ 72 | template <> inline __device__ type getMin() { return vmin; }; \ 73 | template <> inline __device__ type getMax() { return vmax; } \ 74 | 75 | SPEC_MINMAX(float, -FLT_MAX, FLT_MAX); 76 | SPEC_MINMAX(int, INT_MIN, INT_MAX); 77 | SPEC_MINMAX(char, CHAR_MIN, CHAR_MAX); 78 | SPEC_MINMAX(unsigned int, 0, UINT_MAX); 79 | SPEC_MINMAX(unsigned char, 0, UCHAR_MAX); 80 | 81 | 82 | /* 83 | * Basic binary arithmetic operations. We take advantage of automatic type 84 | * promotion to keep the parameters general. 
85 | */ 86 | #define BASIC_OP(name,expr,id) \ 87 | template \ 88 | class name : BinaryOp \ 89 | { \ 90 | public: \ 91 | static __device__ Tc apply(const Ta &a, const Tb &b) { return expr; } \ 92 | static __device__ Tc identity() { return id; } \ 93 | } 94 | 95 | #define LOGICAL_OP(name,expr,id) \ 96 | template \ 97 | class name : BinaryOp \ 98 | { \ 99 | public: \ 100 | static __device__ bool apply(const Ta &a, const Tb &b) { return expr; }\ 101 | static __device__ bool identity() { return id; } \ 102 | } 103 | 104 | BASIC_OP(Plus, a + b, 0); 105 | BASIC_OP(Times, a * b, 1); 106 | BASIC_OP(Min, min(a,b), getMax()); 107 | BASIC_OP(Max, max(a,b), getMin()); 108 | 109 | LOGICAL_OP(Eq, a == b, false); 110 | 111 | #undef SPEC_MINMAX 112 | #undef BASIC_OP 113 | #endif 114 | 115 | -------------------------------------------------------------------------------- /examples/common/include/utils.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Utils 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | 10 | #ifndef __UTILS_H__ 11 | #define __UTILS_H__ 12 | 13 | #include 14 | #include 15 | 16 | 17 | /* 18 | * Core assert function. Don't let this escape... 19 | */ 20 | #if defined(__CUDACC__) || !defined(__DEVICE_EMULATION__) 21 | #define __assert(e, file, line) ((void)0) 22 | #else 23 | #define __assert(e, file, line) \ 24 | ((void) fprintf (stderr, "%s:%u: failed assertion `%s'\n", file, line, e), abort()) 25 | #endif 26 | 27 | /* 28 | * Test the given expression, and abort the program if it evaluates to false. 29 | * Only available in debug mode. 30 | */ 31 | #ifndef _DEBUG 32 | #define assert(e) ((void)0) 33 | #else 34 | #define assert(e) \ 35 | ((void) ((e) ? 
(void(0)) : __assert (#e, __FILE__, __LINE__))) 36 | #endif 37 | 38 | 39 | /* 40 | * Macro to insert __syncthreads() in device emulation mode 41 | */ 42 | #ifdef __DEVICE_EMULATION__ 43 | #define __EMUSYNC __syncthreads() 44 | #else 45 | #define __EMUSYNC 46 | #endif 47 | 48 | 49 | /* 50 | * Check the return status of CUDA API calls, and abort with an appropriate 51 | * error string on failure. 52 | */ 53 | #define CUDA_SAFE_CALL_NO_SYNC(call) \ 54 | do { \ 55 | cudaError err = call; \ 56 | if(cudaSuccess != err) { \ 57 | const char *str = cudaGetErrorString(err); \ 58 | __assert(str, __FILE__, __LINE__); \ 59 | } \ 60 | } while (0) 61 | 62 | #define CUDA_SAFE_CALL(call) \ 63 | do { \ 64 | CUDA_SAFE_CALL_NO_SYNC(call); \ 65 | CUDA_SAFE_CALL_NO_SYNC(cudaThreadSynchronize()); \ 66 | } while (0) 67 | 68 | 69 | #ifdef __cplusplus 70 | extern "C" { 71 | #endif 72 | 73 | /* 74 | * Determine if the input is a power of two 75 | */ 76 | inline bool 77 | isPow2(unsigned int x) 78 | { 79 | return ((x&(x-1)) == 0); 80 | } 81 | 82 | 83 | /* 84 | * Compute the next highest power of two 85 | */ 86 | inline unsigned int 87 | ceilPow2(unsigned int x) 88 | { 89 | #if 0 90 | --x; 91 | x |= x >> 1; 92 | x |= x >> 2; 93 | x |= x >> 4; 94 | x |= x >> 8; 95 | x |= x >> 16; 96 | return ++x; 97 | #endif 98 | 99 | return (isPow2(x)) ? x : 1u << (int) ceil(log2((double)x)); 100 | } 101 | 102 | 103 | /* 104 | * Compute the next lowest power of two 105 | */ 106 | inline unsigned int 107 | floorPow2(unsigned int x) 108 | { 109 | #if 0 110 | float nf = (float) n; 111 | return 1 << (((*(int*)&nf) >> 23) - 127); 112 | #endif 113 | 114 | int exp; 115 | frexp(x, &exp); 116 | return 1 << (exp - 1); 117 | } 118 | 119 | 120 | /* 121 | * computes next highest multiple of f from x 122 | */ 123 | inline unsigned int 124 | multiple(unsigned int x, unsigned int f) 125 | { 126 | return ((x + (f-1)) / f); 127 | } 128 | 129 | 130 | /* 131 | * MS Excel-style CEIL() function. 
Rounds x up to nearest multiple of f 132 | */ 133 | inline unsigned int 134 | ceiling(unsigned int x, unsigned int f) 135 | { 136 | return multiple(x, f) * f; 137 | } 138 | 139 | 140 | #undef __asert 141 | 142 | #ifdef __cplusplus 143 | } 144 | #endif 145 | #endif 146 | 147 | -------------------------------------------------------------------------------- /examples/common/src/PrettyPrint.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- | 3 | -- Module : PrettyPrint 4 | -- Copyright : (c) 2009 Trevor L. McDonell 5 | -- License : BSD 6 | -- 7 | -- Simple layout and pretty printing 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | module PrettyPrint where 12 | 13 | import Data.List 14 | import Text.PrettyPrint 15 | import System.IO 16 | 17 | 18 | -------------------------------------------------------------------------------- 19 | -- Printing 20 | -------------------------------------------------------------------------------- 21 | 22 | printDoc :: Doc -> IO () 23 | printDoc = putStrLn . flip (++) "\n" . 
render 24 | 25 | -- 26 | -- stolen from $fptools/ghc/compiler/utils/Pretty.lhs 27 | -- 28 | -- This code has a BSD-style license 29 | -- 30 | printDocFull :: Mode -> Handle -> Doc -> IO () 31 | printDocFull m hdl doc = do 32 | fullRender m cols 1.5 put done doc 33 | hFlush hdl 34 | where 35 | put (Chr c) next = hPutChar hdl c >> next 36 | put (Str s) next = hPutStr hdl s >> next 37 | put (PStr s) next = hPutStr hdl s >> next 38 | 39 | done = hPutChar hdl '\n' 40 | cols = 80 41 | 42 | 43 | -------------------------------------------------------------------------------- 44 | -- Layout 45 | -------------------------------------------------------------------------------- 46 | 47 | -- 48 | -- Display the given grid of renderable data, given as either a list of rows or 49 | -- columns, using the minimum size required for each column. An additional 50 | -- parameter specifies extra space to be inserted between each column. 51 | -- 52 | ppAsRows :: Int -> [[Doc]] -> Doc 53 | ppAsRows q = ppAsColumns q . transpose 54 | 55 | ppAsColumns :: Int -> [[Doc]] -> Doc 56 | ppAsColumns q = vcat . map hsep . transpose . map (\col -> pad (width col) col) 57 | where 58 | len = length . render 59 | width = maximum . map len 60 | pad w = map (\x -> x <> (hcat $ replicate (w - (len x) + q) space)) 61 | 62 | -------------------------------------------------------------------------------- /examples/common/src/RandomVector.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts, ParallelListComp #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : RandomVector 5 | -- Copyright : (c) 2009 Trevor L. 
McDonell 6 | -- License : BSD 7 | -- 8 | -- Storable multi-dimensional arrays and lists of random numbers 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module RandomVector 13 | ( 14 | Storable, 15 | module RandomVector, 16 | module Data.Array.Storable 17 | ) 18 | where 19 | 20 | import Foreign (Ptr, Storable) 21 | import Control.Monad (join) 22 | import Control.Exception (evaluate) 23 | import Data.Array.Storable 24 | import System.Random 25 | 26 | 27 | -------------------------------------------------------------------------------- 28 | -- Arrays 29 | -------------------------------------------------------------------------------- 30 | 31 | type Vector e = StorableArray Int e 32 | type Matrix e = StorableArray (Int,Int) e 33 | 34 | withVector :: Vector e -> (Ptr e -> IO a) -> IO a 35 | withVector = withStorableArray 36 | 37 | withMatrix :: Matrix e -> (Ptr e -> IO a) -> IO a 38 | withMatrix = withStorableArray 39 | 40 | 41 | -- 42 | -- To ensure the array is fully evaluated, force one element 43 | -- 44 | evaluateArr :: (Ix i, MArray StorableArray e IO) 45 | => i -> StorableArray i e -> IO (StorableArray i e) 46 | evaluateArr l arr = (join $ evaluate (arr `readArray` l)) >> return arr 47 | 48 | 49 | -- 50 | -- Generate a new random array 51 | -- 52 | randomArrR :: (Ix i, Num e, Storable e, Random e, MArray StorableArray e IO) 53 | => (i,i) -> (e,e) -> IO (StorableArray i e) 54 | randomArrR (l,u) bnds = do 55 | rg <- newStdGen 56 | let -- The standard random number generator is too slow to generate really 57 | -- large vectors. Instead, we generate a short vector and repeat that. 58 | k = 1000 59 | rands = take k (randomRs bnds rg) 60 | 61 | newListArray (l,u) [rands !! 
(index (l,u) i`mod`k) | i <- range (l,u)] >>= evaluateArr l 62 | 63 | 64 | randomArr :: (Ix i, Num e, Storable e, Random e, MArray StorableArray e IO) 65 | => (i,i) -> IO (StorableArray i e) 66 | randomArr (l,u) = randomArrR (l,u) (-1,1) 67 | 68 | 69 | -- 70 | -- Verify similarity of two arrays 71 | -- 72 | verify :: (Ix i, Ord e, Fractional e, Storable e) 73 | => StorableArray i e -> StorableArray i e -> IO (Bool) 74 | verify ref arr = do 75 | as <- getElems arr 76 | bs <- getElems ref 77 | return (verifyList as bs) 78 | 79 | 80 | -------------------------------------------------------------------------------- 81 | -- Lists 82 | -------------------------------------------------------------------------------- 83 | 84 | randomListR :: (Num e, Random e, Storable e) => Int -> (e,e) -> IO [e] 85 | randomListR len bnds = do 86 | rg <- newStdGen 87 | let -- The standard random number generator is too slow to generate really 88 | -- large vectors. Instead, we generate a short vector and repeat that. 89 | k = 1000 90 | rands = take k (randomRs bnds rg) 91 | 92 | evaluate [rands !! (i`mod`k) | i <- [0..len-1]] 93 | 94 | randomList :: (Num e, Random e, Storable e) => Int -> IO [e] 95 | randomList len = randomListR len (-1,1) 96 | 97 | 98 | verifyList :: (Ord e, Fractional e) => [e] -> [e] -> Bool 99 | verifyList xs ys = all (< epsilon) [abs ((x-y)/(x+y+epsilon)) | x <- xs | y <- ys] 100 | where epsilon = 0.0005 101 | 102 | -------------------------------------------------------------------------------- /examples/common/src/Time.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- 3 | -- Module : Time 4 | -- Copyright : (c) 2009 Trevor L. 
McDonell 5 | -- License : BSD 6 | -- 7 | -- Simple timing benchmarks 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | module Time where 12 | 13 | import System.CPUTime 14 | import Control.Monad 15 | 16 | 17 | -- Timing 18 | -- 19 | data Time = Time { cpu_time :: Integer } 20 | 21 | type TimeUnit = Integer -> Integer 22 | 23 | picosecond, millisecond, second :: TimeUnit 24 | picosecond n = n 25 | millisecond n = n `div` 1000000000 26 | second n = n `div` 1000000000000 27 | 28 | getTime :: IO Time 29 | getTime = Time `fmap` getCPUTime 30 | 31 | timeIn :: TimeUnit -> Time -> Integer 32 | timeIn u (Time t) = u t 33 | 34 | elapsedTime :: Time -> Time -> Time 35 | elapsedTime (Time t1) (Time t2) = Time (t2 - t1) 36 | 37 | 38 | -- Simple benchmarking 39 | -- 40 | {-# NOINLINE benchmark #-} 41 | benchmark 42 | :: Int -- Number of times to repeat test 43 | -> IO a -- Test to run 44 | -> IO b -- Finaliser to before measuring elapsed time 45 | -> IO (Time,a) 46 | benchmark n testee finaliser = do 47 | t1 <- getTime 48 | (r:_) <- replicateM n testee 49 | _ <- finaliser 50 | t2 <- getTime 51 | return (elapsedTime t1 t2, r) 52 | 53 | -------------------------------------------------------------------------------- /examples/common/src/control.cpp: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Control 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #include "utils.h" 10 | #include 11 | 12 | typedef void* KernelPointer; 13 | 14 | /* 15 | * Stolen from the RadixSort example from the CUDA SDK. 
16 | */ 17 | 18 | void 19 | computeNumCTAs(KernelPointer kernel, int smemDynamicBytes, bool bManualCoalesce) 20 | { 21 | int deviceID = -1; 22 | cudaDeviceProp devprop; 23 | cudaFuncAttributes attr; 24 | 25 | assert(cudaSuccess == cudaGetDevice(&deviceID)); 26 | assert(cudaSuccess == cudaGetDeviceProperties(&devprop, deviceID)); 27 | assert(cudaSuccess == cudaFuncGetAttributes(&attr, (const char*)kernel)); 28 | 29 | /* 30 | * Determine the maximum number of CTAs that can be run simultaneously for 31 | * each kernel. This is equivalent to the calculation done in the CUDA 32 | * Occupancy Calculator spreadsheet 33 | */ 34 | const unsigned int regAllocationUnit = (devprop.major < 2 && devprop.minor < 2) ? 256 : 512; // in registers 35 | const unsigned int warpAllocationMultiple = 2; 36 | const unsigned int smemAllocationUnit = 512; // in bytes 37 | const unsigned int maxThreadsPerSM = bManualCoalesce ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024 38 | const unsigned int maxBlocksPerSM = 8; 39 | 40 | size_t numWarps; 41 | numWarps = multiple(RadixSort::CTA_SIZE, devprop.warpSize); // Number of warps (round up to nearest whole multiple of warp size) 42 | numWarps = ceiling(numWarps, warpAllocationMultiple); // Round up to warp allocation multiple 43 | 44 | size_t regsPerCTA; 45 | regsPerCTA = attr.numRegs * devprop.warpSize * numWarps; // Number of regs is (regs per thread * number of warps * warp size) 46 | regsPerCTA = ceiling(regsPerCTA, regAllocationUnit); // Round up to multiple of register allocation unit size 47 | 48 | size_t smemBytes = attr.sharedSizeBytes + smemDynamicBytes; 49 | size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit); 50 | 51 | size_t ctaLimitRegs = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM; 52 | size_t ctaLimitSMem = smemPerCTA > 0 ? 
devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM; 53 | size_t ctaLimitThreads = maxThreadsPerSM / RadixSort::CTA_SIZE; 54 | 55 | unsigned int numSMs = devprop.multiProcessorCount; 56 | unsigned int maxCTAs = numSMs * std::min(ctaLimitRegs, std::min(ctaLimitSMem, std::min(ctaLimitThreads, maxBlocksPerSM))); 57 | // setNumCTAs(kernel, maxCTAs); 58 | } 59 | 60 | 61 | -------------------------------------------------------------------------------- /examples/common/src/cudpp/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2007-2009 The Regents of the University of California, Davis 2 | campus ("The Regents") and NVIDIA Corporation ("NVIDIA"). All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | * Neither the name of the The Regents, nor NVIDIA, nor the names of its 13 | contributors may be used to endorse or promote products derived from this 14 | software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
19 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 20 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 24 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 25 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /examples/common/src/cudpp/scan_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------- 2 | // cuDPP -- CUDA Data Parallel Primitives library 3 | // ------------------------------------------------------------- 4 | // $Revision: 5633 $ 5 | // $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $ 6 | // ------------------------------------------------------------- 7 | // This source code is distributed under the terms of license.txt 8 | // in the root directory of this source distribution. 9 | // ------------------------------------------------------------- 10 | 11 | /** 12 | * @file 13 | * scan_kernel.cu 14 | * 15 | * @brief CUDPP kernel-level scan routines 16 | */ 17 | 18 | /** \defgroup cudpp_kernel CUDPP Kernel-Level API 19 | * The CUDPP Kernel-Level API contains functions that run on the GPU 20 | * device across a grid of Cooperative Thread Array (CTA, aka Thread 21 | * Block). These kernels are declared \c __global__ so that they 22 | * must be invoked from host (CPU) code. They generally invoke GPU 23 | * \c __device__ routines in the CUDPP \link cudpp_cta CTA-Level API\endlink. 24 | * Kernel-Level API functions are used by CUDPP 25 | * \link cudpp_app Application-Level\endlink functions to implement their 26 | * functionality. 
27 | * @{ 28 | */ 29 | 30 | /** @name Scan Functions 31 | * @{ 32 | */ 33 | 34 | #include "cudpp/cudpp_globals.h" 35 | #include "cudpp/scan_cta.cu" 36 | #include "cudpp/shared_mem.h" 37 | 38 | /** 39 | * @brief Main scan kernel 40 | * 41 | * This __global__ device function performs one level of a multiblock scan on 42 | * an arbitrary-dimensioned array in \a d_in, returning the result in \a d_out 43 | * (which may point to the same array). The same function may be used for 44 | * single or multi-row scans. To perform a multirow scan, pass the width of 45 | * each row of the input row (in elements) in \a dataRowPitch, and the width of 46 | * the rows of \a d_blockSums (in elements) in \a blockSumRowPitch, and invoke 47 | * with a thread block grid with height greater than 1. 48 | * 49 | * This function peforms one level of a recursive, multiblock scan. At the 50 | * app level, this function is called by cudppScan and cudppMultiScan and used 51 | * in combination with vectorAddUniform4() to produce a complete scan. 52 | * 53 | * Template parameter \a T is the datatype of the array to be scanned. 54 | * Template parameter \a traits is the ScanTraits struct containing 55 | * compile-time options for the scan, such as whether it is forward or 56 | * backward, exclusive or inclusive, multi- or single-row, etc. 
57 | * 58 | * @param[out] d_out The output (scanned) array 59 | * @param[in] d_in The input array to be scanned 60 | * @param[out] d_blockSums The array of per-block sums 61 | * @param[in] numElements The number of elements to scan 62 | * @param[in] dataRowPitch The width of each row of \a d_in in elements 63 | * (for multi-row scans) 64 | * @param[in] blockSumRowPitch The with of each row of \a d_blockSums in elements 65 | * (for multi-row scans) 66 | */ 67 | template 68 | __global__ void scan4(T *d_out, 69 | const T *d_in, 70 | T *d_blockSums, 71 | int numElements, 72 | unsigned int dataRowPitch, 73 | unsigned int blockSumRowPitch) 74 | { 75 | SharedMemory smem; 76 | T* temp = smem.getPointer(); 77 | 78 | int devOffset, ai, bi, aiDev, biDev; 79 | T threadScan0[4], threadScan1[4]; 80 | 81 | unsigned int blockN = numElements; 82 | unsigned int blockSumIndex = blockIdx.x; 83 | 84 | if (traits::isMultiRow()) 85 | { 86 | //int width = __mul24(gridDim.x, blockDim.x) << 1; 87 | int yIndex = __umul24(blockDim.y, blockIdx.y) + threadIdx.y; 88 | devOffset = __umul24(dataRowPitch, yIndex); 89 | blockN += (devOffset << 2); 90 | devOffset += __umul24(blockIdx.x, blockDim.x << 1); 91 | blockSumIndex += __umul24(blockSumRowPitch << 2, yIndex) ; 92 | } 93 | else 94 | { 95 | devOffset = __umul24(blockIdx.x, (blockDim.x << 1)); 96 | } 97 | 98 | // load data into shared memory 99 | loadSharedChunkFromMem4 100 | (temp, threadScan0, threadScan1, d_in, 101 | blockN, devOffset, ai, bi, aiDev, biDev); 102 | 103 | scanCTA(temp, d_blockSums, blockSumIndex); 104 | 105 | // write results to device memory 106 | storeSharedChunkToMem4 107 | (d_out, threadScan0, threadScan1, temp, 108 | blockN, devOffset, ai, bi, aiDev, biDev); 109 | 110 | } 111 | 112 | /** @} */ // end scan functions 113 | /** @} */ // end cudpp_kernel 114 | -------------------------------------------------------------------------------- /examples/common/src/cudpp/segmented_scan_kernel.cu: 
-------------------------------------------------------------------------------- 1 | // *************************************************************** 2 | // cuDPP -- CUDA Data Parallel Primitives library 3 | // ------------------------------------------------------------- 4 | // $Revision: 3505 $ 5 | // $Date: 2007-07-06 09:26:06 -0700 (Fri, 06 Jul 2007) $ 6 | // ------------------------------------------------------------- 7 | // This source code is distributed under the terms of license.txt in 8 | // the root directory of this source distribution. 9 | // ------------------------------------------------------------- 10 | 11 | /** 12 | * @file 13 | * segmented_scan_kernel.cu 14 | * 15 | * @brief CUDPP kernel-level scan routines 16 | */ 17 | 18 | /** \defgroup cudpp_kernel CUDPP Kernel-Level API 19 | * The CUDPP Kernel-Level API contains functions that run on the GPU 20 | * device across a grid of Cooperative Thread Array (CTA, aka Thread 21 | * Block). These kernels are declared \c __global__ so that they 22 | * must be invoked from host (CPU) code. They generally invoke GPU 23 | * \c __device__ routines in the CUDPP \link cudpp_cta CTA-Level API\endlink. 24 | * Kernel-Level API functions are used by CUDPP 25 | * \link cudpp_app Application-Level\endlink functions to implement their 26 | * functionality. 27 | * @{ 28 | */ 29 | 30 | /** @name Segmented scan Functions 31 | * @{ 32 | */ 33 | 34 | #include "cudpp/cudpp_globals.h" 35 | #include "cudpp/segmented_scan_cta.cu" 36 | #include "shared_mem.h" 37 | 38 | /** 39 | * @brief Main segmented scan kernel 40 | * 41 | * This __global__ device function performs one level of a multiblock 42 | * segmented scan on an one-dimensioned array in \a d_idata, returning the 43 | * result in \a d_odata (which may point to the same array). 44 | * 45 | * This function performs one level of a recursive, multiblock scan. 
At the 46 | * app level, this function is called by cudppSegmentedScan and used in combination 47 | * with either vectorSegmentedAddUniform4() (forward) or 48 | * vectorSegmentedAddUniformToRight4() (backward) to produce a complete segmented scan. 49 | * 50 | * Template parameter \a T is the datatype of the array to be scanned. 51 | * Template parameter \a traits is the SegmentedScanTraits struct containing 52 | * compile-time options for the segmented scan, such as whether it is forward 53 | * or backward, inclusive or exclusive, etc. 54 | * 55 | * @param[out] d_odata The output (scanned) array 56 | * @param[in] d_idata The input array to be scanned 57 | * @param[in] d_iflags The input array of flags 58 | * @param[out] d_blockSums The array of per-block sums 59 | * @param[out] d_blockFlags The array of per-block OR-reduction of flags 60 | * @param[out] d_blockIndices The array of per-block min-reduction of indices 61 | * @param[in] numElements The number of elements to scan 62 | */ 63 | template 64 | __global__ 65 | void segmentedScan4(T *d_odata, 66 | const T *d_idata, 67 | const unsigned int *d_iflags, 68 | unsigned int numElements, 69 | T *d_blockSums=0, 70 | unsigned int *d_blockFlags=0, 71 | unsigned int *d_blockIndices=0 72 | ) 73 | { 74 | SharedMemory smem; 75 | T* temp = smem.getPointer(); 76 | 77 | int ai, bi, aiDev, biDev; 78 | 79 | // Last index in shared memory which contains data 80 | unsigned int lastIdx = ((blockDim.x << 1)-1); 81 | 82 | // Chop up the shared memory into 4 contiguous spaces - the first 83 | // for the data, the second for the indices, the third for the 84 | // read-only version of the flags and the last for the read-write 85 | // version of the flags 86 | unsigned int* indices = (unsigned int *)(&(temp[lastIdx + 1])); 87 | unsigned int* flags = (unsigned int *)(&(temp[2*(lastIdx + 1)])); 88 | 89 | T threadScan0[4]; 90 | T threadScan1[4]; 91 | unsigned int threadFlag = 0; 92 | 93 | int devOffset = blockIdx.x * (blockDim.x << 1); 94 | 
95 | // load data into shared memory 96 | loadForSegmentedScanSharedChunkFromMem4( 97 | temp, threadScan0, threadScan1, threadFlag, 98 | flags, indices, d_idata, d_iflags, 99 | numElements, devOffset, ai, bi, aiDev, biDev); 100 | 101 | segmentedScanCTA( 102 | temp, flags, indices, 103 | d_blockSums, d_blockFlags, d_blockIndices); 104 | 105 | // write results to device memory 106 | storeForSegmentedScanSharedChunkToMem4( 107 | d_odata, threadScan0, threadScan1, threadFlag, 108 | temp, numElements, devOffset, ai, bi, aiDev, biDev); 109 | } 110 | 111 | /** @} */ // end scan functions 112 | /** @} */ // end cudpp_kernel 113 | -------------------------------------------------------------------------------- /examples/src/bandwidthTest/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := bandwidthTest 9 | 10 | HSMAIN := BandwidthTest.hs 11 | CUFILES := 12 | 13 | EXTRALIBS := 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/bandwidthTest/results/Bandwidth.numbers: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmcdonell/cuda/5bc08e3101946d1c9f7a35d5d57426b59b840479/examples/src/bandwidthTest/results/Bandwidth.numbers -------------------------------------------------------------------------------- /examples/src/bandwidthTest/results/GT120.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tmcdonell/cuda/5bc08e3101946d1c9f7a35d5d57426b59b840479/examples/src/bandwidthTest/results/GT120.pdf -------------------------------------------------------------------------------- /examples/src/bandwidthTest/results/Tesla.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmcdonell/cuda/5bc08e3101946d1c9f7a35d5d57426b59b840479/examples/src/bandwidthTest/results/Tesla.pdf -------------------------------------------------------------------------------- /examples/src/enum/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | STATIC_LIB := libenum.a 9 | 10 | CUFILES := enum.cu 11 | 12 | EXTRALIBS := stdc++ 13 | 14 | # ------------------------------------------------------------------------------ 15 | # Haskell/CUDA build system 16 | # ------------------------------------------------------------------------------ 17 | include ../../common/common.mk 18 | -------------------------------------------------------------------------------- /examples/src/enum/enum.cu: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Enum 4 | * Copyright : (c) 2009 Trevor L. 
/* -----------------------------------------------------------------------------
 *
 * Module    : Enum
 * Copyright : (c) 2009 Trevor L. McDonell
 * License   : BSD
 *
 * ---------------------------------------------------------------------------*/

#include "enum.h"

#include "algorithms.h"
#include "utils.h"
#include <stdint.h>


/*
 * Choose a launch configuration for `n` elements: one element per thread,
 * block size capped at MAX_THREADS, and enough blocks to cover the input.
 */
static void
enum_control(int32_t n, uint32_t &blocks, uint32_t &threads)
{
    threads = min(ceilPow2(n), MAX_THREADS);
    blocks  = (n + threads - 1) / threads;
}


/*
 * Write the arithmetic sequence [from, from+delta, ..] into `out`. The
 * comparison against `to` doubles as the bounds check for the spare threads
 * of the final block, whose `val` has already stepped past the limit.
 *
 * BUG FIX: the original body read
 *
 *     if (increasing) if (val <= to) out[idx] = val;
 *     else            if (val >= to) out[idx] = val;
 *
 * C++ attaches a dangling `else` to the NEAREST unmatched `if`, so the
 * second test became the else-branch of the inner conditional and a
 * decreasing sequence never wrote anything. Braces restore the intended
 * pairing.
 *
 * NOTE(review): the template parameter list was lost in transcription;
 * <typename T, bool increasing> reconstructed from the call sites — confirm
 * against the original file.
 */
template <typename T, bool increasing>
__global__ static void
enum_core(T *out, const T from, const T then, const T to)
{
    uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const T  val = from + (then - from) * idx;

    if (increasing) {
        if (val <= to) out[idx] = val;
    } else {
        if (val >= to) out[idx] = val;
    }
}

/*
 * Host-side driver: work out the sequence length and direction, pick a
 * launch configuration, and dispatch the statically-specialised kernel.
 */
template <typename T>
static void
enumFromThenTo(T *out, const T from, const T then, const T to)
{
    uint32_t threads;
    uint32_t blocks;
    int32_t  n          = 1 + (to - from) / (then - from);
    bool     increasing = then - from > 0;

    if (n <= 0) return;    // empty sequence: step moves away from `to`

    enum_control(n, blocks, threads);
    if (increasing) enum_core<T,true> <<<blocks,threads>>>(out, from, then, to);
    else            enum_core<T,false><<<blocks,threads>>>(out, from, then, to);
}


// -----------------------------------------------------------------------------
// Instances
// -----------------------------------------------------------------------------

/* enumFromTo over int32: unit stride, i.e. [l, l+1, .., u] */
void enumFromTo_i(int32_t *out, int32_t l, int32_t u)
{
    enumFromThenTo(out, l, l+1, u);
}
McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | 10 | #ifndef __ENUM_H__ 11 | #define __ENUM_H__ 12 | 13 | #define MAX_THREADS 128 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /examples/src/fold/Fold.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : Fold 5 | -- Copyright : (c) 2009 Trevor L. McDonell 6 | -- License : BSD 7 | -- 8 | -- Reduce a vector to a single value 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module Main where 13 | 14 | #include "fold.h" 15 | 16 | -- Friends 17 | import C2HS 18 | import Time 19 | import RandomVector 20 | 21 | -- System 22 | import Control.Exception 23 | import qualified Foreign.CUDA.Runtime as CUDA 24 | 25 | 26 | -------------------------------------------------------------------------------- 27 | -- Reference 28 | -------------------------------------------------------------------------------- 29 | 30 | foldRef :: Num e => [e] -> IO e 31 | foldRef xs = do 32 | (t,r) <- benchmark 100 (evaluate (foldl (+) 0 xs)) (return ()) 33 | putStrLn $ "== Reference: " ++ shows (fromInteger (timeIn millisecond t)/100::Float) " ms" 34 | return r 35 | 36 | -------------------------------------------------------------------------------- 37 | -- CUDA 38 | -------------------------------------------------------------------------------- 39 | 40 | -- 41 | -- Note that this requires two memory copies: once from a Haskell list to the C 42 | -- heap, and from there into the graphics card memory. See the `bandwidthTest' 43 | -- example for the atrocious performance of this operation. 44 | -- 45 | -- For this test, cheat a little and just time the pure computation. 
--------------------------------------------------------------------------------
-- CUDA
--------------------------------------------------------------------------------

--
-- Note that this requires two memory copies: once from a Haskell list to the C
-- heap, and from there into the graphics card memory. See the `bandwidthTest'
-- example for the atrocious performance of this operation.
--
-- For this test, cheat a little and just time the pure computation.
--
foldCUDA :: [Float] -> IO Float
foldCUDA xs = do
  let len = length xs
  CUDA.withListArray xs $ \d_xs -> do
    (t,r) <- benchmark 100 (fold_plusf d_xs len) CUDA.sync
    putStrLn $ "== CUDA: " ++ shows (fromInteger (timeIn millisecond t)/100::Float) " ms"
    return r

{# fun unsafe fold_plusf
  { withDP* `CUDA.DevicePtr Float'
  ,         `Int'
  }
  -> `Float' cFloatConv #}
  where
    withDP p a = CUDA.withDevicePtr p $ \p' -> a (castPtr p')


--------------------------------------------------------------------------------
-- Main
--------------------------------------------------------------------------------

main :: IO ()
main = do
  dev   <- CUDA.get
  props <- CUDA.props dev
  putStrLn $ "Using device " ++ show dev ++ ": " ++ CUDA.deviceName props

  xs   <- randomList 30000
  ref  <- foldRef  xs
  cuda <- foldCUDA xs

  -- BUG FIX: the original compared the *signed* relative error against the
  -- tolerance, so any negative deviation (cuda > ref, for positive ref)
  -- validated as "Ok!" no matter how large. Take the absolute value.
  putStrLn $ "== Validating: " ++ if abs ((ref-cuda)/ref) < 0.0001 then "Ok!" else "INVALID!"
3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := fold 9 | 10 | HSMAIN := Fold.chs 11 | CUFILES := fold.cu 12 | 13 | EXTRALIBS := stdc++ 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/fold/fold.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Fold 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #ifndef __FOLD_H__ 10 | #define __FOLD_H__ 11 | 12 | /* 13 | * Optimised for Tesla. 14 | * Maximum thread occupancy for your card may be achieved with different values. 15 | */ 16 | #define MAX_THREADS 128 17 | #define MAX_BLOCKS 64 18 | 19 | #ifdef __cplusplus 20 | extern "C" { 21 | #endif 22 | 23 | /* 24 | * Instances 25 | */ 26 | float fold_plusf(float *xs, int N); 27 | 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | #endif 33 | -------------------------------------------------------------------------------- /examples/src/foldSeg/foldSeg.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : FoldSeg 4 | * Copyright : (c) 2009 Trevor L. 
McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #ifndef __FOLD_H__ 10 | #define __FOLD_H__ 11 | 12 | /* 13 | * Optimised for Tesla. 14 | * Maximum thread occupancy for your card may be achieved with different values. 15 | */ 16 | #define MAX_THREADS 128 17 | #define MAX_BLOCKS 64 18 | #define WARP_SIZE 32 19 | #define ELTS_PER_THREAD 8 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | /* 26 | * Instances 27 | */ 28 | float foldSeg_plusf(float *xs, int N); 29 | 30 | 31 | #ifdef __cplusplus 32 | } 33 | #endif 34 | #endif 35 | 36 | -------------------------------------------------------------------------------- /examples/src/map2/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := map 9 | 10 | HSMAIN := Map.hs 11 | CUFILES := map.cu 12 | 13 | EXTRALIBS := stdc++ 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/map2/Map.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : Map 5 | -- Copyright : (c) 2010 Trevor L. 
McDonell 6 | -- License : BSD 7 | -- 8 | -------------------------------------------------------------------------------- 9 | 10 | module Main where 11 | 12 | import qualified Foreign.CUDA as CUDA 13 | 14 | import Foreign.Storable 15 | import Foreign.Storable.Tuple () 16 | import qualified Foreign.Storable.Traversable as Store 17 | 18 | instance Storable a => Storable [a] where 19 | sizeOf = Store.sizeOf 20 | alignment = Store.alignment 21 | peek = Store.peek (error "instance Traversable [a] is lazy, so we do not provide a real value here") 22 | poke = Store.poke 23 | 24 | main :: IO () 25 | main = 26 | let len = 16 27 | xs = take len [0..] :: [Int] 28 | ys = take len [0..] :: [Int] 29 | in 30 | CUDA.allocaArray len $ \d_out -> 31 | CUDA.withListArray xs $ \d_in0 -> 32 | CUDA.withListArray ys $ \d_in1 -> do 33 | CUDA.setConfig (1,1) (len,1,1) 0 Nothing 34 | CUDA.setParams [CUDA.VArg d_out, CUDA.VArg (d_in0, d_in1), CUDA.IArg len] 35 | CUDA.launch "map" 36 | 37 | out :: [Int] <- CUDA.peekListArray len d_out 38 | print out 39 | 40 | -------------------------------------------------------------------------------- /examples/src/map2/map.cu: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Map 4 | * Copyright : (c) 2010 Trevor L. 
/* -----------------------------------------------------------------------------
 *
 * Module    : Map
 * Copyright : (c) 2010 Trevor L. McDonell
 * License   : BSD
 *
 * ---------------------------------------------------------------------------*/


typedef unsigned int Ix;

typedef int TyOut;
typedef int TyIn0;
typedef int TyIn1;

typedef TyOut* ArrOut;

/* One logical element, gathered from the pair of input arrays */
typedef struct {
    TyIn0 a;
    TyIn1 b;
} ArrElem0;

/* Structure-of-arrays input: two parallel device arrays */
typedef struct {
    const TyIn0 *a;
    const TyIn1 *b;
} ArrIn0;


/* Gather the idx'th element of both component arrays into a single value */
static inline __device__
ArrElem0 indexArray0(const ArrIn0 arr, const Ix idx)
{
    ArrElem0 elem = { arr.a[idx], arr.b[idx] };
    return elem;
}

/* The function being mapped: pairwise addition of the two components */
__device__ static TyOut
apply(const ArrElem0 in0)
{
    return in0.a + in0.b;
}

/*
 * Apply the function to each element of the array. Each thread processes
 * multiple elements, striding the array by the grid size.
 */
extern "C"
__global__ void
map
(
    ArrOut              d_out,
    const ArrIn0        d_in0,
    const unsigned int  length
)
{
    const unsigned int gridSize = __umul24(blockDim.x, gridDim.x);
    unsigned int       idx      = __umul24(blockDim.x, blockIdx.x) + threadIdx.x;

    while (idx < length)
    {
        d_out[idx] = apply(indexArray0(d_in0, idx));
        idx       += gridSize;
    }
}

// vim:filetype=cuda.c
3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := matrixMul 9 | 10 | HSMAIN := MatrixMul.hs 11 | CUFILES := matrix_mul.cu 12 | 13 | EXTRALIBS := stdc++ 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/matrixMul/MatrixMul.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : MatrixMul 5 | -- Copyright : (c) 2009 Trevor L. McDonell 6 | -- License : BSD 7 | -- 8 | -- Matrix multiplication using runtime interface and execution control instead 9 | -- of calling C functions via the FFI. 
10 | -- 11 | -------------------------------------------------------------------------------- 12 | 13 | module Main where 14 | 15 | #include "matrix_mul.h" 16 | 17 | -- Friends 18 | import Time 19 | import RandomVector 20 | 21 | -- System 22 | import Data.Array 23 | import System.IO 24 | import Foreign 25 | import qualified Foreign.CUDA as CUDA 26 | 27 | 28 | -------------------------------------------------------------------------------- 29 | -- Reference 30 | -------------------------------------------------------------------------------- 31 | 32 | matMult :: (Num e, Storable e) => Matrix e -> Matrix e -> IO (Matrix e) 33 | matMult mx my = do 34 | x <- unsafeFreeze mx 35 | y <- unsafeFreeze my 36 | let ((li, lj), (ui, uj)) = bounds x 37 | ((li',lj'),(ui',uj')) = bounds y 38 | resBnds | (lj,uj) == (li',ui') = ((li,lj'),(ui,uj')) 39 | | otherwise = error "matrix dimensions must agree" 40 | 41 | newListArray resBnds [sum [x!(i,k) * y!(k,j) | k <- range (lj,uj)] 42 | | i <- range (li,ui) 43 | , j <- range (lj',uj') ] 44 | 45 | 46 | -------------------------------------------------------------------------------- 47 | -- CUDA 48 | -------------------------------------------------------------------------------- 49 | 50 | matMultCUDA :: (Num e, Storable e) => Matrix e -> Matrix e -> IO (Matrix e) 51 | matMultCUDA xs' ys' = doMult undefined xs' ys' 52 | where 53 | doMult :: (Num e', Storable e') => e' -> Matrix e' -> Matrix e' -> IO (Matrix e') 54 | doMult dummy xs ys = do 55 | 56 | -- Setup matrix parameters 57 | -- 58 | ((li, lj), (ui, uj)) <- getBounds xs 59 | ((li',lj'),(ui',uj')) <- getBounds ys 60 | let wx = rangeSize (lj,uj) 61 | hx = rangeSize (li,ui) 62 | wy = rangeSize (lj',uj') 63 | hy = rangeSize (li',ui') 64 | resBnds | wx == hy = ((li,lj'),(ui,uj')) 65 | | otherwise = error "matrix dimensions must agree" 66 | 67 | -- Allocate memory and copy test data 68 | -- 69 | CUDA.allocaArray (wx*hx) $ \d_xs -> do 70 | CUDA.allocaArray (wy*hy) $ \d_ys -> do 71 | 
CUDA.allocaArray (wy*hx) $ \d_zs -> do 72 | withMatrix xs $ \p -> CUDA.pokeArray (wx*hx) p d_xs 73 | withMatrix ys $ \p -> CUDA.pokeArray (wy*hy) p d_ys 74 | 75 | -- Launch the kernel 76 | -- 77 | let gridDim = (wy`div`BLOCK_SIZE, hx`div`BLOCK_SIZE) 78 | blockDim = (BLOCK_SIZE,BLOCK_SIZE,1) 79 | sharedMem = 2 * BLOCK_SIZE * BLOCK_SIZE * fromIntegral (sizeOf dummy) 80 | 81 | CUDA.setConfig gridDim blockDim sharedMem Nothing 82 | CUDA.setParams [CUDA.VArg d_xs, CUDA.VArg d_ys, CUDA.VArg d_zs, CUDA.IArg wx, CUDA.IArg wy] 83 | CUDA.launch "matrixMul" 84 | 85 | -- Copy back result 86 | zs <- newArray_ resBnds 87 | withMatrix zs $ \p -> CUDA.peekArray (wy*hx) d_zs p 88 | return zs 89 | 90 | 91 | -------------------------------------------------------------------------------- 92 | -- Main 93 | -------------------------------------------------------------------------------- 94 | 95 | main :: IO () 96 | main = do 97 | dev <- CUDA.get 98 | props <- CUDA.props dev 99 | putStrLn $ "Using device " ++ show dev ++ ": " ++ CUDA.deviceName props 100 | 101 | xs <- randomArr ((1,1),(8*BLOCK_SIZE, 4*BLOCK_SIZE)) :: IO (Matrix Float) 102 | ys <- randomArr ((1,1),(4*BLOCK_SIZE,12*BLOCK_SIZE)) :: IO (Matrix Float) 103 | 104 | putStr "== Reference: " >> hFlush stdout 105 | (tr,ref) <- benchmark 100 (matMult xs ys) (return ()) 106 | putStrLn $ shows (fromInteger (timeIn millisecond tr) / 100::Float) " ms" 107 | 108 | putStr "== CUDA: " >> hFlush stdout 109 | (tc,mat) <- benchmark 100 (matMultCUDA xs ys) (CUDA.sync) 110 | putStrLn $ shows (fromInteger (timeIn millisecond tc) / 100::Float) " ms" 111 | 112 | putStr "== Validating: " 113 | verify ref mat >>= \rv -> putStrLn $ if rv then "Ok!" else "INVALID!" 114 | 115 | -------------------------------------------------------------------------------- /examples/src/matrixMul/matrix_mul.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. 
3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property and 5 | * proprietary rights in and to this software and related documentation. 6 | * Any use, reproduction, disclosure, or distribution of this software 7 | * and related documentation without an express license agreement from 8 | * NVIDIA Corporation is strictly prohibited. 9 | * 10 | * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 | * associated with this source code for terms and conditions that govern 12 | * your use of this NVIDIA software. 13 | * 14 | */ 15 | 16 | /* Matrix multiplication: C = A * B. 17 | * Device code. 18 | */ 19 | 20 | #ifndef _MATRIXMUL_KERNEL_H_ 21 | #define _MATRIXMUL_KERNEL_H_ 22 | 23 | #include 24 | #include "matrix_mul.h" 25 | 26 | #define CHECK_BANK_CONFLICTS 0 27 | #if CHECK_BANK_CONFLICTS 28 | #define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j)) 29 | #define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j)) 30 | #else 31 | #define AS(i, j) As[i][j] 32 | #define BS(i, j) Bs[i][j] 33 | #endif 34 | 35 | //////////////////////////////////////////////////////////////////////////////// 36 | //! Matrix multiplication on the device: C = A * B 37 | //! 
////////////////////////////////////////////////////////////////////////////////
//! Tiled matrix multiplication on the device: C = A * B.
//! wA is A's width and wB is B's width; all dimensions are assumed to be
//! multiples of BLOCK_SIZE.
////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void
matrixMul(float* A, float* B, float* C, int wA, int wB)
{
    // Shared-memory tiles holding the current sub-matrices of A and B.
    // (AS/BS index through these so the CHECK_BANK_CONFLICTS hook still works.)
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // First and last tile of A for this block row, plus the strides used to
    // walk A along its row and B down its column, one tile per iteration.
    const int aBegin = wA * BLOCK_SIZE * blockIdx.y;
    const int aEnd   = aBegin + wA - 1;
    const int aStep  = BLOCK_SIZE;
    const int bBegin = BLOCK_SIZE * blockIdx.x;
    const int bStep  = BLOCK_SIZE * wB;

    // Running dot product for the single output element owned by this thread.
    float acc = 0;

    // March the paired tiles of A and B across the shared dimension.
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
    {
        // Stage one element of each input tile per thread.
        AS(ty, tx) = A[a + wA * ty + tx];
        BS(ty, tx) = B[b + wB * ty + tx];

        // Wait until both tiles are fully populated before reading them.
        __syncthreads();

        // Accumulate the partial dot product contributed by this tile.
        for (int k = 0; k < BLOCK_SIZE; ++k)
            acc += AS(ty, k) * BS(k, tx);

        // All threads must be done reading before the tiles are overwritten.
        __syncthreads();
    }

    // Write back: one element of the output block per thread.
    const int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x;
    C[c + wB * ty + tx] = acc;
}

#endif // #ifndef _MATRIXMUL_KERNEL_H_
13 | * 14 | */ 15 | 16 | #ifndef _MATRIXMUL_H_ 17 | #define _MATRIXMUL_H_ 18 | 19 | /* Thread block size */ 20 | #define BLOCK_SIZE 16 21 | 22 | /* Matrix dimensions 23 | * (chosen as multiples of the thread block size for simplicity) 24 | */ 25 | #define WA (3 * BLOCK_SIZE) /* Matrix A width */ 26 | #define HA (5 * BLOCK_SIZE) /* Matrix A height */ 27 | #define WB (8 * BLOCK_SIZE) /* Matrix B width */ 28 | #define HB WA /* Matrix B height */ 29 | #define WC WB /* Matrix C width */ 30 | #define HC HA /* Matrix C height */ 31 | 32 | #endif /* _MATRIXMUL_H_ */ 33 | 34 | -------------------------------------------------------------------------------- /examples/src/matrixMulDrv/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := matrixMulDrv 9 | 10 | HSMAIN := MatrixMul.hs 11 | PTXFILES := matrix_mul.cu 12 | 13 | USEDRVAPI := 1 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/matrixMulDrv/MatrixMul.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : MatrixMul 5 | -- Copyright : (c) 2009 Trevor L. 
McDonell 6 | -- License : BSD 7 | -- 8 | -- Matrix multiplication using driver interface 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module Main where 13 | 14 | #include "matrix_mul.h" 15 | 16 | -- Friends 17 | import RandomVector 18 | 19 | -- System 20 | import Numeric 21 | import Data.Array 22 | import Control.Exception 23 | import Data.Array.Storable 24 | import Foreign.Storable 25 | import qualified Data.ByteString.Char8 as B 26 | import qualified Foreign.CUDA.Driver as CUDA 27 | 28 | 29 | -- Return the (width,height) of a matrix 30 | -- 31 | getSize :: Storable e => Matrix e -> IO (Int,Int) 32 | getSize mat = do 33 | ((li,lj),(ui,uj)) <- getBounds mat 34 | return (rangeSize (lj,uj), rangeSize (li,ui)) 35 | 36 | -------------------------------------------------------------------------------- 37 | -- Reference implementation 38 | -------------------------------------------------------------------------------- 39 | 40 | matMult :: (Num e, Storable e) => Matrix e -> Matrix e -> IO (Matrix e) 41 | matMult mx my = do 42 | x <- unsafeFreeze mx 43 | y <- unsafeFreeze my 44 | let ((li, lj), (ui, uj)) = bounds x 45 | ((li',lj'),(ui',uj')) = bounds y 46 | resBnds | (lj,uj) == (li',ui') = ((li,lj'),(ui,uj')) 47 | | otherwise = error "matrix dimensions must agree" 48 | 49 | newListArray resBnds [sum [x!(i,k) * y!(k,j) | k <- range (lj,uj)] 50 | | i <- range (li,ui) 51 | , j <- range (lj',uj') ] 52 | 53 | 54 | -------------------------------------------------------------------------------- 55 | -- CUDA 56 | -------------------------------------------------------------------------------- 57 | 58 | -- 59 | -- Initialise the device and context. Load the PTX source code, and return a 60 | -- reference to the kernel function. 
61 | -- 62 | initCUDA :: IO (CUDA.Context, CUDA.Fun) 63 | initCUDA = do 64 | CUDA.initialise [] 65 | dev <- CUDA.device 0 66 | ctx <- CUDA.create dev [] 67 | ptx <- B.readFile "data/matrix_mul.ptx" 68 | (mdl,r) <- CUDA.loadDataEx ptx [CUDA.ThreadsPerBlock (BLOCK_SIZE*BLOCK_SIZE)] 69 | fun <- CUDA.getFun mdl "matrixMul" 70 | 71 | putStrLn $ ">> PTX JIT compilation (" ++ showFFloat (Just 2) (CUDA.jitTime r) " ms)" 72 | B.putStrLn (CUDA.jitInfoLog r) 73 | return (ctx,fun) 74 | 75 | 76 | -- 77 | -- Allocate some memory, and copy over the input data to the device. Should 78 | -- probably catch allocation exceptions individually... 79 | -- 80 | initData :: (Num e, Storable e) 81 | => Matrix e -> Matrix e -> IO (CUDA.DevicePtr e, CUDA.DevicePtr e, CUDA.DevicePtr e) 82 | initData xs ys = do 83 | (wx,hx) <- getSize xs 84 | (wy,hy) <- getSize ys 85 | dxs <- CUDA.mallocArray (wx*hx) 86 | dys <- CUDA.mallocArray (wy*hy) 87 | res <- CUDA.mallocArray (wy*hx) 88 | 89 | flip onException (mapM_ CUDA.free [dxs,dys,res]) $ do 90 | withMatrix xs $ \p -> CUDA.pokeArray (wx*hx) p dxs 91 | withMatrix ys $ \p -> CUDA.pokeArray (wy*hy) p dys 92 | return (dxs, dys, res) 93 | 94 | 95 | -- 96 | -- Run the test 97 | -- 98 | testCUDA :: (Num e, Storable e) => Matrix e -> Matrix e -> IO (Matrix e) 99 | testCUDA xs' ys' = doTest undefined xs' ys' 100 | where 101 | doTest :: (Num e', Storable e') => e' -> Matrix e' -> Matrix e' -> IO (Matrix e') 102 | doTest dummy xs ys = do 103 | (widthX,heightX) <- getSize xs 104 | (widthY,_) <- getSize ys 105 | ((li, lj), (ui, uj)) <- getBounds xs 106 | ((li',lj'),(ui',uj')) <- getBounds ys 107 | let resBnds | (lj,uj) == (li',ui') = ((li,lj'),(ui,uj')) 108 | | otherwise = error "matrix dimensions must agree" 109 | 110 | -- Initialise environment and copy over test data 111 | -- 112 | putStrLn ">> Initialising" 113 | bracket initCUDA (\(ctx,_) -> CUDA.destroy ctx) $ \(_,matMul) -> do 114 | 115 | -- Ensure we release the memory, even if there was an error 116 | 
-- 117 | putStrLn ">> Executing" 118 | bracket 119 | (initData xs ys) 120 | (\(dx,dy,dz) -> mapM_ CUDA.free [dx,dy,dz]) $ 121 | \(dx,dy,dz) -> do 122 | -- Repeat test many times... 123 | -- 124 | CUDA.setParams matMul [CUDA.VArg dx, CUDA.VArg dy, CUDA.VArg dz, CUDA.IArg widthX, CUDA.IArg widthY] 125 | CUDA.setBlockShape matMul (BLOCK_SIZE,BLOCK_SIZE,1) 126 | CUDA.setSharedSize matMul (fromIntegral (2 * BLOCK_SIZE * BLOCK_SIZE * sizeOf dummy)) 127 | CUDA.launch matMul (widthY `div` BLOCK_SIZE, heightX `div` BLOCK_SIZE) Nothing 128 | CUDA.sync 129 | 130 | -- Copy back result 131 | -- 132 | zs <- newArray_ resBnds 133 | withMatrix zs $ \p -> CUDA.peekArray (widthY*heightX) dz p 134 | return zs 135 | 136 | 137 | -------------------------------------------------------------------------------- 138 | -- Test & Verify 139 | -------------------------------------------------------------------------------- 140 | 141 | main :: IO () 142 | main = do 143 | putStrLn "== Generating random matrices" 144 | xs <- randomArr ((1,1),(8*BLOCK_SIZE, 4*BLOCK_SIZE)) :: IO (Matrix Float) 145 | ys <- randomArr ((1,1),(4*BLOCK_SIZE,12*BLOCK_SIZE)) :: IO (Matrix Float) 146 | 147 | putStrLn "== Generating reference solution" 148 | ref <- matMult xs ys 149 | 150 | putStrLn "== Testing CUDA" 151 | mat <- testCUDA xs ys 152 | 153 | putStr "== Validating: " 154 | verify ref mat >>= \rv -> putStrLn $ if rv then "Ok!" else "INVALID!" 155 | 156 | -------------------------------------------------------------------------------- /examples/src/matrixMulDrv/matrix_mul.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property and 5 | * proprietary rights in and to this software and related documentation. 
6 | * Any use, reproduction, disclosure, or distribution of this software 7 | * and related documentation without an express license agreement from 8 | * NVIDIA Corporation is strictly prohibited. 9 | * 10 | * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 | * associated with this source code for terms and conditions that govern 12 | * your use of this NVIDIA software. 13 | * 14 | */ 15 | 16 | /* Matrix multiplication: C = A * B. 17 | * Device code. 18 | */ 19 | 20 | #ifndef _MATRIXMUL_KERNEL_H_ 21 | #define _MATRIXMUL_KERNEL_H_ 22 | 23 | #include 24 | #include "matrix_mul.h" 25 | 26 | #define CHECK_BANK_CONFLICTS 0 27 | #if CHECK_BANK_CONFLICTS 28 | #define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j)) 29 | #define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j)) 30 | #else 31 | #define AS(i, j) As[i][j] 32 | #define BS(i, j) Bs[i][j] 33 | #endif 34 | 35 | //////////////////////////////////////////////////////////////////////////////// 36 | //! Matrix multiplication on the device: C = A * B 37 | //! 
wA is A's width and wB is B's width 38 | //////////////////////////////////////////////////////////////////////////////// 39 | extern "C" __global__ void 40 | matrixMul(float* A, float* B, float* C, int wA, int wB) 41 | { 42 | // Block index 43 | int bx = blockIdx.x; 44 | int by = blockIdx.y; 45 | 46 | // Thread index 47 | int tx = threadIdx.x; 48 | int ty = threadIdx.y; 49 | 50 | // Index of the first sub-matrix of A processed by the block 51 | int aBegin = wA * BLOCK_SIZE * by; 52 | 53 | // Index of the last sub-matrix of A processed by the block 54 | int aEnd = aBegin + wA - 1; 55 | 56 | // Step size used to iterate through the sub-matrices of A 57 | int aStep = BLOCK_SIZE; 58 | 59 | // Index of the first sub-matrix of B processed by the block 60 | int bBegin = BLOCK_SIZE * bx; 61 | 62 | // Step size used to iterate through the sub-matrices of B 63 | int bStep = BLOCK_SIZE * wB; 64 | 65 | // Csub is used to store the element of the block sub-matrix 66 | // that is computed by the thread 67 | float Csub = 0; 68 | 69 | // Loop over all the sub-matrices of A and B 70 | // required to compute the block sub-matrix 71 | for (int a = aBegin, b = bBegin; 72 | a <= aEnd; 73 | a += aStep, b += bStep) { 74 | 75 | // Declaration of the shared memory array As used to 76 | // store the sub-matrix of A 77 | __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; 78 | 79 | // Declaration of the shared memory array Bs used to 80 | // store the sub-matrix of B 81 | __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; 82 | 83 | // Load the matrices from device memory 84 | // to shared memory; each thread loads 85 | // one element of each matrix 86 | AS(ty, tx) = A[a + wA * ty + tx]; 87 | BS(ty, tx) = B[b + wB * ty + tx]; 88 | 89 | // Synchronize to make sure the matrices are loaded 90 | __syncthreads(); 91 | 92 | // Multiply the two matrices together; 93 | // each thread computes one element 94 | // of the block sub-matrix 95 | for (int k = 0; k < BLOCK_SIZE; ++k) 96 | Csub += AS(ty, k) * BS(k, tx); 
97 | 98 | // Synchronize to make sure that the preceding 99 | // computation is done before loading two new 100 | // sub-matrices of A and B in the next iteration 101 | __syncthreads(); 102 | } 103 | 104 | // Write the block sub-matrix to device memory; 105 | // each thread writes one element 106 | int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; 107 | C[c + wB * ty + tx] = Csub; 108 | } 109 | 110 | #endif // #ifndef _MATRIXMUL_KERNEL_H_ 111 | -------------------------------------------------------------------------------- /examples/src/matrixMulDrv/matrix_mul.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property and 5 | * proprietary rights in and to this software and related documentation. 6 | * Any use, reproduction, disclosure, or distribution of this software 7 | * and related documentation without an express license agreement from 8 | * NVIDIA Corporation is strictly prohibited. 9 | * 10 | * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 | * associated with this source code for terms and conditions that govern 12 | * your use of this NVIDIA software. 
13 | * 14 | */ 15 | 16 | #ifndef _MATRIXMUL_H_ 17 | #define _MATRIXMUL_H_ 18 | 19 | /* Thread block size */ 20 | #define BLOCK_SIZE 16 21 | 22 | /* Matrix dimensions 23 | * (chosen as multiples of the thread block size for simplicity) 24 | */ 25 | #define WA (3 * BLOCK_SIZE) /* Matrix A width */ 26 | #define HA (5 * BLOCK_SIZE) /* Matrix A height */ 27 | #define WB (8 * BLOCK_SIZE) /* Matrix B width */ 28 | #define HB WA /* Matrix B height */ 29 | #define WC WB /* Matrix C width */ 30 | #define HC HA /* Matrix C height */ 31 | 32 | #endif /* _MATRIXMUL_H_ */ 33 | 34 | -------------------------------------------------------------------------------- /examples/src/multiGPU/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := multiGPU 9 | 10 | HSMAIN := multiGPU.hs 11 | CUFILES := simpleMultiGPU_kernel.cu 12 | 13 | EXTRALIBS := stdc++ 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | 20 | -------------------------------------------------------------------------------- /examples/src/multiGPU/multiGPU.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ParallelListComp, CPP, BangPatterns, ForeignFunctionInterface #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : MultiGPU 5 | -- Copyright : (c) 2012 Kevin Ying 6 | -- License : BSD 7 | -- 8 | -- An implemenations of Nvidia's simpleMultiGPU example 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 
| module Main where 13 | 14 | import Time 15 | 16 | -- System 17 | import System.Random.MWC 18 | import Data.List 19 | import Control.Monad 20 | import Foreign 21 | import Foreign.CUDA (HostPtr, DevicePtr, withDevicePtr, withHostPtr) 22 | import Foreign.CUDA.Runtime.Stream as CR 23 | import qualified Foreign.CUDA.Runtime as CR 24 | 25 | import qualified Data.Vector.Unboxed as U 26 | 27 | -- Settings 28 | data_n = 1048576 * 32 29 | block_n = 32 30 | thread_n = 256 31 | accum_n = block_n * thread_n 32 | 33 | -- 34 | -- |A set of data prepared for execution on a GPU 35 | -- 36 | data Plan = Plan 37 | { 38 | device :: Int, -- ^ device number 39 | stream :: Stream, -- ^ stream on the device 40 | dataN :: Int, -- ^ number of elements in data 41 | v_data :: U.Vector Float, -- ^ host data 42 | h_data :: HostPtr Float, -- ^ host data in pagelocked memory 43 | 44 | d_data :: DevicePtr Float, -- ^ device data 45 | d_sum :: DevicePtr Float, -- ^ device results 46 | 47 | h_sum_from_device :: HostPtr Float -- ^ results from device 48 | } 49 | deriving Show 50 | 51 | 52 | main :: IO () 53 | main = do 54 | gpu_n <- CR.count 55 | putStrLn $ "Number of GPUs found: " ++ show gpu_n 56 | 57 | -- Execute on all GPUs 58 | putStrLn $ "Executing " ++ (show gpu_n) ++ " sets of data over " ++ (show gpu_n) ++ " GPU(s)" 59 | withPlans gpu_n gpu_n executePlans 60 | 61 | -- Use only first GPU 62 | putStrLn $ "Executing " ++ (show gpu_n) ++ " sets of data over 1 GPU" 63 | withPlans gpu_n 1 executePlans 64 | 65 | putStrLn "Finished" 66 | 67 | 68 | -- |Launches kernels on the specified gpu and compares with the CPU 69 | executePlans :: [Plan] -> IO () 70 | executePlans plans = do 71 | --Copy data to GPU, launch Kernel, copy data back all asynchronously 72 | (t,_) <- flip (benchmark 100) CR.sync $ forM_ plans $ \plan -> do 73 | CR.set (device plan) 74 | --Copy input data from CPU 75 | CR.pokeArrayAsync (dataN plan) (h_data plan) (d_data plan) (Just $ stream plan) 76 | --Perform GPU computations 77 | 
reduceKernel (d_sum plan) (d_data plan) (dataN plan) block_n thread_n (stream plan)
        -- Read back GPU results
        CR.peekArrayAsync accum_n (d_sum plan) (h_sum_from_device plan) (Just $ stream plan)

  -- Process results: wait for each device's stream, then reduce the partial
  -- sums the kernel left in page-locked host memory
  h_sumGPU <- forM plans $ \plan -> do
    CR.set (device plan)
    CR.block (stream plan)
    hs_sum <- withHostPtr (h_sum_from_device plan) $ \ptr -> peekArray accum_n ptr
    return $ sum hs_sum
  let sumGPU = sum h_sumGPU

  putStrLn $ "  GPU processing time: " ++ (show $ timeIn millisecond t) ++ " ms"

  -- Reference result computed on the CPU from the original host vectors
  let sumCPU = sum $ map (\p -> U.sum $ v_data p) plans

  let diff = (abs $ sumCPU - sumGPU) / (abs sumCPU)
  putStrLn $ "  GPU sum: " ++ show sumGPU
  putStrLn $ "  CPU sum: " ++ show sumCPU
  putStrLn $ "  Relative difference: " ++ show diff

  if (diff < 1e-4) then
      putStrLn $ "Passed!"
    else
      putStrLn $ "Failed!"

-- |Makes n plans spread over ndevices
withPlans :: Int -> Int -> ([Plan] -> IO ()) -> IO ()
withPlans n ndevices action = do
  devStrms' <- forM [0..(ndevices-1)] $ \dev -> do
    CR.set dev
    strm <- CR.create
    return (dev, strm)

  -- Round-robin the n data sets over the available devices/streams
  let devStrms = take n $ cycle devStrms'

  plans <- forM devStrms $ \(i, stream) -> do
    -- Distribute any remainder of data_n over the first few plans
    let dataN = if (i < data_n `mod` n) then (data_n `div` n) + 1 else data_n `div` n
    CR.set i
    -- Allocate memory
    -- Host: page-locked arrays so the async copies can overlap
    v_data <- randomList dataN
    h_data <- CR.mallocHostArray [] dataN
    withHostPtr h_data $ \ptr -> pokeArray ptr (U.toList v_data)

    h_sum_from_device <- CR.mallocHostArray [] accum_n

    -- Device
    -- FIX: mallocArray takes a count of *elements* (the Storable instance
    -- supplies the element size); the original multiplied by
    -- sizeOf (undefined :: DevicePtr Float), over-allocating each buffer by
    -- a factor of the pointer size.
    d_data <- CR.mallocArray dataN
    d_sum  <- CR.mallocArray accum_n

    return $ Plan i stream dataN v_data h_data d_data d_sum h_sum_from_device

  action plans

  -- clean up
  forM_ plans $ \plan
-> do 136 | CR.set (device plan) 137 | CR.freeHost (h_data plan) 138 | CR.freeHost (h_sum_from_device plan) 139 | 140 | CR.free (d_data plan) 141 | CR.free (d_sum plan) 142 | --CR.destroy (stream plan) 143 | forM_ devStrms' $ \(_,stream) -> CR.destroy stream 144 | 145 | 146 | randomList :: Int -> IO (U.Vector Float) 147 | randomList n = withSystemRandom $ \gen -> U.replicateM n (uniform gen :: IO Float) 148 | 149 | -- |Binding for the reduce kernel 150 | reduceKernel :: DevicePtr Float -> DevicePtr Float -> Int -> Int -> Int -> Stream -> IO () 151 | reduceKernel a1 a2 a3 a4 a5 a6 = 152 | withDevicePtr a1 $ \a1' -> 153 | withDevicePtr a2 $ \a2' -> 154 | reduceKernel'_ a1' a2' a3 a4 a5 a6 155 | 156 | foreign import ccall unsafe "simpleMultiGPU.h launch_reduceKernel" 157 | reduceKernel'_ :: Ptr Float -> Ptr Float -> Int -> Int -> Int -> Stream -> IO () 158 | -------------------------------------------------------------------------------- /examples/src/multiGPU/simpleMultiGPU.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | /* 13 | * This application demonstrates how to use the CUDA API to use multiple GPUs. 14 | * 15 | * Note that in order to detect multiple GPUs in your system you have to disable 16 | * SLI in the nvidia control panel. Otherwise only one GPU is visible to the 17 | * application. On the other side, you can still extend your desktop to screens 18 | * attached to both GPUs. 
19 | */ 20 | 21 | #ifndef SIMPLEMULTIGPU_H 22 | #define SIMPLEMULTIGPU_H 23 | 24 | typedef struct { 25 | //Host-side input data 26 | int dataN; 27 | float *h_Data; 28 | 29 | //Partial sum for this GPU 30 | float *h_Sum; 31 | 32 | //Device buffers 33 | float *d_Data,*d_Sum; 34 | 35 | //Reduction copied back from GPU 36 | float *h_Sum_from_device; 37 | 38 | //Stream for asynchronous command execution 39 | cudaStream_t stream; 40 | 41 | } TGPUplan; 42 | 43 | extern "C" 44 | void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N, cudaStream_t s); 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /examples/src/multiGPU/simpleMultiGPU_kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | /* 13 | * This application demonstrates how to use the CUDA API to use multiple GPUs. 14 | * 15 | * Note that in order to detect multiple GPUs in your system you have to disable 16 | * SLI in the nvidia control panel. Otherwise only one GPU is visible to the 17 | * application. On the other side, you can still extend your desktop to screens 18 | * attached to both GPUs. 19 | */ 20 | 21 | #include 22 | 23 | //////////////////////////////////////////////////////////////////////////////// 24 | // Simple reduction kernel. 
25 | // Refer to the 'reduction' CUDA SDK sample describing 26 | // reduction optimization strategies 27 | //////////////////////////////////////////////////////////////////////////////// 28 | __global__ static void reduceKernel(float *d_Result, float *d_Input, int N){ 29 | const int tid = blockIdx.x * blockDim.x + threadIdx.x; 30 | const int threadN = gridDim.x * blockDim.x; 31 | float sum = 0; 32 | for(int pos = tid; pos < N; pos += threadN) 33 | sum += d_Input[pos]; 34 | 35 | d_Result[tid] = sum; 36 | } 37 | 38 | extern "C" 39 | void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N, cudaStream_t s) 40 | { 41 | reduceKernel<<>>(d_Result, d_Input, N); 42 | } 43 | 44 | -------------------------------------------------------------------------------- /examples/src/permute/permute.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Module : Prelude 3 | * Copyright : (c) 2009 Trevor L. McDonell 4 | * License : BSD 5 | */ 6 | 7 | #include "utils.h" 8 | 9 | 10 | static void 11 | permute_control 12 | ( 13 | unsigned int n, 14 | unsigned int &blocks, 15 | unsigned int &threads, 16 | unsigned int maxThreads = MAX_THREADS, 17 | unsigned int maxBlocks = MAX_BLOCKS 18 | ) 19 | { 20 | threads = min(ceilPow2(n), MAX_THREADS); 21 | blocks = (n + threads - 1) / threads; 22 | // blocks = min(blocks, maxBlocks); 23 | } 24 | 25 | 26 | /* 27 | * Permute an array according to the permutation indices. This handles both 28 | * forward and backward permutation, where: 29 | * 30 | * bpermute :: [a] -> [Int] -> [a] 31 | * bpermute v is = [ v!i | i <- is ] 32 | * 33 | * In this case, `length' specifies the number of elements in the `indices' and 34 | * `out' arrays. 35 | * 36 | * An alternative to back permute, where we do not explicitly know the offset 37 | * indices, is to use an array of [0,1] flags specifying valid elements, which 38 | * we can exclusive-sum-scan to get the offsets. 
In this case, `length' 39 | * specifies the number of elements in the `in' array. The template parameter 40 | * `backward' specifies whether the offsets were calculated with a left or right 41 | * scan. 42 | * 43 | * We return the number of valid elements found via `num_valid', but blunder on 44 | * ahead regardless of whether the `out' array is large enough or not. 45 | */ 46 | template 47 | __global__ static void 48 | permute_core 49 | ( 50 | const T *in, 51 | T *out, 52 | unsigned int *indices, 53 | unsigned int length, 54 | unsigned int *valid = NULL, 55 | unsigned int *num_valid = NULL 56 | ) 57 | { 58 | unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; 59 | 60 | /* 61 | * Return the number of valid entries found 62 | */ 63 | if (compact && threadIdx.x == 0) 64 | { 65 | if (backward) 66 | num_valid[0] = valid[0] + indices[0]; 67 | else 68 | num_valid[0] = valid[length-1] + indices[length-1]; 69 | } 70 | 71 | if (idx < length) 72 | { 73 | if (compact && valid[idx]) 74 | out[indices[idx]] = in[idx]; 75 | else if (backward) 76 | out[idx] = in[indices[idx]]; 77 | else 78 | out[indices[idx]] = in[idx]; 79 | } 80 | } 81 | 82 | 83 | template 84 | static void 85 | permute 86 | ( 87 | const T *in, 88 | T *out, 89 | unsigned int *indices, 90 | unsigned int length, 91 | unsigned int *valid = NULL, 92 | unsigned int *num_valid = NULL 93 | ) 94 | { 95 | unsigned int threads; 96 | unsigned int blocks; 97 | 98 | permute_control(length, blocks, threads); 99 | permute_core< T,backward,compact > 100 | <<>>(in, out, indices, length, valid, num_valid); 101 | } 102 | 103 | 104 | template 105 | static int 106 | compact 107 | ( 108 | const T *in, 109 | T *out, 110 | unsigned int *flags, 111 | unsigned int length 112 | ) 113 | { 114 | unsigned int N; 115 | unsigned int *num; 116 | unsigned int *indices; 117 | 118 | cudaMalloc((void**) &num, sizeof(unsigned int)); 119 | cudaMalloc((void**) &indices, length * sizeof(unsigned int)); 120 | 121 | if (backward) scanr_plusui(flags, 
indices, length); 122 | else scanl_plusui(flags, indices, length); 123 | 124 | /* 125 | * At this point, we know exactly how many elements will be required for the 126 | * `out' array. Maybe we should allocate that array here? 127 | */ 128 | permute(in, out, indices, length, flags, num); 129 | 130 | cudaMemcpy(&N, num, sizeof(unsigned int), cudaMemcpyDeviceToHost); 131 | cudaFree(num); 132 | cudaFree(indices); 133 | 134 | return N; 135 | } 136 | 137 | 138 | // ----------------------------------------------------------------------------- 139 | // Instances 140 | // ----------------------------------------------------------------------------- 141 | 142 | void permute_ui(unsigned int *in, unsigned int *out, unsigned int *indices, int length) 143 | { 144 | permute(in, out, indices, length); 145 | } 146 | 147 | 148 | void bpermute_ui(unsigned int *in, unsigned int *out, unsigned int *indices, int length) 149 | { 150 | permute(in, out, indices, length); 151 | } 152 | 153 | void bpermute_f(float *in, float *out, unsigned int *indices, int length) 154 | { 155 | permute(in, out, indices, length); 156 | } 157 | 158 | 159 | int compact_f(float *in, float *out, unsigned int *flags, int length) 160 | { 161 | int N = compact(in, out, flags, length); 162 | return N; 163 | } 164 | 165 | -------------------------------------------------------------------------------- /examples/src/replicate/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 
3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | STATIC_LIB := libreplicate.a 9 | 10 | CUFILES := replicate.cu 11 | 12 | EXTRALIBS := stdc++ 13 | 14 | # ------------------------------------------------------------------------------ 15 | # Haskell/CUDA build system 16 | # ------------------------------------------------------------------------------ 17 | include ../../common/common.mk 18 | -------------------------------------------------------------------------------- /examples/src/replicate/replicate.cu: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Replicate 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #include "utils.h" 10 | #include "replicate.h" 11 | #include "algorithms.h" 12 | 13 | #include 14 | 15 | static void 16 | replicate_control(uint32_t n, uint32_t &blocks, uint32_t &threads) 17 | { 18 | threads = min(ceilPow2(n), MAX_THREADS); 19 | blocks = (n + threads - 1) / threads; 20 | } 21 | 22 | 23 | /* 24 | * Apply a function each element of an array. A single thread is used to compute 25 | * each result. 26 | */ 27 | template 28 | __global__ static void 29 | replicate_core 30 | ( 31 | uint32_t *d_out, 32 | const uint32_t symbol, 33 | const uint32_t length 34 | ) 35 | { 36 | uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; 37 | 38 | if (lengthIsPow2 || idx < length) 39 | d_out[idx] = symbol; 40 | } 41 | 42 | 43 | /* 44 | * TODO: Generalise to 8- and 16-bit values. 
45 | */ 46 | void 47 | replicate 48 | ( 49 | void *d_out, 50 | const uint32_t symbol, 51 | const uint32_t length 52 | ) 53 | { 54 | uint32_t threads; 55 | uint32_t blocks; 56 | 57 | replicate_control(length, blocks, threads); 58 | 59 | if (isPow2(length)) replicate_core <<>>((uint32_t*) d_out, symbol, length); 60 | else replicate_core<<>>((uint32_t*) d_out, symbol, length); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /examples/src/replicate/replicate.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Replicate 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | 10 | #ifndef __REPLICATE_H__ 11 | #define __REPLICATE_H__ 12 | 13 | #define MAX_THREADS 128 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /examples/src/scan/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 
3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := scan 9 | 10 | HSMAIN := Scan.chs 11 | CUFILES := scan.cu 12 | 13 | EXTRALIBS := stdc++ 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/scan/Scan.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : Scan 5 | -- Copyright : (c) 2009 Trevor L. McDonell 6 | -- License : BSD 7 | -- 8 | -- Apply a binary operator to an array similar to 'fold', but return a 9 | -- successive list of values reduced from the left (or right). 
10 | -- 11 | -------------------------------------------------------------------------------- 12 | 13 | module Main where 14 | 15 | #include "scan.h" 16 | 17 | -- Friends 18 | import C2HS hiding (newArray) 19 | import Time 20 | import RandomVector 21 | 22 | -- System 23 | import Control.Monad 24 | import Control.Exception 25 | import qualified Foreign.CUDA as CUDA 26 | 27 | 28 | -------------------------------------------------------------------------------- 29 | -- Reference 30 | -------------------------------------------------------------------------------- 31 | 32 | scanList :: (Num e, Storable e) => Vector e -> IO (Vector e) 33 | scanList xs = do 34 | bnds <- getBounds xs 35 | xs' <- getElems xs 36 | (t,zs') <- benchmark 100 (return (scanl1 (+) xs')) (return ()) 37 | putStrLn $ "List: " ++ shows (fromInteger (timeIn millisecond t`div`100)::Float) " ms" 38 | newListArray bnds zs' 39 | 40 | 41 | scanArr :: (Num e, Storable e) => Vector e -> IO (Vector e) 42 | scanArr xs = do 43 | bnds <- getBounds xs 44 | zs <- newArray_ bnds 45 | let idx = range bnds 46 | (t,_) <- benchmark 100 (foldM_ (k zs) 0 idx) (return ()) 47 | putStrLn $ "Array: " ++ shows (fromInteger (timeIn millisecond t)/100::Float) " ms" 48 | return zs 49 | where 50 | k zs a i = do 51 | x <- readArray xs i 52 | let z = x+a 53 | writeArray zs i z 54 | return z 55 | 56 | 57 | -------------------------------------------------------------------------------- 58 | -- CUDA 59 | -------------------------------------------------------------------------------- 60 | 61 | -- 62 | -- Include the time to copy the data to/from the storable array (significantly 63 | -- faster than from a Haskell list) 64 | -- 65 | scanCUDA :: Vector Float -> IO (Vector Float) 66 | scanCUDA xs = do 67 | bnds <- getBounds xs 68 | zs <- newArray_ bnds 69 | let len = rangeSize bnds 70 | CUDA.allocaArray len $ \d_xs -> do 71 | CUDA.allocaArray len $ \d_zs -> do 72 | (t,_) <- flip (benchmark 100) CUDA.sync $ do 73 | withVector xs $ \p -> 
CUDA.pokeArray len p d_xs 74 | scanl1_plusf d_xs d_zs len 75 | withVector zs $ \p -> CUDA.peekArray len d_zs p 76 | putStrLn $ "CUDA: " ++ shows (fromInteger (timeIn millisecond t)/100::Float) " ms (with copy)" 77 | 78 | (t',_) <- benchmark 100 (scanl1_plusf d_xs d_zs len) CUDA.sync 79 | putStrLn $ "CUDA: " ++ shows (fromInteger (timeIn millisecond t')/100::Float) " ms (compute only)" 80 | 81 | return zs 82 | 83 | {# fun unsafe scanl1_plusf 84 | { withDP* `CUDA.DevicePtr Float' 85 | , withDP* `CUDA.DevicePtr Float' 86 | , `Int' 87 | } -> `()' #} 88 | where 89 | withDP p a = CUDA.withDevicePtr p $ \p' -> a (castPtr p') 90 | 91 | 92 | -------------------------------------------------------------------------------- 93 | -- Main 94 | -------------------------------------------------------------------------------- 95 | 96 | main :: IO () 97 | main = do 98 | dev <- CUDA.get 99 | props <- CUDA.props dev 100 | putStrLn $ "Using device " ++ show dev ++ ": " ++ CUDA.deviceName props 101 | 102 | arr <- randomArr (1,100000) :: IO (Vector Float) 103 | ref <- scanList arr 104 | ref' <- scanArr arr 105 | cuda <- scanCUDA arr 106 | 107 | return () 108 | 109 | putStr "== Validating: " 110 | verify ref ref' >>= \rv -> assert rv (return ()) 111 | verify ref cuda >>= \rv -> putStrLn $ if rv then "Ok!" else "INVALID!" 112 | 113 | -------------------------------------------------------------------------------- /examples/src/scan/scan.cu: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Scan 4 | * Copyright : (c) 2009 Trevor L. 
McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #include "scan.h" 10 | 11 | #include "utils.h" 12 | #include "operator.h" 13 | #include "cudpp/cudpp_globals.h" 14 | #include "cudpp/scan_kernel.cu" 15 | #include "cudpp/vector_kernel.cu" 16 | 17 | template 18 | struct scan_plan 19 | { 20 | T **block_sums; 21 | size_t num_levels; 22 | }; 23 | 24 | static inline unsigned int 25 | calc_num_blocks(unsigned int N) 26 | { 27 | return max(1u, (unsigned int)ceil((double)N / (SCAN_ELTS_PER_THREAD * CTA_SIZE))); 28 | } 29 | 30 | 31 | /* 32 | * This is the CPU-side workhorse of the scan operation, invoking the kernel on 33 | * each of the reduction blocks. 34 | */ 35 | template 36 | static void 37 | scan_recursive 38 | ( 39 | const T *in, 40 | T *out, 41 | scan_plan *plan, 42 | int N, 43 | int level 44 | ) 45 | { 46 | size_t num_blocks = calc_num_blocks(N); 47 | bool is_full = N == num_blocks * SCAN_ELTS_PER_THREAD * CTA_SIZE; 48 | 49 | dim3 grid(num_blocks, 1, 1); 50 | dim3 block(CTA_SIZE, 1, 1); 51 | size_t smem = sizeof(T) * CTA_SIZE * 2; 52 | 53 | #define MULTIBLOCK 0x01 54 | #define FULLBLOCK 0x04 55 | int traits = 0; 56 | if (num_blocks > 1) traits |= MULTIBLOCK; 57 | if (is_full) traits |= FULLBLOCK; 58 | 59 | /* 60 | * Set up execution parameters, and execute the scan 61 | */ 62 | switch (traits) 63 | { 64 | case 0: 65 | scan4 66 | < T, ScanTraits > 67 | <<>>(out, in, NULL, N, 1, 1); 68 | break; 69 | 70 | case MULTIBLOCK: 71 | scan4 72 | < T, ScanTraits > 73 | <<>>(out, in, plan->block_sums[level], N, 1, 1); 74 | break; 75 | 76 | case FULLBLOCK: 77 | scan4 78 | < T, ScanTraits > 79 | <<>>(out, in, NULL, N, 1, 1); 80 | break; 81 | 82 | case MULTIBLOCK | FULLBLOCK: 83 | scan4 84 | < T, ScanTraits > 85 | <<>>(out, in, plan->block_sums[level], N, 1, 1); 86 | break; 87 | 88 | default: 89 | assert(!"Non-exhaustive patterns in match"); 90 | } 91 | 92 | /* 93 | * After scanning the sub-blocks, we now 
need to combine those results by 94 | * taking the last value from each sub-block, and adding that to each of the 95 | * successive blocks (i.e. scan across the sub-computations) 96 | */ 97 | if (num_blocks > 1) 98 | { 99 | T *sums = plan->block_sums[level]; 100 | 101 | scan_recursive 102 | 103 | (sums, sums, plan, num_blocks, level+1); 104 | 105 | vectorAddUniform4 106 | 107 | <<>> 108 | (out, sums, N, 4, 4, 0, 0); 109 | } 110 | 111 | #undef MULTIBLOCK 112 | #undef FULLBLOCK 113 | } 114 | 115 | 116 | /* 117 | * Allocate temporary memory used by the scan. 118 | */ 119 | template 120 | static void 121 | scan_init(int N, scan_plan *plan) 122 | { 123 | size_t level = 0; 124 | size_t elements = N; 125 | size_t num_blocks; 126 | 127 | /* 128 | * Determine how many intermediate block-level summations will be required 129 | */ 130 | for (elements = N; elements > 1; elements = num_blocks) 131 | { 132 | num_blocks = calc_num_blocks(elements); 133 | 134 | if (num_blocks > 1) 135 | ++level; 136 | } 137 | 138 | plan->block_sums = (T**) malloc(level * sizeof(T*)); 139 | plan->num_levels = level; 140 | 141 | /* 142 | * Now, allocate the necessary storage at each level 143 | */ 144 | for (elements = N, level = 0; elements > 1; elements = num_blocks, level++) 145 | { 146 | num_blocks = calc_num_blocks(elements); 147 | 148 | if (num_blocks > 1) 149 | cudaMalloc((void**) &plan->block_sums[level], num_blocks * sizeof(T)); 150 | } 151 | } 152 | 153 | 154 | /* 155 | * Clean up temporary memory used by the scan 156 | */ 157 | template 158 | static void 159 | scan_finalise(scan_plan *p) 160 | { 161 | for (size_t l = 0; l < p->num_levels; ++l) 162 | cudaFree(p->block_sums[l]); 163 | 164 | free(p->block_sums); 165 | } 166 | 167 | 168 | /* 169 | * Apply a binary operator to an array similar to `fold', but return a 170 | * successive list of values reduced from the left. The reduction will take 171 | * place in parallel, so the operator must be associative. 
172 | */ 173 | template 174 | void 175 | scan 176 | ( 177 | const T *in, 178 | T *out, 179 | int length 180 | ) 181 | { 182 | scan_plan plan; 183 | scan_init(length, &plan); 184 | 185 | scan_recursive(in, out, &plan, length, 0); 186 | 187 | scan_finalise(&plan); 188 | } 189 | 190 | 191 | // ----------------------------------------------------------------------------- 192 | // Instances 193 | // ----------------------------------------------------------------------------- 194 | 195 | void scanl_plusf(float *in, float *out, int N) 196 | { 197 | scan< Plus, float, false, true >(in, out, N); 198 | } 199 | 200 | void scanl1_plusf(float *in, float *out, int N) 201 | { 202 | scan< Plus, float, false, false >(in, out, N); 203 | } 204 | 205 | -------------------------------------------------------------------------------- /examples/src/scan/scan.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Scan 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #ifndef __SCAN_H__ 10 | #define __SCAN_H__ 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | /* 17 | * Instances 18 | */ 19 | void scanl_plusf(float *in, float *out, int N); 20 | void scanl1_plusf(float *in, float *out, int N); 21 | 22 | 23 | #ifdef __cplusplus 24 | } 25 | #endif 26 | #endif 27 | -------------------------------------------------------------------------------- /examples/src/simpleTexture/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 
3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := simpleTexture 9 | 10 | HSMAIN := SimpleTexture.hs 11 | CUFILES := simpleTexture.cu 12 | 13 | EXTRALIBS := stdc++ 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/simpleTexture/SimpleTexture.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ParallelListComp #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : SimpleTexture 5 | -- Copyright : (c) 2009 Trevor L. McDonell 6 | -- License : BSD 7 | -- 8 | -- Demonstrates texture references. Takes an input PGM file and generates an 9 | -- output that has been rotated. The kernel performs a simple 2D transformation 10 | -- on the texture coordinates (u,v). 11 | -- 12 | -------------------------------------------------------------------------------- 13 | 14 | module Main where 15 | 16 | import Data.Array.Unboxed 17 | import Graphics.Pgm 18 | import qualified Foreign.CUDA as CUDA 19 | import qualified Foreign.CUDA.Runtime.Texture as CUDA 20 | 21 | 22 | imageFile, outputFile, referenceFile :: FilePath 23 | imageFile = "data/lena_bw.pgm" 24 | outputFile = "data/lena_bw_out.pgm" 25 | referenceFile = "data/ref_rotated.pgm" 26 | 27 | rotation :: Float 28 | rotation = 0.5 -- radians 29 | 30 | 31 | main :: IO () 32 | main = do 33 | img <- either (error . show) head `fmap` pgmsFromFile imageFile 34 | ref <- either (error . 
show) head `fmap` pgmsFromFile referenceFile 35 | runTest img ref 36 | 37 | 38 | runTest :: UArray (Int,Int) Int -> UArray (Int,Int) Int -> IO () 39 | runTest img ref' = do 40 | rotated <- transform (amap (\x-> fromIntegral x/255) img) 41 | arrayToFile outputFile (amap (\x -> round (x*255)) rotated) 42 | if verify (elems ref) (elems rotated) 43 | then putStrLn "PASSED" 44 | else putStrLn "FAILED!" 45 | 46 | where 47 | ref = amap (\x -> fromIntegral x / 255) ref' 48 | verify xs ys = all (< epsilon) [abs ((x-y)/(x+y+epsilon)) | x <- xs | y <- ys] 49 | epsilon = 0.05 50 | 51 | 52 | transform :: UArray (Int,Int) Float -> IO (UArray (Int,Int) Float) 53 | transform pgm = 54 | let bnds@((x0,y0),(x1,y1)) = bounds pgm 55 | width = x1-x0+1 56 | height = y1-y0+1 57 | desc = CUDA.FormatDesc (32,0,0,0) CUDA.Float 58 | texture = CUDA.Texture True CUDA.Linear (CUDA.Wrap,CUDA.Wrap,CUDA.Wrap) desc 59 | in 60 | CUDA.allocaArray (width * height) $ \d_out -> 61 | CUDA.withListArray (elems pgm) $ \d_tex -> do 62 | CUDA.bind2D "tex" texture d_tex (width,height) (fromIntegral $ height * 4) 63 | CUDA.setConfig (width `div` 8,height `div` 8) (8,8,1) 0 Nothing 64 | CUDA.setParams [CUDA.VArg d_out, CUDA.IArg width, CUDA.IArg height, CUDA.FArg rotation] 65 | CUDA.launch "transformKernel" 66 | 67 | listArray bnds `fmap` CUDA.peekListArray (width*height) d_out 68 | 69 | -------------------------------------------------------------------------------- /examples/src/simpleTexture/data/lena_bw.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmcdonell/cuda/5bc08e3101946d1c9f7a35d5d57426b59b840479/examples/src/simpleTexture/data/lena_bw.pgm -------------------------------------------------------------------------------- /examples/src/simpleTexture/data/ref_rotated.pgm: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tmcdonell/cuda/5bc08e3101946d1c9f7a35d5d57426b59b840479/examples/src/simpleTexture/data/ref_rotated.pgm -------------------------------------------------------------------------------- /examples/src/simpleTexture/simpleTexture.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property and 5 | * proprietary rights in and to this software and related documentation. 6 | * Any use, reproduction, disclosure, or distribution of this software 7 | * and related documentation without an express license agreement from 8 | * NVIDIA Corporation is strictly prohibited. 9 | * 10 | * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 | * associated with this source code for terms and conditions that govern 12 | * your use of this NVIDIA software. 13 | * 14 | */ 15 | 16 | #ifndef _SIMPLETEXTURE_KERNEL_H_ 17 | #define _SIMPLETEXTURE_KERNEL_H_ 18 | 19 | // declare texture reference for 2D float texture 20 | texture tex; 21 | 22 | //////////////////////////////////////////////////////////////////////////////// 23 | //! Transform an image using texture lookups 24 | //! 
@param g_odata output data in global memory 25 | //////////////////////////////////////////////////////////////////////////////// 26 | extern "C" 27 | __global__ void 28 | transformKernel( float* g_odata, int width, int height, float theta) 29 | { 30 | // calculate normalized texture coordinates 31 | unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; 32 | unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; 33 | 34 | float u = x / (float) width; 35 | float v = y / (float) height; 36 | 37 | // transform coordinates 38 | u -= 0.5f; 39 | v -= 0.5f; 40 | float tu = u*cosf(theta) - v*sinf(theta) + 0.5f; 41 | float tv = v*cosf(theta) + u*sinf(theta) + 0.5f; 42 | 43 | // read from texture and write to global memory 44 | g_odata[y*width + x] = tex2D(tex, tu, tv); 45 | } 46 | 47 | #endif // #ifndef _SIMPLETEXTURE_KERNEL_H_ 48 | -------------------------------------------------------------------------------- /examples/src/smvm/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := smvm 9 | 10 | HSMAIN := SMVM.chs 11 | CUFILES := smvm-csr.cu \ 12 | smvm-cudpp.cu 13 | 14 | USECUDPP := 1 15 | EXTRALIBS := stdc++ 16 | 17 | # ------------------------------------------------------------------------------ 18 | # Haskell/CUDA build system 19 | # ------------------------------------------------------------------------------ 20 | include ../../common/common.mk 21 | -------------------------------------------------------------------------------- /examples/src/smvm/smvm-cudpp.cu: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : SMVM 4 | * Copyright : (c) 2009 Trevor L. 
McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | 10 | #include "smvm.h" 11 | #include 12 | 13 | template CUDPPDatatype getType(); 14 | template <> CUDPPDatatype getType() { return CUDPP_FLOAT; } 15 | template <> CUDPPDatatype getType() { return CUDPP_UINT; } 16 | 17 | 18 | /* 19 | * Sparse matrix-dense vector multiply. Hook directly into the CUDPP 20 | * implementation. 21 | */ 22 | template 23 | void smvm_cudpp 24 | ( 25 | float *d_y, 26 | const float *d_x, 27 | const float *h_data, 28 | const unsigned int *h_rowPtr, 29 | const unsigned int *h_colIdx, 30 | const unsigned int num_rows, 31 | const unsigned int num_nonzeros 32 | ) 33 | { 34 | CUDPPConfiguration cp; 35 | CUDPPHandle sm; 36 | 37 | cp.datatype = getType(); 38 | cp.options = 0; 39 | cp.algorithm = CUDPP_SPMVMULT; 40 | 41 | cudppSparseMatrix(&sm, cp, num_nonzeros, num_rows, h_data, h_rowPtr, h_colIdx); 42 | cudppSparseMatrixVectorMultiply(sm, d_y, d_x); 43 | 44 | cudppDestroySparseMatrix(sm); 45 | } 46 | 47 | 48 | // ----------------------------------------------------------------------------- 49 | // Instances 50 | // ----------------------------------------------------------------------------- 51 | 52 | void smvm_cudpp_f(float *d_y, float *d_x, float *h_data, unsigned int *h_rowPtr, unsigned int *h_colIdx, unsigned int num_rows, unsigned int num_nonzeros) 53 | { 54 | smvm_cudpp(d_y, d_x, h_data, h_rowPtr, h_colIdx, num_rows, num_nonzeros); 55 | } 56 | 57 | -------------------------------------------------------------------------------- /examples/src/smvm/smvm.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : SMVM 4 | * Copyright : (c) 2009 Trevor L. 
McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #ifndef __SMVM_H__ 10 | #define __SMVM_H__ 11 | 12 | /* 13 | * Optimised for Tesla C1060 (compute 1.3) 14 | * Maximum performance for your card may be achieved with different values. 15 | * 16 | * http://developer.download.nvidia.com/compute/cuda/CUDA_Occupancy_calculator.xls 17 | */ 18 | #define MAX_THREADS 128 19 | #define MAX_BLOCKS_PER_SM 8 20 | #define MAX_BLOCKS (MAX_BLOCKS_PER_SM * 30) 21 | #define WARP_SIZE 32 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | void smvm_csr_f(float *d_y, float *d_x, float *d_data, unsigned int *d_rowPtr, unsigned int *d_colIdx, unsigned int num_rows); 28 | void smvm_cudpp_f(float *d_y, float *d_x, float *h_data, unsigned int *h_rowPtr, unsigned int *h_colIdx, unsigned int num_rows, unsigned int num_nonzeros); 29 | 30 | #ifdef __cplusplus 31 | } 32 | #endif 33 | #endif 34 | -------------------------------------------------------------------------------- /examples/src/smvm/texture.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2009 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | 18 | #ifndef __TEXTURE_H__ 19 | #define __TEXTURE_H__ 20 | 21 | #include "utils.h" 22 | #include 23 | 24 | /* 25 | * These textures are (optionally) used to cache the 'x' vector in y += A*x 26 | * Use int2 to pull doubles through texture cache. 27 | */ 28 | texture tex_x_float; 29 | texture tex_x_double; 30 | 31 | inline void 32 | bind_x(const float * x) 33 | { 34 | size_t offset = size_t(-1); 35 | 36 | CUDA_SAFE_CALL(cudaBindTexture(&offset, tex_x_float, x)); 37 | if (offset != 0) 38 | assert(!"memory is not aligned, refusing to use texture cache"); 39 | } 40 | 41 | inline void 42 | bind_x(const double * x) 43 | { 44 | size_t offset = size_t(-1); 45 | 46 | CUDA_SAFE_CALL(cudaBindTexture(&offset, tex_x_double, x)); 47 | if (offset != 0) 48 | assert(!"memory is not aligned, refusing to use texture cache"); 49 | } 50 | 51 | /* 52 | * NOTE: the parameter is unused only to distinguish the two unbind functions 53 | */ 54 | inline void 55 | unbind_x(const float *) 56 | { 57 | CUDA_SAFE_CALL(cudaUnbindTexture(tex_x_float)); 58 | } 59 | 60 | inline void 61 | unbind_x(const double *) 62 | { 63 | CUDA_SAFE_CALL(cudaUnbindTexture(tex_x_double)); 64 | } 65 | 66 | template 67 | __inline__ __device__ float 68 | fetch_x(const int& i, const float * x) 69 | { 70 | if (UseCache) return tex1Dfetch(tex_x_float, i); 71 | else return x[i]; 72 | } 73 | 74 | #ifndef CUDA_NO_SM_13_DOUBLE_INTRINSICS 75 | template 76 | __inline__ __device__ double fetch_x(const int& i, const double * x) 77 | { 78 | #if __CUDA_ARCH__ < 130 79 | #error "double precision require Compute Compatibility 1.3 or greater" 80 | #endif 81 | if (UseCache) 82 | { 83 | int2 v = tex1Dfetch(tex_x_double, i); 84 | return __hiloint2double(v.y, v.x); 85 | } 86 | else 87 | { 88 | return x[i]; 89 | } 90 | } 91 | #endif 92 | 93 | #endif // __TEXTURE_H__ 94 | 95 | -------------------------------------------------------------------------------- /examples/src/sort/Makefile: 
-------------------------------------------------------------------------------- 1 | # 2 | # Baking! 3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := sort 9 | 10 | HSMAIN := Sort.chs 11 | CUFILES := radix_sort.cu 12 | 13 | USECUDPP := 1 14 | EXTRALIBS := stdc++ 15 | 16 | # ------------------------------------------------------------------------------ 17 | # Haskell/CUDA build system 18 | # ------------------------------------------------------------------------------ 19 | include ../../common/common.mk 20 | -------------------------------------------------------------------------------- /examples/src/sort/Sort.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | -------------------------------------------------------------------------------- 3 | -- 4 | -- Module : Sort 5 | -- Copyright : (c) 2009 Trevor L. 
McDonell 6 | -- License : BSD 7 | -- 8 | -- Reduce a vector of (key,value) pairs 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module Main where 13 | 14 | #include "sort.h" 15 | 16 | import C2HS 17 | import RandomVector 18 | 19 | import Data.Ord 20 | import Data.List 21 | import Control.Monad 22 | import qualified Foreign.CUDA as C 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- 27 | -- CUDA 28 | 29 | test_f :: (Storable a, Eq a) => [(Float,a)] -> IO Bool 30 | test_f kv = 31 | let l = length kv 32 | (k,v) = unzip kv 33 | in 34 | C.withListArray k $ \d_k -> 35 | C.withListArray v $ \d_v -> do 36 | 37 | sort_f d_k d_v (length kv) 38 | res <- liftM2 zip (C.peekListArray l d_k) (C.peekListArray l d_v) 39 | return (res == sortBy (comparing fst) kv) 40 | 41 | 42 | test_i :: (Storable a, Eq a) => [(Int,a)] -> IO Bool 43 | test_i kv = 44 | let l = length kv 45 | (k,v) = unzip kv 46 | in 47 | C.withListArray k $ \d_k -> 48 | C.withListArray v $ \d_v -> do 49 | 50 | sort_ui d_k d_v (length kv) 51 | res <- liftM2 zip (C.peekListArray l d_k) (C.peekListArray l d_v) 52 | return (res == sortBy (comparing fst) kv) 53 | 54 | 55 | {# fun unsafe sort_f 56 | { withDP* `C.DevicePtr Float' 57 | , withDP* `C.DevicePtr a' 58 | , `Int' 59 | } -> `()' #} 60 | where 61 | withDP p a = C.withDevicePtr p $ \p' -> a (castPtr p') 62 | 63 | {# fun unsafe sort_ui 64 | { withDP* `C.DevicePtr Int' 65 | , withDP* `C.DevicePtr a' 66 | , `Int' 67 | } -> `()' #} 68 | where 69 | withDP p a = C.withDevicePtr p $ \p' -> a (castPtr p') 70 | 71 | -- 72 | -- I don't need to learn template haskell or quick check... nah, not at all... 73 | -- 74 | main :: IO () 75 | main = do 76 | f <- randomList 10000 77 | i <- randomListR 10000 (0,1000) 78 | 79 | putStr "Test (float,int): " 80 | test_f (zip f i) >>= \r -> case r of 81 | True -> putStrLn "Ok!" 82 | _ -> putStrLn "INVALID!" 
83 | 84 | putStr "Test (float,float): " 85 | test_f (zip f f) >>= \r -> case r of 86 | True -> putStrLn "Ok!" 87 | _ -> putStrLn "INVALID!" 88 | 89 | putStr "Test (int,int): " 90 | test_i (zip i i) >>= \r -> case r of 91 | True -> putStrLn "Ok!" 92 | _ -> putStrLn "INVALID!" 93 | 94 | putStr "Test (int,float): " 95 | test_i (zip i f) >>= \r -> case r of 96 | True -> putStrLn "Ok!" 97 | _ -> putStrLn "INVALID!" 98 | 99 | -------------------------------------------------------------------------------- /examples/src/sort/radix_sort.cu: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Sort 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | *----------------------------------------------------------------------------*/ 8 | 9 | #include 10 | 11 | #include "sort.h" 12 | 13 | 14 | template CUDPPDatatype static getType(); 15 | template <> CUDPPDatatype getType() { return CUDPP_FLOAT; } 16 | template <> CUDPPDatatype getType() { return CUDPP_UINT; } 17 | 18 | 19 | /* 20 | * In-place radix sort of values or key-value pairs. Values can be any 32-bit 21 | * type, as their payload is never inspected or manipulated. 22 | */ 23 | template 24 | static void 25 | radix_sort 26 | ( 27 | unsigned int length, 28 | T *d_keys, 29 | void *d_vals = NULL, 30 | int bits = 8 * sizeof(T) 31 | ) 32 | { 33 | CUDPPHandle plan; 34 | CUDPPConfiguration cp; 35 | 36 | cp.datatype = getType(); 37 | cp.algorithm = CUDPP_SORT_RADIX; 38 | cp.options = (d_vals != NULL) ? 
CUDPP_OPTION_KEY_VALUE_PAIRS 39 | : CUDPP_OPTION_KEYS_ONLY; 40 | 41 | cudppPlan(&plan, cp, length, 1, 0); 42 | cudppSort(plan, d_keys, d_vals, bits, length); 43 | 44 | cudppDestroyPlan(plan); 45 | } 46 | 47 | 48 | /* ----------------------------------------------------------------------------- 49 | * Instances 50 | * ---------------------------------------------------------------------------*/ 51 | 52 | void sort_f(float *d_keys, void *d_vals, unsigned int length) 53 | { 54 | radix_sort(length, d_keys, d_vals); 55 | } 56 | 57 | void sort_ui(unsigned int *d_keys, void *d_vals, unsigned int length) 58 | { 59 | radix_sort(length, d_keys, d_vals); 60 | } 61 | 62 | -------------------------------------------------------------------------------- /examples/src/sort/sort.h: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------------- 2 | * 3 | * Module : Sort 4 | * Copyright : (c) 2009 Trevor L. McDonell 5 | * License : BSD 6 | * 7 | * ---------------------------------------------------------------------------*/ 8 | 9 | #ifndef __SORT_PRIV_H__ 10 | #define __SORT_PRIV_H__ 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | void sort_f(float *d_keys, void *d_vals, unsigned int length); 17 | void sort_ui(unsigned int *d_keys, void *d_vals, unsigned int length); 18 | 19 | #ifdef __cplusplus 20 | } 21 | #endif 22 | #endif 23 | 24 | -------------------------------------------------------------------------------- /examples/src/vectorAddDrv/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Baking! 
3 | # 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Input files 7 | # ------------------------------------------------------------------------------ 8 | EXECUTABLE := vectorAddDrv 9 | 10 | HSMAIN := VectorAdd.hs 11 | PTXFILES := vector_add.cu 12 | 13 | USEDRVAPI := 1 14 | 15 | # ------------------------------------------------------------------------------ 16 | # Haskell/CUDA build system 17 | # ------------------------------------------------------------------------------ 18 | include ../../common/common.mk 19 | -------------------------------------------------------------------------------- /examples/src/vectorAddDrv/VectorAdd.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- 3 | -- Module : VectorAdd 4 | -- Copyright : (c) 2009 Trevor L. McDonell 5 | -- License : BSD 6 | -- 7 | -- Element-wise addition of two vectors 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | module Main where 12 | 13 | -- Friends 14 | import RandomVector 15 | 16 | -- System 17 | import Numeric 18 | import Control.Monad 19 | import Control.Exception 20 | import Data.Array.Storable 21 | import qualified Data.ByteString.Char8 as B 22 | import qualified Foreign.CUDA.Driver as CUDA 23 | 24 | 25 | -------------------------------------------------------------------------------- 26 | -- Reference implementation 27 | -------------------------------------------------------------------------------- 28 | 29 | testRef :: (Num e, Storable e) => Vector e -> Vector e -> IO (Vector e) 30 | testRef xs ys = do 31 | (i,j) <- getBounds xs 32 | res <- newArray_ (i,j) 33 | forM_ [i..j] (add res) 34 | return res 35 | where 36 | add res idx = do 37 | a <- readArray xs idx 38 | b <- readArray ys idx 39 | writeArray res idx (a+b) 40 | 41 | 
-------------------------------------------------------------------------------- 42 | -- CUDA 43 | -------------------------------------------------------------------------------- 44 | 45 | -- 46 | -- Initialise the device and context. Load the PTX source code, and return a 47 | -- reference to the kernel function. 48 | -- 49 | initCUDA :: IO (CUDA.Context, CUDA.Fun) 50 | initCUDA = do 51 | CUDA.initialise [] 52 | dev <- CUDA.device 0 53 | ctx <- CUDA.create dev [] 54 | ptx <- B.readFile "data/vector_add.ptx" 55 | (mdl,r) <- CUDA.loadDataEx ptx [CUDA.MaxRegisters 32] 56 | fun <- CUDA.getFun mdl "VecAdd" 57 | 58 | putStrLn $ ">> PTX JIT compilation (" ++ showFFloat (Just 2) (CUDA.jitTime r) " ms)" 59 | B.putStrLn (CUDA.jitInfoLog r) 60 | return (ctx,fun) 61 | 62 | 63 | -- 64 | -- Run the test 65 | -- 66 | testCUDA :: (Num e, Storable e) => Vector e -> Vector e -> IO (Vector e) 67 | testCUDA xs ys = do 68 | (m,n) <- getBounds xs 69 | let len = (n-m+1) 70 | 71 | -- Initialise environment and copy over test data 72 | -- 73 | putStrLn ">> Initialising" 74 | bracket initCUDA (\(ctx,_) -> CUDA.destroy ctx) $ \(_,addVec) -> do 75 | 76 | -- Allocate some device memory. This will be freed once the computation 77 | -- terminates, either normally or by exception. 78 | -- 79 | putStrLn ">> Executing" 80 | CUDA.allocaArray len $ \dx -> do 81 | CUDA.allocaArray len $ \dy -> do 82 | CUDA.allocaArray len $ \dz -> do 83 | 84 | -- Copy over the data 85 | -- 86 | withVector xs $ \p -> CUDA.pokeArray len p dx 87 | withVector ys $ \p -> CUDA.pokeArray len p dy 88 | 89 | -- Setup and execute the kernel (repeat test many times...) 
90 | -- 91 | CUDA.setParams addVec [CUDA.VArg dx, CUDA.VArg dy, CUDA.VArg dz, CUDA.IArg len] 92 | CUDA.setBlockShape addVec (128,1,1) 93 | CUDA.launch addVec ((len+128-1) `div` 128, 1) Nothing 94 | CUDA.sync 95 | 96 | -- Copy back result 97 | -- 98 | zs <- newArray_ (m,n) 99 | withVector zs $ \p -> CUDA.peekArray len dz p 100 | return zs 101 | 102 | 103 | -------------------------------------------------------------------------------- 104 | -- Test & Verify 105 | -------------------------------------------------------------------------------- 106 | 107 | main :: IO () 108 | main = do 109 | putStrLn "== Generating random vectors" 110 | xs <- randomArr (1,10000) :: IO (Vector Float) 111 | ys <- randomArr (1,10000) :: IO (Vector Float) 112 | 113 | putStrLn "== Generating reference solution" 114 | ref <- testRef xs ys 115 | 116 | putStrLn "== Testing CUDA" 117 | arr <- testCUDA xs ys 118 | 119 | putStr "== Validating: " 120 | verify ref arr >>= \rv -> putStrLn $ if rv then "Ok!" else "INVALID!" 121 | 122 | -------------------------------------------------------------------------------- /examples/src/vectorAddDrv/vector_add.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Name : VectorAdd 3 | * Copyright : (c) 2009 Trevor L. McDonell 4 | * License : BSD 5 | * 6 | * Element-wise addition of two (floating-point) vectors 7 | */ 8 | 9 | 10 | extern "C" 11 | __global__ void VecAdd(const float *xs, const float *ys, float *out, const unsigned int N) 12 | { 13 | unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; 14 | 15 | if (idx < N) 16 | out[idx] = xs[idx] + ys[idx]; 17 | } 18 | 19 | -------------------------------------------------------------------------------- /examples/src/zipWith/zipWith.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Module : Prelude 3 | * Copyright : (c) 2009 Trevor L. 
McDonell 4 | * License : BSD 5 | */ 6 | 7 | #include "utils.h" 8 | #include "kernels.h" 9 | #include "operator.h" 10 | #include "cudpp/cudpp_globals.h" 11 | 12 | 13 | /* 14 | * Combine two arrays using the given binary operator function. A single thread 15 | * is used to compute each result pair. 16 | */ 17 | template 18 | __global__ static void 19 | zipWith_core 20 | ( 21 | Ta *xs, 22 | Tb *ys, 23 | Tc *out, 24 | int length 25 | ) 26 | { 27 | unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; 28 | 29 | if (lengthIsPow2 || idx < length) 30 | out[idx] = op::apply(xs[idx], ys[idx]); 31 | } 32 | 33 | 34 | template 35 | void 36 | zipWith 37 | ( 38 | Ta *xs, 39 | Tb *ys, 40 | Tc *zs, 41 | int length 42 | ) 43 | { 44 | unsigned int threads = min(ceilPow2(length), CTA_SIZE); 45 | unsigned int blocks = (length + threads - 1) / threads; 46 | 47 | if (isPow2(length)) 48 | zipWith_core< op,true ><<>>(xs, ys, zs, length); 49 | else 50 | zipWith_core< op,false ><<>>(xs, ys, zs, length); 51 | } 52 | 53 | 54 | // ----------------------------------------------------------------------------- 55 | // Instances 56 | // ----------------------------------------------------------------------------- 57 | 58 | #if 0 59 | void zipWith_plusf(float *xs, float *ys, float *zs, int N) 60 | { 61 | zipWith< Plus >(xs, ys, zs, N); 62 | } 63 | 64 | void zipWith_timesf(float *xs, float *ys, float *zs, int N) 65 | { 66 | zipWith< Times >(xs, ys, zs, N); 67 | } 68 | #endif 69 | 70 | void zipWith_timesif(int *xs, float *ys, float *zs, int N) 71 | { 72 | zipWith< Times >(xs, ys, zs, N); 73 | } 74 | 75 | -------------------------------------------------------------------------------- /src/Foreign/C/Extra.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | {-# LANGUAGE ForeignFunctionInterface #-} 3 | -------------------------------------------------------------------------------- 4 | -- | 5 | -- Module : Foreign.C.Extra 6 | -- Copyright 
: [2018..2023] Trevor L. McDonell 7 | -- License : BSD 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | module Foreign.C.Extra ( 12 | 13 | c_strnlen, 14 | 15 | ) where 16 | 17 | import Foreign.C 18 | 19 | 20 | #if defined(WIN32) 21 | {-# INLINE c_strnlen' #-} 22 | c_strnlen' :: CString -> CSize -> IO CSize 23 | c_strnlen' str size = do 24 | str' <- peekCStringLen (str, fromIntegral size) 25 | return $ stringLen 0 str' 26 | where 27 | stringLen acc [] = acc 28 | stringLen acc ('\0':_) = acc 29 | stringLen acc (_:xs) = stringLen (acc+1) xs 30 | #else 31 | foreign import ccall unsafe "string.h strnlen" c_strnlen' 32 | :: CString -> CSize -> IO CSize 33 | #endif 34 | 35 | {-# INLINE c_strnlen #-} 36 | c_strnlen :: CString -> Int -> IO Int 37 | c_strnlen str maxlen = fromIntegral `fmap` c_strnlen' str (fromIntegral maxlen) 38 | 39 | -------------------------------------------------------------------------------- /src/Foreign/CUDA.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- | 3 | -- Module : Foreign.CUDA 4 | -- Copyright : [2009..2023] Trevor L. McDonell 5 | -- License : BSD 6 | -- 7 | -- Top level bindings. By default, expose the C-for-CUDA runtime API bindings, 8 | -- as they are slightly more user friendly. 
9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module Foreign.CUDA ( 13 | 14 | module Foreign.CUDA.Runtime 15 | 16 | ) where 17 | 18 | import Foreign.CUDA.Runtime 19 | 20 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Analysis.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- | 3 | -- Module : Foreign.CUDA.Analysis 4 | -- Copyright : [2009..2023] Trevor L. McDonell 5 | -- License : BSD 6 | -- 7 | -- Meta-module exporting CUDA analysis routines 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | module Foreign.CUDA.Analysis ( 12 | 13 | module Foreign.CUDA.Analysis.Device, 14 | module Foreign.CUDA.Analysis.Occupancy 15 | 16 | ) where 17 | 18 | import Foreign.CUDA.Analysis.Device 19 | import Foreign.CUDA.Analysis.Occupancy 20 | 21 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Context.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- | 3 | -- Module : Foreign.CUDA.Driver.Context 4 | -- Copyright : [2009..2023] Trevor L. 
McDonell 5 | -- License : BSD 6 | -- 7 | -- Context management for low-level driver interface 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | 12 | module Foreign.CUDA.Driver.Context ( 13 | 14 | module Foreign.CUDA.Driver.Context.Base, 15 | module Foreign.CUDA.Driver.Context.Config, 16 | module Foreign.CUDA.Driver.Context.Peer, 17 | 18 | ) where 19 | 20 | import Foreign.CUDA.Driver.Context.Base 21 | import Foreign.CUDA.Driver.Context.Config 22 | import Foreign.CUDA.Driver.Context.Peer 23 | 24 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Context/Peer.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE EmptyDataDecls #-} 4 | {-# LANGUAGE ForeignFunctionInterface #-} 5 | {-# LANGUAGE TemplateHaskell #-} 6 | #ifdef USE_EMPTY_CASE 7 | {-# LANGUAGE EmptyCase #-} 8 | #endif 9 | -------------------------------------------------------------------------------- 10 | -- | 11 | -- Module : Foreign.CUDA.Driver.Context.Peer 12 | -- Copyright : [2009..2023] Trevor L. McDonell 13 | -- License : BSD 14 | -- 15 | -- Direct peer context access functions for the low-level driver interface. 16 | -- 17 | -- Since: CUDA-4.0 18 | -- 19 | -------------------------------------------------------------------------------- 20 | 21 | module Foreign.CUDA.Driver.Context.Peer ( 22 | 23 | -- * Peer Access 24 | PeerFlag, PeerAttribute(..), 25 | accessible, add, remove, getAttribute, 26 | 27 | ) where 28 | 29 | #include "cbits/stubs.h" 30 | {# context lib="cuda" #} 31 | 32 | -- Friends 33 | import Foreign.CUDA.Driver.Context.Base ( Context(..) ) 34 | import Foreign.CUDA.Driver.Device ( Device(..) 
) 35 | import Foreign.CUDA.Driver.Error 36 | import Foreign.CUDA.Internal.C2HS 37 | 38 | -- System 39 | import Foreign 40 | import Foreign.C 41 | 42 | 43 | -------------------------------------------------------------------------------- 44 | -- Data Types 45 | -------------------------------------------------------------------------------- 46 | 47 | -- | 48 | -- Possible option values for direct peer memory access 49 | -- 50 | data PeerFlag 51 | instance Enum PeerFlag where 52 | #ifdef USE_EMPTY_CASE 53 | toEnum x = error ("PeerFlag.toEnum: Cannot match " ++ show x) 54 | fromEnum x = case x of {} 55 | #endif 56 | 57 | 58 | -- | Peer-to-peer attributes 59 | -- 60 | #if CUDA_VERSION < 8000 61 | data PeerAttribute 62 | instance Enum PeerAttribute where 63 | #ifdef USE_EMPTY_CASE 64 | toEnum x = error ("PeerAttribute.toEnum: Cannot match " ++ show x) 65 | fromEnum x = case x of {} 66 | #endif 67 | #else 68 | {# enum CUdevice_P2PAttribute as PeerAttribute 69 | { underscoreToCase } 70 | with prefix="CU_DEVICE_P2P_ATTRIBUTE" deriving (Eq, Show) #} 71 | #endif 72 | 73 | 74 | -------------------------------------------------------------------------------- 75 | -- Peer access 76 | -------------------------------------------------------------------------------- 77 | 78 | -- | 79 | -- Queries if the first device can directly access the memory of the second. If 80 | -- direct access is possible, it can then be enabled with 'add'. 81 | -- 82 | -- Requires CUDA-4.0. 
83 | -- 84 | -- 85 | -- 86 | {-# INLINEABLE accessible #-} 87 | accessible :: Device -> Device -> IO Bool 88 | #if CUDA_VERSION < 4000 89 | accessible _ _ = requireSDK 'accessible 4.0 90 | #else 91 | accessible !dev !peer = resultIfOk =<< cuDeviceCanAccessPeer dev peer 92 | 93 | {-# INLINE cuDeviceCanAccessPeer #-} 94 | {# fun unsafe cuDeviceCanAccessPeer 95 | { alloca- `Bool' peekBool* 96 | , useDevice `Device' 97 | , useDevice `Device' } -> `Status' cToEnum #} 98 | #endif 99 | 100 | 101 | -- | 102 | -- If the devices of both the current and supplied contexts support unified 103 | -- addressing, then enable allocations in the supplied context to be accessible 104 | -- by the current context. 105 | -- 106 | -- Note that access is unidirectional, and in order to access memory in the 107 | -- current context from the peer context, a separate symmetric call to 108 | -- 'add' is required. 109 | -- 110 | -- Requires CUDA-4.0. 111 | -- 112 | -- 113 | -- 114 | {-# INLINEABLE add #-} 115 | add :: Context -> [PeerFlag] -> IO () 116 | #if CUDA_VERSION < 4000 117 | add _ _ = requireSDK 'add 4.0 118 | #else 119 | add !ctx !flags = nothingIfOk =<< cuCtxEnablePeerAccess ctx flags 120 | 121 | {-# INLINE cuCtxEnablePeerAccess #-} 122 | {# fun unsafe cuCtxEnablePeerAccess 123 | { useContext `Context' 124 | , combineBitMasks `[PeerFlag]' } -> `Status' cToEnum #} 125 | #endif 126 | 127 | 128 | -- | 129 | -- Disable direct memory access from the current context to the supplied 130 | -- peer context, and unregisters any registered allocations. 131 | -- 132 | -- Requires CUDA-4.0. 
133 | -- 134 | -- 135 | -- 136 | {-# INLINEABLE remove #-} 137 | remove :: Context -> IO () 138 | #if CUDA_VERSION < 4000 139 | remove _ = requireSDK 'remove 4.0 140 | #else 141 | remove !ctx = nothingIfOk =<< cuCtxDisablePeerAccess ctx 142 | 143 | {-# INLINE cuCtxDisablePeerAccess #-} 144 | {# fun unsafe cuCtxDisablePeerAccess 145 | { useContext `Context' } -> `Status' cToEnum #} 146 | #endif 147 | 148 | 149 | -- | 150 | -- Queries attributes of the link between two devices 151 | -- 152 | -- 153 | -- 154 | -- Requires CUDA-8.0 155 | -- 156 | -- @since 0.9.0.0@ 157 | -- 158 | {-# INLINEABLE getAttribute #-} 159 | getAttribute :: PeerAttribute -> Device -> Device -> IO Int 160 | #if CUDA_VERSION < 8000 161 | getAttribute _ _ _ = requireSDK 'getAttribute 8.0 162 | #else 163 | getAttribute attrib src dst = resultIfOk =<< cuDeviceGetP2PAttribute attrib src dst 164 | 165 | {-# INLINE cuDeviceGetP2PAttribute #-} 166 | {# fun unsafe cuDeviceGetP2PAttribute 167 | { alloca- `Int' peekIntConv* 168 | , cFromEnum `PeerAttribute' 169 | , useDevice `Device' 170 | , useDevice `Device' 171 | } -> `Status' cToEnum #} 172 | #endif 173 | 174 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Context/Primary.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE ForeignFunctionInterface #-} 4 | {-# LANGUAGE TemplateHaskell #-} 5 | -------------------------------------------------------------------------------- 6 | -- | 7 | -- Module : Foreign.CUDA.Driver.Context.Primary 8 | -- Copyright : [2009..2023] Trevor L. McDonell 9 | -- License : BSD 10 | -- 11 | -- Primary context management for low-level driver interface. The primary 12 | -- context is unique per device and shared with the Runtime API. This 13 | -- allows integration with other libraries using CUDA.
14 | -- 15 | -- Since: CUDA-7.0 16 | -- 17 | -------------------------------------------------------------------------------- 18 | 19 | module Foreign.CUDA.Driver.Context.Primary ( 20 | 21 | status, setup, reset, retain, release, 22 | 23 | ) where 24 | 25 | #include "cbits/stubs.h" 26 | {# context lib="cuda" #} 27 | 28 | -- Friends 29 | import Foreign.CUDA.Driver.Context.Base 30 | import Foreign.CUDA.Driver.Device 31 | import Foreign.CUDA.Driver.Error 32 | import Foreign.CUDA.Internal.C2HS 33 | 34 | -- System 35 | import Control.Exception 36 | import Control.Monad 37 | import Foreign 38 | import Foreign.C 39 | 40 | 41 | -------------------------------------------------------------------------------- 42 | -- Primary context management 43 | -------------------------------------------------------------------------------- 44 | 45 | 46 | -- | 47 | -- Get the status of the primary context. Returns whether the current 48 | -- context is active, and the flags it was (or will be) created with. 49 | -- 50 | -- Requires CUDA-7.0. 51 | -- 52 | -- 53 | -- 54 | {-# INLINEABLE status #-} 55 | status :: Device -> IO (Bool, [ContextFlag]) 56 | #if CUDA_VERSION < 7000 57 | status _ = requireSDK 'status 7.0 58 | #else 59 | status !dev = 60 | cuDevicePrimaryCtxGetState dev >>= \(rv, !flags, !active) -> 61 | case rv of 62 | Success -> return (active, flags) 63 | _ -> throwIO (ExitCode rv) 64 | 65 | {# fun unsafe cuDevicePrimaryCtxGetState 66 | { useDevice `Device' 67 | , alloca- `[ContextFlag]' peekFlags* 68 | , alloca- `Bool' peekBool* 69 | } -> `Status' cToEnum #} 70 | where 71 | peekFlags = liftM extractBitMasks . peek 72 | #endif 73 | 74 | 75 | -- | 76 | -- Specify the flags that the primary context should be created with. Note 77 | -- that this is an error if the primary context is already active. 78 | -- 79 | -- Requires CUDA-7.0. 
80 | -- 81 | -- 82 | -- 83 | {-# INLINEABLE setup #-} 84 | setup :: Device -> [ContextFlag] -> IO () 85 | #if CUDA_VERSION < 7000 86 | setup _ _ = requireSDK 'setup 7.0 87 | #else 88 | setup !dev !flags = nothingIfOk =<< cuDevicePrimaryCtxSetFlags dev flags 89 | 90 | {-# INLINE cuDevicePrimaryCtxSetFlags #-} 91 | {# fun unsafe cuDevicePrimaryCtxSetFlags 92 | { useDevice `Device' 93 | , combineBitMasks `[ContextFlag]' 94 | } -> `Status' cToEnum #} 95 | #endif 96 | 97 | 98 | -- | 99 | -- Destroy all allocations and reset all state on the primary context of 100 | -- the given device in the current process. Requires cuda-7.0 101 | -- 102 | -- Requires CUDA-7.0. 103 | -- 104 | -- 105 | -- 106 | {-# INLINEABLE reset #-} 107 | reset :: Device -> IO () 108 | #if CUDA_VERSION < 7000 109 | reset _ = requireSDK 'reset 7.0 110 | #else 111 | reset !dev = nothingIfOk =<< cuDevicePrimaryCtxReset dev 112 | 113 | {-# INLINE cuDevicePrimaryCtxReset #-} 114 | {# fun unsafe cuDevicePrimaryCtxReset 115 | { useDevice `Device' } -> `Status' cToEnum #} 116 | #endif 117 | 118 | 119 | -- | 120 | -- Release the primary context on the given device. If there are no more 121 | -- references to the primary context it will be destroyed, regardless of 122 | -- how many threads it is current to. 123 | -- 124 | -- Unlike 'Foreign.CUDA.Driver.Context.Base.pop' this does not pop the 125 | -- context from the stack in any circumstances. 126 | -- 127 | -- Requires CUDA-7.0. 
128 | -- 129 | -- 130 | -- 131 | {-# INLINEABLE release #-} 132 | release :: Device -> IO () 133 | #if CUDA_VERSION < 7000 134 | release _ = requireSDK 'release 7.0 135 | #else 136 | release !dev = nothingIfOk =<< cuDevicePrimaryCtxRelease dev 137 | 138 | {-# INLINE cuDevicePrimaryCtxRelease #-} 139 | {# fun unsafe cuDevicePrimaryCtxRelease 140 | { useDevice `Device' } -> `Status' cToEnum #} 141 | #endif 142 | 143 | 144 | -- | 145 | -- Retain the primary context for the given device, creating it if 146 | -- necessary, and increasing its usage count. The caller must call 147 | -- 'release' when done using the context. Unlike 148 | -- 'Foreign.CUDA.Driver.Context.Base.create' the newly retained context is 149 | -- not pushed onto the stack. 150 | -- 151 | -- Requires CUDA-7.0. 152 | -- 153 | -- 154 | -- 155 | {-# INLINEABLE retain #-} 156 | retain :: Device -> IO Context 157 | #if CUDA_VERSION < 7000 158 | retain _ = requireSDK 'retain 7.0 159 | #else 160 | retain !dev = resultIfOk =<< cuDevicePrimaryCtxRetain dev 161 | 162 | {-# INLINE cuDevicePrimaryCtxRetain #-} 163 | {# fun unsafe cuDevicePrimaryCtxRetain 164 | { alloca- `Context' peekCtx* 165 | , useDevice `Device' 166 | } -> `Status' cToEnum #} 167 | where 168 | peekCtx = liftM Context . peek 169 | #endif 170 | 171 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Graph/Base.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE EmptyDataDecls #-} 4 | {-# LANGUAGE ForeignFunctionInterface #-} 5 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 6 | {-# LANGUAGE TemplateHaskell #-} 7 | #ifdef USE_EMPTY_CASE 8 | {-# LANGUAGE EmptyCase #-} 9 | #endif 10 | -------------------------------------------------------------------------------- 11 | -- | 12 | -- Module : Foreign.CUDA.Driver.Graph.Base 13 | -- Copyright : [2018..2023] Trevor L. 
McDonell 14 | -- License : BSD 15 | -- 16 | -- Graph execution functions for the low-level driver interface 17 | -- 18 | -- Requires CUDA-10 19 | -- 20 | -------------------------------------------------------------------------------- 21 | 22 | module Foreign.CUDA.Driver.Graph.Base 23 | where 24 | 25 | #include "cbits/stubs.h" 26 | {# context lib="cuda" #} 27 | 28 | import Foreign.Storable 29 | import Foreign.Ptr 30 | 31 | 32 | -------------------------------------------------------------------------------- 33 | -- Data Types 34 | -------------------------------------------------------------------------------- 35 | 36 | #if CUDA_VERSION < 10000 37 | data Graph 38 | #else 39 | newtype Graph = Graph { useGraph :: {# type CUgraph #}} 40 | deriving (Eq, Show) 41 | #endif 42 | 43 | data GraphFlag 44 | instance Enum GraphFlag where 45 | #ifdef USE_EMPTY_CASE 46 | toEnum x = error ("GraphFlag.toEnum: Cannot match " ++ show x) 47 | fromEnum x = case x of {} 48 | #endif 49 | 50 | #if CUDA_VERSION < 10000 51 | data Node 52 | data NodeType 53 | 54 | instance Enum NodeType where 55 | #ifdef USE_EMPTY_CASE 56 | toEnum x = error ("NodeType.toEnum: Cannot match " ++ show x) 57 | fromEnum x = case x of {} 58 | #endif 59 | #else 60 | newtype Node = Node { useNode :: {# type CUgraphNode #}} 61 | deriving (Eq, Show, Storable) 62 | 63 | {# enum CUgraphNodeType as NodeType 64 | { underscoreToCase 65 | , CU_GRAPH_NODE_TYPE_GRAPH as Subgraph 66 | } 67 | with prefix="CU_GRAPH_NODE_TYPE" deriving (Eq, Show, Bounded) #} 68 | #endif 69 | 70 | 71 | #if CUDA_VERSION < 10000 72 | data Executable 73 | #else 74 | newtype Executable = Executable { useExecutable :: {# type CUgraphExec #}} 75 | deriving (Eq, Show) 76 | #endif 77 | 78 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Graph/Capture.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | 
{-# LANGUAGE EmptyDataDecls #-} 4 | {-# LANGUAGE ForeignFunctionInterface #-} 5 | {-# LANGUAGE TemplateHaskell #-} 6 | #ifdef USE_EMPTY_CASE 7 | {-# LANGUAGE EmptyCase #-} 8 | #endif 9 | -------------------------------------------------------------------------------- 10 | -- | 11 | -- Module : Foreign.CUDA.Driver.Graph.Capture 12 | -- Copyright : [2018..2023] Trevor L. McDonell 13 | -- License : BSD 14 | -- 15 | -- Requires CUDA-10 16 | -- 17 | -------------------------------------------------------------------------------- 18 | 19 | module Foreign.CUDA.Driver.Graph.Capture ( 20 | 21 | Status(..), Mode(..), 22 | start, stop, status, info, mode, 23 | 24 | ) where 25 | 26 | #include "cbits/stubs.h" 27 | {# context lib="cuda" #} 28 | 29 | import Foreign.CUDA.Driver.Error hiding ( Status ) 30 | import Foreign.CUDA.Driver.Graph.Base 31 | import Foreign.CUDA.Driver.Stream 32 | import Foreign.CUDA.Internal.C2HS 33 | 34 | import Control.Monad ( liftM ) 35 | 36 | import Foreign 37 | import Foreign.C 38 | 39 | 40 | #if CUDA_VERSION < 10000 41 | data Status 42 | 43 | instance Enum Status where 44 | #ifdef USE_EMPTY_CASE 45 | toEnum x = error ("Status.toEnum: Cannot match " ++ show x) 46 | fromEnum x = case x of {} 47 | #endif 48 | 49 | #else 50 | {# enum CUstreamCaptureStatus as Status 51 | { underscoreToCase } 52 | with prefix="CU_STREAM_CAPTURE_STATUS" deriving (Eq, Show, Bounded) #} 53 | #endif 54 | 55 | #if CUDA_VERSION < 10010 56 | data Mode 57 | 58 | instance Enum Mode where 59 | #ifdef USE_EMPTY_CASE 60 | toEnum x = error ("Mode.toEnum: Cannot match " ++ show x) 61 | fromEnum x = case x of {} 62 | #endif 63 | 64 | #else 65 | {# enum CUstreamCaptureMode as Mode 66 | { underscoreToCase } 67 | with prefix="CU_STREAM_CAPTURE_MODE" deriving (Eq, Show, Bounded) #} 68 | #endif 69 | 70 | 71 | -------------------------------------------------------------------------------- 72 | -- Graph capture 73 | 
-------------------------------------------------------------------------------- 73 | 74 | 75 | -- | Begin graph capture on a stream 76 | -- 77 | -- Requires CUDA-10.0 78 | -- 79 | -- 80 | -- 81 | -- @since 0.10.0.0 82 | -- 83 | #if CUDA_VERSION < 10000 84 | start :: Stream -> Mode -> IO () 85 | start _ _ = requireSDK 'start 10.0 86 | #elif CUDA_VERSION < 10010 87 | start :: Stream -> Mode -> IO () 88 | start s _ = cuStreamBeginCapture s 89 | where 90 | {# fun unsafe cuStreamBeginCapture 91 | { useStream `Stream' 92 | } 93 | -> `()' checkStatus*- #} 94 | #else 95 | {# fun unsafe cuStreamBeginCapture as start 96 | { useStream `Stream' 97 | , `Mode' 98 | } 99 | -> `()' checkStatus*- #} 100 | #endif 101 | 102 | 103 | -- | End graph capture on a stream 104 | -- 105 | -- Requires CUDA-10.0 106 | -- 107 | -- 108 | -- 109 | -- @since 0.10.0.0 110 | -- 111 | #if CUDA_VERSION < 10000 112 | stop :: Stream -> IO Graph 113 | stop _ = requireSDK 'stop 10.0 114 | #else 115 | {# fun unsafe cuStreamEndCapture as stop 116 | { useStream `Stream' 117 | , alloca- `Graph' peekGraph* 118 | } 119 | -> `()' checkStatus*- #} 120 | #endif 121 | 122 | 123 | -- | Return a stream's capture status 124 | -- 125 | -- Requires CUDA-10.0 126 | -- 127 | -- 128 | -- 129 | -- @since 0.10.0.0 130 | -- 131 | #if CUDA_VERSION < 10000 132 | status :: Stream -> IO Status 133 | status _ = requireSDK 'status 10.0 134 | #else 135 | {# fun unsafe cuStreamIsCapturing as status 136 | { useStream `Stream' 137 | , alloca- `Status' peekEnum* 138 | } 139 | -> `()' checkStatus*- #} 140 | #endif 141 | 142 | 143 | -- | Query the capture status of a stream and get an id for the capture 144 | -- sequence, which is unique over the lifetime of the process.
145 | -- 146 | -- Requires CUDA-10.1 147 | -- 148 | -- 149 | -- 150 | -- @since 0.10.1.0 151 | -- 152 | #if CUDA_VERSION < 10010 153 | info :: Stream -> IO (Status, Int64) 154 | info _ = requireSDK 'info 10.1 155 | #else 156 | {# fun unsafe cuStreamGetCaptureInfo as info 157 | { useStream `Stream' 158 | , alloca- `Status' peekEnum* 159 | , alloca- `Int64' peekIntConv* 160 | } 161 | -> `()' checkStatus*- #} 162 | #endif 163 | 164 | 165 | -- | Set the stream capture interaction mode for this thread. Return the previous value. 166 | -- 167 | -- Requires CUDA-10.1 168 | -- 169 | -- 170 | -- 171 | -- @since 0.10.1.0 172 | -- 173 | #if CUDA_VERSION < 10010 174 | mode :: Mode -> IO Mode 175 | mode _ = requireSDK 'mode 10.1 176 | #else 177 | {# fun unsafe cuThreadExchangeStreamCaptureMode as mode 178 | { withEnum* `Mode' peekEnum* 179 | } 180 | -> `()' checkStatus*- #} 181 | #endif 182 | 183 | 184 | -------------------------------------------------------------------------------- 185 | -- Internal 186 | -------------------------------------------------------------------------------- 187 | 188 | #if CUDA_VERSION >= 10000 189 | {-# INLINE peekGraph #-} 190 | peekGraph :: Ptr {# type CUgraph #} -> IO Graph 191 | peekGraph = liftM Graph . peek 192 | #endif 193 | 194 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/IPC/Event.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE ForeignFunctionInterface #-} 4 | {-# LANGUAGE TemplateHaskell #-} 5 | -------------------------------------------------------------------------------- 6 | -- | 7 | -- Module : Foreign.CUDA.Driver.IPC.Event 8 | -- Copyright : [2009..2023] Trevor L. McDonell 9 | -- License : BSD 10 | -- 11 | -- IPC event management for low-level driver interface.
12 | -- 13 | -- Restricted to devices which support unified addressing on Linux 14 | -- operating systems. 15 | -- 16 | -- Since CUDA-4.1. 17 | -- 18 | -------------------------------------------------------------------------------- 19 | 20 | module Foreign.CUDA.Driver.IPC.Event ( 21 | 22 | IPCEvent, 23 | export, open, 24 | 25 | ) where 26 | 27 | #include "cbits/stubs.h" 28 | {# context lib="cuda" #} 29 | 30 | -- Friends 31 | import Foreign.CUDA.Driver.Error 32 | import Foreign.CUDA.Driver.Event 33 | import Foreign.CUDA.Internal.C2HS 34 | 35 | -- System 36 | import Control.Monad 37 | import Prelude 38 | 39 | import Foreign.C 40 | import Foreign.Ptr 41 | import Foreign.ForeignPtr 42 | import Foreign.Marshal 43 | import Foreign.Storable 44 | 45 | 46 | -------------------------------------------------------------------------------- 47 | -- Data Types 48 | -------------------------------------------------------------------------------- 49 | 50 | -- | 51 | -- A CUDA inter-process event handle. 52 | -- 53 | newtype IPCEvent = IPCEvent { useIPCEvent :: IPCEventHandle } 54 | deriving (Eq, Show) 55 | 56 | 57 | -------------------------------------------------------------------------------- 58 | -- IPC event management 59 | -------------------------------------------------------------------------------- 60 | 61 | -- | 62 | -- Create an inter-process event handle for a previously allocated event. 63 | -- The event must be created with the 'Interprocess' and 'DisableTiming' 64 | -- event flags. The returned handle may then be sent to another process and 65 | -- 'open'ed to allow efficient hardware synchronisation between GPU work in 66 | -- other processes. 67 | -- 68 | -- After the event has been opened in the importing process, 'record', 69 | -- 'block', 'wait', 'query' may be used in either process. 70 | -- 71 | -- Performing operations on the imported event after the event has been 72 | -- 'destroy'ed in the exporting process is undefined. 
73 | -- 74 | -- Requires CUDA-4.1. 75 | -- 76 | -- 77 | -- 78 | {-# INLINEABLE export #-} 79 | export :: Event -> IO IPCEvent 80 | #if CUDA_VERSION < 4010 81 | export _ = requireSDK 'export 4.1 82 | #else 83 | export !ev = do 84 | h <- newIPCEventHandle 85 | r <- cuIpcGetEventHandle h ev 86 | resultIfOk (r, IPCEvent h) 87 | 88 | {-# INLINE cuIpcGetEventHandle #-} 89 | {# fun unsafe cuIpcGetEventHandle 90 | { withForeignPtr* `IPCEventHandle' 91 | , useEvent `Event' 92 | } 93 | -> `Status' cToEnum #} 94 | #endif 95 | 96 | 97 | -- | 98 | -- Open an inter-process event handle for use in the current process, 99 | -- returning an event that can be used in the current process and behaving 100 | -- as a locally created event with the 'DisableTiming' flag specified. 101 | -- 102 | -- The event must be freed with 'destroy'. Performing operations on the 103 | -- imported event after the exported event has been 'destroy'ed in the 104 | -- exporting process is undefined. 105 | -- 106 | -- Requires CUDA-4.1. 107 | -- 108 | -- 109 | -- 110 | {-# INLINEABLE open #-} 111 | open :: IPCEvent -> IO Event 112 | #if CUDA_VERSION < 4010 113 | open _ = requireSDK 'open 4.1 114 | #else 115 | open !ev = resultIfOk =<< cuIpcOpenEventHandle (useIPCEvent ev) 116 | 117 | {-# INLINE cuIpcOpenEventHandle #-} 118 | {# fun unsafe cuIpcOpenEventHandle 119 | { alloca- `Event' peekEvent* 120 | , withForeignPtr* `IPCEventHandle' 121 | } 122 | -> `Status' cToEnum #} 123 | where 124 | peekEvent = liftM Event .
peek 125 | #endif 126 | 127 | 128 | -------------------------------------------------------------------------------- 129 | -- Internal 130 | -------------------------------------------------------------------------------- 131 | 132 | type IPCEventHandle = ForeignPtr () 133 | 134 | newIPCEventHandle :: IO IPCEventHandle 135 | #if CUDA_VERSION < 4010 136 | newIPCEventHandle = requireSDK 'newIPCEventHandle 4.1 137 | #else 138 | newIPCEventHandle = mallocForeignPtrBytes {#sizeof CUipcEventHandle#} 139 | #endif 140 | 141 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/IPC/Marshal.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE EmptyDataDecls #-} 4 | {-# LANGUAGE ForeignFunctionInterface #-} 5 | {-# LANGUAGE TemplateHaskell #-} 6 | -------------------------------------------------------------------------------- 7 | -- | 8 | -- Module : Foreign.CUDA.Driver.IPC.Marshal 9 | -- Copyright : [2009..2023] Trevor L. McDonell 10 | -- License : BSD 11 | -- 12 | -- IPC memory management for low-level driver interface. 13 | -- 14 | -- Restricted to devices which support unified addressing on Linux 15 | -- operating systems. 16 | -- 17 | -- Since CUDA-4.0. 
18 | -- 19 | -------------------------------------------------------------------------------- 20 | 21 | module Foreign.CUDA.Driver.IPC.Marshal ( 22 | 23 | -- ** IPC memory management 24 | IPCDevicePtr, IPCFlag(..), 25 | export, open, close, 26 | 27 | ) where 28 | 29 | #include "cbits/stubs.h" 30 | {# context lib="cuda" #} 31 | 32 | -- Friends 33 | import Foreign.CUDA.Ptr 34 | import Foreign.CUDA.Driver.Error 35 | import Foreign.CUDA.Internal.C2HS 36 | import Foreign.CUDA.Driver.Marshal 37 | 38 | -- System 39 | import Control.Monad 40 | import Prelude 41 | 42 | import Foreign.C 43 | import Foreign.Ptr 44 | import Foreign.ForeignPtr 45 | import Foreign.Marshal 46 | 47 | 48 | -------------------------------------------------------------------------------- 49 | -- Data Types 50 | -------------------------------------------------------------------------------- 51 | 52 | -- | 53 | -- A CUDA memory handle used for inter-process communication. 54 | -- 55 | newtype IPCDevicePtr a = IPCDevicePtr { useIPCDevicePtr :: IPCMemHandle } 56 | deriving (Eq, Show) 57 | 58 | 59 | -- | 60 | -- Flags for controlling IPC memory access 61 | -- 62 | #if CUDA_VERSION < 4010 63 | data IPCFlag 64 | #else 65 | {# enum CUipcMem_flags as IPCFlag 66 | { underscoreToCase } 67 | with prefix="CU_IPC_MEM" deriving (Eq, Show, Bounded) #} 68 | #endif 69 | 70 | 71 | -------------------------------------------------------------------------------- 72 | -- IPC memory management 73 | -------------------------------------------------------------------------------- 74 | 75 | -- | 76 | -- Create an inter-process memory handle for an existing device memory 77 | -- allocation. The handle can then be sent to another process and made 78 | -- available to that process via 'open'. 79 | -- 80 | -- Requires CUDA-4.1. 
81 | -- 82 | -- 83 | -- 84 | {-# INLINEABLE export #-} 85 | export :: DevicePtr a -> IO (IPCDevicePtr a) 86 | #if CUDA_VERSION < 4010 87 | export _ = requireSDK 'export 4.1 88 | #else 89 | export !dptr = do 90 | h <- newIPCMemHandle 91 | r <- cuIpcGetMemHandle h dptr 92 | resultIfOk (r, IPCDevicePtr h) 93 | 94 | {-# INLINE cuIpcGetMemHandle #-} 95 | {# fun unsafe cuIpcGetMemHandle 96 | { withForeignPtr* `IPCMemHandle' 97 | , useDeviceHandle `DevicePtr a' 98 | } 99 | -> `Status' cToEnum #} 100 | #endif 101 | 102 | 103 | -- | 104 | -- Open an inter-process memory handle exported from another process, 105 | -- returning a device pointer usable in the current process. 106 | -- 107 | -- Maps memory exported by another process with 'export' into the current 108 | -- device address space. For contexts on different devices, 'open' can 109 | -- attempt to enable peer access if the user called 110 | -- 'Foreign.CUDA.Driver.Context.Peer.add', and is controlled by the 111 | -- 'LazyEnablePeerAccess' flag. 112 | -- 113 | -- Each handle from a given device and context may only be 'open'ed by one 114 | -- context per device per other process. Memory returned by 'open' must be 115 | -- freed via 'close'. 116 | -- 117 | -- Requires CUDA-4.1. 118 | -- 119 | -- 120 | -- 121 | {-# INLINEABLE open #-} 122 | open :: IPCDevicePtr a -> [IPCFlag] -> IO (DevicePtr a) 123 | #if CUDA_VERSION < 4010 124 | open _ _ = requireSDK 'open 4.1 125 | #else 126 | open !hdl !flags = resultIfOk =<< cuIpcOpenMemHandle (useIPCDevicePtr hdl) flags 127 | 128 | {-# INLINE cuIpcOpenMemHandle #-} 129 | {# fun unsafe cuIpcOpenMemHandle 130 | { alloca- `DevicePtr a' peekDeviceHandle* 131 | , withForeignPtr* `IPCMemHandle' 132 | , combineBitMasks `[IPCFlag]' 133 | } 134 | -> `Status' cToEnum #} 135 | #endif 136 | 137 | 138 | -- | 139 | -- Close and unmap memory returned by 'open'. The original allocation in 140 | -- the exporting process as well as imported mappings in other processes 141 | -- are unaffected.
142 | -- 143 | -- Any resources used to enable peer access will be freed if this is the 144 | -- last mapping using them. 145 | -- 146 | -- Requires CUDA-4.1. 147 | -- 148 | -- 149 | -- 150 | {-# INLINEABLE close #-} 151 | close :: DevicePtr a -> IO () 152 | #if CUDA_VERSION < 4010 153 | close _ = requireSDK 'close 4.1 154 | #else 155 | close !dptr = nothingIfOk =<< cuIpcCloseMemHandle dptr 156 | 157 | {-# INLINE cuIpcCloseMemHandle #-} 158 | {# fun unsafe cuIpcCloseMemHandle 159 | { useDeviceHandle `DevicePtr a' 160 | } 161 | -> `Status' cToEnum #} 162 | #endif 163 | 164 | 165 | -------------------------------------------------------------------------------- 166 | -- Internal 167 | -------------------------------------------------------------------------------- 168 | 169 | type IPCMemHandle = ForeignPtr () 170 | 171 | newIPCMemHandle :: IO IPCMemHandle 172 | #if CUDA_VERSION < 4010 173 | newIPCMemHandle = requireSDK 'newIPCMemHandle 4.1 174 | #else 175 | newIPCMemHandle = mallocForeignPtrBytes {#sizeof CUipcMemHandle#} 176 | #endif 177 | 178 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Module.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- | 3 | -- Module : Foreign.CUDA.Driver.Module 4 | -- Copyright : [2009..2023] Trevor L. 
McDonell 5 | -- License : BSD 6 | -- 7 | -- Module management for low-level driver interface 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | module Foreign.CUDA.Driver.Module ( 12 | 13 | module Foreign.CUDA.Driver.Module.Base, 14 | module Foreign.CUDA.Driver.Module.Query, 15 | 16 | ) where 17 | 18 | import Foreign.CUDA.Driver.Module.Base hiding ( JITOptionInternal(..), useModule, jitOptionUnpack, jitTargetOfCompute ) 19 | import Foreign.CUDA.Driver.Module.Query 20 | 21 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Module/Query.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE ForeignFunctionInterface #-} 4 | {-# LANGUAGE MagicHash #-} 5 | {-# LANGUAGE UnboxedTuples #-} 6 | -------------------------------------------------------------------------------- 7 | -- | 8 | -- Module : Foreign.CUDA.Driver.Module.Query 9 | -- Copyright : [2009..2023] Trevor L. 
McDonell 10 | -- License : BSD 11 | -- 12 | -- Querying module attributes for low-level driver interface 13 | -- 14 | -------------------------------------------------------------------------------- 15 | 16 | module Foreign.CUDA.Driver.Module.Query ( 17 | 18 | -- ** Querying module inhabitants 19 | getFun, getPtr, getTex, 20 | 21 | ) where 22 | 23 | #include "cbits/stubs.h" 24 | {# context lib="cuda" #} 25 | 26 | -- Friends 27 | import Foreign.CUDA.Driver.Error 28 | import Foreign.CUDA.Driver.Exec 29 | import Foreign.CUDA.Driver.Marshal ( peekDeviceHandle ) 30 | import Foreign.CUDA.Driver.Module.Base 31 | import Foreign.CUDA.Driver.Texture 32 | import Foreign.CUDA.Internal.C2HS 33 | import Foreign.CUDA.Ptr 34 | 35 | -- System 36 | import Foreign 37 | import Foreign.C 38 | import Control.Exception ( throwIO ) 39 | import Control.Monad ( liftM ) 40 | import Data.ByteString.Short ( ShortByteString ) 41 | import qualified Data.ByteString.Short as BS 42 | import qualified Data.ByteString.Short.Internal as BI 43 | import qualified Data.ByteString.Internal as BI 44 | import Prelude as P 45 | 46 | import GHC.Exts 47 | import GHC.Base ( IO(..) ) 48 | 49 | 50 | -------------------------------------------------------------------------------- 51 | -- Querying module attributes 52 | -------------------------------------------------------------------------------- 53 | 54 | -- | 55 | -- Returns a function handle. 56 | -- 57 | -- 58 | -- 59 | {-# INLINEABLE getFun #-} 60 | getFun :: Module -> ShortByteString -> IO Fun 61 | getFun !mdl !fn = resultIfFound "function" fn =<< cuModuleGetFunction mdl fn 62 | 63 | {-# INLINE cuModuleGetFunction #-} 64 | {# fun unsafe cuModuleGetFunction 65 | { alloca- `Fun' peekFun* 66 | , useModule `Module' 67 | , useAsCString* `ShortByteString' 68 | } 69 | -> `Status' cToEnum #} 70 | where 71 | peekFun = liftM Fun . peek 72 | 73 | 74 | -- | 75 | -- Return a global pointer, and size of the global (in bytes). 
76 | -- 77 | -- 78 | -- 79 | {-# INLINEABLE getPtr #-} 80 | getPtr :: Module -> ShortByteString -> IO (DevicePtr a, Int) 81 | getPtr !mdl !name = do 82 | (!status,!dptr,!bytes) <- cuModuleGetGlobal mdl name 83 | resultIfFound "global" name (status,(dptr,bytes)) 84 | 85 | {-# INLINE cuModuleGetGlobal #-} 86 | {# fun unsafe cuModuleGetGlobal 87 | { alloca- `DevicePtr a' peekDeviceHandle* 88 | , alloca- `Int' peekIntConv* 89 | , useModule `Module' 90 | , useAsCString* `ShortByteString' 91 | } 92 | -> `Status' cToEnum #} 93 | 94 | 95 | -- | 96 | -- Return a handle to a texture reference. This texture reference handle 97 | -- should not be destroyed, as the texture will be destroyed automatically 98 | -- when the module is unloaded. 99 | -- 100 | -- 101 | -- 102 | {-# INLINEABLE getTex #-} 103 | getTex :: Module -> ShortByteString -> IO Texture 104 | getTex !mdl !name = resultIfFound "texture" name =<< cuModuleGetTexRef mdl name 105 | 106 | {-# INLINE cuModuleGetTexRef #-} 107 | {# fun unsafe cuModuleGetTexRef 108 | { alloca- `Texture' peekTex* 109 | , useModule `Module' 110 | , useAsCString* `ShortByteString' 111 | } 112 | -> `Status' cToEnum #} 113 | 114 | 115 | -------------------------------------------------------------------------------- 116 | -- Internal 117 | -------------------------------------------------------------------------------- 118 | 119 | {-# INLINE resultIfFound #-} 120 | resultIfFound :: String -> ShortByteString -> (Status, a) -> IO a 121 | resultIfFound kind name (!status,!result) = 122 | case status of 123 | Success -> return result 124 | NotFound -> cudaErrorIO (kind ++ ' ' : describe status ++ ": " ++ unpack name) 125 | _ -> throwIO (ExitCode status) 126 | 127 | 128 | -- Utilities 129 | -- --------- 130 | 131 | -- [Short]ByteStrings are not null-terminated, so can't be passed directly to C. 
132 | -- 133 | -- unsafeUseAsCString :: ShortByteString -> CString 134 | -- unsafeUseAsCString (BI.SBS ba#) = Ptr (byteArrayContents# ba#) 135 | 136 | {-# INLINE useAsCString #-} 137 | useAsCString :: ShortByteString -> (CString -> IO a) -> IO a 138 | useAsCString (BI.SBS ba#) action = IO $ \s0 -> 139 | case sizeofByteArray# ba# of { n# -> 140 | case newPinnedByteArray# (n# +# 1#) s0 of { (# s1, mba# #) -> 141 | case byteArrayContents# (unsafeCoerce# mba#) of { addr# -> 142 | case copyByteArrayToAddr# ba# 0# addr# n# s1 of { s2 -> 143 | case writeWord8OffAddr# addr# n# (wordToWord8# 0##) s2 of { s3 -> 144 | case action (Ptr addr#) of { IO action' -> 145 | case action' s3 of { (# s4, r #) -> 146 | case touch# mba# s4 of { s5 -> 147 | (# s5, r #) 148 | }}}}}}}} 149 | 150 | 151 | {-# INLINE unpack #-} 152 | unpack :: ShortByteString -> [Char] 153 | unpack = P.map BI.w2c . BS.unpack 154 | 155 | #if __GLASGOW_HASKELL__ < 902 156 | {-# INLINE wordToWord8# #-} 157 | wordToWord8# :: Word# -> Word# 158 | wordToWord8# x = x 159 | #endif 160 | 161 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Profiler.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | -------------------------------------------------------------------------------- 3 | -- | 4 | -- Module : Foreign.CUDA.Driver.Profiler 5 | -- Copyright : [2009..2023] Trevor L. 
McDonell 6 | -- License : BSD 7 | -- 8 | -- Profiler control for low-level driver interface 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module Foreign.CUDA.Driver.Profiler ( 13 | 14 | OutputMode(..), 15 | initialise, 16 | start, stop, 17 | 18 | ) where 19 | 20 | #include "cbits/stubs.h" 21 | {# context lib="cuda" #} 22 | 23 | -- friends 24 | import Foreign.CUDA.Driver.Error 25 | import Foreign.CUDA.Internal.C2HS 26 | 27 | -- system 28 | import Foreign 29 | import Foreign.C 30 | 31 | 32 | -- | Profiler output mode 33 | -- 34 | {# enum CUoutput_mode as OutputMode 35 | { underscoreToCase 36 | , CU_OUT_CSV as CSV } 37 | with prefix="CU_OUT" deriving (Eq, Show) #} 38 | 39 | 40 | -- | Initialise the CUDA profiler. 41 | -- 42 | -- The configuration file is used to specify profiling options and profiling 43 | -- counters. Refer to the "Compute Command Line Profiler User Guide" for 44 | -- supported profiler options and counters. 45 | -- 46 | -- Note that the CUDA profiler can not be initialised with this function if 47 | -- another profiling tool is already active. 48 | -- 49 | -- 50 | -- 51 | {-# INLINEABLE initialise #-} 52 | initialise 53 | :: FilePath -- ^ configuration file that itemises which counters and/or options to profile 54 | -> FilePath -- ^ output file where profiling results will be stored 55 | -> OutputMode 56 | -> IO () 57 | initialise config output mode 58 | = nothingIfOk =<< cuProfilerInitialize config output mode 59 | 60 | {-# INLINE cuProfilerInitialize #-} 61 | {# fun unsafe cuProfilerInitialize 62 | { `String' 63 | , `String' 64 | , cFromEnum `OutputMode' 65 | } 66 | -> `Status' cToEnum #} 67 | 68 | 69 | -- | Begin profiling collection by the active profiling tool for the current 70 | -- context. If profiling is already enabled, then this has no effect. 
71 | -- 72 | -- 'start' and 'stop' can be used to programatically control profiling 73 | -- granularity, by allowing profiling to be done only on selected pieces of 74 | -- code. 75 | -- 76 | -- 77 | -- 78 | {-# INLINEABLE start #-} 79 | start :: IO () 80 | start = nothingIfOk =<< cuProfilerStart 81 | 82 | {-# INLINE cuProfilerStart #-} 83 | {# fun unsafe cuProfilerStart 84 | { } -> `Status' cToEnum #} 85 | 86 | 87 | -- | Stop profiling collection by the active profiling tool for the current 88 | -- context, and force all pending profiler events to be written to the output 89 | -- file. If profiling is already inactive, this has no effect. 90 | -- 91 | -- 92 | -- 93 | {-# INLINEABLE stop #-} 94 | stop :: IO () 95 | stop = nothingIfOk =<< cuProfilerStop 96 | 97 | {-# INLINE cuProfilerStop #-} 98 | {# fun unsafe cuProfilerStop 99 | { } -> `Status' cToEnum #} 100 | 101 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Driver/Utils.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | -------------------------------------------------------------------------------- 3 | -- | 4 | -- Module : Foreign.CUDA.Driver.Utils 5 | -- Copyright : [2009..2023] Trevor L. McDonell 6 | -- License : BSD 7 | -- 8 | -- Utility functions 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module Foreign.CUDA.Driver.Utils ( 13 | 14 | driverVersion, 15 | libraryVersion, 16 | 17 | ) where 18 | 19 | #include "cbits/stubs.h" 20 | {# context lib="cuda" #} 21 | 22 | -- Friends 23 | import Foreign.CUDA.Driver.Error 24 | import Foreign.CUDA.Internal.C2HS 25 | 26 | -- System 27 | import Foreign 28 | import Foreign.C 29 | 30 | 31 | -- | 32 | -- Return the version number of the installed CUDA driver. 
33 | -- 34 | {-# INLINEABLE driverVersion #-} 35 | driverVersion :: IO Int 36 | driverVersion = resultIfOk =<< cuDriverGetVersion 37 | 38 | {-# INLINE cuDriverGetVersion #-} 39 | {# fun unsafe cuDriverGetVersion 40 | { alloca- `Int' peekIntConv* } -> `Status' cToEnum #} 41 | 42 | 43 | -- | 44 | -- Return the version number of the CUDA library (API) that this package was 45 | -- compiled against. 46 | -- 47 | {-# INLINEABLE libraryVersion #-} 48 | libraryVersion :: Int 49 | libraryVersion = {#const CUDA_VERSION #} 50 | 51 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Path.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | -------------------------------------------------------------------------------- 3 | -- | 4 | -- Module : Foreign.CUDA.Path 5 | -- Copyright : [2017..2023] Trevor L. McDonell 6 | -- License : BSD 7 | -- 8 | -------------------------------------------------------------------------------- 9 | 10 | module Foreign.CUDA.Path ( 11 | 12 | cudaInstallPath, 13 | cudaBinPath, cudaLibraryPath, cudaIncludePath, 14 | 15 | ) where 16 | 17 | import System.FilePath 18 | 19 | -- | The base path to the CUDA toolkit installation that this package was 20 | -- compiled against. 21 | -- 22 | cudaInstallPath :: FilePath 23 | cudaInstallPath = {#const CUDA_INSTALL_PATH#} 24 | 25 | -- | The path where the CUDA toolkit executables, such as @nvcc@ and @ptxas@, 26 | -- can be found. 
27 | -- 28 | cudaBinPath :: FilePath 29 | cudaBinPath = cudaInstallPath </> "bin" 30 | 31 | -- | The path where the CUDA libraries this package was linked against are 32 | -- located 33 | -- 34 | cudaLibraryPath :: FilePath 35 | cudaLibraryPath = {#const CUDA_LIBRARY_PATH#} 36 | 37 | -- | The path where the CUDA headers this package was built against are located 38 | -- 39 | cudaIncludePath :: FilePath 40 | cudaIncludePath = cudaInstallPath </> "include" 41 | 42 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Runtime.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- | 3 | -- Module    : Foreign.CUDA.Runtime 4 | -- Copyright : [2009..2023] Trevor L. McDonell 5 | -- License   : BSD 6 | -- 7 | -- Top level bindings to the C-for-CUDA runtime API 8 | -- 9 | -------------------------------------------------------------------------------- 10 | 11 | module Foreign.CUDA.Runtime ( 12 | 13 |   module Foreign.CUDA.Ptr, 14 |   module Foreign.CUDA.Runtime.Device, 15 |   module Foreign.CUDA.Runtime.Error, 16 |   module Foreign.CUDA.Runtime.Exec, 17 |   module Foreign.CUDA.Runtime.Marshal, 18 |   module Foreign.CUDA.Runtime.Utils 19 | 20 | ) where 21 | 22 | import Foreign.CUDA.Ptr 23 | import Foreign.CUDA.Runtime.Device 24 | import Foreign.CUDA.Runtime.Error 25 | import Foreign.CUDA.Runtime.Exec 26 | import Foreign.CUDA.Runtime.Marshal 27 | import Foreign.CUDA.Runtime.Utils 28 | 29 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Runtime/Error.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns             #-} 2 | {-# LANGUAGE DeriveDataTypeable       #-} 3 | {-# LANGUAGE ForeignFunctionInterface #-} 4 | -------------------------------------------------------------------------------- 5 | -- | 6 | -- Module    : 
Foreign.CUDA.Runtime.Error 7 | -- Copyright : [2009..2023] Trevor L. McDonell 8 | -- License : BSD 9 | -- 10 | -- Error handling functions 11 | -- 12 | -------------------------------------------------------------------------------- 13 | 14 | module Foreign.CUDA.Runtime.Error ( 15 | 16 | Status(..), CUDAException(..), 17 | 18 | cudaError, describe, requireSDK, 19 | resultIfOk, nothingIfOk, checkStatus, 20 | 21 | ) where 22 | 23 | -- Friends 24 | import Foreign.CUDA.Internal.C2HS 25 | import Text.Show.Describe 26 | 27 | -- System 28 | import Control.Exception 29 | import Data.Typeable 30 | import Foreign.C 31 | import Foreign.Ptr 32 | import Language.Haskell.TH 33 | import System.IO.Unsafe 34 | import Text.Printf 35 | 36 | #include "cbits/stubs.h" 37 | {# context lib="cudart" #} 38 | 39 | 40 | -------------------------------------------------------------------------------- 41 | -- Return Status 42 | -------------------------------------------------------------------------------- 43 | 44 | -- | 45 | -- Return codes from API functions 46 | -- 47 | {# enum cudaError as Status 48 | { cudaSuccess as Success } 49 | with prefix="cudaError" deriving (Eq, Show) #} 50 | 51 | -------------------------------------------------------------------------------- 52 | -- Exceptions 53 | -------------------------------------------------------------------------------- 54 | 55 | data CUDAException 56 | = ExitCode Status 57 | | UserError String 58 | deriving Typeable 59 | 60 | instance Exception CUDAException 61 | 62 | instance Show CUDAException where 63 | showsPrec _ (ExitCode s) = showString ("CUDA Exception: " ++ describe s) 64 | showsPrec _ (UserError s) = showString ("CUDA Exception: " ++ s) 65 | 66 | 67 | -- | 68 | -- Raise a 'CUDAException' in the IO Monad 69 | -- 70 | cudaError :: String -> IO a 71 | cudaError s = throwIO (UserError s) 72 | 73 | -- | 74 | -- A specially formatted error message 75 | -- 76 | requireSDK :: Name -> Double -> IO a 77 | requireSDK n v = cudaError $ 
printf "'%s' requires at least cuda-%3.1f\n" (show n) v 78 | 79 | 80 | -------------------------------------------------------------------------------- 81 | -- Helper Functions 82 | -------------------------------------------------------------------------------- 83 | 84 | -- | 85 | -- Return the descriptive string associated with a particular error code 86 | -- 87 | instance Describe Status where 88 | describe = cudaGetErrorString 89 | 90 | -- Logically, this must be a pure function, returning a pointer to a statically 91 | -- defined string constant. 92 | -- 93 | {# fun pure unsafe cudaGetErrorString 94 | { cFromEnum `Status' } -> `String' #} 95 | 96 | 97 | -- | 98 | -- Return the results of a function on successful execution, otherwise return 99 | -- the error string associated with the return code 100 | -- 101 | {-# INLINE resultIfOk #-} 102 | resultIfOk :: (Status, a) -> IO a 103 | resultIfOk (status, !result) = 104 | case status of 105 | Success -> return result 106 | _ -> throwIO (ExitCode status) 107 | 108 | 109 | -- | 110 | -- Return the error string associated with an unsuccessful return code, 111 | -- otherwise Nothing 112 | -- 113 | {-# INLINE nothingIfOk #-} 114 | nothingIfOk :: Status -> IO () 115 | nothingIfOk status = 116 | case status of 117 | Success -> return () 118 | _ -> throwIO (ExitCode status) 119 | 120 | {-# INLINE checkStatus #-} 121 | checkStatus :: CInt -> IO () 122 | checkStatus = nothingIfOk . 
cToEnum 123 | 124 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Runtime/Event.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE EmptyDataDecls #-} 4 | {-# LANGUAGE ForeignFunctionInterface #-} 5 | {-# LANGUAGE TemplateHaskell #-} 6 | -------------------------------------------------------------------------------- 7 | -- | 8 | -- Module : Foreign.CUDA.Driver.Event 9 | -- Copyright : [2009..2023] Trevor L. McDonell 10 | -- License : BSD 11 | -- 12 | -- Event management for C-for-CUDA runtime environment 13 | -- 14 | -------------------------------------------------------------------------------- 15 | 16 | module Foreign.CUDA.Runtime.Event ( 17 | 18 | -- * Event Management 19 | Event, EventFlag(..), WaitFlag, 20 | create, destroy, elapsedTime, query, record, wait, block 21 | 22 | ) where 23 | 24 | #include "cbits/stubs.h" 25 | {# context lib="cudart" #} 26 | 27 | -- Friends 28 | import Foreign.CUDA.Driver.Event ( Event(..), EventFlag(..), WaitFlag ) 29 | import Foreign.CUDA.Driver.Stream ( Stream(..), defaultStream ) 30 | import Foreign.CUDA.Internal.C2HS 31 | import Foreign.CUDA.Runtime.Error 32 | 33 | -- System 34 | import Foreign 35 | import Foreign.C 36 | import Control.Monad ( liftM ) 37 | import Control.Exception ( throwIO ) 38 | import Data.Maybe ( fromMaybe ) 39 | 40 | 41 | -------------------------------------------------------------------------------- 42 | -- Event management 43 | -------------------------------------------------------------------------------- 44 | 45 | -- | 46 | -- Create a new event 47 | -- 48 | {-# INLINEABLE create #-} 49 | create :: [EventFlag] -> IO Event 50 | create !flags = resultIfOk =<< cudaEventCreateWithFlags flags 51 | 52 | {-# INLINE cudaEventCreateWithFlags #-} 53 | {# fun unsafe cudaEventCreateWithFlags 54 | { alloca- `Event' peekEvt* 55 | , combineBitMasks 
`[EventFlag]' } -> `Status' cToEnum #} 56 | where peekEvt = liftM Event . peek 57 | 58 | 59 | -- | 60 | -- Destroy an event 61 | -- 62 | {-# INLINEABLE destroy #-} 63 | destroy :: Event -> IO () 64 | destroy !ev = nothingIfOk =<< cudaEventDestroy ev 65 | 66 | {-# INLINE cudaEventDestroy #-} 67 | {# fun unsafe cudaEventDestroy 68 | { useEvent `Event' } -> `Status' cToEnum #} 69 | 70 | 71 | -- | 72 | -- Determine the elapsed time (in milliseconds) between two events 73 | -- 74 | {-# INLINEABLE elapsedTime #-} 75 | elapsedTime :: Event -> Event -> IO Float 76 | elapsedTime !ev1 !ev2 = resultIfOk =<< cudaEventElapsedTime ev1 ev2 77 | 78 | {-# INLINE cudaEventElapsedTime #-} 79 | {# fun unsafe cudaEventElapsedTime 80 | { alloca- `Float' peekFloatConv* 81 | , useEvent `Event' 82 | , useEvent `Event' } -> `Status' cToEnum #} 83 | 84 | 85 | -- | 86 | -- Determines if a event has actually been recorded 87 | -- 88 | {-# INLINEABLE query #-} 89 | query :: Event -> IO Bool 90 | query !ev = 91 | cudaEventQuery ev >>= \rv -> 92 | case rv of 93 | Success -> return True 94 | NotReady -> return False 95 | _ -> throwIO (ExitCode rv) 96 | 97 | {-# INLINE cudaEventQuery #-} 98 | {# fun unsafe cudaEventQuery 99 | { useEvent `Event' } -> `Status' cToEnum #} 100 | 101 | 102 | -- | 103 | -- Record an event once all operations in the current context (or optionally 104 | -- specified stream) have completed. This operation is asynchronous. 105 | -- 106 | {-# INLINEABLE record #-} 107 | record :: Event -> Maybe Stream -> IO () 108 | record !ev !mst = 109 | nothingIfOk =<< cudaEventRecord ev (maybe defaultStream id mst) 110 | 111 | {-# INLINE cudaEventRecord #-} 112 | {# fun unsafe cudaEventRecord 113 | { useEvent `Event' 114 | , useStream `Stream' } -> `Status' cToEnum #} 115 | 116 | 117 | -- | 118 | -- Makes all future work submitted to the (optional) stream wait until the given 119 | -- event reports completion before beginning execution. 
Synchronisation is 120 | -- performed on the device, including when the event and stream are from 121 | -- different device contexts. Requires cuda-3.2. 122 | -- 123 | {-# INLINEABLE wait #-} 124 | wait :: Event -> Maybe Stream -> [WaitFlag] -> IO () 125 | #if CUDART_VERSION < 3020 126 | wait _ _ _ = requireSDK 'wait 3.2 127 | #else 128 | wait !ev !mst !flags = 129 | let st = fromMaybe defaultStream mst 130 | in nothingIfOk =<< cudaStreamWaitEvent st ev flags 131 | 132 | {-# INLINE cudaStreamWaitEvent #-} 133 | {# fun unsafe cudaStreamWaitEvent 134 | { useStream `Stream' 135 | , useEvent `Event' 136 | , combineBitMasks `[WaitFlag]' } -> `Status' cToEnum #} 137 | #endif 138 | 139 | -- | 140 | -- Wait until the event has been recorded 141 | -- 142 | {-# INLINEABLE block #-} 143 | block :: Event -> IO () 144 | block !ev = nothingIfOk =<< cudaEventSynchronize ev 145 | 146 | {-# INLINE cudaEventSynchronize #-} 147 | {# fun cudaEventSynchronize 148 | { useEvent `Event' } -> `Status' cToEnum #} 149 | 150 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Runtime/Stream.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE ForeignFunctionInterface #-} 3 | -------------------------------------------------------------------------------- 4 | -- | 5 | -- Module : Foreign.CUDA.Runtime.Stream 6 | -- Copyright : [2009..2023] Trevor L. 
McDonell 7 | -- License : BSD 8 | -- 9 | -- Stream management routines 10 | -- 11 | -------------------------------------------------------------------------------- 12 | 13 | module Foreign.CUDA.Runtime.Stream ( 14 | 15 | -- * Stream Management 16 | Stream(..), 17 | create, destroy, finished, block, 18 | 19 | defaultStream, defaultStreamLegacy, defaultStreamPerThread, 20 | 21 | ) where 22 | 23 | #include "cbits/stubs.h" 24 | {# context lib="cudart" #} 25 | 26 | -- Friends 27 | import Foreign.CUDA.Driver.Stream ( Stream(..), defaultStream, defaultStreamLegacy, defaultStreamPerThread ) 28 | import Foreign.CUDA.Internal.C2HS 29 | import Foreign.CUDA.Runtime.Error 30 | 31 | -- System 32 | import Foreign 33 | import Foreign.C 34 | import Control.Monad ( liftM ) 35 | import Control.Exception ( throwIO ) 36 | 37 | 38 | -------------------------------------------------------------------------------- 39 | -- Functions 40 | -------------------------------------------------------------------------------- 41 | 42 | -- | 43 | -- Create a new asynchronous stream 44 | -- 45 | {-# INLINEABLE create #-} 46 | create :: IO Stream 47 | create = resultIfOk =<< cudaStreamCreate 48 | 49 | {-# INLINE cudaStreamCreate #-} 50 | {# fun unsafe cudaStreamCreate 51 | { alloca- `Stream' peekStream* } -> `Status' cToEnum #} 52 | 53 | 54 | -- | 55 | -- Destroy and clean up an asynchronous stream 56 | -- 57 | {-# INLINEABLE destroy #-} 58 | destroy :: Stream -> IO () 59 | destroy !s = nothingIfOk =<< cudaStreamDestroy s 60 | 61 | {-# INLINE cudaStreamDestroy #-} 62 | {# fun unsafe cudaStreamDestroy 63 | { useStream `Stream' } -> `Status' cToEnum #} 64 | 65 | 66 | -- | 67 | -- Determine if all operations in a stream have completed 68 | -- 69 | {-# INLINEABLE finished #-} 70 | finished :: Stream -> IO Bool 71 | finished !s = 72 | cudaStreamQuery s >>= \rv -> do 73 | case rv of 74 | Success -> return True 75 | NotReady -> return False 76 | _ -> throwIO (ExitCode rv) 77 | 78 | {-# INLINE 
cudaStreamQuery #-} 79 | {# fun unsafe cudaStreamQuery 80 | { useStream `Stream' } -> `Status' cToEnum #} 81 | 82 | 83 | -- | 84 | -- Block until all operations in a Stream have been completed 85 | -- 86 | {-# INLINEABLE block #-} 87 | block :: Stream -> IO () 88 | block !s = nothingIfOk =<< cudaStreamSynchronize s 89 | 90 | {-# INLINE cudaStreamSynchronize #-} 91 | {# fun cudaStreamSynchronize 92 | { useStream `Stream' } -> `Status' cToEnum #} 93 | 94 | 95 | -- | 96 | -- The main execution stream (0) 97 | -- 98 | -- {-# INLINE defaultStream #-} 99 | -- defaultStream :: Stream 100 | -- #if CUDART_VERSION < 3010 101 | -- defaultStream = Stream 0 102 | -- #else 103 | -- defaultStream = Stream nullPtr 104 | -- #endif 105 | 106 | -------------------------------------------------------------------------------- 107 | -- Internal 108 | -------------------------------------------------------------------------------- 109 | 110 | {-# INLINE peekStream #-} 111 | peekStream :: Ptr {#type cudaStream_t#} -> IO Stream 112 | #if CUDART_VERSION < 3010 113 | peekStream = liftM Stream . peekIntConv 114 | #else 115 | peekStream = liftM Stream . peek 116 | #endif 117 | 118 | -------------------------------------------------------------------------------- /src/Foreign/CUDA/Runtime/Utils.chs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | -------------------------------------------------------------------------------- 3 | -- | 4 | -- Module : Foreign.CUDA.Runtime.Utils 5 | -- Copyright : [2009..2023] Trevor L. 
McDonell 6 | -- License   : BSD 7 | -- 8 | -- Utility functions 9 | -- 10 | -------------------------------------------------------------------------------- 11 | 12 | module Foreign.CUDA.Runtime.Utils ( 13 | 14 |   runtimeVersion, 15 |   driverVersion, 16 |   libraryVersion, 17 | 18 | ) where 19 | 20 | #include "cbits/stubs.h" 21 | {# context lib="cudart" #} 22 | 23 | -- Friends 24 | import Foreign.CUDA.Runtime.Error 25 | import Foreign.CUDA.Internal.C2HS 26 | 27 | -- System 28 | import Foreign 29 | import Foreign.C 30 | 31 | 32 | -- | 33 | -- Return the version number of the installed CUDA runtime 34 | -- 35 | {-# INLINEABLE runtimeVersion #-} 36 | runtimeVersion :: IO Int 37 | runtimeVersion = resultIfOk =<< cudaRuntimeGetVersion 38 | 39 | {-# INLINE cudaRuntimeGetVersion #-} 40 | {# fun unsafe cudaRuntimeGetVersion 41 |   { alloca- `Int' peekIntConv* } -> `Status' cToEnum #} 42 | 43 | 44 | -- | 45 | -- Return the version number of the installed CUDA driver 46 | -- 47 | {-# INLINEABLE driverVersion #-} 48 | driverVersion :: IO Int 49 | driverVersion = resultIfOk =<< cudaDriverGetVersion 50 | 51 | {-# INLINE cudaDriverGetVersion #-} 52 | {# fun unsafe cudaDriverGetVersion 53 |   { alloca- `Int' peekIntConv* } -> `Status' cToEnum #} 54 | 55 | 56 | -- | 57 | -- Return the version number of the CUDA library (API) that this package was 58 | -- compiled against. 59 | -- 60 | {-# INLINEABLE libraryVersion #-} 61 | libraryVersion :: Int 62 | libraryVersion = {#const CUDART_VERSION #} 63 | 64 | -------------------------------------------------------------------------------- /src/Text/Show/Describe.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | -- | 3 | -- Module    : Text.Show.Describe 4 | -- Copyright : [2016..2023] Trevor L. 
McDonell 5 | -- License : BSD 6 | -- 7 | -------------------------------------------------------------------------------- 8 | 9 | module Text.Show.Describe 10 | where 11 | 12 | 13 | -- | Like 'Text.Show.Show', but focuses on providing a more detailed description 14 | -- of the value rather than a 'Text.Read.read'able representation. 15 | -- 16 | class Describe a where 17 | describe :: a -> String 18 | 19 | -------------------------------------------------------------------------------- /stack-7.10.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | # vim: nospell 3 | 4 | resolver: lts-6.35 5 | 6 | packages: 7 | - . 8 | 9 | extra-deps: 10 | - Cabal-1.24.2.0 11 | 12 | # Override default flag values for local packages and extra-deps 13 | # flags: {} 14 | 15 | # Extra global and per-package GHC options 16 | # ghc-options: {} 17 | 18 | # Control whether we use the GHC we find on the path 19 | # system-ghc: true 20 | 21 | # Require a specific version of stack, using version ranges 22 | # require-stack-version: -any # Default 23 | # require-stack-version: >= 0.1.4.0 24 | 25 | # Override the architecture used by stack, especially useful on Windows 26 | # arch: i386 27 | # arch: x86_64 28 | 29 | # Extra directories used by stack for building 30 | # extra-include-dirs: [/path/to/dir] 31 | # extra-lib-dirs: [/path/to/dir] 32 | 33 | -------------------------------------------------------------------------------- /stack-7.8.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | # vim: nospell 3 | 4 | resolver: lts-2.22 5 | 6 | packages: 7 | - . 
8 | 9 | extra-deps: 10 | - Cabal-1.24.2.0 11 | 12 | # Override default flag values for local packages and extra-deps 13 | # flags: {} 14 | 15 | # Extra global and per-package GHC options 16 | # ghc-options: {} 17 | 18 | # Control whether we use the GHC we find on the path 19 | # system-ghc: true 20 | 21 | # Require a specific version of stack, using version ranges 22 | # require-stack-version: -any # Default 23 | # require-stack-version: >= 0.1.4.0 24 | 25 | # Override the architecture used by stack, especially useful on Windows 26 | # arch: i386 27 | # arch: x86_64 28 | 29 | # Extra directories used by stack for building 30 | # extra-include-dirs: [/path/to/dir] 31 | # extra-lib-dirs: [/path/to/dir] 32 | 33 | -------------------------------------------------------------------------------- /stack-8.0.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | # vim: nospell 3 | 4 | resolver: lts-9.21 5 | 6 | packages: 7 | - . 
8 | 9 | # extra-deps: [] 10 | 11 | # Override default flag values for local packages and extra-deps 12 | # flags: {} 13 | 14 | # Extra global and per-package GHC options 15 | # ghc-options: {} 16 | 17 | # Control whether we use the GHC we find on the path 18 | # system-ghc: true 19 | 20 | # Require a specific version of stack, using version ranges 21 | # require-stack-version: -any # Default 22 | # require-stack-version: >= 0.1.4.0 23 | 24 | # Override the architecture used by stack, especially useful on Windows 25 | # arch: i386 26 | # arch: x86_64 27 | 28 | # Extra directories used by stack for building 29 | # extra-include-dirs: [/path/to/dir] 30 | # extra-lib-dirs: [/path/to/dir] 31 | 32 | -------------------------------------------------------------------------------- /stack-8.10.yaml: -------------------------------------------------------------------------------- 1 | # For advanced use and comprehensive documentation of the format, please see: 2 | # https://docs.haskellstack.org/en/stable/yaml_configuration/ 3 | 4 | resolver: lts-18.28 5 | 6 | packages: 7 | - . 
8 | 9 | # extra-deps: [] 10 | 11 | # Override default flag values for local packages and extra-deps 12 | # flags: {} 13 | 14 | # Extra package databases containing global packages 15 | # extra-package-dbs: [] 16 | 17 | # Control whether we use the GHC we find on the path 18 | # system-ghc: true 19 | # 20 | # Require a specific version of stack, using version ranges 21 | # require-stack-version: -any # Default 22 | # require-stack-version: ">=2.1" 23 | # 24 | # Override the architecture used by stack, especially useful on Windows 25 | # arch: i386 26 | # arch: x86_64 27 | # 28 | # Extra directories used by stack for building 29 | # extra-include-dirs: [/path/to/dir] 30 | # extra-lib-dirs: [/path/to/dir] 31 | # 32 | # Allow a newer minor version of GHC than the snapshot specifies 33 | # compiler-check: newer-minor 34 | 35 | # vim: nospell 36 | -------------------------------------------------------------------------------- /stack-8.2.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | # vim: nospell 3 | 4 | resolver: lts-11.22 5 | 6 | packages: 7 | - . 
8 | 9 | # extra-deps: [] 10 | 11 | # Override default flag values for local packages and extra-deps 12 | # flags: {} 13 | 14 | # Extra global and per-package GHC options 15 | # ghc-options: {} 16 | 17 | # Control whether we use the GHC we find on the path 18 | # system-ghc: true 19 | 20 | # Require a specific version of stack, using version ranges 21 | # require-stack-version: -any # Default 22 | # require-stack-version: >= 0.1.4.0 23 | 24 | # Override the architecture used by stack, especially useful on Windows 25 | # arch: i386 26 | # arch: x86_64 27 | 28 | # Extra directories used by stack for building 29 | # extra-include-dirs: [/path/to/dir] 30 | # extra-lib-dirs: [/path/to/dir] 31 | 32 | -------------------------------------------------------------------------------- /stack-8.4.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | # vim: nospell 3 | 4 | resolver: lts-12.26 5 | 6 | packages: 7 | - . 
8 | 9 | # extra-deps: [] 10 | 11 | # Override default flag values for local packages and extra-deps 12 | # flags: {} 13 | 14 | # Extra global and per-package GHC options 15 | # ghc-options: {} 16 | 17 | # Extra package databases containing global packages 18 | # extra-package-dbs: [] 19 | 20 | # Control whether we use the GHC we find on the path 21 | # system-ghc: true 22 | 23 | # Require a specific version of stack, using version ranges 24 | # require-stack-version: -any # Default 25 | # require-stack-version: >= 0.1.4.0 26 | 27 | # Override the architecture used by stack, especially useful on Windows 28 | # arch: i386 29 | # arch: x86_64 30 | 31 | # Extra directories used by stack for building 32 | # extra-include-dirs: [/path/to/dir] 33 | # extra-lib-dirs: [/path/to/dir] 34 | 35 | -------------------------------------------------------------------------------- /stack-8.6.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | # vim: nospell 3 | 4 | resolver: lts-14.27 5 | 6 | packages: 7 | - . 
8 | 9 | # extra-deps: [] 10 | 11 | # Override default flag values for local packages and extra-deps 12 | # flags: {} 13 | 14 | # Extra global and per-package GHC options 15 | # ghc-options: {} 16 | 17 | # Extra package databases containing global packages 18 | # extra-package-dbs: [] 19 | 20 | # Control whether we use the GHC we find on the path 21 | # system-ghc: true 22 | 23 | # Require a specific version of stack, using version ranges 24 | # require-stack-version: -any # Default 25 | # require-stack-version: >= 0.1.4.0 26 | 27 | # Override the architecture used by stack, especially useful on Windows 28 | # arch: i386 29 | # arch: x86_64 30 | 31 | # Extra directories used by stack for building 32 | # extra-include-dirs: [/path/to/dir] 33 | # extra-lib-dirs: [/path/to/dir] 34 | 35 | -------------------------------------------------------------------------------- /stack-8.8.yaml: -------------------------------------------------------------------------------- 1 | # For advanced use and comprehensive documentation of the format, please see: 2 | # https://docs.haskellstack.org/en/stable/yaml_configuration/ 3 | 4 | resolver: lts-16.31 5 | 6 | packages: 7 | - . 
8 | 9 | # extra-deps: [] 10 | 11 | # Override default flag values for local packages and extra-deps 12 | # flags: {} 13 | 14 | # Extra package databases containing global packages 15 | # extra-package-dbs: [] 16 | 17 | # Control whether we use the GHC we find on the path 18 | # system-ghc: true 19 | # 20 | # Require a specific version of stack, using version ranges 21 | # require-stack-version: -any # Default 22 | # require-stack-version: ">=2.1" 23 | # 24 | # Override the architecture used by stack, especially useful on Windows 25 | # arch: i386 26 | # arch: x86_64 27 | # 28 | # Extra directories used by stack for building 29 | # extra-include-dirs: [/path/to/dir] 30 | # extra-lib-dirs: [/path/to/dir] 31 | # 32 | # Allow a newer minor version of GHC than the snapshot specifies 33 | # compiler-check: newer-minor 34 | 35 | # vim: nospell 36 | -------------------------------------------------------------------------------- /stack-9.0.yaml: -------------------------------------------------------------------------------- 1 | # Some commonly used options have been documented as comments in this file. 2 | # For advanced use and comprehensive documentation of the format, please see: 3 | # https://docs.haskellstack.org/en/stable/yaml_configuration/ 4 | 5 | resolver: lts-19.33 6 | 7 | packages: 8 | - . 
9 | 10 | # extra-deps: [] 11 | 12 | # Override default flag values for local packages and extra-deps 13 | # flags: {} 14 | 15 | # Extra package databases containing global packages 16 | # extra-package-dbs: [] 17 | 18 | # Control whether we use the GHC we find on the path 19 | # system-ghc: true 20 | # 21 | # Require a specific version of stack, using version ranges 22 | # require-stack-version: -any # Default 23 | # require-stack-version: ">=1.9" 24 | # 25 | # Override the architecture used by stack, especially useful on Windows 26 | # arch: i386 27 | # arch: x86_64 28 | # 29 | # Extra directories used by stack for building 30 | # extra-include-dirs: [/path/to/dir] 31 | # extra-lib-dirs: [/path/to/dir] 32 | # 33 | # Allow a newer minor version of GHC than the snapshot specifies 34 | # compiler-check: newer-minor 35 | # 36 | # vim: nospell 37 | --------------------------------------------------------------------------------