├── .github
    └── FUNDING.yml
├── .gitignore
├── LICENSE
├── README.md
├── project.clj
├── src
    ├── clojure
    │   └── uncomplicate
    │   │   └── clojurecuda
    │   │       ├── core.clj
    │   │       ├── info.clj
    │   │       ├── internal
    │   │           ├── constants.clj
    │   │           ├── impl.clj
    │   │           └── utils.clj
    │   │       └── toolbox.clj
    ├── cuda
    │   └── uncomplicate
    │   │   └── clojurecuda
    │   │       ├── include
    │   │           └── jitify
    │   │           │   ├── LICENSE
    │   │           │   ├── float.h
    │   │           │   ├── stddef.h
    │   │           │   └── stdint.h
    │   │       └── kernels
    │   │           └── reduction.cu
    └── java
    │   └── uncomplicate
    │       └── clojurecuda
    │           └── internal
    │               └── javacpp
    │                   ├── CUHostFn.java
    │                   └── CUStreamCallback.java
└── test
    ├── clojure
        └── uncomplicate
        │   └── clojurecuda
        │       ├── core_test.clj
        │       ├── examples
        │           ├── dynamic_parallelism_test.clj
        │           └── vector_add_test.clj
        │       ├── info_test.clj
        │       ├── toolbox_test.clj
        │       └── utils_test.clj
    └── cuda
        ├── examples
            ├── dynamic-parallelism.cu
            └── jnvrtc-vector-add.cu
        └── uncomplicate
            └── clojurecuda
                └── kernels
                    ├── test.cu
                    └── toolbox-test.cu


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | 
 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 4 | patreon: draganrocks
 5 | open_collective: # Replace with a single Open Collective username
 6 | ko_fi: # Replace with a single Ko-fi username
 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | /target
 2 | /lib
 3 | /classes
 4 | /checkouts
 5 | pom.xml
 6 | pom.xml.asc
 7 | *.jar
 8 | *.class
 9 | /.lein-*
10 | /.nrepl-port
11 | doc
12 | docs
13 | hs_*.log
14 | .#*
15 | .DS_Store
16 | *.o
17 | *.so
18 | */nrepl-port
19 | */target
20 | .idea
21 | /*.iml
22 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC
  2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM
  3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
  4 | 
  5 | 1. DEFINITIONS
  6 | 
  7 | "Contribution" means:
  8 | 
  9 | a) in the case of the initial Contributor, the initial code and
 10 | documentation distributed under this Agreement, and
 11 | 
 12 | b) in the case of each subsequent Contributor:
 13 | 
 14 | i) changes to the Program, and
 15 | 
 16 | ii) additions to the Program;
 17 | 
 18 | where such changes and/or additions to the Program originate from and are
 19 | distributed by that particular Contributor. A Contribution 'originates' from
 20 | a Contributor if it was added to the Program by such Contributor itself or
 21 | anyone acting on such Contributor's behalf. Contributions do not include
 22 | additions to the Program which: (i) are separate modules of software
 23 | distributed in conjunction with the Program under their own license
 24 | agreement, and (ii) are not derivative works of the Program.
 25 | 
 26 | "Contributor" means any person or entity that distributes the Program.
 27 | 
 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are
 29 | necessarily infringed by the use or sale of its Contribution alone or when
 30 | combined with the Program.
 31 | 
 32 | "Program" means the Contributions distributed in accordance with this
 33 | Agreement.
 34 | 
 35 | "Recipient" means anyone who receives the Program under this Agreement,
 36 | including all Contributors.
 37 | 
 38 | 2. GRANT OF RIGHTS
 39 | 
 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants
 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to
 42 | reproduce, prepare derivative works of, publicly display, publicly perform,
 43 | distribute and sublicense the Contribution of such Contributor, if any, and
 44 | such derivative works, in source code and object code form.
 45 | 
 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants
 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under
 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise
 49 | transfer the Contribution of such Contributor, if any, in source code and
 50 | object code form.  This patent license shall apply to the combination of the
 51 | Contribution and the Program if, at the time the Contribution is added by the
 52 | Contributor, such addition of the Contribution causes such combination to be
 53 | covered by the Licensed Patents. The patent license shall not apply to any
 54 | other combinations which include the Contribution. No hardware per se is
 55 | licensed hereunder.
 56 | 
 57 | c) Recipient understands that although each Contributor grants the licenses
 58 | to its Contributions set forth herein, no assurances are provided by any
 59 | Contributor that the Program does not infringe the patent or other
 60 | intellectual property rights of any other entity. Each Contributor disclaims
 61 | any liability to Recipient for claims brought by any other entity based on
 62 | infringement of intellectual property rights or otherwise. As a condition to
 63 | exercising the rights and licenses granted hereunder, each Recipient hereby
 64 | assumes sole responsibility to secure any other intellectual property rights
 65 | needed, if any. For example, if a third party patent license is required to
 66 | allow Recipient to distribute the Program, it is Recipient's responsibility
 67 | to acquire that license before distributing the Program.
 68 | 
 69 | d) Each Contributor represents that to its knowledge it has sufficient
 70 | copyright rights in its Contribution, if any, to grant the copyright license
 71 | set forth in this Agreement.
 72 | 
 73 | 3. REQUIREMENTS
 74 | 
 75 | A Contributor may choose to distribute the Program in object code form under
 76 | its own license agreement, provided that:
 77 | 
 78 | a) it complies with the terms and conditions of this Agreement; and
 79 | 
 80 | b) its license agreement:
 81 | 
 82 | i) effectively disclaims on behalf of all Contributors all warranties and
 83 | conditions, express and implied, including warranties or conditions of title
 84 | and non-infringement, and implied warranties or conditions of merchantability
 85 | and fitness for a particular purpose;
 86 | 
 87 | ii) effectively excludes on behalf of all Contributors all liability for
 88 | damages, including direct, indirect, special, incidental and consequential
 89 | damages, such as lost profits;
 90 | 
 91 | iii) states that any provisions which differ from this Agreement are offered
 92 | by that Contributor alone and not by any other party; and
 93 | 
 94 | iv) states that source code for the Program is available from such
 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on
 96 | or through a medium customarily used for software exchange.
 97 | 
 98 | When the Program is made available in source code form:
 99 | 
100 | a) it must be made available under this Agreement; and
101 | 
102 | b) a copy of this Agreement must be included with each copy of the Program.
103 | 
104 | Contributors may not remove or alter any copyright notices contained within
105 | the Program.
106 | 
107 | Each Contributor must identify itself as the originator of its Contribution,
108 | if any, in a manner that reasonably allows subsequent Recipients to identify
109 | the originator of the Contribution.
110 | 
111 | 4. COMMERCIAL DISTRIBUTION
112 | 
113 | Commercial distributors of software may accept certain responsibilities with
114 | respect to end users, business partners and the like. While this license is
115 | intended to facilitate the commercial use of the Program, the Contributor who
116 | includes the Program in a commercial product offering should do so in a
117 | manner which does not create potential liability for other Contributors.
118 | Therefore, if a Contributor includes the Program in a commercial product
119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend
120 | and indemnify every other Contributor ("Indemnified Contributor") against any
121 | losses, damages and costs (collectively "Losses") arising from claims,
122 | lawsuits and other legal actions brought by a third party against the
123 | Indemnified Contributor to the extent caused by the acts or omissions of such
124 | Commercial Contributor in connection with its distribution of the Program in
125 | a commercial product offering.  The obligations in this section do not apply
126 | to any claims or Losses relating to any actual or alleged intellectual
127 | property infringement. In order to qualify, an Indemnified Contributor must:
128 | a) promptly notify the Commercial Contributor in writing of such claim, and
129 | b) allow the Commercial Contributor tocontrol, and cooperate with the
130 | Commercial Contributor in, the defense and any related settlement
131 | negotiations. The Indemnified Contributor may participate in any such claim
132 | at its own expense.
133 | 
134 | For example, a Contributor might include the Program in a commercial product
135 | offering, Product X. That Contributor is then a Commercial Contributor. If
136 | that Commercial Contributor then makes performance claims, or offers
137 | warranties related to Product X, those performance claims and warranties are
138 | such Commercial Contributor's responsibility alone. Under this section, the
139 | Commercial Contributor would have to defend claims against the other
140 | Contributors related to those performance claims and warranties, and if a
141 | court requires any other Contributor to pay any damages as a result, the
142 | Commercial Contributor must pay those damages.
143 | 
144 | 5. NO WARRANTY
145 | 
146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON
147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR
149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A
150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the
151 | appropriateness of using and distributing the Program and assumes all risks
152 | associated with its exercise of rights under this Agreement , including but
153 | not limited to the risks and costs of program errors, compliance with
154 | applicable laws, damage to or loss of data, programs or equipment, and
155 | unavailability or interruption of operations.
156 | 
157 | 6. DISCLAIMER OF LIABILITY
158 | 
159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY
160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL,
161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION
162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE
165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY
166 | OF SUCH DAMAGES.
167 | 
168 | 7. GENERAL
169 | 
170 | If any provision of this Agreement is invalid or unenforceable under
171 | applicable law, it shall not affect the validity or enforceability of the
172 | remainder of the terms of this Agreement, and without further action by the
173 | parties hereto, such provision shall be reformed to the minimum extent
174 | necessary to make such provision valid and enforceable.
175 | 
176 | If Recipient institutes patent litigation against any entity (including a
177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself
178 | (excluding combinations of the Program with other software or hardware)
179 | infringes such Recipient's patent(s), then such Recipient's rights granted
180 | under Section 2(b) shall terminate as of the date such litigation is filed.
181 | 
182 | All Recipient's rights under this Agreement shall terminate if it fails to
183 | comply with any of the material terms or conditions of this Agreement and
184 | does not cure such failure in a reasonable period of time after becoming
185 | aware of such noncompliance. If all Recipient's rights under this Agreement
186 | terminate, Recipient agrees to cease use and distribution of the Program as
187 | soon as reasonably practicable. However, Recipient's obligations under this
188 | Agreement and any licenses granted by Recipient relating to the Program shall
189 | continue and survive.
190 | 
191 | Everyone is permitted to copy and distribute copies of this Agreement, but in
192 | order to avoid inconsistency the Agreement is copyrighted and may only be
193 | modified in the following manner. The Agreement Steward reserves the right to
194 | publish new versions (including revisions) of this Agreement from time to
195 | time. No one other than the Agreement Steward has the right to modify this
196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. The
197 | Eclipse Foundation may assign the responsibility to serve as the Agreement
198 | Steward to a suitable separate entity. Each new version of the Agreement will
199 | be given a distinguishing version number. The Program (including
200 | Contributions) may always be distributed subject to the version of the
201 | Agreement under which it was received. In addition, after a new version of
202 | the Agreement is published, Contributor may elect to distribute the Program
203 | (including its Contributions) under the new version. Except as expressly
204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or
205 | licenses to the intellectual property of any Contributor under this
206 | Agreement, whether expressly, by implication, estoppel or otherwise. All
207 | rights in the Program not expressly granted under this Agreement are
208 | reserved.
209 | 
210 | This Agreement is governed by the laws of the State of New York and the
211 | intellectual property laws of the United States of America. No party to this
212 | Agreement will bring a legal action under this Agreement more than one year
213 | after the cause of action arose. Each party waives its rights to a jury trial
214 | in any resulting litigation.
215 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [New books available for subscription](https://aiprobook.com)
 2 | 
 3 | <img src="http://aiprobook.com/img/dlfp-cover.png" alt="Deep Learning for Programmers" title="Deep Learning for Programmers" align="left" width="250"/>
 4 | 
 5 | <img src="http://aiprobook.com/img/lafp-cover.png" alt="Numerical Linear Algebra for Programmers" title="Numerical Linear Algebra for Programmers" align="right" width="250"/>
 6 | 
 7 | # ClojureCUDA
 8 | 
 9 | [Adopt your pet function](https://dragan.rocks/articles/18/Patreon-Announcement-Adopt-a-Function) and [become a patron](https://patreon.com/draganrocks).
10 | 
11 | Clojure library for CUDA development. See the documentation at [ClojureCUDA website](https://clojurecuda.uncomplicate.org).
12 | 
13 | ## License
14 | 
15 | Copyright © 2017-2019 Dragan Djuric
16 | 
17 | Distributed under the Eclipse Public License either version 1.0 or (at your option) any later version.
18 | 


--------------------------------------------------------------------------------
/project.clj:
--------------------------------------------------------------------------------
 1 | ;;   Copyright (c) Dragan Djuric. All rights reserved.
 2 | ;;   The use and distribution terms for this software are covered by the
 3 | ;;   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
 4 | ;;   which can be found in the file LICENSE at the root of this distribution.
 5 | ;;   By using this software in any fashion, you are agreeing to be bound by
 6 | ;;   the terms of this license.
 7 | ;;   You must not remove this notice, or any other, from this software.
 8 | 
;; Leiningen build definition for ClojureCUDA: coordinates, dependencies,
;; development profiles, snapshot repository and source layout.
(defproject uncomplicate/clojurecuda "0.21.1-SNAPSHOT"
  :description "ClojureCUDA is a Clojure library for parallel computations with Nvidia's CUDA."
  :url "https://github.com/uncomplicate/clojurecuda"
  :scm {:name "git"
        :url "https://github.com/uncomplicate/clojurecuda"}
  :license {:name "Eclipse Public License"
            :url "http://www.eclipse.org/legal/epl-v10.html"}
  ;; Runtime dependencies. Two of them are SNAPSHOT versions, which is why the
  ;; snapshots repository is declared in :repositories below.
  :dependencies [[org.clojure/clojure "1.12.0"]
                 [org.clojure/core.async "1.7.701"]
                 [uncomplicate/commons "0.16.1"]
                 [uncomplicate/fluokitten "0.10.0"]
                 [org.uncomplicate/clojure-cpp "0.4.1-SNAPSHOT"]
                 [org.bytedeco/cuda-platform "12.8-9.8-1.5.12-SNAPSHOT"]]

  ;; :dev is a composite profile: the shared :dev/all profile plus whatever profile
  ;; keyword (leiningen.core.utils/get-os) evaluates to at project load time
  ;; (presumably :linux or :windows, matching the profiles defined below).
  :profiles {:dev [:dev/all ~(leiningen.core.utils/get-os)]
             :dev/all {:plugins [[lein-midje "3.2.1"]
                                 [lein-codox "0.10.8"]
                                 [com.github.clj-kondo/lein-clj-kondo "0.2.5"]]
                       :global-vars {*warn-on-reflection* true
                                     *assert* true
                                     *unchecked-math* :warn-on-boxed
                                     *print-length* 128}
                       :dependencies [[midje "1.10.10"]
                                      [codox-theme-rdash "0.1.2"]]
                       ;; API documentation is generated only for the public namespaces
                       ;; and the constants namespace.
                       :codox {:metadata {:doc/format :markdown}
                               :source-uri "http://github.com/uncomplicate/clojurecuda/blob/master/{filepath}#L{line}"
                               :output-path "docs/codox"
                               :themes [:rdash]
                               :namespaces [uncomplicate.clojurecuda.core
                                            uncomplicate.clojurecuda.info
                                            uncomplicate.clojurecuda.toolbox
                                            uncomplicate.clojurecuda.internal.constants]}}
             ;; OS-specific redistributable CUDA binaries.
             ;; NOTE(review): the linux classifier uses 12.8-9.8 (same as cuda-platform above)
             ;; while the windows classifier uses 12.9-9.9 - confirm this mismatch is intended.
             :linux {:dependencies [[org.bytedeco/cuda "12.8-9.8-1.5.12-SNAPSHOT" :classifier linux-x86_64-redist]]}
             :windows {:dependencies [[org.bytedeco/cuda "12.9-9.9-1.5.12-SNAPSHOT" :classifier windows-x86_64-redist]]}}

  :repositories [["snapshots" "https://oss.sonatype.org/content/repositories/snapshots"]]

  ;; The Java sources under src/java are compiled for Java 1.8 source/target level.
  :javac-options ["-target" "1.8" "-source" "1.8" "-Xlint:-options"]

  ;; CUDA sources live next to the Clojure sources and are listed as source/test paths.
  :source-paths ["src/clojure" "src/cuda"]
  :test-paths ["test/clojure" "test/cuda"]
  :java-source-paths ["src/java"])
51 | 


--------------------------------------------------------------------------------
/src/clojure/uncomplicate/clojurecuda/core.clj:
--------------------------------------------------------------------------------
  1 | ;;   Copyright (c) Dragan Djuric. All rights reserved.
  2 | ;;   The use and distribution terms for this software are covered by the
  3 | ;;   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
  4 | ;;   which can be found in the file LICENSE at the root of this distribution.
  5 | ;;   By using this software in any fashion, you are agreeing to be bound by
  6 | ;;   the terms of this license.
  7 | ;;   You must not remove this notice, or any other, from this software.
  8 | 
  9 | (ns ^{:author "Dragan Djuric"}
 10 |     uncomplicate.clojurecuda.core
 11 |   "Core ClojureCUDA functions for CUDA **host** programming. The kernels should
 12 |   be provided as strings (that may be stored and read from files) or binaries, written in CUDA C/C++.
 13 | 
 14 |   Many examples are available in ClojureCUDA [core test](https://github.com/uncomplicate/clojurecuda/blob/master/test/clojure/uncomplicate/clojurecuda/core_test.clj).
 15 |   You can see how to write CUDA [kernels here](https://github.com/uncomplicate/clojurecuda/tree/master/test/cuda/examples)
 16 |   and [here](https://github.com/uncomplicate/clojurecuda/tree/master/test/cuda/uncomplicate/clojurecuda/kernels)
 17 |   and examples of [how to load them here](https://github.com/uncomplicate/clojurecuda/tree/master/test/clojure/uncomplicate/clojurecuda/examples/).
 18 | 
 19 |   For more advanced examples, please read the source code of the CUDA engine of [Neanderthal linear algebra library](https://github.com/uncomplicate/neanderthal) (mainly general CUDA and cuBLAS are used there),
 20 |   and the [Deep Diamond tensor and linear algebra library](https://github.com/uncomplicate/deep-diamond) (for extensive use of cuDNN).
 21 | 
 22 |   Here's a categorized map of core functions. Most functions throw `ExceptionInfo` in case of errors
 23 |   thrown by the CUDA driver.
 24 | 
 25 |   - Device management: [[init]], [[device-count]], [[device]].
 26 |   - Context management: [[context]], [[current-context]], [[current-context!]], [[pop-context!]],
 27 |   [[push-context!]], [[in-context]], [[with-context]], [[with-default]].
 28 |   - Memory management: [[memcpy!]], [[memcpy-to-host!]], [[memcpy-to-device!]], [[memset!]],
 29 |   [[mem-sub-region]], [[mem-alloc-driver]], [[mem-alloc-runtime]], [[cuda-malloc]], [[cuda-free!]],
 30 |   [[mem-alloc-pinned]], [[mem-register-pinned!]], [[mem-alloc-mapped]].
 31 |   - Module management: [[link]], [[link-complete!]], [[load!]], [[module]].
 32 |   - Execution control: [[grid-1d]], [[grid-2d]], [[grid-3d]], [[global]], [[set-parameter!]],
 33 |   [[parameters]], [[function]], [[launch!]].
 34 |   - Stream management: [[stream]], [[default-stream]], [[ready?]], [[synchronize!]],
 35 |   [[add-host-fn!]], [[listen!]], [[wait-event!]], [[attach-mem!]].
 36 |   - Event management: [[event]], [[elapsed-time!]], [[record!]], [[can-access-peer]],
 37 |   [[p2p-attribute]], [[disable-peer-access!]], [[enable-peer-access!]].
 38 |   - NVRTC program JIT: [[program]], [[program-log]], [[compile!]], [[ptx]].
 39 | 
 40 |   Please see [CUDA Driver API](https://docs.nvidia.com/cuda/pdf/CUDA_Driver_API.pdf) for details
 41 |   not discussed in ClojureCUDA documentation.
 42 |   "
 43 |   (:require [uncomplicate.commons
 44 |              [core :refer [with-release let-release info bytesize sizeof size]]
 45 |              [utils :refer [mask count-groups dragan-says-ex]]]
 46 |             [uncomplicate.fluokitten.protocols :refer [extract]]
 47 |             [uncomplicate.clojure-cpp
 48 |              :refer [null? pointer byte-pointer string-pointer int-pointer long-pointer
 49 |                      size-t-pointer pointer-pointer get-entry put-entry! safe type-pointer position!
 50 |                      capacity! address]]
 51 |             [uncomplicate.clojurecuda.info :as cuda-info]
 52 |             [uncomplicate.clojurecuda.internal
 53 |              [constants :refer [ctx-flags event-flags mem-attach-flags mem-host-alloc-flags
 54 |                                 mem-host-register-flags p2p-attributes stream-flags]]
 55 |              [impl :refer [->CUDevice ->CUDevicePtr add-host-fn* attach-mem* can-access-peer*
 56 |                            compile* context* cu-address* current-context* event* host-fn* link*
 57 |                            malloc-runtime* mem-alloc-host* mem-alloc-managed* mem-host-alloc*
 58 |                            mem-host-register* memcpy* memcpy-host* memset* module-load* offset
 59 |                            p2p-attribute* program* program-log* ptx* ready* set-parameter* stream*]]
 60 |              [utils :refer [with-check]]])
 61 |   (:import [org.bytedeco.javacpp Pointer LongPointer SizeTPointer PointerPointer]
 62 |            org.bytedeco.cuda.global.cudart
 63 |            [org.bytedeco.cuda.cudart CUctx_st CUlinkState_st CUmod_st CUfunc_st CUstream_st CUevent_st]))
 64 | 
 65 | (defn init
 66 |   "Initializes the CUDA driver. This function must be called before any other function
 67 |   from ClojureCUDA in the current process.
 68 |   See [CUDA Initialization](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html)
 69 |   "
 70 |   []
 71 |   (with-check (cudart/cuInit 0) true))
 72 | 
 73 | ;; ================== Device Management ====================================
 74 | 
 75 | (defn device-count
 76 |   "Returns the number of CUDA devices on the system.
 77 |   See [CUDA Device Management](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html).
 78 |   "
 79 |   ^long []
 80 |   (let [res (int-pointer 1)]
 81 |     (with-check (cudart/cuDeviceGetCount res) (get-entry res 0))))
 82 | 
 83 | (defn device
 84 |   "Returns a device specified with its ordinal number `id` or string PCI Bus `id`.
 85 |   See [CUDA Device Management](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html).
 86 |   "
 87 |   ([id]
 88 |    (with-release [res (int-pointer 1)]
 89 |      (with-check
 90 |        (if (number? id)
 91 |          (cudart/cuDeviceGet res (long id))
 92 |          (cudart/cuDeviceGetByPCIBusId res ^String id))
 93 |        {:device-id id}
 94 |        (->CUDevice (get-entry res 0)))))
 95 |   ([]
 96 |    (device 0)))
 97 | 
 98 | ;; =================== Context Management ==================================
 99 | 
(defn context
  "Creates a CUDA context on the device `dev`.
  The optional keyword `flag` selects one of [[internal.constants/ctx-flags]];
  without it, the flag value `0` is used. An unknown `flag` throws `ExceptionInfo`
  whose data holds the offending `:flag` and the `:available` flags.
  The context must be released after use.

  See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
  "
  ([dev]
   (context* (extract dev) 0))
  ([dev flag]
   ;; The device is extracted before the flag lookup, as in a plain nested call.
   (let [raw-dev (extract dev)]
     (if-let [flag-value (ctx-flags flag)]
       (context* raw-dev flag-value)
       (throw (ex-info "Unknown context flag." {:flag flag :available ctx-flags}))))))
113 | 
(defn current-context
  "Returns the CUDA context that is currently bound to the calling CPU thread
  (delegates to the zero-argument `current-context*`).
  See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
  "
  [] (current-context*))
120 | 
(defn current-context!
  "Makes `ctx` the CUDA context bound to the calling CPU thread and returns `ctx`.
  See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
  "
  [ctx]
  ;; doto binds first, then hands back the same context for chaining.
  (doto ctx
    (current-context*)))
128 | 
(defn pop-context!
  "Pops the current CUDA context from the current CPU thread and returns it.
  See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
  "
  []
  ;; cuCtxPopCurrent writes the popped context into the freshly created handle.
  (let [popped (CUctx_st.)]
    (with-check
      (cudart/cuCtxPopCurrent popped)
      popped)))
136 | 
(defn push-context!
  "Pushes the context `ctx` onto the context stack of the current CPU thread
  and returns `ctx`.
  See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
  "
  [^CUctx_st ctx]
  (with-check
    (cudart/cuCtxPushCurrent ctx)
    ctx))
143 | 
(defmacro in-context
  "Pushes the context `ctx` to the top of the context stack, evaluates the body with `ctx`
  as the current context, and pops the context from the stack, even when the body throws.
  The context is popped only if pushing it succeeded: when [[push-context!]] itself throws,
  nothing is popped and that exception propagates unchanged.
  Does NOT release the context, unlike [[with-context]].
  See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
  "
  [ctx & body]
  ;; The push happens outside of try, so that finally never pops a context
  ;; that has not been pushed by this form.
  `(let [pushed# (push-context! ~ctx)]
     (try
       ;; With an empty body the form still evaluates to the pushed context.
       pushed#
       ~@body
       (finally (pop-context!)))))
155 | 
(defmacro with-context
  "Evaluates the body inside the context `ctx` (see [[in-context]] for the push/pop
  handling) and releases the context afterwards, which [[in-context]] does not do.
  See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
  "
  [ctx & body]
  `(with-release [managed-ctx# ~ctx]
     (in-context managed-ctx#
       ~@body)))
164 | 
(defmacro with-default
  "Convenience entry point: calls [[init]], takes the default [[device]], creates
  a [[context]] on it and evaluates the body in that context via [[with-context]],
  so both the context and the device are released when the body is done.
  See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
  "
  [& body]
  `(do
     (init)
     (with-release [default-dev# (device)]
       (with-context (context default-dev#)
         ~@body))))
175 | 
176 | ;; ================== Memory Management  ==============================================
177 | 
(defn ^:private check-size
  "Checks that `byte-count` bytes starting at `offset` fit in the `bytesize` of `ptr`.
  Calls `dragan-says-ex` with the offset, the requested and the available byte counts
  when the range is out of bounds (including negative offsets or counts); otherwise
  evaluates to `nil`."
  [ptr ^long offset ^long byte-count]
  (let [end (+ offset byte-count)
        available (bytesize ptr)]
    (when-not (<= 0 offset end available)
      (dragan-says-ex "Requested bytes are out of the bounds of this device pointer."
                      {:offset offset :requested byte-count :available available}))))
182 | 
(defn memcpy!
  "Copies memory from `src` to `dst` and returns `dst`.
  When `byte-count` is not given, the smaller of the two byte sizes is copied.
  The three-argument version accepts either a byte count (a number) or a stream
  as its last argument. Both `src` and `dst` are bounds-checked against the
  requested byte count before copying.
  TODO mapped, pinned
  If `hstream` is provided, executes asynchronously.
  See [CUDA Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)
  "
  ([src dst ^long byte-count hstream]
   (check-size src 0 byte-count)
   (check-size dst 0 byte-count)
   (memcpy* dst src byte-count hstream)
   dst)
  ([src dst byte-count-or-stream]
   (if-not (number? byte-count-or-stream)
     ;; Not a number: treat it as a stream and copy as much as fits.
     (memcpy! src dst (min (bytesize src) (bytesize dst)) byte-count-or-stream)
     (do
       (check-size src 0 byte-count-or-stream)
       (check-size dst 0 byte-count-or-stream)
       (memcpy* dst src byte-count-or-stream nil)))
   dst)
  ([src dst]
   (memcpy! src dst (min (bytesize src) (bytesize dst)) nil)))
203 | 
(defn memcpy-to-host!
  "Copies memory from device `src` to host `dst` and returns `dst`.
  When `byte-count` is not given, the smaller of the two byte sizes is copied; the
  three-argument version accepts either an integer byte count or a stream as its last argument.
  Useful when `src` or `dst` is a generic pointer for which it cannot be determined whether
  it manages memory on host or on device (see [[cuda-malloc!]]).
  If `hstream` is provided, executes asynchronously.
  See [CUDA Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)
  "
  ([src dst]
   (memcpy-to-host! src dst (min (bytesize src) (bytesize dst)))
   dst)
  ([src dst count-or-stream]
   (if-not (integer? count-or-stream)
     ;; Not an integer: treat it as a stream and copy as much as fits.
     (memcpy-to-host! src dst (min (bytesize src) (bytesize dst)) count-or-stream)
     (memcpy-to-host! src dst count-or-stream nil))
   dst)
  ([^Pointer src ^Pointer dst ^long byte-count hstream]
   (check-size src 0 byte-count)
   (check-size dst 0 byte-count)
   (let [host-ptr (extract dst)
         device-address (address (extract src))]
     (with-check
       (if hstream
         (cudart/cuMemcpyDtoHAsync host-ptr device-address byte-count hstream)
         (cudart/cuMemcpyDtoH host-ptr device-address byte-count))
       dst))))
227 | 
(defn memcpy-to-device!
  "Copies memory from host `src` to device `dst` and returns `dst`. Copies `byte-count` bytes or,
  when no count is given, the smaller of the byte sizes of `src` and `dst`. Useful when `src` or
  `dst` is a generic pointer for which it cannot be determined whether it manages memory on host or
  on device (see [[cuda-malloc]]).
  In the 3-argument arity, the last argument is the byte count when it is an integer, and a stream
  otherwise. If `hstream` is provided, executes asynchronously.
  See [CUDA Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)
  "
  ([src dst]
   (let [max-bytes (min (bytesize src) (bytesize dst))]
     (memcpy-to-device! src dst max-bytes))
   dst)
  ([src dst count-or-stream]
   (let [explicit-count? (integer? count-or-stream)]
     (memcpy-to-device! src dst
                        (if explicit-count?
                          count-or-stream
                          (min (bytesize src) (bytesize dst)))
                        (when-not explicit-count? count-or-stream)))
   dst)
  ([^Pointer src ^Pointer dst ^long byte-count hstream]
   (check-size src 0 byte-count)
   (check-size dst 0 byte-count)
   (let [device-address (address (extract dst))
         host-ptr (extract src)]
     (with-check
       (if hstream
         (cudart/cuMemcpyHtoDAsync device-address host-ptr byte-count hstream)
         (cudart/cuMemcpyHtoD device-address host-ptr byte-count))
       dst))))
251 | 
(defn memcpy-host!
  "Copies `byte-count` or all possible memory from `src` to `dst`, one of which
  has to be accessible from the host, and returns `dst`. When no byte count is given, the smaller
  of the byte sizes of `src` and `dst` is copied. In the 3-argument arity, the last argument is
  the byte count when it is an integer, and a stream otherwise.
  If a non-nil `hstream` is provided, executes asynchronously; a nil stream always selects
  the synchronous copy.
  A polymorphic function that figures out what needs to be done. Supports everything
  except pointers created by [[cuda-malloc]].
  See [CUDA Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)
  "
  ([src dst ^long byte-count hstream]
   (check-size src 0 byte-count)
   (check-size dst 0 byte-count)
   (if hstream
     (memcpy-host* dst src byte-count hstream)
     (memcpy-host* dst src byte-count))
   dst)
  ([src dst count-or-stream]
   ;; Both cases go through the 4-argument arity, so that a nil stream is dispatched to
   ;; the synchronous variant instead of being handed over as if it were a real stream.
   (if (integer? count-or-stream)
     (memcpy-host! src dst count-or-stream nil)
     (memcpy-host! src dst (min (bytesize src) (bytesize dst)) count-or-stream))
   dst)
  ([src dst]
   (memcpy-host* dst src (min (bytesize src) (bytesize dst)))
   dst))
274 | 
(defn memset!
  "Sets `n` elements or all segments of `dptr` memory to `value` (supports all Java primitive number
  types except `double`, and `long` with value larger than `Integer/MAX_VALUE`), and returns `dptr`.
  When `n` is not given, it is computed as the byte size of `dptr` divided by the size of `value`.
  In the 3-argument arity, the last argument is the element count when it is an integer, and
  a stream otherwise. If a non-nil `hstream` is provided, executes asynchronously; a nil stream
  always selects the synchronous variant.
  See [CUDA Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)
  "
  ([dptr value]
   (memset* value (cu-address* dptr) (quot (bytesize dptr) (sizeof value)))
   dptr)
  ([dptr value n-or-hstream]
   (if (integer? n-or-hstream)
     (do (check-size dptr 0 (* (sizeof value) (long n-or-hstream)))
         (memset* value (cu-address* dptr) n-or-hstream))
     ;; Delegates to the 4-argument arity, which falls back to the synchronous call
     ;; when the stream turns out to be nil.
     (memset! dptr value (quot (bytesize dptr) (sizeof value)) n-or-hstream))
   dptr)
  ([dptr value ^long n hstream]
   (if hstream
     (do (check-size dptr 0 (* (sizeof value) n))
         (memset* value (cu-address* dptr) n hstream))
     (memset! dptr value n))
   dptr))
296 | 
297 | ;; ==================== Driver-managed device memory ===============================================
298 | 
(defn mem-sub-region
  "Creates CUDA device memory object that references a sub-region of `mem` that starts at the byte
  offset `origin` and spans `byte-count` bytes. When `byte-count` is not provided, the sub-region
  covers everything from `origin` to the end of `mem`.
  The requested range is validated with the same bounds check that is used by the memcpy functions.
  "
  ([mem ^long origin ^long byte-count]
   (check-size mem origin byte-count)
   (let-release [sub-dptr (long-pointer 1)]
     (->CUDevicePtr (put-entry! sub-dptr 0 (offset mem origin)) byte-count false)))
  ([mem ^long origin]
   ;; Only the bytes that follow `origin` are available. Requesting the full size of `mem`
   ;; would describe a region that reaches past its end whenever `origin` is positive.
   (mem-sub-region mem origin (- (long (bytesize mem)) origin))))
309 | 
(defn mem-alloc-driver
  "Allocates the `byte-size` bytes of uninitialized memory that will be automatically managed by the
  Unified Memory system, specified by a keyword `flag`. For available flags, see [[internal.constants/mem-attach-flags]].
  When `flag` is not provided, `CU_MEM_ATTACH_GLOBAL` is used. Negative `byte-size` is treated as 0.
  Throws `ExceptionInfo` if `flag` is unknown.
  Returns a CUDA device memory object, which can NOT be extracted as a `Pointer`, but can be accessed
  directly through its address in the device memory.
  See [CUDA Driver API Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)
  "
  ([^long byte-size flag]
   (mem-alloc-managed* (max 0 byte-size)
                       (or (mem-attach-flags flag)
                           (throw (ex-info "Unknown mem-attach flag."
                                           {:flag flag :available mem-attach-flags})))))
  ([^long byte-size]
   ;; Clamped in the same way as in the other arity and in the sibling allocation functions.
   (mem-alloc-managed* (max 0 byte-size) cudart/CU_MEM_ATTACH_GLOBAL)))
324 | 
325 | ;; =================== Runtime API Memory ================================================
326 | 
(defn mem-alloc-runtime
  "Allocates the `byte-size` bytes of uninitialized memory that will be automatically managed by the
  Unified Memory system. Returns a CUDA device memory object managed by the CUDA runtime API, which
  can be extracted as a `Pointer`. Equivalent unwrapped `Pointer` can be created by [[cuda-malloc]].
  Negative `byte-size` is treated as 0. Throws `ExceptionInfo` if the optional `type` is unknown.
  See [CUDA Runtime API Memory Management](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html)
  "
  ([^long byte-size]
   (malloc-runtime* (max 0 byte-size)))
  ([^long byte-size type] ;;TODO functions that receive type should accept size instead of bytesize
   (let [pointer-type (type-pointer type)]
     (when-not pointer-type
       (throw (ex-info (format "Unknown data type: %s." (str type)) {})))
     (malloc-runtime* (max 0 byte-size) pointer-type))))
339 | 
(defn cuda-malloc
  "Returns a `Pointer` to `byte-size` bytes of uninitialized memory that will be automatically
  managed by the Unified Memory system. The pointer is managed by the CUDA runtime API.
  Optionally, accepts a `type` of the pointer as a keyword (`:float` or `Float/TYPE` for
  `FloatPointer`, etc.); an unknown `type` throws `ExceptionInfo` before anything is allocated.
  Negative `byte-size` is treated as 0.
  This pointer has to be manually released by [[cuda-free!]]. For a more seamless experience,
  use the wrapper provided by the [[mem-alloc-runtime]] function.
  See [CUDA Runtime API Memory Management](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html)
  "
  ([^long byte-size]
   (let [n (max 0 byte-size)]
     (let-release [raw (byte-pointer nil)]
       (with-check (cudart/cudaMalloc raw n)
         (capacity! raw n)))))
  ([^long byte-size type]
   (let [wrap (type-pointer type)]
     (when-not wrap
       (throw (ex-info (format "Unknown data type: %s." (str type)) {})))
     (let [n (max 0 byte-size)]
       (let-release [raw (byte-pointer nil)]
         (with-check (cudart/cudaMalloc raw n)
           (wrap (capacity! raw n))))))))
359 | 
(defn cuda-free!
  "Frees the runtime device memory that has been created by [[cuda-malloc]], and returns `dptr`.
  Does nothing when `dptr` is already null.
  See [CUDA Runtime API Memory Management](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html)
  "
  [^Pointer dptr]
  (if (null? dptr)
    dptr
    (do
      ;; The position is reset to 0 before the pointer is handed to cudaFree.
      (with-check (cudart/cudaFree (position! dptr 0))
        (do (.deallocate dptr) (.setNull dptr)))
      dptr)))
369 | 
370 | ;; =================== Pinned and Mapped Memory ================================================
371 | 
(defn mem-alloc-pinned
  "Allocates `byte-size` bytes of uninitialized page-locked memory, 'pinned' on the host, using
  keyword `flags`. For available flags, see [[internal.constants/mem-host-alloc-flags]]; the default
  is `:none`. Optionally, accepts a `type` of the pointer as a keyword (`:float` or `Float/TYPE` for
  `FloatPointer`, etc.). In the 2-argument arity, the second argument is first tried as a type,
  then as a single flag keyword, and finally as a collection of flags that is combined into a mask.
  Negative `byte-size` is treated as 0. Throws `ExceptionInfo` for an unknown flag keyword, or
  (in the 3-argument arity) for an unknown type.
  Pinned memory is optimized for the [[memcpy-host!]] function, while 'mapped' memory is optimized
  for [[memcpy!]].
  See [CUDA Device Driver API Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)
  "
  ([^long byte-size]
   (mem-host-alloc* (max 0 byte-size) 0))
  ([^long byte-size type-or-flags]
   (let [pointer-type (type-pointer type-or-flags)]
     (cond
       pointer-type
       (mem-host-alloc* (max 0 byte-size) 0 pointer-type)

       (keyword? type-or-flags)
       (mem-host-alloc* (max 0 byte-size)
                        (or (mem-host-alloc-flags type-or-flags)
                            (throw (ex-info "Unknown mem-host-alloc flag."
                                            {:flag type-or-flags
                                             :available mem-host-alloc-flags}))))

       :else
       (mem-host-alloc* (max 0 byte-size) (mask mem-host-alloc-flags type-or-flags)))))
  ([^long byte-size type flags]
   (let [pointer-type (type-pointer type)]
     ;; The type is validated before the flags are looked at.
     (when-not pointer-type
       (throw (ex-info (format "Unknown data type: %s." (str type)) {})))
     (mem-host-alloc* (max 0 byte-size)
                      (if-not (keyword? flags)
                        (mask mem-host-alloc-flags flags)
                        (or (mem-host-alloc-flags flags)
                            (throw (ex-info "Unknown mem-host-alloc flag."
                                            {:flag flags :available mem-host-alloc-flags}))))
                      pointer-type))))
402 | 
(defn mem-register-pinned!
  "Registers previously instantiated host pointer, 'pinned' from the device, using
  keyword `flags`. For available flags, see [[internal.constants/mem-host-register-flags]]; the
  default is `:none`. `flags` can be a single keyword, or a collection of flags that is combined
  into a mask; an unknown keyword throws `ExceptionInfo`.
  Returns the pinned object equivalent to the one created by [[mem-alloc-pinned]].
  Pinned memory is optimized for the [[memcpy-host!]] function, while 'mapped' memory is
  optimized for [[memcpy!]].
  See [CUDA Device Driver API Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)
  "
  ([memory]
   (mem-host-register* memory 0))
  ([memory flags]
   (mem-host-register* memory
                       (if-not (keyword? flags)
                         (mask mem-host-register-flags flags)
                         (or (mem-host-register-flags flags)
                             (throw (ex-info "Unknown mem-host-register flag."
                                             {:flag flags
                                              :available mem-host-register-flags})))))))
419 | 
(defn mem-alloc-mapped
  "Allocates `byte-size` bytes of uninitialized host memory, 'mapped' to the device. Optionally,
  accepts a `type` of the pointer as a keyword (`:float` or `Float/TYPE` for `FloatPointer`, etc.).
  Negative `byte-size` is treated as 0. Throws `ExceptionInfo` if `type` is unknown.
  Mapped memory is optimized for the [[memcpy!]] operation, while 'pinned' memory is optimized for
  [[memcpy-host!]].
  See [CUDA Driver API Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)
  "
  ([^long byte-size]
   (mem-alloc-host* (max 0 byte-size)))
  ([^long byte-size type]
   ;; An unknown type is reported in the same way as in the other typed allocation functions,
   ;; instead of passing nil further down.
   (if-let [t (type-pointer type)]
     (mem-alloc-host* (max 0 byte-size) t)
     (throw (ex-info (format "Unknown data type: %s." (str type)) {})))))
431 | 
432 | ;; ================== Module Management =====================================
433 | 
(defn link
  "Invokes the CUDA linker on data provided as a vector `[[type source <options> <name>], ...]`.
  Produces a cubin compiled for a particular Nvidia architecture.
  Without arguments, only creates a fresh link state object.
  Please see relevant examples from the test folder.
  See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html)
  "
  ([]
   (new CUlinkState_st))
  ([data]
   (link data nil))
  ([data options]
   (let-release [link-state (new CUlinkState_st)]
     (link* link-state data options))))
447 | 
(defn link-complete!
  "Completes the link state created by [[link]], so that it can be loaded by the [[module]] function.
  Returns a byte pointer to the resulting image, with its capacity set to the reported size.
  Please see relevant examples from the test folder."
  [^CUlinkState_st link-state]
  (let-release [image (byte-pointer nil)]
    (with-release [image-size (size-t-pointer 1)]
      (with-check
        (cudart/cuLinkComplete link-state image image-size)
        (capacity! image (get-entry image-size 0))))))
457 | 
(defn load!
  "Load module's data from a [[ptx]] string, nvrtc program, java path, or binary `data`
  into the module `m`, and returns `m`.
  Please see relevant examples from the test folder.
  See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html)
  "
  [m data]
  (let [image (safe (pointer data))]
    (module-load* image m)
    m))
466 | 
(defn module
  "Creates a new CUDA module. If `data` (a string, nvrtc program, or binary) is provided,
  loads it into the new module with [[load!]].
  See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html)"
  ([]
   (new CUmod_st))
  ([data]
   (-> (module)
       (load! data))))
474 | 
;; Kernel launch configuration: `grid-x`, `grid-y`, `grid-z` are the grid dimensions and
;; `block-x`, `block-y`, `block-z` are the block dimensions. [[launch!]] passes them,
;; in this order, to `cuLaunchKernel`.
(defrecord GridDim [^long grid-x ^long grid-y ^long grid-z ^long block-x ^long block-y ^long block-z])
476 | 
(defn grid-1d
  "Creates a 1-dimensional [[GridDim]] record with grid and block dimensions x.
  The block size is capped by `dim-x`; when it is not provided, at most 1024 is used.
  Note: dim-x is the total number of threads globally, not the number of blocks."
  ([^long dim-x]
   (grid-1d dim-x (min dim-x 1024)))
  ([^long dim-x ^long block-x]
   (let [threads-x (min dim-x block-x)]
     (->GridDim (count-groups threads-x dim-x) 1 1 threads-x 1 1))))
486 | 
(defn grid-2d
  "Creates a 2-dimensional [[GridDim]] record with grid and block dimensions x and y.
  Block sizes are capped by the respective global dimensions. When they are not provided,
  x gets at most 32, and y gets at most what remains of 1024 threads per block.
  Note: dim-x is the total number of threads globally, not the number of blocks."
  ([^long dim-x ^long dim-y]
   (let [threads-x (min dim-x 32)
         threads-y (min dim-y (quot 1024 threads-x))]
     (grid-2d dim-x dim-y threads-x threads-y)))
  ([^long dim-x ^long dim-y ^long block-x ^long block-y]
   (let [threads-x (min dim-x block-x)
         threads-y (min dim-y block-y)]
     (->GridDim (count-groups threads-x dim-x)
                (count-groups threads-y dim-y)
                1
                threads-x threads-y 1))))
498 | 
(defn grid-3d
  "Creates a 3-dimensional [[GridDim]] record with grid and block dimensions x, y, and z.
  Block sizes are capped by the respective global dimensions. When they are not provided,
  x gets at most 32, while y and z successively get at most what remains of 1024 threads per block.
  Note: dim-x is the total number of threads globally, not the number of blocks."
  ([^long dim-x ^long dim-y ^long dim-z]
   (let [threads-x (min dim-x 32)
         threads-y (min dim-y (quot 1024 threads-x))
         threads-z (min dim-z (quot 1024 (* threads-x threads-y)))]
     (grid-3d dim-x dim-y dim-z threads-x threads-y threads-z)))
  ([dim-x dim-y dim-z block-x block-y block-z]
   (let [threads-x (min (long dim-x) (long block-x))
         threads-y (min (long dim-y) (long block-y))
         threads-z (min (long dim-z) (long block-z))]
     (->GridDim (count-groups threads-x dim-x)
                (count-groups threads-y dim-y)
                (count-groups threads-z dim-z)
                threads-x threads-y threads-z))))
513 | 
(defn global
  "Returns CUDA global device memory object named `name` from module `m`. Global memory is
  typically defined in C++ source files of CUDA kernels. The byte size of the returned object
  is the size reported by `cuModuleGetGlobal`.
  See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html)
  "
  [^CUmod_st m ^String name]
  (let-release [address-ptr (long-pointer 1)]
    (with-release [size-ptr (size-t-pointer 1)]
      (with-check
        (cudart/cuModuleGetGlobal ^LongPointer address-ptr ^SizeTPointer size-ptr m name)
        {:name name}
        (->CUDevicePtr address-ptr (get-entry size-ptr 0) false)))))
526 | 
(defn set-parameter!
  "Sets the `i`th parameter in a parameter array `pp` and the rest of `parameters` in places after `i`.
  Returns `pp`. Throws `ExceptionInfo` as soon as an index falls outside of `pp`."
  [^PointerPointer pp i parameter & parameters]
  (when-not (< -1 (long i) (size pp))
    (throw (ex-info "Index out of bounds." {:requested i :available (size pp)})))
  (set-parameter* parameter (extract pp) i)
  (if (nil? parameters)
    pp
    ;; Continues with the next parameter at the following index.
    (recur pp (inc (long i)) (first parameters) (next parameters))))
536 | 
(defn parameters
  "Creates a `PointerPointer` that holds the given CUDA kernel parameters, one slot per argument.
  A parameter can be any object on device (Device API memory, Runtime API memory, JavaCPP
  pointers), or host (arrays, numbers, JavaCPP pointers) that makes sense as a kernel parameter per
  CUDA specification. Use the result as a parameter argument in [[launch!]].
  "
  ([parameter & parameters]
   (let-release [slots (inc (count parameters))
                 param-array (pointer-pointer slots)]
     (apply set-parameter! param-array 0 parameter parameters))))
547 | 
548 | ;; ====================== Execution Control ==================================
549 | 
(defn function
  "Looks up the CUDA kernel function named `name` in module `m` and returns it.
  See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html)
  "
  [^CUmod_st m ^String name]
  (let [kernel (new CUfunc_st)]
    (with-check (cudart/cuModuleGetFunction kernel m name)
      {:name name}
      kernel)))
557 | 
(defn launch!
  "Invokes the kernel `fun` on a `grid-dim` grid of blocks, using `params` `PointerPointer`.
  Optionally, you can specify the amount of shared memory that will be available to each thread block
  (`shared-mem-bytes`, 0 when omitted), and `hstream` to use for execution (nil when omitted).
  Returns `hstream`.
  See [CUDA Execution Control](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html)
  "
  ([^CUfunc_st fun ^GridDim grid-dim shared-mem-bytes ^CUstream_st hstream ^PointerPointer params]
   (with-check
     ;; Grid dimensions come first, then block dimensions; the last argument is always nil here.
     (cudart/cuLaunchKernel fun (.grid-x grid-dim) (.grid-y grid-dim) (.grid-z grid-dim)
                            (.block-x grid-dim) (.block-y grid-dim) (.block-z grid-dim)
                            (int shared-mem-bytes) hstream params nil)
     ;; Details of this launch that are handed to `with-check` along with the status.
     {:kernel (info fun) :grid-dim grid-dim :hstream (info hstream)}
     hstream))
  ([^CUfunc_st fun ^GridDim grid-dim hstream params]
   (launch! fun grid-dim 0 hstream params))
  ([^CUfunc_st fun ^GridDim grid-dim params]
   (launch! fun grid-dim 0 nil params)))
575 | 
576 | ;; ================== Stream Management ======================================
577 | 
(defn stream
  "Creates a stream using an optional integer `priority` and a keyword `flag`.
  Without arguments, creates a stream with `CU_STREAM_DEFAULT` flag.
  Throws `ExceptionInfo` if `flag` is not one of the available flags.
  For available flags, see [[internal.constants/stream-flags]]
  See [CUDA Stream Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html)
  "
  ([]
   (stream* cudart/CU_STREAM_DEFAULT))
  ([flag]
   (stream* (or (stream-flags flag)
                (throw (ex-info "Invalid stream flag." {:flag flag :available stream-flags})))))
  ([^long priority flag]
   ;; Same message as in the single-argument arity.
   (stream* priority (or (stream-flags flag)
                         (throw (ex-info "Invalid stream flag."
                                         {:flag flag :available stream-flags}))))))
592 | 
;; The docstring has to be given to `def` itself: a metadata map placed in front of the
;; init expression is attached to that expression and never reaches the var.
;; `:const` is not set on the var; it had no effect in its previous position either.
(def default-stream
  "The default per-thread stream."
  cudart/CU_STREAM_PER_THREAD)
597 | 
(defn ready?
  "Checks whether a compute stream or event `obj` is ready: returns true when the status
  reported for it equals `CUDA_SUCCESS`, and false otherwise.
  See [CUDA Stream Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html)
  and [CUDA Event Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html)
  "
  [obj]
  (let [status (ready* (extract obj))]
    (= cudart/CUDA_SUCCESS status)))
605 | 
(defn synchronize!
  "Blocks the current thread until the context's or `hstream`'s tasks complete.
  Returns `hstream` when it is provided, and true for the context-wide variant."
  ([^CUstream_st hstream]
   (with-check (cudart/cuStreamSynchronize hstream)
     hstream))
  ([]
   (with-check (cudart/cuCtxSynchronize)
     true)))
612 | 
(defn add-host-fn!
  "Adds host function `f` to a compute stream, with optional `data` related to the call.
  If `data` is not provided, places `hstream` under data. Returns `hstream`.
  "
  ([hstream f]
   (add-host-fn! hstream f hstream))
  ([hstream f data]
   (add-host-fn* hstream f data)
   hstream))
623 | 
(defn listen!
  "Adds a host function listener to a compute stream, with optional `data` related to the call,
  and connects it to a Clojure channel `ch`. If `data` is not provided, places `hstream` under data.
  Returns `hstream`.
  "
  ([hstream ch]
   (let [callback (host-fn* hstream ch)]
     (add-host-fn* hstream callback hstream))
   hstream)
  ([hstream ch data]
   (let [payload (safe (pointer data))
         callback (host-fn* payload ch)]
     (add-host-fn* hstream callback payload))
   hstream))
635 | 
(defn wait-event!
  "Makes a compute stream `hstream` wait on an event `ev`. Returns `hstream`.
  See [CUDA Event Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html)
  "
  [^CUstream_st hstream ^CUevent_st ev]
  ;; The last argument of cuStreamWaitEvent is always passed as 0 here.
  (with-check (cudart/cuStreamWaitEvent hstream ev 0) hstream))
642 | 
(defn attach-mem!
  "Attaches memory `mem` of size `byte-size`, specified by `flag` to a `hstream` asynchronously.
  For available flags, see [[internal.constants/mem-attach-flags]]. The default is `:single`.
  If :global flag is specified, the memory can be accessed by any stream on any device.
  If :host flag is specified, the program makes a guarantee that it won't access the memory on
  the device from any stream on a device that has no `concurrent-managed-access` capability.
  If :single flag is specified and `hstream` is associated with a device that has no
  `concurrent-managed-access` capability, the program makes a guarantee that it will only access
  the memory on the device from `hstream`. It is illegal to attach singly to the nil stream,
  because the nil stream is a virtual global stream and not a specific stream. An error will
  be returned in this case.

  When memory is associated with a single stream, the Unified Memory system will allow CPU access
  to this memory region so long as all operations in `hstream` have completed, regardless of whether
  other streams are active. In effect, this constrains exclusive ownership of the managed memory
  region by an active GPU to per-stream activity instead of whole-GPU activity.

  When `hstream` is not provided, [[default-stream]] is used. Returns `hstream`.

  See [CUDA Stream Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html)."
  ([^CUstream_st hstream mem ^long byte-size flag]
   ;; When no stream can be extracted, only the :global flag is let through (with a nil stream);
   ;; any other flag is rejected before the flag itself is validated.
   (attach-mem* (or (extract hstream)
                    (when-not (= :global flag)
                      (throw (ex-info "nil stream is a virtual global stream and not a specific stream that may be only used with :global mem-attach flag."
                                      {:flag flag :available mem-attach-flags}))))
                (cu-address* mem) byte-size
                (or (mem-attach-flags flag)
                    (throw (ex-info "Unknown mem-attach flag."
                                    {:flag flag :available mem-attach-flags}))))
   hstream)
  ([mem byte-size flag]
   (attach-mem! default-stream mem byte-size flag)))
673 | 
674 | ;; ================== Event Management =======================================
675 | 
(defn event
  "Creates an event specified by keyword `flags`. A single flag is looked up directly (and throws
  `ExceptionInfo` if unknown), while several flags are combined into a mask. Without arguments,
  `CU_EVENT_DEFAULT` is used. For available flags, see
  [[internal.constants/event-flags]].
  See [CUDA Event Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html)
  "
  ([]
   (event* cudart/CU_EVENT_DEFAULT))
  ([flag & flags]
   (event* (if-not flags
             (or (event-flags flag)
                 (throw (ex-info "Unknown event flag." {:flag flag :available event-flags})))
             (mask event-flags (cons flag flags))))))
688 | 
(defn elapsed-time!
  "Computes the elapsed time in milliseconds between `start-event` and `end-event`.
  See [CUDA Event Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html)
  "
  ^double [^CUevent_st start-event ^CUevent_st end-event]
  ;; cuEventElapsedTime writes its result into a one-element float array.
  (let [millis (float-array 1)]
    (with-check (cudart/cuEventElapsedTime millis start-event end-event)
      (aget millis 0))))
696 | 
(defn record!
  "Records an event `event` on optional `stream`. Returns the `stream` it was given.
  When `stream` is not provided, the event is recorded with a nil stream, and
  [[default-stream]] is returned.
  See [CUDA Event Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html)
  "
  ([^CUstream_st stream ^CUevent_st event]
   (with-check (cudart/cuEventRecord event stream) stream))
  ([^CUevent_st event]
   ;; NOTE(review): recorded with nil, but `default-stream` is what gets returned — confirm
   ;; that this difference is intended.
   (with-check (cudart/cuEventRecord event nil) default-stream)))
705 | 
706 | ;; ================== Peer Context Memory Access =============================
707 | 
(defn can-access-peer
  "Queries if a device `dev` may directly access the memory of a `peer` device.
  See [CUDA Peer Access Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html)
  "
  [dev peer]
  (let [dev-handle (extract dev)
        peer-handle (extract peer)]
    (can-access-peer* dev-handle peer-handle)))
714 | 
(defn p2p-attribute
  "Queries the `attribute` of the link between two devices, `dev` and `peer`.
  Throws `ExceptionInfo` if `attribute` is unknown.
  See [CUDA Peer Access Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html)
  "
  [dev peer attribute]
  (let [dev-handle (extract dev)
        peer-handle (extract peer)
        attribute-code (or (p2p-attributes attribute)
                           (throw (ex-info "Unknown p2p attribute."
                                           {:attribute attribute :available p2p-attributes})))]
    (p2p-attribute* dev-handle peer-handle attribute-code)))
723 | 
(defn disable-peer-access!
  "Disables direct access to memory allocations in a peer context `ctx` and unregisters any
  registered allocations. When `ctx` is not provided, the current context is used.
  Returns `ctx`.
  See [CUDA Peer Access Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html)
  "
  ([]
   (disable-peer-access! (current-context)))
  ([ctx]
   (with-check (cudart/cuCtxDisablePeerAccess ctx)
     ctx)))
732 | 
(defn enable-peer-access!
  "Enables direct access to memory allocations in a peer context `ctx`. When `ctx` is not
  provided, the current context is used. Returns `ctx`.
  See [CUDA Peer Access Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html)
  "
  ([ctx]
   ;; The flags argument of cuCtxEnablePeerAccess is always passed as 0.
   (with-check (cudart/cuCtxEnablePeerAccess ctx 0) ctx))
  ([]
   (enable-peer-access! (current-context))))
742 | 
743 | ;; ====================== Nvrtc program JIT ========================================
744 | 
(defn program
  "Creates a CUDA program from the `source-code`, with an optional `name` and an optional
  hash map of `headers` (as strings) and their names. The values and the keys of `headers`
  are handed over as two separate arrays of strings.
  "
  ([source-code]
   (program nil source-code nil))
  ([source-code headers]
   (program nil source-code headers))
  ([^String name ^String source-code headers]
   (let [name-ptr (string-pointer name)
         source-ptr (string-pointer source-code)
         header-values (pointer-pointer (into-array String (vals headers)))
         header-keys (pointer-pointer (into-array String (keys headers)))]
     (program* name-ptr source-ptr header-values header-keys))))
757 | 
(defn program-log
  "Returns the log string generated by the previous compilation of `prog`.
  Useful for inspecting what went wrong after [[compile!]]."
  [prog]
  (program-log* prog))
762 | 
(defn compile!
  "Compiles the given `prog` using an optional list of string `options`, and returns `prog`."
  ([prog]
   (compile! prog nil))
  ([prog options]
   (let [option-ptrs (pointer-pointer (into-array String options))]
     (compile* prog option-ptrs))
   prog))
770 | 
(defn ptx
  "Returns the PTX generated by the previous compilation of `prog`.
  The result can be loaded into a module (see [[load!]])."
  [prog]
  (ptx* prog))
775 | 


--------------------------------------------------------------------------------
/src/clojure/uncomplicate/clojurecuda/info.clj:
--------------------------------------------------------------------------------
  1 | ;;   Copyright (c) Dragan Djuric. All rights reserved.
  2 | ;;   The use and distribution terms for this software are covered by the
  3 | ;;   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
  4 | ;;   which can be found in the file LICENSE at the root of this distribution.
  5 | ;;   By using this software in any fashion, you are agreeing to be bound by
  6 | ;;   the terms of this license.
  7 | ;;   You must not remove this notice, or any other, from this software.
  8 | 
  9 | (ns ^{:author "Dragan Djuric"}
 10 |     uncomplicate.clojurecuda.info
 11 |   "Info functions for all CUDA objects (devices, etc...).
 12 |   "
 13 |   (:require [clojure.string :as str]
 14 |             [uncomplicate.commons.core :refer [with-release Info]]
 15 |             [uncomplicate.fluokitten.core :refer [fmap op]]
 16 |             [uncomplicate.clojure-cpp :as cpp
 17 |              :refer [int-pointer byte-pointer size-t-pointer get-string get-entry]]
 18 |             [uncomplicate.clojurecuda.internal
 19 |              [constants :refer [ctx-limits dec-compute-mode dec-func-cache-config dec-shared-config
 20 |                                 dec-stream-flag func-cache-config shared-config-map]]
 21 |              [utils :refer [with-check maybe]]
 22 |              [impl :refer [current-context* ->CUDevice]]])
 23 |   (:import [org.bytedeco.cuda.global cudart]
 24 |            [org.bytedeco.cuda.cudart CUctx_st CUfunc_st CUstream_st]
 25 |            [uncomplicate.clojurecuda.internal.impl CUDevice]))
 26 | 
 27 | ;; =================== Info* utility macros ===============================
 28 | 
(defmacro ^:private info-attribute*
  "Expands into a call of `method` with a fresh one-element int pointer, `attribute`, and `object`
  (in that order), checked by `with-check`. The queried entry is returned as a long, and the
  temporary pointer is released afterwards."
  [method object attribute]
  `(long (with-release [res# (int-pointer 1)]
           (with-check (~method res# ~attribute ~object)
             (get-entry res# 0)))))
 33 | 
 34 | ;; =================== Version Management =================================
 35 | 
 36 | (defn driver-version ^long []
 37 |   (with-release [res (int-pointer 1)]
 38 |     (with-check (cudart/cuDriverGetVersion res) (get-entry res 0))))
 39 | 
 40 | ;; =================== Device info  =======================================
 41 | 
 42 | (defn device-name [^CUDevice device]
 43 |   (with-release [res (byte-pointer 64)]
 44 |     (with-check (cudart/cuDeviceGetName res 64 (.dev device))
 45 |       (clojure.string/replace (get-string res) #" " ""))))
 46 | 
 47 | (defn total-mem [^CUDevice device]
 48 |   (with-release [res (size-t-pointer 1)]
 49 |     (with-check (cudart/cuDeviceTotalMem res (.dev device))
 50 |       (get-entry res 0))))
 51 | 
(defn async-engine-count
  "Returns the `CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT` attribute of `device` as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT))
 55 | 
(defn can-map-host-memory
  "Returns true if the `CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY` attribute of `device` is positive."
  [^CUDevice device]
  (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                         cudart/CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY)))
 59 | 
(defn clock-rate
  "Returns the `CU_DEVICE_ATTRIBUTE_CLOCK_RATE` attribute of `device` as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_CLOCK_RATE))
 63 | 
(defn compute-capability-major
  "Returns the `CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR` attribute of `device` as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR))
 67 | 
 68 | (defn compute-capability-minor ^long [^CUDevice device]
 69 |   (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
 70 |                    cudart/CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR))
 71 | 
 72 | (defn compute-mode [^CUDevice device]
 73 |   (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
 74 |                    cudart/CU_DEVICE_ATTRIBUTE_COMPUTE_MODE))
 75 | 
 76 | (defn concurrent-kernels ^long [^CUDevice device]
 77 |   (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
 78 |                    cudart/CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS))
 79 | 
 80 | (defn ecc-enabled [^CUDevice device]
 81 |   (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
 82 |                          cudart/CU_DEVICE_ATTRIBUTE_ECC_ENABLED)))
 83 | 
 84 | (defn global-L1-cache-supported [^CUDevice device]
 85 |   (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
 86 |                          cudart/CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED)))
 87 | 
 88 | (defn global-memory-bus-width ^long [^CUDevice device]
 89 |   (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
 90 |                    cudart/CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH))
 91 | 
 92 | (defn integrated [^CUDevice device]
 93 |   (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
 94 |                          cudart/CU_DEVICE_ATTRIBUTE_INTEGRATED)))
 95 | 
 96 | (defn kernel-exec-timeout [^CUDevice device]
 97 |   (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
 98 |                          cudart/CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT)))
 99 | 
(defn L2-cache-size
  "Queries `CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE))
103 | 
(defn local-L1-cache-supported
  "Returns true when `CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED`, queried for
  `device` through `cuDeviceGetAttribute`, is positive; false otherwise."
  [^CUDevice device]
  (-> (info-attribute* cudart/cuDeviceGetAttribute
                       (.dev device)
                       cudart/CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED)
      pos?))
107 | 
(defn managed-memory
  "Returns true when `CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY`, queried for `device`
  through `cuDeviceGetAttribute`, is positive; false otherwise."
  [^CUDevice device]
  (-> (info-attribute* cudart/cuDeviceGetAttribute
                       (.dev device)
                       cudart/CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)
      pos?))
111 | 
(defn concurrent-managed-access
  "Returns true when `CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`, queried for
  `device` through `cuDeviceGetAttribute`, is positive; false otherwise."
  [^CUDevice device]
  (-> (info-attribute* cudart/cuDeviceGetAttribute
                       (.dev device)
                       cudart/CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)
      pos?))
115 | 
(defn max-block-dim-x
  "Queries `CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X))
119 | 
(defn max-block-dim-y
  "Queries `CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y))
123 | 
(defn max-block-dim-z
  "Queries `CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z))
127 | 
(defn max-grid-dim-x
  "Queries `CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X))
131 | 
(defn max-grid-dim-y
  "Queries `CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y))
135 | 
(defn max-grid-dim-z
  "Queries `CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z))
139 | 
(defn max-pitch
  "Queries `CU_DEVICE_ATTRIBUTE_MAX_PITCH` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_PITCH))
143 | 
(defn max-registers-per-block
  "Queries `CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK))
147 | 
(defn max-registers-per-multiprocessor
  "Queries `CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR))
151 | 
(defn max-shared-memory-per-block
  "Queries `CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK))
155 | 
(defn max-shared-memory-per-multiprocessor
  "Queries `CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR))
159 | 
(defn max-threads-per-block
  "Queries `CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
163 | 
(defn max-threads-per-multiprocessor
  "Queries `CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR))
167 | 
(defn maximum-surface1d-layered-layers
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS))
171 | 
(defn maximum-surface1d-layered-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH))
175 | 
(defn maximum-surface1d-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH))
179 | 
(defn maximum-surface2d-height
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT))
183 | 
(defn maximum-surface2d-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH))
187 | 
(defn maximum-surface2d-layered-height
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT))
191 | 
(defn maximum-surface2d-layered-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH))
195 | 
(defn maximum-surface2d-layered-layers
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS))
199 | 
(defn maximum-surface3d-depth
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH))
203 | 
(defn maximum-surface3d-height
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT))
207 | 
(defn maximum-surface3d-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH))
211 | 
(defn maximum-surfacecubemap-layered-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH))
215 | 
(defn maximum-surfacecubemap-layered-layers
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS))
219 | 
(defn maximum-surfacecubemap-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH))
223 | 
(defn maximum-texture1d-layered-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH))
227 | 
(defn maximum-texture1d-layered-layers
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS))
231 | 
(defn maximum-texture1d-linear-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH))
235 | 
(defn maximum-texture1d-mipmapped-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH))
239 | 
(defn maximum-texture1d-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH))
243 | 
(defn maximum-texture2d-height
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT))
247 | 
(defn maximum-texture2d-layered-height
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT))
251 | 
(defn maximum-texture2d-layered-layers
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS))
255 | 
(defn maximum-texture2d-linear-height
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT))
259 | 
(defn maximum-texture2d-linear-pitch
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH))
263 | 
(defn maximum-texture2d-linear-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH))
267 | 
(defn maximum-texture2d-mipmapped-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH))
271 | 
(defn maximum-texture2d-mipmapped-height
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT))
275 | 
(defn maximum-texture2d-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH))
279 | 
(defn maximum-texture3d-depth
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH))
283 | 
(defn maximum-texture3d-depth-alternate
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE))
287 | 
(defn maximum-texture3d-height
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT))
291 | 
(defn maximum-texture3d-height-alternate
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE))
295 | 
(defn maximum-texture3d-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH))
299 | 
(defn maximum-texture3d-width-alternate
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE))
303 | 
(defn maximum-texturecubemap-layered-layers
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS))
307 | 
(defn maximum-texturecubemap-layered-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH` for `device`
  through `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH))
311 | 
(defn maximum-texturecubemap-width
  "Queries `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH))
315 | 
(defn memory-clock-rate
  "Queries `CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE))
319 | 
(defn multi-gpu-board
  "Returns true when `CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD`, queried for `device`
  through `cuDeviceGetAttribute`, is positive; false otherwise."
  [^CUDevice device]
  (-> (info-attribute* cudart/cuDeviceGetAttribute
                       (.dev device)
                       cudart/CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD)
      pos?))
323 | 
(defn multi-gpu-board-group-id
  "Queries `CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID))
327 | 
(defn multiprocessor-count
  "Queries `CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
331 | 
(defn pci-bus-id
  "Queries `CU_DEVICE_ATTRIBUTE_PCI_BUS_ID` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_PCI_BUS_ID))
335 | 
(defn pci-bus-id-string
  "Returns the PCI bus id string of `device`.

  `cuDeviceGetPCIBusId` fills a 13-byte buffer; 12 bytes are then passed through
  `cpp/memcpy!` together with a second, 12-byte buffer, and the string is read from
  that second buffer. NOTE(review): presumably this drops the trailing NUL byte and
  `memcpy!` takes source first, destination second - confirm against clojure-cpp."
  [^CUDevice device]
  (with-release [raw-id (byte-pointer 13)
                 trimmed-id (byte-pointer 12)]
    (with-check (cudart/cuDeviceGetPCIBusId raw-id 13 (.dev device))
      (do
        (cpp/memcpy! raw-id trimmed-id 12)
        (get-string trimmed-id)))))
343 | 
(defn pci-device-id
  "Queries `CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID))
347 | 
(defn pci-domain-id
  "Queries `CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID))
351 | 
(defn stream-priorities-supported
  "Returns true when `CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED`, queried for
  `device` through `cuDeviceGetAttribute`, is positive; false otherwise."
  [^CUDevice device]
  (-> (info-attribute* cudart/cuDeviceGetAttribute
                       (.dev device)
                       cudart/CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED)
      pos?))
355 | 
(defn surface-alignment
  "Queries `CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT))
359 | 
(defn tcc-driver
  "Returns true when `CU_DEVICE_ATTRIBUTE_TCC_DRIVER`, queried for `device`
  through `cuDeviceGetAttribute`, is positive; false otherwise."
  [^CUDevice device]
  (-> (info-attribute* cudart/cuDeviceGetAttribute
                       (.dev device)
                       cudart/CU_DEVICE_ATTRIBUTE_TCC_DRIVER)
      pos?))
363 | 
(defn texture-alignment
  "Queries `CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT))
367 | 
(defn texture-pitch-alignment
  "Queries `CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT))
371 | 
(defn total-constant-memory
  "Queries `CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY))
375 | 
(defn unified-addressing
  "Returns true when `CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING`, queried for `device`
  through `cuDeviceGetAttribute`, is positive; false otherwise."
  [^CUDevice device]
  (-> (info-attribute* cudart/cuDeviceGetAttribute
                       (.dev device)
                       cudart/CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)
      pos?))
379 | 
(defn warp-size
  "Queries `CU_DEVICE_ATTRIBUTE_WARP_SIZE` for `device` through
  `cuDeviceGetAttribute` and returns the value as a long."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_WARP_SIZE))
383 | 
;; Maps each device info keyword to the one-argument function that reads it from a
;; `CUDevice`. The `Info` implementation for `CUDevice` looks single attributes up
;; here and maps over the whole table to build the full info map.
;; `:compute-mode` is the only entry that decodes its raw value (via `dec-compute-mode`).
(def ^:no-doc
  device-attributes
  {:name device-name
   :total-mem total-mem
   :async-engine-count async-engine-count
   :can-map-host-memory can-map-host-memory
   :clock-rate clock-rate
   :compute-capability-major compute-capability-major
   :compute-capability-minor compute-capability-minor
   :compute-mode (comp dec-compute-mode compute-mode)
   :concurrent-kernels concurrent-kernels
   :ecc-enabled ecc-enabled
   :global-L1-cache-supported global-L1-cache-supported
   :global-memory-bus-width global-memory-bus-width
   :integrated integrated
   :kernel-exec-timeout kernel-exec-timeout
   :L2-cache-size L2-cache-size
   :local-L1-cache-supported local-L1-cache-supported
   :managed-memory managed-memory
   ;; Previously missing although the getter is defined in this namespace.
   :concurrent-managed-access concurrent-managed-access
   :max-block-dim-x max-block-dim-x
   :max-block-dim-y max-block-dim-y
   :max-block-dim-z max-block-dim-z
   :max-grid-dim-x max-grid-dim-x
   :max-grid-dim-y max-grid-dim-y
   :max-grid-dim-z max-grid-dim-z
   :max-pitch max-pitch
   :max-registers-per-block max-registers-per-block
   :max-registers-per-multiprocessor max-registers-per-multiprocessor
   :max-shared-memory-per-block max-shared-memory-per-block
   :max-shared-memory-per-multiprocessor max-shared-memory-per-multiprocessor
   :max-threads-per-block max-threads-per-block
   :max-threads-per-multiprocessor max-threads-per-multiprocessor
   :maximum-surface1d-layered-layers maximum-surface1d-layered-layers
   :maximum-surface1d-layered-width maximum-surface1d-layered-width
   :maximum-surface1d-width maximum-surface1d-width
   :maximum-surface2d-height maximum-surface2d-height
   :maximum-surface2d-width maximum-surface2d-width
   :maximum-surface2d-layered-height maximum-surface2d-layered-height
   :maximum-surface2d-layered-width maximum-surface2d-layered-width
   :maximum-surface2d-layered-layers maximum-surface2d-layered-layers
   :maximum-surface3d-depth maximum-surface3d-depth
   :maximum-surface3d-height maximum-surface3d-height
   :maximum-surface3d-width maximum-surface3d-width
   :maximum-surfacecubemap-layered-width maximum-surfacecubemap-layered-width
   :maximum-surfacecubemap-layered-layers maximum-surfacecubemap-layered-layers
   :maximum-surfacecubemap-width maximum-surfacecubemap-width
   :maximum-texture1d-layered-width maximum-texture1d-layered-width
   :maximum-texture1d-layered-layers maximum-texture1d-layered-layers
   :maximum-texture1d-linear-width maximum-texture1d-linear-width
   :maximum-texture1d-mipmapped-width maximum-texture1d-mipmapped-width
   :maximum-texture1d-width maximum-texture1d-width
   :maximum-texture2d-height maximum-texture2d-height
   :maximum-texture2d-layered-height maximum-texture2d-layered-height
   :maximum-texture2d-layered-layers maximum-texture2d-layered-layers
   :maximum-texture2d-linear-height maximum-texture2d-linear-height
   :maximum-texture2d-linear-pitch maximum-texture2d-linear-pitch
   :maximum-texture2d-linear-width maximum-texture2d-linear-width
   :maximum-texture2d-mipmapped-width maximum-texture2d-mipmapped-width
   :maximum-texture2d-mipmapped-height maximum-texture2d-mipmapped-height
   :maximum-texture2d-width maximum-texture2d-width
   :maximum-texture3d-depth maximum-texture3d-depth
   :maximum-texture3d-depth-alternate maximum-texture3d-depth-alternate
   :maximum-texture3d-height maximum-texture3d-height
   :maximum-texture3d-height-alternate maximum-texture3d-height-alternate
   :maximum-texture3d-width maximum-texture3d-width
   :maximum-texture3d-width-alternate maximum-texture3d-width-alternate
   :maximum-texturecubemap-layered-layers maximum-texturecubemap-layered-layers
   :maximum-texturecubemap-layered-width maximum-texturecubemap-layered-width
   :maximum-texturecubemap-width maximum-texturecubemap-width
   :memory-clock-rate memory-clock-rate
   :multi-gpu-board multi-gpu-board
   :multi-gpu-board-group-id multi-gpu-board-group-id
   :multiprocessor-count multiprocessor-count
   :pci-bus-id pci-bus-id
   :pci-bus-id-string pci-bus-id-string
   :pci-device-id pci-device-id
   :pci-domain-id pci-domain-id
   :stream-priorities-supported stream-priorities-supported
   :surface-alignment surface-alignment
   :tcc-driver tcc-driver
   :texture-alignment texture-alignment
   :texture-pitch-alignment texture-pitch-alignment
   :total-constant-memory total-constant-memory
   :unified-addressing unified-addressing
   :warp-size warp-size})
469 | 
(extend-type CUDevice
  Info
  (info
    ;; Single attribute: look the reader up in `device-attributes` and apply it to
    ;; the device under `maybe`; an unknown keyword throws.
    ([dev attr-key]
     (let [query (get device-attributes attr-key)]
       (when-not query
         (throw (ex-info "Unknown attribute." {:attribute attr-key})))
       (maybe (query dev))))
    ;; All attributes: apply every reader in `device-attributes` to the device,
    ;; each one individually wrapped in `maybe`.
    ([dev]
     (fmap (fn [query] (maybe (query dev)))
           device-attributes))))
479 | 
480 | ;; =======================  Context Info ==================================
481 | 
(defn api-version
  "Returns the API version reported by `cuCtxGetApiVersion` for context `ctx`.
  Without arguments, the context returned by `current-context*` is used."
  ([^CUctx_st ctx]
   (with-release [version (int-pointer 1)]
     (with-check (cudart/cuCtxGetApiVersion ctx version)
       (get-entry version 0))))
  ([]
   (api-version (current-context*))))
491 | 
(defn cache-config
  "Reads the cache configuration of the current context with `cuCtxGetCacheConfig`
  and decodes the numeric result with `dec-func-cache-config`.

  See [cuCtxGetCacheConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  []
  (with-release [cfg (int-pointer 1)]
    (-> (with-check (cudart/cuCtxGetCacheConfig cfg)
          (get-entry cfg 0))
        dec-func-cache-config)))
500 | 
(defn limit*
  "Reads or writes the resource limit identified by the integer `limit`.

  With one argument, returns the value obtained through `cuCtxGetLimit`.
  With two arguments, sets the limit to `value` through `cuCtxSetLimit` and
  returns `value`.

  See [cuCtxGetLimit](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  (^long [limit]
   (with-release [current (size-t-pointer 1)]
     (with-check (cudart/cuCtxGetLimit current limit)
       (get-entry current 0))))
  (^long [limit ^long value]
   (with-check (cudart/cuCtxSetLimit limit value)
     value)))
511 | 
(defn limit
  "Looks keyword `limit` up in `ctx-limits` and returns the current value of that
  resource limit (see `limit*`). Throws an `ex-info` carrying `:limit` and
  `:available` when the keyword is not found.

  Supported limits are: `stack-size`, `malloc-heap-size`, `printf-fifo-size`, `dev-runtime-sync-depth`,
  `dev-runtime-pending-launch-count`.

  See [cuCtxGetLimit](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  ^long [limit]
  (if-let [limit-code (ctx-limits limit)]
    (limit* limit-code)
    (throw (ex-info "Unknown limit." {:limit limit :available ctx-limits}))))
522 | 
(defn limit!
  "Looks keyword `limit` up in `ctx-limits` and sets that resource limit to `value`
  (see the two-argument arity of `limit*`), returning `value`. Throws an `ex-info`
  carrying `:limit` and `:available` when the keyword is not found.

  Supported limits are: `stack-size`, `malloc-heap-size`, `printf-fifo-size`, `dev-runtime-sync-depth`,
  `dev-runtime-pending-launch-count`.

  See [cuCtxSetLimit](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  ^long [limit ^long value]
  (if-let [limit-code (ctx-limits limit)]
    (limit* limit-code value)
    (throw (ex-info "Unknown limit." {:limit limit :available ctx-limits}))))
534 | 
(defn ctx-device
  "Asks `cuCtxGetDevice` for the device of the current context and wraps the
  resulting value in a `CUDevice`."
  []
  (with-release [dev-id (int-pointer 1)]
    (with-check (cudart/cuCtxGetDevice dev-id)
      (-> (get-entry dev-id 0) ->CUDevice))))
540 | 
(defn shared-config*
  "Low-level access to the shared memory configuration, using integer codes.

  - no arguments: returns the value read with `cuCtxGetSharedMemConfig`;
  - `config`: applies it with `cuCtxSetSharedMemConfig` and returns `config`;
  - `func` and `config`: applies it to kernel `func` with `cuFuncSetSharedMemConfig`
    and returns `func`.

  See [cuCtxGetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  See [cuCtxSetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  See [cuFuncSetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html)
  "
  (^long []
   (with-release [current (int-pointer 1)]
     (with-check (cudart/cuCtxGetSharedMemConfig current)
       (get-entry current 0))))
  (^long [^long config]
   (with-check (cudart/cuCtxSetSharedMemConfig config)
     config))
  ([^CUfunc_st func ^long config]
   (with-check (cudart/cuFuncSetSharedMemConfig func config)
     func)))
555 | 
(defn shared-config
  "Reads the shared memory configuration code of the current context with
  `shared-config*` and decodes it with `dec-shared-config`.

  See [cuCtxGetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  []
  (-> (shared-config*)
      dec-shared-config))
563 | 
(defn shared-config!
  "Sets the shared memory configuration given as keyword `config`, either for the
  current context or, when `func` is supplied, for that kernel function.

  `config` is translated through `shared-config-map`. An unknown keyword throws an
  `ex-info` whose data holds the offending `:config` and the `:available` mapping.
  Returns whatever the matching arity of `shared-config*` returns.

  See [cuCtxSetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  See [cuFuncSetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html)
  "
  ([config]
   ;; The exception must be thrown; without `throw` the ex-info object itself
   ;; would be handed to `shared-config*` as the numeric config.
   (shared-config* (or (shared-config-map config)
                       (throw (ex-info "Unknown config."
                                       {:config config :available shared-config-map})))))
  ([func config]
   (shared-config* func (or (shared-config-map config)
                            (throw (ex-info "Unknown config."
                                            {:config config :available shared-config-map}))))))
576 | 
(defn stream-priority-range
  "Returns a two-element vector holding the two values written by
  `cuCtxGetStreamPriorityRange`: the least stream priority first, the greatest second.

  See [cuCtxGetStreamPriorityRange](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  []
  (with-release [lowest (int-pointer 1)
                 highest (int-pointer 1)]
    (with-check (cudart/cuCtxGetStreamPriorityRange lowest highest)
      (vector (get-entry lowest 0) (get-entry highest 0)))))
587 | 
(extend-type CUctx_st
  Info
  (info
    ;; Single item. The context argument is ignored: every query below is issued
    ;; without a context argument (`api-version` falls back to `current-context*`).
    ;; Unknown keys yield nil.
    ([_ info-type]
     (maybe
      (case info-type
        :api-version (api-version)
        :cache-config (cache-config)
        :stack-size (limit* cudart/CU_LIMIT_STACK_SIZE)
        :malloc-heap-size (limit* cudart/CU_LIMIT_MALLOC_HEAP_SIZE)
        :printf-fifo-size (limit* cudart/CU_LIMIT_PRINTF_FIFO_SIZE)
        :dev-runtime-sync-depth (limit* cudart/CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH)
        :dev-runtime-pending-launch-count (limit* cudart/CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT)
        :limits (fmap #(maybe (limit* %)) ctx-limits)
        :device (ctx-device)
        :shared-config (shared-config)
        :stream-priority-range (stream-priority-range)
        nil)))
    ;; Full map. Every entry is wrapped in `maybe` individually so that a single
    ;; failing query does not abort the whole report; the limits are merged in
    ;; with `op`.
    ([_]
     (op {:api-version (maybe (api-version))
          :cache-config (maybe (cache-config))
          :device (maybe (ctx-device))
          :shared-config (maybe (shared-config))
          :stream-priority-range (maybe (stream-priority-range))}
         (fmap #(maybe (limit* %)) ctx-limits)))))
613 | 
614 | ;; =========================== Stream Management ================================
615 | 
(defn stream-flag
  "Returns the raw flag value that `cuStreamGetFlags` reports for stream `hstream`."
  [^CUstream_st hstream]
  (with-release [flags (int-pointer 1)]
    (with-check (cudart/cuStreamGetFlags hstream flags)
      (get-entry flags 0))))
619 | 
(defn stream-priority
  "Returns the priority that `cuStreamGetPriority` reports for stream `hstream`."
  ^long [^CUstream_st hstream]
  (with-release [priority (int-pointer 1)]
    (with-check (cudart/cuStreamGetPriority hstream priority)
      (get-entry priority 0))))
623 | 
(extend-type CUstream_st
  Info
  (info
    ;; Single item: `:flag` (decoded with `dec-stream-flag`) or `:priority`;
    ;; any other key yields nil.
    ([stream info-type]
     (maybe
      (condp = info-type
        :flag (-> stream stream-flag dec-stream-flag)
        :priority (stream-priority stream)
        nil)))
    ;; Full map, each entry individually wrapped in `maybe`.
    ([stream]
     {:flag (maybe (-> stream stream-flag dec-stream-flag))
      :priority (maybe (stream-priority stream))})))
636 | 
637 | ;; ============================= Execution Management ==========================
638 | 
(defn max-threads-per-block-fn
  "Queries `CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK` for kernel `function` through
  `cuFuncGetAttribute` and returns the value as a long."
  ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute
                   function
                   cudart/CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
642 | 
(defn shared-size
  "Queries `CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES` for kernel `function` through
  `cuFuncGetAttribute` and returns the value as a long."
  ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute
                   function
                   cudart/CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES))
646 | 
(defn const-size
  "Queries `CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES` for kernel `function` through
  `cuFuncGetAttribute` and returns the value as a long."
  ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute
                   function
                   cudart/CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES))
650 | 
(defn local-size
  "Queries `CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES` for kernel `function` through
  `cuFuncGetAttribute` and returns the value as a long."
  ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute
                   function
                   cudart/CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES))
654 | 
(defn num-regs
  "Queries `CU_FUNC_ATTRIBUTE_NUM_REGS` for kernel `function` through
  `cuFuncGetAttribute` and returns the value as a long."
  ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute
                   function
                   cudart/CU_FUNC_ATTRIBUTE_NUM_REGS))
658 | 
(defn ptx-version
  "Queries `CU_FUNC_ATTRIBUTE_PTX_VERSION` for kernel `function` through
  `cuFuncGetAttribute` and returns the value as a long."
  ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute
                   function
                   cudart/CU_FUNC_ATTRIBUTE_PTX_VERSION))
662 | 
(defn binary-version
  "Queries `CU_FUNC_ATTRIBUTE_BINARY_VERSION` for kernel `function` through
  `cuFuncGetAttribute` and returns the value as a long."
  ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute
                   function
                   cudart/CU_FUNC_ATTRIBUTE_BINARY_VERSION))
666 | 
(defn cache-config*
  "Applies the cache configuration given as integer `config` to device function
  `fun` with `cuFuncSetCacheConfig`, then returns `fun`.

  See [cuFuncSetCacheConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html)
  "
  [fun ^long config]
  (with-check (cudart/cuFuncSetCacheConfig fun config)
    fun))
674 | 
(defn cache-config!
  "Sets the preferred cache configuration for a device function `fun`, as a keyword `config`.

  `config` is translated through `func-cache-config` and applied with `cache-config*`,
  which returns `fun`. An unknown keyword throws an `ex-info` whose data holds the
  offending `:config` and the `:available` mapping.

  Available configs are `:prefer-none`, `:prefer-shared`, `:prefer-L1`, and `:prefer-equal`.

  See [cuFuncSetCacheConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html)
  "
  [fun config]
  (cache-config* fun (or (func-cache-config config)
                         (throw (ex-info "Invalid cache config."
                                         {:config config :available func-cache-config})))))
686 | 
(extend-type CUfunc_st
  Info
  (info
    ;; Single item: dispatches on the keyword to the matching function attribute
    ;; getter; any other key yields nil. The whole lookup runs under `maybe`.
    ([kernel info-type]
     (maybe
      (condp = info-type
        :max-threads-per-block (max-threads-per-block-fn kernel)
        :shared-size (shared-size kernel)
        :const-size (const-size kernel)
        :local-size (local-size kernel)
        :num-regs (num-regs kernel)
        :ptx-version (ptx-version kernel)
        :binary-version (binary-version kernel)
        nil)))
    ;; Full map, each attribute individually wrapped in `maybe`.
    ([kernel]
     {:max-threads-per-block (maybe (max-threads-per-block-fn kernel))
      :shared-size (maybe (shared-size kernel))
      :const-size (maybe (const-size kernel))
      :local-size (maybe (local-size kernel))
      :num-regs (maybe (num-regs kernel))
      :ptx-version (maybe (ptx-version kernel))
      :binary-version (maybe (binary-version kernel))})))
709 | 


--------------------------------------------------------------------------------
/src/clojure/uncomplicate/clojurecuda/internal/constants.clj:
--------------------------------------------------------------------------------
  1 | ;;   Copyright (c) Dragan Djuric. All rights reserved.
  2 | ;;   The use and distribution terms for this software are covered by the
  3 | ;;   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
  4 | ;;   which can be found in the file LICENSE at the root of this distribution.
  5 | ;;   By using this software in any fashion, you are agreeing to be bound by
  6 | ;;   the terms of this license.
  7 | ;;   You must not remove this notice, or any other, from this software.
  8 | 
  9 | (ns ^{:author "Dragan Djuric"}
 10 |     uncomplicate.clojurecuda.internal.constants
 11 |   "Defines constants and mappings from/to CUDA constants."
 12 |   (:import [org.bytedeco.cuda.global cudart nvrtc]))
 13 | 
 14 | ;; ==================== Keyword mapping ======================================
 15 | 
 16 | (def ^{:const true
 17 |        :doc "Available context flags defined in the CUDA standard."}
 18 |   ctx-flags
 19 |   {:blocking-sync cudart/CU_CTX_BLOCKING_SYNC
 20 |    :coredump cudart/CU_CTX_COREDUMP_ENABLE
 21 |    :flags-mask cudart/CU_CTX_FLAGS_MASK
 22 |    :lmem-resize-to-max cudart/CU_CTX_LMEM_RESIZE_TO_MAX
 23 |    :map-host cudart/CU_CTX_MAP_HOST
 24 |    :sched-auto cudart/CU_CTX_SCHED_AUTO
 25 |    :sched-blocking-sync cudart/CU_CTX_SCHED_BLOCKING_SYNC
 26 |    :sched-mask cudart/CU_CTX_SCHED_MASK
 27 |    :sched-spin cudart/CU_CTX_SCHED_SPIN
 28 |    :sched-yield cudart/CU_CTX_SCHED_YIELD
 29 |    :sync-memops cudart/CU_CTX_SYNC_MEMOPS
 30 |    :user-coredump cudart/CU_CTX_USER_COREDUMP_ENABLE})
 31 | 
 32 | (def ^{:const true
 33 |        :doc "Available context limits."}
 34 |   ctx-limits
 35 |   {:dev-runtime-pending-launch-count cudart/CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT
 36 |    :dev-runtime-sync-depth cudart/CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH
 37 |    :malloc-heap-size cudart/CU_LIMIT_MALLOC_HEAP_SIZE
 38 |    :max cudart/CU_LIMIT_MAX
 39 |    :max-l2-fetch-granularity cudart/CU_LIMIT_MAX_L2_FETCH_GRANULARITY
 40 |    :persisting-l2-cache-size cudart/CU_LIMIT_PERSISTING_L2_CACHE_SIZE
 41 |    :printf-fifo-size cudart/CU_LIMIT_PRINTF_FIFO_SIZE
 42 |    :stack-size cudart/CU_LIMIT_STACK_SIZE})
 43 | 
 44 | (def ^{:const true
 45 |        :doc "Available shared memory configurations."}
 46 |   shared-config-map
 47 |   {:default-bank-size cudart/CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE
 48 |    :four-byte-bank-size cudart/CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE
 49 |    :eight-byte-bank-size cudart/CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE})
 50 | 
 51 | (defn dec-shared-config [^long config]
 52 |   (case config
 53 |     0 :default-bank-size
 54 |     1 :four-byte-bank-size
 55 |     2 :eight-byte-bank-size
 56 |     config))
 57 | 
 58 | (def ^{:const true
 59 |        :doc "Available device P2P attributes."}
 60 |   p2p-attributes
 61 |   {:access-access-supported cudart/CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED
 62 |    :access-supported cudart/CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED
 63 |    :cuda-array-access-supported cudart/CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED
 64 |    :native-atomic-supported cudart/CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED
 65 |    :performance-rank cudart/CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK})
 66 | 
 67 | (defn dec-compute-mode [^long mode]
 68 |   (case mode
 69 |     0 :default
 70 |     1 :exclusive
 71 |     2 :prohibited
 72 |     3 :exclusive-process
 73 |     mode) )
 74 | 
 75 | (def ^{:const true
 76 |        :doc "Available flags for the [[core/mem-host-alloc]] function."}
 77 |   mem-host-alloc-flags
 78 |   {:portable cudart/CU_MEMHOSTALLOC_PORTABLE
 79 |    :devicemap cudart/CU_MEMHOSTALLOC_DEVICEMAP
 80 |    :writecombined cudart/CU_MEMHOSTALLOC_WRITECOMBINED})
 81 | 
 82 | (def ^{:const true
 83 |        :doc "Available flags for the [[core/mem-host-register]] function."}
 84 |   mem-host-register-flags
 85 |   {:devicemap cudart/CU_MEMHOSTREGISTER_DEVICEMAP
 86 |    :iomemory cudart/CU_MEMHOSTREGISTER_IOMEMORY
 87 |    :portable cudart/CU_MEMHOSTREGISTER_PORTABLE
 88 |    :read-onlyp cudart/CU_MEMHOSTREGISTER_READ_ONLY})
 89 | 
 90 | (def ^{:const true
 91 |        :doc "Available flags for the [[core/mem-host-attach]] function."}
 92 |   mem-attach-flags
 93 |   {:global cudart/CU_MEM_ATTACH_GLOBAL
 94 |    :host cudart/CU_MEM_ATTACH_HOST
 95 |    :single cudart/CU_MEM_ATTACH_SINGLE})
 96 | 
 97 | (def ^{:const true
 98 |        :doc "Available flags for the [[core/mem-host-attach]] function."}
 99 |   stream-flags
100 |   {:default cudart/CU_STREAM_DEFAULT
101 |    :non-blocking cudart/CU_STREAM_NON_BLOCKING})
102 | 
(defn dec-stream-flag
  "Decodes the integer stream `flag` into a keyword
  (0 -> `:default`, 1 -> `:non-blocking`). Any other value is returned unchanged."
  [^long flag]
  (cond
    (zero? flag) :default
    (== 1 flag)  :non-blocking
    :else        flag))
108 | 
(def ^:const event-flags
  "Available flags for the [[core/event]] function."
  ;; Keyword -> `CU_EVENT_*` constant of the CUDA driver API.
  {:blocking-sync  cudart/CU_EVENT_BLOCKING_SYNC
   :default        cudart/CU_EVENT_DEFAULT
   :disable-timing cudart/CU_EVENT_DISABLE_TIMING
   :interprocess   cudart/CU_EVENT_INTERPROCESS})
116 | 
(def ^:const func-cache-config
  "Available config for the [[core/cache-config!]] function."
  ;; Keyword -> `CU_FUNC_CACHE_*` constant; [[dec-func-cache-config]] goes the other way.
  {:prefer-none   cudart/CU_FUNC_CACHE_PREFER_NONE
   :prefer-shared cudart/CU_FUNC_CACHE_PREFER_SHARED
   :prefer-L1     cudart/CU_FUNC_CACHE_PREFER_L1
   :prefer-equal  cudart/CU_FUNC_CACHE_PREFER_EQUAL})
124 | 
(defn dec-func-cache-config
  "Decodes the integer cache preference `mode` into a keyword
  (0..3 -> `:prefer-none`, `:prefer-shared`, `:prefer-L1`, `:prefer-equal`).
  Any other value is returned unchanged."
  [^long mode]
  ;; The known codes are exactly the indices 0..3 of this table.
  (if (<= 0 mode 3)
    (nth [:prefer-none :prefer-shared :prefer-L1 :prefer-equal] mode)
    mode))
132 | 
(def ^{:const true
       :doc "Available jit options defined in the CUDA standard."}
  jit-options
  ;; Keyword -> `CU_JIT_*` constant. Used by option encoders to translate
  ;; user-facing keywords into the integer codes expected by the driver.
  {:cache-mode cudart/CU_JIT_CACHE_MODE
   :cache-option-ca cudart/CU_JIT_CACHE_OPTION_CA
   :cache-option-cg cudart/CU_JIT_CACHE_OPTION_CG
   :cache-option-none cudart/CU_JIT_CACHE_OPTION_NONE
   :error-log-buffer cudart/CU_JIT_ERROR_LOG_BUFFER
   :error-log-buffer-size-bytes cudart/CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
   :fallback-strategy cudart/CU_JIT_FALLBACK_STRATEGY
   :fast-compile-strategy cudart/CU_JIT_FAST_COMPILE
   :fma cudart/CU_JIT_FMA
   :ftz cudart/CU_JIT_FTZ
   :generate-debug-info cudart/CU_JIT_GENERATE_DEBUG_INFO
   :generate-line-info cudart/CU_JIT_GENERATE_LINE_INFO
   :global-symbol-addresses cudart/CU_JIT_GLOBAL_SYMBOL_ADDRESSES
   :global-symbol-count cudart/CU_JIT_GLOBAL_SYMBOL_COUNT
   :global-symbol-names cudart/CU_JIT_GLOBAL_SYMBOL_NAMES
   :info-log-buffer cudart/CU_JIT_INFO_LOG_BUFFER
   :info-log-buffer-size-bytes cudart/CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
   :input-cubin cudart/CU_JIT_INPUT_CUBIN
   :input-fatbinary cudart/CU_JIT_INPUT_FATBINARY
   :input-library cudart/CU_JIT_INPUT_LIBRARY
   :input-nvvm cudart/CU_JIT_INPUT_NVVM
   :input-object cudart/CU_JIT_INPUT_OBJECT
   :input-ptx cudart/CU_JIT_INPUT_PTX
   :log-verbose cudart/CU_JIT_LOG_VERBOSE
   :lto cudart/CU_JIT_LTO
   :max-registers cudart/CU_JIT_MAX_REGISTERS
   :new-sm3x-opt cudart/CU_JIT_NEW_SM3X_OPT
   :num-input-types cudart/CU_JIT_NUM_INPUT_TYPES
   ;; Misspelled historical key, kept so that existing callers keep working.
   :num-input-tupes cudart/CU_JIT_NUM_INPUT_TYPES
   :num-options cudart/CU_JIT_NUM_OPTIONS
   :optimization-level cudart/CU_JIT_OPTIMIZATION_LEVEL
   :optimize-unused-device-variables cudart/CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES
   :position-independent-code cudart/CU_JIT_POSITION_INDEPENDENT_CODE
   :prec-div cudart/CU_JIT_PREC_DIV
   :prec-sqrt cudart/CU_JIT_PREC_SQRT
   :referenced-kernel-count cudart/CU_JIT_REFERENCED_KERNEL_COUNT
   :referenced-kernel-names cudart/CU_JIT_REFERENCED_KERNEL_NAMES
   :referenced-variable-count cudart/CU_JIT_REFERENCED_VARIABLE_COUNT
   :referenced-variable-names cudart/CU_JIT_REFERENCED_VARIABLE_NAMES
   :target cudart/CU_JIT_TARGET
   :target-from-cucontext cudart/CU_JIT_TARGET_FROM_CUCONTEXT
   :threads-per-block cudart/CU_JIT_THREADS_PER_BLOCK
   :wall-time cudart/CU_JIT_WALL_TIME})
178 | 
(def ^:const jit-input-types
  "Available jit input types defined in the CUDA standard."
  ;; Keyword -> `CU_JIT_INPUT_*` constant; `:num` maps to `CU_JIT_NUM_INPUT_TYPES`.
  {:cubin     cudart/CU_JIT_INPUT_CUBIN
   :ptx       cudart/CU_JIT_INPUT_PTX
   :fatbinary cudart/CU_JIT_INPUT_FATBINARY
   :object    cudart/CU_JIT_INPUT_OBJECT
   :library   cudart/CU_JIT_INPUT_LIBRARY
   :nvvm      cudart/CU_JIT_INPUT_NVVM
   :num       cudart/CU_JIT_NUM_INPUT_TYPES})
189 | 
(def ^{:const true
       :doc "CUDA Error messages as defined in CUresult."}
  cu-result-codes
  ;; Integer result code -> descriptive keyword. The first group covers the
  ;; driver API `CUDA_ERROR_*` codes, the second the `cudaError*` constants.
  {cudart/CUDA_SUCCESS :success
   cudart/CUDA_ERROR_ALREADY_ACQUIRED :already-acquired
   cudart/CUDA_ERROR_ALREADY_MAPPED :already-mapped
   cudart/CUDA_ERROR_ARRAY_IS_MAPPED :array-is-mapped
   cudart/CUDA_ERROR_ASSERT :assert
   cudart/CUDA_ERROR_CAPTURED_EVENT :captured-event
   cudart/CUDA_ERROR_CDP_NOT_SUPPORTED :cdp-not-supported
   cudart/CUDA_ERROR_CDP_VERSION_MISMATCH :cdp-version-mismatch
   cudart/CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE :compat-not-supported-on-device
   cudart/CUDA_ERROR_CONTEXT_ALREADY_CURRENT :context-already-current
   cudart/CUDA_ERROR_CONTEXT_ALREADY_IN_USE :context-already-in-use
   cudart/CUDA_ERROR_CONTEXT_IS_DESTROYED :context-is-destroyed
   cudart/CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE :cooperative-launch-too-large
   cudart/CUDA_ERROR_DEINITIALIZED :deinitialized
   cudart/CUDA_ERROR_DEVICE_NOT_LICENSED :device-not-licensed
   cudart/CUDA_ERROR_DEVICE_UNAVAILABLE :unavailable
   cudart/CUDA_ERROR_ECC_UNCORRECTABLE :ecc-uncorrectable
   cudart/CUDA_ERROR_EXTERNAL_DEVICE :external-device
   cudart/CUDA_ERROR_FILE_NOT_FOUND :file-not-found
   cudart/CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE :graph-exec-update-failure
   cudart/CUDA_ERROR_HARDWARE_STACK_ERROR :hardware-stack-error
   cudart/CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED :host-memory-already-registered
   cudart/CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED :host-memory-not-registered
   cudart/CUDA_ERROR_ILLEGAL_ADDRESS :illegal-address
   cudart/CUDA_ERROR_ILLEGAL_INSTRUCTION :illegal-instruction
   cudart/CUDA_ERROR_ILLEGAL_STATE :illegal-state
   cudart/CUDA_ERROR_INVALID_ADDRESS_SPACE :invalid-address-space
   cudart/CUDA_ERROR_INVALID_CLUSTER_SIZE :invalid-cluster-size
   cudart/CUDA_ERROR_INVALID_CONTEXT :invalid-context
   cudart/CUDA_ERROR_INVALID_DEVICE :invalid-device
   cudart/CUDA_ERROR_INVALID_GRAPHICS_CONTEXT :invalid-graphics-context
   cudart/CUDA_ERROR_INVALID_HANDLE :invalid-handle
   cudart/CUDA_ERROR_INVALID_IMAGE :invalid-image
   cudart/CUDA_ERROR_INVALID_PC :invalid-pc
   cudart/CUDA_ERROR_INVALID_PTX :invalid-ptx
   cudart/CUDA_ERROR_INVALID_SOURCE :invalid-source
   cudart/CUDA_ERROR_INVALID_VALUE :invalid-value
   cudart/CUDA_ERROR_JIT_COMPILATION_DISABLED :jit-compilation-disabled
   cudart/CUDA_ERROR_JIT_COMPILER_NOT_FOUND :jit-compiler-not-found
   cudart/CUDA_ERROR_LAUNCH_FAILED :launch-failed
   cudart/CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING :launch-incompatible-texturing
   cudart/CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES :launch-out-of-resources
   cudart/CUDA_ERROR_LAUNCH_TIMEOUT :launch-timeout
   cudart/CUDA_ERROR_MAP_FAILED :map-failed
   cudart/CUDA_ERROR_MISALIGNED_ADDRESS :misaligned-address
   cudart/CUDA_ERROR_MPS_CLIENT_TERMINATED :client-terminated
   cudart/CUDA_ERROR_MPS_CONNECTION_FAILED :connection-failed
   cudart/CUDA_ERROR_MPS_MAX_CLIENTS_REACHED :clients-reached
   cudart/CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED :connection-reached
   cudart/CUDA_ERROR_MPS_RPC_FAILURE :rpc-failure
   cudart/CUDA_ERROR_MPS_SERVER_NOT_READY :server-not-ready
   ;; The name must keep the negation, otherwise it reads as the opposite condition.
   cudart/CUDA_ERROR_NO_BINARY_FOR_GPU :no-binary-for-gpu
   cudart/CUDA_ERROR_NO_DEVICE :no-device
   cudart/CUDA_ERROR_NOT_FOUND :not-found
   cudart/CUDA_ERROR_NOT_INITIALIZED :not-initialized
   cudart/CUDA_ERROR_NOT_MAPPED :not-mapped
   cudart/CUDA_ERROR_NOT_MAPPED_AS_ARRAY :not-mapped-as-array
   cudart/CUDA_ERROR_NOT_MAPPED_AS_POINTER :not-mapped-as-pointer
   cudart/CUDA_ERROR_NOT_READY :not-ready
   cudart/CUDA_ERROR_NOT_SUPPORTED :not-supported
   cudart/CUDA_ERROR_NVLINK_UNCORRECTABLE :nvlink-uncorrectable
   cudart/CUDA_ERROR_OPERATING_SYSTEM :operating-system
   cudart/CUDA_ERROR_OUT_OF_MEMORY :out-of-memory
   cudart/CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED :already-enabled
   cudart/CUDA_ERROR_PEER_ACCESS_NOT_ENABLED :access-not-enabled
   cudart/CUDA_ERROR_PEER_ACCESS_UNSUPPORTED :access-unsupported
   cudart/CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE :context-active
   cudart/CUDA_ERROR_PROFILER_ALREADY_STARTED :profiler-already-started
   cudart/CUDA_ERROR_PROFILER_ALREADY_STOPPED :profiler-already-stopped
   cudart/CUDA_ERROR_PROFILER_DISABLED :profiler-disabled
   cudart/CUDA_ERROR_PROFILER_NOT_INITIALIZED :profiler-not-initialized
   cudart/CUDA_ERROR_SHARED_OBJECT_INIT_FAILED :shared-object-init-failed
   cudart/CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND :shared-object-symbol-not-found
   cudart/CUDA_ERROR_STREAM_CAPTURE_IMPLICIT :stream-capture-implicit
   cudart/CUDA_ERROR_STREAM_CAPTURE_INVALIDATED :stream-capture-invalidated
   cudart/CUDA_ERROR_STREAM_CAPTURE_ISOLATION :stream-capture-isolation
   cudart/CUDA_ERROR_STREAM_CAPTURE_MERGE :stream-capture-merge
   cudart/CUDA_ERROR_STREAM_CAPTURE_UNJOINED :stream-capture-unjoined
   cudart/CUDA_ERROR_STREAM_CAPTURE_UNMATCHED :stream-capture-unmatched
   cudart/CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED :stream-capture-unsupported
   cudart/CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD :stream-capture-wrong-thread
   cudart/CUDA_ERROR_STUB_LIBRARY :stub-library
   cudart/CUDA_ERROR_SYSTEM_DRIVER_MISMATCH :driver-mismatch
   cudart/CUDA_ERROR_SYSTEM_NOT_READY :system-not-ready
   cudart/CUDA_ERROR_TIMEOUT :timeout
   cudart/CUDA_ERROR_TOO_MANY_PEERS :too-many-peers
   cudart/CUDA_ERROR_UNKNOWN :unknown
   cudart/CUDA_ERROR_UNMAP_FAILED :unmap-failed
   cudart/CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC :devside-sync
   cudart/CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY :exec-affinity
   cudart/CUDA_ERROR_UNSUPPORTED_LIMIT :unsupported-limit
   cudart/CUDA_ERROR_UNSUPPORTED_PTX_VERSION :unsupported-ptx-version
   cudart/cudaErrorAddressOfConstant :address-of-constant
   cudart/cudaErrorApiFailureBase :failure-base
   cudart/cudaErrorCallRequiresNewerDriver :call-requires-newer-driver
   cudart/cudaErrorDuplicateSurfaceName :duplicate-surface-name
   cudart/cudaErrorDuplicateTextureName :duplicate-texture-name
   cudart/cudaErrorDuplicateVariableName :duplicate-variable-name
   cudart/cudaErrorIncompatibleDriverContext :incompatible-context
   cudart/cudaErrorInsufficientDriver :insufficient-driver
   cudart/cudaErrorInvalidChannelDescriptor :invalid-channel-descriptor
   cudart/cudaErrorInvalidConfiguration :invalid-configuration
   cudart/cudaErrorInvalidDeviceFunction :invalid-device-function
   cudart/cudaErrorInvalidDevicePointer :invalid-device-pointer
   cudart/cudaErrorInvalidFilterSetting :invalid-filter-setting
   cudart/cudaErrorInvalidHostPointer :invalid-host-pointer
   cudart/cudaErrorInvalidMemcpyDirection :invalid-memcpy-direction
   cudart/cudaErrorInvalidNormSetting :invalid-norm-setting
   cudart/cudaErrorInvalidPitchValue :invalid-pitch-value
   cudart/cudaErrorInvalidSurface :invalid-surface
   cudart/cudaErrorInvalidSymbol :invalid-symbol
   cudart/cudaErrorInvalidTexture :invalid-texture
   cudart/cudaErrorInvalidTextureBinding :invalid-texture-binding
   cudart/cudaErrorLaunchFileScopedSurf :launch-file-scoped-surf
   cudart/cudaErrorLaunchFileScopedTex :launch-file-scoped-tex
   cudart/cudaErrorLaunchMaxDepthExceeded :max-depth-exceeded
   cudart/cudaErrorLaunchPendingCountExceeded :launch-pending-count-exceeded
   cudart/cudaErrorMemoryValueTooLarge :memory-value-too-large
   cudart/cudaErrorMissingConfiguration :missing-configuration
   cudart/cudaErrorMixedDeviceExecution :mixed-device-execution
   cudart/cudaErrorNotPermitted :not-permitted
   cudart/cudaErrorNotYetImplemented :not-yet-implemented
   cudart/cudaErrorPriorLaunchFailure :prior-launch-failure
   cudart/cudaErrorSoftwareValidityNotEstablished :software-validity-not-established
   cudart/cudaErrorStartupFailure :startup-failure
   cudart/cudaErrorSyncDepthExceeded :sync-depth-exceeded
   cudart/cudaErrorSynchronizationError :synchronization-error
   cudart/cudaErrorTextureFetchFailed :texture-fetch-failed
   cudart/cudaErrorTextureNotBound :texture-not-bound})
322 | 
(def ^{:const true
       :doc "CUDA Error messages as defined in nvrtc."}
  nvrtc-result-codes
  ;; Integer NVRTC result code -> descriptive keyword.
  {nvrtc/NVRTC_SUCCESS :success
   nvrtc/NVRTC_ERROR_BUILTIN_OPERATION_FAILURE :builtin-operation-failure
   nvrtc/NVRTC_ERROR_COMPILATION :compilation
   nvrtc/NVRTC_ERROR_INVALID_INPUT :invalid-input
   nvrtc/NVRTC_ERROR_INTERNAL_ERROR :internal-error
   nvrtc/NVRTC_ERROR_INVALID_OPTION :invalid-option
   nvrtc/NVRTC_ERROR_INVALID_PROGRAM :invalid-program
   nvrtc/NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID :name-expression-not-valid
   nvrtc/NVRTC_ERROR_OUT_OF_MEMORY :out-of-memory
   nvrtc/NVRTC_ERROR_PROGRAM_CREATION_FAILURE :program-creation-failure
   nvrtc/NVRTC_ERROR_TIME_FILE_WRITE_FAILED :time-file-write-failed})
337 | 


--------------------------------------------------------------------------------
/src/clojure/uncomplicate/clojurecuda/internal/impl.clj:
--------------------------------------------------------------------------------
  1 | ;;   Copyright (c) Dragan Djuric. All rights reserved.
  2 | ;;   The use and distribution terms for this software are covered by the
  3 | ;;   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
  4 | ;;   which can be found in the file LICENSE at the root of this distribution.
  5 | ;;   By using this software in any fashion, you are agreeing to be bound by
  6 | ;;   the terms of this license.
  7 | ;;   You must not remove this notice, or any other, from this software.
  8 | 
  9 | (ns ^{:author "Dragan Djuric"}
 10 |     uncomplicate.clojurecuda.internal.impl
 11 |   (:require [uncomplicate.commons
 12 |              [core :refer [with-release let-release Releaseable release info Bytes bytesize Entries
 13 |                            size* size]]
 14 |              [utils :as cu :refer [dragan-says-ex]]]
 15 |             [uncomplicate.fluokitten.protocols :refer [Comonad extract]]
 16 |             [uncomplicate.clojure-cpp :as cpp
 17 |              :refer [put-entry! pointer safe int-pointer pointer-pointer byte-pointer size-t-pointer
 18 |                      get-entry get-string null? long-pointer PointerCreator TypedPointerCreator
 19 |                      clong-pointer short-pointer char-pointer double-pointer float-pointer pointer-seq
 20 |                      capacity! address Accessor get! put! get-keyword]]
 21 |             [uncomplicate.clojurecuda.internal
 22 |              [constants :refer [cu-result-codes jit-input-types jit-options nvrtc-result-codes]]
 23 |              [utils :refer [with-check]]]
 24 |             [clojure.core.async :refer [go >!]])
 25 |   (:import java.nio.file.Path
 26 |            java.io.File
 27 |            [clojure.lang IFn AFn Seqable]
 28 |            [org.bytedeco.javacpp Pointer BytePointer PointerPointer LongPointer IntPointer]
 29 |            [org.bytedeco.cuda.global cudart nvrtc]
 30 |            [org.bytedeco.cuda.cudart CUctx_st CUstream_st CUevent_st CUmod_st CUlinkState_st]
 31 |            org.bytedeco.cuda.nvrtc._nvrtcProgram
 32 |            [uncomplicate.clojure_cpp StringPointer KeywordPointer]
 33 |            [uncomplicate.clojurecuda.internal.javacpp CUHostFn CUStreamCallback]))
 34 | 
 35 | (defprotocol CUPointer
 36 |   (cu-address* [this])
 37 |   (device? [this]))
 38 | 
 39 | (defprotocol Parameter
 40 |   (set-parameter* [this pp i]))
 41 | 
 42 | (extend-type Object
 43 |   Parameter
 44 |   (set-parameter* [parameter pp i]
 45 |     (put-entry! pp i (pointer parameter))))
 46 | 
 47 | ;; ==================== Release resources =======================
 48 | 
 49 | (deftype CUDevice [^int dev]
 50 |   Object
 51 |   (hashCode [_]
 52 |     dev)
 53 |   (equals [_ y]
 54 |     (and (instance? CUDevice y) (= dev (.dev ^CUDevice y))))
 55 |   (toString [_]
 56 |     (format "#Device[:cuda, %d]" dev))
 57 |   Comonad
 58 |   (extract [_]
 59 |     dev))
 60 | 
 61 | (extend-type CUctx_st
 62 |   Releaseable
 63 |   (release [this]
 64 |     (locking this
 65 |       (cudart/cuCtxDestroy this)
 66 |       (.deallocate this)
 67 |       (.setNull this)
 68 |       true)))
 69 | 
 70 | (extend-type CUstream_st
 71 |   Releaseable
 72 |   (release [this]
 73 |     (locking this
 74 |       (cudart/cuStreamDestroy this)
 75 |       (.deallocate this)
 76 |       (.setNull this)
 77 |       true)))
 78 | 
 79 | (extend-type CUevent_st
 80 |   Releaseable
 81 |   (release [this]
 82 |     (locking this
 83 |       (cudart/cuEventDestroy this)
 84 |       (.deallocate this)
 85 |       (.setNull this)
 86 |       true)))
 87 | 
 88 | (extend-type CUmod_st
 89 |   Releaseable
 90 |   (release [this]
 91 |     (locking this
 92 |       (cudart/cuModuleUnload this)
 93 |       (.deallocate this)
 94 |       (.setNull this)
 95 |       true)))
 96 | 
 97 | (extend-type CUlinkState_st
 98 |   Releaseable
 99 |   (release [this]
100 |     (locking this
101 |       (cudart/cuLinkDestroy this)
102 |       (.deallocate this)
103 |       (.setNull this)
104 |       true)))
105 | 
(extend-type _nvrtcProgram
  Releaseable
  (release [program]
    ;; Destroy the NVRTC program, then free and null the wrapper while holding
    ;; its monitor. The status returned by nvrtcDestroyProgram is not inspected.
    (locking program
      (nvrtc/nvrtcDestroyProgram program)
      (doto program
        (.deallocate)
        (.setNull))
      true)))
114 | 
115 | ;; ================== Module Management =====================================
116 | 
(defprotocol ModuleLoad
  "Sources of module code (strings, pointers, files, paths, compiled programs)
  that can be loaded into a module or added to a link state."
  (module-load* [data m]
    "Loads `data` into the module `m`.")
  (link-add* [data link-state type opts vals]
    "Adds `data`, as a jit input of the given `type`, to `link-state`, using the
    option codes `opts` and the option values `vals`."))
120 | 
(defn enc-jit-options
  "Lazily encodes `options`, a collection of `[keyword value]` pairs (e.g. a map),
  into a sequence of `[option-code value-pointer]` pairs.

  The code is looked up in `jit-options`; an unknown keyword raises an
  `ExceptionInfo` once the offending element is realized."
  [options]
  (for [[option value] options]
    [(or (jit-options option)
         (throw (ex-info "Unknown jit option." {:option option :available jit-options})))
     (safe (pointer value))]))
127 | 
(defn check-options
  "Verifies that `options` and `option-values` hold the same number of entries.
  Returns nil when they agree; throws an `ExceptionInfo` with both sizes otherwise."
  [^IntPointer options ^Pointer option-values]
  (let [requested (size options)
        provided (size option-values)]
    (when (not= requested provided)
      (throw (ex-info "Inconsistent number of options provided."
                      {:requested requested :provided provided})))))
132 | 
(defn link-add-data*
  "Adds the in-memory `data`, labelled `name`, to `link-state` as a jit input of
  the keyword `type` (see `jit-input-types`). Returns `link-state`.

  Throws an `ExceptionInfo` for an unknown `type`, or when `options` and
  `option-values` differ in size."
  [^CUlinkState_st link-state type ^Pointer data ^String name
   ^IntPointer options ^Pointer option-values]
  (let [code (jit-input-types type)]
    (when-not code
      (throw (ex-info "Invalid jit input type."
                      {:type type :available jit-input-types})))
    (check-options options option-values)
    (with-check (cudart/cuLinkAddData link-state (int code) data (bytesize data) name
                                      (size options) options option-values)
      {:data data} link-state)))
142 | 
(defn link-add-file*
  "Adds the file at `file-name` to `link-state` as a jit input of the keyword
  `type` (see `jit-input-types`). Returns `link-state`.

  Throws an `ExceptionInfo` for an unknown `type`, or when `options` and
  `option-values` differ in size."
  [^CUlinkState_st link-state type ^String file-name
   ^IntPointer options ^Pointer option-values]
  (let [code (jit-input-types type)]
    (when-not code
      (throw (ex-info "Invalid jit input type."
                      {:type type :available jit-input-types})))
    (check-options options option-values)
    (with-check (cudart/cuLinkAddFile link-state (int code) file-name
                                      (size options) options option-values)
      {:file file-name} link-state)))
152 | 
(defn link*
  "Creates the linker state in `link-state` with the jit `options`, then adds
  every entry of `data` to it. Returns `link-state`.

  `options` is a collection of `[keyword value]` pairs (see `jit-options`).
  Each entry of `data` is destructured as `[type d _ name]`: when `name` is
  present, `d` is added as named in-memory data; otherwise it is dispatched
  through `link-add*`. The third element is not used here; the link-level
  option pointers are passed to every input."
  [^CUlinkState_st link-state data options]
  ;; `enc-jit-options` yields one [code pointer] pair per option, so the pairs
  ;; are split into a column of codes and a column of value pointers.
  ;; `not-empty` keeps nil for the no-options case.
  ;; NOTE(review): assumes `int-pointer`/`pointer-pointer` accept a Clojure
  ;; vector of codes/pointers - confirm against clojure-cpp.
  (let [encoded (enc-jit-options options)
        opt-codes (not-empty (mapv first encoded))
        opt-values (not-empty (mapv second encoded))]
    (let-release [opts (int-pointer opt-codes)
                  vals (pointer-pointer opt-values)]
      (with-check (cudart/cuLinkCreate (size opts) opts ^PointerPointer vals link-state)
        (doseq [[type d _ name] data]
          (if name
            (link-add-data* link-state type d name opts vals)
            (link-add* d link-state type opts vals))))))
  link-state)
164 | 
;; A string is treated as module code held in memory: it is copied into a
;; byte pointer before being handed to the driver.
(extend-type String
  ModuleLoad
  (module-load* [code m]
    (with-check (cudart/cuModuleLoadData ^CUmod_st m (byte-pointer code))
      {:data code}
      m))
  (link-add* [code link-state type opts vals]
    (let [image (byte-pointer code)]
      (link-add-data* link-state type image "unnamed" opts vals))))
171 | 
;; A raw pointer is passed to the driver as-is, as in-memory module data.
(extend-type Pointer
  ModuleLoad
  (module-load* [image m]
    (with-check (cudart/cuModuleLoadData m image)
      {:data image}
      m))
  (link-add* [image link-state type opts vals]
    (link-add-data* link-state type image "unnamed" opts vals)))
179 | 
;; A path designates a file on disk; its string form is what the driver receives.
(extend-type Path
  ModuleLoad
  (module-load* [file-path m]
    (let [file-name (str file-path)]
      (with-check (cudart/cuModuleLoad ^CUmod_st m file-name)
        {:file file-name}
        m)))
  (link-add* [file-path link-state type opts vals]
    (link-add-file* link-state type (str file-path) opts vals)))
187 | 
;; A file is loaded by name; its string form is what the driver receives.
(extend-type File
  ModuleLoad
  (module-load* [file m]
    (let [file-name (str file)]
      (with-check (cudart/cuModuleLoad ^CUmod_st m file-name)
        {:file file-name}
        m)))
  (link-add* [file link-state type opts vals]
    (link-add-file* link-state type (str file) opts vals)))
194 | 
195 | ;; ====================== Nvrtc program JIT ========================================
196 | 
(defn ^:private nvrtc-error
  "Builds (does not throw) an `ExceptionInfo` describing the NVRTC error `err-code`.

  The ex-data carries the decoded `:name` (the raw code when it is unknown to
  `nvrtc-result-codes`), the `:code`, `:type :nvrtc-error`, and the optional
  `details`."
  ([err-code]
   (nvrtc-error err-code nil))
  ([^long err-code details]
   (let [err-name (get nvrtc-result-codes err-code err-code)
         message (format "NVRTC error: %s." err-name)]
     (ex-info message
              {:name err-name :code err-code :type :nvrtc-error :details details}))))
205 | 
(defmacro ^:private with-check-nvrtc
  "Evaluates `form` if `err-code` is zero (`:success`), otherwise throws
  an appropriate `ExceptionInfo` with decoded informative details.
  It helps with CUDA nvrtc methods that return error codes directly, while
  returning computation results through mutating arguments.
  "
  ([err-code form]
   ;; Delegates to the generic checker, plugging in the NVRTC error decoder.
   `(cu/with-check nvrtc-error ~err-code ~form)))
214 | 
(defn program*
  "Creates an NVRTC program called `name` from `source-code`, with the given
  `source-headers` and their `include-names`. Returns the new program.

  The freshly allocated program is released if creation fails, and the failure
  is reported as an `ExceptionInfo`."
  [^BytePointer name ^BytePointer source-code
   ^PointerPointer source-headers ^PointerPointer include-names]
  (let-release [prog (_nvrtcProgram.)]
    (with-check-nvrtc
      (nvrtc/nvrtcCreateProgram prog
                                source-code
                                name
                                (size source-headers)
                                source-headers
                                include-names)
      prog)))
223 | 
(defn program-log*
  "Returns the log string generated by the previous compilation of `program`.

  The log size is queried first, then a buffer of that size is filled and
  converted to a string. Both temporary pointers are released afterwards."
  [^_nvrtcProgram program]
  (with-release [size-ptr (size-t-pointer 1)]
    (with-check-nvrtc (nvrtc/nvrtcGetProgramLogSize program size-ptr)
      (with-release [buffer (byte-pointer (get-entry size-ptr 0))]
        (with-check-nvrtc (nvrtc/nvrtcGetProgramLog program buffer)
          (get-string buffer))))))
231 | 
(defn compile*
  "Compiles the given `program` using an array of string `options`.

  Returns `program` on success. On failure, throws an `ExceptionInfo` whose
  `:details` hold the compilation log of the program."
  ([^_nvrtcProgram program ^PointerPointer options]
   (let [status (nvrtc/nvrtcCompileProgram program (size options) options)]
     (when-not (zero? status)
       (throw (nvrtc-error status (program-log* program))))
     program)))
239 | 
(defn ptx*
  "Returns the PTX generated by the previous compilation of `program`.

  The result is a freshly allocated byte pointer sized by `nvrtcGetPTXSize`;
  it is released here only if fetching the PTX fails, so the caller owns it."
  [^_nvrtcProgram program]
  (with-release [size-ptr (size-t-pointer 1)]
    (with-check-nvrtc (nvrtc/nvrtcGetPTXSize program size-ptr)
      (let-release [image (byte-pointer (get-entry size-ptr 0))]
        (with-check-nvrtc (nvrtc/nvrtcGetPTX program image)
          image)))))
248 | 
;; A compiled NVRTC program contributes its PTX image, fetched on demand.
(extend-type _nvrtcProgram
  ModuleLoad
  (module-load* [program m]
    (let [image (ptx* program)]
      (with-check (cudart/cuModuleLoadData ^CUmod_st m image) m)))
  (link-add* [program link-state type opts vals]
    (let [image (ptx* program)]
      (link-add-data* link-state type image "unnamed" opts vals))))
255 | 
256 | ;; =================== Context Management ==================================
257 | 
(defn context*
  "Creates a CUDA context on the device `dev` using the raw integer `flags`,
  and returns it. A failed creation is reported through `with-check`, with the
  device info and the flags attached as details.
  For available flags, see [[constants/ctx-flags]].
  "
  [^long dev ^long flags]
  (let [ctx (CUctx_st.)]
    (with-check (cudart/cuCtxCreate ctx flags dev)
      {:dev (info dev) :flags flags}
      ctx)))
267 | 
(defn current-context*
  "With `ctx`, binds it as the current context and returns it. Without
  arguments, returns a new handle filled in by `cuCtxGetCurrent`, i.e. the
  context bound to the calling CPU thread."
  ([^CUctx_st ctx]
   (with-check (cudart/cuCtxSetCurrent ctx) ctx))
  ([]
   (let [current (CUctx_st.)]
     (with-check (cudart/cuCtxGetCurrent current) current))))
275 | 
276 | ;; ==================== Linear memory ================================================
277 | 
(defprotocol MemSet
  "Values that can be written repeatedly into device memory. The implementation
  is selected by the type of the value, which determines the driver call used."
  (memset* [this dptr n] [this dptr n hstream]
    "Fills `n` entries starting at the device address `dptr` with `this` and
    returns `dptr`. The 4-argument arity issues the asynchronous variant of the
    driver call on `hstream`."))
280 | 
;; Bytes map directly onto the 8-bit memset driver calls.
(extend-type Byte
  MemSet
  (memset*
    ([value dptr n hstream]
     (with-check (cudart/cuMemsetD8Async dptr value n hstream) dptr))
    ([value dptr n]
     (with-check (cudart/cuMemsetD8 dptr value n) dptr))))
288 | 
;; Shorts map directly onto the 16-bit memset driver calls.
(extend-type Short
  MemSet
  (memset*
    ([value dptr n hstream]
     (with-check (cudart/cuMemsetD16Async dptr value n hstream) dptr))
    ([value dptr n]
     (with-check (cudart/cuMemsetD16 dptr value n) dptr))))
296 | 
;; Integers map directly onto the 32-bit memset driver calls.
(extend-type Integer
  MemSet
  (memset*
    ([value dptr n hstream]
     (with-check (cudart/cuMemsetD32Async dptr value n hstream) dptr))
    ([value dptr n]
     (with-check (cudart/cuMemsetD32 dptr value n) dptr))))
304 | 
;; Floats reuse the 32-bit memset calls: the value is sent as its raw
;; IEEE 754 bit pattern.
(extend-type Float
  MemSet
  (memset*
    ([value dptr n hstream]
     (with-check (cudart/cuMemsetD32Async dptr (Float/floatToIntBits value) n hstream)
       dptr))
    ([value dptr n]
     (with-check (cudart/cuMemsetD32 dptr (Float/floatToIntBits value) n)
       dptr))))
312 | 
;; Double fill values: only 0.0 is supported. Zeroing is done with the 32-bit memset
;; calls, writing `2 * n` zero ints (two 32-bit words per element). Any other value
;; is rejected through `dragan-says-ex` with the offending value in the data map.
(extend-type Double
  MemSet
  (memset*
    ([this dptr n]
     (if (= 0.0 this)
       (with-check (cudart/cuMemsetD32 dptr (int 0) (* 2 (long n))) dptr)
       (dragan-says-ex "Only zeroes are suported in double memset! function." {:value this})))
    ([this dptr n hstream]
     (if (= 0.0 this)
       (with-check (cudart/cuMemsetD32Async dptr (int 0) (* 2 (long n)) hstream) dptr)
       (dragan-says-ex "Only zeroes are suported in double memset! function." {:value this})))))
324 | 
;; Long fill values: only 0 is supported. Zeroing is done with the 32-bit memset
;; calls, writing `2 * n` zero ints (two 32-bit words per element). Any other value
;; is rejected through `dragan-says-ex` with the offending value in the data map.
(extend-type Long
  MemSet
  (memset*
    ([this dptr n]
     (if (= 0 this)
       (with-check (cudart/cuMemsetD32 dptr (int 0) (* 2 (long n))) dptr)
       (dragan-says-ex "Only zeroes are suported in long memset! function." {:value this})))
    ([this dptr n hstream]
     (if (= 0 this)
       (with-check (cudart/cuMemsetD32Async dptr (int 0) (* 2 (long n)) hstream) dptr)
       (dragan-says-ex "Only zeroes are suported in long memset! function." {:value this})))))
336 | 
;; Both methods take the destination as the receiver (`dst`), a source `src`, a byte
;; count `size` and, in the 4-argument arity, a stream `hstream` for the asynchronous
;; variant. In the implementations in this file `memcpy-host*` is the variant that
;; issues the host<->device calls (HtoD / DtoH), while `memcpy*` issues the generic
;; cuMemcpy / cudaMemcpy calls.
(defprotocol Memcpy
  "An object that represents memory that participates in CUDA operations.
  It can be on the device, or on the host. Built-in implementations:
  CUDA pointers, JavaCPP pointers, Java primitive arrays, etc.
  "
  (memcpy-host* [dst src size] [dst src size hstream])
  (memcpy* [dst src size] [dst src size hstream]))
344 | 
(defn offset
  "Returns the address of `dptr` advanced by `offset` bytes. `offset` must lie in
  the closed range from 0 to `(bytesize dptr)`; anything else is rejected through
  `dragan-says-ex`, with the requested offset and the size in the data map."
  ^long [dptr ^long offset]
  (let [limit (bytesize dptr)]
    (if (and (<= 0 offset) (<= offset limit))
      (+ (long (cu-address* dptr)) offset)
      (dragan-says-ex "Requested bytes are out of the bounds of this device pointer."
                      {:offset offset :size limit}))))
350 | 
;; Device memory handle. `daddr` is a one-element LongPointer holding the device
;; address, `byte-size` is the size in bytes, and `master` tells whether this object
;; is responsible for freeing the device memory on release.
(deftype CUDevicePtr [^LongPointer daddr ^long byte-size master]
  Object
  (hashCode [_]
    (hash-combine (hash daddr) byte-size))
  ;; Equal to another CUDevicePtr that reports the same device address.
  (equals [_ y]
    (and (instance? CUDevicePtr y) (= (get-entry daddr 0) (cu-address* y))))
  (toString [_]
    (format "#DevicePtr[:cuda, 0x%x, %d bytes]" (get-entry daddr 0) byte-size))
  Releaseable
  ;; Under a lock on `daddr`: if it is not yet null, frees the device memory with
  ;; cuMemFree (only when `master`) and then releases the holder itself.
  ;; Always returns true.
  (release [_]
    (locking daddr
      (when-not (null? daddr)
        (when master
          (with-check (cudart/cuMemFree (get-entry daddr 0)) true))
        (release daddr))
      true))
  Comonad
  (extract [_]
    (extract daddr))
  CUPointer
  ;; The device address is the first (only) entry of `daddr`.
  (cu-address* [_]
    (get-entry daddr 0))
  (device? [_]
    true)
  Bytes
  (bytesize* [_]
    byte-size)
  Entries
  ;; Entries are counted in bytes: size equals the byte size, each entry is 1 byte.
  (size* [_]
    byte-size)
  (sizeof* [_]
    Byte/BYTES)
  Parameter
  ;; Stores the address holder at index `i` of the parameter array `pp`.
  (set-parameter* [_ pp i]
    (put-entry! pp i daddr))
  Memcpy
  ;; Host -> device copy of `byte-count` bytes from `src` into this memory.
  (memcpy-host* [this src byte-count]
    (with-check
      (cudart/cuMemcpyHtoD (get-entry daddr 0) (safe (pointer src)) byte-count)
      this))
  (memcpy-host* [this src byte-count hstream]
    (with-check
      (cudart/cuMemcpyHtoDAsync (get-entry daddr 0) (safe (pointer src)) byte-count hstream)
      this))
  ;; Generic copy from the address reported by `src` into this memory.
  (memcpy* [this src byte-count]
    (with-check
      (cudart/cuMemcpy (get-entry daddr 0) (cu-address* src) byte-count)
      this))
  (memcpy* [this src byte-count hstream]
    (with-check
      (cudart/cuMemcpyAsync (get-entry daddr 0) (cu-address* src) byte-count hstream)
      this)))
403 | 
;; Allocates `size` bytes with cuMemAllocManaged using the raw integer `flag`, and
;; wraps the resulting address in a master CUDevicePtr (one that frees the memory
;; when released).
(defn mem-alloc-managed*
  ([^long size ^long flag]
   (let-release [daddr (long-pointer 1)]
     (with-check (cudart/cuMemAllocManaged daddr size flag)
       (->CUDevicePtr daddr size true)))))
409 | 
410 | ;; =================== Runtime Memory ===============================================
411 | 
;; Shared `memcpy*` implementation for the pointer wrappers below. Copies
;; `byte-count` bytes from `src` to `dst` and returns `dst`.
;; When `src` is a JavaCPP Pointer, the runtime API (cudaMemcpy / cudaMemcpyAsync with
;; cudaMemcpyDefault) is used; otherwise the driver API (cuMemcpy / cuMemcpyAsync) is
;; called with the addresses reported by `cu-address*`.
(defn cupointer-memcpy*
  ([dst src ^long byte-count]
   (with-check
     (if (instance? Pointer src)
       (cudart/cudaMemcpy (safe (pointer dst)) (extract src) byte-count cudart/cudaMemcpyDefault)
       (cudart/cuMemcpy (cu-address* dst) (cu-address* src) byte-count))
     dst))
  ([dst src ^long byte-count hstream]
   (with-check
     (if (instance? Pointer src)
       (cudart/cudaMemcpyAsync (safe (pointer dst)) (extract src)
                               byte-count cudart/cudaMemcpyDefault hstream)
       (cudart/cuMemcpyAsync (cu-address* dst) (cu-address* src) byte-count hstream))
     dst)))
426 | 
(defn offset-address
  "Returns the address of pointer `p` adjusted for its current position:
  `address + sizeof * position`. `p` is passed through `safe` before its
  address is read."
  [^Pointer p]
  (let [base (.address (safe p))
        skipped (* (.sizeof p) (.position p))]
    (+ base skipped)))
429 | 
;; Device memory handle backed by a JavaCPP Pointer `dptr` (allocated through the
;; runtime API, see `malloc-runtime*`). `master` tells whether this object is
;; responsible for freeing the memory on release.
(deftype CURuntimePtr [^Pointer dptr master]
  Object
  (hashCode [_]
    (hash dptr))
  ;; Equal to another CURuntimePtr that wraps an equal pointer. The previous version
  ;; compared `(= dptr other 0)`, a three-way equality that also required both
  ;; pointers to equal the number 0, so it could never be true.
  (equals [_ y]
    (and (instance? CURuntimePtr y) (= dptr (.-dptr ^CURuntimePtr y))))
  (toString [this]
    (format "#RuntimePtr[:cuda, 0x%x, %d bytes]" (cu-address* this) (bytesize dptr)))
  Releaseable
  ;; Under a lock on `dptr`: when not null and `master`, frees the memory with
  ;; cudaFree (pointer rewound to position 0) and nulls the pointer. Returns true.
  (release [_]
    (locking dptr
      (when-not (null? dptr)
        (when master
          (with-check (cudart/cudaFree (.position dptr 0)) (.setNull dptr))))
      true))
  Comonad
  (extract [_]
    (offset-address dptr))
  CUPointer
  ;; Address adjusted for the pointer's current position.
  (cu-address* [_]
    (offset-address dptr))
  (device? [_]
    true)
  PointerCreator
  (pointer* [_]
    dptr)
  (pointer* [_ i]
    (pointer dptr i))
  TypedPointerCreator
  (byte-pointer [_]
    (byte-pointer dptr))
  (clong-pointer [_]
    (clong-pointer dptr))
  (size-t-pointer [_]
    (clong-pointer dptr))
  (pointer-pointer [_]
    (pointer-pointer dptr))
  (char-pointer [_]
    (char-pointer dptr))
  (short-pointer [_]
    (short-pointer dptr))
  (int-pointer [_]
    (int-pointer dptr))
  (long-pointer [_]
    (long-pointer dptr))
  (float-pointer [_]
    (float-pointer dptr))
  (double-pointer [_]
    (double-pointer dptr))
  Bytes
  (bytesize* [_]
    (bytesize dptr))
  Entries
  (size* [_]
    (size* dptr))
  (sizeof* [_]
    (.sizeof dptr))
  Seqable
  (seq [_]
    (pointer-seq dptr))
  Parameter
  ;; Stores a pointer to the position-adjusted address at index `i` of `pp`.
  (set-parameter* [this pp i]
    (put-entry! pp i (pointer (offset-address dptr))))
  Memcpy
  ;; Host -> device copy of `byte-count` bytes from `src` into this memory.
  (memcpy-host* [this src byte-count]
    (with-check
      (cudart/cuMemcpyHtoD (offset-address dptr) (safe (pointer src)) byte-count)
      this))
  (memcpy-host* [this src byte-count hstream]
    (with-check
      (cudart/cuMemcpyHtoDAsync (offset-address dptr) (safe (pointer src)) byte-count hstream)
      this))
  (memcpy* [this src byte-count]
    (cupointer-memcpy* this src byte-count))
  (memcpy* [this src byte-count hstream]
    (cupointer-memcpy* this src byte-count hstream)))
506 | 
;; Allocates `size` bytes with cudaMalloc and wraps the result in a master
;; CURuntimePtr whose capacity is set to `size`. In the 2-argument arity the byte
;; pointer is first converted with the supplied `pointer-type` function.
(defn malloc-runtime*
  ([^long size]
   (let-release [p (byte-pointer nil)]
     (with-check (cudart/cudaMalloc p size)
       (->CURuntimePtr (capacity! p size) true))))
  ([^long size pointer-type]
   (let-release [p (byte-pointer nil)]
     (with-check (cudart/cudaMalloc p size)
       (->CURuntimePtr (pointer-type (capacity! p size)) true)))))
516 | 
517 | ;; =================== Pinned Memory ================================================
518 | 
;; Release function for memory obtained from cuMemHostAlloc (see `mem-host-alloc*`):
;; frees it with cuMemFreeHost and then releases the pointer `hp`.
(defn free-pinned [hp]
  (with-check (cudart/cuMemFreeHost hp) (release hp)))
521 | 
;; Release function for memory pinned with cuMemHostRegister (see
;; `mem-host-register*`): unregisters it with cuMemHostUnregister and returns `hp`
;; without releasing the pointer itself.
(defn unregister-pinned [hp]
  (with-check (cudart/cuMemHostUnregister hp) hp))
524 | 
;; Page-locked ('pinned') host memory handle backed by a JavaCPP Pointer `hptr`.
;; `master` tells whether this object is responsible for the memory, and
;; `release-fn` is the function called on the pointer to give it back
;; (`free-pinned` or `unregister-pinned` in this file).
(deftype CUPinnedPtr [^Pointer hptr master release-fn]
  Object
  (hashCode [_]
    (hash hptr))
  ;; Equal to another CUPinnedPtr that reports the same position-adjusted address.
  (equals [this y]
    (and (instance? CUPinnedPtr y) (= (offset-address hptr) (cu-address* y))))
  (toString [this]
    (format "#PinnedPtr[:cuda, 0x%x, %d bytes]" (offset-address hptr) (bytesize hptr)))
  Releaseable
  ;; Under a lock on `hptr`: when not null and `master`, hands the pointer (rewound
  ;; to position 0) to `release-fn`. Returns true.
  (release [_]
    (locking hptr
      (when-not (null? hptr)
        (when master
          (release-fn (.position hptr 0))))
      true))
  Comonad
  (extract [_]
    (extract hptr))
  CUPointer
  (cu-address* [_]
    (offset-address hptr))
  (device? [_]
    false)
  PointerCreator
  (pointer* [_]
    hptr)
  (pointer* [_ i]
    (pointer hptr i))
  TypedPointerCreator
  (byte-pointer [_]
    (byte-pointer hptr))
  (clong-pointer [_]
    (clong-pointer hptr))
  (size-t-pointer [_]
    (clong-pointer hptr))
  (pointer-pointer [_]
    (pointer-pointer hptr))
  (char-pointer [_]
    (char-pointer hptr))
  (short-pointer [_]
    (short-pointer hptr))
  (int-pointer [_]
    (int-pointer hptr))
  (long-pointer [_]
    (long-pointer hptr))
  (float-pointer [_]
    (float-pointer hptr))
  (double-pointer [_]
    (double-pointer hptr))
  Bytes
  (bytesize* [_]
    (bytesize hptr))
  Entries
  (size* [_]
    (size* hptr))
  (sizeof* [_]
    (.sizeof hptr))
  Seqable
  (seq [_]
    (pointer-seq hptr))
  Accessor
  (get-entry [_]
    (get-entry hptr))
  (get-entry [_ i]
    (get-entry hptr i))
  (put-entry! [this value]
    (put-entry! hptr value)
    this)
  (put-entry! [this i value]
    (put-entry! hptr i value)
    this)
  (get! [_ arr]
    (get! hptr arr)
    arr)
  (get! [_ arr offset length]
    (get! hptr arr offset length)
    arr)
  (put! [this obj]
    (put! hptr obj)
    this)
  (put! [_ obj offset length]
    (put! hptr obj offset length))
  Parameter
  ;; Stores a pointer to the position-adjusted address at index `i` of `pp`.
  (set-parameter* [_ pp i]
    (put-entry! pp i (pointer (offset-address hptr))))
  Memcpy
  ;; Device -> host copy of `byte-count` bytes from `src` into this pinned memory.
  ;; The source address must come from `src`; the previous version passed this
  ;; pointer's own address, which ignored `src` and copied the buffer onto itself.
  (memcpy-host* [this src byte-count]
    (with-check (cudart/cuMemcpyDtoH hptr (cu-address* src) byte-count) this))
  (memcpy-host* [this src byte-count hstream]
    (with-check (cudart/cuMemcpyDtoHAsync hptr (cu-address* src) byte-count hstream) this))
  (memcpy* [this src byte-count]
    (cupointer-memcpy* this src byte-count))
  (memcpy* [this src byte-count hstream]
    (cupointer-memcpy* this src byte-count hstream)))
619 | 
(defn mem-host-alloc*
  "Allocates `byte-size` bytes of page-locked memory, 'pinned' on the host, using raw integer `flags`.
  For available flags, see [[constants/mem-host-alloc-flags]]. The memory is not initialized.
  `byte-size` must be greater than `0`.
  Returns a master `CUPinnedPtr` that is released with `free-pinned`. When `pointer-type`
  is provided, it is applied to the allocated byte pointer before wrapping.
  See [cuMemHostAlloc](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html).
  "
  ([^long byte-size ^long flags]
   (let-release [p (byte-pointer nil)]
     (with-check (cudart/cuMemHostAlloc p byte-size flags)
       (->CUPinnedPtr (capacity! p byte-size) true free-pinned))))
  ([^long byte-size ^long flags pointer-type]
   (let-release [p (byte-pointer nil)]
     (with-check (cudart/cuMemHostAlloc p byte-size flags)
       (->CUPinnedPtr (pointer-type (capacity! p byte-size)) true free-pinned)))))
634 | 
(defn mem-host-register*
  "Registers previously allocated host `Pointer` and pins it, using raw integer `flags`.
  The whole `(bytesize hptr)` range is registered. Returns a master `CUPinnedPtr` that
  is released with `unregister-pinned`. When `pointer-type` is provided, it is applied
  to `hptr` before wrapping.
  See [cuMemHostRegister](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html).
  "
  ([hptr ^long flags]
   (with-check (cudart/cuMemHostRegister hptr (bytesize hptr) flags)
     (->CUPinnedPtr hptr true unregister-pinned)))
  ([hptr ^long flags pointer-type]
   (with-check (cudart/cuMemHostRegister hptr (bytesize hptr) flags)
     ;; `hptr` is an existing allocation that already carries its capacity, so it is
     ;; only retyped here. The previous version called `capacity!` with `size`, a
     ;; name that is not bound to a number in this function.
     (->CUPinnedPtr (pointer-type hptr) true unregister-pinned))))
646 | 
;; Page-locked host memory handle allocated with cuMemAllocHost (see
;; `mem-alloc-host*`), backed by a JavaCPP Pointer `hptr`. `master` tells whether
;; this object is responsible for freeing the memory on release.
(deftype CUMappedPtr [^Pointer hptr master]
  Object
  (hashCode [_]
    (hash hptr))
  ;; Equal to another CUMappedPtr that reports the same position-adjusted address.
  (equals [this y]
    (and (instance? CUMappedPtr y) (= (cu-address* this) (cu-address* y))))
  ;; Labelled "MappedPtr" to agree with this type's `print-method`; the previous
  ;; label was "PinnedPtr", which belongs to CUPinnedPtr.
  (toString [this]
    (format "#MappedPtr[:cuda, 0x%x, %d bytes]" (cu-address* this) (bytesize hptr)))
  Releaseable
  ;; Under a lock on `hptr`: when not null and `master`, frees the memory with
  ;; cuMemFreeHost (pointer rewound to position 0) and releases the pointer.
  ;; Returns true.
  (release [_]
    (locking hptr
      (when-not (null? hptr)
        (when master
          (with-check (cudart/cuMemFreeHost (.position hptr 0))
            (release hptr))))
      true))
  Comonad
  (extract [_]
    (offset-address hptr))
  CUPointer
  (cu-address* [_]
    (offset-address hptr))
  (device? [_]
    false)
  PointerCreator
  (pointer* [_]
    hptr)
  (pointer* [_ i]
    (pointer hptr i))
  TypedPointerCreator
  (byte-pointer [_]
    (byte-pointer hptr))
  (clong-pointer [_]
    (clong-pointer hptr))
  (size-t-pointer [_]
    (clong-pointer hptr))
  (pointer-pointer [_]
    (pointer-pointer hptr))
  (char-pointer [_]
    (char-pointer hptr))
  (short-pointer [_]
    (short-pointer hptr))
  (int-pointer [_]
    (int-pointer hptr))
  (long-pointer [_]
    (long-pointer hptr))
  (float-pointer [_]
    (float-pointer hptr))
  (double-pointer [_]
    (double-pointer hptr))
  Bytes
  (bytesize* [_]
    (bytesize hptr))
  Entries
  (size* [_]
    (size* hptr))
  (sizeof* [_]
    (.sizeof hptr))
  Seqable
  (seq [_]
    (pointer-seq hptr))
  Accessor
  (get-entry [_]
    (get-entry hptr))
  (get-entry [_ i]
    (get-entry hptr i))
  (put-entry! [this value]
    (put-entry! hptr value)
    this)
  (put-entry! [this i value]
    (put-entry! hptr i value)
    this)
  (get! [_ arr]
    (get! hptr arr)
    arr)
  (get! [_ arr offset length]
    (get! hptr arr offset length)
    arr)
  (put! [this obj]
    (put! hptr obj)
    this)
  (put! [_ obj offset length]
    (put! hptr obj offset length))
  Parameter
  ;; Stores a pointer to the position-adjusted address at index `i` of `pp`.
  (set-parameter* [_ pp i]
    (put-entry! pp i (pointer (offset-address hptr))))
  Memcpy
  ;; Copies `byte-count` bytes from `src` into this memory: cuMemcpy when `src` is
  ;; device memory, a plain host memcpy otherwise. The host branch now passes
  ;; `byte-count` on; the previous version dropped it, so the copy length did not
  ;; follow the caller's request.
  (memcpy-host* [this src byte-count]
    (if (device? src)
      (with-check (cudart/cuMemcpy (offset-address hptr) (cu-address* src) byte-count) this)
      (cpp/memcpy! (safe (pointer src)) (extract hptr) byte-count))
    this)
  (memcpy-host* [this src byte-count hstream]
    (with-check
      (if (device? src)
        (cudart/cuMemcpyAsync (offset-address hptr) (cu-address* src) byte-count hstream)
        (cudart/cuMemcpyHtoDAsync (offset-address hptr) (safe (pointer src)) byte-count hstream))
      this))
  (memcpy* [this src byte-count]
    (cupointer-memcpy* this src byte-count))
  (memcpy* [this src byte-count hstream]
    (cupointer-memcpy* this src byte-count hstream)))
749 | 
(defn mem-alloc-host*
  "Allocates `byte-size` bytes of page-locked memory, 'mapped' to the device.
  This function takes no flags (compare `mem-host-alloc*`).
  The memory is not initialized. `byte-size` must be greater than `0`.
  Returns a master `CUMappedPtr`. When `pointer-type` is provided, it is applied
  to the allocated byte pointer before wrapping.
  See [cuMemAllocHost](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html).
  "
  ([^long byte-size]
   (let-release [p (byte-pointer nil)]
     (with-check (cudart/cuMemAllocHost p byte-size)
       (->CUMappedPtr (capacity! p byte-size) true))))
  ([^long byte-size pointer-type]
   (let-release [p (byte-pointer nil)]
     (with-check (cudart/cuMemAllocHost p byte-size)
       (->CUMappedPtr (pointer-type (capacity! p byte-size)) true)))))
764 | 
765 | ;; =============== Host memory  =================================
766 | 
;; Plain JavaCPP Pointers take part in CUDA operations as host memory.
(extend-type Pointer
  CUPointer
  ;; Address adjusted for the pointer's current position.
  (cu-address* [this]
    (offset-address this))
  (device? [_]
    false)
  Parameter
  ;; Stores a pointer to the position-adjusted address at index `i` of `pp`.
  (set-parameter* [parameter pp i]
    (put-entry! pp i (pointer (offset-address parameter))))
  Memcpy
  ;; Copies `byte-count` bytes from `src` into this host pointer. A Pointer source
  ;; goes through cudaMemcpy(dst, src, count, kind); anything else is treated as
  ;; device memory and copied with cuMemcpyDtoH. The count and kind arguments were
  ;; previously passed in swapped order, unlike `memcpy*` below.
  (memcpy-host*
    ([this src byte-count]
     (with-check
       (if (instance? Pointer src)
         (cudart/cudaMemcpy (extract this) (safe (pointer src)) byte-count cudart/cudaMemcpyDefault)
         (cudart/cuMemcpyDtoH (extract this) (cu-address* src) byte-count))
       this))
    ([this src byte-count hstream]
     (with-check
       (if (instance? Pointer src)
         (cudart/cudaMemcpyAsync (extract this) (safe (pointer src))
                                 byte-count cudart/cudaMemcpyDefault hstream)
         (cudart/cuMemcpyDtoHAsync (extract this) (cu-address* src) byte-count hstream))
       this)))
  ;; Generic copy into this pointer: cudaMemcpy for a Pointer source, cuMemcpy on
  ;; raw addresses otherwise.
  (memcpy*
    ([this src byte-count]
     (with-check
       (if (instance? Pointer src)
         (cudart/cudaMemcpy (extract this) (safe (pointer src)) byte-count cudart/cudaMemcpyDefault)
         (cudart/cuMemcpy (offset-address (extract this)) (cu-address* src) byte-count))
       this))
    ([this src byte-count hstream]
     (with-check
       (if (instance? Pointer src)
         (cudart/cudaMemcpyAsync (extract this) (safe (pointer src))
                                 byte-count cudart/cudaMemcpyDefault hstream)
         (cudart/cuMemcpyAsync (offset-address (extract this)) (cu-address* src) byte-count hstream))
       this))))
805 | 
806 | ;; ================== Stream Management ======================================
807 | 
(defn stream*
  "Create a stream using an optional `priority` and an integer `flag`.
  Note the argument order: `priority` comes first in the 2-argument arity, while the
  underlying `cuStreamCreateWithPriority` call receives `flag` before `priority`.
  Returns the new `CUstream_st` handle.
  See [cuStreamCreate](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html)
  "
  ([^long flag]
   (let [res (CUstream_st.)]
     (with-check (cudart/cuStreamCreate res flag) res)))
  ([^long priority ^long flag]
   (let [res (CUstream_st.)]
     (with-check (cudart/cuStreamCreateWithPriority res flag priority) res))))
818 | 
(defn ready*
  "Determines status (ready or not) of a compute stream or event.
  Returns the raw status code of `cuStreamQuery` for a `CUstream_st`, of `cuEventQuery`
  for a `CUevent_st`, and `CUDA_ERROR_NOT_READY` for any other object.
  See [cuStreamQuery](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html),
  and [cuEventQuery](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html)
  "
  [obj]
  ;; Dispatch with `instance?`: `case` does not evaluate its test constants, so
  ;; `(case (class obj) CUstream_st ...)` compared a Class against the *symbol*
  ;; `CUstream_st` and always fell through to the default branch.
  (cond
    (instance? CUstream_st obj) (cudart/cuStreamQuery ^CUstream_st obj)
    (instance? CUevent_st obj) (cudart/cuEventQuery ^CUevent_st obj)
    :else cudart/CUDA_ERROR_NOT_READY))
829 | 
;; Value put on the channel by `StreamCallback`: the decoded `status` of the stream
;; operation and the extracted user `data`.
(defrecord StreamCallbackInfo [status data])
831 | 
;; Function object usable as a stream callback. When invoked with three arguments
;; it ignores the first, looks `status` up in `cu-result-codes` (falling back to the
;; raw code), and asynchronously puts a `StreamCallbackInfo` carrying that status and
;; `(extract data)` onto the core.async channel `ch`.
(deftype StreamCallback [ch]
  IFn
  (invoke [_ _ status data]
    (go (>! ch (->StreamCallbackInfo (get cu-result-codes status status) (extract data)))))
  (applyTo [this xs]
    (AFn/applyToHelper this xs)))
838 | 
;; Builds a one-argument host function that forwards its `data` argument to the
;; channel `ch`. The receiver (`type`) is used only for dispatch; it selects how the
;; data is decoded before being put on the channel.
(defprotocol HostFn
  (host-fn* [type ch]))
841 | 
;; KeywordPointer: the returned function reads `data` as a byte pointer, decodes it
;; with `get-keyword`, and puts the result on `ch` from a go block.
(extend-type KeywordPointer
  HostFn
  (host-fn* [_ ch]
    (fn [data]
      (go (>! ch (get-keyword (byte-pointer data)))))))
847 | 
;; StringPointer: the returned function reads `data` as a byte pointer, decodes it
;; with `get-string`, and puts the result on `ch` from a go block.
(extend-type StringPointer
  HostFn
  (host-fn* [_ ch]
    (fn [data]
      (go (>! ch (get-string (byte-pointer data)))))))
853 | 
;; Any other Pointer: the returned function puts `data` on `ch` unchanged,
;; from a go block.
(extend-type Pointer
  HostFn
  (host-fn* [_ ch]
    (fn [data]
      (go (>! ch data)))))
859 | 
;; Wraps the Clojure function `f` in a `CUHostFn` and enqueues it on `hstream` with
;; cuLaunchHostFunc, passing `data` as its argument. Returns `hstream`.
(defn add-host-fn*
  [^CUstream_st hstream ^IFn f ^Pointer data]
  (let-release [hostfn (CUHostFn. f)]
    (with-check (cudart/cuLaunchHostFunc hstream hostfn data)
      hstream)))
865 | 
(defn attach-mem*
  "Attaches memory `mem` of `byte-size`, specified by an integer `flag`, to a `hstream`
  asynchronously. Returns `hstream`.
  See [cuStreamAttachMemAsync](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html).
  "
  ([^CUstream_st hstream mem byte-size flag]
   (with-check (cudart/cuStreamAttachMemAsync hstream mem byte-size flag) hstream)))
872 | 
873 | ;; ================== Event Management =======================================
874 | 
(defn event*
  "Creates an event specified by integer `flags` and returns the new `CUevent_st` handle.
  See [cuEventCreate](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html)
  "
  [^long flags]
  (let [res (CUevent_st.)]
    (with-check (cudart/cuEventCreate res flags) res)))
882 | 
883 | ;; ================== Peer Context Memory Access =============================
884 | 
(defn can-access-peer*
  "Queries if a device may directly access a peer device's memory.
  Returns `true` when the value reported by the driver is positive.
  See [cuDeviceCanAccessPeer](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html)
  "
  [^long dev ^long peer]
  (with-release [res (int-pointer 1)]
    (with-check (cudart/cuDeviceCanAccessPeer ^IntPointer res dev peer)
      (pos? (int (get-entry res 0))))))
893 | 
(defn p2p-attribute*
  "Queries attributes of the link between two devices.
  Returns `true` when the queried attribute value is positive. The query itself is
  issued through the runtime call `cudaDeviceGetP2PAttribute`.
  See [cuDeviceGetP2PAttribute](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html)
  "
  [^long dev ^long peer ^long attribute]
  ;; `with-release` (as in `can-access-peer*`) so the temporary result holder is
  ;; released once the value has been read, instead of being left for the GC.
  (with-release [res (int-pointer 1)]
    (with-check
      (cudart/cudaDeviceGetP2PAttribute ^IntPointer res attribute dev peer)
      (pos? (int (get-entry res 0))))))
903 | 
904 | ;; ================ print-method ============================================
905 | 
(defn format-pointer
  "Writes the printed representation `#title[:cuda, 0xADDRESS]` of the pointer `p`
  to the writer `w`, where the address is `(address p)` in hexadecimal."
  [title p ^java.io.Writer w]
  (let [^String text (format "#%s[:cuda, 0x%x]" title (address p))]
    (.write w text)))
908 | 
;; Printed representations of the CUDA handle types. CUDevice prints its `str`;
;; every other type is printed by `format-pointer` as `#Title[:cuda, 0xADDRESS]`.
(defmethod print-method CUDevice [p ^java.io.Writer w]
  (.write w (str p)))

(defmethod print-method CUctx_st [p w]
  (format-pointer "Context" p w))

(defmethod print-method CUstream_st [p w]
  (format-pointer "Stream" p w))

(defmethod print-method CUevent_st [p w]
  (format-pointer "Event" p w))

(defmethod print-method CUmod_st [p w]
  (format-pointer "Module" p w))

(defmethod print-method CUlinkState_st [p w]
  (format-pointer "LinkState" p w))

(defmethod print-method _nvrtcProgram [p w]
  (format-pointer "Program" p w))

;; Memory wrapper types defined earlier in this namespace.
(defmethod print-method CUDevicePtr [p w]
  (format-pointer "DevicePtr" p w))

(defmethod print-method CUPinnedPtr [p w]
  (format-pointer "PinnedPtr" p w))

(defmethod print-method CUMappedPtr [p w]
  (format-pointer "MappedPtr" p w))
938 | 


--------------------------------------------------------------------------------
/src/clojure/uncomplicate/clojurecuda/internal/utils.clj:
--------------------------------------------------------------------------------
 1 | ;;   Copyright (c) Dragan Djuric. All rights reserved.
 2 | ;;   The use and distribution terms for this software are covered by the
 3 | ;;   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
 4 | ;;   which can be found in the file LICENSE at the root of this distribution.
 5 | ;;   By using this software in any fashion, you are agreeing to be bound by
 6 | ;;   the terms of this license.
 7 | ;;   You must not remove this notice, or any other, from this software.
 8 | 
 9 | (ns ^{:author "Dragan Djuric"}
10 |     uncomplicate.clojurecuda.internal.utils
11 |   "Utility functions used as helpers in other ClojureCUDA namespaces.
12 |   The user of the ClojureCUDA library would probably not need to use
13 |   any of the functions defined here."
14 |   (:require [uncomplicate.commons.utils :as utils]
15 |             [uncomplicate.clojurecuda.internal.constants :refer [cu-result-codes]])
16 |   (:import clojure.lang.ExceptionInfo))
17 | 
18 | ;; ============= Error Codes ===================================================
19 | 
(defn error
  "Converts a CUDA error code to an [ExceptionInfo](http://clojuredocs.org/clojure.core/ex-info)
  with richer, user-friendly information.
  Accepts a long `err-code` that should be one of the codes defined in CUDA standard, and an
  optional `details` argument that could be anything that you think is informative.
  The exception data holds `:name` (the decoded code, or the raw code when it is unknown),
  `:code`, `:type` (always `:cuda`) and `:details`.

  Examples:
  (error 0) => an ExceptionInfo instance
  (error -5 {:comment \"Why here?\"}) => an ExceptionInfo instance
  "
  ([^long err-code details]
   (let [err-name (get cu-result-codes err-code err-code)
         message (format "CUDA error: %s." err-name)
         data {:name err-name :code err-code :type :cuda :details details}]
     (ex-info message data)))
  ([^long err-code]
   (error err-code nil)))
36 | 
(defmacro with-check
  "Evaluates `form` if `status` is zero (`:success`), otherwise throws
  an appropriate `ExceptionInfo` with decoded informative details.
  It helps with CUDA methods that return error codes directly, while
  returning computation results through side-effects in arguments.
  The 3-argument arity attaches the value of `details` to the exception;
  `details` is evaluated only when the status is non-zero.

  Example:
  (with-check (some-cuda-call-that-returns-error-code) result)
  "
  ([status form]
   `(utils/with-check error ~status ~form))
  ([status details form]
   `(let [status# ~status]
      (if (= 0 status#)
        ~form
        (throw (error status# ~details))))))
53 | 
(defmacro maybe
  "Evaluates form in try/catch block; if a CUDA-related exception is caught
  (an [ExceptionInfo](http://clojuredocs.org/clojure.core/ex-info) whose data has
  `:type` `:cuda`), substitutes the result with the `:name` entry of its data,
  i.e. the decoded CUDA error. Any other `ExceptionInfo` is rethrown."
  [form]
  `(try ~form
        (catch ExceptionInfo ex-info#
          (if (= :cuda (:type (ex-data ex-info#)))
            (:name (ex-data ex-info#))
            (throw ex-info#)))))
63 | 


--------------------------------------------------------------------------------
/src/clojure/uncomplicate/clojurecuda/toolbox.clj:
--------------------------------------------------------------------------------
 1 | ;;   Copyright (c) Dragan Djuric. All rights reserved.
 2 | ;;   The use and distribution terms for this software are covered by the
 3 | ;;   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
 4 | ;;   which can be found in the file LICENSE at the root of this distribution.
 5 | ;;   By using this software in any fashion, you are agreeing to be bound by
 6 | ;;   the terms of this license.
 7 | ;;   You must not remove this notice, or any other, from this software.
 8 | 
 9 | (ns ^{:author "Dragan Djuric"}
10 |     uncomplicate.clojurecuda.toolbox
11 |   "Various helpers that are not needed by ClojureCUDA itself,
12 |   but may be very helpful in applications. See Neanderthal and Bayadera libraries
13 |   for the examples of how to use them."
14 |   (:require [uncomplicate.commons
15 |              [core :refer [with-release]]
16 |              [utils :refer [count-groups]]]
17 |             [uncomplicate.clojure-cpp
18 |              :refer [byte-pointer get-long get-int get-double get-float]]
19 |             [uncomplicate.clojurecuda.core
20 |              :refer [grid-1d grid-2d launch! memcpy-host! parameters set-parameter! set-parameter!]])
21 |   (:import org.bytedeco.javacpp.PointerPointer))
22 | 
(defn launch-reduce!
  "Launches `main-kernel` over `n` (or `m` x `n`) elements on `hstream`, then repeatedly
  launches `reduction-kernel` over the remaining number of groups
  (`(count-groups local-n ...)`) until a single group is left. Returns `hstream`.

  `main-params` and `reduction-params` are either ready-made `PointerPointer` parameter
  arrays, whose leading size entries are overwritten, or sequences of arguments from
  which parameter arrays are built with the sizes prepended.

  Throws `IllegalArgumentException` when `local-n` is not greater than 1 while `n` does
  not fit into a single group, since the number of groups would then never shrink."
  ([hstream main-kernel reduction-kernel main-params reduction-params n local-n]
   ;; Same termination guard as the 2-D arity: without it, local-n = 1 with n > 1
   ;; keeps relaunching the reduction kernel over an unchanged group count.
   (when-not (or (< 1 (long local-n)) (<= (long n) (long local-n)))
     (throw (IllegalArgumentException.
             (format "local-n %d would cause infinite recursion for n:%d." local-n n))))
   (let [main-params (if (instance? PointerPointer main-params)
                       (set-parameter! main-params 0 n)
                       (apply parameters n main-params))
         reduction-params (if (instance? PointerPointer reduction-params)
                            reduction-params
                            (apply parameters Integer/MAX_VALUE reduction-params))]
     (launch! main-kernel (grid-1d n local-n) hstream main-params)
     (loop [global-size (count-groups local-n n)]
       (when (< 1 global-size)
         (launch! reduction-kernel (grid-1d global-size local-n) hstream
                  (set-parameter! reduction-params 0 global-size))
         (recur (count-groups local-n global-size))))
     hstream))
  ([hstream main-kernel reduction-kernel main-params reduction-params m n local-m local-n]
   (let [main-params (if (instance? PointerPointer main-params)
                       (set-parameter! main-params 0 m n)
                       (apply parameters m n main-params))
         reduction-params (if (instance? PointerPointer reduction-params)
                            reduction-params
                            (apply parameters Integer/MAX_VALUE Integer/MAX_VALUE reduction-params))]
     ;; Safe when each step shrinks the group count (local-n > 1) or when everything
     ;; already fits in one group (n <= local-n). The previous test `(<= local-n n)`
     ;; was inverted and let local-n = 1 with n > 1 through.
     (if (or (< 1 (long local-n)) (<= (long n) (long local-n)))
       (loop [hstream (launch! main-kernel (grid-2d m n local-m local-n) hstream main-params)
              global-size (count-groups local-n n)]
         (if (= 1 global-size)
           hstream
           (recur (launch! reduction-kernel (grid-2d m global-size local-m local-n) hstream
                           (set-parameter! reduction-params 0 m global-size))
                  (count-groups local-n global-size))))
       (throw (IllegalArgumentException.
               (format "local-n %d would cause infinite recursion for n:%d." local-n n)))))))
55 | 
(defn read-int
  "Copies `Integer/BYTES` bytes from `cu-buf` into a temporary host buffer with
  `memcpy-host!` (on `hstream` when given) and returns the int stored at index 0.
  The temporary buffer is released before returning.
  NOTE(review): the stream arity reads the buffer right after `memcpy-host!` returns;
  confirm that the copy has completed by then."
  (^long [cu-buf]
   (with-release [host-bytes (byte-pointer Integer/BYTES)]
     (memcpy-host! cu-buf host-bytes)
     (get-int host-bytes 0)))
  (^long [hstream cu-buf]
   (with-release [host-bytes (byte-pointer Integer/BYTES)]
     (memcpy-host! cu-buf host-bytes hstream)
     (get-int host-bytes 0))))
65 | 
(defn read-long
  "Copies `Long/BYTES` bytes from `cu-buf` into a temporary host buffer with
  `memcpy-host!` (on `hstream` when given) and returns the long stored at index 0.
  The temporary buffer is released before returning.
  NOTE(review): the stream arity reads the buffer right after `memcpy-host!` returns;
  confirm that the copy has completed by then."
  (^long [cu-buf]
   (with-release [host-bytes (byte-pointer Long/BYTES)]
     (memcpy-host! cu-buf host-bytes)
     (get-long host-bytes 0)))
  (^long [hstream cu-buf]
   (with-release [host-bytes (byte-pointer Long/BYTES)]
     (memcpy-host! cu-buf host-bytes hstream)
     (get-long host-bytes 0))))
75 | 
(defn read-double
  "Copies `Double/BYTES` bytes from `cu-buf` into a temporary host buffer with
  `memcpy-host!` (on `hstream` when given) and returns the double stored at index 0.
  The temporary buffer is released before returning.
  NOTE(review): the stream arity reads the buffer right after `memcpy-host!` returns;
  confirm that the copy has completed by then."
  (^double [cu-buf]
   (with-release [host-bytes (byte-pointer Double/BYTES)]
     (memcpy-host! cu-buf host-bytes)
     (get-double host-bytes 0)))
  (^double [hstream cu-buf]
   (with-release [host-bytes (byte-pointer Double/BYTES)]
     (memcpy-host! cu-buf host-bytes hstream)
     (get-double host-bytes 0))))
85 | 
(defn read-float
  "Copies `Float/BYTES` bytes from `cu-buf` into a temporary host buffer with
  `memcpy-host!` (on `hstream` when given) and returns the float stored at index 0,
  widened to a double by the return type hint.
  The temporary buffer is released before returning.
  NOTE(review): the stream arity reads the buffer right after `memcpy-host!` returns;
  confirm that the copy has completed by then."
  (^double [cu-buf]
   (with-release [host-bytes (byte-pointer Float/BYTES)]
     (memcpy-host! cu-buf host-bytes)
     (get-float host-bytes 0)))
  (^double [hstream cu-buf]
   (with-release [host-bytes (byte-pointer Float/BYTES)]
     (memcpy-host! cu-buf host-bytes hstream)
     (get-float host-bytes 0))))
95 | 


--------------------------------------------------------------------------------
/src/cuda/uncomplicate/clojurecuda/include/jitify/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2017, NVIDIA Corporation
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/src/cuda/uncomplicate/clojurecuda/include/jitify/float.h:
--------------------------------------------------------------------------------
 1 | #ifndef _float_h_
 2 | #define _float_h_
 3 | 
 4 | inline __host__ __device__ float  jitify_int_as_float(int i) { // returns the float that shares its storage with the int i
 5 |     union FloatInt { float f; int i; } fi;
 6 |     fi.i = i; // write through the int member...
 7 |     return fi.f; // ...and read the same storage back through the float member
 8 | }
 9 | 
10 | inline __host__ __device__ double jitify_longlong_as_double(long long i) { // returns the double that shares its storage with the long long i
11 |     union DoubleLongLong { double f; long long i; } fi;
12 |     fi.i = i; // write through the integer member...
13 |     return fi.f; // ...and read the same storage back through the double member
14 | }
15 | 
16 | #define FLT_RADIX       2
17 | #define FLT_MANT_DIG    24
18 | #define DBL_MANT_DIG    53
19 | #define FLT_DIG         6
20 | #define DBL_DIG         15
21 | #define FLT_MIN_EXP     -125
22 | #define DBL_MIN_EXP     -1021
23 | #define FLT_MIN_10_EXP  -37
24 | #define DBL_MIN_10_EXP  -307
25 | #define FLT_MAX_EXP     128
26 | #define DBL_MAX_EXP     1024
27 | #define FLT_MAX_10_EXP  38
28 | #define DBL_MAX_10_EXP  308
29 | #define FLT_MAX         jitify_int_as_float(2139095039)
30 | #define DBL_MAX         jitify_longlong_as_double(9218868437227405311)
31 | #define FLT_EPSILON     jitify_int_as_float(872415232)
32 | #define DBL_EPSILON     jitify_longlong_as_double(4372995238176751616)
33 | #define FLT_MIN         jitify_int_as_float(8388608)
34 | #define DBL_MIN         jitify_longlong_as_double(4503599627370496)
35 | #define FLT_ROUNDS      1
36 | 
37 | #endif
38 | 


--------------------------------------------------------------------------------
/src/cuda/uncomplicate/clojurecuda/include/jitify/stddef.h:
--------------------------------------------------------------------------------
1 | #ifndef _stddef_h_
2 | #define _stddef_h_
3 | 
4 | typedef unsigned long size_t;
5 | typedef   signed long ptrdiff_t;
6 | 
7 | #endif
8 | 


--------------------------------------------------------------------------------
/src/cuda/uncomplicate/clojurecuda/include/jitify/stdint.h:
--------------------------------------------------------------------------------
 1 | #ifndef _stdint_h_
 2 | #define _stdint_h_
 3 | 
 4 | typedef signed char      int8_t;
 5 | typedef signed short     int16_t;
 6 | typedef signed int       int32_t;
 7 | typedef signed long long int64_t;
 8 | typedef signed char      int_fast8_t;
 9 | typedef signed short     int_fast16_t;
10 | typedef signed int       int_fast32_t;
11 | typedef signed long long int_fast64_t;
12 | typedef signed char      int_least8_t;
13 | typedef signed short     int_least16_t;
14 | typedef signed int       int_least32_t;
15 | typedef signed long long int_least64_t;
16 | typedef signed long long intmax_t;
17 | typedef signed long      intptr_t;
18 | typedef unsigned char      uint8_t;
19 | typedef unsigned short     uint16_t;
20 | typedef unsigned int       uint32_t;
21 | typedef unsigned long long uint64_t;
22 | typedef unsigned char      uint_fast8_t;
23 | typedef unsigned short     uint_fast16_t;
24 | typedef unsigned int       uint_fast32_t;
25 | typedef unsigned long long uint_fast64_t;
26 | typedef unsigned char      uint_least8_t;
27 | typedef unsigned short     uint_least16_t;
28 | typedef unsigned int       uint_least32_t;
29 | typedef unsigned long long uint_least64_t;
30 | typedef unsigned long long uintmax_t;
31 | typedef unsigned long      uintptr_t;
32 | #define INT8_MIN    SCHAR_MIN
33 | #define INT16_MIN   SHRT_MIN
34 | #define INT32_MIN   INT_MIN
35 | #define INT64_MIN   LLONG_MIN
36 | #define INT8_MAX    SCHAR_MAX
37 | #define INT16_MAX   SHRT_MAX
38 | #define INT32_MAX   INT_MAX
39 | #define INT64_MAX   LLONG_MAX
40 | #define UINT8_MAX   UCHAR_MAX
41 | #define UINT16_MAX  USHRT_MAX
42 | #define UINT32_MAX  UINT_MAX
43 | #define UINT64_MAX  ULLONG_MAX
44 | #define INTPTR_MIN  LONG_MIN
45 | #define INTMAX_MIN  LLONG_MIN
46 | #define INTPTR_MAX  LONG_MAX
47 | #define INTMAX_MAX  LLONG_MAX
48 | #define UINTPTR_MAX ULONG_MAX
49 | #define UINTMAX_MAX ULLONG_MAX
50 | #define PTRDIFF_MIN INTPTR_MIN
51 | #define PTRDIFF_MAX INTPTR_MAX
52 | #define SIZE_MAX    UINT64_MAX
53 | 
54 | #endif
55 | 


--------------------------------------------------------------------------------
/src/cuda/uncomplicate/clojurecuda/kernels/reduction.cu:
--------------------------------------------------------------------------------
  1 | extern "C" {
  2 | 
  3 | #ifndef REAL
  4 | #define REAL float
  5 | #endif
  6 |     
  7 | #ifndef ACCUMULATOR
  8 | #define ACCUMULATOR float
  9 | #endif
 10 |     
 11 | #ifndef WGS
 12 | #define WGS 1024
 13 | #endif
 14 | 
 15 | #ifndef WGSm
 16 | #define WGSm 64
 17 | #endif
 18 | 
 19 | #ifndef WGSn
 20 | #define WGSn 16
 21 | #endif
 22 | 
 23 | // ================= Sum reduction =============================================
 24 | 
 25 |     __device__ ACCUMULATOR block_reduction_sum (const ACCUMULATOR value) {
 26 |         // Sums `value` across the threads of a block along x through shared memory and returns lacc[0].
 27 |         const int local_id = threadIdx.x;
 28 |         // One shared slot per threadIdx.x; the array has WGS slots, so blockDim.x must not exceed WGS.
 29 |         __shared__ ACCUMULATOR lacc[WGS];
 30 |         lacc[local_id] = value;
 31 |         // Barrier: every slot must be written before any thread reads a neighbour's slot.
 32 |         __syncthreads();
 33 |         // pacc is this thread's running sum; i is the number of slots still to be combined.
 34 |         ACCUMULATOR pacc = value;
 35 |         int i = blockDim.x;
 36 |         while (i > 0) {
 37 |             const bool include_odd = (i > ((i >> 1) << 1)) && (local_id == ((i >> 1) - 1)); // i is odd and this is the last thread that stays active
 38 |             i >>= 1;
 39 |             if (include_odd) {
 40 |                 pacc += lacc[local_id + i + 1]; // the unpaired last slot of an odd-sized range
 41 |             }
 42 |             if (local_id < i) {
 43 |                 pacc += lacc[local_id + i]; // partner slot in the upper half
 44 |                 lacc[local_id] = pacc;
 45 |             }
 46 |             __syncthreads(); // writes of this pass must land before the next pass reads them
 47 |         }
 48 |         // Slot 0 now holds the combined sum of the blockDim.x slots.
 49 |         return lacc[0];
 50 |     }
 51 | 
 52 |     __device__ ACCUMULATOR block_reduction_sum_2 (const ACCUMULATOR value) {
 53 |         // Sums `value` along the block's y dimension separately for each threadIdx.x; returns lacc[threadIdx.x].
 54 |         const int local_row = threadIdx.x;
 55 |         const int local_col = threadIdx.y;
 56 |         const int local_m = blockDim.x;
 57 |         // Slot index is x + y * blockDim.x; the array has WGS slots, so blockDim.x * blockDim.y must not exceed WGS.
 58 |         __shared__ ACCUMULATOR lacc[WGS];
 59 |         lacc[local_row + local_col * local_m] = value;
 60 |         // Barrier: every slot must be written before any thread reads a neighbour's slot.
 61 |         __syncthreads();
 62 |         // pacc is this thread's running sum; i is the number of y-slots still to be combined.
 63 |         ACCUMULATOR pacc = value;
 64 |         int i = blockDim.y;
 65 |         while (i > 0) {
 66 |             const bool include_odd = (i > ((i >> 1) << 1)) && (local_col == ((i >> 1) - 1)); // i is odd and this is the last y index that stays active
 67 |             i >>= 1;
 68 |             if (include_odd) {
 69 |                 pacc += lacc[local_row + (local_col + i + 1) * local_m]; // the unpaired last slot of an odd-sized range
 70 |             }
 71 |             if (local_col < i) {
 72 |                 pacc += lacc[local_row + (local_col + i) * local_m]; // partner slot in the upper half
 73 |                 lacc[local_row + local_col * local_m] = pacc;
 74 |             }
 75 |             __syncthreads(); // writes of this pass must land before the next pass reads them
 76 |         }
 77 |         // The y == 0 slot of this thread's x position holds the combined sum.
 78 |         return lacc[local_row];
 79 | 
 80 |     }
 81 | 
 82 |     __global__ void sum_reduction(const int n, ACCUMULATOR* acc) { // replaces acc[blockIdx.x] with the sum of this block's slice of acc[0..n)
 83 |         const int gid = blockIdx.x * blockDim.x + threadIdx.x;
 84 |         const ACCUMULATOR sum = block_reduction_sum( (gid < n) ? acc[gid] : 0.0); // threads past n contribute 0 but still take part in the barriers
 85 |         if (threadIdx.x == 0) {
 86 |             acc[blockIdx.x] = sum; // NOTE(review): acc is read and written in place with no inter-block ordering; confirm acc[blockIdx.x] cannot be overwritten before another block reads it
 87 |         }
 88 |     }
 89 | 
 90 |     __global__ void sum_reduction_horizontal (const int m, const int n, ACCUMULATOR* acc) { // per-block sums along y; acc is read at m * y + x
 91 |         const int gid_0 = blockIdx.x * blockDim.x + threadIdx.x;
 92 |         const int gid_1 = blockIdx.y * blockDim.y + threadIdx.y;
 93 |         const int i = m * gid_1 + gid_0;
 94 |         const bool valid = (gid_0 < m) && (gid_1 < n);
 95 |         const ACCUMULATOR sum = block_reduction_sum_2( (valid) ? acc[i] : 0.0); // out-of-range threads contribute 0 but still take part in the barriers
 96 |         const bool write = valid && (threadIdx.y == 0); // one writer per x position in each block
 97 |         if (write) {
 98 |             acc[m * blockIdx.y + gid_0] = sum; // result is stored in place, indexed by block row
 99 |         }
100 |     }
101 | 
102 |     __global__ void sum_reduction_vertical (const int m, const int n, ACCUMULATOR* acc) { // per-block sums along y; acc is read at n * x + y
103 |         const int gid_0 = blockIdx.x * blockDim.x + threadIdx.x;
104 |         const int gid_1 = blockIdx.y * blockDim.y + threadIdx.y;
105 |         const int i = n * gid_0 + gid_1;
106 |         const bool valid = (gid_0 < m) && (gid_1 < n);
107 |         const ACCUMULATOR sum = block_reduction_sum_2( (valid) ? acc[i] : 0.0); // out-of-range threads contribute 0 but still take part in the barriers
108 |         const bool write = valid && (threadIdx.y == 0); // one writer per x position in each block
109 |         if (write) {
110 |             acc[m * blockIdx.y + gid_0] = sum; // output uses the m * blockIdx.y + x layout, unlike the n * x + y read above
111 |         }
112 |     }
113 | 
114 | }
115 | 


--------------------------------------------------------------------------------
/src/java/uncomplicate/clojurecuda/internal/javacpp/CUHostFn.java:
--------------------------------------------------------------------------------
 1 | //   Copyright (c) Dragan Djuric. All rights reserved.
 2 | //   The use and distribution terms for this software are covered by the
 3 | //   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
 4 | //   which can be found in the file LICENSE at the root of this distribution.
 5 | //   By using this software in any fashion, you are agreeing to be bound by
 6 | //   the terms of this license.
 7 | //   You must not remove this notice, or any other, from this software.
 8 | 
 9 | package uncomplicate.clojurecuda.internal.javacpp;
10 | 
11 | import clojure.lang.IFn;
12 | import org.bytedeco.javacpp.Pointer;
13 | import org.bytedeco.cuda.cudart.CUhostFn;
14 | import org.bytedeco.cuda.cudart.CUstream_st;
15 | 
16 | 
17 | public class CUHostFn extends CUhostFn {
18 |     // Adapter that routes CUhostFn.call to a Clojure IFn.
19 |     private final IFn handler;
20 |     // Keeps the function that call() will invoke.
21 |     public CUHostFn (IFn fun) {
22 |         this.handler = fun;
23 |     }
24 |     // Hands userData to the wrapped function; whatever it returns is discarded.
25 |     public void call (Pointer userData) {
26 |         handler.invoke(userData);
27 |     }
28 | 
29 | }
30 | 


--------------------------------------------------------------------------------
/src/java/uncomplicate/clojurecuda/internal/javacpp/CUStreamCallback.java:
--------------------------------------------------------------------------------
 1 | //   Copyright (c) Dragan Djuric. All rights reserved.
 2 | //   The use and distribution terms for this software are covered by the
 3 | //   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
 4 | //   which can be found in the file LICENSE at the root of this distribution.
 5 | //   By using this software in any fashion, you are agreeing to be bound by
 6 | //   the terms of this license.
 7 | //   You must not remove this notice, or any other, from this software.
 8 | 
 9 | package uncomplicate.clojurecuda.internal.javacpp;
10 | 
11 | import clojure.lang.IFn;
12 | import org.bytedeco.javacpp.Pointer;
13 | import org.bytedeco.cuda.cudart.CUstreamCallback;
14 | import org.bytedeco.cuda.cudart.CUstream_st;
15 | 
16 | 
17 | public class CUStreamCallback extends CUstreamCallback {
18 |     // Adapter that routes CUstreamCallback.call to a Clojure IFn.
19 |     private final IFn handler;
20 |     // Keeps the function that call() will invoke.
21 |     public CUStreamCallback (IFn fun) {
22 |         this.handler = fun;
23 |     }
24 |     // Hands hstream, status and userData to the wrapped function; whatever it returns is discarded.
25 |     public void call (CUstream_st hstream, int status, Pointer userData) {
26 |         handler.invoke(hstream, status, userData);
27 |     }
28 | 
29 | }
30 | 


--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/core_test.clj:
--------------------------------------------------------------------------------
  1 | ;;   Copyright (c) Dragan Djuric. All rights reserved.
  2 | ;;   The use and distribution terms for this software are covered by the
  3 | ;;   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
  4 | ;;   which can be found in the file LICENSE at the root of this distribution.
  5 | ;;   By using this software in any fashion, you are agreeing to be bound by
  6 | ;;   the terms of this license.
  7 | ;;   You must not remove this notice, or any other, from this software.
  8 | 
  9 | (ns uncomplicate.clojurecuda.core-test
 10 |   (:require [midje.sweet :refer [facts => throws truthy]]
 11 |             [clojure.core.async :refer [chan <!!]]
 12 |             [uncomplicate.commons.core :refer [release with-release size bytesize let-release]]
 13 |             [uncomplicate.clojure-cpp :as cpp
 14 |              :refer [pointer float-pointer byte-pointer get-entry get-float put-float! int-pointer
 15 |                      long-pointer put-int! pointer-seq put-entry! fill! ptr address position!]]
 16 |             [uncomplicate.clojurecuda
 17 |              [core :refer :all]
 18 |              [info :as info :refer [pci-bus-id-string]]]
 19 |             [uncomplicate.clojurecuda.internal.impl :refer [cu-address*]])
 20 |   (:import clojure.lang.ExceptionInfo))
 21 | 
 22 | ;; ================== Driver tests ======================================================
 23 | 
 24 | (facts
 25 |  "Driver tests."
 26 |  (init) => true)
 27 | 
 28 | (facts
 29 |  "Device tests."
 30 |  (<= 0 (device-count)) => true
 31 |  (device 0) => truthy
 32 |  (device -1) => (throws ExceptionInfo)
 33 |  (device 33) => (throws ExceptionInfo)
 34 |  (device (pci-bus-id-string (device))) => (device))
 35 | 
 36 | ;; ===================== Context Management Tests =======================================
 37 | 
 38 | (facts
 39 |  "Context tests"
 40 |  (with-release [dev (device 0)]
 41 |    (let [ctx (context dev :sched-auto)]
 42 |      ctx => truthy
 43 |      (release ctx) => true
 44 |      (context dev :unknown) => (throws ExceptionInfo))
 45 |    (let [ctx1 (context dev :sched-blocking-sync)
 46 |          ctx2 (context dev :sched-blocking-sync)]
 47 |      (with-context ctx1
 48 |        (with-context ctx2
 49 |          (current-context) => ctx2
 50 |          (do (pop-context!) (current-context)) => ctx1
 51 |          (current-context! ctx2) => ctx2
 52 |          (current-context) => ctx2
 53 |          (release ctx2) => true
 54 |          (release ctx2) => true)))))
 55 | 
 56 | ;; =============== Module Management & Execution Control Tests =====================================
 57 | 
 58 | (facts
 59 |  "Test Parameters"
 60 |  (with-context (context (device))
 61 |    (with-release [cnt 3
 62 |                   extra 4
 63 |                   gpu-a (mem-alloc-runtime (* Float/BYTES (+ cnt extra)))
 64 |                   params (parameters cnt gpu-a)]
 65 |      (size params) => 2
 66 |      (get-entry (int-pointer (get-entry params 0))) => 3
 67 |      (get-entry (long-pointer (get-entry params 1))) => (cu-address* gpu-a)
 68 |      (address (pointer gpu-a)) => (cu-address* gpu-a)
 69 |      (address (pointer gpu-a 1)) => (inc (long (cu-address* gpu-a)))
 70 |      (address (position! (pointer gpu-a) 1)) => (cu-address* gpu-a))))
 71 | 
 72 | (let [program-source (slurp "test/cuda/uncomplicate/clojurecuda/kernels/test.cu")
 73 |       cnt 300
 74 |       extra 5]
 75 |   (with-context (context (device))
 76 |     (with-release [prog (compile! (program program-source {"dummy" "placeholder"}))
 77 |                    grid (grid-1d cnt (min 256 cnt))]
 78 |       (with-release [modl (module prog)
 79 |                      fun (function modl "inc")
 80 |                      strm (stream :non-blocking)
 81 |                      host-a (float-pointer (+ cnt extra))
 82 |                      gpu-a (mem-alloc-runtime (* Float/BYTES (+ cnt extra)))]
 83 | 
 84 |         (facts
 85 |          "Test launch"
 86 |          (fill! host-a 0)
 87 |          (put-entry! host-a 0 1)
 88 |          (put-entry! host-a 10 100)
 89 |          (memcpy-host! host-a gpu-a strm) => gpu-a
 90 |          (launch! fun grid strm (parameters (int cnt) gpu-a)) => strm
 91 |          (synchronize! strm) => strm
 92 |          (memcpy-host! gpu-a host-a strm) => host-a
 93 |          (get-entry host-a 0) => 2.0
 94 |          (get-entry host-a 10) => 101.0
 95 |          (get-entry host-a (dec cnt)) => 1.0
 96 |          (get-entry host-a cnt) => 0.0
 97 |          (get-entry host-a (dec (+ cnt extra))) => 0.0))
 98 | 
 99 |       (with-release [modl (module)]
100 |         (facts
101 |          "Test device globals"
102 |          (load! modl prog) => modl
103 |          (with-release [fun (function modl "constant_inc")
104 |                         gpu-a (global modl "gpu_a")
105 |                         constant-gpu-a (global modl "constant_gpu_a")]
106 |            (pointer-seq (memcpy-host! gpu-a (float-pointer 3))) => (seq [1.0 2.0 3.0])
107 |            (memcpy! gpu-a constant-gpu-a) => constant-gpu-a
108 |            (launch! fun (grid-1d 3) (parameters 3 gpu-a))
109 |            (pointer-seq (memcpy-host! constant-gpu-a (float-pointer 3))) => (seq [1.0 2.0 3.0])
110 |            (pointer-seq (memcpy-host! gpu-a (float-pointer 3))) => (seq [2.0 4.0 6.0])))))))
111 | 
112 | ;; =============== Stream Management Tests ==============================================
113 | 
114 | (with-context (context (device 0) :map-host)
115 | 
116 |   (facts
117 |    "Stream creation and memory copy tests."
118 |    (with-release [strm (stream :non-blocking)
119 |                   cuda1 (mem-alloc-runtime Float/BYTES)
120 |                   cuda2 (mem-alloc-runtime Float/BYTES)
121 |                   host1 (float-array [173.0])
122 |                   host2 (byte-pointer Float/BYTES)]
123 |      (memcpy-host! host1 cuda1 strm) => cuda1
124 |      (synchronize! strm)
125 |      (memcpy! cuda1 cuda2) => cuda2
126 |      (memcpy-host! cuda2 host2 strm) => host2
127 |      (synchronize! strm)
128 |      (get-float host2 0) => 173.0))
129 | 
130 |   (facts
131 |    "Stream and memory release."
132 |    (with-release [strm (stream :non-blocking)
133 |                   cuda (mem-alloc-runtime Float/BYTES)]
134 |      (release strm) => true
135 |      (release strm) => true
136 |      (release cuda) => true
137 |      (memcpy! cuda cuda) => (throws IllegalArgumentException)
138 |      (release cuda) => true)))
139 | 
140 | (with-context (context (device 0) :map-host)
141 |   (facts
142 |    "Host functions."
143 |    (let [ch (chan)]
144 |      (with-release [strm (stream :non-blocking)
145 |                     cuda1 (mem-alloc-runtime Float/BYTES)
146 |                     cuda2 (mem-alloc-runtime Float/BYTES)
147 |                     host1 (float-array [163.0])
148 |                     host2 (float-pointer [12])
149 |                     ch (chan)]
150 |        (listen! strm ch :host)
151 |        (memcpy-host! host1 cuda1 strm) => cuda1
152 |        (memcpy! cuda1 cuda2 strm) => cuda2
153 |        (synchronize! strm)
154 |        (memcpy-host! cuda2 (float-array 1) strm) => (throws Exception)
155 |        (get-entry (memcpy-host! cuda2 host2 strm) 0) => 163.0
156 |        (<!! ch) => :host))))
157 | 
158 | ;; =============== Memory Management Tests ==============================================
159 | 
160 | (with-release [dev (device 0)]
161 |   (with-context (context dev :map-host)
162 | 
163 |     (facts
164 |      "mem-alloc-runtime tests."
165 |      (mem-alloc-driver 0) => (throws ExceptionInfo)
166 |      (with-release [buf (mem-alloc-runtime Float/BYTES)]
167 |        (bytesize buf) => Float/BYTES))
168 | 
169 |     (facts
170 |      "Linear memory tests."
171 |      (with-release [cuda1 (mem-alloc-runtime Float/BYTES)
172 |                     cuda2 (mem-alloc-runtime Float/BYTES)
173 |                     host1 (float-array [173.0])
174 |                     host2 (byte-pointer Float/BYTES)]
175 |        (memcpy-host! host1 cuda1) => cuda1
176 |        (memcpy! cuda1 cuda2) => cuda2
177 |        (memcpy-host! cuda2 host2) => host2
178 |        (get-float host2 0) => 173.0))
179 | 
180 |     (facts
181 |      "Linear memory sub-region tests."
182 |      (with-release [cuda (mem-alloc-runtime 20)]
183 |        (memcpy-host! (float-array [1 2 3 4 5]) cuda) => cuda
184 |        (let-release [cuda1 (mem-sub-region cuda 0 8)
185 |                      cuda2 (mem-sub-region cuda 8 12)]
186 |          (mem-sub-region cuda 8 20) => (throws ExceptionInfo)
187 |          (pointer-seq (memcpy-host! cuda1 (float-pointer 2))) => [1.0 2.0]
188 |          (pointer-seq (memcpy-host! cuda2 (float-pointer 3))) => [3.0 4.0 5.0]
189 |          (do (release cuda1)
190 |              (release cuda2)
191 |              (pointer-seq (memcpy-host! cuda (float-pointer 5))) => [1.0 2.0 3.0 4.0 5.0]))))
192 | 
193 |     (facts
194 |      "Runtime cudaMalloc tests."
195 |      (with-release [cuda1 (mem-alloc-runtime Float/BYTES :float)
196 |                     cuda2 (mem-alloc-runtime (* 3 Float/BYTES) :float)
197 |                     host1 (float-pointer [100.0])
198 |                     host2 (mem-alloc-mapped Float/BYTES :float)
199 |                     zero (mem-alloc-runtime 0)]
200 |        zero => truthy
201 |        (bytesize cuda1) => Float/BYTES
202 |        (memcpy-host! host1 cuda1) => cuda1
203 |        (synchronize!)
204 |        (pointer-seq (memcpy-host! cuda1 (float-pointer 1))) => [100.0]
205 |        (seq (memcpy! cuda1 host2)) => [100.0]
206 |        (position! (pointer cuda2) 2)
207 |        (.position (pointer cuda2)) => 2
208 |        (memcpy! cuda1 cuda2) => cuda2
209 |        (position! (pointer cuda2) 0)
210 |        (memcpy-host! (float-pointer [200.0 300.0]) cuda2) => cuda2
211 |        (pointer-seq (memcpy-host! cuda2 (float-pointer 3))) => [200.0 300.0 100.0]))
212 | 
213 |     (facts
214 |      "Pinned memory tests."
215 |      (with-release [pinned-host (mem-alloc-pinned Float/BYTES :float :devicemap)
216 |                     cuda1 (mem-alloc-runtime Float/BYTES)]
217 |        (mem-alloc-pinned Float/BYTES :unknown) => (throws ExceptionInfo)
218 |        (bytesize pinned-host) => Float/BYTES
219 |        (put-entry! pinned-host 0 13)
220 |        (memcpy-host! pinned-host cuda1) => cuda1
221 |        (put-entry! pinned-host 0 11)
222 |        (memcpy! cuda1 pinned-host) => pinned-host
223 |        (synchronize!)
224 |        (get-entry pinned-host 0) => 13.0
225 |        (pointer-seq (memcpy-host! cuda1 (float-pointer 1))) => [13.0]))
226 | 
227 |     (facts
228 |      "Mapped memory tests."
229 |      (with-release [mapped-host (mem-alloc-mapped Float/BYTES :float)
230 |                     cuda1 (mem-alloc-runtime Float/BYTES)
231 |                     mapped-host2 (mem-alloc-mapped Float/BYTES :float)]
232 |        (bytesize mapped-host) => Float/BYTES
233 |        (put-entry! mapped-host 0 14.0)
234 |        (memcpy-host! mapped-host cuda1) => cuda1
235 |        (get-entry (memcpy-host! cuda1 (float-pointer 1)) 0) => 14.0
236 |        (get-entry (memcpy! cuda1 mapped-host2)) => 14.0
237 |        (synchronize!)
238 |        (seq mapped-host2) => [14.0]))
239 | 
240 |     (facts
241 |      "CUDA Raw Runtime Pointer tests."
242 |      (with-release [host1 (float-pointer [1 2 3 4])
243 |                     cuda1 (cuda-malloc (* 4 Float/BYTES) :float)
244 |                     cuda2 (cuda-malloc (* 3 Float/BYTES) :float)
245 |                     host2 (float-pointer 4)
246 |                     host3 (float-pointer 3)]
247 |        (memcpy-to-device! host1 cuda1) => cuda1
248 |        (memcpy! (ptr cuda1 2) (ptr cuda2 1))
249 |        (synchronize!)
250 |        (pointer-seq (memcpy-to-host! cuda1 host2)) => [1.0 2.0 3.0 4.0]
251 |        (pointer-seq (memcpy-to-host! cuda2 host3)) => [0.0 3.0 4.0]
252 |        (cuda-free! cuda1) => cuda1
253 |        (cuda-free! cuda1) => cuda1
254 |        (cuda-free! cuda2) => cuda2))
255 | 
256 |     (facts
257 |      "CUDA Raw Runtime Pointer arithmetic tests."
258 |      (with-release [host1 (float-pointer [1 2 3 4])
259 |                     cuda1 (cuda-malloc (* 4 Float/BYTES) :float)]
260 |        (memcpy-to-device! host1 cuda1) => cuda1
261 |        (pointer cuda1) => cuda1
262 |        (pointer cuda1 0) => cuda1
263 |        (size (pointer cuda1 1)) => (dec (size cuda1))
264 |        (bytesize (pointer cuda1 1)) => (- (bytesize cuda1) Float/BYTES)
265 |        (size (ptr cuda1 1)) => (dec (size cuda1))
266 |        (bytesize (ptr cuda1 1)) => (- (bytesize cuda1) Float/BYTES)
267 |        (cuda-free! cuda1) => cuda1))
268 | 
269 |     (facts
270 |       "cuda-malloc memset tests."
271 |       (with-release [cuda1 (cuda-malloc (* 2 Integer/BYTES) :int)]
272 |         (memcpy-to-device! (int-pointer [124 134]) cuda1) => cuda1
273 |         (pointer-seq (memcpy-to-host! cuda1 (int-pointer 2))) => [124 134]
274 |         (position! (pointer cuda1) 1)
275 |         (memset! cuda1 (int 100) 1)
276 |         (position! (pointer cuda1) 0)
277 |         (pointer-seq (memcpy-to-host! cuda1 (int-pointer 2))) => [124 100]
278 |         (memset! cuda1 (int 200) 1)
279 |         (pointer-seq (memcpy-to-host! cuda1 (int-pointer 2))) => [200 100]))
280 | 
281 |     (facts
282 |       "cuda-alloc-runtime memset tests."
283 |       (with-release [cuda1 (mem-alloc-runtime (* 2 Integer/BYTES) :int)]
284 |         (memcpy-host! (int-pointer [124 134]) cuda1) => cuda1
285 |         (pointer-seq (memcpy-host! cuda1 (int-pointer 2))) => [124 134]
286 |         (position! (pointer cuda1) 1)
287 |         (memset! cuda1 (int 100) 1)
288 |         (position! (pointer cuda1) 0)
289 |         (pointer-seq (memcpy-host! cuda1 (int-pointer 2))) => [124 100]
290 |         (memset! cuda1 (int 200) 1)
291 |         (pointer-seq (memcpy-host! cuda1 (int-pointer 2))) => [200 100]))
292 | 
293 |     (when (and (info/managed-memory dev) (info/concurrent-managed-access dev))
294 |       (facts
295 |        "mem-alloc-driver tests."
296 |        (with-release [host0 (float-pointer [15])
297 |                       host1 (float-pointer 1)
298 |                       cuda0 (mem-alloc-driver Float/BYTES :host)
299 |                       cuda1 (mem-alloc-driver Float/BYTES :global)]
300 | 
301 |          (bytesize cuda0) => Float/BYTES
302 |          (mem-alloc-driver Float/BYTES :unknown) => (throws ExceptionInfo)
303 |          (memcpy-host! host0 cuda0) => cuda0
304 |          (memcpy! cuda0 cuda1) => cuda1
305 |          (memcpy-host! cuda1 host1) => host1
306 |          (get-entry host1 0) => 15.0)))
307 | 
308 |     (when (info/managed-memory dev)
309 |       (facts
310 |        "mem-alloc-driver with globally shared attached memory tests."
311 |        (with-release [host0 (float-pointer [16])
312 |                       host1 (float-pointer 1)
313 |                       cuda0 (mem-alloc-driver Float/BYTES :host)
314 |                       cuda1 (mem-alloc-driver Float/BYTES :global)]
315 |          (attach-mem! nil cuda0 Float/BYTES :global) => nil
316 |          (bytesize cuda0) => Float/BYTES
317 |          (memcpy-host! host0 cuda0) => cuda0
318 |          (memcpy! cuda0 cuda1) => cuda1
319 |          (memcpy-host! cuda1 host1) => host1
320 |          (get-entry host1 0) => 16.0))
321 |       (facts
322 |        "mem-alloc-driver with attached memory tests."
323 |        (with-release [host0 (float-pointer [17])
324 |                       host1 (float-pointer 1)
325 |                       cuda0 (mem-alloc-driver Float/BYTES :host)
326 |                       cuda1 (mem-alloc-driver Float/BYTES :global)]
327 |          (let [hstream (attach-mem! cuda0 Float/BYTES :single)]
328 |            (bytesize cuda0) => Float/BYTES
329 |            (if (info/concurrent-managed-access dev)
330 |              (memcpy-host! host0 cuda0) => cuda0
331 |              (memcpy-host! host0 cuda0) => (throws ExceptionInfo))
332 |            (memcpy-host! host0 cuda0 hstream) => cuda0
333 |            (memcpy! cuda0 cuda1 hstream) => cuda1
334 |            (memcpy-host! cuda1 host1 hstream) => host1
335 |            (synchronize! hstream)
336 |            (get-entry host1 0) => 17.0))))
337 | 
338 |     (facts
339 |      "mem-alloc-registered tests."
340 |      (with-release [host0 (byte-pointer Float/BYTES)
341 |                     host1 (byte-pointer Float/BYTES)
342 |                     cuda0 (mem-register-pinned! host0)
343 |                     cuda1 (mem-register-pinned! host1)]
344 | 
345 |        (bytesize cuda0) => Float/BYTES
346 |        (put-float! host0 0 44.0)
347 |        (memcpy! cuda0 cuda1) => cuda1
348 |        (get-float host1 0) => 44.0))))
349 | 
350 | ;; ================= Peer Access Management Tests =====================================
351 | 
352 | (facts
353 |  "Peer access tests (requires 2 devices)."
354 |  (let [num-dev (device-count)
355 |        devices (mapv device (range num-dev))
356 |        combinations (set (for [x (range num-dev) y (range num-dev) :when (not= x y)] #{x y})) ; unordered pairs of distinct device indices
357 |        p2p? (fn [num-pair] (let [[a b] (vec num-pair)
358 |                                  dev-a (nth devices a)
359 |                                  dev-b (nth devices b)]
360 |                              (when (and (p2p-attribute dev-a dev-b :access-supported)
361 |                                         (can-access-peer dev-a dev-b)
362 |                                         (can-access-peer dev-b dev-a))
363 |                                [dev-a dev-b])))] ; returns the device pair, or nil when peer access is unavailable
364 |    (if-let [[dev-a dev-b] (some p2p? combinations)]
365 |      (let [program-source (slurp "test/cuda/examples/jnvrtc-vector-add.cu") ; path must match the kernel file shipped under test/cuda/examples
366 |            ^:const vctr-len 3]
367 |        (with-release [host-a (float-array [1 2 3])
368 |                       host-b (float-array [2 3 4])
369 |                       host-sum (float-array vctr-len)
370 |                       ctx (context dev-a)
371 |                       peer-ctx (context dev-b)]
372 |          (in-context ctx
373 |                      (with-release [prog (compile! (program program-source))
374 |                                     m (module prog)
375 |                                     vector-add (function m "add")
376 |                                     gpu-a (mem-alloc-runtime (* Float/BYTES vctr-len))
377 |                                     gpu-a-sum (mem-alloc-runtime (* Float/BYTES vctr-len))
378 |                                     gpu-b (in-context peer-ctx (mem-alloc-runtime (* Float/BYTES vctr-len)))]
379 |                        (disable-peer-access! peer-ctx) => (throws ExceptionInfo)
380 |                        (in-context peer-ctx (disable-peer-access! ctx) => (throws ExceptionInfo))
381 |                        (memcpy-host! host-a gpu-a) => gpu-a
382 |                        (in-context peer-ctx (memcpy-host! host-b gpu-b) => gpu-b)
383 |                        (enable-peer-access! peer-ctx) => peer-ctx
384 |                        (in-context peer-ctx (enable-peer-access! ctx) => ctx)
385 |                        (launch! vector-add (grid-1d vctr-len) (parameters vctr-len gpu-a gpu-b gpu-a-sum))
386 |                        (synchronize!)
387 |                        (memcpy-host! gpu-a-sum host-sum) => (seq [3.0 5.0 7.0])
388 |                        (disable-peer-access! peer-ctx) => peer-ctx
389 |                        (in-context peer-ctx (disable-peer-access! ctx) => ctx))))
390 |        (when-let [dev (first devices)] ; fallback when no peer-capable pair exists: check a device against itself
391 |          (p2p-attribute dev dev :access-supported) => (throws ExceptionInfo)
392 |          (can-access-peer dev dev) => false)))))
393 | 
394 | (facts
395 |  "Runtime API Pointer kernel launch test"
396 |  (let [gpu (device 0)
397 |        kernel-source (slurp "test/cuda/examples/jnvrtc-vector-add.cu")
398 |        ^:const n 3]
399 |    (with-release [host-a (float-pointer [1 2 3])
400 |                   host-b (float-pointer [2 3 4])
401 |                   host-sum (float-pointer n)
402 |                   ctx (context gpu)]
403 |      (in-context ctx
404 |                  (with-release [compiled (compile! (program kernel-source))
405 |                                 modl (module compiled)
406 |                                 add-kernel (function modl "add")
407 |                                 gpu-a (mem-alloc-runtime (* n Float/BYTES))
408 |                                 gpu-a-sum (mem-alloc-runtime (* n Float/BYTES))
409 |                                 gpu-b (mem-alloc-runtime (* n Float/BYTES))]
410 |                    (memcpy-host! host-a gpu-a) => gpu-a
411 |                    (memcpy-host! host-b gpu-b) => gpu-b
412 |                    (launch! add-kernel (grid-1d n) (parameters n gpu-a gpu-b gpu-a-sum))
413 |                    (synchronize!) ;; launch, synchronize, then copy the sums back into host-sum
414 |                    (pointer-seq (memcpy! gpu-a-sum host-sum)) => (seq [3.0 5.0 7.0]))))))
415 | 


--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/examples/dynamic_parallelism_test.clj:
--------------------------------------------------------------------------------
 1 | ;;   Copyright (c) Dragan Djuric. All rights reserved.
 2 | ;;   The use and distribution terms for this software are covered by the
 3 | ;;   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
 4 | ;;   which can be found in the file LICENSE at the root of this distribution.
 5 | ;;   By using this software in any fashion, you are agreeing to be bound by
 6 | ;;   the terms of this license.
 7 | ;;   You must not remove this notice, or any other, from this software.
 8 | 
 9 | (ns uncomplicate.clojurecuda.examples.dynamic-parallelism-test
10 |   (:require [midje.sweet :refer [facts =>]]
11 |             [clojure.java.io :refer [file]]
12 |             [uncomplicate.commons.core :refer [with-release]]
13 |             [uncomplicate.clojure-cpp :refer [float-pointer pointer-seq]]
14 |             [uncomplicate.clojurecuda.core
15 |              :refer [compile! context device function grid-1d init launch! link link-complete!
16 |                      mem-alloc-runtime memcpy-host! module parameters program with-context]]))
17 | 
18 | (init)
19 | 
20 | (let [kernel-source (slurp "test/cuda/examples/dynamic-parallelism.cu")
21 |       parent-threads 8
22 |       child-threads 8
23 |       n (* parent-threads child-threads)]
24 |   (with-context (context (device))
25 |     (with-release [compiled (compile! (program kernel-source)
26 |                                       ["--relocatable-device-code=true" "-default-device"])
27 |                    linked (link [[:library (file "/opt/cuda/lib64/libcudadevrt.a")]
28 |                                  [:ptx compiled]])
29 |                    modl (module (link-complete! linked))
30 |                    parent-kernel (function modl "parentKernel")
31 |                    gpu-data (mem-alloc-runtime (* n Float/BYTES))]
32 |       (facts
33 |        "Dynamic parallelism JCuda example." ;; expected value = parent index + 0.1 * child index
34 |        (memcpy-host! (float-pointer n) gpu-data)
35 |        (launch! parent-kernel (grid-1d (+ n n -1) parent-threads)
36 |                 (parameters n gpu-data))
37 |        (pointer-seq (memcpy-host! gpu-data (float-pointer n)))
38 |        => (map float [0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7
39 |                       1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7
40 |                       2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7
41 |                       3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7
42 |                       4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7
43 |                       5.0 5.1 5.2 5.3 5.4 5.5 5.6 5.7
44 |                       6.0 6.1 6.2 6.3 6.4 6.5 6.6 6.7
45 |                       7.0 7.1 7.2 7.3 7.4 7.5 7.6 7.7])))))
46 | 


--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/examples/vector_add_test.clj:
--------------------------------------------------------------------------------
 1 | ;;   Copyright (c) Dragan Djuric. All rights reserved.
 2 | ;;   The use and distribution terms for this software are covered by the
 3 | ;;   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
 4 | ;;   which can be found in the file LICENSE at the root of this distribution.
 5 | ;;   By using this software in any fashion, you are agreeing to be bound by
 6 | ;;   the terms of this license.
 7 | ;;   You must not remove this notice, or any other, from this software.
 8 | 
 9 | (ns uncomplicate.clojurecuda.examples.vector-add-test
10 |   (:require [midje.sweet :refer [facts =>]]
11 |             [uncomplicate.commons.core :refer [with-release size]]
12 |             [uncomplicate.clojure-cpp :refer [float-pointer pointer-seq]]
13 |             [uncomplicate.clojurecuda.core
14 |              :refer [compile! context device function grid-1d init launch! mem-alloc-driver
15 |                      mem-alloc-pinned mem-alloc-runtime memcpy-host! module parameters program
16 |                      synchronize! with-context]]))
17 | 
18 | (init)
19 | 
20 | (let [kernel-source (slurp "test/cuda/examples/jnvrtc-vector-add.cu")]
21 |   (with-context (context (device))
22 |     (with-release [compiled (compile! (program kernel-source))
23 |                    modl (module compiled)
24 |                    add-kernel (function modl "add")
25 |                    host-a (float-pointer [1 2 3])
26 |                    host-b (float-pointer [2 3 4])
27 |                    host-sum (float-pointer 3)
28 |                    gpu-a (mem-alloc-runtime (* 3 Float/BYTES))
29 |                    gpu-b (mem-alloc-driver (* 3 Float/BYTES))
30 |                    gpu-sum (mem-alloc-pinned (* 3 Float/BYTES))]
31 |       (facts
32 |        "Vector add JCuda example." ;; the three gpu-* buffers come from three different allocators
33 |        (memcpy-host! host-a gpu-a)
34 |        (memcpy-host! host-b gpu-b)
35 |        (launch! add-kernel (grid-1d (size host-sum)) (parameters (size host-sum) gpu-a gpu-b gpu-sum))
36 |        (synchronize!)
37 |        (pointer-seq (memcpy-host! gpu-sum host-sum)) => (seq [3.0 5.0 7.0])))))
38 | 


--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/info_test.clj:
--------------------------------------------------------------------------------
 1 | ;;   Copyright (c) Dragan Djuric. All rights reserved.
 2 | ;;   The use and distribution terms for this software are covered by the
 3 | ;;   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
 4 | ;;   which can be found in the file LICENSE at the root of this distribution.
 5 | ;;   By using this software in any fashion, you are agreeing to be bound by
 6 | ;;   the terms of this license.
 7 | ;;   You must not remove this notice, or any other, from this software.
 8 | 
 9 | (ns uncomplicate.clojurecuda.info-test
10 |   (:require [midje.sweet :refer [facts =>]]
11 |             [uncomplicate.commons.core :refer [with-release info]]
12 |             [uncomplicate.clojurecuda
13 |              [core :refer [compile! context device function init module program stream with-context]]
14 |              [info :refer [driver-version limit limit! stream-flag]]]
15 |             [uncomplicate.clojurecuda.internal.constants :refer [stream-flags]]))
16 | 
17 | (init)
18 | 
19 | (facts
20 |  "Driver info tests."
21 |  (pos? (driver-version)) => true) ;; only positivity is checked, not a specific version
22 | 
23 | (facts
24 |  "Device info tests."
25 |  (count (info (device 0))) => 83) ;; pins the number of entries `info` reports for device 0
26 | 
27 | (with-release [ctx (context (device))] ;; ctx is released once the facts below have run
28 |   (facts
29 |    "Context info tests."
30 |    (count (info ctx)) => 13
31 |    (limit! :stack-size 512) => 512 ;; set the limit here, read it back on the next line
32 |    (limit :stack-size) => 512))
33 | 
34 | (with-context (context (device))
35 |   (with-release [hstream (stream :non-blocking)]
36 |     (facts ;; every `=>` check must sit inside this form, otherwise Midje never evaluates it
37 |      "Stream info tests."
38 |      (count (info hstream)) => 2
39 |      (stream-flag hstream) => (stream-flags :non-blocking)
40 |      (:flag (info hstream)) => :non-blocking)))
41 | 
42 | (let [kernel-source (slurp "test/cuda/uncomplicate/clojurecuda/kernels/test.cu")]
43 |   (with-context (context (device))
44 |     (with-release [compiled (compile! (program kernel-source))
45 |                    test-module (module compiled)
46 |                    inc-fn (function test-module "inc")]
47 |       (facts
48 |        "function info tests."
49 |        (count (info inc-fn)) => 7)))) ;; pins the number of entries `info` reports for "inc"
50 | 


--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/toolbox_test.clj:
--------------------------------------------------------------------------------
 1 | ;;   Copyright (c) Dragan Djuric. All rights reserved.
 2 | ;;   The use and distribution terms for this software are covered by the
 3 | ;;   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
 4 | ;;   which can be found in the file LICENSE at the root of this distribution.
 5 | ;;   By using this software in any fashion, you are agreeing to be bound by
 6 | ;;   the terms of this license.
 7 | ;;   You must not remove this notice, or any other, from this software.
 8 | 
 9 | (ns uncomplicate.clojurecuda.toolbox-test
10 |   (:require [midje.sweet :refer [facts => roughly]]
11 |             [uncomplicate.commons
12 |              [core :refer [with-release]]
13 |              [utils :refer [count-groups]]]
14 |             [uncomplicate.clojure-cpp :refer [float-pointer double-pointer pointer-seq]]
15 |             [uncomplicate.clojurecuda
16 |              [core :refer [compile! context device function init mem-alloc-runtime memcpy-host!
17 |                            module program with-context]]
18 |              [info :refer [max-block-dim-x]]
19 |              [toolbox :refer [launch-reduce! read-double]]]))
20 | 
21 | (init)
22 | 
23 | (let [dev (device)
24 |       cnt-m 311
25 |       cnt-n 9011
26 |       cnt (* cnt-m cnt-n)
27 |       program-source (str (slurp "src/cuda/uncomplicate/clojurecuda/kernels/reduction.cu") "\n"
28 |                           (slurp "test/cuda/uncomplicate/clojurecuda/kernels/toolbox-test.cu"))]
29 | 
30 |   (with-context (context dev)
31 |     (with-release [wgs (max-block-dim-x dev)
32 |                    prog (compile! (program program-source)
33 |                                   ["-DREAL=float" "-DACCUMULATOR=double"
34 |                                    (format "-DWGS=%d" wgs)])
35 |                    modl (module prog)
36 |                    data (float-pointer (range cnt))
37 |                    cu-data (mem-alloc-runtime (* cnt Float/BYTES))]
38 | 
39 |       ;; Upload the host values 0..cnt-1 once; every reduction below reads cu-data.
40 |       (memcpy-host! data cu-data)
41 | 
42 |       ;; Kernel handles and host result buffers are bound next to the fact that uses them.
43 |       (let [acc-size (* Double/BYTES (max 1 (count-groups wgs cnt)))]
44 |         (with-release [sum-reduction-kernel (function modl "sum_reduction")
45 |                        sum-reduce-kernel (function modl "sum_reduce")
46 |                        cu-acc (mem-alloc-runtime acc-size)]
47 |           (facts
48 |            "Test 1D reduction."
49 |            (launch-reduce! nil sum-reduce-kernel sum-reduction-kernel [cu-acc cu-data] [cu-acc] cnt wgs)
50 |            (read-double cu-acc) => 3926780329410.0)))
51 | 
52 |       (let [wgs-m 64
53 |             wgs-n 16
54 |             acc-size (* Double/BYTES (max 1 (* cnt-m (count-groups wgs-n cnt-n))))]
55 |         (with-release [res (double-pointer cnt-m)
56 |                        sum-reduction-horizontal (function modl "sum_reduction_horizontal")
57 |                        sum-reduce-horizontal (function modl "sum_reduce_horizontal")
58 |                        cu-acc (mem-alloc-runtime acc-size)]
59 |           (facts
60 |            "Test horizontal 2D reduction."
61 |            (launch-reduce! nil sum-reduce-horizontal sum-reduction-horizontal
62 |                            [cu-acc cu-data] [cu-acc] cnt-m cnt-n wgs-m wgs-n)
63 |            (memcpy-host! cu-acc res)
64 |            (apply + (pointer-seq res)) => (roughly 3.92678032941E12))))
65 | 
66 |       (let [wgs-m 64
67 |             wgs-n 16
68 |             acc-size (* Double/BYTES (max 1 (* cnt-n (count-groups wgs-m cnt-m))))]
69 |         (with-release [res (double-pointer cnt-n)
70 |                        sum-reduction-vertical (function modl "sum_reduction_vertical")
71 |                        sum-reduce-vertical (function modl "sum_reduce_vertical")
72 |                        cu-acc (mem-alloc-runtime acc-size)]
73 |           (facts
74 |            "Test vertical 2D reduction."
75 |            (launch-reduce! nil sum-reduce-vertical sum-reduction-vertical
76 |                            [cu-acc cu-data] [cu-acc] cnt-n cnt-m wgs-n wgs-m)
77 |            (memcpy-host! cu-acc res)
78 |            (apply + (pointer-seq res)) => (roughly 3.92678032941E12)))))))
79 | 


--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/utils_test.clj:
--------------------------------------------------------------------------------
 1 | ;;   Copyright (c) Dragan Djuric. All rights reserved.
 2 | ;;   The use and distribution terms for this software are covered by the
 3 | ;;   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
 4 | ;;   which can be found in the file LICENSE at the root of this distribution.
 5 | ;;   By using this software in any fashion, you are agreeing to be bound by
 6 | ;;   the terms of this license.
 7 | ;;   You must not remove this notice, or any other, from this software.
 8 | 
 9 | (ns uncomplicate.clojurecuda.utils-test
10 |   (:require [midje.sweet :refer [facts => throws]]
11 |             [uncomplicate.clojurecuda.internal.utils :refer [error maybe with-check]]))
12 | 
13 | (facts
14 |  "error tests"
15 |  ;; ex-data of `(error code)` is expected to be a map with :code, :details, :name and :type.
16 |  (ex-data (error 0))
17 |  => {:code 0, :details nil, :name :success :type :cuda}
18 | 
19 |  (ex-data (error -43)) ;; for this code the expected :name is the number itself
20 |  => {:code -43, :details nil, :name -43, :type :cuda}
21 | 
22 |  (ex-data (error 0 "Additional details")) ;; the second argument is expected under :details
23 |  => {:code 0, :details "Additional details", :name :success, :type :cuda})
24 | 
25 | (facts
26 |  "with-check tests"
27 |  (let [f (fn [x] (if x 0 -1))] ;; stub status function: 0 for truthy input, -1 otherwise
28 |    (with-check (f 1) :success) => :success
29 |    (with-check (f false) :success) => (throws clojure.lang.ExceptionInfo)))
30 | 
31 | (facts
32 |  "maybe tests" ;; the ex-info thrown below carries no :type :cuda and is expected to propagate
33 |  (ex-data (maybe (throw (ex-info "Test Exception" {:data :test}))))
34 |  => (throws clojure.lang.ExceptionInfo)
35 |  ;; ex-data of a value built by `error` is expected to carry :type :cuda.
36 |  (:type (ex-data (error -1 nil))) => :cuda)
37 | 


--------------------------------------------------------------------------------
/test/cuda/examples/dynamic-parallelism.cu:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Created based on example from Marco Hutter:
 3 |  *
 4 |  * JCuda - Java bindings for NVIDIA CUDA
 5 |  *
 6 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
 7 |  */
 8 | 
 9 | extern "C" __global__ void childKernel(unsigned int parentThreadIndex, float* data) {
10 |     const unsigned int lane = threadIdx.x; // index of this thread within the child block
11 |     data[lane] = parentThreadIndex + 0.1f * lane;
12 | }
13 | 
14 | extern "C" __global__ void parentKernel(unsigned int size, float *data) {
15 |     float *slice = data + threadIdx.x * 8; // each parent thread hands an 8-float slice to its child
16 |     childKernel<<<1, 8>>>(threadIdx.x, slice);
17 | }
18 | 


--------------------------------------------------------------------------------
/test/cuda/examples/jnvrtc-vector-add.cu:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Created based on example from Marco Hutter:
 3 |  *
 4 |  * JCuda - Java bindings for NVIDIA CUDA
 5 |  *
 6 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
 7 |  */
 8 | 
 9 | extern "C"
10 | __global__ void add(int n, float *a, float *b, float *sum) {
11 |     // Element-wise sum[idx] = a[idx] + b[idx]; threads whose index is >= n do nothing.
12 |     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
13 |     if (idx >= n) return;
14 |     sum[idx] = a[idx] + b[idx];
15 | };
16 | 


--------------------------------------------------------------------------------
/test/cuda/uncomplicate/clojurecuda/kernels/test.cu:
--------------------------------------------------------------------------------
 1 | extern "C" {
 2 |     // Adds 1 to a[idx] for every thread whose global index idx is below n.
 3 |     __global__ void inc (int n, float* a) {
 4 |         const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 5 |         if (idx >= n) return;
 6 |         a[idx] += 1;
 7 |     };
 8 | 
 9 |     // __device__ array with three initial values.
10 |     __device__ float gpu_a[] = {1.0, 2.0, 3.0};
11 |     // Three-element __constant__ array; no initial values are given here.
12 |     __device__ __constant__ float constant_gpu_a[3];
13 |     // Adds constant_gpu_a[idx] to a[idx] for every thread whose global index idx is below n.
14 |     __global__ void constant_inc (int n, float* a) {
15 |         const int idx = blockIdx.x * blockDim.x + threadIdx.x;
16 |         if (idx >= n) return;
17 |         // NOTE(review): constant_gpu_a has 3 elements; assumes n <= 3 - confirm against callers.
18 |         a[idx] += constant_gpu_a[idx];
19 |     };
20 |     
21 | }
22 | 


--------------------------------------------------------------------------------
/test/cuda/uncomplicate/clojurecuda/kernels/toolbox-test.cu:
--------------------------------------------------------------------------------
 1 | extern "C" {
 2 |     // REAL, ACCUMULATOR and the block_reduction_sum* helpers are not defined in this file.
 3 |     __global__ void sum_reduce (const int n, ACCUMULATOR* acc, const REAL* x) {
 4 |         const int gid = blockIdx.x * blockDim.x + threadIdx.x;
 5 |         const ACCUMULATOR sum = block_reduction_sum( (gid < n) ? x[gid] : 0.0); // threads past n pass 0.0
 6 |         if (threadIdx.x == 0) { // thread 0 of each block stores that block's value
 7 |             acc[blockIdx.x] = sum;
 8 |         }
 9 |     };
10 |     // 2D variant: reads a[m * gid_1 + gid_0]; threads with threadIdx.y == 0 store the result.
11 |     __global__ void sum_reduce_horizontal (const int m, const int n, ACCUMULATOR* acc, const REAL* a) {
12 |         const int gid_0 = blockIdx.x * blockDim.x + threadIdx.x;
13 |         const int gid_1 = blockIdx.y * blockDim.y + threadIdx.y;
14 |         const int i = m * gid_1 + gid_0;
15 |         const bool valid = (gid_0 < m) && (gid_1 < n);
16 |         const ACCUMULATOR sum = block_reduction_sum_2( (valid) ? a[i] : 0.0); // out-of-range threads pass 0.0
17 |         const bool write = valid && (threadIdx.y == 0);
18 |         if (write) {
19 |             acc[m * blockIdx.y + gid_0] = sum;
20 |         }
21 |     }
22 |     // 2D variant: reads a[n * gid_0 + gid_1]; otherwise identical to sum_reduce_horizontal.
23 |     __global__ void sum_reduce_vertical (const int m, const int n, ACCUMULATOR* acc, const REAL* a) {
24 |         const int gid_0 = blockIdx.x * blockDim.x + threadIdx.x;
25 |         const int gid_1 = blockIdx.y * blockDim.y + threadIdx.y;
26 |         const int i = n * gid_0 + gid_1;
27 |         const bool valid = (gid_0 < m) && (gid_1 < n);
28 |         const ACCUMULATOR sum = block_reduction_sum_2( (valid) ? a[i] : 0.0); // out-of-range threads pass 0.0
29 |         const bool write = valid && (threadIdx.y == 0);
30 |         if (write) {
31 |             acc[m * blockIdx.y + gid_0] = sum;
32 |         }
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------