├── .github
└── FUNDING.yml
├── .gitignore
├── LICENSE
├── README.md
├── project.clj
├── src
├── clojure
│ └── uncomplicate
│ │ └── clojurecuda
│ │ ├── core.clj
│ │ ├── info.clj
│ │ ├── internal
│ │ ├── constants.clj
│ │ ├── impl.clj
│ │ └── utils.clj
│ │ └── toolbox.clj
├── cuda
│ └── uncomplicate
│ │ └── clojurecuda
│ │ ├── include
│ │ └── jitify
│ │ │ ├── LICENSE
│ │ │ ├── float.h
│ │ │ ├── stddef.h
│ │ │ └── stdint.h
│ │ └── kernels
│ │ └── reduction.cu
└── java
│ └── uncomplicate
│ └── clojurecuda
│ └── internal
│ └── javacpp
│ ├── CUHostFn.java
│ └── CUStreamCallback.java
└── test
├── clojure
└── uncomplicate
│ └── clojurecuda
│ ├── core_test.clj
│ ├── examples
│ ├── dynamic_parallelism_test.clj
│ └── vector_add_test.clj
│ ├── info_test.clj
│ ├── toolbox_test.clj
│ └── utils_test.clj
└── cuda
├── examples
├── dynamic-parallelism.cu
└── jnvrtc-vector-add.cu
└── uncomplicate
└── clojurecuda
└── kernels
├── test.cu
└── toolbox-test.cu
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4 | patreon: draganrocks
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | /lib
3 | /classes
4 | /checkouts
5 | pom.xml
6 | pom.xml.asc
7 | *.jar
8 | *.class
9 | /.lein-*
10 | /.nrepl-port
11 | doc
12 | docs
13 | hs_*.log
14 | .#*
15 | .DS_Store
16 | *.o
17 | *.so
18 | */nrepl-port
19 | */target
20 | .idea
21 | /*.iml
22 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC
2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM
3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
4 |
5 | 1. DEFINITIONS
6 |
7 | "Contribution" means:
8 |
9 | a) in the case of the initial Contributor, the initial code and
10 | documentation distributed under this Agreement, and
11 |
12 | b) in the case of each subsequent Contributor:
13 |
14 | i) changes to the Program, and
15 |
16 | ii) additions to the Program;
17 |
18 | where such changes and/or additions to the Program originate from and are
19 | distributed by that particular Contributor. A Contribution 'originates' from
20 | a Contributor if it was added to the Program by such Contributor itself or
21 | anyone acting on such Contributor's behalf. Contributions do not include
22 | additions to the Program which: (i) are separate modules of software
23 | distributed in conjunction with the Program under their own license
24 | agreement, and (ii) are not derivative works of the Program.
25 |
26 | "Contributor" means any person or entity that distributes the Program.
27 |
28 | "Licensed Patents" mean patent claims licensable by a Contributor which are
29 | necessarily infringed by the use or sale of its Contribution alone or when
30 | combined with the Program.
31 |
32 | "Program" means the Contributions distributed in accordance with this
33 | Agreement.
34 |
35 | "Recipient" means anyone who receives the Program under this Agreement,
36 | including all Contributors.
37 |
38 | 2. GRANT OF RIGHTS
39 |
40 | a) Subject to the terms of this Agreement, each Contributor hereby grants
41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to
42 | reproduce, prepare derivative works of, publicly display, publicly perform,
43 | distribute and sublicense the Contribution of such Contributor, if any, and
44 | such derivative works, in source code and object code form.
45 |
46 | b) Subject to the terms of this Agreement, each Contributor hereby grants
47 | Recipient a non-exclusive, worldwide, royalty-free patent license under
48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise
49 | transfer the Contribution of such Contributor, if any, in source code and
50 | object code form. This patent license shall apply to the combination of the
51 | Contribution and the Program if, at the time the Contribution is added by the
52 | Contributor, such addition of the Contribution causes such combination to be
53 | covered by the Licensed Patents. The patent license shall not apply to any
54 | other combinations which include the Contribution. No hardware per se is
55 | licensed hereunder.
56 |
57 | c) Recipient understands that although each Contributor grants the licenses
58 | to its Contributions set forth herein, no assurances are provided by any
59 | Contributor that the Program does not infringe the patent or other
60 | intellectual property rights of any other entity. Each Contributor disclaims
61 | any liability to Recipient for claims brought by any other entity based on
62 | infringement of intellectual property rights or otherwise. As a condition to
63 | exercising the rights and licenses granted hereunder, each Recipient hereby
64 | assumes sole responsibility to secure any other intellectual property rights
65 | needed, if any. For example, if a third party patent license is required to
66 | allow Recipient to distribute the Program, it is Recipient's responsibility
67 | to acquire that license before distributing the Program.
68 |
69 | d) Each Contributor represents that to its knowledge it has sufficient
70 | copyright rights in its Contribution, if any, to grant the copyright license
71 | set forth in this Agreement.
72 |
73 | 3. REQUIREMENTS
74 |
75 | A Contributor may choose to distribute the Program in object code form under
76 | its own license agreement, provided that:
77 |
78 | a) it complies with the terms and conditions of this Agreement; and
79 |
80 | b) its license agreement:
81 |
82 | i) effectively disclaims on behalf of all Contributors all warranties and
83 | conditions, express and implied, including warranties or conditions of title
84 | and non-infringement, and implied warranties or conditions of merchantability
85 | and fitness for a particular purpose;
86 |
87 | ii) effectively excludes on behalf of all Contributors all liability for
88 | damages, including direct, indirect, special, incidental and consequential
89 | damages, such as lost profits;
90 |
91 | iii) states that any provisions which differ from this Agreement are offered
92 | by that Contributor alone and not by any other party; and
93 |
94 | iv) states that source code for the Program is available from such
95 | Contributor, and informs licensees how to obtain it in a reasonable manner on
96 | or through a medium customarily used for software exchange.
97 |
98 | When the Program is made available in source code form:
99 |
100 | a) it must be made available under this Agreement; and
101 |
102 | b) a copy of this Agreement must be included with each copy of the Program.
103 |
104 | Contributors may not remove or alter any copyright notices contained within
105 | the Program.
106 |
107 | Each Contributor must identify itself as the originator of its Contribution,
108 | if any, in a manner that reasonably allows subsequent Recipients to identify
109 | the originator of the Contribution.
110 |
111 | 4. COMMERCIAL DISTRIBUTION
112 |
113 | Commercial distributors of software may accept certain responsibilities with
114 | respect to end users, business partners and the like. While this license is
115 | intended to facilitate the commercial use of the Program, the Contributor who
116 | includes the Program in a commercial product offering should do so in a
117 | manner which does not create potential liability for other Contributors.
118 | Therefore, if a Contributor includes the Program in a commercial product
119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend
120 | and indemnify every other Contributor ("Indemnified Contributor") against any
121 | losses, damages and costs (collectively "Losses") arising from claims,
122 | lawsuits and other legal actions brought by a third party against the
123 | Indemnified Contributor to the extent caused by the acts or omissions of such
124 | Commercial Contributor in connection with its distribution of the Program in
125 | a commercial product offering. The obligations in this section do not apply
126 | to any claims or Losses relating to any actual or alleged intellectual
127 | property infringement. In order to qualify, an Indemnified Contributor must:
128 | a) promptly notify the Commercial Contributor in writing of such claim, and
129 | b) allow the Commercial Contributor to control, and cooperate with the
130 | Commercial Contributor in, the defense and any related settlement
131 | negotiations. The Indemnified Contributor may participate in any such claim
132 | at its own expense.
133 |
134 | For example, a Contributor might include the Program in a commercial product
135 | offering, Product X. That Contributor is then a Commercial Contributor. If
136 | that Commercial Contributor then makes performance claims, or offers
137 | warranties related to Product X, those performance claims and warranties are
138 | such Commercial Contributor's responsibility alone. Under this section, the
139 | Commercial Contributor would have to defend claims against the other
140 | Contributors related to those performance claims and warranties, and if a
141 | court requires any other Contributor to pay any damages as a result, the
142 | Commercial Contributor must pay those damages.
143 |
144 | 5. NO WARRANTY
145 |
146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON
147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR
149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A
150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the
151 | appropriateness of using and distributing the Program and assumes all risks
152 | associated with its exercise of rights under this Agreement , including but
153 | not limited to the risks and costs of program errors, compliance with
154 | applicable laws, damage to or loss of data, programs or equipment, and
155 | unavailability or interruption of operations.
156 |
157 | 6. DISCLAIMER OF LIABILITY
158 |
159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY
160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL,
161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION
162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE
165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY
166 | OF SUCH DAMAGES.
167 |
168 | 7. GENERAL
169 |
170 | If any provision of this Agreement is invalid or unenforceable under
171 | applicable law, it shall not affect the validity or enforceability of the
172 | remainder of the terms of this Agreement, and without further action by the
173 | parties hereto, such provision shall be reformed to the minimum extent
174 | necessary to make such provision valid and enforceable.
175 |
176 | If Recipient institutes patent litigation against any entity (including a
177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself
178 | (excluding combinations of the Program with other software or hardware)
179 | infringes such Recipient's patent(s), then such Recipient's rights granted
180 | under Section 2(b) shall terminate as of the date such litigation is filed.
181 |
182 | All Recipient's rights under this Agreement shall terminate if it fails to
183 | comply with any of the material terms or conditions of this Agreement and
184 | does not cure such failure in a reasonable period of time after becoming
185 | aware of such noncompliance. If all Recipient's rights under this Agreement
186 | terminate, Recipient agrees to cease use and distribution of the Program as
187 | soon as reasonably practicable. However, Recipient's obligations under this
188 | Agreement and any licenses granted by Recipient relating to the Program shall
189 | continue and survive.
190 |
191 | Everyone is permitted to copy and distribute copies of this Agreement, but in
192 | order to avoid inconsistency the Agreement is copyrighted and may only be
193 | modified in the following manner. The Agreement Steward reserves the right to
194 | publish new versions (including revisions) of this Agreement from time to
195 | time. No one other than the Agreement Steward has the right to modify this
196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. The
197 | Eclipse Foundation may assign the responsibility to serve as the Agreement
198 | Steward to a suitable separate entity. Each new version of the Agreement will
199 | be given a distinguishing version number. The Program (including
200 | Contributions) may always be distributed subject to the version of the
201 | Agreement under which it was received. In addition, after a new version of
202 | the Agreement is published, Contributor may elect to distribute the Program
203 | (including its Contributions) under the new version. Except as expressly
204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or
205 | licenses to the intellectual property of any Contributor under this
206 | Agreement, whether expressly, by implication, estoppel or otherwise. All
207 | rights in the Program not expressly granted under this Agreement are
208 | reserved.
209 |
210 | This Agreement is governed by the laws of the State of New York and the
211 | intellectual property laws of the United States of America. No party to this
212 | Agreement will bring a legal action under this Agreement more than one year
213 | after the cause of action arose. Each party waives its rights to a jury trial
214 | in any resulting litigation.
215 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [New books available for subscription](https://aiprobook.com)
2 |
3 |
4 |
5 |
6 |
7 | # ClojureCUDA
8 |
9 | [Adopt your pet function](https://dragan.rocks/articles/18/Patreon-Announcement-Adopt-a-Function) and [become a patron](https://patreon.com/draganrocks).
10 |
11 | Clojure library for CUDA development. See the documentation at [ClojureCUDA website](https://clojurecuda.uncomplicate.org).
12 |
13 | ## License
14 |
15 | Copyright © 2017-2019 Dragan Djuric
16 |
17 | Distributed under the Eclipse Public License either version 1.0 or (at your option) any later version.
18 |
--------------------------------------------------------------------------------
/project.clj:
--------------------------------------------------------------------------------
1 | ;; Copyright (c) Dragan Djuric. All rights reserved.
2 | ;; The use and distribution terms for this software are covered by the
3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
4 | ;; which can be found in the file LICENSE at the root of this distribution.
5 | ;; By using this software in any fashion, you are agreeing to be bound by
6 | ;; the terms of this license.
7 | ;; You must not remove this notice, or any other, from this software.
8 |
9 | (defproject uncomplicate/clojurecuda "0.21.1-SNAPSHOT"
10 |   :description "ClojureCUDA is a Clojure library for parallel computations with Nvidia's CUDA."
11 |   :url "https://github.com/uncomplicate/clojurecuda"
12 |   :scm {:name "git"
13 |         :url "https://github.com/uncomplicate/clojurecuda"}
14 |   :license {:name "Eclipse Public License"
15 |             :url "http://www.eclipse.org/legal/epl-v10.html"}
16 |   :dependencies [[org.clojure/clojure "1.12.0"]
17 |                  [org.clojure/core.async "1.7.701"]
18 |                  [uncomplicate/commons "0.16.1"]
19 |                  [uncomplicate/fluokitten "0.10.0"]
20 |                  [org.uncomplicate/clojure-cpp "0.4.1-SNAPSHOT"]
21 |                  [org.bytedeco/cuda-platform "12.8-9.8-1.5.12-SNAPSHOT"]] ; the per-OS redist artifacts below must use this same version
22 |
23 |   :profiles {:dev [:dev/all ~(leiningen.core.utils/get-os)] ; selects the :linux or :windows profile defined below
24 |              :dev/all {:plugins [[lein-midje "3.2.1"]
25 |                                  [lein-codox "0.10.8"]
26 |                                  [com.github.clj-kondo/lein-clj-kondo "0.2.5"]]
27 |                        :global-vars {*warn-on-reflection* true
28 |                                      *assert* true
29 |                                      *unchecked-math* :warn-on-boxed
30 |                                      *print-length* 128}
31 |                        :dependencies [[midje "1.10.10"]
32 |                                       [codox-theme-rdash "0.1.2"]]
33 |                        :codox {:metadata {:doc/format :markdown}
34 |                                :source-uri "http://github.com/uncomplicate/clojurecuda/blob/master/{filepath}#L{line}"
35 |                                :output-path "docs/codox"
36 |                                :themes [:rdash]
37 |                                :namespaces [uncomplicate.clojurecuda.core
38 |                                             uncomplicate.clojurecuda.info
39 |                                             uncomplicate.clojurecuda.toolbox
40 |                                             uncomplicate.clojurecuda.internal.constants]}}
41 |              :linux {:dependencies [[org.bytedeco/cuda "12.8-9.8-1.5.12-SNAPSHOT" :classifier linux-x86_64-redist]]}
42 |              :windows {:dependencies [[org.bytedeco/cuda "12.8-9.8-1.5.12-SNAPSHOT" :classifier windows-x86_64-redist]]}} ; was 12.9-9.9, which did not match cuda-platform
43 |
44 |   :repositories [["snapshots" "https://oss.sonatype.org/content/repositories/snapshots"]]
45 |
46 |   :javac-options ["-target" "1.8" "-source" "1.8" "-Xlint:-options"]
47 |
48 |   :source-paths ["src/clojure" "src/cuda"]
49 |   :test-paths ["test/clojure" "test/cuda"]
50 |   :java-source-paths ["src/java"])
51 |
--------------------------------------------------------------------------------
/src/clojure/uncomplicate/clojurecuda/core.clj:
--------------------------------------------------------------------------------
1 | ;; Copyright (c) Dragan Djuric. All rights reserved.
2 | ;; The use and distribution terms for this software are covered by the
3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
4 | ;; which can be found in the file LICENSE at the root of this distribution.
5 | ;; By using this software in any fashion, you are agreeing to be bound by
6 | ;; the terms of this license.
7 | ;; You must not remove this notice, or any other, from this software.
8 |
9 | (ns ^{:author "Dragan Djuric"}
10 | uncomplicate.clojurecuda.core
11 | "Core ClojureCUDA functions for CUDA **host** programming. The kernels should
12 | be provided as strings (that may be stored and read from files) or binaries, written in CUDA C/C++.
13 |
14 | Many examples are available in ClojureCUDA [core test](https://github.com/uncomplicate/clojurecuda/blob/master/test/clojure/uncomplicate/clojurecuda/core_test.clj).
15 | You can see how to write CUDA [kernels here](https://github.com/uncomplicate/clojurecuda/tree/master/test/cuda/examples)
16 | and [here](https://github.com/uncomplicate/clojurecuda/tree/master/test/cuda/uncomplicate/clojurecuda/kernels)
17 | and examples of [how to load them here](https://github.com/uncomplicate/clojurecuda/tree/master/test/clojure/uncomplicate/clojurecuda/examples/).
18 |
19 | For more advanced examples, please read the source code of the CUDA engine of [Neanderthal linear algebra library](https://github.com/uncomplicate/neanderthal) (mainly general CUDA and cuBLAS are used there),
20 | and the [Deep Diamond tensor and linear algebra library](https://github.com/uncomplicate/deep-diamond) (for extensive use of cuDNN).
21 |
22 | Here's a categorized map of core functions. Most functions throw `ExceptionInfo` in case of errors
23 | thrown by the CUDA driver.
24 |
25 | - Device management: [[init]], [[device-count]], [[device]].
26 | - Context management: [[context]], [[current-context]], [[current-context!]], [[pop-context!]],
27 | [[push-context!]], [[in-context]], [[with-context]], [[with-default]].
28 | - Memory management: [[memcpy!]], [[memcpy-to-host!]], [[memcpy-to-device!]], [[memset!]],
29 | [[mem-sub-region]], [[mem-alloc-driver]], [[mem-alloc-runtime]], [[cuda-malloc]], [[cuda-free!]],
30 | [[mem-alloc-pinned]], [[mem-register-pinned!]], [[mem-alloc-mapped]].
31 | - Module management: [[link]], [[link-complete!]], [[load!]], [[module]].
32 | - Execution control: [[grid-1d]], [[grid-2d]], [[grid-3d]], [[global]], [[set-parameter!]],
33 | [[parameters]], [[function]], [[launch!]].
34 | - Stream management: [[stream]], [[default-stream]], [[ready?]], [[synchronize!]],
35 | [[add-host-fn!]], [[listen!]], [[wait-event!]], [[attach-mem!]].
36 | - Event management: [[event]], [[elapsed-time!]], [[record!]], [[can-access-peer]],
37 | [[p2p-attribute]], [[disable-peer-access!]], [[enable-peer-access!]].
38 | - NVRTC program JIT: [[program]], [[program-log]], [[compile!]], [[ptx]].
39 |
40 | Please see [CUDA Driver API](https://docs.nvidia.com/cuda/pdf/CUDA_Driver_API.pdf) for details
41 | not discussed in ClojureCUDA documentation.
42 | "
43 | (:require [uncomplicate.commons
44 | [core :refer [with-release let-release info bytesize sizeof size]]
45 | [utils :refer [mask count-groups dragan-says-ex]]]
46 | [uncomplicate.fluokitten.protocols :refer [extract]]
47 | [uncomplicate.clojure-cpp
48 | :refer [null? pointer byte-pointer string-pointer int-pointer long-pointer
49 | size-t-pointer pointer-pointer get-entry put-entry! safe type-pointer position!
50 | capacity! address]]
51 | [uncomplicate.clojurecuda.info :as cuda-info]
52 | [uncomplicate.clojurecuda.internal
53 | [constants :refer [ctx-flags event-flags mem-attach-flags mem-host-alloc-flags
54 | mem-host-register-flags p2p-attributes stream-flags]]
55 | [impl :refer [->CUDevice ->CUDevicePtr add-host-fn* attach-mem* can-access-peer*
56 | compile* context* cu-address* current-context* event* host-fn* link*
57 | malloc-runtime* mem-alloc-host* mem-alloc-managed* mem-host-alloc*
58 | mem-host-register* memcpy* memcpy-host* memset* module-load* offset
59 | p2p-attribute* program* program-log* ptx* ready* set-parameter* stream*]]
60 | [utils :refer [with-check]]])
61 | (:import [org.bytedeco.javacpp Pointer LongPointer SizeTPointer PointerPointer]
62 | org.bytedeco.cuda.global.cudart
63 | [org.bytedeco.cuda.cudart CUctx_st CUlinkState_st CUmod_st CUfunc_st CUstream_st CUevent_st]))
64 |
65 | (defn init
66 | "Initializes the CUDA driver by calling `cuInit` with flags `0`. This function must be called before any
67 | other function from ClojureCUDA in the current process. The result form handed to `with-check` is `true`.
68 | See [CUDA Initialization](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html)
69 | "
70 | []
71 | (with-check (cudart/cuInit 0) true))
72 |
73 | ;; ================== Device Management ====================================
74 |
75 | (defn device-count
76 |   "Returns the number of CUDA devices on the system. The one-element native int buffer that
77 |   receives the count is released deterministically before returning, as in [[device]].
78 |   See [CUDA Device Management](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html)."
79 |   ^long []
80 |   (with-release [res (int-pointer 1)] ; `with-release` instead of `let`: do not leave the native buffer to the GC
81 |     (with-check (cudart/cuDeviceGetCount res) (get-entry res 0))))
82 |
83 | (defn device
84 |   "Looks up a CUDA device. A numeric `id` is treated as the device ordinal; any other `id` is passed
85 |   on as a PCI Bus id string. Called without arguments, returns the device with ordinal `0`.
86 |   See [CUDA Device Management](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html)."
87 |   ([]
88 |    (device 0))
89 |   ([id]
90 |    (with-release [dev-handle (int-pointer 1)]
91 |      (with-check
92 |        (if-not (number? id)
93 |          (cudart/cuDeviceGetByPCIBusId dev-handle ^String id)
94 |          (cudart/cuDeviceGet dev-handle (long id)))
95 |        {:device-id id}
96 |        (->CUDevice (get-entry dev-handle 0))))))
97 |
98 | ;; =================== Context Management ==================================
99 |
100 | (defn context
101 |   "Creates a CUDA context on the device `dev`. The optional keyword `flag` is looked up in
102 |   [[internal.constants/ctx-flags]]; an unknown keyword throws `ExceptionInfo`. Without `flag`, `0` is used.
103 |   The context must be released after use.
104 |   See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
105 |   "
106 |   ([dev]
107 |    (context* (extract dev) 0))
108 |   ([dev flag]
109 |    (let [raw-dev (extract dev)]
110 |      (if-let [flag-value (ctx-flags flag)]
111 |        (context* raw-dev flag-value)
112 |        (throw (ex-info "Unknown context flag." {:flag flag :available ctx-flags}))))))
113 |
114 | (defn current-context
115 | "Returns the CUDA context bound to the calling CPU thread (the result of the no-argument `current-context*` call).
116 | See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
117 | "
118 | []
119 | (current-context*))
120 |
121 | (defn current-context!
122 | "Binds the specified CUDA context `ctx` to the calling CPU thread and returns `ctx`.
123 | See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
124 | "
125 | [ctx]
126 | (current-context* ctx)
127 | ctx)
128 |
129 | (defn pop-context!
130 | "Pops the current CUDA context from the current CPU thread; takes no arguments. The result form is the fresh `CUctx_st` handed to `cuCtxPopCurrent`.
131 | See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
132 | "
133 | []
134 | (let [ctx (CUctx_st.)]
135 | (with-check (cudart/cuCtxPopCurrent ctx) ctx)))
136 |
137 | (defn push-context!
138 | "Pushes a context `ctx` on the current CPU thread via `cuCtxPushCurrent`. The result form handed to `with-check` is `ctx` itself.
139 | See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
140 | "
141 | [^CUctx_st ctx]
142 | (with-check (cudart/cuCtxPushCurrent ctx) ctx))
143 |
144 | (defmacro in-context
145 |   "Pushes the context `ctx` to the top of the context stack, evaluates the body with `ctx`
146 |   as the current context, and pops the context from the stack, even when the body throws.
147 |   Nothing is popped when the push itself fails. Does NOT release the context, unlike [[with-context]].
148 |   See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
149 |   "
150 |   [ctx & body]
151 |   `(let [pushed# (push-context! ~ctx)] ; pushed outside the try: a failed push must not pop an unrelated context
152 |      (try pushed# ; keeps the previous result (the pushed context) when the body is empty
153 |        ~@body
154 |        (finally (pop-context!)))))
155 |
156 | (defmacro with-context
157 |   "Evaluates the body with the context `ctx` pushed on the context stack (through [[in-context]]),
158 |   then pops it. Unlike [[in-context]], the context is bound with `with-release`, so it is also released.
159 |   See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
160 |   "
161 |   [ctx & body]
162 |   `(with-release [owned-ctx# ~ctx]
163 |      (in-context owned-ctx# ~@body)))
164 |
165 | (defmacro with-default
166 |   "Calls [[init]], creates a context with default flags on the device with ordinal `0`, and executes the body inside [[with-context]].
167 |   See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html).
168 |   "
169 |   [& body]
170 |   `(do
171 |      (init)
172 |      (with-release [default-dev# (device 0)]
173 |        (with-context (context default-dev#)
174 |          ~@body))))
175 |
176 | ;; ================== Memory Management ==============================================
177 |
178 | (defn ^:private check-size [ptr ^long offset ^long byte-count]
179 |   (let [requested-end (+ offset byte-count)] ; guard: 0 <= offset <= offset + byte-count <= (bytesize ptr), otherwise throw
180 |     (when-not (<= 0 offset requested-end (bytesize ptr))
181 |       (dragan-says-ex "Requested bytes are out of the bounds of this device pointer." {:offset offset :requested byte-count :available (bytesize ptr)}))))
182 |
183 | (defn memcpy!
184 |   "Copies `byte-count` bytes from `src` to `dst`; without a count, copies as many bytes as both
185 |   can hold (the smaller of the two byte sizes). The range is bounds-checked against both memory objects.
186 |   TODO mapped, pinned
187 |   If `hstream` is provided, executes asynchronously. Returns `dst`.
188 |   See [CUDA Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)"
189 |   ([src dst]
190 |    (memcpy! src dst (min (bytesize src) (bytesize dst)) nil))
191 |   ([src dst byte-count-or-stream]
192 |    (if-not (number? byte-count-or-stream)
193 |      (memcpy! src dst (min (bytesize src) (bytesize dst)) byte-count-or-stream)
194 |      (do (check-size src 0 byte-count-or-stream)
195 |          (check-size dst 0 byte-count-or-stream)
196 |          (memcpy* dst src byte-count-or-stream nil)))
197 |    dst)
198 |   ([src dst ^long byte-count hstream]
199 |    (doseq [mem [src dst]]
200 |      (check-size mem 0 byte-count))
201 |    (memcpy* dst src byte-count hstream)
202 |    dst))
203 |
204 | (defn memcpy-to-host!
205 |   "Copies `byte-count` bytes from device `src` to host `dst`; without a count, copies the smaller of
206 |   the two byte sizes. Useful when `src` or `dst` is a generic pointer for which it cannot be
207 |   determined whether it manages memory on host or on device (see [[cuda-malloc]]).
208 |   If `hstream` is provided, executes asynchronously. Returns `dst`.
209 |   See [CUDA Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)
210 |   "
211 |   ([src dst]
212 |    (memcpy-to-host! src dst (min (bytesize src) (bytesize dst)))
213 |    dst)
214 |   ([src dst count-or-stream]
215 |    (if-not (integer? count-or-stream)
216 |      (memcpy-to-host! src dst (min (bytesize src) (bytesize dst)) count-or-stream)
217 |      (memcpy-to-host! src dst count-or-stream nil))
218 |    dst)
219 |   ([^Pointer src ^Pointer dst ^long byte-count hstream]
220 |    (check-size src 0 byte-count)
221 |    (check-size dst 0 byte-count)
222 |    (let [host-ptr (extract dst), device-addr (address (extract src))]
223 |      (with-check (if-not hstream
224 |                    (cudart/cuMemcpyDtoH host-ptr device-addr byte-count)
225 |                    (cudart/cuMemcpyDtoHAsync host-ptr device-addr byte-count hstream))
226 |        dst))))
227 |
228 | (defn memcpy-to-device!
229 |   "Copies `byte-count` bytes from host `src` to device `dst`; without a count, copies the smaller of
230 |   the two byte sizes. Useful when `src` or `dst` is a generic pointer for which it cannot be
231 |   determined whether it manages memory on host or on device (see [[cuda-malloc]]).
232 |   If `hstream` is provided, executes asynchronously. Returns `dst`.
233 |   See [CUDA Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)
234 |   "
235 |   ([src dst]
236 |    (memcpy-to-device! src dst (min (bytesize src) (bytesize dst)))
237 |    dst)
238 |   ([src dst count-or-stream]
239 |    (if-not (integer? count-or-stream)
240 |      (memcpy-to-device! src dst (min (bytesize src) (bytesize dst)) count-or-stream)
241 |      (memcpy-to-device! src dst count-or-stream nil))
242 |    dst)
243 |   ([^Pointer src ^Pointer dst ^long byte-count hstream]
244 |    (check-size src 0 byte-count)
245 |    (check-size dst 0 byte-count)
246 |    (let [device-addr (address (extract dst)), host-ptr (extract src)]
247 |      (with-check (if-not hstream
248 |                    (cudart/cuMemcpyHtoD device-addr host-ptr byte-count)
249 |                    (cudart/cuMemcpyHtoDAsync device-addr host-ptr byte-count hstream))
250 |        dst))))
251 |
252 | (defn memcpy-host!
253 |   "Copies `byte-count` bytes from `src` to `dst`, one of which has to be accessible from the host;
254 |   without a count, copies the smaller of the two byte sizes. If `hstream` is provided, executes
255 |   asynchronously. A polymorphic function that figures out what needs to be done. Supports
256 |   everything except pointers created by [[cuda-malloc]]. Returns `dst`.
257 |   See [CUDA Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)
258 |   "
259 |   ([src dst]
260 |    (memcpy-host* dst src (min (bytesize src) (bytesize dst)))
261 |    dst)
262 |   ([src dst count-or-stream]
263 |    (if-not (integer? count-or-stream)
264 |      (memcpy-host* dst src (min (bytesize src) (bytesize dst)) count-or-stream)
265 |      (memcpy-host! src dst count-or-stream nil))
266 |    dst)
267 |   ([src dst ^long byte-count hstream]
268 |    (doseq [mem [src dst]]
269 |      (check-size mem 0 byte-count))
270 |    (if-not hstream
271 |      (memcpy-host* dst src byte-count)
272 |      (memcpy-host* dst src byte-count hstream))
273 |    dst))
274 |
275 | (defn memset!
276 |   "Sets `n` elements of `dptr` memory to `value`, or as many whole elements as fit in `dptr` when `n`
277 |   is not given (supports all Java primitive number types except `double`, and `long` with value larger
278 |   than `Integer/MAX_VALUE`). If `hstream` is provided, executes asynchronously. Returns `dptr`.
279 |   See [CUDA Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)
280 |   "
281 |   ([dptr value]
282 |    (memset* value (cu-address* dptr) (quot (bytesize dptr) (sizeof value)))
283 |    dptr)
284 |   ([dptr value n-or-hstream]
285 |    (if-not (integer? n-or-hstream)
286 |      (memset* value (cu-address* dptr) (quot (bytesize dptr) (sizeof value)) n-or-hstream)
287 |      (do (check-size dptr 0 (* (sizeof value) (long n-or-hstream)))
288 |          (memset* value (cu-address* dptr) n-or-hstream)))
289 |    dptr)
290 |   ([dptr value ^long n hstream]
291 |    (if-not hstream
292 |      (memset! dptr value n)
293 |      (do (check-size dptr 0 (* (sizeof value) n))
294 |          (memset* value (cu-address* dptr) n hstream)))
295 |    dptr))
296 |
297 | ;; ==================== Driver-managed device memory ===============================================
298 |
299 | (defn mem-sub-region
300 |   "Creates CUDA device memory object that references the sub-region of `mem` that starts at byte
301 |   `origin` and spans `byte-count` bytes or, by default, all the bytes that remain after `origin`.
302 |   The requested region is validated with `check-size` before the object is created."
303 |   ([mem ^long origin ^long byte-count]
304 |    (check-size mem origin byte-count)
305 |    (let-release [sub-dptr (long-pointer 1)]
306 |      (->CUDevicePtr (put-entry! sub-dptr 0 (offset mem origin)) byte-count false)))
307 |   ([mem ^long origin] ;; the default must shrink by `origin`, or the region would overrun `mem`
308 |    (mem-sub-region mem origin (- (long (bytesize mem)) origin))))
309 |
310 | (defn mem-alloc-driver
311 |   "Allocates the `byte-size` bytes of uninitialized memory that will be automatically managed by the
312 |   Unified Memory system, specified by a keyword `flag` (`CU_MEM_ATTACH_GLOBAL` when omitted). For
313 |   available flags, see [[internal.constants/mem-attach-flags]]. Negative sizes are clamped to 0.
314 |   Returns a CUDA device memory object, which can NOT be extracted as a `Pointer`, but can be accessed
315 |   directly through its address in the device memory. Throws `ExceptionInfo` for an unknown `flag`.
316 |   See [CUDA Driver API Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)"
317 |   ([^long byte-size flag]
318 |    (mem-alloc-managed* (max 0 byte-size)
319 |                        (or (mem-attach-flags flag)
320 |                            (throw (ex-info "Unknown mem-attach flag."
321 |                                            {:flag flag :available mem-attach-flags})))))
322 |   ([^long byte-size] ;; clamp here too, so both arities treat negative sizes the same way
323 |    (mem-alloc-managed* (max 0 byte-size) cudart/CU_MEM_ATTACH_GLOBAL)))
324 |
325 | ;; =================== Runtime API Memory ================================================
326 |
327 | (defn mem-alloc-runtime
328 |   "Allocates `byte-size` bytes (negative sizes count as 0) of uninitialized memory that will be
329 |   automatically managed by the Unified Memory system. Returns a CUDA device memory object managed by
330 |   the CUDA runtime API, which can be extracted as a `Pointer`, optionally of the given `type`.
331 |   Equivalent unwrapped `Pointer` can be created by [[cuda-malloc]]. Throws on an unknown `type`.
332 |   See [CUDA Runtime API Memory Management](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html)"
333 |   ([^long byte-size]
334 |    (malloc-runtime* (max 0 byte-size)))
335 |   ([^long byte-size type] ;;TODO functions that receive type should accept size instead of bytesize
336 |    (let [t (type-pointer type)]
337 |      (when-not t (throw (ex-info (format "Unknown data type: %s." (str type)) {})))
338 |      (malloc-runtime* (max 0 byte-size) t))))
339 |
340 | (defn cuda-malloc
341 |   "Returns a `Pointer` to `byte-size` bytes (negative sizes count as 0) of uninitialized memory that
342 |   will be automatically managed by the Unified Memory system. The pointer is managed by the CUDA
343 |   runtime API. Optionally, accepts a `type` of the pointer as a keyword (`:float` or `Float/TYPE`
344 |   for `FloatPointer`, etc.); an unknown `type` is rejected before anything is allocated.
345 |   This pointer has to be manually released by [[cuda-free!]]. For a more seamless experience,
346 |   use the wrapper provided by the [[mem-alloc-runtime]] function.
347 |   See [CUDA Runtime API Memory Management](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html)
348 |   "
349 |   ([^long byte-size type]
350 |    (let [pt (type-pointer type)
351 |          size (max 0 byte-size)]
352 |      (when-not pt
353 |        (throw (ex-info (format "Unknown data type: %s." (str type)) {})))
354 |      (let-release [p (byte-pointer nil)]
355 |        (with-check (cudart/cudaMalloc p size) (pt (capacity! p size))))))
356 |   ([^long byte-size]
357 |    (let-release [p (byte-pointer nil)]
358 |      (with-check (cudart/cudaMalloc p (max 0 byte-size)) (capacity! p (max 0 byte-size))))))
359 |
360 | (defn cuda-free!
361 |   "Frees the runtime device memory that has been created by [[cuda-malloc]], then deallocates and
362 |   nulls `dptr`. A pointer that is already null is left untouched. Returns `dptr`.
363 |   See [CUDA Runtime API Memory Management](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html)"
364 |   [^Pointer dptr]
365 |   (if (null? dptr)
366 |     dptr
367 |     (do (with-check (cudart/cudaFree (position! dptr 0)) (do (.deallocate dptr) (.setNull dptr)))
368 |         dptr)))
369 |
370 | ;; =================== Pinned and Mapped Memory ================================================
371 |
372 | (defn mem-alloc-pinned
373 |   "Allocates `byte-size` bytes (negative sizes count as 0) of uninitialized page-locked memory,
374 |   'pinned' on the host, using keyword `flags`. For available flags, see
375 |   [[internal.constants/mem-host-alloc-flags]]; without flags, 0 is used. Optionally, accepts a `type`
376 |   of the pointer as a keyword (`:float` or `Float/TYPE` for `FloatPointer`, etc.). In the two-argument
377 |   form, the second argument is treated as a type if it names one, and as flag(s) otherwise.
378 |   Pinned memory is optimized for the [[memcpy-host!]] function, while 'mapped' memory is optimized
379 |   for [[memcpy!]].
380 |   See [CUDA Device Driver API Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)"
381 |   ([^long byte-size]
382 |    (mem-host-alloc* (max 0 byte-size) 0))
383 |   ([^long byte-size type-or-flags]
384 |    (let [t (type-pointer type-or-flags)
385 |          size (max 0 byte-size)]
386 |      (cond
387 |        t (mem-host-alloc* size 0 t)
388 |        (keyword? type-or-flags)
389 |        (mem-host-alloc* size (or (mem-host-alloc-flags type-or-flags)
390 |                                  (throw (ex-info "Unknown mem-host-alloc flag."
391 |                                                  {:flag type-or-flags :available mem-host-alloc-flags}))))
392 |        :else (mem-host-alloc* size (mask mem-host-alloc-flags type-or-flags)))))
393 |   ([^long byte-size type flags]
394 |    (let [t (type-pointer type)]
395 |      (when-not t (throw (ex-info (format "Unknown data type: %s." (str type)) {})))
396 |      (mem-host-alloc* (max 0 byte-size)
397 |                       (if-not (keyword? flags)
398 |                         (mask mem-host-alloc-flags flags)
399 |                         (or (mem-host-alloc-flags flags)
400 |                             (throw (ex-info "Unknown mem-host-alloc flag." {:flag flags :available mem-host-alloc-flags}))))
401 |                       t))))
402 |
403 | (defn mem-register-pinned!
404 |   "Registers previously instantiated host pointer `memory`, 'pinned' from the device, using
405 |   keyword `flags` (a single keyword, or several that are combined with `mask`). For available
406 |   flags, see [[internal.constants/mem-host-register-flags]]; without flags, 0 is used.
407 |   Returns the pinned object equivalent to the one created by [[mem-alloc-pinned]].
408 |   Pinned memory is optimized for the [[memcpy-host!]] function, while 'mapped' memory is
409 |   optimized for [[memcpy!]].
410 |   See [CUDA Device Driver API Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)"
411 |   ([memory]
412 |    (mem-host-register* memory 0))
413 |   ([memory flags]
414 |    (let [flag-mask (if-not (keyword? flags)
415 |                      (mask mem-host-register-flags flags)
416 |                      (or (mem-host-register-flags flags)
417 |                          (throw (ex-info "Unknown mem-host-register flag." {:flag flags :available mem-host-register-flags}))))]
418 |      (mem-host-register* memory flag-mask))))
419 |
420 | (defn mem-alloc-mapped
421 |   "Allocates `byte-size` bytes of uninitialized host memory, 'mapped' to the device. Optionally,
422 |   accepts a `type` of the pointer as a keyword (`:float` or `Float/TYPE` for `FloatPointer`, etc.),
423 |   and throws `ExceptionInfo` if the type is unknown. Mapped memory is optimized for the [[memcpy!]]
424 |   operation, while 'pinned' memory is optimized for [[memcpy-host!]].
425 |   See [CUDA Driver API Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html)"
426 |   ([^long byte-size] (mem-alloc-host* (max 0 byte-size)))
427 |   ([^long byte-size type]
428 |    (if-let [t (type-pointer type)] ;; reject unknown types like the other typed allocators do
429 |      (mem-alloc-host* (max 0 byte-size) t)
430 |      (throw (ex-info (format "Unknown data type: %s." (str type)) {})))))
431 |
432 | ;; ================== Module Management =====================================
433 |
434 | (defn link
435 |   "Invokes the CUDA linker on data provided as a vector `[[type source ], ...]`, with optional
436 |   `options`. Without arguments, only creates an empty link state.
437 |   Produces a cubin compiled for a particular Nvidia architecture.
438 |   Please see relevant examples from the test folder.
439 |   See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html)"
440 |   ([]
441 |    (CUlinkState_st.))
442 |   ([data]
443 |    (link data nil))
444 |   ([data options]
445 |    (let-release [state (CUlinkState_st.)]
446 |      (link* state data options))))
447 |
448 | (defn link-complete!
449 |   "Completes the link state created by [[link]], and returns the resulting cubin image as a byte
450 |   pointer that can be loaded by the [[module]] function. See the examples in the test folder."
451 |   [^CUlinkState_st link-state]
452 |   (let-release [image (byte-pointer nil)]
453 |     (with-release [image-size (size-t-pointer 1)]
454 |       (with-check (cudart/cuLinkComplete link-state image image-size)
455 |         (let [byte-count (get-entry image-size 0)]
456 |           (capacity! image byte-count))))))
457 |
458 | (defn load!
459 |   "Loads data into module `m` from a [[ptx]] string, nvrtc program, java path, or binary `data`,
460 |   and returns `m`. Please see relevant examples from the test folder.
461 |   See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html)
462 |   "
463 |   [m data]
464 |   (let [data-ptr (safe (pointer data))]
465 |     (module-load* data-ptr m) m))
466 |
467 | (defn module
468 |   "Creates a new CUDA module and, if `data` is given, [[load!]]s a string, nvrtc program, or binary.
469 |   See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html)"
470 |   ([data]
471 |    (load! (CUmod_st.) data))
472 |   ([]
473 |    (CUmod_st.)))
474 |
475 | (defrecord GridDim [^long grid-x ^long grid-y ^long grid-z ^long block-x ^long block-y ^long block-z]) ; grid and block dimensions, read by launch!
476 |
477 | (defn grid-1d
478 |   "Creates a 1-dimensional [[GridDim]] record that covers `dim-x` threads with blocks of at most
479 |   `block-x` (1024 by default) threads. Note: dim-x is the total number of threads, not of blocks."
480 |   ([^long dim-x]
481 |    (grid-1d dim-x 1024))
482 |   ([^long dim-x ^long block-x]
483 |    (let [threads (min dim-x block-x)
484 |          blocks (count-groups threads dim-x)]
485 |      (GridDim. blocks 1 1 threads 1 1))))
486 |
487 | (defn grid-2d
488 |   "Creates a 2-dimensional [[GridDim]] record that covers `dim-x` by `dim-y` threads; blocks default
489 |   to at most 32 threads along x and 1024 in total. Note: dims are thread counts, not block counts."
490 |   ([^long dim-x ^long dim-y]
491 |    (let [block-x (min dim-x 32)]
492 |      (grid-2d dim-x dim-y block-x (min dim-y (quot 1024 block-x)))))
493 |   ([^long dim-x ^long dim-y ^long block-x ^long block-y]
494 |    (let [threads-x (min dim-x block-x)
495 |          threads-y (min dim-y block-y)]
496 |      (GridDim. (count-groups threads-x dim-x) (count-groups threads-y dim-y) 1
497 |                threads-x threads-y 1))))
498 |
499 | (defn grid-3d
500 |   "Creates a 3-dimensional [[GridDim]] record that covers `dim-x` by `dim-y` by `dim-z` threads; blocks
501 |   default to at most 32 threads along x and 1024 in total. Note: dims are thread counts, not blocks."
502 |   ([^long dim-x ^long dim-y ^long dim-z]
503 |    (let [block-x (min dim-x 32)
504 |          block-y (min dim-y (quot 1024 block-x))
505 |          block-z (min dim-z (quot 1024 (* block-x block-y)))]
506 |      (grid-3d dim-x dim-y dim-z block-x block-y block-z)))
507 |   ([dim-x dim-y dim-z block-x block-y block-z]
508 |    (let [threads-x (min (long dim-x) (long block-x))
509 |          threads-y (min (long dim-y) (long block-y))
510 |          threads-z (min (long dim-z) (long block-z))]
511 |      (GridDim. (count-groups threads-x dim-x) (count-groups threads-y dim-y)
512 |                (count-groups threads-z dim-z) threads-x threads-y threads-z))))
513 |
514 | (defn global
515 |   "Returns CUDA global device memory object named `name` from module `m`, sized as reported by
516 |   `cuModuleGetGlobal`. Global memory is typically defined in C++ source files of CUDA kernels.
517 |   See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html)
518 |   "
519 |   [^CUmod_st m ^String name]
520 |   (let-release [address (long-pointer 1)]
521 |     (with-release [size-out (size-t-pointer 1)]
522 |       (with-check
523 |         (cudart/cuModuleGetGlobal ^LongPointer address ^SizeTPointer size-out m name)
524 |         {:name name}
525 |         (->CUDevicePtr address (get-entry size-out 0) false)))))
526 |
527 | (defn set-parameter!
528 |   "Sets the `i`th parameter in a parameter array `pp`, and the rest of `parameters` in the places after `i`. Returns `pp`."
529 |   [^PointerPointer pp i parameter & parameters]
530 |   (when-not (< -1 (long i) (size pp))
531 |     (throw (ex-info "Index out of bounds." {:requested i :available (size pp)})))
532 |   (set-parameter* parameter (extract pp) i)
533 |   (if-not parameters
534 |     pp
535 |     (recur pp (inc (long i)) (first parameters) (next parameters))))
536 |
537 | (defn parameters
538 |   "Creates a `PointerPointer` that holds `parameter`, followed by the rest of `parameters`. Each
539 |   of them can be any object on device (Device API memory, Runtime API memory, JavaCPP pointers),
540 |   or host (arrays, numbers, JavaCPP pointers) that makes sense as a kernel parameter per CUDA
541 |   specification. Use the result as a parameter argument in [[launch!]].
542 |   "
543 |   ([parameter & parameters]
544 |    (let-release [len (inc (count parameters))
545 |                  pp (pointer-pointer len)]
546 |      (apply set-parameter! pp 0 parameter parameters))))
547 |
548 | ;; ====================== Execution Control ==================================
549 |
550 | (defn function
551 | "Returns the `CUfunc_st` handle of the CUDA kernel function named `name` located in module `m`.
552 | See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html)
553 | "
554 | [^CUmod_st m ^String name]
555 | (let [res (CUfunc_st.)]
556 | (with-check (cudart/cuModuleGetFunction res m name) {:name name} res)))
557 |
558 | (defn launch!
559 | "Invokes the kernel `fun` on a `grid-dim` grid of blocks, using `params` `PointerPointer`.
560 | Optionally, you can specify the amount of shared memory that will be available to each thread block
561 | (`shared-mem-bytes`, 0 by default), and `hstream` to use for execution (`nil` by default).
562 | See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html)
563 | "
564 | ([^CUfunc_st fun ^GridDim grid-dim shared-mem-bytes ^CUstream_st hstream ^PointerPointer params]
565 | (with-check
566 | (cudart/cuLaunchKernel fun (.grid-x grid-dim) (.grid-y grid-dim) (.grid-z grid-dim)
567 | (.block-x grid-dim) (.block-y grid-dim) (.block-z grid-dim)
568 | (int shared-mem-bytes) hstream params nil)
569 | {:kernel (info fun) :grid-dim grid-dim :hstream (info hstream)}
570 | hstream))
571 | ([^CUfunc_st fun ^GridDim grid-dim hstream params]
572 | (launch! fun grid-dim 0 hstream params))
573 | ([^CUfunc_st fun ^GridDim grid-dim params]
574 | (launch! fun grid-dim 0 nil params)))
575 |
576 | ;; ================== Stream Management ======================================
577 |
578 | (defn stream
579 |   "Creates a stream using an optional integer `priority` and a keyword `flag`. Without arguments,
580 |   creates a `CU_STREAM_DEFAULT` stream. Throws `ExceptionInfo` if `flag` is not a known stream flag.
581 |   For available flags, see [[internal.constants/stream-flags]]
582 |   See [CUDA Stream Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html)
583 |   "
584 |   ([]
585 |    (stream* cudart/CU_STREAM_DEFAULT))
586 |   ([flag]
587 |    (stream* (or (stream-flags flag)
588 |                 (throw (ex-info "Invalid stream flag." {:flag flag :available stream-flags})))))
589 |   ([^long priority flag]
590 |    (stream* priority (or (stream-flags flag) ;; same message as in the single-flag arity
591 |                          (throw (ex-info "Invalid stream flag." {:flag flag :available stream-flags}))))))
592 |
593 | ;; Metadata must precede the var name; placed before the init form (as it used to be) it was read
594 | ;; onto the `cudart/CU_STREAM_PER_THREAD` symbol, so the var had no `:doc`. The equally inert
595 | ;; `:const` is not carried over, which keeps the var's runtime behavior exactly as it was.
596 | (def ^{:doc "The default per-thread stream."} default-stream cudart/CU_STREAM_PER_THREAD)
597 |
598 | (defn ready?
599 | "Returns `true` if the status query of a compute stream or event `obj` yields `CUDA_SUCCESS`, `false` otherwise.
600 | See [CUDA Stream Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html)
601 | and [CUDA Event Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html)
602 | "
603 | [obj]
604 | (= cudart/CUDA_SUCCESS (ready* (extract obj))))
605 |
606 | (defn synchronize!
607 |   "Blocks the current thread until the tasks of `hstream` or, without arguments, of the context complete."
608 |   ([^CUstream_st hstream]
609 |    (with-check (cudart/cuStreamSynchronize hstream) hstream))
610 |   ([]
611 |    (with-check (cudart/cuCtxSynchronize) true)))
612 |
613 | (defn add-host-fn!
614 |   "Adds host function `f` to the compute stream `hstream`, with optional `data` related to the
615 |   call. If `data` is not provided, places `hstream` under data.
616 |   Returns `hstream`.
617 |   "
618 |   ([hstream f]
619 |    (add-host-fn! hstream f hstream))
620 |   ([hstream f data]
621 |    (add-host-fn* hstream f data)
622 |    hstream))
623 |
624 | (defn listen!
625 | "Adds a host function listener to a compute stream `hstream`, with optional `data` related to the call,
626 | and connects it to a Clojure channel `ch`. If `data` is not provided, places `hstream` under data. Returns `hstream`.
627 | "
628 | ([hstream ch data]
629 | (let [data (safe (pointer data))]
630 | (add-host-fn* hstream (host-fn* data ch) data)
631 | hstream))
632 | ([hstream ch]
633 | (add-host-fn* hstream (host-fn* hstream ch) hstream)
634 | hstream))
635 |
636 | (defn wait-event!
637 | "Makes a compute stream `hstream` wait on an event `ev` (the flags argument is always 0). Returns `hstream`.
638 | See [CUDA Event Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html)
639 | "
640 | [^CUstream_st hstream ^CUevent_st ev]
641 | (with-check (cudart/cuStreamWaitEvent hstream ev 0) hstream))
642 |
643 | (defn attach-mem!
644 | "Attaches memory `mem` of size `byte-size`, specified by `flag` to a `hstream` asynchronously.
645 | For available flags, see [[internal.constants/mem-attach-flags]]. The default is `:single`.
646 | If :global flag is specified, the memory can be accessed by any stream on any device.
647 | If :host flag is specified, the program makes a guarantee that it won't access the memory on
648 | the device from any stream on a device that has no `concurrent-managed-access` capability.
649 | If :single flag is specified and `hStream` is associated with a device that has no
650 | `concurrent-managed-access` capability, the program makes a guarantee that it will only access
651 | the memory on the device from `hStream`. It is illegal to attach singly to the nil stream,
652 | because the nil stream is a virtual global stream and not a specific stream. An error will
653 | be returned in this case.
654 |
655 | When memory is associated with a single stream, the Unified Memory system will allow CPU access
656 | to this memory region so long as all operations in hStream have completed, regardless of whether
657 | other streams are active. In effect, this constrains exclusive ownership of the managed memory
658 | region by an active GPU to per-stream activity instead of whole-GPU activity.
659 |
660 | See [CUDA Stream Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html)."
661 | ([^CUstream_st hstream mem ^long byte-size flag]
662 | (attach-mem* (or (extract hstream)
663 | (when-not (= :global flag)
664 | (throw (ex-info "nil stream is a virtual global stream and not a specific stream that may be only used with :global mem-attach flag."
665 | {:flag flag :available mem-attach-flags}))))
666 | (cu-address* mem) byte-size
667 | (or (mem-attach-flags flag)
668 | (throw (ex-info "Unknown mem-attach flag."
669 | {:flag flag :available mem-attach-flags}))))
670 | hstream)
671 | ([mem byte-size flag]
672 | (attach-mem! default-stream mem byte-size flag)))
673 |
674 | ;; ================== Event Management =======================================
675 |
676 | (defn event
677 |   "Creates an event specified by one or more keyword `flags`, or a `CU_EVENT_DEFAULT` event when
678 |   called without arguments. For available flags, see [[internal.constants/event-flags]].
679 |   See [CUDA Event Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html)
680 |   "
681 |   ([]
682 |    (event* cudart/CU_EVENT_DEFAULT))
683 |   ([flag & flags]
684 |    (event* (if-not flags
685 |              (or (event-flags flag)
686 |                  (throw (ex-info "Unknown event flag." {:flag flag :available event-flags})))
687 |              (mask event-flags (cons flag flags))))))
688 |
689 | (defn elapsed-time!
690 |   "Returns the time, in milliseconds, that elapsed between `start-event` and `end-event`.
691 |   See [CUDA Event Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html)
692 |   "
693 |   ^double [^CUevent_st start-event ^CUevent_st end-event]
694 |   (let [millis (float-array 1)]
695 |     (with-check (cudart/cuEventElapsedTime millis start-event end-event) (aget millis 0))))
696 |
697 | (defn record!
698 | "Records an event `event` on optional `stream`. Returns `stream`, or [[default-stream]] if it was not given.
699 | See [CUDA Event Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html)
700 | "
701 | ([^CUstream_st stream ^CUevent_st event]
702 | (with-check (cudart/cuEventRecord event stream) stream))
703 | ([^CUevent_st event]
704 | (with-check (cudart/cuEventRecord event nil) default-stream)))
705 |
706 | ;; ================== Peer Context Memory Access =============================
707 |
708 | (defn can-access-peer
709 | "Queries if a device `dev` may directly access the memory of a peer device `peer`.
710 | See [CUDA Peer Access Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html)
711 | "
712 | [dev peer]
713 | (can-access-peer* (extract dev) (extract peer)))
714 |
715 | (defn p2p-attribute
716 |   "Queries the keyword `attribute` of the link between two devices, `dev` and `peer`; throws if it is unknown.
717 |   See [CUDA Peer Access Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html)
718 |   "
719 |   [dev peer attribute]
720 |   (let [a (extract dev), b (extract peer)]
721 |     (p2p-attribute* a b (or (p2p-attributes attribute)
722 |                             (throw (ex-info "Unknown p2p attribute." {:attribute attribute :available p2p-attributes}))))))
723 |
724 | (defn disable-peer-access!
725 | "Disables direct access to memory allocations in a peer context `ctx` (the current context by default) and unregisters any registered allocations.
726 | See [CUDA Peer Access Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html)
727 | "
728 | ([ctx]
729 | (with-check (cudart/cuCtxDisablePeerAccess ctx) ctx))
730 | ([]
731 | (disable-peer-access! (current-context))))
732 |
733 | (defn enable-peer-access!
734 | "Enables direct access to memory allocations in a peer context `ctx` (the current context by default). Returns `ctx`.
735 | See [CUDA Peer Access Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html)
736 | "
737 | ([ctx]
738 | (with-check (cudart/cuCtxEnablePeerAccess ctx 0) ctx)
739 | ctx)
740 | ([]
741 | (enable-peer-access! (current-context))))
742 |
743 | ;; ====================== Nvrtc program JIT ========================================
744 |
745 | (defn program
746 |   "Creates a CUDA program from the `source-code`, with an optional `name` and an optional
747 |   hash map of `headers` (as strings) and their names. `name` and `headers` default to `nil`.
748 |   "
749 |   ([source-code]
750 |    (program nil source-code nil))
751 |   ([source-code headers]
752 |    (program nil source-code headers))
753 |   ([^String name ^String source-code headers]
754 |    (program* (string-pointer name) (string-pointer source-code)
755 |              (->> headers vals (into-array String) pointer-pointer)
756 |              (->> headers keys (into-array String) pointer-pointer))))
757 |
758 | (defn program-log
759 | "Returns the log string generated by the previous compilation of the program `prog`."
760 | [prog]
761 | (program-log* prog))
762 |
763 | (defn compile!
764 |   "Compiles the given `prog` using a list of string `options` (none by default). Returns `prog`."
765 |   ([prog]
766 |    (compile! prog nil))
767 |   ([prog options]
768 |    (let [opts (->> options (into-array String) pointer-pointer)]
769 |      (compile* prog opts) prog)))
770 |
771 | (defn ptx
772 | "Returns the PTX generated by the previous compilation of the program `prog`."
773 | [prog]
774 | (ptx* prog))
775 |
--------------------------------------------------------------------------------
/src/clojure/uncomplicate/clojurecuda/info.clj:
--------------------------------------------------------------------------------
1 | ;; Copyright (c) Dragan Djuric. All rights reserved.
2 | ;; The use and distribution terms for this software are covered by the
3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
4 | ;; which can be found in the file LICENSE at the root of this distribution.
5 | ;; By using this software in any fashion, you are agreeing to be bound by
6 | ;; the terms of this license.
7 | ;; You must not remove this notice, or any other, from this software.
8 |
9 | (ns ^{:author "Dragan Djuric"}
10 | uncomplicate.clojurecuda.info
11 | "Info functions for all CUDA objects (devices, etc...).
12 | "
13 | (:require [clojure.string :as str]
14 | [uncomplicate.commons.core :refer [with-release Info]]
15 | [uncomplicate.fluokitten.core :refer [fmap op]]
16 | [uncomplicate.clojure-cpp :as cpp
17 | :refer [int-pointer byte-pointer size-t-pointer get-string get-entry]]
18 | [uncomplicate.clojurecuda.internal
19 | [constants :refer [ctx-limits dec-compute-mode dec-func-cache-config dec-shared-config
20 | dec-stream-flag func-cache-config shared-config-map]]
21 | [utils :refer [with-check maybe]]
22 | [impl :refer [current-context* ->CUDevice]]])
23 | (:import [org.bytedeco.cuda.global cudart]
24 | [org.bytedeco.cuda.cudart CUctx_st CUfunc_st CUstream_st]
25 | [uncomplicate.clojurecuda.internal.impl CUDevice]))
26 |
27 | ;; =================== Info* utility macros ===============================
28 |
29 | (defmacro ^:private info-attribute* [method object attribute] ; expands to a checked query of one int
30 | `(long (with-release [res# (int-pointer 1)] ; one-int result buffer, released when the body exits
31 | (with-check (~method res# ~attribute ~object) ; note: `attribute` is passed before `object`
32 | (get-entry res# 0))))) ; the value read from the buffer is coerced to long
33 |
34 | ;; =================== Version Management =================================
35 |
36 | (defn driver-version "Returns the version number reported by `cuDriverGetVersion`." ^long []
37 | (with-release [res (int-pointer 1)]
38 | (with-check (cudart/cuDriverGetVersion res) (get-entry res 0))))
39 |
40 | ;; =================== Device info =======================================
41 |
42 | (defn device-name "Returns the `cuDeviceGetName` name of `device`, with all space characters removed." [^CUDevice device]
43 |   (with-release [name-buf (byte-pointer 64)]
44 |     (with-check (cudart/cuDeviceGetName name-buf 64 (.dev device))
45 |       (str/replace (get-string name-buf) #" " ""))))
46 |
47 | (defn total-mem "Returns the value that `cuDeviceTotalMem` reports for `device`." [^CUDevice device]
48 | (with-release [res (size-t-pointer 1)]
49 | (with-check (cudart/cuDeviceTotalMem res (.dev device))
50 | (get-entry res 0))))
51 |
52 | (defn async-engine-count "Returns the `CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT` attribute of `device`." ^long [^CUDevice device]
53 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
54 | cudart/CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT))
55 |
56 | (defn can-map-host-memory "Returns `true` if the `CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY` attribute of `device` is positive." [^CUDevice device]
57 | (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
58 | cudart/CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY)))
59 |
60 | (defn clock-rate "Returns the `CU_DEVICE_ATTRIBUTE_CLOCK_RATE` attribute of `device`." ^long [^CUDevice device]
61 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
62 | cudart/CU_DEVICE_ATTRIBUTE_CLOCK_RATE))
63 |
64 | (defn compute-capability-major "Returns the `CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR` attribute of `device`." ^long [^CUDevice device]
65 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
66 | cudart/CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR))
67 |
68 | (defn compute-capability-minor "Returns the `CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR` attribute of `device`." ^long [^CUDevice device]
69 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
70 | cudart/CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR))
71 |
72 | (defn compute-mode "Returns the raw numeric `CU_DEVICE_ATTRIBUTE_COMPUTE_MODE` attribute of `device`." [^CUDevice device]
73 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
74 | cudart/CU_DEVICE_ATTRIBUTE_COMPUTE_MODE))
75 |
76 | (defn concurrent-kernels "Returns the `CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS` attribute of `device`." ^long [^CUDevice device]
77 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
78 | cudart/CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS))
79 |
80 | (defn ecc-enabled "Returns `true` if the `CU_DEVICE_ATTRIBUTE_ECC_ENABLED` attribute of `device` is positive." [^CUDevice device]
81 | (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
82 | cudart/CU_DEVICE_ATTRIBUTE_ECC_ENABLED)))
83 |
84 | (defn global-L1-cache-supported "Returns `true` if the `CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED` attribute of `device` is positive." [^CUDevice device]
85 | (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
86 | cudart/CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED)))
87 |
88 | (defn global-memory-bus-width "Returns the `CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH` attribute of `device`." ^long [^CUDevice device]
89 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
90 | cudart/CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH))
91 |
92 | (defn integrated "Returns `true` if the `CU_DEVICE_ATTRIBUTE_INTEGRATED` attribute of `device` is positive." [^CUDevice device]
93 | (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
94 | cudart/CU_DEVICE_ATTRIBUTE_INTEGRATED)))
95 |
96 | (defn kernel-exec-timeout "Returns `true` if the `CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT` attribute of `device` is positive." [^CUDevice device]
97 | (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
98 | cudart/CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT)))
99 |
100 | (defn L2-cache-size "Returns the `CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE` attribute of `device`." ^long [^CUDevice device]
101 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
102 | cudart/CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE))
103 |
104 | (defn local-L1-cache-supported "Returns `true` if the `CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED` attribute of `device` is positive." [^CUDevice device]
105 | (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
106 | cudart/CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED)))
107 |
108 | (defn managed-memory "Returns `true` if the `CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY` attribute of `device` is positive." [^CUDevice device]
109 | (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
110 | cudart/CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)))
111 |
112 | (defn concurrent-managed-access "Returns `true` if the `CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS` attribute of `device` is positive." [^CUDevice device]
113 | (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
114 | cudart/CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)))
115 |
116 | (defn max-block-dim-x "Returns the `CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X` attribute of `device`." ^long [^CUDevice device]
117 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
118 | cudart/CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X))
119 |
120 | (defn max-block-dim-y "Returns the `CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y` attribute of `device`." ^long [^CUDevice device]
121 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
122 | cudart/CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y))
123 |
124 | (defn max-block-dim-z "Returns the `CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z` attribute of `device`." ^long [^CUDevice device]
125 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
126 | cudart/CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z))
127 |
128 | (defn max-grid-dim-x "Returns the `CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X` attribute of `device`." ^long [^CUDevice device]
129 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
130 | cudart/CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X))
131 |
132 | (defn max-grid-dim-y "Returns the `CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y` attribute of `device`." ^long [^CUDevice device]
133 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
134 | cudart/CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y))
135 |
136 | (defn max-grid-dim-z "Returns the `CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z` attribute of `device`." ^long [^CUDevice device]
137 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
138 | cudart/CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z))
139 |
140 | (defn max-pitch "Returns the `CU_DEVICE_ATTRIBUTE_MAX_PITCH` attribute of `device`." ^long [^CUDevice device]
141 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
142 | cudart/CU_DEVICE_ATTRIBUTE_MAX_PITCH))
143 |
144 | (defn max-registers-per-block "Returns the `CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK` attribute of `device`." ^long [^CUDevice device]
145 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
146 | cudart/CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK))
147 |
148 | (defn max-registers-per-multiprocessor "Returns the `CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR` attribute of `device`." ^long [^CUDevice device]
149 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
150 | cudart/CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR))
151 |
152 | (defn max-shared-memory-per-block "Returns the `CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK` attribute of `device`." ^long [^CUDevice device]
153 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
154 | cudart/CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK))
155 |
156 | (defn max-shared-memory-per-multiprocessor "Returns the `CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR` attribute of `device`." ^long [^CUDevice device]
157 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
158 | cudart/CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR))
159 |
160 | (defn max-threads-per-block "Returns the `CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK` attribute of `device`." ^long [^CUDevice device]
161 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
162 | cudart/CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
163 |
164 | (defn max-threads-per-multiprocessor "Returns the `CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR` attribute of `device`." ^long [^CUDevice device]
165 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
166 | cudart/CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR))
167 |
168 | (defn maximum-surface1d-layered-layers "Returns the `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS` attribute of `device`." ^long [^CUDevice device]
169 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
170 | cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS))
171 |
172 | (defn maximum-surface1d-layered-width "Returns the `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH` attribute of `device`." ^long [^CUDevice device]
173 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
174 | cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH))
175 |
176 | (defn maximum-surface1d-width "Returns the `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH` attribute of `device`." ^long [^CUDevice device]
177 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
178 | cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH))
179 |
180 | (defn maximum-surface2d-height "Returns the `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT` attribute of `device`." ^long [^CUDevice device]
181 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
182 | cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT))
183 |
184 | (defn maximum-surface2d-width "Returns the `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH` attribute of `device`." ^long [^CUDevice device]
185 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
186 | cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH))
187 |
(defn maximum-surface2d-layered-height
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT))
191 |
(defn maximum-surface2d-layered-width
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH))
195 |
(defn maximum-surface2d-layered-layers
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS))
199 |
(defn maximum-surface3d-depth
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH))
203 |
(defn maximum-surface3d-height
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT))
207 |
(defn maximum-surface3d-width
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH))
211 |
(defn maximum-surfacecubemap-layered-width
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH))
215 |
(defn maximum-surfacecubemap-layered-layers
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS))
219 |
(defn maximum-surfacecubemap-width
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH))
223 |
(defn maximum-texture1d-layered-width
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH))
227 |
(defn maximum-texture1d-layered-layers
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS))
231 |
(defn maximum-texture1d-linear-width
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH))
235 |
(defn maximum-texture1d-mipmapped-width
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH))
239 |
(defn maximum-texture1d-width
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH))
243 |
(defn maximum-texture2d-height
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT))
247 |
(defn maximum-texture2d-layered-height
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT))
251 |
(defn maximum-texture2d-layered-layers
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS))
255 |
(defn maximum-texture2d-linear-height
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT))
259 |
(defn maximum-texture2d-linear-pitch
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH))
263 |
(defn maximum-texture2d-linear-width
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH))
267 |
(defn maximum-texture2d-mipmapped-width
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH))
271 |
(defn maximum-texture2d-mipmapped-height
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT))
275 |
(defn maximum-texture2d-width
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH))
279 |
(defn maximum-texture3d-depth
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH))
283 |
(defn maximum-texture3d-depth-alternate
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE))
287 |
(defn maximum-texture3d-height
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT))
291 |
(defn maximum-texture3d-height-alternate
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE))
295 |
(defn maximum-texture3d-width
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH))
299 |
(defn maximum-texture3d-width-alternate
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE))
303 |
(defn maximum-texturecubemap-layered-layers
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS))
307 |
(defn maximum-texturecubemap-layered-width
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH))
311 |
(defn maximum-texturecubemap-width
  "Queries the `CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH))
315 |
(defn memory-clock-rate
  "Queries the `CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE))
319 |
(defn multi-gpu-board
  "Returns `true` when the `CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD` attribute of `device`,
  queried via `cuDeviceGetAttribute`, is positive."
  [^CUDevice device]
  (-> (info-attribute* cudart/cuDeviceGetAttribute
                       (.dev device)
                       cudart/CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD)
      pos?))
323 |
(defn multi-gpu-board-group-id
  "Queries the `CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID))
327 |
(defn multiprocessor-count
  "Queries the `CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))
331 |
(defn pci-bus-id
  "Queries the `CU_DEVICE_ATTRIBUTE_PCI_BUS_ID` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_PCI_BUS_ID))
335 |
(defn pci-bus-id-string
  "Obtains the PCI bus id of `device` as a string through `cuDeviceGetPCIBusId`.

  The driver writes into a 13-byte buffer; 12 bytes are then transferred with `memcpy!`
  to a second, 12-byte buffer, and that second buffer is converted to the result string."
  [^CUDevice device]
  (with-release [raw (byte-pointer 13)
                 trimmed (byte-pointer 12)]
    (with-check (cudart/cuDeviceGetPCIBusId raw 13 (.dev device))
      (do
        ;; NOTE(review): presumably the 13th byte is the NUL terminator being dropped - confirm.
        (cpp/memcpy! raw trimmed 12)
        (get-string trimmed)))))
343 |
(defn pci-device-id
  "Queries the `CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID))
347 |
(defn pci-domain-id
  "Queries the `CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID))
351 |
(defn stream-priorities-supported
  "Returns `true` when the `CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED` attribute of
  `device`, queried via `cuDeviceGetAttribute`, is positive."
  [^CUDevice device]
  (-> (info-attribute* cudart/cuDeviceGetAttribute
                       (.dev device)
                       cudart/CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED)
      pos?))
355 |
(defn surface-alignment
  "Queries the `CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT))
359 |
(defn tcc-driver
  "Returns `true` when the `CU_DEVICE_ATTRIBUTE_TCC_DRIVER` attribute of `device`,
  queried via `cuDeviceGetAttribute`, is positive."
  [^CUDevice device]
  (-> (info-attribute* cudart/cuDeviceGetAttribute
                       (.dev device)
                       cudart/CU_DEVICE_ATTRIBUTE_TCC_DRIVER)
      pos?))
363 |
(defn texture-alignment
  "Queries the `CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT))
367 |
(defn texture-pitch-alignment
  "Queries the `CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT))
371 |
(defn total-constant-memory
  "Queries the `CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY))
375 |
(defn unified-addressing
  "Returns `true` when the `CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING` attribute of `device`,
  queried via `cuDeviceGetAttribute`, is positive."
  [^CUDevice device]
  (-> (info-attribute* cudart/cuDeviceGetAttribute
                       (.dev device)
                       cudart/CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)
      pos?))
379 |
(defn warp-size
  "Queries the `CU_DEVICE_ATTRIBUTE_WARP_SIZE` attribute of `device`
  via `cuDeviceGetAttribute`."
  ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute
                   (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_WARP_SIZE))
383 |
;; Lookup table from attribute keyword to the one-argument function that reads that
;; attribute from a device. Used by the `Info` implementation for `CUDevice`.
(def ^:no-doc device-attributes
  {:name                                   device-name
   :total-mem                              total-mem
   :async-engine-count                     async-engine-count
   :can-map-host-memory                    can-map-host-memory
   :clock-rate                             clock-rate
   :compute-capability-major               compute-capability-major
   :compute-capability-minor               compute-capability-minor
   ;; The raw integer mode is decoded into a keyword.
   :compute-mode                           (comp dec-compute-mode compute-mode)
   :concurrent-kernels                     concurrent-kernels
   :ecc-enabled                            ecc-enabled
   :global-L1-cache-supported              global-L1-cache-supported
   :global-memory-bus-width                global-memory-bus-width
   :integrated                             integrated
   :kernel-exec-timeout                    kernel-exec-timeout
   :L2-cache-size                          L2-cache-size
   :local-L1-cache-supported               local-L1-cache-supported
   :managed-memory                         managed-memory
   :max-block-dim-x                        max-block-dim-x
   :max-block-dim-y                        max-block-dim-y
   :max-block-dim-z                        max-block-dim-z
   :max-grid-dim-x                         max-grid-dim-x
   :max-grid-dim-y                         max-grid-dim-y
   :max-grid-dim-z                         max-grid-dim-z
   :max-pitch                              max-pitch
   :max-registers-per-block                max-registers-per-block
   :max-registers-per-multiprocessor       max-registers-per-multiprocessor
   :max-shared-memory-per-block            max-shared-memory-per-block
   :max-shared-memory-per-multiprocessor   max-shared-memory-per-multiprocessor
   :max-threads-per-block                  max-threads-per-block
   :max-threads-per-multiprocessor         max-threads-per-multiprocessor
   :maximum-surface1d-layered-layers       maximum-surface1d-layered-layers
   :maximum-surface1d-layered-width        maximum-surface1d-layered-width
   :maximum-surface1d-width                maximum-surface1d-width
   :maximum-surface2d-height               maximum-surface2d-height
   :maximum-surface2d-width                maximum-surface2d-width
   :maximum-surface2d-layered-height       maximum-surface2d-layered-height
   :maximum-surface2d-layered-width        maximum-surface2d-layered-width
   :maximum-surface2d-layered-layers       maximum-surface2d-layered-layers
   :maximum-surface3d-depth                maximum-surface3d-depth
   :maximum-surface3d-height               maximum-surface3d-height
   :maximum-surface3d-width                maximum-surface3d-width
   :maximum-surfacecubemap-layered-width   maximum-surfacecubemap-layered-width
   :maximum-surfacecubemap-layered-layers  maximum-surfacecubemap-layered-layers
   :maximum-surfacecubemap-width           maximum-surfacecubemap-width
   :maximum-texture1d-layered-width        maximum-texture1d-layered-width
   :maximum-texture1d-layered-layers       maximum-texture1d-layered-layers
   :maximum-texture1d-linear-width         maximum-texture1d-linear-width
   :maximum-texture1d-mipmapped-width      maximum-texture1d-mipmapped-width
   :maximum-texture1d-width                maximum-texture1d-width
   :maximum-texture2d-height               maximum-texture2d-height
   :maximum-texture2d-layered-height       maximum-texture2d-layered-height
   :maximum-texture2d-layered-layers       maximum-texture2d-layered-layers
   :maximum-texture2d-linear-height        maximum-texture2d-linear-height
   :maximum-texture2d-linear-pitch         maximum-texture2d-linear-pitch
   :maximum-texture2d-linear-width         maximum-texture2d-linear-width
   :maximum-texture2d-mipmapped-width      maximum-texture2d-mipmapped-width
   :maximum-texture2d-mipmapped-height     maximum-texture2d-mipmapped-height
   :maximum-texture2d-width                maximum-texture2d-width
   :maximum-texture3d-depth                maximum-texture3d-depth
   :maximum-texture3d-depth-alternate      maximum-texture3d-depth-alternate
   :maximum-texture3d-height               maximum-texture3d-height
   :maximum-texture3d-height-alternate     maximum-texture3d-height-alternate
   :maximum-texture3d-width                maximum-texture3d-width
   :maximum-texture3d-width-alternate      maximum-texture3d-width-alternate
   :maximum-texturecubemap-layered-layers  maximum-texturecubemap-layered-layers
   :maximum-texturecubemap-layered-width   maximum-texturecubemap-layered-width
   :maximum-texturecubemap-width           maximum-texturecubemap-width
   :memory-clock-rate                      memory-clock-rate
   :multi-gpu-board                        multi-gpu-board
   :multi-gpu-board-group-id               multi-gpu-board-group-id
   :multiprocessor-count                   multiprocessor-count
   :pci-bus-id                             pci-bus-id
   :pci-bus-id-string                      pci-bus-id-string
   :pci-device-id                          pci-device-id
   :pci-domain-id                          pci-domain-id
   :stream-priorities-supported            stream-priorities-supported
   :surface-alignment                      surface-alignment
   :tcc-driver                             tcc-driver
   :texture-alignment                      texture-alignment
   :texture-pitch-alignment                texture-pitch-alignment
   :total-constant-memory                  total-constant-memory
   :unified-addressing                     unified-addressing
   :warp-size                              warp-size})
469 |
;; `Info` for devices: the two-argument form reads a single attribute named by keyword
;; and throws on an unknown keyword; the one-argument form reads every attribute in
;; `device-attributes`. Each individual read is wrapped in `maybe`.
(extend-type CUDevice
  Info
  (info
    ([d attribute]
     (let [attribute-fn (get device-attributes attribute)]
       (when-not attribute-fn
         (throw (ex-info "Unknown attribute." {:attribute attribute})))
       (maybe (attribute-fn d))))
    ([d]
     (fmap (fn [attribute-fn] (maybe (attribute-fn d)))
           device-attributes))))
479 |
480 | ;; ======================= Context Info ==================================
481 |
(defn api-version
  "Gets the API version of the context `ctx` through `cuCtxGetApiVersion`.
  When called without arguments, the context returned by `current-context*` is used."
  ([^CUctx_st ctx]
   (with-release [version (int-pointer 1)]
     (with-check (cudart/cuCtxGetApiVersion ctx version)
       (get-entry version 0))))
  ([]
   (api-version (current-context*))))
491 |
(defn cache-config
  "Returns the preferred cache configuration for the current context, decoded by
  `dec-func-cache-config`.

  See [cuCtxGetCacheConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  []
  (with-release [res (int-pointer 1)]
    (let [raw-config (with-check (cudart/cuCtxGetCacheConfig res)
                       (get-entry res 0))]
      (dec-func-cache-config raw-config))))
500 |
(defn limit*
  "Reads or writes the resource limit identified by the integer `limit`.

  With one argument, the current value is read through `cuCtxGetLimit`.
  With two arguments, the limit is set to `value` through `cuCtxSetLimit`, and `value`
  is returned.

  See [cuCtxGetLimit](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  (^long [limit]
   (with-release [current (size-t-pointer 1)]
     (with-check (cudart/cuCtxGetLimit current limit)
       (get-entry current 0))))
  (^long [limit ^long value]
   (with-check (cudart/cuCtxSetLimit limit value)
     value)))
511 |
(defn limit
  "Reads the resource limit named by the keyword `limit`.

  The keyword is translated through `ctx-limits`; an unknown keyword raises an `ex-info`.
  Supported limits are: `stack-size`, `malloc-heap-size`, `printf-fifo-size`, `dev-runtime-sync-depth`,
  `dev-runtime-pending-launch-count`.

  See [cuCtxGetLimit](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  ^long [limit]
  (let [limit-code (or (ctx-limits limit)
                       (throw (ex-info "Unknown limit." {:limit limit :available ctx-limits})))]
    (limit* limit-code)))
522 |
(defn limit!
  "Sets the resource limit named by the keyword `limit` to `value`.

  The keyword is translated through `ctx-limits`; an unknown keyword raises an `ex-info`.
  Supported limits are: `stack-size`, `malloc-heap-size`, `printf-fifo-size`, `dev-runtime-sync-depth`,
  `dev-runtime-pending-launch-count`.

  See [cuCtxSetLimit](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  ^long [limit ^long value]
  (let [limit-code (or (ctx-limits limit)
                       (throw (ex-info "Unknown limit." {:limit limit :available ctx-limits})))]
    (limit* limit-code value)))
534 |
(defn ctx-device
  "Returns the device of the current context, obtained through `cuCtxGetDevice`
  and wrapped in a `CUDevice`."
  []
  (with-release [dev (int-pointer 1)]
    (with-check (cudart/cuCtxGetDevice dev)
      (->CUDevice (get-entry dev 0)))))
540 |
(defn shared-config*
  "Reads or writes the shared memory configuration, expressed as an integer.

  - no arguments: reads the configuration of the current context;
  - `config`: sets the configuration of the current context and returns `config`;
  - `func` and `config`: sets the configuration of the kernel `func` and returns `func`.

  See [cuCtxGetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  See [cuCtxSetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  See [cuFuncSetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html)
  "
  (^long []
   (with-release [current (int-pointer 1)]
     (with-check (cudart/cuCtxGetSharedMemConfig current)
       (get-entry current 0))))
  (^long [^long config]
   (with-check (cudart/cuCtxSetSharedMemConfig config)
     config))
  ([^CUfunc_st func ^long config]
   (with-check (cudart/cuFuncSetSharedMemConfig func config)
     func)))
555 |
(defn shared-config
  "Reads the shared memory configuration of the current context and decodes it
  with `dec-shared-config`.

  See [cuCtxGetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  []
  (-> (shared-config*)
      dec-shared-config))
563 |
(defn shared-config!
  "Sets the shared memory configuration, named by the keyword `config`, for the current
  context, or for the kernel `func` when it is provided.

  The keyword is translated through `shared-config-map`. An unknown keyword throws an
  `ex-info` whose data holds the offending `:config` and the `:available` mappings.

  See [cuCtxSetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  See [cuFuncSetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html)
  "
  ([config]
   ;; The exception must be thrown here: merely constructing it would hand the exception
   ;; object to `shared-config*` as if it were the configuration value.
   (shared-config* (or (shared-config-map config)
                       (throw (ex-info "Unknown config."
                                       {:config config :available shared-config-map})))))
  ([func config]
   (shared-config* func (or (shared-config-map config)
                            (throw (ex-info "Unknown config."
                                            {:config config :available shared-config-map}))))))
576 |
(defn stream-priority-range
  "Returns a two-element vector `[least greatest]` holding the numerical values of the
  least and greatest stream priorities.

  See [cuCtxGetStreamPriorityRange](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  []
  (with-release [least (int-pointer 1)
                 greatest (int-pointer 1)]
    (with-check (cudart/cuCtxGetStreamPriorityRange least greatest)
      (mapv #(get-entry % 0) [least greatest]))))
587 |
;; `Info` for contexts. The context argument itself is not used: every query below is
;; issued without it, so the values describe the current context.
;; The two-argument form returns one piece of information (`nil` for an unknown keyword);
;; the one-argument form returns everything in a single map, merged with all limits.
(extend-type CUctx_st
  Info
  (info
    ([_ info-type]
     (maybe
      (case info-type
        :api-version (api-version)
        :cache-config (cache-config)
        :stack-size (limit* cudart/CU_LIMIT_STACK_SIZE)
        :malloc-heap-size (limit* cudart/CU_LIMIT_MALLOC_HEAP_SIZE)
        :printf-fifo-size (limit* cudart/CU_LIMIT_PRINTF_FIFO_SIZE)
        :dev-runtime-sync-depth (limit* cudart/CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH)
        :dev-runtime-pending-launch-count (limit* cudart/CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT)
        :limits (fmap #(maybe (limit* %)) ctx-limits)
        :device (ctx-device)
        :shared-config (shared-config)
        :stream-priority-range (stream-priority-range)
        nil)))
    ([_]
     ;; Every entry is wrapped in `maybe`, like the other `Info` implementations in this
     ;; file, so that one failing query does not abort the whole map.
     (op {:api-version (maybe (api-version))
          :cache-config (maybe (cache-config))
          :device (maybe (ctx-device))
          :shared-config (maybe (shared-config))
          :stream-priority-range (maybe (stream-priority-range))}
         (fmap #(maybe (limit* %)) ctx-limits)))))
613 |
614 | ;; =========================== Stream Management ================================
615 |
(defn stream-flag
  "Reads the flags of the stream `hstream` through `cuStreamGetFlags`, as an integer."
  [^CUstream_st hstream]
  (with-release [flags (int-pointer 1)]
    (with-check (cudart/cuStreamGetFlags hstream flags)
      (get-entry flags 0))))
619 |
(defn stream-priority
  "Reads the priority of the stream `hstream` through `cuStreamGetPriority`."
  ^long [^CUstream_st hstream]
  (with-release [priority (int-pointer 1)]
    (with-check (cudart/cuStreamGetPriority hstream priority)
      (get-entry priority 0))))
623 |
;; `Info` for streams: `:flag` (decoded to a keyword by `dec-stream-flag`) and `:priority`.
;; The two-argument form returns `nil` for any other keyword.
(extend-type CUstream_st
  Info
  (info
    ([hstream info-type]
     (maybe
      (case info-type
        :flag (-> (stream-flag hstream) dec-stream-flag)
        :priority (stream-priority hstream)
        nil)))
    ([hstream]
     {:flag (maybe (-> (stream-flag hstream) dec-stream-flag))
      :priority (maybe (stream-priority hstream))})))
636 |
637 | ;; ============================= Execution Management ==========================
638 |
(defn max-threads-per-block-fn
  "Queries the `CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK` attribute of the kernel `function`
  via `cuFuncGetAttribute`."
  ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute
                   function
                   cudart/CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
642 |
(defn shared-size
  "Queries the `CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES` attribute of the kernel `function`
  via `cuFuncGetAttribute`."
  ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute
                   function
                   cudart/CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES))
646 |
(defn const-size
  "Queries the `CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES` attribute of the kernel `function`
  via `cuFuncGetAttribute`."
  ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute
                   function
                   cudart/CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES))
650 |
(defn local-size
  "Queries the `CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES` attribute of the kernel `function`
  via `cuFuncGetAttribute`."
  ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute
                   function
                   cudart/CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES))
654 |
(defn num-regs
  "Queries the `CU_FUNC_ATTRIBUTE_NUM_REGS` attribute of the kernel `function`
  via `cuFuncGetAttribute`."
  ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute
                   function
                   cudart/CU_FUNC_ATTRIBUTE_NUM_REGS))
658 |
(defn ptx-version
  "Queries the `CU_FUNC_ATTRIBUTE_PTX_VERSION` attribute of the kernel `function`
  via `cuFuncGetAttribute`."
  ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute
                   function
                   cudart/CU_FUNC_ATTRIBUTE_PTX_VERSION))
662 |
(defn binary-version
  "Queries the `CU_FUNC_ATTRIBUTE_BINARY_VERSION` attribute of the kernel `function`
  via `cuFuncGetAttribute`."
  ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute
                   function
                   cudart/CU_FUNC_ATTRIBUTE_BINARY_VERSION))
666 |
(defn cache-config*
  "Sets the preferred cache configuration of the device function `fun` to the integer
  `config`, and returns `fun`.

  See [cuFuncSetCacheConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html)
  "
  [fun ^long config]
  (with-check (cudart/cuFuncSetCacheConfig fun config)
    fun))
674 |
(defn cache-config!
  "Sets the preferred cache configuration for a device function `fun`, as a keyword `config`.

  Available configs are `:prefer-none`, `:prefer-shared`, `:prefer-L1`, and `:prefer-equal`.
  The keyword is translated through `func-cache-config`; an unknown keyword throws an
  `ex-info` whose data holds the offending `:config` and the `:available` mappings.

  See [cuFuncSetCacheConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html)
  "
  [fun config]
  (cache-config* fun (or (func-cache-config config)
                         (throw (ex-info "Invalid cache config."
                                         {:config config :available func-cache-config})))))
686 |
;; `Info` for kernel functions. The two-argument form picks the accessor matching the
;; keyword and applies it to `fun` (`nil` for an unknown keyword); the one-argument form
;; collects every attribute in a map. Each query is wrapped in `maybe`.
(extend-type CUfunc_st
  Info
  (info
    ([fun info-type]
     (maybe
      (when-let [query (case info-type
                         :max-threads-per-block max-threads-per-block-fn
                         :shared-size shared-size
                         :const-size const-size
                         :local-size local-size
                         :num-regs num-regs
                         :ptx-version ptx-version
                         :binary-version binary-version
                         nil)]
        (query fun))))
    ([fun]
     {:max-threads-per-block (maybe (max-threads-per-block-fn fun))
      :shared-size (maybe (shared-size fun))
      :const-size (maybe (const-size fun))
      :local-size (maybe (local-size fun))
      :num-regs (maybe (num-regs fun))
      :ptx-version (maybe (ptx-version fun))
      :binary-version (maybe (binary-version fun))})))
709 |
--------------------------------------------------------------------------------
/src/clojure/uncomplicate/clojurecuda/internal/constants.clj:
--------------------------------------------------------------------------------
1 | ;; Copyright (c) Dragan Djuric. All rights reserved.
2 | ;; The use and distribution terms for this software are covered by the
3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
4 | ;; which can be found in the file LICENSE at the root of this distribution.
5 | ;; By using this software in any fashion, you are agreeing to be bound by
6 | ;; the terms of this license.
7 | ;; You must not remove this notice, or any other, from this software.
8 |
9 | (ns ^{:author "Dragan Djuric"}
10 | uncomplicate.clojurecuda.internal.constants
11 | "Defines constants and mappings from/to CUDA constants."
12 | (:import [org.bytedeco.cuda.global cudart nvrtc]))
13 |
14 | ;; ==================== Keyword mapping ======================================
15 |
(def ^:const ctx-flags
  "Available context flags defined in the CUDA standard."
  {:blocking-sync       cudart/CU_CTX_BLOCKING_SYNC
   :coredump            cudart/CU_CTX_COREDUMP_ENABLE
   :flags-mask          cudart/CU_CTX_FLAGS_MASK
   :lmem-resize-to-max  cudart/CU_CTX_LMEM_RESIZE_TO_MAX
   :map-host            cudart/CU_CTX_MAP_HOST
   :sched-auto          cudart/CU_CTX_SCHED_AUTO
   :sched-blocking-sync cudart/CU_CTX_SCHED_BLOCKING_SYNC
   :sched-mask          cudart/CU_CTX_SCHED_MASK
   :sched-spin          cudart/CU_CTX_SCHED_SPIN
   :sched-yield         cudart/CU_CTX_SCHED_YIELD
   :sync-memops         cudart/CU_CTX_SYNC_MEMOPS
   :user-coredump       cudart/CU_CTX_USER_COREDUMP_ENABLE})
31 |
(def ^:const ctx-limits
  "Available context limits."
  {:dev-runtime-pending-launch-count cudart/CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT
   :dev-runtime-sync-depth           cudart/CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH
   :malloc-heap-size                 cudart/CU_LIMIT_MALLOC_HEAP_SIZE
   :max                              cudart/CU_LIMIT_MAX
   :max-l2-fetch-granularity         cudart/CU_LIMIT_MAX_L2_FETCH_GRANULARITY
   :persisting-l2-cache-size         cudart/CU_LIMIT_PERSISTING_L2_CACHE_SIZE
   :printf-fifo-size                 cudart/CU_LIMIT_PRINTF_FIFO_SIZE
   :stack-size                       cudart/CU_LIMIT_STACK_SIZE})
43 |
(def ^:const shared-config-map
  "Available shared memory configurations."
  {:default-bank-size    cudart/CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE
   :four-byte-bank-size  cudart/CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE
   :eight-byte-bank-size cudart/CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE})
50 |
(defn dec-shared-config
  "Decodes the integer shared memory configuration `config` into its keyword.
  A value with no known keyword is returned unchanged."
  [^long config]
  (get {0 :default-bank-size
        1 :four-byte-bank-size
        2 :eight-byte-bank-size}
       config
       config))
57 |
(def ^:const p2p-attributes
  "Available device P2P attributes."
  {:access-access-supported     cudart/CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED
   :access-supported            cudart/CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED
   :cuda-array-access-supported cudart/CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED
   :native-atomic-supported     cudart/CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED
   :performance-rank            cudart/CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK})
66 |
(defn dec-compute-mode
  "Decodes the integer compute mode `mode` into its keyword.
  A value with no known keyword is returned unchanged."
  [^long mode]
  (get {0 :default
        1 :exclusive
        2 :prohibited
        3 :exclusive-process}
       mode
       mode))
74 |
(def ^:const mem-host-alloc-flags
  "Available flags for the [[core/mem-host-alloc]] function."
  {:portable      cudart/CU_MEMHOSTALLOC_PORTABLE
   :devicemap     cudart/CU_MEMHOSTALLOC_DEVICEMAP
   :writecombined cudart/CU_MEMHOSTALLOC_WRITECOMBINED})
81 |
(def ^{:const true
       :doc "Available flags for the [[core/mem-host-register]] function.

  `:read-only` is the correctly spelled key for `CU_MEMHOSTREGISTER_READ_ONLY`;
  the misspelled `:read-onlyp` is kept so that existing callers keep working."}
  mem-host-register-flags
  {:devicemap cudart/CU_MEMHOSTREGISTER_DEVICEMAP
   :iomemory cudart/CU_MEMHOSTREGISTER_IOMEMORY
   :portable cudart/CU_MEMHOSTREGISTER_PORTABLE
   :read-only cudart/CU_MEMHOSTREGISTER_READ_ONLY
   :read-onlyp cudart/CU_MEMHOSTREGISTER_READ_ONLY})
89 |
(def ^:const mem-attach-flags
  "Available flags for the [[core/mem-host-attach]] function."
  {:global cudart/CU_MEM_ATTACH_GLOBAL
   :host   cudart/CU_MEM_ATTACH_HOST
   :single cudart/CU_MEM_ATTACH_SINGLE})
96 |
(def ^{:const true
       :doc "Available flags for the [[core/stream]] function."}
  stream-flags
  {:default cudart/CU_STREAM_DEFAULT
   :non-blocking cudart/CU_STREAM_NON_BLOCKING})
102 |
(defn dec-stream-flag
  "Decodes the integer stream flag `flag` into its keyword.
  A value with no known keyword is returned unchanged."
  [^long flag]
  (get {0 :default
        1 :non-blocking}
       flag
       flag))
108 |
(def ^:const event-flags
  "Available flags for the [[core/event]] function."
  {:blocking-sync  cudart/CU_EVENT_BLOCKING_SYNC
   :default        cudart/CU_EVENT_DEFAULT
   :disable-timing cudart/CU_EVENT_DISABLE_TIMING
   :interprocess   cudart/CU_EVENT_INTERPROCESS})
116 |
(def ^:const func-cache-config
  "Available config for the [[core/cache-config!]] function."
  {:prefer-none   cudart/CU_FUNC_CACHE_PREFER_NONE
   :prefer-shared cudart/CU_FUNC_CACHE_PREFER_SHARED
   :prefer-L1     cudart/CU_FUNC_CACHE_PREFER_L1
   :prefer-equal  cudart/CU_FUNC_CACHE_PREFER_EQUAL})
124 |
(defn dec-func-cache-config
  "Decodes the integer cache configuration `mode` into its keyword.
  A value with no known keyword is returned unchanged."
  [^long mode]
  (get {0 :prefer-none
        1 :prefer-shared
        2 :prefer-L1
        3 :prefer-equal}
       mode
       mode))
132 |
(def ^{:const true
       :doc "Available jit options defined in the CUDA standard.

  `:num-input-types` is the correctly spelled key for `CU_JIT_NUM_INPUT_TYPES`;
  the misspelled `:num-input-tupes` is kept so that existing callers keep working."}
  jit-options
  {:cache-mode cudart/CU_JIT_CACHE_MODE
   :cache-option-ca cudart/CU_JIT_CACHE_OPTION_CA
   :cache-option-cg cudart/CU_JIT_CACHE_OPTION_CG
   :cache-option-none cudart/CU_JIT_CACHE_OPTION_NONE
   :error-log-buffer cudart/CU_JIT_ERROR_LOG_BUFFER
   :error-log-buffer-size-bytes cudart/CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
   :fallback-strategy cudart/CU_JIT_FALLBACK_STRATEGY
   :fast-compile-strategy cudart/CU_JIT_FAST_COMPILE
   :fma cudart/CU_JIT_FMA
   :ftz cudart/CU_JIT_FTZ
   :generate-debug-info cudart/CU_JIT_GENERATE_DEBUG_INFO
   :generate-line-info cudart/CU_JIT_GENERATE_LINE_INFO
   :global-symbol-addresses cudart/CU_JIT_GLOBAL_SYMBOL_ADDRESSES
   :global-symbol-count cudart/CU_JIT_GLOBAL_SYMBOL_COUNT
   :global-symbol-names cudart/CU_JIT_GLOBAL_SYMBOL_NAMES
   :info-log-buffer cudart/CU_JIT_INFO_LOG_BUFFER
   :info-log-buffer-size-bytes cudart/CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
   :input-cubin cudart/CU_JIT_INPUT_CUBIN
   :input-fatbinary cudart/CU_JIT_INPUT_FATBINARY
   :input-library cudart/CU_JIT_INPUT_LIBRARY
   :input-nvvm cudart/CU_JIT_INPUT_NVVM
   :input-object cudart/CU_JIT_INPUT_OBJECT
   :input-ptx cudart/CU_JIT_INPUT_PTX
   :log-verbose cudart/CU_JIT_LOG_VERBOSE
   :lto cudart/CU_JIT_LTO
   :max-registers cudart/CU_JIT_MAX_REGISTERS
   :new-sm3x-opt cudart/CU_JIT_NEW_SM3X_OPT
   :num-input-types cudart/CU_JIT_NUM_INPUT_TYPES
   :num-input-tupes cudart/CU_JIT_NUM_INPUT_TYPES
   :num-options cudart/CU_JIT_NUM_OPTIONS
   :optimization-level cudart/CU_JIT_OPTIMIZATION_LEVEL
   :optimize-unused-device-variables cudart/CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES
   :position-independent-code cudart/CU_JIT_POSITION_INDEPENDENT_CODE
   :prec-div cudart/CU_JIT_PREC_DIV
   :prec-sqrt cudart/CU_JIT_PREC_SQRT
   :referenced-kernel-count cudart/CU_JIT_REFERENCED_KERNEL_COUNT
   :referenced-kernel-names cudart/CU_JIT_REFERENCED_KERNEL_NAMES
   :referenced-variable-count cudart/CU_JIT_REFERENCED_VARIABLE_COUNT
   :referenced-variable-names cudart/CU_JIT_REFERENCED_VARIABLE_NAMES
   :target cudart/CU_JIT_TARGET
   :target-from-cucontext cudart/CU_JIT_TARGET_FROM_CUCONTEXT
   :threads-per-block cudart/CU_JIT_THREADS_PER_BLOCK
   :wall-time cudart/CU_JIT_WALL_TIME})
178 |
(def ^:const jit-input-types
  "Available jit input types defined in the CUDA standard."
  {:cubin     cudart/CU_JIT_INPUT_CUBIN
   :ptx       cudart/CU_JIT_INPUT_PTX
   :fatbinary cudart/CU_JIT_INPUT_FATBINARY
   :object    cudart/CU_JIT_INPUT_OBJECT
   :library   cudart/CU_JIT_INPUT_LIBRARY
   :nvvm      cudart/CU_JIT_INPUT_NVVM
   :num       cudart/CU_JIT_NUM_INPUT_TYPES})
189 |
190 | (def ^{:const true
191 |        :doc "Keyword names for CUDA status codes (`CUDA_ERROR_*` and `cudaError*` constants), keyed by the numeric code."}
192 |   cu-result-codes
193 |   {cudart/CUDA_SUCCESS :success
194 |    cudart/CUDA_ERROR_ALREADY_ACQUIRED :already-acquired
195 |    cudart/CUDA_ERROR_ALREADY_MAPPED :already-mapped
196 |    cudart/CUDA_ERROR_ARRAY_IS_MAPPED :array-is-mapped
197 |    cudart/CUDA_ERROR_ASSERT :assert
198 |    cudart/CUDA_ERROR_CAPTURED_EVENT :captured-event
199 |    cudart/CUDA_ERROR_CDP_NOT_SUPPORTED :cdp-not-supported
200 |    cudart/CUDA_ERROR_CDP_VERSION_MISMATCH :cdp-version-mismatch
201 |    cudart/CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE :compat-not-supported-on-device
202 |    cudart/CUDA_ERROR_CONTEXT_ALREADY_CURRENT :context-already-current
203 |    cudart/CUDA_ERROR_CONTEXT_ALREADY_IN_USE :context-already-in-use
204 |    cudart/CUDA_ERROR_CONTEXT_IS_DESTROYED :context-is-destroyed
205 |    cudart/CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE :cooperative-launch-too-large
206 |    cudart/CUDA_ERROR_DEINITIALIZED :deinitialized
207 |    cudart/CUDA_ERROR_DEVICE_NOT_LICENSED :device-not-licensed
208 |    cudart/CUDA_ERROR_DEVICE_UNAVAILABLE :unavailable
209 |    cudart/CUDA_ERROR_ECC_UNCORRECTABLE :ecc-uncorrectable
210 |    cudart/CUDA_ERROR_EXTERNAL_DEVICE :external-device
211 |    cudart/CUDA_ERROR_FILE_NOT_FOUND :file-not-found
212 |    cudart/CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE :graph-exec-update-failure
213 |    cudart/CUDA_ERROR_HARDWARE_STACK_ERROR :hardware-stack-error
214 |    cudart/CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED :host-memory-already-registered
215 |    cudart/CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED :host-memory-not-registered
216 |    cudart/CUDA_ERROR_ILLEGAL_ADDRESS :illegal-address
217 |    cudart/CUDA_ERROR_ILLEGAL_INSTRUCTION :illegal-instruction
218 |    cudart/CUDA_ERROR_ILLEGAL_STATE :illegal-state
219 |    cudart/CUDA_ERROR_INVALID_ADDRESS_SPACE :invalid-address-space
220 |    cudart/CUDA_ERROR_INVALID_CLUSTER_SIZE :invalid-cluster-size
221 |    cudart/CUDA_ERROR_INVALID_CONTEXT :invalid-context
222 |    cudart/CUDA_ERROR_INVALID_DEVICE :invalid-device
223 |    cudart/CUDA_ERROR_INVALID_GRAPHICS_CONTEXT :invalid-graphics-context
224 |    cudart/CUDA_ERROR_INVALID_HANDLE :invalid-handle
225 |    cudart/CUDA_ERROR_INVALID_IMAGE :invalid-image
226 |    cudart/CUDA_ERROR_INVALID_PC :invalid-pc
227 |    cudart/CUDA_ERROR_INVALID_PTX :invalid-ptx
228 |    cudart/CUDA_ERROR_INVALID_SOURCE :invalid-source
229 |    cudart/CUDA_ERROR_INVALID_VALUE :invalid-value
230 |    cudart/CUDA_ERROR_JIT_COMPILATION_DISABLED :jit-compilation-disabled
231 |    cudart/CUDA_ERROR_JIT_COMPILER_NOT_FOUND :jit-compiler-not-found
232 |    cudart/CUDA_ERROR_LAUNCH_FAILED :launch-failed
233 |    cudart/CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING :launch-incompatible-texturing
234 |    cudart/CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES :launch-out-of-resources
235 |    cudart/CUDA_ERROR_LAUNCH_TIMEOUT :launch-timeout
236 |    cudart/CUDA_ERROR_MAP_FAILED :map-failed
237 |    cudart/CUDA_ERROR_MISALIGNED_ADDRESS :misaligned-address
238 |    cudart/CUDA_ERROR_MPS_CLIENT_TERMINATED :client-terminated
239 |    cudart/CUDA_ERROR_MPS_CONNECTION_FAILED :connection-failed
240 |    cudart/CUDA_ERROR_MPS_MAX_CLIENTS_REACHED :clients-reached
241 |    cudart/CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED :connection-reached
242 |    cudart/CUDA_ERROR_MPS_RPC_FAILURE :rpc-failure
243 |    cudart/CUDA_ERROR_MPS_SERVER_NOT_READY :server-not-ready
244 |    cudart/CUDA_ERROR_NO_BINARY_FOR_GPU :no-binary-for-gpu
245 |    cudart/CUDA_ERROR_NO_DEVICE :no-device
246 |    cudart/CUDA_ERROR_NOT_FOUND :not-found
247 |    cudart/CUDA_ERROR_NOT_INITIALIZED :not-initialized
248 |    cudart/CUDA_ERROR_NOT_MAPPED :not-mapped
249 |    cudart/CUDA_ERROR_NOT_MAPPED_AS_ARRAY :not-mapped-as-array
250 |    cudart/CUDA_ERROR_NOT_MAPPED_AS_POINTER :not-mapped-as-pointer
251 |    cudart/CUDA_ERROR_NOT_READY :not-ready
252 |    cudart/CUDA_ERROR_NOT_SUPPORTED :not-supported
253 |    cudart/CUDA_ERROR_NVLINK_UNCORRECTABLE :nvlink-uncorrectable
254 |    cudart/CUDA_ERROR_OPERATING_SYSTEM :operating-system
255 |    cudart/CUDA_ERROR_OUT_OF_MEMORY :out-of-memory
256 |    cudart/CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED :already-enabled
257 |    cudart/CUDA_ERROR_PEER_ACCESS_NOT_ENABLED :access-not-enabled
258 |    cudart/CUDA_ERROR_PEER_ACCESS_UNSUPPORTED :access-unsupported
259 |    cudart/CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE :context-active
260 |    cudart/CUDA_ERROR_PROFILER_ALREADY_STARTED :profiler-already-started
261 |    cudart/CUDA_ERROR_PROFILER_ALREADY_STOPPED :profiler-already-stopped
262 |    cudart/CUDA_ERROR_PROFILER_DISABLED :profiler-disabled
263 |    cudart/CUDA_ERROR_PROFILER_NOT_INITIALIZED :profiler-not-initialized
264 |    cudart/CUDA_ERROR_SHARED_OBJECT_INIT_FAILED :shared-object-init-failed
265 |    cudart/CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND :shared-object-symbol-not-found
266 |    cudart/CUDA_ERROR_STREAM_CAPTURE_IMPLICIT :stream-capture-implicit
267 |    cudart/CUDA_ERROR_STREAM_CAPTURE_INVALIDATED :stream-capture-invalidated
268 |    cudart/CUDA_ERROR_STREAM_CAPTURE_ISOLATION :stream-capture-isolation
269 |    cudart/CUDA_ERROR_STREAM_CAPTURE_MERGE :stream-capture-merge
270 |    cudart/CUDA_ERROR_STREAM_CAPTURE_UNJOINED :stream-capture-unjoined
271 |    cudart/CUDA_ERROR_STREAM_CAPTURE_UNMATCHED :stream-capture-unmatched
272 |    cudart/CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED :stream-capture-unsupported
273 |    cudart/CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD :stream-capture-wrong-thread
274 |    cudart/CUDA_ERROR_STUB_LIBRARY :stub-library
275 |    cudart/CUDA_ERROR_SYSTEM_DRIVER_MISMATCH :driver-mismatch
276 |    cudart/CUDA_ERROR_SYSTEM_NOT_READY :system-not-ready
277 |    cudart/CUDA_ERROR_TIMEOUT :timeout
278 |    cudart/CUDA_ERROR_TOO_MANY_PEERS :too-many-peers
279 |    cudart/CUDA_ERROR_UNKNOWN :unknown
280 |    cudart/CUDA_ERROR_UNMAP_FAILED :unmap-failed
281 |    cudart/CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC :devside-sync
282 |    cudart/CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY :exec-affinity
283 |    cudart/CUDA_ERROR_UNSUPPORTED_LIMIT :unsupported-limit
284 |    cudart/CUDA_ERROR_UNSUPPORTED_PTX_VERSION :unsupported-ptx-version
285 |    cudart/cudaErrorAddressOfConstant :address-of-constant
286 |    cudart/cudaErrorApiFailureBase :failure-base
287 |    cudart/cudaErrorCallRequiresNewerDriver :call-requires-newer-driver
288 |    cudart/cudaErrorDuplicateSurfaceName :duplicate-surface-name
289 |    cudart/cudaErrorDuplicateTextureName :duplicate-texture-name
290 |    cudart/cudaErrorDuplicateVariableName :duplicate-variable-name
291 |    cudart/cudaErrorIncompatibleDriverContext :incompatible-context
292 |    cudart/cudaErrorInsufficientDriver :insufficient-driver
293 |    cudart/cudaErrorInvalidChannelDescriptor :invalid-channel-descriptor
294 |    cudart/cudaErrorInvalidConfiguration :invalid-configuration
295 |    cudart/cudaErrorInvalidDeviceFunction :invalid-device-function
296 |    cudart/cudaErrorInvalidDevicePointer :invalid-device-pointer
297 |    cudart/cudaErrorInvalidFilterSetting :invalid-filter-setting
298 |    cudart/cudaErrorInvalidHostPointer :invalid-host-pointer
299 |    cudart/cudaErrorInvalidMemcpyDirection :invalid-memcpy-direction
300 |    cudart/cudaErrorInvalidNormSetting :invalid-norm-setting
301 |    cudart/cudaErrorInvalidPitchValue :invalid-pitch-value
302 |    cudart/cudaErrorInvalidSurface :invalid-surface
303 |    cudart/cudaErrorInvalidSymbol :invalid-symbol
304 |    cudart/cudaErrorInvalidTexture :invalid-texture
305 |    cudart/cudaErrorInvalidTextureBinding :invalid-texture-binding
306 |    cudart/cudaErrorLaunchFileScopedSurf :launch-file-scoped-surf
307 |    cudart/cudaErrorLaunchFileScopedTex :launch-file-scoped-tex
308 |    cudart/cudaErrorLaunchMaxDepthExceeded :max-depth-exceeded
309 |    cudart/cudaErrorLaunchPendingCountExceeded :launch-pending-count-exceeded
310 |    cudart/cudaErrorMemoryValueTooLarge :memory-value-too-large
311 |    cudart/cudaErrorMissingConfiguration :missing-configuration
312 |    cudart/cudaErrorMixedDeviceExecution :mixed-device-execution
313 |    cudart/cudaErrorNotPermitted :not-permitted
314 |    cudart/cudaErrorNotYetImplemented :not-yet-implemented
315 |    cudart/cudaErrorPriorLaunchFailure :prior-launch-failure
316 |    cudart/cudaErrorSoftwareValidityNotEstablished :software-validity-not-established
317 |    cudart/cudaErrorStartupFailure :startup-failure
318 |    cudart/cudaErrorSyncDepthExceeded :sync-depth-exceeded
319 |    cudart/cudaErrorSynchronizationError :synchronization-error
320 |    cudart/cudaErrorTextureFetchFailed :texture-fetch-failed
321 |    cudart/cudaErrorTextureNotBound :texture-not-bound})
322 |
323 | (def ^{:const true
324 |        :doc "Keyword names for the `NVRTC_*` result codes returned by nvrtc calls, keyed by the numeric code."}
325 |   nvrtc-result-codes
326 |   {nvrtc/NVRTC_SUCCESS :success
327 |    nvrtc/NVRTC_ERROR_BUILTIN_OPERATION_FAILURE :builtin-operation-failure
328 |    nvrtc/NVRTC_ERROR_COMPILATION :compilation
329 |    nvrtc/NVRTC_ERROR_INVALID_INPUT :invalid-input
330 |    nvrtc/NVRTC_ERROR_INTERNAL_ERROR :internal-error
331 |    nvrtc/NVRTC_ERROR_INVALID_OPTION :invalid-option
332 |    nvrtc/NVRTC_ERROR_INVALID_PROGRAM :invalid-program
333 |    nvrtc/NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID :name-expression-not-valid
334 |    nvrtc/NVRTC_ERROR_OUT_OF_MEMORY :out-of-memory
335 |    nvrtc/NVRTC_ERROR_PROGRAM_CREATION_FAILURE :program-creation-failure
336 |    nvrtc/NVRTC_ERROR_TIME_FILE_WRITE_FAILED :time-file-write-failed})
337 |
--------------------------------------------------------------------------------
/src/clojure/uncomplicate/clojurecuda/internal/impl.clj:
--------------------------------------------------------------------------------
1 | ;; Copyright (c) Dragan Djuric. All rights reserved.
2 | ;; The use and distribution terms for this software are covered by the
3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
4 | ;; which can be found in the file LICENSE at the root of this distribution.
5 | ;; By using this software in any fashion, you are agreeing to be bound by
6 | ;; the terms of this license.
7 | ;; You must not remove this notice, or any other, from this software.
8 |
9 | (ns ^{:author "Dragan Djuric"}
10 | uncomplicate.clojurecuda.internal.impl
11 | (:require [uncomplicate.commons
12 | [core :refer [with-release let-release Releaseable release info Bytes bytesize Entries
13 | size* size]]
14 | [utils :as cu :refer [dragan-says-ex]]]
15 | [uncomplicate.fluokitten.protocols :refer [Comonad extract]]
16 | [uncomplicate.clojure-cpp :as cpp
17 | :refer [put-entry! pointer safe int-pointer pointer-pointer byte-pointer size-t-pointer
18 | get-entry get-string null? long-pointer PointerCreator TypedPointerCreator
19 | clong-pointer short-pointer char-pointer double-pointer float-pointer pointer-seq
20 | capacity! address Accessor get! put! get-keyword]]
21 | [uncomplicate.clojurecuda.internal
22 | [constants :refer [cu-result-codes jit-input-types jit-options nvrtc-result-codes]]
23 | [utils :refer [with-check]]]
24 | [clojure.core.async :refer [go >!]])
25 | (:import java.nio.file.Path
26 | java.io.File
27 | [clojure.lang IFn AFn Seqable]
28 | [org.bytedeco.javacpp Pointer BytePointer PointerPointer LongPointer IntPointer]
29 | [org.bytedeco.cuda.global cudart nvrtc]
30 | [org.bytedeco.cuda.cudart CUctx_st CUstream_st CUevent_st CUmod_st CUlinkState_st]
31 | org.bytedeco.cuda.nvrtc._nvrtcProgram
32 | [uncomplicate.clojure_cpp StringPointer KeywordPointer]
33 | [uncomplicate.clojurecuda.internal.javacpp CUHostFn CUStreamCallback]))
34 |
35 | (defprotocol CUPointer ; Memory handles that expose a raw address usable in CUDA calls.
36 | (cu-address* [this]) ; Returns the raw numeric address; the implementations in this file read it from the wrapped pointer.
37 | (device? [this])) ; true for CUDevicePtr and CURuntimePtr, false for the pinned/mapped host pointer types.
38 |
39 | (defprotocol Parameter ; Values that can be stored into a slot of a pointer array (presumably kernel launch arguments; confirm against callers).
40 | (set-parameter* [this pp i])) ; Writes the representation of `this` into entry `i` of `pp`.
41 |
42 | (extend-type Object
43 |   Parameter
44 |   (set-parameter* [value pp i] ; fallback: wrap any object with `pointer` and store it in slot `i`
45 |     (->> value pointer (put-entry! pp i))))
46 |
47 | ;; ==================== Release resources =======================
48 |
49 | (deftype CUDevice [^int dev] ; value wrapper around the integer device handle `dev`
50 |   Object
51 |   (hashCode [this]
52 |     dev)
53 |   (equals [this other] ; equal only to another CUDevice holding the same handle
54 |     (if (instance? CUDevice other) (= dev (.dev ^CUDevice other)) false))
55 |   (toString [this]
56 |     (format "#Device[:cuda, %d]" dev))
57 |   Comonad
58 |   (extract [this] ; unwraps the raw handle
59 |     dev))
60 |
61 | (extend-type CUctx_st
62 |   Releaseable
63 |   (release [ctx] ; destroys the context, then frees and nulls the JavaCPP handle; always returns true
64 |     (locking ctx
65 |       (cudart/cuCtxDestroy ctx) ; the returned status code is not checked
66 |       (doto ctx
67 |         (.deallocate) (.setNull))
68 |       true)))
69 |
70 | (extend-type CUstream_st
71 |   Releaseable
72 |   (release [stream] ; destroys the stream, then frees and nulls the JavaCPP handle; always returns true
73 |     (locking stream
74 |       (cudart/cuStreamDestroy stream) ; the returned status code is not checked
75 |       (doto stream
76 |         (.deallocate) (.setNull))
77 |       true)))
78 |
79 | (extend-type CUevent_st
80 |   Releaseable
81 |   (release [event] ; destroys the event, then frees and nulls the JavaCPP handle; always returns true
82 |     (locking event
83 |       (cudart/cuEventDestroy event) ; the returned status code is not checked
84 |       (doto event
85 |         (.deallocate) (.setNull))
86 |       true)))
87 |
88 | (extend-type CUmod_st
89 |   Releaseable
90 |   (release [module] ; unloads the module, then frees and nulls the JavaCPP handle; always returns true
91 |     (locking module
92 |       (cudart/cuModuleUnload module) ; the returned status code is not checked
93 |       (doto module
94 |         (.deallocate) (.setNull))
95 |       true)))
96 |
97 | (extend-type CUlinkState_st
98 |   Releaseable
99 |   (release [link-state] ; destroys the link state, then frees and nulls the JavaCPP handle; always returns true
100 |     (locking link-state
101 |       (cudart/cuLinkDestroy link-state) ; the returned status code is not checked
102 |       (doto link-state
103 |         (.deallocate) (.setNull))
104 |       true)))
105 |
106 | (extend-type _nvrtcProgram
107 |   Releaseable
108 |   (release [program] ; destroys the nvrtc program, then frees and nulls the JavaCPP handle; always returns true
109 |     (locking program
110 |       (nvrtc/nvrtcDestroyProgram program) ; the returned status code is not checked
111 |       (doto program
112 |         (.deallocate) (.setNull))
113 |       true)))
114 |
115 | ;; ================== Module Management =====================================
116 |
117 | (defprotocol ModuleLoad ; Sources of module code: strings, pointers, paths, files and compiled nvrtc programs implement it below.
118 | (module-load* [data m]) ; Loads `data` into the module handle `m` and returns `m`.
119 | (link-add* [data link-state type opts vals])) ; Adds `data` as a linker input of kind `type` to `link-state`, with encoded options `opts`/`vals`.
120 |
121 | (defn enc-jit-options [options] ; lazily turns each [keyword value] entry into a [jit-option-code pointer] pair
122 |   (for [[option value] options]
123 |     [(if-let [code (jit-options option)]
124 |        code
125 |        (throw (ex-info "Unknown jit option." {:option option :available jit-options})))
126 |      (safe (pointer value))]))
127 |
128 | (defn check-options [^IntPointer options ^Pointer option-values] ; throws unless both pointers report the same `size`; otherwise returns nil
129 |   (let [requested (size options) provided (size option-values)]
130 |     (when (not= requested provided)
131 |       (throw (ex-info "Inconsistent number of options provided." {:requested requested :provided provided})))))
132 |
133 | (defn link-add-data* [^CUlinkState_st link-state type ^Pointer data ^String name
134 |                       ^IntPointer options ^Pointer option-values] ; adds in-memory `data` of kind `type` to `link-state`; returns `link-state`
135 |   (if-let [input-type (jit-input-types type)]
136 |     (do (check-options options option-values)
137 |         (with-check (cudart/cuLinkAddData link-state (int input-type) data (bytesize data) name
138 |                                           (size options) options option-values)
139 |           {:data data} link-state))
140 |     (throw (ex-info "Invalid jit input type."
141 |                     {:type type :available jit-input-types}))))
142 |
143 | (defn link-add-file* [^CUlinkState_st link-state type ^String file-name
144 |                       ^IntPointer options ^Pointer option-values] ; adds the file `file-name` of kind `type` to `link-state`; returns `link-state`
145 |   (if-let [input-type (jit-input-types type)]
146 |     (do (check-options options option-values)
147 |         (with-check (cudart/cuLinkAddFile link-state (int input-type) file-name
148 |                                           (size options) options option-values)
149 |           {:file file-name} link-state))
150 |     (throw (ex-info "Invalid jit input type."
151 |                     {:type type :available jit-input-types}))))
152 |
153 | (defn link* ; creates the link state with the encoded `options`, then adds every `[type data _ name]` input; returns `link-state`
154 |   [^CUlinkState_st link-state data options]
155 |   (let [encoded (enc-jit-options options)] ; seq of [code pointer] pairs: split it by column, not by its first two rows
156 |     (let-release [opts (int-pointer (not-empty (mapv first encoded))) ; nil when no options were given
157 |                   vals (pointer-pointer (not-empty (mapv second encoded)))]
158 |       (with-check (cudart/cuLinkCreate (size opts) opts ^PointerPointer vals link-state)
159 |         (doseq [[type d _ name] data] ; the third slot is ignored; link-wide opts/vals are used for each input
160 |           (if name
161 |             (link-add-data* link-state type d name opts vals)
162 |             (link-add* d link-state type opts vals))))))
163 |   link-state)
164 |
165 | (extend-type String
166 |   ModuleLoad
167 |   (module-load* [source module] ; loads module code held in a string, via a byte pointer created from it
168 |     (with-check (cudart/cuModuleLoadData ^CUmod_st module (byte-pointer source)) {:data source} module))
169 |   (link-add* [source link-state type opts vals] ; string inputs are added under the fixed name "unnamed"
170 |     (link-add-data* link-state type (byte-pointer source) "unnamed" opts vals)))
171 |
172 | (extend-type Pointer
173 |   ModuleLoad
174 |   (module-load* [image module] ; loads module code directly from a JavaCPP pointer
175 |     (with-check (cudart/cuModuleLoadData module image)
176 |       {:data image} module))
177 |   (link-add* [image link-state type opts vals] ; pointer inputs are added under the fixed name "unnamed"
178 |     (link-add-data* link-state type image "unnamed" opts vals)))
179 |
180 | (extend-type Path
181 |   ModuleLoad
182 |   (module-load* [file-path module] ; loads the module from the file named by the path's string form
183 |     (let [file-name (str file-path)]
184 |       (with-check (cudart/cuModuleLoad ^CUmod_st module file-name) {:file file-name} module)))
185 |   (link-add* [file-path link-state type opts vals]
186 |     (link-add-file* link-state type (str file-path) opts vals)))
187 |
188 | (extend-type File
189 |   ModuleLoad
190 |   (module-load* [source-file module] ; loads the module from the file named by `(str source-file)`
191 |     (with-check (cudart/cuModuleLoad ^CUmod_st module (str source-file)) {:file (str source-file)} module))
192 |   (link-add* [source-file link-state type opts vals]
193 |     (link-add-file* link-state type (str source-file) opts vals)))
194 |
195 | ;; ====================== Nvrtc program JIT ========================================
196 |
197 | (defn ^:private nvrtc-error
198 |   "Builds an `ExceptionInfo` for an nvrtc error code; unknown codes are reported by their number."
199 |   ([err-code]
200 |    (nvrtc-error err-code nil))
201 |   ([^long err-code details]
202 |    (let [err-name (nvrtc-result-codes err-code err-code)]
203 |      (ex-info (format "NVRTC error: %s." err-name)
204 |               {:name err-name :code err-code :type :nvrtc-error :details details}))))
205 |
206 | (defmacro ^:private with-check-nvrtc
207 | "Evaluates `form` if `err-code` is zero (`:success`), otherwise throws
208 | an appropriate `ExceptionInfo` with decoded informative details.
209 | It helps with CUDA nvrtc methods that return error codes directly, while
210 | returning computation results through mutating arguments.
211 | "
212 | ([err-code form]
213 | `(cu/with-check nvrtc-error ~err-code ~form))) ; delegates to the generic `cu/with-check`, passing `nvrtc-error` as the exception builder
214 |
215 | (defn program* ; creates an nvrtc program from `source-code` plus headers; the handle is released if creation throws
216 |   [^BytePointer name ^BytePointer source-code
217 |    ^PointerPointer source-headers ^PointerPointer include-names]
218 |   (let-release [prog (_nvrtcProgram.)]
219 |     (with-check-nvrtc
220 |       (nvrtc/nvrtcCreateProgram prog source-code name
221 |                                 (size source-headers) source-headers include-names)
222 |       prog)))
223 |
224 | (defn program-log*
225 |   "Fetches, as a string, the log that nvrtc produced for the last compilation of `program`."
226 |   [^_nvrtcProgram program]
227 |   (with-release [len (size-t-pointer 1)]
228 |     (with-check-nvrtc (nvrtc/nvrtcGetProgramLogSize program len)
229 |       (with-release [buf (byte-pointer (get-entry len 0))]
230 |         (with-check-nvrtc (nvrtc/nvrtcGetProgramLog program buf) (get-string buf))))))
231 |
232 | (defn compile*
233 |   "Compiles `program` with the string `options`; returns `program`, or throws with the program log as details."
234 |   ([^_nvrtcProgram program ^PointerPointer options]
235 |    (let [status (nvrtc/nvrtcCompileProgram program (size options) options)]
236 |      (when-not (= 0 status)
237 |        (throw (nvrtc-error status (program-log* program))))
238 |      program)))
239 |
240 | (defn ptx*
241 |   "Returns a byte pointer holding the PTX produced by the last compilation of `program`."
242 |   [^_nvrtcProgram program]
243 |   (with-release [len (size-t-pointer 1)]
244 |     (with-check-nvrtc (nvrtc/nvrtcGetPTXSize program len)
245 |       (let-release [image (byte-pointer (get-entry len 0))]
246 |         (with-check-nvrtc (nvrtc/nvrtcGetPTX program image)
247 |           image)))))
248 |
249 | (extend-type _nvrtcProgram
250 |   ModuleLoad
251 |   (module-load* [prog module] ; loads the PTX of an already compiled program into `module`
252 |     (with-check (cudart/cuModuleLoadData ^CUmod_st module (ptx* prog)) module))
253 |   (link-add* [prog link-state type opts vals] ; the program's PTX is added under the fixed name "unnamed"
254 |     (link-add-data* link-state type (ptx* prog) "unnamed" opts vals)))
255 |
256 | ;; =================== Context Management ==================================
257 |
258 | (defn context*
259 |   "Creates a new CUDA context for device `dev`, passing the raw integer `flags` to `cuCtxCreate`.
260 |   The supported flag values are listed in [[constants/ctx-flags]].
261 |   "
262 |   [^long dev ^long flags]
263 |   (let [ctx (CUctx_st.)]
264 |     (with-check (cudart/cuCtxCreate ctx flags dev)
265 |       {:dev (info dev) :flags flags}
266 |       ctx)))
267 |
268 | (defn current-context*
269 |   "With `ctx`, binds it as the current context and returns it; without arguments, returns the current context."
270 |   ([^CUctx_st ctx]
271 |    (with-check (cudart/cuCtxSetCurrent ctx) ctx))
272 |   ([]
273 |    (let [current (CUctx_st.)]
274 |      (with-check (cudart/cuCtxGetCurrent current) current))))
275 |
276 | ;; ==================== Linear memory ================================================
277 |
278 | (defprotocol MemSet ; Implemented by boxed numbers: fills memory at `dptr` with the value `this`.
279 | (memset* [this dptr n] [this dptr n hstream])) ; `n` is the element count; the 4-arg arity calls the Async driver function with `hstream`. Returns `dptr`.
280 |
281 | (extend-type Byte
282 |   MemSet
283 |   (memset*
284 |     ([value dst n] ; 8-bit fill of `n` elements; returns `dst`
285 |      (with-check (cudart/cuMemsetD8 dst value n) dst))
286 |     ([value dst n stream] ; same fill through the Async call on `stream`
287 |      (with-check (cudart/cuMemsetD8Async dst value n stream) dst))))
288 |
289 | (extend-type Short
290 |   MemSet
291 |   (memset*
292 |     ([value dst n] ; 16-bit fill of `n` elements; returns `dst`
293 |      (with-check (cudart/cuMemsetD16 dst value n) dst))
294 |     ([value dst n stream] ; same fill through the Async call on `stream`
295 |      (with-check (cudart/cuMemsetD16Async dst value n stream) dst))))
296 |
297 | (extend-type Integer
298 |   MemSet
299 |   (memset*
300 |     ([value dst n] ; 32-bit fill of `n` elements; returns `dst`
301 |      (with-check (cudart/cuMemsetD32 dst value n) dst))
302 |     ([value dst n stream] ; same fill through the Async call on `stream`
303 |      (with-check (cudart/cuMemsetD32Async dst value n stream) dst))))
304 |
305 | (extend-type Float
306 |   MemSet
307 |   (memset*
308 |     ([value dst n] ; the float is written as its 32-bit pattern through the D32 fill
309 |      (with-check (cudart/cuMemsetD32 dst (Float/floatToIntBits value) n) dst))
310 |     ([value dst n stream] ; same fill through the Async call on `stream`
311 |      (with-check (cudart/cuMemsetD32Async dst (Float/floatToIntBits value) n stream) dst))))
312 |
313 | (extend-type Double
314 |   MemSet
315 |   (memset*
316 |     ([value dst n] ; only 0.0 is accepted; it is written as 2*n zeroed 32-bit words
317 |      (if-not (= 0.0 value)
318 |        (dragan-says-ex "Only zeroes are suported in double memset! function." {:value value})
319 |        (with-check (cudart/cuMemsetD32 dst (int 0) (* 2 (long n))) dst)))
320 |     ([value dst n stream] ; same rule, using the Async call on `stream`
321 |      (if-not (= 0.0 value)
322 |        (dragan-says-ex "Only zeroes are suported in double memset! function." {:value value})
323 |        (with-check (cudart/cuMemsetD32Async dst (int 0) (* 2 (long n)) stream) dst)))))
324 |
325 | (extend-type Long
326 |   MemSet
327 |   (memset*
328 |     ([value dst n] ; only 0 is accepted; it is written as 2*n zeroed 32-bit words
329 |      (if-not (= 0 value)
330 |        (dragan-says-ex "Only zeroes are suported in long memset! function." {:value value})
331 |        (with-check (cudart/cuMemsetD32 dst (int 0) (* 2 (long n))) dst)))
332 |     ([value dst n stream] ; same rule, using the Async call on `stream`
333 |      (if-not (= 0 value)
334 |        (dragan-says-ex "Only zeroes are suported in long memset! function." {:value value})
335 |        (with-check (cudart/cuMemsetD32Async dst (int 0) (* 2 (long n)) stream) dst)))))
336 |
337 | (defprotocol Memcpy
338 | "An object that represents memory that participates in CUDA operations.
339 | It can be on the device, or on the host. Built-in implementations:
340 | CUDA pointers, JavaCPP pointers, Java primitive arrays, etc.
341 | "
342 | (memcpy-host* [dst src size] [dst src size hstream]) ; copies `size` bytes between host and device memory into `dst`; the 4-arg arity uses the Async driver call with `hstream`
343 | (memcpy* [dst src size] [dst src size hstream])) ; copies `size` bytes from `src` into `dst`, where `src` exposes a CUDA address; returns `dst`
344 |
345 | (defn offset ^long [dptr ^long offset] ; address of `dptr` advanced by `offset` bytes; `offset` must lie in [0, bytesize]
346 |   (if-not (<= 0 offset (bytesize dptr))
347 |     (dragan-says-ex "Requested bytes are out of the bounds of this device pointer."
348 |                     {:offset offset :size (bytesize dptr)})
349 |     (+ (long (cu-address* dptr)) offset)))
350 |
351 | (deftype CUDevicePtr [^LongPointer daddr ^long byte-size master] ; Memory handle: entry 0 of `daddr` holds the address, `byte-size` its length; `master` decides whether release frees the memory.
352 | Object
353 | (hashCode [_]
354 | (hash-combine (hash daddr) byte-size)) ; NOTE(review): hashes the holder pointer and the size, while `equals` compares only the stored address; confirm equal objects hash alike.
355 | (equals [_ y]
356 | (and (instance? CUDevicePtr y) (= (get-entry daddr 0) (cu-address* y)))) ; equal when both hold the same address
357 | (toString [_]
358 | (format "#DevicePtr[:cuda, 0x%x, %d bytes]" (get-entry daddr 0) byte-size))
359 | Releaseable
360 | (release [_]
361 | (locking daddr
362 | (when-not (null? daddr) ; a null holder is skipped, so repeated release is harmless
363 | (when master ; only the owning handle frees the memory itself
364 | (with-check (cudart/cuMemFree (get-entry daddr 0)) true))
365 | (release daddr)) ; the holder pointer is released for master and non-master handles alike
366 | true))
367 | Comonad
368 | (extract [_]
369 | (extract daddr))
370 | CUPointer
371 | (cu-address* [_]
372 | (get-entry daddr 0)) ; the address is stored as the first entry of the holder
373 | (device? [_]
374 | true)
375 | Bytes
376 | (bytesize* [_]
377 | byte-size)
378 | Entries
379 | (size* [_]
380 | byte-size) ; entries are bytes, so the entry count equals the byte size
381 | (sizeof* [_]
382 | Byte/BYTES)
383 | Parameter
384 | (set-parameter* [_ pp i]
385 | (put-entry! pp i daddr)) ; stores the holder pointer itself in slot `i`
386 | Memcpy
387 | (memcpy-host* [this src byte-count] ; host `src` -> this memory
388 | (with-check
389 | (cudart/cuMemcpyHtoD (get-entry daddr 0) (safe (pointer src)) byte-count)
390 | this))
391 | (memcpy-host* [this src byte-count hstream] ; same copy through the Async call on `hstream`
392 | (with-check
393 | (cudart/cuMemcpyHtoDAsync (get-entry daddr 0) (safe (pointer src)) byte-count hstream)
394 | this))
395 | (memcpy* [this src byte-count] ; copy from the CUDA address of `src` into this memory
396 | (with-check
397 | (cudart/cuMemcpy (get-entry daddr 0) (cu-address* src) byte-count)
398 | this))
399 | (memcpy* [this src byte-count hstream] ; same copy through the Async call on `hstream`
400 | (with-check
401 | (cudart/cuMemcpyAsync (get-entry daddr 0) (cu-address* src) byte-count hstream)
402 | this)))
403 |
404 | (defn mem-alloc-managed* ; allocates `size` bytes with `cuMemAllocManaged` and wraps the address in an owning CUDevicePtr
405 |   ([^long size ^long flag]
406 |    (let-release [holder (long-pointer 1)]
407 |      (with-check (cudart/cuMemAllocManaged holder size flag)
408 |        (->CUDevicePtr holder size true)))))
409 |
410 | ;; =================== Runtime Memory ===============================================
411 |
412 | (defn cupointer-memcpy* ; copies `byte-count` bytes from `src` into `dst` and returns `dst`
413 |   ([dst src ^long byte-count]
414 |    (with-check
415 |      (if-not (instance? Pointer src) ; non-JavaCPP sources are addressed through `cu-address*`
416 |        (cudart/cuMemcpy (cu-address* dst) (cu-address* src) byte-count)
417 |        (cudart/cudaMemcpy (safe (pointer dst)) (extract src) byte-count cudart/cudaMemcpyDefault))
418 |      dst))
419 |   ([dst src ^long byte-count hstream]
420 |    (with-check
421 |      (if-not (instance? Pointer src)
422 |        (cudart/cuMemcpyAsync (cu-address* dst) (cu-address* src) byte-count hstream)
423 |        (cudart/cudaMemcpyAsync (safe (pointer dst)) (extract src)
424 |                                byte-count cudart/cudaMemcpyDefault hstream))
425 |      dst)))
426 |
427 | (defn offset-address [^Pointer p] ; base address of `p` plus its position scaled by the element size
428 |   (let [base (.address (safe p))] (+ base (* (.sizeof p) (.position p)))))
429 |
430 | (deftype CURuntimePtr [^Pointer dptr master] ; wraps a JavaCPP pointer to runtime-API memory; `master` decides whether release frees it
431 |   Object
432 |   (hashCode [_]
433 |     (hash dptr))
434 |   (equals [_ y] ; equal when the wrapped pointers are equal (the stray `0` operand made this always false)
435 |     (and (instance? CURuntimePtr y) (= dptr (.-dptr ^CURuntimePtr y))))
436 |   (toString [this]
437 |     (format "#RuntimePtr[:cuda, 0x%x, %d bytes]" (cu-address* this) (bytesize dptr)))
438 |   Releaseable
439 |   (release [_]
440 |     (locking dptr
441 |       (when-not (null? dptr)
442 |         (when master ; only the owning handle frees; position is reset so the base address is passed
443 |           (with-check (cudart/cudaFree (.position dptr 0)) (.setNull dptr))))
444 |       true))
445 |   Comonad
446 |   (extract [_]
447 |     (offset-address dptr))
448 |   CUPointer
449 |   (cu-address* [_]
450 |     (offset-address dptr))
451 |   (device? [_]
452 |     true)
453 |   PointerCreator
454 |   (pointer* [_]
455 |     dptr)
456 |   (pointer* [_ i]
457 |     (pointer dptr i))
458 |   TypedPointerCreator
459 |   (byte-pointer [_]
460 |     (byte-pointer dptr))
461 |   (clong-pointer [_]
462 |     (clong-pointer dptr))
463 |   (size-t-pointer [_]
464 |     (clong-pointer dptr))
465 |   (pointer-pointer [_]
466 |     (pointer-pointer dptr))
467 |   (char-pointer [_]
468 |     (char-pointer dptr))
469 |   (short-pointer [_]
470 |     (short-pointer dptr))
471 |   (int-pointer [_]
472 |     (int-pointer dptr))
473 |   (long-pointer [_]
474 |     (long-pointer dptr))
475 |   (float-pointer [_]
476 |     (float-pointer dptr))
477 |   (double-pointer [_]
478 |     (double-pointer dptr))
479 |   Bytes
480 |   (bytesize* [_]
481 |     (bytesize dptr))
482 |   Entries
483 |   (size* [_]
484 |     (size* dptr))
485 |   (sizeof* [_]
486 |     (.sizeof dptr))
487 |   Seqable
488 |   (seq [_]
489 |     (pointer-seq dptr))
490 |   Parameter
491 |   (set-parameter* [this pp i] ; stores a pointer to the numeric address, not the wrapped pointer itself
492 |     (put-entry! pp i (pointer (offset-address dptr))))
493 |   Memcpy
494 |   (memcpy-host* [this src byte-count] ; host `src` -> this memory
495 |     (with-check
496 |       (cudart/cuMemcpyHtoD (offset-address dptr) (safe (pointer src)) byte-count)
497 |       this))
498 |   (memcpy-host* [this src byte-count hstream]
499 |     (with-check
500 |       (cudart/cuMemcpyHtoDAsync (offset-address dptr) (safe (pointer src)) byte-count hstream)
501 |       this))
502 |   (memcpy* [this src byte-count]
503 |     (cupointer-memcpy* this src byte-count))
504 |   (memcpy* [this src byte-count hstream]
505 |     (cupointer-memcpy* this src byte-count hstream)))
506 |
507 | (defn malloc-runtime* ; allocates `size` bytes with `cudaMalloc`; the optional `pointer-type` fn re-types the byte pointer before wrapping
508 |   ([^long size]
509 |    (let-release [raw (byte-pointer nil)]
510 |      (with-check (cudart/cudaMalloc raw size)
511 |        (->CURuntimePtr (capacity! raw size) true))))
512 |   ([^long size pointer-type]
513 |    (let-release [raw (byte-pointer nil)]
514 |      (with-check (cudart/cudaMalloc raw size)
515 |        (->CURuntimePtr (pointer-type (capacity! raw size)) true)))))
516 |
517 | ;; =================== Pinned Memory ================================================
518 |
519 | (defn free-pinned [host-ptr] ; frees memory with `cuMemFreeHost`, then releases the JavaCPP pointer
520 |   (with-check (cudart/cuMemFreeHost host-ptr) (release host-ptr)))
521 |
522 | (defn unregister-pinned [host-ptr] ; unregisters the memory with `cuMemHostUnregister`; the pointer itself is returned, not released
523 |   (with-check (cudart/cuMemHostUnregister host-ptr) host-ptr))
524 |
525 | (deftype CUPinnedPtr [^Pointer hptr master release-fn] ; host pointer wrapper; an owning (`master`) handle is released through `release-fn`
526 |   Object
527 |   (hashCode [_]
528 |     (hash hptr))
529 |   (equals [this y]
530 |     (and (instance? CUPinnedPtr y) (= (offset-address hptr) (cu-address* y))))
531 |   (toString [this]
532 |     (format "#PinnedPtr[:cuda, 0x%x, %d bytes]" (offset-address hptr) (bytesize hptr)))
533 |   Releaseable
534 |   (release [_]
535 |     (locking hptr
536 |       (when-not (null? hptr)
537 |         (when master ; position is reset so `release-fn` receives the base address
538 |           (release-fn (.position hptr 0))))
539 |       true))
540 |   Comonad
541 |   (extract [_]
542 |     (extract hptr))
543 |   CUPointer
544 |   (cu-address* [_]
545 |     (offset-address hptr))
546 |   (device? [_]
547 |     false)
548 |   PointerCreator
549 |   (pointer* [_]
550 |     hptr)
551 |   (pointer* [_ i]
552 |     (pointer hptr i))
553 |   TypedPointerCreator
554 |   (byte-pointer [_]
555 |     (byte-pointer hptr))
556 |   (clong-pointer [_]
557 |     (clong-pointer hptr))
558 |   (size-t-pointer [_]
559 |     (clong-pointer hptr))
560 |   (pointer-pointer [_]
561 |     (pointer-pointer hptr))
562 |   (char-pointer [_]
563 |     (char-pointer hptr))
564 |   (short-pointer [_]
565 |     (short-pointer hptr))
566 |   (int-pointer [_]
567 |     (int-pointer hptr))
568 |   (long-pointer [_]
569 |     (long-pointer hptr))
570 |   (float-pointer [_]
571 |     (float-pointer hptr))
572 |   (double-pointer [_]
573 |     (double-pointer hptr))
574 |   Bytes
575 |   (bytesize* [_]
576 |     (bytesize hptr))
577 |   Entries
578 |   (size* [_]
579 |     (size* hptr))
580 |   (sizeof* [_]
581 |     (.sizeof hptr))
582 |   Seqable
583 |   (seq [_]
584 |     (pointer-seq hptr))
585 |   Accessor
586 |   (get-entry [_]
587 |     (get-entry hptr))
588 |   (get-entry [_ i]
589 |     (get-entry hptr i))
590 |   (put-entry! [this value]
591 |     (put-entry! hptr value)
592 |     this)
593 |   (put-entry! [this i value]
594 |     (put-entry! hptr i value)
595 |     this)
596 |   (get! [_ arr]
597 |     (get! hptr arr)
598 |     arr)
599 |   (get! [_ arr offset length]
600 |     (get! hptr arr offset length)
601 |     arr)
602 |   (put! [this obj]
603 |     (put! hptr obj)
604 |     this)
605 |   (put! [_ obj offset length]
606 |     (put! hptr obj offset length))
607 |   Parameter
608 |   (set-parameter* [_ pp i]
609 |     (put-entry! pp i (pointer (offset-address hptr))))
610 |   Memcpy
611 |   (memcpy-host* [this src byte-count] ; `src` -> this host memory; the source address comes from `src`, not from `hptr`
612 |     (with-check (cudart/cuMemcpyDtoH hptr (cu-address* src) byte-count) this))
613 |   (memcpy-host* [this src byte-count hstream] ; same copy through the Async call on `hstream`
614 |     (with-check (cudart/cuMemcpyDtoHAsync hptr (cu-address* src) byte-count hstream) this))
615 |   (memcpy* [this src byte-count]
616 |     (cupointer-memcpy* this src byte-count))
617 |   (memcpy* [this src byte-count hstream]
618 |     (cupointer-memcpy* this src byte-count hstream)))
619 |
620 | (defn mem-host-alloc*
621 |   "Allocates `byte-size` bytes of page-locked ('pinned') host memory, using raw integer `flags`.
622 |   The flag values are listed in [[constants/mem-host-alloc-flags]]. The memory is not initialized,
623 |   and `byte-size` must be greater than `0`. An optional `pointer-type` fn re-types the byte pointer.
624 |   See [cuMemHostAlloc](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html).
625 |   "
626 |   ([^long byte-size ^long flags]
627 |    (let-release [raw (byte-pointer nil)]
628 |      (with-check (cudart/cuMemHostAlloc raw byte-size flags)
629 |        (->CUPinnedPtr (capacity! raw byte-size) true free-pinned))))
630 |   ([^long byte-size ^long flags pointer-type]
631 |    (let-release [raw (byte-pointer nil)]
632 |      (with-check (cudart/cuMemHostAlloc raw byte-size flags)
633 |        (->CUPinnedPtr (pointer-type (capacity! raw byte-size)) true free-pinned)))))
634 |
635 | (defn mem-host-register*
636 |   "Registers previously allocated host `Pointer` and pins it, using raw integer `flags`. An optional `pointer-type` fn re-types `hptr` before wrapping.
637 |   See [cuMemHostRegister](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html).
638 |   "
639 |   ([hptr ^long flags]
640 |    (with-check (cudart/cuMemHostRegister hptr (bytesize hptr) flags)
641 |      (->CUPinnedPtr hptr true unregister-pinned)))
642 |   ([hptr ^long flags pointer-type]
643 |    (with-check (cudart/cuMemHostRegister hptr (bytesize hptr) flags)
644 |      ;; `hptr` already has its capacity; the old `(capacity! hptr size)` passed the `size` function as a number
645 |      (->CUPinnedPtr (pointer-type hptr) true unregister-pinned))))
646 |
;; Wrapper around a host `Pointer` whose memory comes from `cuMemAllocHost`
;; (see `mem-alloc-host*`). `hptr` is the wrapped pointer; `master` tells whether
;; releasing this object also frees the underlying host allocation.
(deftype CUMappedPtr [^Pointer hptr master]
  Object
  (hashCode [_]
    (hash hptr))
  (equals [this y]
    (and (instance? CUMappedPtr y) (= (cu-address* this) (cu-address* y))))
  (toString [this]
    ;; Labelled "MappedPtr" to agree with this type's `print-method`.
    (format "#MappedPtr[:cuda, 0x%x, %d bytes]" (cu-address* this) (bytesize hptr)))
  Releaseable
  (release [_]
    (locking hptr
      (when-not (null? hptr)
        ;; Only the owning ("master") wrapper frees the native allocation.
        (when master
          (with-check (cudart/cuMemFreeHost (.position hptr 0))
            (release hptr))))
      true))
  Comonad
  (extract [_]
    (offset-address hptr))
  CUPointer
  (cu-address* [_]
    (offset-address hptr))
  (device? [_]
    false)
  PointerCreator
  (pointer* [_]
    hptr)
  (pointer* [_ i]
    (pointer hptr i))
  TypedPointerCreator
  (byte-pointer [_]
    (byte-pointer hptr))
  (clong-pointer [_]
    (clong-pointer hptr))
  (size-t-pointer [_]
    ;; Delegates to the same-named function like every other method in this section
    ;; (it previously returned a clong pointer).
    (size-t-pointer hptr))
  (pointer-pointer [_]
    (pointer-pointer hptr))
  (char-pointer [_]
    (char-pointer hptr))
  (short-pointer [_]
    (short-pointer hptr))
  (int-pointer [_]
    (int-pointer hptr))
  (long-pointer [_]
    (long-pointer hptr))
  (float-pointer [_]
    (float-pointer hptr))
  (double-pointer [_]
    (double-pointer hptr))
  Bytes
  (bytesize* [_]
    (bytesize hptr))
  Entries
  (size* [_]
    (size* hptr))
  (sizeof* [_]
    (.sizeof hptr))
  Seqable
  (seq [_]
    (pointer-seq hptr))
  Accessor
  (get-entry [_]
    (get-entry hptr))
  (get-entry [_ i]
    (get-entry hptr i))
  (put-entry! [this value]
    (put-entry! hptr value)
    this)
  (put-entry! [this i value]
    (put-entry! hptr i value)
    this)
  (get! [_ arr]
    (get! hptr arr)
    arr)
  (get! [_ arr offset length]
    (get! hptr arr offset length)
    arr)
  (put! [this obj]
    (put! hptr obj)
    this)
  ;; NOTE(review): unlike the two-argument arity, this one returns whatever the underlying
  ;; `put!` returns rather than `this` - confirm whether that is intended.
  (put! [_ obj offset length]
    (put! hptr obj offset length))
  Parameter
  (set-parameter* [_ pp i]
    (put-entry! pp i (pointer (offset-address hptr))))
  Memcpy
  (memcpy-host* [this src byte-count]
    ;; NOTE(review): the host branch does not pass `byte-count` to `cpp/memcpy!` - confirm
    ;; that copying the full source is intended.
    (if (device? src)
      (with-check (cudart/cuMemcpy (offset-address hptr) (cu-address* src) byte-count) this)
      (cpp/memcpy! (safe (pointer src)) (extract hptr)))
    this)
  (memcpy-host* [this src byte-count hstream]
    (with-check
      (if (device? src)
        (cudart/cuMemcpyAsync (offset-address hptr) (cu-address* src) byte-count hstream)
        (cudart/cuMemcpyHtoDAsync (offset-address hptr) (safe (pointer src)) byte-count hstream))
      this))
  (memcpy* [this src byte-count]
    (cupointer-memcpy* this src byte-count))
  (memcpy* [this src byte-count hstream]
    (cupointer-memcpy* this src byte-count hstream)))
749 |
(defn mem-alloc-host*
  "Reserves `byte-size` bytes of uninitialized, page-locked host memory and wraps the result
  in a `CUMappedPtr`. When `pointer-type` is supplied, it is applied to the freshly sized
  byte pointer before wrapping. `byte-size` must be greater than `0`.
  See [cuMemAllocHost](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html).
  "
  ([^long byte-size]
   ;; Without an explicit pointer type the raw byte pointer is wrapped as-is.
   (mem-alloc-host* byte-size identity))
  ([^long byte-size pointer-type]
   (let-release [raw (byte-pointer nil)]
     (with-check (cudart/cuMemAllocHost raw byte-size)
       (->CUMappedPtr (pointer-type (capacity! raw byte-size)) true)))))
764 |
765 | ;; =============== Host memory =================================
766 |
;; Lets plain host `Pointer`s take part in the CUDA pointer protocols: they report their
;; own address, are never device memory, can be passed as kernel parameters, and can be the
;; destination of memory copies.
(extend-type Pointer
  CUPointer
  (cu-address* [this]
    (offset-address this))
  (device? [_]
    false)
  Parameter
  (set-parameter* [parameter pp i]
    (put-entry! pp i (pointer (offset-address parameter))))
  Memcpy
  ;; In all arities `this` is the destination and is returned on success.
  ;; The runtime calls take (dst, src, count, kind[, stream]); `memcpy-host*` previously
  ;; passed `kind` and `count` in swapped positions.
  (memcpy-host*
    ([this src byte-count]
     (with-check
       (if (instance? Pointer src)
         (cudart/cudaMemcpy (extract this) (safe (pointer src)) byte-count cudart/cudaMemcpyDefault)
         (cudart/cuMemcpyDtoH (extract this) (cu-address* src) byte-count))
       this))
    ([this src byte-count hstream]
     (with-check
       (if (instance? Pointer src)
         (cudart/cudaMemcpyAsync (extract this) (safe (pointer src))
                                 byte-count cudart/cudaMemcpyDefault hstream)
         (cudart/cuMemcpyDtoHAsync (extract this) (cu-address* src) byte-count hstream))
       this)))
  (memcpy*
    ([this src byte-count]
     (with-check
       (if (instance? Pointer src)
         (cudart/cudaMemcpy (extract this) (safe (pointer src)) byte-count cudart/cudaMemcpyDefault)
         (cudart/cuMemcpy (offset-address (extract this)) (cu-address* src) byte-count))
       this))
    ([this src byte-count hstream]
     (with-check
       (if (instance? Pointer src)
         (cudart/cudaMemcpyAsync (extract this) (safe (pointer src))
                                 byte-count cudart/cudaMemcpyDefault hstream)
         (cudart/cuMemcpyAsync (offset-address (extract this)) (cu-address* src) byte-count hstream))
       this))))
805 |
806 | ;; ================== Stream Management ======================================
807 |
(defn stream*
  "Creates a new stream configured by the integer `flag`, optionally with the given
  `priority`, and returns it.
  See [cuStreamCreate](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html)
  "
  ([^long flag]
   (let [hstream (CUstream_st.)]
     (with-check (cudart/cuStreamCreate hstream flag)
       hstream)))
  ([^long priority ^long flag]
   (let [hstream (CUstream_st.)]
     ;; Note the argument order of the driver call: flags first, then priority.
     (with-check (cudart/cuStreamCreateWithPriority hstream flag priority)
       hstream))))
818 |
(defn ready*
  "Determines status (ready or not) of a compute stream or event. Returns the raw integer
  result of the query; for any other kind of `obj`, returns `CUDA_ERROR_NOT_READY`.
  See [cuStreamQuery](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html),
  and [cuEventQuery](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html)
  "
  [obj]
  ;; `case` does not evaluate its test constants, so class names written there are matched
  ;; as symbols and never equal `(class obj)`; the old code therefore always returned the
  ;; default. `instance?` dispatches on the actual type.
  (cond
    (instance? CUstream_st obj) (cudart/cuStreamQuery ^CUstream_st obj)
    (instance? CUevent_st obj) (cudart/cuEventQuery ^CUevent_st obj)
    :else cudart/CUDA_ERROR_NOT_READY))
829 |
;; Message put on the channel by `StreamCallback`: the decoded `status` and the extracted
;; user `data`.
(defrecord StreamCallbackInfo [status data])

;; Function object that, when invoked with (stream, status, data), asynchronously puts a
;; `StreamCallbackInfo` on the channel `ch`. The stream argument is ignored.
(deftype StreamCallback [ch]
  IFn
  (invoke [_this _hstream status data]
    (go
      ;; Unknown status codes are passed through unchanged.
      (>! ch (->StreamCallbackInfo (get cu-result-codes status status)
                                   (extract data)))))
  (applyTo [this args]
    (AFn/applyToHelper this args)))
838 |
(defprotocol HostFn
  "Builds host callback functions whose payload decoding depends on the pointer type."
  (host-fn* [type ch]
    "Returns a one-argument function of `data` that asynchronously puts the decoded `data`
    on the channel `ch`."))

(extend-protocol HostFn
  KeywordPointer
  (host-fn* [_ ch]
    (fn [data]
      (go (>! ch (get-keyword (byte-pointer data))))))
  StringPointer
  (host-fn* [_ ch]
    (fn [data]
      (go (>! ch (get-string (byte-pointer data))))))
  ;; Any other pointer is forwarded to the channel untouched.
  Pointer
  (host-fn* [_ ch]
    (fn [data]
      (go (>! ch data)))))
859 |
(defn add-host-fn*
  "Wraps the function `f` in a `CUHostFn` and enqueues it on `hstream` with `data` as its
  argument. Returns `hstream`.
  See [cuLaunchHostFunc](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html).
  "
  [^CUstream_st hstream ^IFn f ^Pointer data]
  (let-release [callback (CUHostFn. f)]
    (with-check (cudart/cuLaunchHostFunc hstream callback data)
      hstream)))
865 |
(defn attach-mem*
  "Asynchronously attaches `byte-size` bytes of memory `mem` to `hstream`, as specified by
  the integer `flag`. Returns `hstream`.
  See [cuStreamAttachMemAsync](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html).
  "
  [^CUstream_st hstream mem byte-size flag]
  (with-check (cudart/cuStreamAttachMemAsync hstream mem byte-size flag)
    hstream))
872 |
873 | ;; ================== Event Management =======================================
874 |
(defn event*
  "Creates a new event configured by the integer `flags` and returns it.
  See [cuEventCreate](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html)
  "
  [^long flags]
  (let [hevent (CUevent_st.)]
    (with-check (cudart/cuEventCreate hevent flags)
      hevent)))
882 |
883 | ;; ================== Peer Context Memory Access =============================
884 |
(defn can-access-peer*
  "Queries if a device may directly access a peer device's memory. Returns `true` when the
  driver reports a positive value for the pair `dev`, `peer`.
  See [cuDeviceCanAccessPeer](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html)
  "
  [^long dev ^long peer]
  ;; The driver writes its answer into a one-element int buffer that is released on exit.
  (with-release [answer (int-pointer 1)]
    (with-check (cudart/cuDeviceCanAccessPeer ^IntPointer answer dev peer)
      (pos? (int (get-entry answer 0))))))
893 |
(defn p2p-attribute*
  "Queries attributes of the link between two devices. Returns `true` when the value
  reported for the integer `attribute` of the link from `dev` to `peer` is positive.
  See [cuDeviceGetP2PAttribute](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html)
  "
  [^long dev ^long peer ^long attribute]
  ;; `with-release` (instead of a plain `let`) frees the native result buffer once the
  ;; value has been read, matching `can-access-peer*`.
  (with-release [res (int-pointer 1)]
    (with-check
      (cudart/cudaDeviceGetP2PAttribute ^IntPointer res attribute dev peer)
      (pos? (int (get-entry res 0))))))
903 |
904 | ;; ================ print-method ============================================
905 |
(defn format-pointer
  "Writes `#<title>[:cuda, 0x<address>]` for the pointer `p` to the writer `w`."
  [title p ^java.io.Writer w]
  (let [text (format "#%s[:cuda, 0x%x]" title (address p))]
    (.write w text)))
908 |
;; Devices print as their own string representation.
(defmethod print-method CUDevice [dev ^java.io.Writer w]
  (.write w (str dev)))

;; Every other handle type prints through `format-pointer` under its own title.
(doseq [[handle-class title] [[CUctx_st "Context"]
                              [CUstream_st "Stream"]
                              [CUevent_st "Event"]
                              [CUmod_st "Module"]
                              [CUlinkState_st "LinkState"]
                              [_nvrtcProgram "Program"]
                              [CUDevicePtr "DevicePtr"]
                              [CUPinnedPtr "PinnedPtr"]
                              [CUMappedPtr "MappedPtr"]]]
  (defmethod print-method handle-class [p w]
    (format-pointer title p w)))
938 |
--------------------------------------------------------------------------------
/src/clojure/uncomplicate/clojurecuda/internal/utils.clj:
--------------------------------------------------------------------------------
1 | ;; Copyright (c) Dragan Djuric. All rights reserved.
2 | ;; The use and distribution terms for this software are covered by the
3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
4 | ;; which can be found in the file LICENSE at the root of this distribution.
5 | ;; By using this software in any fashion, you are agreeing to be bound by
6 | ;; the terms of this license.
7 | ;; You must not remove this notice, or any other, from this software.
8 |
9 | (ns ^{:author "Dragan Djuric"}
10 | uncomplicate.clojurecuda.internal.utils
11 | "Utility functions used as helpers in other ClojureCUDA namespaces.
12 | The user of the ClojureCUDA library would probably not need to use
13 | any of the functions defined here."
14 | (:require [uncomplicate.commons.utils :as utils]
15 | [uncomplicate.clojurecuda.internal.constants :refer [cu-result-codes]])
16 | (:import clojure.lang.ExceptionInfo))
17 |
18 | ;; ============= Error Codes ===================================================
19 |
(defn error
  "Converts a CUDA error code to an [ExceptionInfo](http://clojuredocs.org/clojure.core/ex-info)
  with richer, user-friendly information.
  Accepts a long `err-code` that should be one of the codes defined in the CUDA standard, and
  an optional `details` argument that could be anything that you think is informative.
  Codes that are not found in `cu-result-codes` are reported by their raw number.

  Examples:
  (error 0) => an ExceptionInfo instance
  (error -5 {:comment \"Why here?\"}) => an ExceptionInfo instance
  "
  ([^long err-code]
   (error err-code nil))
  ([^long err-code details]
   (let [err-name (get cu-result-codes err-code err-code)
         message (format "CUDA error: %s." err-name)]
     (ex-info message {:name err-name :code err-code :type :cuda :details details}))))
36 |
(defmacro with-check
  "Evaluates `form` if `status` is zero (`:success`), otherwise throws
  an appropriate `ExceptionInfo` with decoded informative details.
  It helps with CUDA methods that return error codes directly, while
  returning computation results through side-effects in arguments.
  The three-argument version attaches `details` to the thrown exception.

  Example:
  (with-check (some-jcuda-call-that-returns-error-code) result)
  "
  ([status form]
   `(utils/with-check error ~status ~form))
  ([status details form]
   `(let [code# ~status]
      (if-not (= 0 code#)
        (throw (error code# ~details))
        ~form))))
53 |
(defmacro maybe
  "Evaluates `form` in a try/catch block; if a CUDA-related
  [ExceptionInfo](http://clojuredocs.org/clojure.core/ex-info) is caught (its data has
  `:type :cuda`), the result is the `:name` stored in the exception data. Any other
  `ExceptionInfo` is rethrown."
  [form]
  `(try
     ~form
     (catch ExceptionInfo e#
       (let [data# (ex-data e#)]
         (if (= :cuda (:type data#))
           (:name data#)
           (throw e#))))))
63 |
--------------------------------------------------------------------------------
/src/clojure/uncomplicate/clojurecuda/toolbox.clj:
--------------------------------------------------------------------------------
1 | ;; Copyright (c) Dragan Djuric. All rights reserved.
2 | ;; The use and distribution terms for this software are covered by the
3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
4 | ;; which can be found in the file LICENSE at the root of this distribution.
5 | ;; By using this software in any fashion, you are agreeing to be bound by
6 | ;; the terms of this license.
7 | ;; You must not remove this notice, or any other, from this software.
8 |
9 | (ns ^{:author "Dragan Djuric"}
10 | uncomplicate.clojurecuda.toolbox
11 | "Various helpers that are not needed by ClojureCUDA itself,
12 | but may be very helpful in applications. See Neanderthal and Bayadera libraries
13 | for the examples of how to use them."
14 | (:require [uncomplicate.commons
15 | [core :refer [with-release]]
16 | [utils :refer [count-groups]]]
17 | [uncomplicate.clojure-cpp
18 | :refer [byte-pointer get-long get-int get-double get-float]]
19 | [uncomplicate.clojurecuda.core
20 | :refer [grid-1d grid-2d launch! memcpy-host! parameters set-parameter! set-parameter!]])
21 | (:import org.bytedeco.javacpp.PointerPointer))
22 |
(defn launch-reduce!
  "Launches `main-kernel` once and then `reduction-kernel` repeatedly on `hstream`, shrinking
  the reduced dimension by `(count-groups local-n ...)` on every pass until one group is left.
  Returns `hstream`.

  `main-params` and `reduction-params` may each be either a ready `PointerPointer` or a
  sequence of arguments; in both cases the leading size parameter(s) are filled in here.
  The shorter arity reduces `n` elements in one dimension; the longer one works on an
  `m` x `n` grid and reduces along `n`.

  Throws `IllegalArgumentException` when `local-n` cannot shrink `n` (that is, `local-n` is
  not greater than `1` while `n` is larger than `local-n`), since the reduction loop would
  then never terminate."
  ([hstream main-kernel reduction-kernel main-params reduction-params n local-n]
   ;; Same termination guard as the 2D arity; without it local-n = 1 with n > 1 loops forever.
   (when-not (or (< 1 (long local-n)) (<= (long n) (long local-n)))
     (throw (IllegalArgumentException.
             (format "local-n %d would cause infinite recursion for n:%d." local-n n))))
   (let [main-params (if (instance? PointerPointer main-params)
                       (set-parameter! main-params 0 n)
                       (apply parameters n main-params))
         reduction-params (if (instance? PointerPointer reduction-params)
                            reduction-params
                            (apply parameters Integer/MAX_VALUE reduction-params))]
     (launch! main-kernel (grid-1d n local-n) hstream main-params)
     (loop [global-size (count-groups local-n n)]
       (when (< 1 global-size)
         (launch! reduction-kernel (grid-1d global-size local-n) hstream
                  (set-parameter! reduction-params 0 global-size))
         (recur (count-groups local-n global-size))))
     hstream))
  ([hstream main-kernel reduction-kernel main-params reduction-params m n local-m local-n]
   ;; NOTE(review): `set-parameter!` is called below with two values (m and n/global-size);
   ;; confirm that core/set-parameter! accepts more than one value.
   (let [main-params (if (instance? PointerPointer main-params)
                       (set-parameter! main-params 0 m n)
                       (apply parameters m n main-params))
         reduction-params (if (instance? PointerPointer reduction-params)
                            reduction-params
                            (apply parameters Integer/MAX_VALUE Integer/MAX_VALUE reduction-params))]
     ;; The reduction makes progress only if local-n > 1, or if everything already fits in a
     ;; single group (n <= local-n). The comparison used to be the other way round, which
     ;; let the non-terminating case local-n = 1, n > 1 through.
     (if (or (< 1 (long local-n)) (<= (long n) (long local-n)))
       (loop [hstream (launch! main-kernel (grid-2d m n local-m local-n) hstream main-params)
              global-size (count-groups local-n n)]
         (if (= 1 global-size)
           hstream
           (recur (launch! reduction-kernel (grid-2d m global-size local-m local-n) hstream
                           (set-parameter! reduction-params 0 m global-size))
                  (count-groups local-n global-size))))
       (throw (IllegalArgumentException.
               (format "local-n %d would cause infinite recursion for n:%d." local-n n)))))))
55 |
(defn read-int
  "Copies `Integer/BYTES` bytes from `cu-buf` into a temporary host byte pointer and returns
  the int stored at its beginning. The two-argument arity passes `hstream` to `memcpy-host!`."
  (^long [cu-buf]
   (with-release [host (byte-pointer Integer/BYTES)]
     (memcpy-host! cu-buf host)
     (get-int host 0)))
  (^long [hstream cu-buf]
   (with-release [host (byte-pointer Integer/BYTES)]
     (memcpy-host! cu-buf host hstream)
     (get-int host 0))))
65 |
(defn read-long
  "Copies `Long/BYTES` bytes from `cu-buf` into a temporary host byte pointer and returns
  the long stored at its beginning. The two-argument arity passes `hstream` to `memcpy-host!`."
  (^long [cu-buf]
   (with-release [host (byte-pointer Long/BYTES)]
     (memcpy-host! cu-buf host)
     (get-long host 0)))
  (^long [hstream cu-buf]
   (with-release [host (byte-pointer Long/BYTES)]
     (memcpy-host! cu-buf host hstream)
     (get-long host 0))))
75 |
(defn read-double
  "Copies `Double/BYTES` bytes from `cu-buf` into a temporary host byte pointer and returns
  the double stored at its beginning. The two-argument arity passes `hstream` to `memcpy-host!`."
  (^double [cu-buf]
   (with-release [host (byte-pointer Double/BYTES)]
     (memcpy-host! cu-buf host)
     (get-double host 0)))
  (^double [hstream cu-buf]
   (with-release [host (byte-pointer Double/BYTES)]
     (memcpy-host! cu-buf host hstream)
     (get-double host 0))))
85 |
(defn read-float
  "Copies `Float/BYTES` bytes from `cu-buf` into a temporary host byte pointer and returns
  the float stored at its beginning (the return is hinted as a double). The two-argument
  arity passes `hstream` to `memcpy-host!`."
  (^double [cu-buf]
   (with-release [host (byte-pointer Float/BYTES)]
     (memcpy-host! cu-buf host)
     (get-float host 0)))
  (^double [hstream cu-buf]
   (with-release [host (byte-pointer Float/BYTES)]
     (memcpy-host! cu-buf host hstream)
     (get-float host 0))))
95 |
--------------------------------------------------------------------------------
/src/cuda/uncomplicate/clojurecuda/include/jitify/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2017, NVIDIA Corporation
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/src/cuda/uncomplicate/clojurecuda/include/jitify/float.h:
--------------------------------------------------------------------------------
1 | #ifndef _float_h_
2 | #define _float_h_
3 |
// Returns the float whose object representation is the same as that of the int `i`
// (the value is passed through a union, not converted numerically).
inline __host__ __device__ float jitify_int_as_float(int i) {
  union IntBits { int raw; float value; } pun;
  pun.raw = i;
  return pun.value;
}
9 |
// Returns the double whose object representation is the same as that of the long long `i`
// (the value is passed through a union, not converted numerically).
inline __host__ __device__ double jitify_longlong_as_double(long long i) {
  union LongLongBits { long long raw; double value; } pun;
  pun.raw = i;
  return pun.value;
}
15 |
// <float.h>-style limits for float and double. The value-like macros (MAX, EPSILON, MIN)
// are produced by reinterpreting integer bit patterns with the helpers above.
#define FLT_RADIX       2
#define FLT_MANT_DIG    24
#define DBL_MANT_DIG    53
#define FLT_DIG         6
#define DBL_DIG         15
#define FLT_MIN_EXP     -125
#define DBL_MIN_EXP     -1021
#define FLT_MIN_10_EXP  -37
#define DBL_MIN_10_EXP  -307
#define FLT_MAX_EXP     128
#define DBL_MAX_EXP     1024
#define FLT_MAX_10_EXP  38
#define DBL_MAX_10_EXP  308
// 2139095039 == 0x7F7FFFFF
#define FLT_MAX         jitify_int_as_float(2139095039)
// 9218868437227405311 == 0x7FEFFFFFFFFFFFFF
#define DBL_MAX         jitify_longlong_as_double(9218868437227405311)
// 872415232 == 0x34000000
#define FLT_EPSILON     jitify_int_as_float(872415232)
// 4372995238176751616 == 0x3CB0000000000000
#define DBL_EPSILON     jitify_longlong_as_double(4372995238176751616)
// 8388608 == 0x00800000
#define FLT_MIN         jitify_int_as_float(8388608)
// 4503599627370496 == 0x0010000000000000
#define DBL_MIN         jitify_longlong_as_double(4503599627370496)
#define FLT_ROUNDS      1
36 |
37 | #endif
38 |
--------------------------------------------------------------------------------
/src/cuda/uncomplicate/clojurecuda/include/jitify/stddef.h:
--------------------------------------------------------------------------------
1 | #ifndef _stddef_h_
2 | #define _stddef_h_
3 |
// Minimal <stddef.h> replacement: both types are defined in terms of `long`.
// NOTE(review): this presumably assumes a platform where `long` is pointer-sized - confirm.
typedef unsigned long size_t;
typedef signed   long ptrdiff_t;
6 |
7 | #endif
8 |
--------------------------------------------------------------------------------
/src/cuda/uncomplicate/clojurecuda/include/jitify/stdint.h:
--------------------------------------------------------------------------------
1 | #ifndef _stdint_h_
2 | #define _stdint_h_
3 |
// Minimal <stdint.h> replacement. Exact-width, "fast" and "least" variants all map to the
// same built-in type per width.
// Signed integer types.
typedef signed char      int8_t;
typedef signed short     int16_t;
typedef signed int       int32_t;
typedef signed long long int64_t;
typedef signed char      int_fast8_t;
typedef signed short     int_fast16_t;
typedef signed int       int_fast32_t;
typedef signed long long int_fast64_t;
typedef signed char      int_least8_t;
typedef signed short     int_least16_t;
typedef signed int       int_least32_t;
typedef signed long long int_least64_t;
typedef signed long long intmax_t;
typedef signed long      intptr_t;
// Unsigned integer types.
typedef unsigned char      uint8_t;
typedef unsigned short     uint16_t;
typedef unsigned int       uint32_t;
typedef unsigned long long uint64_t;
typedef unsigned char      uint_fast8_t;
typedef unsigned short     uint_fast16_t;
typedef unsigned int       uint_fast32_t;
typedef unsigned long long uint_fast64_t;
typedef unsigned char      uint_least8_t;
typedef unsigned short     uint_least16_t;
typedef unsigned int       uint_least32_t;
typedef unsigned long long uint_least64_t;
typedef unsigned long long uintmax_t;
typedef unsigned long      uintptr_t;
// Limits. These expand to <limits.h>-style names (SCHAR_MIN, INT_MAX, ...) that this
// header does not define itself, so they must be available wherever the macros are used.
#define INT8_MIN    SCHAR_MIN
#define INT16_MIN   SHRT_MIN
#define INT32_MIN   INT_MIN
#define INT64_MIN   LLONG_MIN
#define INT8_MAX    SCHAR_MAX
#define INT16_MAX   SHRT_MAX
#define INT32_MAX   INT_MAX
#define INT64_MAX   LLONG_MAX
#define UINT8_MAX   UCHAR_MAX
#define UINT16_MAX  USHRT_MAX
#define UINT32_MAX  UINT_MAX
#define UINT64_MAX  ULLONG_MAX
#define INTPTR_MIN  LONG_MIN
#define INTMAX_MIN  LLONG_MIN
#define INTPTR_MAX  LONG_MAX
#define INTMAX_MAX  LLONG_MAX
#define UINTPTR_MAX ULONG_MAX
#define UINTMAX_MAX ULLONG_MAX
#define PTRDIFF_MIN INTPTR_MIN
#define PTRDIFF_MAX INTPTR_MAX
#define SIZE_MAX    UINT64_MAX
53 |
54 | #endif
55 |
--------------------------------------------------------------------------------
/src/cuda/uncomplicate/clojurecuda/kernels/reduction.cu:
--------------------------------------------------------------------------------
extern "C" {

// Compile-time configuration; every macro can be overridden before this point.
// REAL, WGSm and WGSn are not referenced by the code in this file.
#ifndef REAL
#define REAL float
#endif

// Type of the values being accumulated.
#ifndef ACCUMULATOR
#define ACCUMULATOR float
#endif

// Number of elements in the shared scratch arrays used by the block reductions below.
#ifndef WGS
#define WGS 1024
#endif

#ifndef WGSm
#define WGSm 64
#endif

#ifndef WGSn
#define WGSn 16
#endif

// ================= Sum reduction =============================================

    // Sums `value` over the threads of a block along x and returns the total (lacc[0]).
    // The scratch array has WGS elements and is indexed by threadIdx.x, so blockDim.x
    // must not exceed WGS.
    __device__ ACCUMULATOR block_reduction_sum (const ACCUMULATOR value) {

        const int local_id = threadIdx.x;

        __shared__ ACCUMULATOR lacc[WGS];
        lacc[local_id] = value;

        // Every thread's value must be in shared memory before anyone reads it.
        __syncthreads();

        ACCUMULATOR pacc = value;
        int i = blockDim.x;
        while (i > 0) {
            // When i is odd, the thread with id (i/2 - 1) also picks up the last element,
            // which the halving below would otherwise leave out.
            const bool include_odd = (i > ((i >> 1) << 1)) && (local_id == ((i >> 1) - 1));
            i >>= 1;
            if (include_odd) {
                pacc += lacc[local_id + i + 1];
            }
            // The lower half adds the upper half and stores the partial sum.
            if (local_id < i) {
                pacc += lacc[local_id + i];
                lacc[local_id] = pacc;
            }
            __syncthreads();
        }

        return lacc[0];
    }

    // Two-dimensional variant: for every row (threadIdx.x), sums `value` over the
    // block's columns (threadIdx.y) and returns that row's total (lacc[local_row]).
    // The scratch array has WGS elements and is indexed by
    // threadIdx.x + threadIdx.y * blockDim.x.
    __device__ ACCUMULATOR block_reduction_sum_2 (const ACCUMULATOR value) {

        const int local_row = threadIdx.x;
        const int local_col = threadIdx.y;
        const int local_m = blockDim.x;

        __shared__ ACCUMULATOR lacc[WGS];
        lacc[local_row + local_col * local_m] = value;

        __syncthreads();

        ACCUMULATOR pacc = value;
        int i = blockDim.y;
        while (i > 0) {
            // Same odd-count handling as in block_reduction_sum, applied along columns.
            const bool include_odd = (i > ((i >> 1) << 1)) && (local_col == ((i >> 1) - 1));
            i >>= 1;
            if (include_odd) {
                pacc += lacc[local_row + (local_col + i + 1) * local_m];
            }
            if (local_col < i) {
                pacc += lacc[local_row + (local_col + i) * local_m];
                lacc[local_row + local_col * local_m] = pacc;
            }
            __syncthreads();
        }

        return lacc[local_row];

    }

    // Reduces `acc` in place: each block sums its slice of the first `n` elements
    // (out-of-range threads contribute 0) and thread 0 writes the result to acc[blockIdx.x].
    __global__ void sum_reduction(const int n, ACCUMULATOR* acc) {
        const int gid = blockIdx.x * blockDim.x + threadIdx.x;
        const ACCUMULATOR sum = block_reduction_sum( (gid < n) ? acc[gid] : 0.0);
        if (threadIdx.x == 0) {
            acc[blockIdx.x] = sum;
        }
    }

    // 2D in-place reduction along the second grid dimension. Elements are read at
    // m * gid_1 + gid_0; each block's per-row total is written to acc[m * blockIdx.y + gid_0].
    __global__ void sum_reduction_horizontal (const int m, const int n, ACCUMULATOR* acc) {
        const int gid_0 = blockIdx.x * blockDim.x + threadIdx.x;
        const int gid_1 = blockIdx.y * blockDim.y + threadIdx.y;
        const int i = m * gid_1 + gid_0;
        const bool valid = (gid_0 < m) && (gid_1 < n);
        const ACCUMULATOR sum = block_reduction_sum_2( (valid) ? acc[i] : 0.0);
        const bool write = valid && (threadIdx.y == 0);
        if (write) {
            acc[m * blockIdx.y + gid_0] = sum;
        }
    }

    // Same as sum_reduction_horizontal except that elements are read at n * gid_0 + gid_1;
    // results are still written to acc[m * blockIdx.y + gid_0].
    __global__ void sum_reduction_vertical (const int m, const int n, ACCUMULATOR* acc) {
        const int gid_0 = blockIdx.x * blockDim.x + threadIdx.x;
        const int gid_1 = blockIdx.y * blockDim.y + threadIdx.y;
        const int i = n * gid_0 + gid_1;
        const bool valid = (gid_0 < m) && (gid_1 < n);
        const ACCUMULATOR sum = block_reduction_sum_2( (valid) ? acc[i] : 0.0);
        const bool write = valid && (threadIdx.y == 0);
        if (write) {
            acc[m * blockIdx.y + gid_0] = sum;
        }
    }

}
115 |
--------------------------------------------------------------------------------
/src/java/uncomplicate/clojurecuda/internal/javacpp/CUHostFn.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) Dragan Djuric. All rights reserved.
2 | // The use and distribution terms for this software are covered by the
3 | // Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
4 | // which can be found in the file LICENSE at the root of this distribution.
5 | // By using this software in any fashion, you are agreeing to be bound by
6 | // the terms of this license.
7 | // You must not remove this notice, or any other, from this software.
8 |
9 | package uncomplicate.clojurecuda.internal.javacpp;
10 |
11 | import clojure.lang.IFn;
12 | import org.bytedeco.javacpp.Pointer;
13 | import org.bytedeco.cuda.cudart.CUhostFn;
14 | import org.bytedeco.cuda.cudart.CUstream_st;
15 |
16 |
17 | public class CUHostFn extends CUhostFn {
18 |
19 | private IFn fun;
20 |
21 | public CUHostFn (IFn fun) {
22 | this.fun = fun;
23 | }
24 |
25 | public void call (Pointer userData) {
26 | fun.invoke(userData);
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/src/java/uncomplicate/clojurecuda/internal/javacpp/CUStreamCallback.java:
--------------------------------------------------------------------------------
1 | // Copyright (c) Dragan Djuric. All rights reserved.
2 | // The use and distribution terms for this software are covered by the
3 | // Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
4 | // which can be found in the file LICENSE at the root of this distribution.
5 | // By using this software in any fashion, you are agreeing to be bound by
6 | // the terms of this license.
7 | // You must not remove this notice, or any other, from this software.
8 |
9 | package uncomplicate.clojurecuda.internal.javacpp;
10 |
11 | import clojure.lang.IFn;
12 | import org.bytedeco.javacpp.Pointer;
13 | import org.bytedeco.cuda.cudart.CUstreamCallback;
14 | import org.bytedeco.cuda.cudart.CUstream_st;
15 |
16 |
17 | public class CUStreamCallback extends CUstreamCallback {
18 |
19 | private IFn fun;
20 |
21 | public CUStreamCallback (IFn fun) {
22 | this.fun = fun;
23 | }
24 |
25 | public void call (CUstream_st hstream, int status, Pointer userData) {
26 | fun.invoke(hstream, status, userData);
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/core_test.clj:
--------------------------------------------------------------------------------
1 | ;; Copyright (c) Dragan Djuric. All rights reserved.
2 | ;; The use and distribution terms for this software are covered by the
3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
4 | ;; which can be found in the file LICENSE at the root of this distribution.
5 | ;; By using this software in any fashion, you are agreeing to be bound by
6 | ;; the terms of this license.
7 | ;; You must not remove this notice, or any other, from this software.
8 |
9 | (ns uncomplicate.clojurecuda.core-test
10 | (:require [midje.sweet :refer [facts => throws truthy]]
11 | [clojure.core.async :refer [chan true)
27 |
;; Device lookup: a valid ordinal yields a device, out-of-range ordinals throw, and a
;; device can be looked up again through its PCI bus id string.
(facts
 "Device tests."
 (<= 0 (device-count)) => true
 (device 0) => truthy
 (device -1) => (throws ExceptionInfo)
 (device 33) => (throws ExceptionInfo)
 (device (pci-bus-id-string (device))) => (device))
35 |
36 | ;; ===================== Context Management Tests =======================================
37 |
;; Context creation, release, and switching between two nested contexts.
(facts
 "Context tests"
 (with-release [dev (device 0)]
   (let [ctx (context dev :sched-auto)]
     ctx => truthy
     (release ctx) => true
     ;; An unrecognized flag keyword is rejected.
     (context dev :unknown) => (throws ExceptionInfo))
   (let [ctx1 (context dev :sched-blocking-sync)
         ctx2 (context dev :sched-blocking-sync)]
     (with-context ctx1
       (with-context ctx2
         (current-context) => ctx2
         ;; Popping the inner context makes the outer one current again.
         (do (pop-context!) (current-context)) => ctx1
         (current-context! ctx2) => ctx2
         (current-context) => ctx2
         ;; Releasing an already released context still returns true.
         (release ctx2) => true
         (release ctx2) => true)))))
55 |
56 | ;; =============== Module Management & Execution Control Tests =====================================
57 |
;; Kernel parameter packing: the parameter array holds the int count and the device
;; address of the buffer; also checks the addresses of pointers derived from the buffer.
(facts
 "Test Parameters"
 (with-context (context (device))
   (with-release [cnt 3
                  extra 4
                  gpu-a (mem-alloc-runtime (* Float/BYTES (+ cnt extra)))
                  params (parameters cnt gpu-a)]
     (size params) => 2
     (get-entry (int-pointer (get-entry params 0))) => 3
     (get-entry (long-pointer (get-entry params 1))) => (cu-address* gpu-a)
     (address (pointer gpu-a)) => (cu-address* gpu-a)
     (address (pointer gpu-a 1)) => (inc (long (cu-address* gpu-a)))
     (address (position! (pointer gpu-a) 1)) => (cu-address* gpu-a))))
71 |
;; Compiles the test kernels once, then checks a kernel launch over `cnt` elements (the
;; `extra` trailing elements must stay untouched) and access to module-level globals.
(let [program-source (slurp "test/cuda/uncomplicate/clojurecuda/kernels/test.cu")
      cnt 300
      extra 5]
  (with-context (context (device))
    (with-release [prog (compile! (program program-source {"dummy" "placeholder"}))
                   grid (grid-1d cnt (min 256 cnt))]
      (with-release [modl (module prog)
                     fun (function modl "inc")
                     strm (stream :non-blocking)
                     host-a (float-pointer (+ cnt extra))
                     gpu-a (mem-alloc-runtime (* Float/BYTES (+ cnt extra)))]

        (facts
         "Test launch"
         (fill! host-a 0)
         (put-entry! host-a 0 1)
         (put-entry! host-a 10 100)
         (memcpy-host! host-a gpu-a strm) => gpu-a
         (launch! fun grid strm (parameters (int cnt) gpu-a)) => strm
         (synchronize! strm) => strm
         (memcpy-host! gpu-a host-a strm) => host-a
         ;; The first `cnt` entries were incremented by one...
         (get-entry host-a 0) => 2.0
         (get-entry host-a 10) => 101.0
         (get-entry host-a (dec cnt)) => 1.0
         ;; ...while the entries past `cnt` are still zero.
         (get-entry host-a cnt) => 0.0
         (get-entry host-a (dec (+ cnt extra))) => 0.0))

      (with-release [modl (module)]
        (facts
         "Test device globals"
         (load! modl prog) => modl
         (with-release [fun (function modl "constant_inc")
                        gpu-a (global modl "gpu_a")
                        constant-gpu-a (global modl "constant_gpu_a")]
           (pointer-seq (memcpy-host! gpu-a (float-pointer 3))) => (seq [1.0 2.0 3.0])
           (memcpy! gpu-a constant-gpu-a) => constant-gpu-a
           (launch! fun (grid-1d 3) (parameters 3 gpu-a))
           ;; The constant copy keeps the old values; gpu_a now holds the doubled ones.
           (pointer-seq (memcpy-host! constant-gpu-a (float-pointer 3))) => (seq [1.0 2.0 3.0])
           (pointer-seq (memcpy-host! gpu-a (float-pointer 3))) => (seq [2.0 4.0 6.0])))))))
111 |
112 | ;; =============== Stream Management Tests ==============================================
113 |
(with-context (context (device 0) :map-host)

  ;; Round-trips one float: host array -> device -> device -> host bytes.
  (facts
   "Stream creation and memory copy tests."
   (with-release [hstream (stream :non-blocking)
                  dev-src (mem-alloc-runtime Float/BYTES)
                  dev-dst (mem-alloc-runtime Float/BYTES)
                  host-src (float-array [173.0])
                  host-dst (byte-pointer Float/BYTES)]
     (memcpy-host! host-src dev-src hstream) => dev-src
     (synchronize! hstream)
     (memcpy! dev-src dev-dst) => dev-dst
     (memcpy-host! dev-dst host-dst hstream) => host-dst
     (synchronize! hstream)
     (get-float host-dst 0) => 173.0))

  ;; A repeated release must keep reporting true, and a released buffer is
  ;; expected to be rejected by memcpy!.
  (facts
   "Stream and memory release."
   (with-release [hstream (stream :non-blocking)
                  dev-buf (mem-alloc-runtime Float/BYTES)]
     (release hstream) => true
     (release hstream) => true
     (release dev-buf) => true
     (memcpy! dev-buf dev-buf) => (throws IllegalArgumentException)
     (release dev-buf) => true)))
139 |
;; Registers a host callback on the stream and checks that, after the queued
;; copies, the value passed to `listen!` (:host) can be read from the channel.
(with-context (context (device 0) :map-host)
  (facts
   "Host functions."
   ;; One channel is enough. It is bound with `let` only: the former second
   ;; `ch (chan)` binding inside `with-release` merely shadowed this one.
   (let [ch (chan)]
     (with-release [strm (stream :non-blocking)
                    cuda1 (mem-alloc-runtime Float/BYTES)
                    cuda2 (mem-alloc-runtime Float/BYTES)
                    host1 (float-array [163.0])
                    host2 (float-pointer [12])]
       (listen! strm ch :host)
       (memcpy-host! host1 cuda1 strm) => cuda1
       (memcpy! cuda1 cuda2 strm) => cuda2
       (synchronize! strm)
       ;; A Java array as the destination of a stream copy is expected to be rejected.
       (memcpy-host! cuda2 (float-array 1) strm) => (throws Exception)
       (get-entry (memcpy-host! cuda2 host2 strm) 0) => 163.0
       ;; Blocking take from the callback channel. Fully qualified so the form
       ;; does not depend on `<!!` being in this namespace's :refer list.
       (clojure.core.async/<!! ch) => :host))))
157 |
158 | ;; =============== Memory Management Tests ==============================================
159 |
160 | (with-release [dev (device 0)]
161 | (with-context (context dev :map-host)
162 |
;; A zero-sized driver allocation must throw; a one-float runtime allocation
;; must report its byte size.
(facts
 "mem-alloc-runtime tests."
 (mem-alloc-driver 0) => (throws ExceptionInfo)
 (with-release [one-float (mem-alloc-runtime Float/BYTES)]
   (bytesize one-float) => Float/BYTES))
168 |
;; Synchronous round trip of one float through two device buffers.
(facts
 "Linear memory tests."
 (with-release [dev-src (mem-alloc-runtime Float/BYTES)
                dev-dst (mem-alloc-runtime Float/BYTES)
                host-src (float-array [173.0])
                host-dst (byte-pointer Float/BYTES)]
   (memcpy-host! host-src dev-src) => dev-src
   (memcpy! dev-src dev-dst) => dev-dst
   (memcpy-host! dev-dst host-dst) => host-dst
   (get-float host-dst 0) => 173.0))
179 |
;; Splits a 20-byte buffer into an 8-byte head and a 12-byte tail, reads each
;; part, and verifies the parent is still readable after the parts are released.
(facts
 "Linear memory sub-region tests."
 (with-release [whole (mem-alloc-runtime 20)]
   (memcpy-host! (float-array [1 2 3 4 5]) whole) => whole
   (let-release [head (mem-sub-region whole 0 8)
                 tail (mem-sub-region whole 8 12)]
     ;; Offset 8 plus 20 bytes exceeds the 20-byte parent.
     (mem-sub-region whole 8 20) => (throws ExceptionInfo)
     (pointer-seq (memcpy-host! head (float-pointer 2))) => [1.0 2.0]
     (pointer-seq (memcpy-host! tail (float-pointer 3))) => [3.0 4.0 5.0]
     (do (release head)
         (release tail)
         (pointer-seq (memcpy-host! whole (float-pointer 5))) => [1.0 2.0 3.0 4.0 5.0]))))
192 |
;; Exercises typed (:float) runtime allocations: a zero-byte allocation, copies
;; to pageable and mapped host memory, and a device-to-device copy whose
;; destination offset is selected with position!.
(facts
 "Runtime cudaMalloc tests."
 (with-release [cuda1 (mem-alloc-runtime Float/BYTES :float)
                cuda2 (mem-alloc-runtime (* 3 Float/BYTES) :float)
                host1 (float-pointer [100.0])
                host2 (mem-alloc-mapped Float/BYTES :float)
                zero (mem-alloc-runtime 0)]
   ;; A zero-sized runtime request is expected to yield a truthy value.
   zero => truthy
   (bytesize cuda1) => Float/BYTES
   (memcpy-host! host1 cuda1) => cuda1
   (synchronize!)
   (pointer-seq (memcpy-host! cuda1 (float-pointer 1))) => [100.0]
   (seq (memcpy! cuda1 host2)) => [100.0]
   ;; Move cuda2's pointer to element 2; the next assertion confirms that the
   ;; position is retained by what (pointer cuda2) returns.
   (position! (pointer cuda2) 2)
   (.position (pointer cuda2)) => 2
   (memcpy! cuda1 cuda2) => cuda2
   ;; Rewind, then fill the first two slots; the final read shows the earlier
   ;; copy landed in the last slot.
   (position! (pointer cuda2) 0)
   (memcpy-host! (float-pointer [200.0 300.0]) cuda2) => cuda2
   (pointer-seq (memcpy-host! cuda2 (float-pointer 3))) => [200.0 300.0 100.0]))
212 |
;; Pinned host memory: value 13 goes to the device, the host copy is then
;; overwritten with 11, and copying back must restore 13.
(facts
 "Pinned memory tests."
 (with-release [pinned (mem-alloc-pinned Float/BYTES :float :devicemap)
                dev-buf (mem-alloc-runtime Float/BYTES)]
   (mem-alloc-pinned Float/BYTES :unknown) => (throws ExceptionInfo)
   (bytesize pinned) => Float/BYTES
   (put-entry! pinned 0 13)
   (memcpy-host! pinned dev-buf) => dev-buf
   (put-entry! pinned 0 11)
   (memcpy! dev-buf pinned) => pinned
   (synchronize!)
   (get-entry pinned 0) => 13.0
   (pointer-seq (memcpy-host! dev-buf (float-pointer 1))) => [13.0]))
226 |
;; Mapped host memory used both as a copy source and as a copy destination.
(facts
 "Mapped memory tests."
 (with-release [mapped-src (mem-alloc-mapped Float/BYTES :float)
                dev-buf (mem-alloc-runtime Float/BYTES)
                mapped-dst (mem-alloc-mapped Float/BYTES :float)]
   (bytesize mapped-src) => Float/BYTES
   (put-entry! mapped-src 0 14.0)
   (memcpy-host! mapped-src dev-buf) => dev-buf
   (get-entry (memcpy-host! dev-buf (float-pointer 1)) 0) => 14.0
   (get-entry (memcpy! dev-buf mapped-dst)) => 14.0
   (synchronize!)
   (seq mapped-dst) => [14.0]))
239 |
;; Raw cuda-malloc pointers: host<->device copies, an offset device-to-device
;; copy via `ptr`, and cuda-free! called more than once on the same pointer.
(facts
 "CUDA Raw Runtime Pointer tests."
 (with-release [host-src (float-pointer [1 2 3 4])
                dev-four (cuda-malloc (* 4 Float/BYTES) :float)
                dev-three (cuda-malloc (* 3 Float/BYTES) :float)
                host-four (float-pointer 4)
                host-three (float-pointer 3)]
   (memcpy-to-device! host-src dev-four) => dev-four
   ;; Source starts at element 2, destination at element 1.
   (memcpy! (ptr dev-four 2) (ptr dev-three 1))
   (synchronize!)
   (pointer-seq (memcpy-to-host! dev-four host-four)) => [1.0 2.0 3.0 4.0]
   (pointer-seq (memcpy-to-host! dev-three host-three)) => [0.0 3.0 4.0]
   (cuda-free! dev-four) => dev-four
   (cuda-free! dev-four) => dev-four
   (cuda-free! dev-three) => dev-three))
255 |
;; Offsetting a raw device pointer by one element must shrink both its element
;; count and its byte size accordingly, for `pointer` and for `ptr` alike.
(facts
 "CUDA Raw Runtime Pointer arithmetic tests."
 (with-release [host-src (float-pointer [1 2 3 4])
                dev-buf (cuda-malloc (* 4 Float/BYTES) :float)]
   (memcpy-to-device! host-src dev-buf) => dev-buf
   (pointer dev-buf) => dev-buf
   (pointer dev-buf 0) => dev-buf
   (size (pointer dev-buf 1)) => (dec (size dev-buf))
   (bytesize (pointer dev-buf 1)) => (- (bytesize dev-buf) Float/BYTES)
   (size (ptr dev-buf 1)) => (dec (size dev-buf))
   (bytesize (ptr dev-buf 1)) => (- (bytesize dev-buf) Float/BYTES)
   (cuda-free! dev-buf) => dev-buf))
268 |
;; memset! on a raw cuda-malloc int buffer: one element is set at the pointer's
;; current position, first at position 1 and then at position 0.
(facts
 "cuda-malloc memset tests."
 (with-release [cuda1 (cuda-malloc (* 2 Integer/BYTES) :int)]
   (memcpy-to-device! (int-pointer [124 134]) cuda1) => cuda1
   (pointer-seq (memcpy-to-host! cuda1 (int-pointer 2))) => [124 134]
   ;; Position 1: only the second element is overwritten.
   (position! (pointer cuda1) 1)
   (memset! cuda1 (int 100) 1)
   ;; Rewind before reading back so that both elements are copied.
   (position! (pointer cuda1) 0)
   (pointer-seq (memcpy-to-host! cuda1 (int-pointer 2))) => [124 100]
   ;; Position 0: only the first element is overwritten.
   (memset! cuda1 (int 200) 1)
   (pointer-seq (memcpy-to-host! cuda1 (int-pointer 2))) => [200 100]))
280 |
;; Same memset! scenario as for raw pointers, but on a buffer obtained from
;; mem-alloc-runtime and copied with memcpy-host!.
(facts
 "cuda-alloc-runtime memset tests."
 (with-release [cuda1 (mem-alloc-runtime (* 2 Integer/BYTES) :int)]
   (memcpy-host! (int-pointer [124 134]) cuda1) => cuda1
   (pointer-seq (memcpy-host! cuda1 (int-pointer 2))) => [124 134]
   ;; Position 1: only the second element is overwritten.
   (position! (pointer cuda1) 1)
   (memset! cuda1 (int 100) 1)
   ;; Rewind before reading back so that both elements are copied.
   (position! (pointer cuda1) 0)
   (pointer-seq (memcpy-host! cuda1 (int-pointer 2))) => [124 100]
   ;; Position 0: only the first element is overwritten.
   (memset! cuda1 (int 200) 1)
   (pointer-seq (memcpy-host! cuda1 (int-pointer 2))) => [200 100]))
292 |
;; Runs only when the device reports both managed memory and concurrent
;; managed access.
(when (and (info/managed-memory dev) (info/concurrent-managed-access dev))
  (facts
   "mem-alloc-driver tests."
   (with-release [host-src (float-pointer [15])
                  host-dst (float-pointer 1)
                  managed-host (mem-alloc-driver Float/BYTES :host)
                  managed-global (mem-alloc-driver Float/BYTES :global)]

     (bytesize managed-host) => Float/BYTES
     (mem-alloc-driver Float/BYTES :unknown) => (throws ExceptionInfo)
     (memcpy-host! host-src managed-host) => managed-host
     (memcpy! managed-host managed-global) => managed-global
     (memcpy-host! managed-global host-dst) => host-dst
     (get-entry host-dst 0) => 15.0)))
307 |
;; Runs only when the device reports managed memory. Covers memory attached
;; with :global (no stream) and memory attached with :single, where attach-mem!
;; returns the stream that is then used for the copies.
(when (info/managed-memory dev)
  (facts
   "mem-alloc-driver with globally shared attached memory tests."
   (with-release [host0 (float-pointer [16])
                  host1 (float-pointer 1)
                  cuda0 (mem-alloc-driver Float/BYTES :host)
                  cuda1 (mem-alloc-driver Float/BYTES :global)]
     ;; With an explicit nil stream the call is expected to return nil.
     (attach-mem! nil cuda0 Float/BYTES :global) => nil
     (bytesize cuda0) => Float/BYTES
     (memcpy-host! host0 cuda0) => cuda0
     (memcpy! cuda0 cuda1) => cuda1
     (memcpy-host! cuda1 host1) => host1
     (get-entry host1 0) => 16.0))
  (facts
   "mem-alloc-driver with attached memory tests."
   (with-release [host0 (float-pointer [17])
                  host1 (float-pointer 1)
                  cuda0 (mem-alloc-driver Float/BYTES :host)
                  cuda1 (mem-alloc-driver Float/BYTES :global)]
     (let [hstream (attach-mem! cuda0 Float/BYTES :single)]
       (bytesize cuda0) => Float/BYTES
       ;; A copy without the attached stream is only expected to succeed when
       ;; the device supports concurrent managed access; otherwise it must throw.
       (if (info/concurrent-managed-access dev)
         (memcpy-host! host0 cuda0) => cuda0
         (memcpy-host! host0 cuda0) => (throws ExceptionInfo))
       (memcpy-host! host0 cuda0 hstream) => cuda0
       (memcpy! cuda0 cuda1 hstream) => cuda1
       (memcpy-host! cuda1 host1 hstream) => host1
       (synchronize! hstream)
       (get-entry host1 0) => 17.0))))
337 |
;; Registers two ordinary host byte buffers as pinned memory and copies a
;; float between them through the registered handles.
;; The trailing parens also close the enclosing with-context / with-release.
(facts
 "mem-alloc-registered tests."
 (with-release [host-src (byte-pointer Float/BYTES)
                host-dst (byte-pointer Float/BYTES)
                pinned-src (mem-register-pinned! host-src)
                pinned-dst (mem-register-pinned! host-dst)]

   (bytesize pinned-src) => Float/BYTES
   (put-float! host-src 0 44.0)
   (memcpy! pinned-src pinned-dst) => pinned-dst
   (get-float host-dst 0) => 44.0))))
349 |
350 | ;; ================= Peer Access Management Tests =====================================
351 |
;; Peer-to-peer access between two devices. The scenario only runs when some
;; pair of devices reports mutual peer access; otherwise nothing is asserted.
(facts
 "Peer access tests (requires 2 devices)."
 (let [num-dev (device-count)
       devices (mapv device (range num-dev))
       ;; Unordered pairs of distinct device indices.
       combinations (set (for [x (range num-dev) y (range num-dev) :when (not= x y)] #{x y}))
       ;; Returns [dev-a dev-b] when the pair supports access in both directions, else nil.
       p2p? (fn [num-pair]
              (let [[a b] (vec num-pair)
                    dev-a (nth devices a)
                    dev-b (nth devices b)]
                (when (and (p2p-attribute dev-a dev-b :access-supported)
                           (can-access-peer dev-a dev-b)
                           (can-access-peer dev-b dev-a))
                  [dev-a dev-b])))]
   (if-let [[dev-a dev-b] (some p2p? combinations)]
     ;; The kernel source lives in jnvrtc-vector-add.cu (see the test/cuda/examples tree).
     (let [program-source (slurp "test/cuda/examples/jnvrtc-vector-add.cu")
           ^:const vctr-len 3]
       (with-release [host-a (float-array [1 2 3])
                      host-b (float-array [2 3 4])
                      host-sum (float-array vctr-len)
                      ctx (context dev-a)
                      peer-ctx (context dev-b)]
         (in-context ctx
           (with-release [prog (compile! (program program-source))
                          m (module prog)
                          vector-add (function m "add")
                          gpu-a (mem-alloc-runtime (* Float/BYTES vctr-len))
                          gpu-a-sum (mem-alloc-runtime (* Float/BYTES vctr-len))
                          gpu-b (in-context peer-ctx (mem-alloc-runtime (* Float/BYTES vctr-len)))]
             ;; Disabling access that was never enabled must throw.
             (disable-peer-access! peer-ctx) => (throws ExceptionInfo)
             (in-context peer-ctx (disable-peer-access! ctx) => (throws ExceptionInfo))
             (memcpy-host! host-a gpu-a) => gpu-a
             (in-context peer-ctx (memcpy-host! host-b gpu-b) => gpu-b)
             (enable-peer-access! peer-ctx) => peer-ctx
             (in-context peer-ctx (enable-peer-access! ctx) => ctx)
             (launch! vector-add (grid-1d vctr-len) (parameters vctr-len gpu-a gpu-b gpu-a-sum))
             (synchronize!)
             ;; host-sum is a Java float array, which is never `=` to a seq;
             ;; compare its contents instead of the array object.
             (seq (memcpy-host! gpu-a-sum host-sum)) => (seq [3.0 5.0 7.0])
             (disable-peer-access! peer-ctx) => peer-ctx
             (in-context peer-ctx (disable-peer-access! ctx) => ctx))))
       ;; Still inside the peer-capable branch: a device is not its own peer.
       (when-let [dev (first devices)]
         (p2p-attribute dev dev :access-supported) => (throws ExceptionInfo)
         (can-access-peer dev dev) => false)))))
393 |
;; Single-device vector addition using runtime-allocated buffers and
;; float-pointer host data.
(facts
 "Runtime API Pointer kernel launch test"
 (let [gpu (device 0)
       kernel-source (slurp "test/cuda/examples/jnvrtc-vector-add.cu")
       ^:const n 3]
   (with-release [host-xs (float-pointer [1 2 3])
                  host-ys (float-pointer [2 3 4])
                  host-out (float-pointer n)
                  ctx (context gpu)]
     (in-context ctx
       (with-release [prog (compile! (program kernel-source))
                      add-module (module prog)
                      add-kernel (function add-module "add")
                      dev-xs (mem-alloc-runtime (* Float/BYTES n))
                      dev-out (mem-alloc-runtime (* Float/BYTES n))
                      dev-ys (mem-alloc-runtime (* Float/BYTES n))]
         (memcpy-host! host-xs dev-xs) => dev-xs
         (memcpy-host! host-ys dev-ys) => dev-ys
         (launch! add-kernel (grid-1d n) (parameters n dev-xs dev-ys dev-out))
         (synchronize!)
         (pointer-seq (memcpy! dev-out host-out)) => (seq [3.0 5.0 7.0]))))))
415 |
--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/examples/dynamic_parallelism_test.clj:
--------------------------------------------------------------------------------
1 | ;; Copyright (c) Dragan Djuric. All rights reserved.
2 | ;; The use and distribution terms for this software are covered by the
3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
4 | ;; which can be found in the file LICENSE at the root of this distribution.
5 | ;; By using this software in any fashion, you are agreeing to be bound by
6 | ;; the terms of this license.
7 | ;; You must not remove this notice, or any other, from this software.
8 |
9 | (ns uncomplicate.clojurecuda.examples.dynamic-parallelism-test
10 | (:require [midje.sweet :refer [facts =>]]
11 | [clojure.java.io :refer [file]]
12 | [uncomplicate.commons.core :refer [with-release]]
13 | [uncomplicate.clojure-cpp :refer [float-pointer pointer-seq]]
14 | [uncomplicate.clojurecuda.core
15 | :refer [compile! context device function grid-1d init launch! link link-complete!
16 | mem-alloc-runtime memcpy-host! module parameters program with-context]]))
17 |
(init)

;; Compiles the example with relocatable device code, links it with the CUDA
;; device runtime library, launches the parent kernel and checks the values
;; written by the child kernels.
(let [kernel-source (slurp "test/cuda/examples/dynamic-parallelism.cu")
      num-parent-threads 8
      num-child-threads 8
      num-elements (* num-parent-threads num-child-threads)]
  (with-context (context (device))
    (with-release [prog (compile! (program kernel-source)
                                  ["--relocatable-device-code=true" "-default-device"])
                   linked (link [[:library (file "/opt/cuda/lib64/libcudadevrt.a")]
                                 [:ptx prog]])
                   linked-module (module (link-complete! linked))
                   parent-kernel (function linked-module "parentKernel")
                   dev-data (mem-alloc-runtime (* Float/BYTES num-elements))]
      (facts
       "Dynamic parallelism JCuda example."
       (memcpy-host! (float-pointer num-elements) dev-data)
       (launch! parent-kernel
                (grid-1d (dec (+ num-elements num-elements)) num-parent-threads)
                (parameters num-elements dev-data))
       ;; Expected layout: element (8 * parent + child) holds parent + 0.1 * child.
       (pointer-seq (memcpy-host! dev-data (float-pointer num-elements)))
       => (map float [0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7
                      1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7
                      2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7
                      3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7
                      4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7
                      5.0 5.1 5.2 5.3 5.4 5.5 5.6 5.7
                      6.0 6.1 6.2 6.3 6.4 6.5 6.6 6.7
                      7.0 7.1 7.2 7.3 7.4 7.5 7.6 7.7])))))
46 |
--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/examples/vector_add_test.clj:
--------------------------------------------------------------------------------
1 | ;; Copyright (c) Dragan Djuric. All rights reserved.
2 | ;; The use and distribution terms for this software are covered by the
3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
4 | ;; which can be found in the file LICENSE at the root of this distribution.
5 | ;; By using this software in any fashion, you are agreeing to be bound by
6 | ;; the terms of this license.
7 | ;; You must not remove this notice, or any other, from this software.
8 |
9 | (ns uncomplicate.clojurecuda.examples.vector-add-test
10 | (:require [midje.sweet :refer [facts =>]]
11 | [uncomplicate.commons.core :refer [with-release size]]
12 | [uncomplicate.clojure-cpp :refer [float-pointer pointer-seq]]
13 | [uncomplicate.clojurecuda.core
14 | :refer [compile! context device function grid-1d init launch! mem-alloc-driver
15 | mem-alloc-pinned mem-alloc-runtime memcpy-host! module parameters program
16 | synchronize! with-context]]))
17 |
(init)

;; Vector addition where each device-side buffer comes from a different
;; allocator: runtime, driver and pinned.
(let [kernel-source (slurp "test/cuda/examples/jnvrtc-vector-add.cu")]
  (with-context (context (device))
    (with-release [prog (compile! (program kernel-source))
                   add-module (module prog)
                   add-kernel (function add-module "add")
                   host-xs (float-pointer [1 2 3])
                   host-ys (float-pointer [2 3 4])
                   host-out (float-pointer 3)
                   dev-xs (mem-alloc-runtime (* Float/BYTES 3))
                   dev-ys (mem-alloc-driver (* Float/BYTES 3))
                   dev-out (mem-alloc-pinned (* Float/BYTES 3))]
      (facts
       "Vector add JCuda example."
       (memcpy-host! host-xs dev-xs)
       (memcpy-host! host-ys dev-ys)
       (launch! add-kernel
                (grid-1d (size host-out))
                (parameters (size host-out) dev-xs dev-ys dev-out))
       (synchronize!)
       (pointer-seq (memcpy-host! dev-out host-out)) => (seq [3.0 5.0 7.0])))))
38 |
--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/info_test.clj:
--------------------------------------------------------------------------------
1 | ;; Copyright (c) Dragan Djuric. All rights reserved.
2 | ;; The use and distribution terms for this software are covered by the
3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
4 | ;; which can be found in the file LICENSE at the root of this distribution.
5 | ;; By using this software in any fashion, you are agreeing to be bound by
6 | ;; the terms of this license.
7 | ;; You must not remove this notice, or any other, from this software.
8 |
9 | (ns uncomplicate.clojurecuda.info-test
10 | (:require [midje.sweet :refer [facts =>]]
11 | [uncomplicate.commons.core :refer [with-release info]]
12 | [uncomplicate.clojurecuda
13 | [core :refer [compile! context device function init module program stream with-context]]
14 | [info :refer [driver-version limit limit! stream-flag]]]
15 | [uncomplicate.clojurecuda.internal.constants :refer [stream-flags]]))
16 |
(init)

;; The driver version must be a positive number.
(facts
 "Driver info tests."
 (driver-version) => pos?)

;; The info map of device 0 is expected to have exactly 83 entries.
(facts
 "Device info tests."
 (count (info (device 0))) => 83)
26 |
;; Context info size, plus a write/read round trip of the :stack-size limit.
(with-release [dev-ctx (context (device))]
  (facts
   "Context info tests."
   (count (info dev-ctx)) => 13
   (limit! :stack-size 512) => 512
   (limit :stack-size) => 512))
33 |
;; Stream info: entry count, the raw flag value, and the decoded :flag keyword.
(with-context (context (device))
  (with-release [hstream (stream :non-blocking)]
    (facts
     "Stream info tests."
     (count (info hstream)) => 2
     (stream-flag hstream) => (stream-flags :non-blocking)
     ;; The arrow must sit inside `facts`; placed after the closing parens it
     ;; was never evaluated as an assertion.
     (:flag (info hstream)) => :non-blocking)))
41 |
;; The info map of a compiled kernel function is expected to have 7 entries.
(let [kernel-source (slurp "test/cuda/uncomplicate/clojurecuda/kernels/test.cu")]
  (with-context (context (device))
    (with-release [prog (compile! (program kernel-source))
                   inc-module (module prog)
                   inc-kernel (function inc-module "inc")]
      (facts
       "function info tests."
       (count (info inc-kernel)) => 7))))
50 |
--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/toolbox_test.clj:
--------------------------------------------------------------------------------
1 | ;; Copyright (c) Dragan Djuric. All rights reserved.
2 | ;; The use and distribution terms for this software are covered by the
3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
4 | ;; which can be found in the file LICENSE at the root of this distribution.
5 | ;; By using this software in any fashion, you are agreeing to be bound by
6 | ;; the terms of this license.
7 | ;; You must not remove this notice, or any other, from this software.
8 |
9 | (ns uncomplicate.clojurecuda.toolbox-test
10 | (:require [midje.sweet :refer [facts => roughly]]
11 | [uncomplicate.commons
12 | [core :refer [with-release]]
13 | [utils :refer [count-groups]]]
14 | [uncomplicate.clojure-cpp :refer [float-pointer double-pointer pointer-seq]]
15 | [uncomplicate.clojurecuda
16 | [core :refer [compile! context device function init mem-alloc-runtime memcpy-host!
17 | module program with-context]]
18 | [info :refer [max-block-dim-x]]
19 | [toolbox :refer [launch-reduce! read-double]]]))
20 |
(init)

;; Sum reductions over the float sequence 0..cnt-1: one flat 1D reduction and
;; two 2D reductions (cnt-m x cnt-n) whose partial results are summed on the host.
(let [dev (device)
      cnt-m 311
      cnt-n 9011
      cnt (* cnt-m cnt-n)
      ;; The library reduction helpers are prepended to the test kernels.
      kernel-source (str (slurp "src/cuda/uncomplicate/clojurecuda/kernels/reduction.cu") "\n"
                         (slurp "test/cuda/uncomplicate/clojurecuda/kernels/toolbox-test.cu"))]

  (with-context (context dev)
    (with-release [wgs (max-block-dim-x dev)
                   prog (compile! (program kernel-source)
                                  ["-DREAL=float" "-DACCUMULATOR=double"
                                   (format "-DWGS=%d" wgs)])
                   modl (module prog)
                   host-data (float-pointer (range cnt))
                   dev-data (mem-alloc-runtime (* cnt Float/BYTES))
                   outer-reduction-horizontal (function modl "sum_reduction_horizontal")
                   outer-reduce-horizontal (function modl "sum_reduce_horizontal")]

      (memcpy-host! host-data dev-data)

      (let [acc-bytes (* Double/BYTES (max 1 (count-groups wgs cnt)))]
        (with-release [reduction-kernel (function modl "sum_reduction")
                       reduce-kernel (function modl "sum_reduce")
                       dev-acc (mem-alloc-runtime acc-bytes)]
          (facts
           "Test 1D reduction."
           (launch-reduce! nil reduce-kernel reduction-kernel [dev-acc dev-data] [dev-acc] cnt wgs)
           (read-double dev-acc) => 3926780329410.0)))

      (let [wgs-m 64
            wgs-n 16
            acc-bytes (* Double/BYTES (max 1 (* cnt-m (count-groups wgs-n cnt-n))))
            partials (double-pointer cnt-m)]
        (with-release [reduction-horizontal (function modl "sum_reduction_horizontal")
                       reduce-horizontal (function modl "sum_reduce_horizontal")
                       dev-acc (mem-alloc-runtime acc-bytes)]
          (facts
           "Test horizontal 2D reduction."
           (launch-reduce! nil reduce-horizontal reduction-horizontal
                           [dev-acc dev-data] [dev-acc] cnt-m cnt-n wgs-m wgs-n)
           (memcpy-host! dev-acc partials)
           (apply + (pointer-seq partials)) => (roughly 3.92678032941E12))))

      (let [wgs-m 64
            wgs-n 16
            acc-bytes (* Double/BYTES (max 1 (* cnt-n (count-groups wgs-m cnt-m))))
            partials (double-pointer cnt-n)]
        (with-release [reduction-vertical (function modl "sum_reduction_vertical")
                       reduce-vertical (function modl "sum_reduce_vertical")
                       dev-acc (mem-alloc-runtime acc-bytes)]
          (facts
           "Test vertical 2D reduction."
           (launch-reduce! nil reduce-vertical reduction-vertical
                           [dev-acc dev-data] [dev-acc] cnt-n cnt-m wgs-n wgs-m)
           (memcpy-host! dev-acc partials)
           (apply + (pointer-seq partials)) => (roughly 3.92678032941E12)))))))
79 |
--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/utils_test.clj:
--------------------------------------------------------------------------------
1 | ;; Copyright (c) Dragan Djuric. All rights reserved.
2 | ;; The use and distribution terms for this software are covered by the
3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
4 | ;; which can be found in the file LICENSE at the root of this distribution.
5 | ;; By using this software in any fashion, you are agreeing to be bound by
6 | ;; the terms of this license.
7 | ;; You must not remove this notice, or any other, from this software.
8 |
9 | (ns uncomplicate.clojurecuda.utils-test
10 | (:require [midje.sweet :refer [facts => throws]]
11 | [uncomplicate.clojurecuda.internal.utils :refer [error maybe with-check]]))
12 |
;; `error` builds an exception whose ex-data carries the numeric code, a name
;; (keyword when known, otherwise the raw code), optional details and :type :cuda.
(facts
 "error tests"

 (ex-data (error 0))
 => {:type :cuda, :name :success, :code 0, :details nil}

 (ex-data (error -43))
 => {:type :cuda, :name -43, :code -43, :details nil}

 (ex-data (error 0 "Additional details"))
 => {:type :cuda, :name :success, :code 0, :details "Additional details"})
24 |
;; `with-check` yields the supplied value for status 0 and throws for -1.
(facts
 "with-check tests"
 (let [status-of (fn [flag] (if flag 0 -1))]
   (with-check (status-of 1) :success) => :success
   (with-check (status-of false) :success) => (throws clojure.lang.ExceptionInfo)))
30 |
;; `maybe` must let a non-CUDA ExceptionInfo propagate; `error` always tags its
;; ex-data with :type :cuda.
(facts
 "maybe tests"
 (ex-data (maybe (throw (ex-info "Test Exception" {:data :test}))))
 => (throws clojure.lang.ExceptionInfo)

 (-> (error -1 nil) ex-data :type) => :cuda)
37 |
--------------------------------------------------------------------------------
/test/cuda/examples/dynamic-parallelism.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Created based on example from Marco Hutter:
3 | *
4 | * JCuda - Java bindings for NVIDIA CUDA
5 | *
6 | * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
7 | */
8 |
// Child kernel: each thread writes parentThreadIndex + 0.1 * (its own index)
// into the slot of `data` that matches its thread index.
extern "C"
__global__ void childKernel(unsigned int parentThreadIndex, float* data) {
    const unsigned int child = threadIdx.x;
    data[child] = parentThreadIndex + 0.1f * child;
}
13 |
// Parent kernel: every thread launches one 8-thread child grid on its own
// 8-element slice of `data`. The `size` argument is not used.
extern "C"
__global__ void parentKernel(unsigned int size, float *data) {
    const unsigned int parent = threadIdx.x;
    float* const slice = data + parent * 8;
    childKernel<<<1, 8>>>(parent, slice);
}
18 |
--------------------------------------------------------------------------------
/test/cuda/examples/jnvrtc-vector-add.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Created based on example from Marco Hutter:
3 | *
4 | * JCuda - Java bindings for NVIDIA CUDA
5 | *
6 | * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
7 | */
8 |
// Element-wise sum[i] = a[i] + b[i] for the first n elements; threads whose
// global index is n or larger do nothing.
extern "C"
__global__ void add(int n, float *a, float *b, float *sum) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;
    }
    sum[idx] = a[idx] + b[idx];
};
16 |
--------------------------------------------------------------------------------
/test/cuda/uncomplicate/clojurecuda/kernels/test.cu:
--------------------------------------------------------------------------------
extern "C" {

    // Adds 1 to each of the first n elements of a.
    __global__ void inc (int n, float* a) {
        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx >= n) {
            return;
        }
        a[idx] = a[idx] + 1;
    };

    // Device global looked up by name ("gpu_a") from the host tests.
    __device__ float gpu_a[] = {1.0, 2.0, 3.0};

    // Constant-memory global looked up by name ("constant_gpu_a"); filled by the host.
    __device__ __constant__ float constant_gpu_a[3];

    // Adds constant_gpu_a[i] to a[i] for the first n elements.
    __global__ void constant_inc (int n, float* a) {
        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx >= n) {
            return;
        }
        a[idx] = a[idx] + constant_gpu_a[idx];
    };

}
22 |
--------------------------------------------------------------------------------
/test/cuda/uncomplicate/clojurecuda/kernels/toolbox-test.cu:
--------------------------------------------------------------------------------
extern "C" {

    // 1D partial sum: thread 0 of every block stores the block's result in
    // acc[blockIdx.x]. Threads past n still take part, contributing 0.0.
    // block_reduction_sum is defined outside this file (see reduction.cu).
    __global__ void sum_reduce (const int n, ACCUMULATOR* acc, const REAL* x) {
        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
        const ACCUMULATOR partial = block_reduction_sum( (idx < n) ? x[idx] : 0.0);
        if (threadIdx.x == 0) {
            acc[blockIdx.x] = partial;
        }
    };

    // 2D partial sum reading a[m * y + x]; in-range threads with threadIdx.y == 0
    // store the result at acc[m * blockIdx.y + x].
    __global__ void sum_reduce_horizontal (const int m, const int n, ACCUMULATOR* acc, const REAL* a) {
        const int x_idx = blockIdx.x * blockDim.x + threadIdx.x;
        const int y_idx = blockIdx.y * blockDim.y + threadIdx.y;
        const bool in_range = (x_idx < m) && (y_idx < n);
        const int offset = m * y_idx + x_idx;
        // Every thread calls the reduction; out-of-range ones pass 0.0.
        const ACCUMULATOR partial = block_reduction_sum_2( (in_range) ? a[offset] : 0.0);
        if (in_range && (threadIdx.y == 0)) {
            acc[m * blockIdx.y + x_idx] = partial;
        }
    }

    // Same as the horizontal variant, except that the input is read from
    // a[n * x + y].
    __global__ void sum_reduce_vertical (const int m, const int n, ACCUMULATOR* acc, const REAL* a) {
        const int x_idx = blockIdx.x * blockDim.x + threadIdx.x;
        const int y_idx = blockIdx.y * blockDim.y + threadIdx.y;
        const bool in_range = (x_idx < m) && (y_idx < n);
        const int offset = n * x_idx + y_idx;
        // Every thread calls the reduction; out-of-range ones pass 0.0.
        const ACCUMULATOR partial = block_reduction_sum_2( (in_range) ? a[offset] : 0.0);
        if (in_range && (threadIdx.y == 0)) {
            acc[m * blockIdx.y + x_idx] = partial;
        }
    }
}
35 |
--------------------------------------------------------------------------------