├── .github └── FUNDING.yml ├── .gitignore ├── LICENSE ├── README.md ├── project.clj ├── src ├── clojure │ └── uncomplicate │ │ └── clojurecuda │ │ ├── core.clj │ │ ├── info.clj │ │ ├── internal │ │ ├── constants.clj │ │ ├── impl.clj │ │ └── utils.clj │ │ └── toolbox.clj ├── cuda │ └── uncomplicate │ │ └── clojurecuda │ │ ├── include │ │ └── jitify │ │ │ ├── LICENSE │ │ │ ├── float.h │ │ │ ├── stddef.h │ │ │ └── stdint.h │ │ └── kernels │ │ └── reduction.cu └── java │ └── uncomplicate │ └── clojurecuda │ └── internal │ └── javacpp │ ├── CUHostFn.java │ └── CUStreamCallback.java └── test ├── clojure └── uncomplicate │ └── clojurecuda │ ├── core_test.clj │ ├── examples │ ├── dynamic_parallelism_test.clj │ └── vector_add_test.clj │ ├── info_test.clj │ ├── toolbox_test.clj │ └── utils_test.clj └── cuda ├── examples ├── dynamic-parallelism.cu └── jnvrtc-vector-add.cu └── uncomplicate └── clojurecuda └── kernels ├── test.cu └── toolbox-test.cu /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: draganrocks 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /lib 3 | 
/classes 4 | /checkouts 5 | pom.xml 6 | pom.xml.asc 7 | *.jar 8 | *.class 9 | /.lein-* 10 | /.nrepl-port 11 | doc 12 | docs 13 | hs_*.log 14 | .#* 15 | .DS_Store 16 | *.o 17 | *.so 18 | */nrepl-port 19 | */target 20 | .idea 21 | /*.iml 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC 2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM 3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 4 | 5 | 1. DEFINITIONS 6 | 7 | "Contribution" means: 8 | 9 | a) in the case of the initial Contributor, the initial code and 10 | documentation distributed under this Agreement, and 11 | 12 | b) in the case of each subsequent Contributor: 13 | 14 | i) changes to the Program, and 15 | 16 | ii) additions to the Program; 17 | 18 | where such changes and/or additions to the Program originate from and are 19 | distributed by that particular Contributor. A Contribution 'originates' from 20 | a Contributor if it was added to the Program by such Contributor itself or 21 | anyone acting on such Contributor's behalf. Contributions do not include 22 | additions to the Program which: (i) are separate modules of software 23 | distributed in conjunction with the Program under their own license 24 | agreement, and (ii) are not derivative works of the Program. 25 | 26 | "Contributor" means any person or entity that distributes the Program. 27 | 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are 29 | necessarily infringed by the use or sale of its Contribution alone or when 30 | combined with the Program. 31 | 32 | "Program" means the Contributions distributed in accordance with this 33 | Agreement. 34 | 35 | "Recipient" means anyone who receives the Program under this Agreement, 36 | including all Contributors. 37 | 38 | 2. 
GRANT OF RIGHTS 39 | 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to 42 | reproduce, prepare derivative works of, publicly display, publicly perform, 43 | distribute and sublicense the Contribution of such Contributor, if any, and 44 | such derivative works, in source code and object code form. 45 | 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise 49 | transfer the Contribution of such Contributor, if any, in source code and 50 | object code form. This patent license shall apply to the combination of the 51 | Contribution and the Program if, at the time the Contribution is added by the 52 | Contributor, such addition of the Contribution causes such combination to be 53 | covered by the Licensed Patents. The patent license shall not apply to any 54 | other combinations which include the Contribution. No hardware per se is 55 | licensed hereunder. 56 | 57 | c) Recipient understands that although each Contributor grants the licenses 58 | to its Contributions set forth herein, no assurances are provided by any 59 | Contributor that the Program does not infringe the patent or other 60 | intellectual property rights of any other entity. Each Contributor disclaims 61 | any liability to Recipient for claims brought by any other entity based on 62 | infringement of intellectual property rights or otherwise. As a condition to 63 | exercising the rights and licenses granted hereunder, each Recipient hereby 64 | assumes sole responsibility to secure any other intellectual property rights 65 | needed, if any. 
For example, if a third party patent license is required to 66 | allow Recipient to distribute the Program, it is Recipient's responsibility 67 | to acquire that license before distributing the Program. 68 | 69 | d) Each Contributor represents that to its knowledge it has sufficient 70 | copyright rights in its Contribution, if any, to grant the copyright license 71 | set forth in this Agreement. 72 | 73 | 3. REQUIREMENTS 74 | 75 | A Contributor may choose to distribute the Program in object code form under 76 | its own license agreement, provided that: 77 | 78 | a) it complies with the terms and conditions of this Agreement; and 79 | 80 | b) its license agreement: 81 | 82 | i) effectively disclaims on behalf of all Contributors all warranties and 83 | conditions, express and implied, including warranties or conditions of title 84 | and non-infringement, and implied warranties or conditions of merchantability 85 | and fitness for a particular purpose; 86 | 87 | ii) effectively excludes on behalf of all Contributors all liability for 88 | damages, including direct, indirect, special, incidental and consequential 89 | damages, such as lost profits; 90 | 91 | iii) states that any provisions which differ from this Agreement are offered 92 | by that Contributor alone and not by any other party; and 93 | 94 | iv) states that source code for the Program is available from such 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on 96 | or through a medium customarily used for software exchange. 97 | 98 | When the Program is made available in source code form: 99 | 100 | a) it must be made available under this Agreement; and 101 | 102 | b) a copy of this Agreement must be included with each copy of the Program. 103 | 104 | Contributors may not remove or alter any copyright notices contained within 105 | the Program. 
106 | 107 | Each Contributor must identify itself as the originator of its Contribution, 108 | if any, in a manner that reasonably allows subsequent Recipients to identify 109 | the originator of the Contribution. 110 | 111 | 4. COMMERCIAL DISTRIBUTION 112 | 113 | Commercial distributors of software may accept certain responsibilities with 114 | respect to end users, business partners and the like. While this license is 115 | intended to facilitate the commercial use of the Program, the Contributor who 116 | includes the Program in a commercial product offering should do so in a 117 | manner which does not create potential liability for other Contributors. 118 | Therefore, if a Contributor includes the Program in a commercial product 119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend 120 | and indemnify every other Contributor ("Indemnified Contributor") against any 121 | losses, damages and costs (collectively "Losses") arising from claims, 122 | lawsuits and other legal actions brought by a third party against the 123 | Indemnified Contributor to the extent caused by the acts or omissions of such 124 | Commercial Contributor in connection with its distribution of the Program in 125 | a commercial product offering. The obligations in this section do not apply 126 | to any claims or Losses relating to any actual or alleged intellectual 127 | property infringement. In order to qualify, an Indemnified Contributor must: 128 | a) promptly notify the Commercial Contributor in writing of such claim, and 129 | b) allow the Commercial Contributor tocontrol, and cooperate with the 130 | Commercial Contributor in, the defense and any related settlement 131 | negotiations. The Indemnified Contributor may participate in any such claim 132 | at its own expense. 133 | 134 | For example, a Contributor might include the Program in a commercial product 135 | offering, Product X. That Contributor is then a Commercial Contributor. 
If 136 | that Commercial Contributor then makes performance claims, or offers 137 | warranties related to Product X, those performance claims and warranties are 138 | such Commercial Contributor's responsibility alone. Under this section, the 139 | Commercial Contributor would have to defend claims against the other 140 | Contributors related to those performance claims and warranties, and if a 141 | court requires any other Contributor to pay any damages as a result, the 142 | Commercial Contributor must pay those damages. 143 | 144 | 5. NO WARRANTY 145 | 146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON 147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER 148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR 149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A 150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the 151 | appropriateness of using and distributing the Program and assumes all risks 152 | associated with its exercise of rights under this Agreement , including but 153 | not limited to the risks and costs of program errors, compliance with 154 | applicable laws, damage to or loss of data, programs or equipment, and 155 | unavailability or interruption of operations. 156 | 157 | 6. DISCLAIMER OF LIABILITY 158 | 159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY 160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, 161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION 162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE 165 | nEXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY 166 | OF SUCH DAMAGES. 167 | 168 | 7. 
GENERAL 169 | 170 | If any provision of this Agreement is invalid or unenforceable under 171 | applicable law, it shall not affect the validity or enforceability of the 172 | remainder of the terms of this Agreement, and without further action by the 173 | parties hereto, such provision shall be reformed to the minimum extent 174 | necessary to make such provision valid and enforceable. 175 | 176 | If Recipient institutes patent litigation against any entity (including a 177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself 178 | (excluding combinations of the Program with other software or hardware) 179 | infringes such Recipient's patent(s), then such Recipient's rights granted 180 | under Section 2(b) shall terminate as of the date such litigation is filed. 181 | 182 | All Recipient's rights under this Agreement shall terminate if it fails to 183 | comply with any of the material terms or conditions of this Agreement and 184 | does not cure such failure in a reasonable period of time after becoming 185 | aware of such noncompliance. If all Recipient's rights under this Agreement 186 | terminate, Recipient agrees to cease use and distribution of the Program as 187 | soon as reasonably practicable. However, Recipient's obligations under this 188 | Agreement and any licenses granted by Recipient relating to the Program shall 189 | continue and survive. 190 | 191 | Everyone is permitted to copy and distribute copies of this Agreement, but in 192 | order to avoid inconsistency the Agreement is copyrighted and may only be 193 | modified in the following manner. The Agreement Steward reserves the right to 194 | publish new versions (including revisions) of this Agreement from time to 195 | time. No one other than the Agreement Steward has the right to modify this 196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. 
The 197 | Eclipse Foundation may assign the responsibility to serve as the Agreement 198 | Steward to a suitable separate entity. Each new version of the Agreement will 199 | be given a distinguishing version number. The Program (including 200 | Contributions) may always be distributed subject to the version of the 201 | Agreement under which it was received. In addition, after a new version of 202 | the Agreement is published, Contributor may elect to distribute the Program 203 | (including its Contributions) under the new version. Except as expressly 204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or 205 | licenses to the intellectual property of any Contributor under this 206 | Agreement, whether expressly, by implication, estoppel or otherwise. All 207 | rights in the Program not expressly granted under this Agreement are 208 | reserved. 209 | 210 | This Agreement is governed by the laws of the State of New York and the 211 | intellectual property laws of the United States of America. No party to this 212 | Agreement will bring a legal action under this Agreement more than one year 213 | after the cause of action arose. Each party waives its rights to a jury trial 214 | in any resulting litigation. 215 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [New books available for subscription](https://aiprobook.com) 2 | 3 | Deep Learning for Programmers 4 | 5 | Numerical Linear Algebra for Programmers 6 | 7 | # ClojureCUDA 8 | 9 | [Adopt your pet function](https://dragan.rocks/articles/18/Patreon-Announcement-Adopt-a-Function) and [become a patron](https://patreon.com/draganrocks). 10 | 11 | Clojure library for CUDA development. See the documentation at [ClojureCUDA website](https://clojurecuda.uncomplicate.org). 
12 | 13 | ## License 14 | 15 | Copyright © 2017-2019 Dragan Djuric 16 | 17 | Distributed under the Eclipse Public License either version 1.0 or (at your option) any later version. 18 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | ;; Copyright (c) Dragan Djuric. All rights reserved. 2 | ;; The use and distribution terms for this software are covered by the 3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later 4 | ;; which can be found in the file LICENSE at the root of this distribution. 5 | ;; By using this software in any fashion, you are agreeing to be bound by 6 | ;; the terms of this license. 7 | ;; You must not remove this notice, or any other, from this software. 8 | 9 | (defproject uncomplicate/clojurecuda "0.21.1-SNAPSHOT" 10 | :description "ClojureCUDA is a Clojure library for parallel computations with Nvidia's CUDA." 
11 | :url "https://github.com/uncomplicate/clojurecuda" 12 | :scm {:name "git" 13 | :url "https://github.com/uncomplicate/clojurecuda"} 14 | :license {:name "Eclipse Public License" 15 | :url "http://www.eclipse.org/legal/epl-v10.html"} 16 | :dependencies [[org.clojure/clojure "1.12.0"] 17 | [org.clojure/core.async "1.7.701"] 18 | [uncomplicate/commons "0.16.1"] 19 | [uncomplicate/fluokitten "0.10.0"] 20 | [org.uncomplicate/clojure-cpp "0.4.1-SNAPSHOT"] 21 | [org.bytedeco/cuda-platform "12.8-9.8-1.5.12-SNAPSHOT"]] 22 | 23 | :profiles {:dev [:dev/all ~(leiningen.core.utils/get-os)] 24 | :dev/all {:plugins [[lein-midje "3.2.1"] 25 | [lein-codox "0.10.8"] 26 | [com.github.clj-kondo/lein-clj-kondo "0.2.5"]] 27 | :global-vars {*warn-on-reflection* true 28 | *assert* true 29 | *unchecked-math* :warn-on-boxed 30 | *print-length* 128} 31 | :dependencies [[midje "1.10.10"] 32 | [codox-theme-rdash "0.1.2"]] 33 | :codox {:metadata {:doc/format :markdown} 34 | :source-uri "http://github.com/uncomplicate/clojurecuda/blob/master/{filepath}#L{line}" 35 | :output-path "docs/codox" 36 | :themes [:rdash] 37 | :namespaces [uncomplicate.clojurecuda.core 38 | uncomplicate.clojurecuda.info 39 | uncomplicate.clojurecuda.toolbox 40 | uncomplicate.clojurecuda.internal.constants]}} 41 | :linux {:dependencies [[org.bytedeco/cuda "12.8-9.8-1.5.12-SNAPSHOT" :classifier linux-x86_64-redist]]} 42 | :windows {:dependencies [[org.bytedeco/cuda "12.9-9.9-1.5.12-SNAPSHOT" :classifier windows-x86_64-redist]]}} 43 | 44 | :repositories [["snapshots" "https://oss.sonatype.org/content/repositories/snapshots"]] 45 | 46 | :javac-options ["-target" "1.8" "-source" "1.8" "-Xlint:-options"] 47 | 48 | :source-paths ["src/clojure" "src/cuda"] 49 | :test-paths ["test/clojure" "test/cuda"] 50 | :java-source-paths ["src/java"]) 51 | -------------------------------------------------------------------------------- /src/clojure/uncomplicate/clojurecuda/core.clj: 
-------------------------------------------------------------------------------- 1 | ;; Copyright (c) Dragan Djuric. All rights reserved. 2 | ;; The use and distribution terms for this software are covered by the 3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later 4 | ;; which can be found in the file LICENSE at the root of this distribution. 5 | ;; By using this software in any fashion, you are agreeing to be bound by 6 | ;; the terms of this license. 7 | ;; You must not remove this notice, or any other, from this software. 8 | 9 | (ns ^{:author "Dragan Djuric"} 10 | uncomplicate.clojurecuda.core 11 | "Core ClojureCUDA functions for CUDA **host** programming. The kernels should 12 | be provided as strings (that may be stored and read from files) or binaries, written in CUDA C/C++. 13 | 14 | Many examples are available in ClojureCUDA [core test](https://github.com/uncomplicate/clojurecuda/blob/master/test/clojure/uncomplicate/clojurecuda/core_test.clj). 15 | You can see how to write CUDA [kernels here](https://github.com/uncomplicate/clojurecuda/tree/master/test/cuda/examples) 16 | and [here](https://github.com/uncomplicate/clojurecuda/tree/master/test/cuda/uncomplicate/clojurecuda/kernels) 17 | and examples of [how to load them here](https://github.com/uncomplicate/clojurecuda/tree/master/test/clojure/uncomplicate/clojurecuda/examples/). 18 | 19 | For more advanced examples, please read the source code of the CUDA engine of [Neanderthal linear algebra library](https://github.com/uncomplicate/neanderthal) (mainly general CUDA and cuBLAS are used there), 20 | and the [Deep Diamond tensor and linear algebra library](https://github.com/uncomplicate/deep-diamond) (for extensive use of cuDNN). 21 | 22 | Here's a categorized map of core functions. Most functions throw `ExceptionInfo` in case of errors 23 | thrown by the CUDA driver. 24 | 25 | - Device management: [[init]], [[device-count]], [[device]]. 
26 | - Context management: [[context]], [[current-context]], [[current-context!]], [[pop-context!]], 27 | [[push-context!]], [[in-context]], [[with-context]], [[with-default]]. 28 | - Memory management: [[memcpy!]], [[memcpy-to-host!]], [[memcpy-to-device!]], [[memset!]], 29 | [[mem-sub-region]], [[mem-alloc-driver]], [[mem-alloc-runtime]], [[cuda-malloc]], [[cuda-free!]], 30 | [[mem-alloc-pinned]], [[mem-register-pinned!]], [[mem-alloc-mapped]]. 31 | - Module management: [[link]], [[link-complete!]], [[load!]], [[module]]. 32 | - Execution control: [[grid-1d]], [[grid-2d]], [[grid-3d]], [[global]], [[set-parameter!]], 33 | [[parameters]], [[function]], [[launch!]]. 34 | - Stream management: [[stream]], [[default-stream]], [[ready?]], [[synchronize!]], 35 | [[add-host-fn!]], [[listen!]], [[wait-event!]], [[attach-mem!]]. 36 | - Event management: [[event]], [[elapsed-time!]], [[record!]], [[can-access-peer]], 37 | [[p2p-attribute]], [[disable-peer-access!]], [[enable-peer-access!]]. 38 | - NVRTC program JIT: [[program]], [[program-log]], [[compile!]], [[ptx]]. 39 | 40 | Please see [CUDA Driver API](https://docs.nvidia.com/cuda/pdf/CUDA_Driver_API.pdf) for details 41 | not discussed in ClojureCUDA documentation. 42 | " 43 | (:require [uncomplicate.commons 44 | [core :refer [with-release let-release info bytesize sizeof size]] 45 | [utils :refer [mask count-groups dragan-says-ex]]] 46 | [uncomplicate.fluokitten.protocols :refer [extract]] 47 | [uncomplicate.clojure-cpp 48 | :refer [null? pointer byte-pointer string-pointer int-pointer long-pointer 49 | size-t-pointer pointer-pointer get-entry put-entry! safe type-pointer position! 50 | capacity! 
address]] 51 | [uncomplicate.clojurecuda.info :as cuda-info] 52 | [uncomplicate.clojurecuda.internal 53 | [constants :refer [ctx-flags event-flags mem-attach-flags mem-host-alloc-flags 54 | mem-host-register-flags p2p-attributes stream-flags]] 55 | [impl :refer [->CUDevice ->CUDevicePtr add-host-fn* attach-mem* can-access-peer* 56 | compile* context* cu-address* current-context* event* host-fn* link* 57 | malloc-runtime* mem-alloc-host* mem-alloc-managed* mem-host-alloc* 58 | mem-host-register* memcpy* memcpy-host* memset* module-load* offset 59 | p2p-attribute* program* program-log* ptx* ready* set-parameter* stream*]] 60 | [utils :refer [with-check]]]) 61 | (:import [org.bytedeco.javacpp Pointer LongPointer SizeTPointer PointerPointer] 62 | org.bytedeco.cuda.global.cudart 63 | [org.bytedeco.cuda.cudart CUctx_st CUlinkState_st CUmod_st CUfunc_st CUstream_st CUevent_st])) 64 | 65 | (defn init 66 | "Initializes the CUDA driver. This function must be called before any other function 67 | from ClojureCUDA in the current process. 68 | See [CUDA Initialization](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html) 69 | " 70 | [] 71 | (with-check (cudart/cuInit 0) true)) 72 | 73 | ;; ================== Device Management ==================================== 74 | 75 | (defn device-count 76 | "Returns the number of CUDA devices on the system. 77 | See [CUDA Device Management](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html). 78 | " 79 | ^long [] 80 | (let [res (int-pointer 1)] 81 | (with-check (cudart/cuDeviceGetCount res) (get-entry res 0)))) 82 | 83 | (defn device 84 | "Returns a device specified with its ordinal number `id` or string PCI Bus `id`. 85 | See [CUDA Device Management](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html). 86 | " 87 | ([id] 88 | (with-release [res (int-pointer 1)] 89 | (with-check 90 | (if (number? 
id) 91 | (cudart/cuDeviceGet res (long id)) 92 | (cudart/cuDeviceGetByPCIBusId res ^String id)) 93 | {:device-id id} 94 | (->CUDevice (get-entry res 0))))) 95 | ([] 96 | (device 0))) 97 | 98 | ;; =================== Context Management ================================== 99 | 100 | (defn context 101 | "Creates a CUDA context on the `device` using a keyword `flag`. 102 | For available flags, see [[internal.constants/ctx-flags]]. The default is none. 103 | The context must be released after use. 104 | 105 | See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html). 106 | " 107 | ([dev flag] 108 | (context* (extract dev) 109 | (or (ctx-flags flag) 110 | (throw (ex-info "Unknown context flag." {:flag flag :available ctx-flags}))))) 111 | ([dev] 112 | (context* (extract dev) 0))) 113 | 114 | (defn current-context 115 | "Returns the CUDA context bound to the calling CPU thread. 116 | See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html). 117 | " 118 | [] 119 | (current-context*)) 120 | 121 | (defn current-context! 122 | "Binds the specified CUDA context `ctx` to the calling CPU thread. 123 | See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html). 124 | " 125 | [ctx] 126 | (current-context* ctx) 127 | ctx) 128 | 129 | (defn pop-context! 130 | "Pops the current CUDA context `ctx` from the current CPU thread. 131 | See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html). 132 | " 133 | [] 134 | (let [ctx (CUctx_st.)] 135 | (with-check (cudart/cuCtxPopCurrent ctx) ctx))) 136 | 137 | (defn push-context! 138 | "Pushes a context `ctx` on the current CPU thread. 139 | See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html). 
140 | " 141 | [^CUctx_st ctx] 142 | (with-check (cudart/cuCtxPushCurrent ctx) ctx)) 143 | 144 | (defmacro in-context 145 | "Pushes the context `ctx` to the top of the context stack, evaluates the body with `ctx` 146 | as the current context, and pops the context from the stack. 147 | Does NOT release the context, unlike [[with-context]]. The context is popped only if it was successfully pushed. 148 | See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html). 149 | " 150 | [ctx & body] 151 | `(do (push-context! ~ctx) 152 | (try 153 | ~@body 154 | (finally (pop-context!))))) 155 | 156 | (defmacro with-context 157 | "Pushes the context `ctx` to the top of the context stack, evaluates the body, and pops the context 158 | from the stack. Releases the context, unlike [[in-context]]. 159 | See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html). 160 | " 161 | [ctx & body] 162 | `(with-release [ctx# ~ctx] 163 | (in-context ctx# ~@body))) 164 | 165 | (defmacro with-default 166 | "Initializes CUDA, creates the default context and executes the body in it. 167 | See [CUDA Context Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html). 168 | " 169 | [& body] 170 | `(do 171 | (init) 172 | (with-release [dev# (device)] 173 | (with-context (context dev#) 174 | ~@body)))) 175 | 176 | ;; ================== Memory Management ============================================== 177 | 178 | (defn ^:private check-size [ptr ^long offset ^long byte-count] 179 | (when-not (<= 0 offset (+ offset byte-count) (bytesize ptr)) 180 | (dragan-says-ex "Requested bytes are out of the bounds of this device pointer." 181 | {:offset offset :requested byte-count :available (bytesize ptr)}))) 182 | 183 | (defn memcpy! 184 | "Copies `byte-count` or maximum available device memory from `src` to `dst`. 185 | TODO mapped, pinned 186 | If `hstream` is provided, executes asynchronously. 
187 | See [CUDA Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html) 188 | " 189 | ([src dst] 190 | (memcpy! src dst (min (bytesize src) (bytesize dst)) nil)) 191 | ([src dst byte-count-or-stream] 192 | (if (number? byte-count-or-stream) 193 | (do (check-size src 0 byte-count-or-stream) 194 | (check-size dst 0 byte-count-or-stream) 195 | (memcpy* dst src byte-count-or-stream nil)) 196 | (memcpy! src dst (min (bytesize src) (bytesize dst)) byte-count-or-stream)) 197 | dst) 198 | ([src dst ^long byte-count hstream] 199 | (check-size src 0 byte-count) 200 | (check-size dst 0 byte-count) 201 | (memcpy* dst src byte-count hstream) 202 | dst)) 203 | 204 | (defn memcpy-to-host! 205 | "Copies `byte-count` or maximum available memory from device `src` to host `dst`. Useful when `src` 206 | or `dst` is a generic pointer for which it cannot be determined whether it manages memory on host 207 | or on device (see [[cuda-malloc]]). 208 | If `hstream` is provided, executes asynchronously. 209 | See [CUDA Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html) 210 | " 211 | ([^Pointer src ^Pointer dst ^long byte-count hstream] 212 | (check-size src 0 byte-count) 213 | (check-size dst 0 byte-count) 214 | (with-check 215 | (if hstream 216 | (cudart/cuMemcpyDtoHAsync (extract dst) (address (extract src)) byte-count hstream) 217 | (cudart/cuMemcpyDtoH (extract dst) (address (extract src)) byte-count)) 218 | dst)) 219 | ([src dst count-or-stream] 220 | (if (integer? count-or-stream) 221 | (memcpy-to-host! src dst count-or-stream nil) 222 | (memcpy-to-host! src dst (min (bytesize src) (bytesize dst)) count-or-stream)) 223 | dst) 224 | ([src dst] 225 | (memcpy-to-host! src dst (min (bytesize src) (bytesize dst))) 226 | dst)) 227 | 228 | (defn memcpy-to-device! 229 | "Copies `byte-count` or all possible memory from host `src` to device `dst`. 
Useful when `src` or 230 | `dst` is a generic pointer for which it cannot be determined whether it manages memory on host or 231 | on device (see [[cuda-malloc]]). 232 | If `hstream` is provided, executes asynchronously. 233 | See [CUDA Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html) 234 | " 235 | ([^Pointer src ^Pointer dst ^long byte-count hstream] 236 | (check-size src 0 byte-count) 237 | (check-size dst 0 byte-count) 238 | (with-check 239 | (if hstream 240 | (cudart/cuMemcpyHtoDAsync (address (extract dst)) (extract src) byte-count hstream) 241 | (cudart/cuMemcpyHtoD (address (extract dst)) (extract src) byte-count)) 242 | dst)) 243 | ([src dst count-or-stream] 244 | (if (integer? count-or-stream) 245 | (memcpy-to-device! src dst count-or-stream nil) 246 | (memcpy-to-device! src dst (min (bytesize src) (bytesize dst)) count-or-stream)) 247 | dst) 248 | ([src dst] 249 | (memcpy-to-device! src dst (min (bytesize src) (bytesize dst))) 250 | dst)) 251 | 252 | (defn memcpy-host! 253 | "Copies `byte-count` or all possible memory from `src` to `dst`, one of which 254 | has to be accessible from the host. If `hstream` is provided, executes asynchronously. 255 | A polymorphic function that figures out what needs to be done. Supports everything 256 | except pointers created by [[cuda-malloc]]. 257 | See [CUDA Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html) 258 | " 259 | ([src dst ^long byte-count hstream] 260 | (check-size src 0 byte-count) 261 | (check-size dst 0 byte-count) 262 | (if hstream 263 | (memcpy-host* dst src byte-count hstream) 264 | (memcpy-host* dst src byte-count)) 265 | dst) 266 | ([src dst count-or-stream] 267 | (if (integer? count-or-stream) 268 | (memcpy-host! 
src dst count-or-stream nil) 269 | (memcpy-host* dst src (min (bytesize src) (bytesize dst)) count-or-stream)) 270 | dst) 271 | ([src dst] 272 | (memcpy-host* dst src (min (bytesize src) (bytesize dst))) 273 | dst)) 274 | 275 | (defn memset! 276 | "Sets `n` elements or all segments of `dptr` memory to `value` (supports all Java primitive number 277 | types except `double`, and `long` with value larger than `Integer/MAX_VALUE`). If `hstream` is 278 | provided, executes asynchronously. 279 | See [CUDA Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html) 280 | " 281 | ([dptr value] 282 | (memset* value (cu-address* dptr) (quot (bytesize dptr) (sizeof value))) 283 | dptr) 284 | ([dptr value n-or-hstream] 285 | (if (integer? n-or-hstream) 286 | (do (check-size dptr 0 (* (sizeof value) (long n-or-hstream))) 287 | (memset* value (cu-address* dptr) n-or-hstream)) 288 | (memset* value (cu-address* dptr) (quot (bytesize dptr) (sizeof value)) n-or-hstream)) 289 | dptr) 290 | ([dptr value ^long n hstream] 291 | (if hstream 292 | (do (check-size dptr 0 (* (sizeof value) n)) 293 | (memset* value (cu-address* dptr) n hstream)) 294 | (memset! dptr value n)) 295 | dptr)) 296 | 297 | ;; ==================== Driver-managed device memory =============================================== 298 | 299 | (defn mem-sub-region 300 | "Creates CUDA device memory object that references a sub-region of `mem` from `origin` 301 | to `byte-count`, or maximum available byte size. 302 | " 303 | ([mem ^long origin ^long byte-count] 304 | (check-size mem origin byte-count) 305 | (let-release [sub-dptr (long-pointer 1)] 306 | (->CUDevicePtr (put-entry! 
sub-dptr 0 (offset mem origin)) byte-count false))) 307 | ([mem ^long origin] 308 | (mem-sub-region mem origin (bytesize mem)))) 309 | 310 | (defn mem-alloc-driver 311 | "Allocates the `byte-size` bytes of uninitialized memory that will be automatically managed by the 312 | Unified Memory system, specified by a keyword `flag`. For available flags, see [[internal.constants/mem-attach-flags]]. 313 | Returns a CUDA device memory object, which can NOT be extracted as a `Pointer`, but can be accessed 314 | directly through its address in the device memory. 315 | See [CUDA Driver API Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html) 316 | " 317 | ([^long byte-size flag] 318 | (mem-alloc-managed* (max 0 byte-size) 319 | (or (mem-attach-flags flag) 320 | (throw (ex-info "Unknown mem-attach flag." 321 | {:flag flag :available mem-attach-flags}))))) 322 | ([^long byte-size] 323 | (mem-alloc-managed* byte-size cudart/CU_MEM_ATTACH_GLOBAL))) 324 | 325 | ;; =================== Runtime API Memory ================================================ 326 | 327 | (defn mem-alloc-runtime 328 | "Allocates the `byte-size` bytes of uninitialized memory that will be automatically managed by the 329 | Unified Memory system. Returns a CUDA device memory object managed by the CUDA runtime API, which 330 | can be extracted as a `Pointer`. Equivalent unwrapped `Pointer` can be created by [[cuda-malloc]]. 331 | See [CUDA Runtime API Memory Management](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html) 332 | " 333 | ([^long byte-size type] ;;TODO functions that receive type should accept size instead of bytesize 334 | (if-let [t (type-pointer type)] 335 | (malloc-runtime* (max 0 byte-size) t) 336 | (throw (ex-info (format "Unknown data type: %s." 
(str type)) {})))) 337 | ([^long byte-size] 338 | (malloc-runtime* (max 0 byte-size)))) 339 | 340 | (defn cuda-malloc 341 | "Returns a `Pointer` to `byte-size` bytes of uninitialized memory that will be automatically 342 | managed by the Unified Memory system. The pointer is managed by the CUDA runtime API. 343 | Optionally, accepts a `type` of the pointer as a keyword (`:float` or `Float/TYPE` for 344 | `FloatPointer`, etc.). 345 | This pointer has to be manually released by [[cuda-free!]]. For a more seamless experience, 346 | use the wrapper provided by the [[mem-alloc-runtime]] function. 347 | See [CUDA Runtime API Memory Management](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html) 348 | " 349 | ([^long byte-size] 350 | (let [byte-size (max 0 byte-size)] 351 | (let-release [p (byte-pointer nil)] 352 | (with-check (cudart/cudaMalloc p byte-size) (capacity! p byte-size))))) 353 | ([^long byte-size type] 354 | (if-let [pt (type-pointer type)] 355 | (let [byte-size (max 0 byte-size)] 356 | (let-release [p (byte-pointer nil)] 357 | (with-check (cudart/cudaMalloc p byte-size) (pt (capacity! p byte-size))))) 358 | (throw (ex-info (format "Unknown data type: %s." (str type)) {}))))) 359 | 360 | (defn cuda-free! 361 | "Frees the runtime device memory that has been created by [[cuda-malloc]]. 362 | See [CUDA Runtime API Memory Management](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html) 363 | " 364 | [^Pointer dptr] 365 | (when-not (null? dptr) 366 | (with-check (cudart/cudaFree (position! dptr 0)) 367 | (do (.deallocate dptr) (.setNull dptr)))) 368 | dptr) 369 | 370 | ;; =================== Pinned and Mapped Memory ================================================ 371 | 372 | (defn mem-alloc-pinned 373 | "Allocates `byte-size` bytes of uninitialized page-locked memory, 'pinned' on the host, using 374 | keyword `flags`. For available flags, see [[internal.constants/mem-host-alloc-flags]]; the default 375 | is `:none`. 
Optionally, accepts a `type` of the pointer as a keyword (`:float` or `Float/TYPE` for 376 | `FloatPointer`, etc.). 377 | Pinned memory is optimized for the [[memcpy-host!]] function, while 'mapped' memory is optimized 378 | for [[memcpy!]]. 379 | See [CUDA Device Driver API Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html) 380 | " 381 | ([^long byte-size] 382 | (mem-host-alloc* (max 0 byte-size) 0)) 383 | ([^long byte-size type-or-flags] 384 | (if-let [t (type-pointer type-or-flags)] 385 | (mem-host-alloc* (max 0 byte-size) 0 t) 386 | (mem-host-alloc* (max 0 byte-size) 387 | (if (keyword? type-or-flags) 388 | (or (mem-host-alloc-flags type-or-flags) 389 | (throw (ex-info "Unknown mem-host-alloc flag." 390 | {:flag type-or-flags :available mem-host-alloc-flags}))) 391 | (mask mem-host-alloc-flags type-or-flags))))) 392 | ([^long byte-size type flags] 393 | (if-let [t (type-pointer type)] 394 | (mem-host-alloc* (max 0 byte-size) 395 | (if (keyword? flags) 396 | (or (mem-host-alloc-flags flags) 397 | (throw (ex-info "Unknown mem-host-alloc flag." 398 | {:flag flags :available mem-host-alloc-flags}))) 399 | (mask mem-host-alloc-flags flags)) 400 | t) 401 | (throw (ex-info (format "Unknown data type: %s." (str type)) {}))))) 402 | 403 | (defn mem-register-pinned! 404 | "Registers previously instantiated host pointer, 'pinned' from the device, using 405 | keyword `flags`. For available flags, see [[internal.constants/mem-host-register-flags]]; the 406 | default is `:none`. Returns the pinned object equivalent to the one created by [[mem-alloc-pinned]]. 407 | Pinned memory is optimized for the [[memcpy-host!]] function, while 'mapped' memory is 408 | optimized for [[memcpy!]]. 409 | See [CUDA Device Driver API Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html) 410 | " 411 | ([memory flags] 412 | (mem-host-register* memory (if (keyword? 
flags) 413 | (or (mem-host-register-flags flags) 414 | (throw (ex-info "Unknown mem-host-register flag." 415 | {:flag flags :available mem-host-register-flags}))) 416 | (mask mem-host-register-flags flags)))) 417 | ([memory] 418 | (mem-host-register* memory 0))) 419 | 420 | (defn mem-alloc-mapped 421 | "Allocates `byte-size` bytes of uninitialized host memory, 'mapped' to the device. Optionally, 422 | accepts a `type` of the pointer as a keyword (`:float` or `Float/TYPE` for `FloatPointer`, etc.). 423 | Mapped memory is optimized for the [[memcpy!]] operation, while 'pinned' memory is optimized for 424 | [[memcpy-host!]]. 425 | See [CUDA Driver API Memory Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html) 426 | " 427 | ([^long byte-size] 428 | (mem-alloc-host* (max 0 byte-size))) 429 | ([^long byte-size type] 430 | (mem-alloc-host* (max 0 byte-size) (type-pointer type)))) 431 | 432 | ;; ================== Module Management ===================================== 433 | 434 | (defn link 435 | "Invokes the CUDA linker on data provided as a vector `[[type source ], ...]`. 436 | Produces a cubin compiled for a particular Nvidia architecture. 437 | Please see relevant examples from the test folder. 438 | See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html) 439 | " 440 | ([data options] 441 | (let-release [res (CUlinkState_st.)] 442 | (link* res data options))) 443 | ([data] 444 | (link data nil)) 445 | ([] 446 | (CUlinkState_st.))) 447 | 448 | (defn link-complete! 449 | "Completes the link state created by [[link]], so that it can be loaded by the [[module]] function. 450 | Please see relevant examples from the test folder." 451 | [^CUlinkState_st link-state] 452 | (let-release [cubin-image (byte-pointer nil)] 453 | (with-release [size-out (size-t-pointer 1)] 454 | (with-check 455 | (cudart/cuLinkComplete link-state cubin-image size-out) 456 | (capacity! 
cubin-image (get-entry size-out 0)))))) 457 | 458 | (defn load! 459 | "Load module's data from a [[ptx]] string, nvrtc program, java path, or binary `data`. 460 | Please see relevant examples from the test folder. 461 | See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html) 462 | " 463 | [m data] 464 | (module-load* (safe (pointer data)) m) 465 | m) 466 | 467 | (defn module 468 | "Creates a new CUDA module and loads a string, nvrtc program, or binary `data`. 469 | See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html)" 470 | ([] 471 | (CUmod_st.)) 472 | ([data] 473 | (load! (module) data))) 474 | 475 | (defrecord GridDim [^long grid-x ^long grid-y ^long grid-z ^long block-x ^long block-y ^long block-z]) 476 | 477 | (defn grid-1d 478 | "Creates a 1-dimensional [[GridDim]] record with grid and block dimensions x. 479 | Note: dim-x is the total number of threads globally, not the number of blocks." 480 | ([^long dim-x] 481 | (let [block-x (min dim-x 1024)] 482 | (grid-1d dim-x block-x))) 483 | ([^long dim-x ^long block-x] 484 | (let [block-x (min dim-x block-x)] 485 | (GridDim. (count-groups block-x dim-x) 1 1 block-x 1 1)))) 486 | 487 | (defn grid-2d 488 | "Creates a 2-dimensional [[GridDim]] record with grid and block dimensions x and y. 489 | Note: dim-x is the total number of threads globally, not the number of blocks." 490 | ([^long dim-x ^long dim-y] 491 | (let [block-x (min dim-x 32) 492 | block-y (min dim-y (long (/ 1024 block-x)))] 493 | (grid-2d dim-x dim-y block-x block-y))) 494 | ([^long dim-x ^long dim-y ^long block-x ^long block-y] 495 | (let [block-x (min dim-x block-x) 496 | block-y (min dim-y block-y)] 497 | (GridDim. (count-groups block-x dim-x) (count-groups block-y dim-y) 1 block-x block-y 1)))) 498 | 499 | (defn grid-3d 500 | "Creates a 3-dimensional [[GridDim]] record with grid and block dimensions x, y, and z. 
501 | Note: dim-x is the total number of threads globally, not the number of blocks." 502 | ([^long dim-x ^long dim-y ^long dim-z] 503 | (let [block-x (min dim-x 32) 504 | block-y (min dim-y (long (/ 1024 block-x))) 505 | block-z (min dim-z (long (/ 1024 (* block-x block-y))))] 506 | (grid-3d dim-x dim-y dim-z block-x block-y block-z))) 507 | ([dim-x dim-y dim-z block-x block-y block-z] 508 | (let [block-x (min (long dim-x) (long block-x)) 509 | block-y (min (long dim-y) (long block-y)) 510 | block-z (min (long dim-z) (long block-z))] 511 | (GridDim. (count-groups block-x dim-x) (count-groups block-y dim-y) 512 | (count-groups block-z dim-z) block-x block-y block-z)))) 513 | 514 | (defn global 515 | "Returns CUDA global device memory object named `name` from module `m`. Global memory is 516 | typically defined in C++ source files of CUDA kernels. 517 | See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html) 518 | " 519 | [^CUmod_st m ^String name] 520 | (let-release [dptr (long-pointer 1)] 521 | (with-release [byte-size (size-t-pointer 1)] 522 | (with-check 523 | (cudart/cuModuleGetGlobal ^LongPointer dptr ^SizeTPointer byte-size m name) 524 | {:name name} 525 | (->CUDevicePtr dptr (get-entry byte-size 0) false))))) 526 | 527 | (defn set-parameter! 528 | "Sets the `i`th parameter in a parameter array `pp` and the rest of `parameters` in places after `i`." 529 | [^PointerPointer pp i parameter & parameters] 530 | (if (< -1 (long i) (size pp)) 531 | (set-parameter* parameter (extract pp) i) 532 | (throw (ex-info "Index out of bounds." {:requested i :available (size pp)}))) 533 | (if parameters 534 | (recur pp (inc (long i)) (first parameters) (next parameters)) 535 | pp)) 536 | 537 | (defn parameters 538 | "Creates an `PointerPointer`s to CUDA `parameter`'s. 
`parameter` can be any object on 539 | device (Device API memory, Runtime API memory, JavaCPP pointers), or host (arrays, numbers, JavaCPP 540 | pointers) that makes sense as a kernel parameter per CUDA specification. Use the result as a parameter 541 | argument in [[launch!]]. 542 | " 543 | ([parameter & parameters] 544 | (let-release [len (if parameters (inc (count parameters)) 1) 545 | pp (pointer-pointer len)] 546 | (apply set-parameter! pp 0 parameter parameters)))) 547 | 548 | ;; ====================== Execution Control ================================== 549 | 550 | (defn function 551 | "Returns CUDA kernel function named `name` located in module `m`. 552 | See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html) 553 | " 554 | [^CUmod_st m ^String name] 555 | (let [res (CUfunc_st.)] 556 | (with-check (cudart/cuModuleGetFunction res m name) {:name name} res))) 557 | 558 | (defn launch! 559 | "Invokes the kernel `fun` on a `grid-dim` grid of blocks, using `params` `PointerPointer`. 560 | Optionally, you can specify the amount of shared memory that will be available to each thread block, 561 | and `hstream` to use for execution. 562 | See [CUDA Module Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html) 563 | " 564 | ([^CUfunc_st fun ^GridDim grid-dim shared-mem-bytes ^CUstream_st hstream ^PointerPointer params] 565 | (with-check 566 | (cudart/cuLaunchKernel fun (.grid-x grid-dim) (.grid-y grid-dim) (.grid-z grid-dim) 567 | (.block-x grid-dim) (.block-y grid-dim) (.block-z grid-dim) 568 | (int shared-mem-bytes) hstream params nil) 569 | {:kernel (info fun) :grid-dim grid-dim :hstream (info hstream)} 570 | hstream)) 571 | ([^CUfunc_st fun ^GridDim grid-dim hstream params] 572 | (launch! fun grid-dim 0 hstream params)) 573 | ([^CUfunc_st fun ^GridDim grid-dim params] 574 | (launch! 
fun grid-dim 0 nil params))) 575 | 576 | ;; ================== Stream Management ====================================== 577 | 578 | (defn stream 579 | "Creates a stream using an optional integer `priority` and a keyword `flag`. 580 | For available flags, see [[internal.constants/stream-flags]] 581 | See [CUDA Stream Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html) 582 | " 583 | ([] 584 | (stream* cudart/CU_STREAM_DEFAULT)) 585 | ([flag] 586 | (stream* (or (stream-flags flag) 587 | (throw (ex-info "Invalid stream flag." {:flag flag :available stream-flags}))))) 588 | ([^long priority flag] 589 | (stream* priority (or (stream-flags flag) 590 | (throw (ex-info "Invaling stream flag." 591 | {:flag flag :available stream-flags})))))) 592 | 593 | (def default-stream 594 | ^{:const true 595 | :doc "The default per-thread stream."} 596 | cudart/CU_STREAM_PER_THREAD) 597 | 598 | (defn ready? 599 | "Determines status (ready or not) of a compute stream or event `obj`. 600 | See [CUDA Stream Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html) 601 | and [CUDA Event Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html) 602 | " 603 | [obj] 604 | (= cudart/CUDA_SUCCESS (ready* (extract obj)))) 605 | 606 | (defn synchronize! 607 | "Blocks the current thread until the context's or `hstream`'s tasks complete." 608 | ([] 609 | (with-check (cudart/cuCtxSynchronize) true)) 610 | ([^CUstream_st hstream] 611 | (with-check (cudart/cuStreamSynchronize hstream) hstream))) 612 | 613 | (defn add-host-fn! 614 | "Adds host function `f` to a compute stream, with optional `data` related to the call. 615 | If `data` is not provided, places `hstream` under data. 616 | " 617 | ([hstream f data] 618 | (add-host-fn* hstream f data) 619 | hstream) 620 | ([hstream f] 621 | (add-host-fn* hstream f hstream) 622 | hstream)) 623 | 624 | (defn listen! 
625 | "Adds a host function listener to a compute stream, with optional `data` related to the call, 626 | and connects it to a Clojure channel `ch`. If `data` is not provided, places `hstream` under data. 627 | " 628 | ([hstream ch data] 629 | (let [data (safe (pointer data))] 630 | (add-host-fn* hstream (host-fn* data ch) data) 631 | hstream)) 632 | ([hstream ch] 633 | (add-host-fn* hstream (host-fn* hstream ch) hstream) 634 | hstream)) 635 | 636 | (defn wait-event! 637 | "Makes a compute stream `hstream` wait on an event `ev`. 638 | See [CUDA Event Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html) 639 | " 640 | [^CUstream_st hstream ^CUevent_st ev] 641 | (with-check (cudart/cuStreamWaitEvent hstream ev 0) hstream)) 642 | 643 | (defn attach-mem! 644 | "Attaches memory `mem` of size `byte-size`, specified by `flag` to a `hstream` asynchronously. 645 | For available flags, see [[internal.constants/mem-attach-flags]]. The default is `:single`. 646 | If :global flag is specified, the memory can be accessed by any stream on any device. 647 | If :host flag is specified, the program makes a guarantee that it won't access the memory on 648 | the device from any stream on a device that has no `concurrent-managed-access` capability. 649 | If :single flag is specified and `hStream` is associated with a device that has no 650 | `concurrent-managed-access` capability, the program makes a guarantee that it will only access 651 | the memory on the device from `hStream`. It is illegal to attach singly to the nil stream, 652 | because the nil stream is a virtual global stream and not a specific stream. An error will 653 | be returned in this case. 654 | 655 | When memory is associated with a single stream, the Unified Memory system will allow CPU access 656 | to this memory region so long as all operations in hStream have completed, regardless of whether 657 | other streams are active. 
In effect, this constrains exclusive ownership of the managed memory 658 | region by an active GPU to per-stream activity instead of whole-GPU activity. 659 | 660 | See [CUDA Stream Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html)." 661 | ([^CUstream_st hstream mem ^long byte-size flag] 662 | (attach-mem* (or (extract hstream) 663 | (when-not (= :global flag) 664 | (throw (ex-info "nil stream is a virtual global stream and not a specific stream that may be only used with :global mem-attach flag." 665 | {:flag flag :available mem-attach-flags})))) 666 | (cu-address* mem) byte-size 667 | (or (mem-attach-flags flag) 668 | (throw (ex-info "Unknown mem-attach flag." 669 | {:flag flag :available mem-attach-flags})))) 670 | hstream) 671 | ([mem byte-size flag] 672 | (attach-mem! default-stream mem byte-size flag))) 673 | 674 | ;; ================== Event Management ======================================= 675 | 676 | (defn event 677 | "Creates an event specified by keyword `flags`. For available flags, see 678 | [[internal.constants/event-flags]]. 679 | See [CUDA Event Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html) 680 | " 681 | ([] 682 | (event* cudart/CU_EVENT_DEFAULT)) 683 | ([flag & flags] 684 | (event* (if flags 685 | (mask event-flags (cons flag flags)) 686 | (or (event-flags flag) 687 | (throw (ex-info "Unknown event flag." {:flag flag :available event-flags}))))))) 688 | 689 | (defn elapsed-time! 690 | "Computes the elapsed time in milliseconds between `start-event` and `end-event`. 691 | See [CUDA Event Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html) 692 | " 693 | ^double [^CUevent_st start-event ^CUevent_st end-event] 694 | (let [res (float-array 1)] 695 | (with-check (cudart/cuEventElapsedTime res start-event end-event) (aget res 0)))) 696 | 697 | (defn record! 698 | "Records an event `event` on optional `stream`. 
699 | See [CUDA Event Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html) 700 | " 701 | ([^CUstream_st stream ^CUevent_st event] 702 | (with-check (cudart/cuEventRecord event stream) stream)) 703 | ([^CUevent_st event] 704 | (with-check (cudart/cuEventRecord event nil) default-stream))) 705 | 706 | ;; ================== Peer Context Memory Access ============================= 707 | 708 | (defn can-access-peer 709 | "Queries if a device may directly access a peer device's memory. 710 | See [CUDA Peer Access Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html) 711 | " 712 | [dev peer] 713 | (can-access-peer* (extract dev) (extract peer))) 714 | 715 | (defn p2p-attribute 716 | "Queries attributes of the link between two devices. 717 | See [CUDA Peer Access Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html) 718 | " 719 | [dev peer attribute] 720 | (p2p-attribute* (extract dev) (extract peer) (or (p2p-attributes attribute) 721 | (throw (ex-info "Unknown p2p attribute." 722 | {:attribute attribute :available p2p-attributes}))))) 723 | 724 | (defn disable-peer-access! 725 | "Disables direct access to memory allocations in a peer context and unregisters any registered allocations. 726 | See [CUDA Peer Access Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html) 727 | " 728 | ([ctx] 729 | (with-check (cudart/cuCtxDisablePeerAccess ctx) ctx)) 730 | ([] 731 | (disable-peer-access! (current-context)))) 732 | 733 | (defn enable-peer-access! 734 | "Enables direct access to memory allocations in a peer context. 735 | See [CUDA Peer Access Management](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html) 736 | " 737 | ([ctx] 738 | (with-check (cudart/cuCtxEnablePeerAccess ctx 0) ctx) 739 | ctx) 740 | ([] 741 | (enable-peer-access! 
(current-context)))) 742 | 743 | ;; ====================== Nvrtc program JIT ======================================== 744 | 745 | (defn program 746 | "Creates a CUDA program from the `source-code`, with an optional `name` and an optional 747 | hash map of `headers` (as strings) and their names. 748 | " 749 | ([^String name ^String source-code headers] 750 | (program* (string-pointer name) (string-pointer source-code) 751 | (pointer-pointer (into-array String (vals headers))) 752 | (pointer-pointer (into-array String (keys headers))))) 753 | ([source-code headers] 754 | (program nil source-code headers)) 755 | ([source-code] 756 | (program nil source-code nil))) 757 | 758 | (defn program-log 759 | "Returns the log string generated by the previous compilation of `prog`." 760 | [prog] 761 | (program-log* prog)) 762 | 763 | (defn compile! 764 | "Compiles the given `prog` using a list of string `options`." 765 | ([prog options] 766 | (compile* prog (pointer-pointer (into-array String options))) 767 | prog) 768 | ([prog] 769 | (compile! prog nil))) 770 | 771 | (defn ptx 772 | "Returns the PTX generated by the previous compilation of `prog`." 773 | [prog] 774 | (ptx* prog)) 775 | -------------------------------------------------------------------------------- /src/clojure/uncomplicate/clojurecuda/info.clj: -------------------------------------------------------------------------------- 1 | ;; Copyright (c) Dragan Djuric. All rights reserved. 2 | ;; The use and distribution terms for this software are covered by the 3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later 4 | ;; which can be found in the file LICENSE at the root of this distribution. 5 | ;; By using this software in any fashion, you are agreeing to be bound by 6 | ;; the terms of this license. 7 | ;; You must not remove this notice, or any other, from this software. 
8 | 9 | (ns ^{:author "Dragan Djuric"} 10 | uncomplicate.clojurecuda.info 11 | "Info functions for all CUDA objects (devices, etc...). 12 | " 13 | (:require [clojure.string :as str] 14 | [uncomplicate.commons.core :refer [with-release Info]] 15 | [uncomplicate.fluokitten.core :refer [fmap op]] 16 | [uncomplicate.clojure-cpp :as cpp 17 | :refer [int-pointer byte-pointer size-t-pointer get-string get-entry]] 18 | [uncomplicate.clojurecuda.internal 19 | [constants :refer [ctx-limits dec-compute-mode dec-func-cache-config dec-shared-config 20 | dec-stream-flag func-cache-config shared-config-map]] 21 | [utils :refer [with-check maybe]] 22 | [impl :refer [current-context* ->CUDevice]]]) 23 | (:import [org.bytedeco.cuda.global cudart] 24 | [org.bytedeco.cuda.cudart CUctx_st CUfunc_st CUstream_st] 25 | [uncomplicate.clojurecuda.internal.impl CUDevice])) 26 | 27 | ;; =================== Info* utility macros =============================== 28 | 29 | (defmacro ^:private info-attribute* [method object attribute] 30 | `(long (with-release [res# (int-pointer 1)] 31 | (with-check (~method res# ~attribute ~object) 32 | (get-entry res# 0))))) 33 | 34 | ;; =================== Version Management ================================= 35 | 36 | (defn driver-version ^long [] 37 | (with-release [res (int-pointer 1)] 38 | (with-check (cudart/cuDriverGetVersion res) (get-entry res 0)))) 39 | 40 | ;; =================== Device info ======================================= 41 | 42 | (defn device-name [^CUDevice device] 43 | (with-release [res (byte-pointer 64)] 44 | (with-check (cudart/cuDeviceGetName res 64 (.dev device)) 45 | (clojure.string/replace (get-string res) #"" "")))) 46 | 47 | (defn total-mem [^CUDevice device] 48 | (with-release [res (size-t-pointer 1)] 49 | (with-check (cudart/cuDeviceTotalMem res (.dev device)) 50 | (get-entry res 0)))) 51 | 52 | (defn async-engine-count ^long [^CUDevice device] 53 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 54 | 
cudart/CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT)) 55 | 56 | (defn can-map-host-memory [^CUDevice device] 57 | (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 58 | cudart/CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY))) 59 | 60 | (defn clock-rate ^long [^CUDevice device] 61 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 62 | cudart/CU_DEVICE_ATTRIBUTE_CLOCK_RATE)) 63 | 64 | (defn compute-capability-major ^long [^CUDevice device] 65 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 66 | cudart/CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)) 67 | 68 | (defn compute-capability-minor ^long [^CUDevice device] 69 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 70 | cudart/CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)) 71 | 72 | (defn compute-mode [^CUDevice device] 73 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 74 | cudart/CU_DEVICE_ATTRIBUTE_COMPUTE_MODE)) 75 | 76 | (defn concurrent-kernels ^long [^CUDevice device] 77 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 78 | cudart/CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS)) 79 | 80 | (defn ecc-enabled [^CUDevice device] 81 | (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 82 | cudart/CU_DEVICE_ATTRIBUTE_ECC_ENABLED))) 83 | 84 | (defn global-L1-cache-supported [^CUDevice device] 85 | (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 86 | cudart/CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED))) 87 | 88 | (defn global-memory-bus-width ^long [^CUDevice device] 89 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 90 | cudart/CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH)) 91 | 92 | (defn integrated [^CUDevice device] 93 | (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 94 | cudart/CU_DEVICE_ATTRIBUTE_INTEGRATED))) 95 | 96 | (defn kernel-exec-timeout [^CUDevice device] 97 | (pos? 
(info-attribute* cudart/cuDeviceGetAttribute (.dev device) 98 | cudart/CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT))) 99 | 100 | (defn L2-cache-size ^long [^CUDevice device] 101 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 102 | cudart/CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE)) 103 | 104 | (defn local-L1-cache-supported [^CUDevice device] 105 | (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 106 | cudart/CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED))) 107 | 108 | (defn managed-memory [^CUDevice device] 109 | (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 110 | cudart/CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY))) 111 | 112 | (defn concurrent-managed-access [^CUDevice device] 113 | (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 114 | cudart/CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS))) 115 | 116 | (defn max-block-dim-x ^long [^CUDevice device] 117 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 118 | cudart/CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X)) 119 | 120 | (defn max-block-dim-y ^long [^CUDevice device] 121 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 122 | cudart/CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y)) 123 | 124 | (defn max-block-dim-z ^long [^CUDevice device] 125 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 126 | cudart/CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)) 127 | 128 | (defn max-grid-dim-x ^long [^CUDevice device] 129 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 130 | cudart/CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X)) 131 | 132 | (defn max-grid-dim-y ^long [^CUDevice device] 133 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 134 | cudart/CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y)) 135 | 136 | (defn max-grid-dim-z ^long [^CUDevice device] 137 | (info-attribute* cudart/cuDeviceGetAttribute (.dev device) 138 | cudart/CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z)) 139 | 140 | (defn max-pitch ^long [^CUDevice device] 141 | (info-attribute* cudart/cuDeviceGetAttribute 
(.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_PITCH))

;; Every accessor below follows the same shape: it hands the raw device handle
;; `(.dev device)` and one CU_DEVICE_ATTRIBUTE_* constant to `info-attribute*`
;; together with `cudart/cuDeviceGetAttribute`. Accessors hinted `^long` return
;; the numeric result as is; the un-hinted ones convert it with `pos?`.

(defn max-registers-per-block ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK))

(defn max-registers-per-multiprocessor ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR))

(defn max-shared-memory-per-block ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK))

(defn max-shared-memory-per-multiprocessor ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR))

(defn max-threads-per-block ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))

(defn max-threads-per-multiprocessor ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR))

(defn maximum-surface1d-layered-layers ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS))

(defn maximum-surface1d-layered-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH))

(defn maximum-surface1d-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH))

(defn maximum-surface2d-height ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT))

(defn maximum-surface2d-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH))

(defn maximum-surface2d-layered-height ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT))

(defn maximum-surface2d-layered-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH))

(defn maximum-surface2d-layered-layers ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS))

(defn maximum-surface3d-depth ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH))

(defn maximum-surface3d-height ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT))

(defn maximum-surface3d-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH))

(defn maximum-surfacecubemap-layered-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH))

(defn maximum-surfacecubemap-layered-layers ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS))

(defn maximum-surfacecubemap-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH))

(defn maximum-texture1d-layered-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH))

(defn maximum-texture1d-layered-layers ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS))

(defn maximum-texture1d-linear-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH))

(defn maximum-texture1d-mipmapped-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH))

(defn maximum-texture1d-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH))

(defn maximum-texture2d-height ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT))

(defn maximum-texture2d-layered-height ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT))

(defn maximum-texture2d-layered-layers ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS))

(defn maximum-texture2d-linear-height ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT))

(defn maximum-texture2d-linear-pitch ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH))

(defn maximum-texture2d-linear-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH))

(defn maximum-texture2d-mipmapped-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH))

(defn maximum-texture2d-mipmapped-height ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT))

(defn maximum-texture2d-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH))

(defn maximum-texture3d-depth ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH))

(defn maximum-texture3d-depth-alternate ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE))

(defn maximum-texture3d-height ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT))

(defn maximum-texture3d-height-alternate ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE))

(defn maximum-texture3d-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH))

(defn maximum-texture3d-width-alternate ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE))

(defn maximum-texturecubemap-layered-layers ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS))

(defn maximum-texturecubemap-layered-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH))

(defn maximum-texturecubemap-width ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH))

(defn memory-clock-rate ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE))

(defn multi-gpu-board
  "Returns true when CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD is reported as positive."
  [^CUDevice device]
  (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                         cudart/CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD)))

(defn multi-gpu-board-group-id ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID))

(defn multiprocessor-count ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT))

(defn pci-bus-id ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_PCI_BUS_ID))

(defn pci-bus-id-string
  "Returns the PCI bus id of `device` as a string.

  Calls `cuDeviceGetPCIBusId` with a 13-byte buffer, then moves 12 bytes into a
  second, 12-byte buffer and reads the string from that one. Both buffers are
  released on exit.
  "
  [^CUDevice device]
  (with-release [res (byte-pointer 13)
                 res2 (byte-pointer 12)]
    (with-check (cudart/cuDeviceGetPCIBusId res 13 (.dev device))
      (do
        ;; NOTE(review): presumably this drops the trailing NUL written by the driver - confirm.
        (cpp/memcpy! res res2 12)
        (get-string res2)))))

(defn pci-device-id ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID))

(defn pci-domain-id ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID))

(defn stream-priorities-supported
  "Returns true when CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED is reported as positive."
  [^CUDevice device]
  (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                         cudart/CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED)))

(defn surface-alignment ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT))

(defn tcc-driver
  "Returns true when CU_DEVICE_ATTRIBUTE_TCC_DRIVER is reported as positive."
  [^CUDevice device]
  (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                         cudart/CU_DEVICE_ATTRIBUTE_TCC_DRIVER)))

(defn texture-alignment ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT))

(defn texture-pitch-alignment ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT))

(defn total-constant-memory ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY))

(defn unified-addressing
  "Returns true when CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING is reported as positive."
  [^CUDevice device]
  (pos? (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                         cudart/CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)))

(defn warp-size ^long [^CUDevice device]
  (info-attribute* cudart/cuDeviceGetAttribute (.dev device)
                   cudart/CU_DEVICE_ATTRIBUTE_WARP_SIZE))

;; Lookup table used by the `Info` implementation for `CUDevice`:
;; attribute keyword -> one-argument accessor function taking the device.
(def ^:no-doc
  device-attributes
  {:name device-name
   :total-mem total-mem
   :async-engine-count async-engine-count
   :can-map-host-memory can-map-host-memory
   :clock-rate clock-rate
   :compute-capability-major compute-capability-major
   :compute-capability-minor compute-capability-minor
   :compute-mode (comp dec-compute-mode compute-mode)
   :concurrent-kernels concurrent-kernels
   :ecc-enabled ecc-enabled
   :global-L1-cache-supported global-L1-cache-supported
   :global-memory-bus-width global-memory-bus-width
   :integrated integrated
   :kernel-exec-timeout kernel-exec-timeout
   :L2-cache-size L2-cache-size
   :local-L1-cache-supported local-L1-cache-supported
   :managed-memory managed-memory
   :max-block-dim-x max-block-dim-x
   :max-block-dim-y max-block-dim-y
   :max-block-dim-z max-block-dim-z
   :max-grid-dim-x max-grid-dim-x
   :max-grid-dim-y max-grid-dim-y
   :max-grid-dim-z max-grid-dim-z
   :max-pitch max-pitch
   :max-registers-per-block max-registers-per-block
   :max-registers-per-multiprocessor max-registers-per-multiprocessor
   :max-shared-memory-per-block max-shared-memory-per-block
   :max-shared-memory-per-multiprocessor max-shared-memory-per-multiprocessor
   :max-threads-per-block max-threads-per-block
   :max-threads-per-multiprocessor max-threads-per-multiprocessor
   :maximum-surface1d-layered-layers maximum-surface1d-layered-layers
   :maximum-surface1d-layered-width maximum-surface1d-layered-width
   :maximum-surface1d-width maximum-surface1d-width
   :maximum-surface2d-height maximum-surface2d-height
   :maximum-surface2d-width maximum-surface2d-width
   :maximum-surface2d-layered-height maximum-surface2d-layered-height
   :maximum-surface2d-layered-width maximum-surface2d-layered-width
   :maximum-surface2d-layered-layers maximum-surface2d-layered-layers
   :maximum-surface3d-depth maximum-surface3d-depth
   :maximum-surface3d-height maximum-surface3d-height
   :maximum-surface3d-width maximum-surface3d-width
   :maximum-surfacecubemap-layered-width maximum-surfacecubemap-layered-width
   :maximum-surfacecubemap-layered-layers maximum-surfacecubemap-layered-layers
   :maximum-surfacecubemap-width maximum-surfacecubemap-width
   :maximum-texture1d-layered-width maximum-texture1d-layered-width
   :maximum-texture1d-layered-layers maximum-texture1d-layered-layers
   :maximum-texture1d-linear-width maximum-texture1d-linear-width
   :maximum-texture1d-mipmapped-width maximum-texture1d-mipmapped-width
   :maximum-texture1d-width maximum-texture1d-width
   :maximum-texture2d-height maximum-texture2d-height
   :maximum-texture2d-layered-height maximum-texture2d-layered-height
   :maximum-texture2d-layered-layers maximum-texture2d-layered-layers
   :maximum-texture2d-linear-height maximum-texture2d-linear-height
   :maximum-texture2d-linear-pitch maximum-texture2d-linear-pitch
   :maximum-texture2d-linear-width maximum-texture2d-linear-width
   :maximum-texture2d-mipmapped-width maximum-texture2d-mipmapped-width
   :maximum-texture2d-mipmapped-height maximum-texture2d-mipmapped-height
   :maximum-texture2d-width maximum-texture2d-width
   :maximum-texture3d-depth maximum-texture3d-depth
   :maximum-texture3d-depth-alternate maximum-texture3d-depth-alternate
   :maximum-texture3d-height maximum-texture3d-height
   :maximum-texture3d-height-alternate maximum-texture3d-height-alternate
   :maximum-texture3d-width maximum-texture3d-width
   :maximum-texture3d-width-alternate maximum-texture3d-width-alternate
   :maximum-texturecubemap-layered-layers maximum-texturecubemap-layered-layers
   :maximum-texturecubemap-layered-width maximum-texturecubemap-layered-width
   :maximum-texturecubemap-width maximum-texturecubemap-width
   :memory-clock-rate memory-clock-rate
   :multi-gpu-board multi-gpu-board
   :multi-gpu-board-group-id multi-gpu-board-group-id
   :multiprocessor-count multiprocessor-count
   :pci-bus-id pci-bus-id
   :pci-bus-id-string pci-bus-id-string
   :pci-device-id pci-device-id
   :pci-domain-id pci-domain-id
   :stream-priorities-supported stream-priorities-supported
   :surface-alignment surface-alignment
   :tcc-driver tcc-driver
   :texture-alignment texture-alignment
   :texture-pitch-alignment texture-pitch-alignment
   :total-constant-memory total-constant-memory
   :unified-addressing unified-addressing
   :warp-size warp-size})

(extend-type CUDevice
  Info
  (info
    ;; Single attribute: look the accessor up by keyword; an unknown keyword is an error.
    ([d attribute]
     (if-let [attribute-fn (device-attributes attribute)]
       (maybe (attribute-fn d))
       (throw (ex-info "Unknown attribute." {:attribute attribute}))))
    ;; All attributes: apply every accessor in `device-attributes` to `d`.
    ([d]
     (fmap #(maybe (% d)) device-attributes))))

;; ======================= Context Info ==================================

(defn api-version
  "Gets the context's API version.

  With no argument, the context returned by `current-context*` is queried.
  "
  ([^CUctx_st ctx]
   (with-release [res (int-pointer 1)]
     (with-check (cudart/cuCtxGetApiVersion ctx res) (get-entry res 0))))
  ([]
   (with-release [res (int-pointer 1)]
     (with-check (cudart/cuCtxGetApiVersion ^CUctx_st (current-context*) res)
       (get-entry res 0)))))

(defn cache-config
  "Returns the preferred cache configuration for the current context.

  See [cuCtxGetCacheConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  []
  (with-release [res (int-pointer 1)]
    (dec-func-cache-config (with-check (cudart/cuCtxGetCacheConfig res) (get-entry res 0)))))

(defn limit*
  "Returns or sets resource limits for the attribute specified by integer `limit`.

  The two-argument arity sets the limit and returns `value`.

  See [cuCtxGetLimit](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  (^long [limit]
   (with-release [res (size-t-pointer 1)]
     (with-check (cudart/cuCtxGetLimit res limit) (get-entry res 0))))
  (^long [limit ^long value]
   (with-check (cudart/cuCtxSetLimit limit value) value)))

(defn limit
  "Returns resource limits for the attribute specified by keyword `limit`.

  Supported limits are: `stack-size`, `malloc-heap-size`, `printf-fifo-size`, `dev-runtime-sync-depth`,
  `dev-runtime-pending-launch-count`.

  Throws `ex-info` when `limit` is not a key of `ctx-limits`.

  See [cuCtxGetLimit](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  ^long [limit]
  (limit* (or (ctx-limits limit)
              (throw (ex-info "Unknown limit." {:limit limit :available ctx-limits})))))

(defn limit!
  "Sets resource limit for the attribute specified by keyword `limit` to `value`.

  Supported limits are: `stack-size`, `malloc-heap-size`, `printf-fifo-size`, `dev-runtime-sync-depth`,
  `dev-runtime-pending-launch-count`.

  Throws `ex-info` when `limit` is not a key of `ctx-limits`.

  See [cuCtxSetLimit](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  ^long [limit ^long value]
  (limit* (or (ctx-limits limit)
              (throw (ex-info "Unknown limit." {:limit limit :available ctx-limits})))
          value))

(defn ctx-device
  "Returns the device for the current context."
  []
  (with-release [res (int-pointer 1)]
    (with-check (cudart/cuCtxGetDevice res) (->CUDevice (get-entry res 0)))))

(defn shared-config*
  "Sets or gets the current shared memory configuration for the current context or kernel `func`.

  See [cuCtxGetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  See [cuCtxSetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  See [cuFuncSetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html)
  "
  (^long []
   (with-release [res (int-pointer 1)]
     (with-check (cudart/cuCtxGetSharedMemConfig res) (get-entry res 0))))
  (^long [^long config]
   (with-check (cudart/cuCtxSetSharedMemConfig config) config))
  ([^CUfunc_st func ^long config]
   (with-check (cudart/cuFuncSetSharedMemConfig func config) func)))

(defn shared-config
  "Gets the current shared memory configuration for the current context.

  See [cuCtxGetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  []
  (dec-shared-config (shared-config*)))

(defn shared-config!
  "Sets the shared memory configuration, given as keyword `config`, for the current
  context or, when `func` is supplied, for that kernel function.

  Throws `ex-info` when `config` is not a key of `shared-config-map`.

  See [cuCtxSetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  See [cuFuncSetSharedMemConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html)
  "
  ([config]
   ;; The exception has to be thrown; previously the `ex-info` object itself was
   ;; handed to `shared-config*` as if it were the integer configuration.
   (shared-config* (or (shared-config-map config)
                       (throw (ex-info "Unknown config."
                                       {:config config :available shared-config-map})))))
  ([func config]
   (shared-config* func (or (shared-config-map config)
                            (throw (ex-info "Unknown config."
                                            {:config config :available shared-config-map}))))))

(defn stream-priority-range
  "Returns a vector of 2 numerical values that correspond to the least and greatest stream priorities.

  See [cuCtxGetStreamPriorityRange](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html)
  "
  []
  (with-release [least (int-pointer 1)
                 greatest (int-pointer 1)]
    (with-check (cudart/cuCtxGetStreamPriorityRange least greatest)
      [(get-entry least 0) (get-entry greatest 0)])))

(extend-type CUctx_st
  Info
  (info
    ;; The context argument is ignored: every query below goes through the
    ;; zero-argument, current-context functions. Unknown keywords yield nil.
    ([_ info-type]
     (maybe
      (case info-type
        :api-version (api-version)
        :cache-config (cache-config)
        :stack-size (limit* cudart/CU_LIMIT_STACK_SIZE)
        :malloc-heap-size (limit* cudart/CU_LIMIT_MALLOC_HEAP_SIZE)
        :printf-fifo-size (limit* cudart/CU_LIMIT_PRINTF_FIFO_SIZE)
        :dev-runtime-sync-depth (limit* cudart/CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH)
        :dev-runtime-pending-launch-count (limit* cudart/CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT)
        :limits (fmap #(maybe (limit* %)) ctx-limits)
        :device (ctx-device)
        :shared-config (shared-config)
        :stream-priority-range (stream-priority-range)
        nil)))
    ([_]
     ;; Every entry is wrapped in `maybe`, including :shared-config and
     ;; :stream-priority-range, so all queries are handled the same way.
     (op {:api-version (maybe (api-version))
          :cache-config (maybe (cache-config))
          :device (maybe (ctx-device))
          :shared-config (maybe (shared-config))
          :stream-priority-range (maybe (stream-priority-range))}
         (fmap #(maybe (limit* %)) ctx-limits)))))

;; =========================== Stream Management ================================

(defn stream-flag
  "Returns the raw integer flags of `hstream`, as reported by `cuStreamGetFlags`."
  [^CUstream_st hstream]
  (with-release [res (int-pointer 1)]
    (with-check (cudart/cuStreamGetFlags hstream res) (get-entry res 0))))

(defn stream-priority
  "Returns the priority of `hstream`, as reported by `cuStreamGetPriority`."
  ^long [^CUstream_st hstream]
  (with-release [res (int-pointer 1)]
    (with-check (cudart/cuStreamGetPriority hstream res) (get-entry res 0))))

(extend-type CUstream_st
  Info
  (info
    ;; Single item: :flag is decoded to a keyword by `dec-stream-flag`;
    ;; unknown keywords yield nil.
    ([hstream info-type]
     (maybe
      (case info-type
        :flag (dec-stream-flag (stream-flag hstream))
        :priority (stream-priority hstream)
        nil)))
    ([hstream]
     {:flag (maybe (dec-stream-flag (stream-flag hstream)))
      :priority (maybe (stream-priority hstream))})))

;; ============================= Execution Management ==========================

;; Each accessor below passes the function handle and one CU_FUNC_ATTRIBUTE_*
;; constant to `info-attribute*` together with `cudart/cuFuncGetAttribute`.

(defn max-threads-per-block-fn ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute function
                   cudart/CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK))

(defn shared-size ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute function
                   cudart/CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES))

(defn const-size ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute function
                   cudart/CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES))

(defn local-size ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute function
                   cudart/CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES))

(defn num-regs ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute function
                   cudart/CU_FUNC_ATTRIBUTE_NUM_REGS))

(defn ptx-version ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute function
                   cudart/CU_FUNC_ATTRIBUTE_PTX_VERSION))

(defn binary-version ^long [^CUfunc_st function]
  (info-attribute* cudart/cuFuncGetAttribute function
                   cudart/CU_FUNC_ATTRIBUTE_BINARY_VERSION))

(defn cache-config*
  "Sets the preferred cache configuration for a device function `fun`, as an integer `config`.

  Returns `fun`.

  See [cuFuncSetCacheConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html)
  "
  [fun ^long config]
  (with-check (cudart/cuFuncSetCacheConfig fun config) fun))

(defn cache-config!
  "Sets the preferred cache configuration for a device function `fun`, as a keyword `config`.

  Available configs are `:prefer-none`, `:prefer-shared`, `:prefer-L1`, and `:prefer-equal`.

  Throws `ex-info` when `config` is not a key of `func-cache-config`.

  See [cuFuncSetCacheConfig](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html)
  "
  [fun config]
  (cache-config* fun (or (func-cache-config config)
                         (throw (ex-info "Invalid cache config."
                                         {:config config :available func-cache-config})))))

(extend-type CUfunc_st
  Info
  (info
    ;; Single attribute by keyword; unknown keywords yield nil.
    ([fun info-type]
     (maybe
      (case info-type
        :max-threads-per-block (max-threads-per-block-fn fun)
        :shared-size (shared-size fun)
        :const-size (const-size fun)
        :local-size (local-size fun)
        :num-regs (num-regs fun)
        :ptx-version (ptx-version fun)
        :binary-version (binary-version fun)
        nil)))
    ([fun]
     {:max-threads-per-block (maybe (max-threads-per-block-fn fun))
      :shared-size (maybe (shared-size fun))
      :const-size (maybe (const-size fun))
      :local-size (maybe (local-size fun))
      :num-regs (maybe (num-regs fun))
      :ptx-version (maybe (ptx-version fun))
      :binary-version (maybe (binary-version fun))})))

--------------------------------------------------------------------------------
/src/clojure/uncomplicate/clojurecuda/internal/constants.clj:
--------------------------------------------------------------------------------
;; Copyright (c) Dragan Djuric. All rights reserved.
;; The use and distribution terms for this software are covered by the
;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
;; which can be found in the file LICENSE at the root of this distribution.
;; By using this software in any fashion, you are agreeing to be bound by
;; the terms of this license.
;; You must not remove this notice, or any other, from this software.

(ns ^{:author "Dragan Djuric"}
    uncomplicate.clojurecuda.internal.constants
  "Defines constants and mappings from/to CUDA constants."
  (:import [org.bytedeco.cuda.global cudart nvrtc]))

;; ==================== Keyword mapping ======================================

(def ^{:const true
       :doc "Available context flags defined in the CUDA standard."}
  ctx-flags
  {:blocking-sync cudart/CU_CTX_BLOCKING_SYNC
   :coredump cudart/CU_CTX_COREDUMP_ENABLE
   :flags-mask cudart/CU_CTX_FLAGS_MASK
   :lmem-resize-to-max cudart/CU_CTX_LMEM_RESIZE_TO_MAX
   :map-host cudart/CU_CTX_MAP_HOST
   :sched-auto cudart/CU_CTX_SCHED_AUTO
   :sched-blocking-sync cudart/CU_CTX_SCHED_BLOCKING_SYNC
   :sched-mask cudart/CU_CTX_SCHED_MASK
   :sched-spin cudart/CU_CTX_SCHED_SPIN
   :sched-yield cudart/CU_CTX_SCHED_YIELD
   :sync-memops cudart/CU_CTX_SYNC_MEMOPS
   :user-coredump cudart/CU_CTX_USER_COREDUMP_ENABLE})

(def ^{:const true
       :doc "Available context limits."}
  ctx-limits
  {:dev-runtime-pending-launch-count cudart/CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT
   :dev-runtime-sync-depth cudart/CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH
   :malloc-heap-size cudart/CU_LIMIT_MALLOC_HEAP_SIZE
   :max cudart/CU_LIMIT_MAX
   :max-l2-fetch-granularity cudart/CU_LIMIT_MAX_L2_FETCH_GRANULARITY
   :persisting-l2-cache-size cudart/CU_LIMIT_PERSISTING_L2_CACHE_SIZE
   :printf-fifo-size cudart/CU_LIMIT_PRINTF_FIFO_SIZE
   :stack-size cudart/CU_LIMIT_STACK_SIZE})

(def ^{:const true
       :doc "Available shared memory configurations."}
  shared-config-map
  {:default-bank-size cudart/CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE
   :four-byte-bank-size cudart/CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE
   :eight-byte-bank-size cudart/CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE})

(defn dec-shared-config
  "Decodes an integer shared memory `config` into its keyword.
  Values other than 0, 1 and 2 are returned unchanged."
  [^long config]
  (case config
    0 :default-bank-size
    1 :four-byte-bank-size
    2 :eight-byte-bank-size
    config))

(def ^{:const true
       :doc "Available device P2P attributes."}
  p2p-attributes
  {:access-access-supported cudart/CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED
   :access-supported cudart/CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED
   :cuda-array-access-supported cudart/CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED
   :native-atomic-supported cudart/CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED
   :performance-rank cudart/CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK})

(defn dec-compute-mode
  "Decodes an integer compute `mode` into its keyword.
  Values other than 0 to 3 are returned unchanged."
  [^long mode]
  (case mode
    0 :default
    1 :exclusive
    2 :prohibited
    3 :exclusive-process
    mode))

(def ^{:const true
       :doc "Available flags for the [[core/mem-host-alloc]] function."}
  mem-host-alloc-flags
  {:portable cudart/CU_MEMHOSTALLOC_PORTABLE
   :devicemap cudart/CU_MEMHOSTALLOC_DEVICEMAP
   :writecombined cudart/CU_MEMHOSTALLOC_WRITECOMBINED})

(def ^{:const true
       :doc "Available flags for the [[core/mem-host-register]] function."}
  mem-host-register-flags
  {:devicemap cudart/CU_MEMHOSTREGISTER_DEVICEMAP
   :iomemory cudart/CU_MEMHOSTREGISTER_IOMEMORY
   :portable cudart/CU_MEMHOSTREGISTER_PORTABLE
   :read-only cudart/CU_MEMHOSTREGISTER_READ_ONLY
   ;; Misspelled key kept so that existing callers continue to work.
   :read-onlyp cudart/CU_MEMHOSTREGISTER_READ_ONLY})

(def ^{:const true
       :doc "Available flags for the [[core/mem-host-attach]] function."}
  mem-attach-flags
  {:global cudart/CU_MEM_ATTACH_GLOBAL
   :host cudart/CU_MEM_ATTACH_HOST
   :single cudart/CU_MEM_ATTACH_SINGLE})

;; NOTE(review): the doc string used to name [[core/mem-host-attach]], copied from
;; the map above; [[core/stream]] is the presumed consumer - confirm against core.
(def ^{:const true
       :doc "Available flags for creating a stream (see [[core/stream]])."}
  stream-flags
  {:default cudart/CU_STREAM_DEFAULT
   :non-blocking cudart/CU_STREAM_NON_BLOCKING})

(defn dec-stream-flag
  "Decodes an integer stream `flag` into its keyword.
  Values other than 0 and 1 are returned unchanged."
  [^long flag]
  (case flag
    0 :default
    1 :non-blocking
    flag))

(def ^{:const true
       :doc "Available flags for the [[core/event]] function."}
  event-flags
  {:blocking-sync cudart/CU_EVENT_BLOCKING_SYNC
   :default cudart/CU_EVENT_DEFAULT
   :disable-timing cudart/CU_EVENT_DISABLE_TIMING
   :interprocess cudart/CU_EVENT_INTERPROCESS})

(def ^{:const true
       :doc "Available config for the [[core/cache-config!]] function."}
  func-cache-config
  {:prefer-none cudart/CU_FUNC_CACHE_PREFER_NONE
   :prefer-shared cudart/CU_FUNC_CACHE_PREFER_SHARED
   :prefer-L1 cudart/CU_FUNC_CACHE_PREFER_L1
   :prefer-equal cudart/CU_FUNC_CACHE_PREFER_EQUAL})

(defn dec-func-cache-config
  "Decodes an integer cache configuration `mode` into its keyword.
  Values other than 0 to 3 are returned unchanged."
  [^long mode]
  (case mode
    0 :prefer-none
    1 :prefer-shared
    2 :prefer-L1
    3 :prefer-equal
    mode))

(def ^{:const true
       :doc "Available jit options defined in the CUDA standard."}
  jit-options
  {:cache-mode cudart/CU_JIT_CACHE_MODE
   :cache-option-ca cudart/CU_JIT_CACHE_OPTION_CA
   :cache-option-cg cudart/CU_JIT_CACHE_OPTION_CG
   :cache-option-none cudart/CU_JIT_CACHE_OPTION_NONE
   :error-log-buffer cudart/CU_JIT_ERROR_LOG_BUFFER
   :error-log-buffer-size-bytes cudart/CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
   :fallback-strategy cudart/CU_JIT_FALLBACK_STRATEGY
   :fast-compile-strategy cudart/CU_JIT_FAST_COMPILE
   :fma cudart/CU_JIT_FMA
   :ftz cudart/CU_JIT_FTZ
   :generate-debug-info cudart/CU_JIT_GENERATE_DEBUG_INFO
   :generate-line-info cudart/CU_JIT_GENERATE_LINE_INFO
   :global-symbol-addresses cudart/CU_JIT_GLOBAL_SYMBOL_ADDRESSES
   :global-symbol-count cudart/CU_JIT_GLOBAL_SYMBOL_COUNT
   :global-symbol-names cudart/CU_JIT_GLOBAL_SYMBOL_NAMES
   :info-log-buffer cudart/CU_JIT_INFO_LOG_BUFFER
   :info-log-buffer-size-bytes cudart/CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
   :input-cubin cudart/CU_JIT_INPUT_CUBIN
   :input-fatbinary cudart/CU_JIT_INPUT_FATBINARY
   :input-library cudart/CU_JIT_INPUT_LIBRARY
   :input-nvvm cudart/CU_JIT_INPUT_NVVM
   :input-object cudart/CU_JIT_INPUT_OBJECT
   :input-ptx cudart/CU_JIT_INPUT_PTX
   :log-verbose cudart/CU_JIT_LOG_VERBOSE
   :lto cudart/CU_JIT_LTO
   :max-registers cudart/CU_JIT_MAX_REGISTERS
   :new-sm3x-opt cudart/CU_JIT_NEW_SM3X_OPT
   :num-input-types cudart/CU_JIT_NUM_INPUT_TYPES
   ;; Misspelled key kept so that existing callers continue to work.
   :num-input-tupes cudart/CU_JIT_NUM_INPUT_TYPES
   :num-options cudart/CU_JIT_NUM_OPTIONS
   :optimization-level cudart/CU_JIT_OPTIMIZATION_LEVEL
   :optimize-unused-device-variables cudart/CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES
   :position-independent-code cudart/CU_JIT_POSITION_INDEPENDENT_CODE
   :prec-div cudart/CU_JIT_PREC_DIV
   :prec-sqrt cudart/CU_JIT_PREC_SQRT
   :referenced-kernel-count cudart/CU_JIT_REFERENCED_KERNEL_COUNT
   :referenced-kernel-names cudart/CU_JIT_REFERENCED_KERNEL_NAMES
   :referenced-variable-count cudart/CU_JIT_REFERENCED_VARIABLE_COUNT
   :referenced-variable-names cudart/CU_JIT_REFERENCED_VARIABLE_NAMES
   :target cudart/CU_JIT_TARGET
   :target-from-cucontext cudart/CU_JIT_TARGET_FROM_CUCONTEXT
   :threads-per-block cudart/CU_JIT_THREADS_PER_BLOCK
   :wall-time cudart/CU_JIT_WALL_TIME})

(def ^{:const true
       :doc "Available jit input types defined in the CUDA standard."}
  jit-input-types
  {:cubin cudart/CU_JIT_INPUT_CUBIN
   :ptx cudart/CU_JIT_INPUT_PTX
   :fatbinary cudart/CU_JIT_INPUT_FATBINARY
   :object cudart/CU_JIT_INPUT_OBJECT
   :library cudart/CU_JIT_INPUT_LIBRARY
   :nvvm cudart/CU_JIT_INPUT_NVVM
   :num cudart/CU_JIT_NUM_INPUT_TYPES})

;; NOTE(review): some keywords in this map look misspelled or truncated
;; (:sdp-version-mismatch, :hardware-stack-errox, :binary-for-gpu,
;; :mapped-as-pointer). They are left as they are because callers may match on them.
(def ^{:const true
       :doc "CUDA Error messages as defined in CUresult."}
  cu-result-codes
  {cudart/CUDA_SUCCESS :success
   cudart/CUDA_ERROR_ALREADY_ACQUIRED :already-acquired
   cudart/CUDA_ERROR_ALREADY_MAPPED :already-mapped
   cudart/CUDA_ERROR_ARRAY_IS_MAPPED :array-is-mapped
   cudart/CUDA_ERROR_ASSERT :assert
   cudart/CUDA_ERROR_CAPTURED_EVENT :captured-event
   cudart/CUDA_ERROR_CDP_NOT_SUPPORTED :cdp-not-supported
   cudart/CUDA_ERROR_CDP_VERSION_MISMATCH :sdp-version-mismatch
   cudart/CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE :compat-not-supported-on-device
   cudart/CUDA_ERROR_CONTEXT_ALREADY_CURRENT :context-already-current
   cudart/CUDA_ERROR_CONTEXT_ALREADY_IN_USE :context-already-in-use
   cudart/CUDA_ERROR_CONTEXT_IS_DESTROYED :context-is-destroyed
   cudart/CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE :cooperative-launch-too-large
   cudart/CUDA_ERROR_DEINITIALIZED :deinitialized
   cudart/CUDA_ERROR_DEVICE_NOT_LICENSED :device-not-licensed
   cudart/CUDA_ERROR_DEVICE_UNAVAILABLE :unavailable
   cudart/CUDA_ERROR_ECC_UNCORRECTABLE :ecc-uncorrectable
   cudart/CUDA_ERROR_EXTERNAL_DEVICE :external-device
   cudart/CUDA_ERROR_FILE_NOT_FOUND :file-not-found
   cudart/CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE :graph-exec-update-failure
   cudart/CUDA_ERROR_HARDWARE_STACK_ERROR :hardware-stack-errox
   cudart/CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED :host-memory-already-registered
   cudart/CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED :host-memory-not-registered
   cudart/CUDA_ERROR_ILLEGAL_ADDRESS :illegal-address
   cudart/CUDA_ERROR_ILLEGAL_INSTRUCTION :illegal-instruction
   cudart/CUDA_ERROR_ILLEGAL_STATE :illegal-state
   cudart/CUDA_ERROR_INVALID_ADDRESS_SPACE :invalid-address-space
   cudart/CUDA_ERROR_INVALID_CLUSTER_SIZE :invalid-cluster-size
   cudart/CUDA_ERROR_INVALID_CONTEXT :invalid-context
   cudart/CUDA_ERROR_INVALID_DEVICE :invalid-device
   cudart/CUDA_ERROR_INVALID_GRAPHICS_CONTEXT :invalid-graphics-context
   cudart/CUDA_ERROR_INVALID_HANDLE :invalid-handle
   cudart/CUDA_ERROR_INVALID_IMAGE :invalid-image
   cudart/CUDA_ERROR_INVALID_PC :invalid-pc
   cudart/CUDA_ERROR_INVALID_PTX :invalid-ptx
   cudart/CUDA_ERROR_INVALID_SOURCE :invalid-source
   cudart/CUDA_ERROR_INVALID_VALUE :invalid-value
   cudart/CUDA_ERROR_JIT_COMPILATION_DISABLED :jit-compilation-disabled
   cudart/CUDA_ERROR_JIT_COMPILER_NOT_FOUND :jit-compiler-not-found
   cudart/CUDA_ERROR_LAUNCH_FAILED :launch-failed
   cudart/CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING :launch-incompatible-texturing
   cudart/CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES :launch-out-of-resources
   cudart/CUDA_ERROR_LAUNCH_TIMEOUT :launch-timeout
   cudart/CUDA_ERROR_MAP_FAILED :map-failed
   cudart/CUDA_ERROR_MISALIGNED_ADDRESS :misaligned-address
   cudart/CUDA_ERROR_MPS_CLIENT_TERMINATED :client-terminated
   cudart/CUDA_ERROR_MPS_CONNECTION_FAILED :connection-failed
   cudart/CUDA_ERROR_MPS_MAX_CLIENTS_REACHED :clients-reached
   cudart/CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED :connection-reached
   cudart/CUDA_ERROR_MPS_RPC_FAILURE :rpc-failure
   cudart/CUDA_ERROR_MPS_SERVER_NOT_READY :server-not-ready
   cudart/CUDA_ERROR_NO_BINARY_FOR_GPU :binary-for-gpu
   cudart/CUDA_ERROR_NO_DEVICE :no-device
   cudart/CUDA_ERROR_NOT_FOUND :not-found
   cudart/CUDA_ERROR_NOT_INITIALIZED :not-initialized
   cudart/CUDA_ERROR_NOT_MAPPED :not-mapped
   cudart/CUDA_ERROR_NOT_MAPPED_AS_ARRAY :not-mapped-as-array
   cudart/CUDA_ERROR_NOT_MAPPED_AS_POINTER :mapped-as-pointer
   cudart/CUDA_ERROR_NOT_READY :not-ready
   cudart/CUDA_ERROR_NOT_SUPPORTED :not-supported
   cudart/CUDA_ERROR_NVLINK_UNCORRECTABLE :nvlink-uncorrectable
   cudart/CUDA_ERROR_OPERATING_SYSTEM :operating-system
   cudart/CUDA_ERROR_OUT_OF_MEMORY :out-of-memory
   cudart/CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED :already-enabled
   cudart/CUDA_ERROR_PEER_ACCESS_NOT_ENABLED :access-not-enabled
   cudart/CUDA_ERROR_PEER_ACCESS_UNSUPPORTED :access-unsupported
   cudart/CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE :context-active
   cudart/CUDA_ERROR_PROFILER_ALREADY_STARTED :profiler-already-started
   cudart/CUDA_ERROR_PROFILER_ALREADY_STOPPED :profiler-already-stopped
   cudart/CUDA_ERROR_PROFILER_DISABLED :profiler-disabled
cudart/CUDA_ERROR_PROFILER_NOT_INITIALIZED :profiler-not-initialized
   cudart/CUDA_ERROR_SHARED_OBJECT_INIT_FAILED :shared-object-init-failed
   cudart/CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND :shared-object-symbol-not-found
   cudart/CUDA_ERROR_STREAM_CAPTURE_IMPLICIT :stream-capture-implicit
   cudart/CUDA_ERROR_STREAM_CAPTURE_INVALIDATED :stream-capture-invalidated
   cudart/CUDA_ERROR_STREAM_CAPTURE_ISOLATION :stream-capture-isolation
   cudart/CUDA_ERROR_STREAM_CAPTURE_MERGE :stream-capture-merge
   cudart/CUDA_ERROR_STREAM_CAPTURE_UNJOINED :stream-capture-unjoined
   cudart/CUDA_ERROR_STREAM_CAPTURE_UNMATCHED :stream-capture-unmatched
   cudart/CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED :stream-capture-unsupported
   cudart/CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD :stream-capture-wrong-thread
   cudart/CUDA_ERROR_STUB_LIBRARY :stub-library
   cudart/CUDA_ERROR_SYSTEM_DRIVER_MISMATCH :driver-mismatch
   cudart/CUDA_ERROR_SYSTEM_NOT_READY :system-not-ready
   cudart/CUDA_ERROR_TIMEOUT :timeout
   cudart/CUDA_ERROR_TOO_MANY_PEERS :too-many-peers
   cudart/CUDA_ERROR_UNKNOWN :unknown
   cudart/CUDA_ERROR_UNMAP_FAILED :unmap-failed
   cudart/CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC :devside-sync
   cudart/CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY :exec-affinity
   cudart/CUDA_ERROR_UNSUPPORTED_LIMIT :unsupported-limit
   cudart/CUDA_ERROR_UNSUPPORTED_PTX_VERSION :unsupported-ptx-version
   cudart/cudaErrorAddressOfConstant :address-of-constant
   cudart/cudaErrorApiFailureBase :failure-base
   cudart/cudaErrorCallRequiresNewerDriver :call-requires-newer-driver
   cudart/cudaErrorDuplicateSurfaceName :duplicate-surface-name
   cudart/cudaErrorDuplicateTextureName :duplicate-texture-name
   cudart/cudaErrorDuplicateVariableName :duplicate-variable-name
   cudart/cudaErrorIncompatibleDriverContext :incompatible-context
   cudart/cudaErrorInsufficientDriver :insufficient-driver
cudart/cudaErrorInvalidChannelDescriptor :invalid-channel-descriptor 294 | cudart/cudaErrorInvalidConfiguration :invalid-configuration 295 | cudart/cudaErrorInvalidDeviceFunction :invalid-device-function 296 | cudart/cudaErrorInvalidDevicePointer :invalid-device-pointer 297 | cudart/cudaErrorInvalidFilterSetting :invalid-filter-setting 298 | cudart/cudaErrorInvalidHostPointer :invalid-host-pointer 299 | cudart/cudaErrorInvalidMemcpyDirection :invalid-memcpy-direction 300 | cudart/cudaErrorInvalidNormSetting :invalid-norm-setting 301 | cudart/cudaErrorInvalidPitchValue :invalid-pitch-value 302 | cudart/cudaErrorInvalidSurface :invalid-surface 303 | cudart/cudaErrorInvalidSymbol :invalid-symbol 304 | cudart/cudaErrorInvalidTexture :invalid-texture 305 | cudart/cudaErrorInvalidTextureBinding :invalid-texture-binding 306 | cudart/cudaErrorLaunchFileScopedSurf :launch-file-scoped-surf 307 | cudart/cudaErrorLaunchFileScopedTex :launch-file-scoped-tex 308 | cudart/cudaErrorLaunchMaxDepthExceeded :max-depth-exceeded 309 | cudart/cudaErrorLaunchPendingCountExceeded :launch-pending-count-exceeded 310 | cudart/cudaErrorMemoryValueTooLarge :memory-value-too-large 311 | cudart/cudaErrorMissingConfiguration :missing-configuration 312 | cudart/cudaErrorMixedDeviceExecution :mixed-device-execution 313 | cudart/cudaErrorNotPermitted :not-permitted 314 | cudart/cudaErrorNotYetImplemented :not-yet-implemented 315 | cudart/cudaErrorPriorLaunchFailure :prior-launch-failure 316 | cudart/cudaErrorSoftwareValidityNotEstablished :software-validity-not-established 317 | cudart/cudaErrorStartupFailure :startup-failure 318 | cudart/cudaErrorSyncDepthExceeded :sync-depth-exceeded 319 | cudart/cudaErrorSynchronizationError :synchronization-error 320 | cudart/cudaErrorTextureFetchFailed :texture-fetch-failed 321 | cudart/cudaErrorTextureNotBound :texture-not-bound}) 322 | 323 | (def ^{:const true 324 | :doc "CUDA Error messages as defined in nvrtc."} 325 | nvrtc-result-codes 326 | 
{nvrtc/NVRTC_SUCCESS :success
   nvrtc/NVRTC_ERROR_BUILTIN_OPERATION_FAILURE :builtin-operation-failure
   nvrtc/NVRTC_ERROR_COMPILATION :compilation
   nvrtc/NVRTC_ERROR_INVALID_INPUT :invalid-input
   nvrtc/NVRTC_ERROR_INTERNAL_ERROR :internal-error
   nvrtc/NVRTC_ERROR_INVALID_OPTION :invalid-option
   nvrtc/NVRTC_ERROR_INVALID_PROGRAM :invalid-program
   nvrtc/NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID :name-expression-not-valid
   nvrtc/NVRTC_ERROR_OUT_OF_MEMORY :out-of-memory
   nvrtc/NVRTC_ERROR_PROGRAM_CREATION_FAILURE :program-creation-failure
   nvrtc/NVRTC_ERROR_TIME_FILE_WRITE_FAILED :time-file-write-failed})

--------------------------------------------------------------------------------
/src/clojure/uncomplicate/clojurecuda/internal/impl.clj:
--------------------------------------------------------------------------------
;; Copyright (c) Dragan Djuric. All rights reserved.
;; The use and distribution terms for this software are covered by the
;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
;; which can be found in the file LICENSE at the root of this distribution.
;; By using this software in any fashion, you are agreeing to be bound by
;; the terms of this license.
;; You must not remove this notice, or any other, from this software.

(ns ^{:author "Dragan Djuric"}
    uncomplicate.clojurecuda.internal.impl
  (:require [uncomplicate.commons
             [core :refer [with-release let-release Releaseable release info Bytes bytesize Entries
                           size* size]]
             [utils :as cu :refer [dragan-says-ex]]]
            [uncomplicate.fluokitten.protocols :refer [Comonad extract]]
            [uncomplicate.clojure-cpp :as cpp
             :refer [put-entry! pointer safe int-pointer pointer-pointer byte-pointer size-t-pointer
                     get-entry get-string null?
long-pointer PointerCreator TypedPointerCreator 19 | clong-pointer short-pointer char-pointer double-pointer float-pointer pointer-seq 20 | capacity! address Accessor get! put! get-keyword]] 21 | [uncomplicate.clojurecuda.internal 22 | [constants :refer [cu-result-codes jit-input-types jit-options nvrtc-result-codes]] 23 | [utils :refer [with-check]]] 24 | [clojure.core.async :refer [go >!]]) 25 | (:import java.nio.file.Path 26 | java.io.File 27 | [clojure.lang IFn AFn Seqable] 28 | [org.bytedeco.javacpp Pointer BytePointer PointerPointer LongPointer IntPointer] 29 | [org.bytedeco.cuda.global cudart nvrtc] 30 | [org.bytedeco.cuda.cudart CUctx_st CUstream_st CUevent_st CUmod_st CUlinkState_st] 31 | org.bytedeco.cuda.nvrtc._nvrtcProgram 32 | [uncomplicate.clojure_cpp StringPointer KeywordPointer] 33 | [uncomplicate.clojurecuda.internal.javacpp CUHostFn CUStreamCallback])) 34 | 35 | (defprotocol CUPointer 36 | (cu-address* [this]) 37 | (device? [this])) 38 | 39 | (defprotocol Parameter 40 | (set-parameter* [this pp i])) 41 | 42 | (extend-type Object 43 | Parameter 44 | (set-parameter* [parameter pp i] 45 | (put-entry! pp i (pointer parameter)))) 46 | 47 | ;; ==================== Release resources ======================= 48 | 49 | (deftype CUDevice [^int dev] 50 | Object 51 | (hashCode [_] 52 | dev) 53 | (equals [_ y] 54 | (and (instance? 
CUDevice y) (= dev (.dev ^CUDevice y)))) 55 | (toString [_] 56 | (format "#Device[:cuda, %d]" dev)) 57 | Comonad 58 | (extract [_] 59 | dev)) 60 | 61 | (extend-type CUctx_st 62 | Releaseable 63 | (release [this] 64 | (locking this 65 | (cudart/cuCtxDestroy this) 66 | (.deallocate this) 67 | (.setNull this) 68 | true))) 69 | 70 | (extend-type CUstream_st 71 | Releaseable 72 | (release [this] 73 | (locking this 74 | (cudart/cuStreamDestroy this) 75 | (.deallocate this) 76 | (.setNull this) 77 | true))) 78 | 79 | (extend-type CUevent_st 80 | Releaseable 81 | (release [this] 82 | (locking this 83 | (cudart/cuEventDestroy this) 84 | (.deallocate this) 85 | (.setNull this) 86 | true))) 87 | 88 | (extend-type CUmod_st 89 | Releaseable 90 | (release [this] 91 | (locking this 92 | (cudart/cuModuleUnload this) 93 | (.deallocate this) 94 | (.setNull this) 95 | true))) 96 | 97 | (extend-type CUlinkState_st 98 | Releaseable 99 | (release [this] 100 | (locking this 101 | (cudart/cuLinkDestroy this) 102 | (.deallocate this) 103 | (.setNull this) 104 | true))) 105 | 106 | (extend-type _nvrtcProgram 107 | Releaseable 108 | (release [this] 109 | (locking this 110 | (nvrtc/nvrtcDestroyProgram this) 111 | (.deallocate this) 112 | (.setNull this) 113 | true))) 114 | 115 | ;; ================== Module Management ===================================== 116 | 117 | (defprotocol ModuleLoad 118 | (module-load* [data m]) 119 | (link-add* [data link-state type opts vals])) 120 | 121 | (defn enc-jit-options [options] 122 | (map (fn [[option value]] 123 | [(or (jit-options option) 124 | (throw (ex-info "Unknown jit option." {:option option :available jit-options}))) 125 | (safe (pointer value))]) 126 | options)) 127 | 128 | (defn check-options [^IntPointer options ^Pointer option-values] 129 | (when-not (= (size options) (size option-values)) 130 | (throw (ex-info "Inconsistent number of options provided." 
{:requested (size options) :provided (size option-values)}))))

(defn link-add-data* [^CUlinkState_st link-state type ^Pointer data ^String name
                      ^IntPointer options ^Pointer option-values]
  (let [type (int (or (jit-input-types type)
                      (throw (ex-info "Invalid jit input type."
                                      {:type type :available jit-input-types}))))]
    (check-options options option-values)
    (with-check (cudart/cuLinkAddData link-state type data (bytesize data) name
                                      (size options) options option-values)
      {:data data} link-state)))

(defn link-add-file* [^CUlinkState_st link-state type ^String file-name
                      ^IntPointer options ^Pointer option-values]
  (let [type (int (or (jit-input-types type)
                      (throw (ex-info "Invalid jit input type."
                                      {:type type :available jit-input-types}))))]
    (check-options options option-values)
    (with-check (cudart/cuLinkAddFile link-state type file-name
                                      (size options) options option-values)
      {:file file-name} link-state)))

(defn link*
  "Creates a pending linker invocation in `link-state` using the jit `options`
  (keyword/value pairs encoded by `enc-jit-options`), then adds every entry of `data` to it.
  Each entry of `data` is a `[type d options name]` tuple: when `name` is present, `d` is
  added as named data through `link-add-data*`; otherwise the call is dispatched on `d`
  through `link-add*`. The per-entry options slot is not used here.
  Returns `link-state`.
  "
  [^CUlinkState_st link-state data options]
  ;; `enc-jit-options` yields one [code value-pointer] pair per option, while the driver
  ;; expects the codes and the values as two parallel arrays, so the pairs are split
  ;; column-wise. With no options both columns stay nil, as they were before.
  ;; NOTE(review): assumes `int-pointer` / `pointer-pointer` accept a vector - confirm.
  (let [encoded (enc-jit-options options)
        opts (not-empty (mapv first encoded))
        vals (not-empty (mapv second encoded))]
    (let-release [opts (int-pointer opts)
                  vals (pointer-pointer vals)]
      (with-check (cudart/cuLinkCreate (size opts) opts ^PointerPointer vals link-state)
        (doseq [[type d _entry-options name] data]
          (if name
            (link-add-data* link-state type d name opts vals)
            (link-add* d link-state type opts vals))))))
  link-state)

(extend-type String
  ModuleLoad
  (module-load* [data m]
    (with-check (cudart/cuModuleLoadData ^CUmod_st m (byte-pointer data)) {:data data} m))
  (link-add* [data link-state type opts vals]
    (link-add-data* link-state type (byte-pointer data) "unnamed" opts vals)))

(extend-type Pointer
  ModuleLoad
  (module-load* [data m]
    (with-check (cudart/cuModuleLoadData m data)
      {:data data} m))
  (link-add* [data link-state
type opts vals]
    (link-add-data* link-state type data "unnamed" opts vals)))

(extend-type Path
  ModuleLoad
  (module-load* [file-path m]
    (let [file-name (.toString file-path)]
      (with-check (cudart/cuModuleLoad ^CUmod_st m (str file-name)) {:file (str file-path)} m)))
  (link-add* [file-path link-state type opts vals]
    (link-add-file* link-state type (.toString file-path) opts vals)))

(extend-type File
  ModuleLoad
  (module-load* [file m]
    (with-check (cudart/cuModuleLoad ^CUmod_st m (str file)) {:file (str file)} m))
  (link-add* [file link-state type opts vals]
    (link-add-file* link-state type (.toString file) opts vals)))

;; ====================== Nvrtc program JIT ========================================

(defn ^:private nvrtc-error
  "Converts a CUDA Nvrtc error code to an ExceptionInfo with richer, user-friendly information."
  ([^long err-code details]
   (let [err (get nvrtc-result-codes err-code err-code)]
     (ex-info (format "NVRTC error: %s." err)
              {:name err :code err-code :type :nvrtc-error :details details})))
  ([err-code]
   (nvrtc-error err-code nil)))

(defmacro ^:private with-check-nvrtc
  "Evaluates `form` if `err-code` is zero (`:success`), otherwise throws
  an appropriate `ExceptionInfo` with decoded informative details.
  It helps with CUDA nvrtc methods that return error codes directly, while
  returning computation results through mutating arguments.
211 | " 212 | ([err-code form] 213 | `(cu/with-check nvrtc-error ~err-code ~form))) 214 | 215 | (defn program* 216 | [^BytePointer name ^BytePointer source-code 217 | ^PointerPointer source-headers ^PointerPointer include-names] 218 | (let-release [res (_nvrtcProgram.)] 219 | (with-check-nvrtc 220 | (nvrtc/nvrtcCreateProgram res source-code name 221 | (size source-headers) source-headers include-names) 222 | res))) 223 | 224 | (defn program-log* 225 | "Returns the log string generated by the previous compilation of `program`." 226 | [^_nvrtcProgram program] 227 | (with-release [log-size (size-t-pointer 1)] 228 | (with-check-nvrtc (nvrtc/nvrtcGetProgramLogSize program log-size) 229 | (with-release [log (byte-pointer (get-entry log-size 0))] 230 | (with-check-nvrtc (nvrtc/nvrtcGetProgramLog program log) (get-string log)))))) 231 | 232 | (defn compile* 233 | "Compiles the given `program` using an array of string `options`." 234 | ([^_nvrtcProgram program ^PointerPointer options] 235 | (let [err (nvrtc/nvrtcCompileProgram program (size options) options)] 236 | (if (= 0 err) 237 | program 238 | (throw (nvrtc-error err (program-log* program))))))) 239 | 240 | (defn ptx* 241 | "Returns the PTX generated by the previous compilation of `program`." 
242 | [^_nvrtcProgram program] 243 | (with-release [ptx-size (size-t-pointer 1)] 244 | (with-check-nvrtc (nvrtc/nvrtcGetPTXSize program ptx-size) 245 | (let-release [ptx (byte-pointer (get-entry ptx-size 0))] 246 | (with-check-nvrtc (nvrtc/nvrtcGetPTX program ptx) 247 | ptx))))) 248 | 249 | (extend-type _nvrtcProgram 250 | ModuleLoad 251 | (module-load* [program m] 252 | (with-check (cudart/cuModuleLoadData ^CUmod_st m (ptx* program)) m)) 253 | (link-add* [program link-state type opts vals] 254 | (link-add-data* link-state type (ptx* program) "unnamed" opts vals))) 255 | 256 | ;; =================== Context Management ================================== 257 | 258 | (defn context* 259 | "Creates a CUDA context on the `device` using a raw integer `flag`. 260 | For available flags, see [[constants/ctx-flags]]. 261 | " 262 | [^long dev ^long flags] 263 | (let [res (CUctx_st.)] 264 | (with-check (cudart/cuCtxCreate res flags dev) 265 | {:dev (info dev) :flags flags} 266 | res))) 267 | 268 | (defn current-context* 269 | "If `ctx` is provided, bounds it as current. Returns the CUDA context bound to the calling CPU thread." 
270 | ([] 271 | (let [ctx (CUctx_st.)] 272 | (with-check (cudart/cuCtxGetCurrent ctx) ctx))) 273 | ([^CUctx_st ctx] 274 | (with-check (cudart/cuCtxSetCurrent ctx) ctx))) 275 | 276 | ;; ==================== Linear memory ================================================ 277 | 278 | (defprotocol MemSet 279 | (memset* [this dptr n] [this dptr n hstream])) 280 | 281 | (extend-type Byte 282 | MemSet 283 | (memset* 284 | ([this dptr n] 285 | (with-check (cudart/cuMemsetD8 dptr this n) dptr)) 286 | ([this dptr n hstream] 287 | (with-check (cudart/cuMemsetD8Async dptr this n hstream) dptr)))) 288 | 289 | (extend-type Short 290 | MemSet 291 | (memset* 292 | ([this dptr n] 293 | (with-check (cudart/cuMemsetD16 dptr this n) dptr)) 294 | ([this dptr n hstream] 295 | (with-check (cudart/cuMemsetD16Async dptr this n hstream) dptr)))) 296 | 297 | (extend-type Integer 298 | MemSet 299 | (memset* 300 | ([this dptr n] 301 | (with-check (cudart/cuMemsetD32 dptr this n) dptr)) 302 | ([this dptr n hstream] 303 | (with-check (cudart/cuMemsetD32Async dptr this n hstream) dptr)))) 304 | 305 | (extend-type Float 306 | MemSet 307 | (memset* 308 | ([this dptr n] 309 | (with-check (cudart/cuMemsetD32 dptr (Float/floatToIntBits this) n) dptr)) 310 | ([this dptr n hstream] 311 | (with-check (cudart/cuMemsetD32Async dptr (Float/floatToIntBits this) n hstream) dptr)))) 312 | 313 | (extend-type Double 314 | MemSet 315 | (memset* 316 | ([this dptr n] 317 | (if (= 0.0 this) 318 | (with-check (cudart/cuMemsetD32 dptr (int 0) (* 2 (long n))) dptr) 319 | (dragan-says-ex "Only zeroes are suported in double memset! function." {:value this}))) 320 | ([this dptr n hstream] 321 | (if (= 0.0 this) 322 | (with-check (cudart/cuMemsetD32Async dptr (int 0) (* 2 (long n)) hstream) dptr) 323 | (dragan-says-ex "Only zeroes are suported in double memset! function." 
{:value this}))))) 324 | 325 | (extend-type Long 326 | MemSet 327 | (memset* 328 | ([this dptr n] 329 | (if (= 0 this) 330 | (with-check (cudart/cuMemsetD32 dptr (int 0) (* 2 (long n))) dptr) 331 | (dragan-says-ex "Only zeroes are suported in long memset! function." {:value this}))) 332 | ([this dptr n hstream] 333 | (if (= 0 this) 334 | (with-check (cudart/cuMemsetD32Async dptr (int 0) (* 2 (long n)) hstream) dptr) 335 | (dragan-says-ex "Only zeroes are suported in long memset! function." {:value this}))))) 336 | 337 | (defprotocol Memcpy 338 | "An object that represents memory that participates in CUDA operations. 339 | It can be on the device, or on the host. Built-in implementations: 340 | CUDA pointers, JavaCPP pointers, Java primitive arrays, etc. 341 | " 342 | (memcpy-host* [dst src size] [dst src size hstream]) 343 | (memcpy* [dst src size] [dst src size hstream])) 344 | 345 | (defn offset ^long [dptr ^long offset] 346 | (if (<= 0 offset (bytesize dptr)) 347 | (+ (long (cu-address* dptr)) offset) 348 | (dragan-says-ex "Requested bytes are out of the bounds of this device pointer." 349 | {:offset offset :size (bytesize dptr)}))) 350 | 351 | (deftype CUDevicePtr [^LongPointer daddr ^long byte-size master] 352 | Object 353 | (hashCode [_] 354 | (hash-combine (hash daddr) byte-size)) 355 | (equals [_ y] 356 | (and (instance? CUDevicePtr y) (= (get-entry daddr 0) (cu-address* y)))) 357 | (toString [_] 358 | (format "#DevicePtr[:cuda, 0x%x, %d bytes]" (get-entry daddr 0) byte-size)) 359 | Releaseable 360 | (release [_] 361 | (locking daddr 362 | (when-not (null? daddr) 363 | (when master 364 | (with-check (cudart/cuMemFree (get-entry daddr 0)) true)) 365 | (release daddr)) 366 | true)) 367 | Comonad 368 | (extract [_] 369 | (extract daddr)) 370 | CUPointer 371 | (cu-address* [_] 372 | (get-entry daddr 0)) 373 | (device? 
[_] 374 | true) 375 | Bytes 376 | (bytesize* [_] 377 | byte-size) 378 | Entries 379 | (size* [_] 380 | byte-size) 381 | (sizeof* [_] 382 | Byte/BYTES) 383 | Parameter 384 | (set-parameter* [_ pp i] 385 | (put-entry! pp i daddr)) 386 | Memcpy 387 | (memcpy-host* [this src byte-count] 388 | (with-check 389 | (cudart/cuMemcpyHtoD (get-entry daddr 0) (safe (pointer src)) byte-count) 390 | this)) 391 | (memcpy-host* [this src byte-count hstream] 392 | (with-check 393 | (cudart/cuMemcpyHtoDAsync (get-entry daddr 0) (safe (pointer src)) byte-count hstream) 394 | this)) 395 | (memcpy* [this src byte-count] 396 | (with-check 397 | (cudart/cuMemcpy (get-entry daddr 0) (cu-address* src) byte-count) 398 | this)) 399 | (memcpy* [this src byte-count hstream] 400 | (with-check 401 | (cudart/cuMemcpyAsync (get-entry daddr 0) (cu-address* src) byte-count hstream) 402 | this))) 403 | 404 | (defn mem-alloc-managed* 405 | ([^long size ^long flag] 406 | (let-release [daddr (long-pointer 1)] 407 | (with-check (cudart/cuMemAllocManaged daddr size flag) 408 | (->CUDevicePtr daddr size true))))) 409 | 410 | ;; =================== Runtime Memory =============================================== 411 | 412 | (defn cupointer-memcpy* 413 | ([dst src ^long byte-count] 414 | (with-check 415 | (if (instance? Pointer src) 416 | (cudart/cudaMemcpy (safe (pointer dst)) (extract src) byte-count cudart/cudaMemcpyDefault) 417 | (cudart/cuMemcpy (cu-address* dst) (cu-address* src) byte-count)) 418 | dst)) 419 | ([dst src ^long byte-count hstream] 420 | (with-check 421 | (if (instance? 
Pointer src)
       (cudart/cudaMemcpyAsync (safe (pointer dst)) (extract src)
                               byte-count cudart/cudaMemcpyDefault hstream)
       (cudart/cuMemcpyAsync (cu-address* dst) (cu-address* src) byte-count hstream))
     dst)))

(defn offset-address [^Pointer p]
  (+ (.address (safe p)) (* (.sizeof p) (.position p))))

(deftype CURuntimePtr [^Pointer dptr master]
  Object
  (hashCode [_]
    (hash dptr))
  (equals [_ y]
    ;; Two runtime pointers are equal when they wrap equal pointers. The previous
    ;; three-argument `=` also compared against the literal 0 and so never returned true.
    (and (instance? CURuntimePtr y) (= dptr (.-dptr ^CURuntimePtr y))))
  (toString [this]
    (format "#RuntimePtr[:cuda, 0x%x, %d bytes]" (cu-address* this) (bytesize dptr)))
  Releaseable
  (release [_]
    (locking dptr
      (when-not (null? dptr)
        ;; Only the owning (master) wrapper frees the memory.
        (when master
          (with-check (cudart/cudaFree (.position dptr 0)) (.setNull dptr))))
      true))
  Comonad
  (extract [_]
    (offset-address dptr))
  CUPointer
  (cu-address* [_]
    (offset-address dptr))
  (device? [_]
    true)
  PointerCreator
  (pointer* [_]
    dptr)
  (pointer* [_ i]
    (pointer dptr i))
  TypedPointerCreator
  (byte-pointer [_]
    (byte-pointer dptr))
  (clong-pointer [_]
    (clong-pointer dptr))
  (size-t-pointer [_]
    (clong-pointer dptr))
  (pointer-pointer [_]
    (pointer-pointer dptr))
  (char-pointer [_]
    (char-pointer dptr))
  (short-pointer [_]
    (short-pointer dptr))
  (int-pointer [_]
    (int-pointer dptr))
  (long-pointer [_]
    (long-pointer dptr))
  (float-pointer [_]
    (float-pointer dptr))
  (double-pointer [_]
    (double-pointer dptr))
  Bytes
  (bytesize* [_]
    (bytesize dptr))
  Entries
  (size* [_]
    (size* dptr))
  (sizeof* [_]
    (.sizeof dptr))
  Seqable
  (seq [_]
    (pointer-seq dptr))
  Parameter
  (set-parameter* [this pp i]
    (put-entry!
pp i (pointer (offset-address dptr)))) 493 | Memcpy 494 | (memcpy-host* [this src byte-count] 495 | (with-check 496 | (cudart/cuMemcpyHtoD (offset-address dptr) (safe (pointer src)) byte-count) 497 | this)) 498 | (memcpy-host* [this src byte-count hstream] 499 | (with-check 500 | (cudart/cuMemcpyHtoDAsync (offset-address dptr) (safe (pointer src)) byte-count hstream) 501 | this)) 502 | (memcpy* [this src byte-count] 503 | (cupointer-memcpy* this src byte-count)) 504 | (memcpy* [this src byte-count hstream] 505 | (cupointer-memcpy* this src byte-count hstream))) 506 | 507 | (defn malloc-runtime* 508 | ([^long size] 509 | (let-release [p (byte-pointer nil)] 510 | (with-check (cudart/cudaMalloc p size) 511 | (->CURuntimePtr (capacity! p size) true)))) 512 | ([^long size pointer-type] 513 | (let-release [p (byte-pointer nil)] 514 | (with-check (cudart/cudaMalloc p size) 515 | (->CURuntimePtr (pointer-type (capacity! p size)) true))))) 516 | 517 | ;; =================== Pinned Memory ================================================ 518 | 519 | (defn free-pinned [hp] 520 | (with-check (cudart/cuMemFreeHost hp) (release hp))) 521 | 522 | (defn unregister-pinned [hp] 523 | (with-check (cudart/cuMemHostUnregister hp) hp)) 524 | 525 | (deftype CUPinnedPtr [^Pointer hptr master release-fn] 526 | Object 527 | (hashCode [_] 528 | (hash hptr)) 529 | (equals [this y] 530 | (and (instance? CUPinnedPtr y) (= (offset-address hptr) (cu-address* y)))) 531 | (toString [this] 532 | (format "#PinnedPtr[:cuda, 0x%x, %d bytes]" (offset-address hptr) (bytesize hptr))) 533 | Releaseable 534 | (release [_] 535 | (locking hptr 536 | (when-not (null? hptr) 537 | (when master 538 | (release-fn (.position hptr 0)))) 539 | true)) 540 | Comonad 541 | (extract [_] 542 | (extract hptr)) 543 | CUPointer 544 | (cu-address* [_] 545 | (offset-address hptr)) 546 | (device? 
[_] 547 | false) 548 | PointerCreator 549 | (pointer* [_] 550 | hptr) 551 | (pointer* [_ i] 552 | (pointer hptr i)) 553 | TypedPointerCreator 554 | (byte-pointer [_] 555 | (byte-pointer hptr)) 556 | (clong-pointer [_] 557 | (clong-pointer hptr)) 558 | (size-t-pointer [_] 559 | (clong-pointer hptr)) 560 | (pointer-pointer [_] 561 | (pointer-pointer hptr)) 562 | (char-pointer [_] 563 | (char-pointer hptr)) 564 | (short-pointer [_] 565 | (short-pointer hptr)) 566 | (int-pointer [_] 567 | (int-pointer hptr)) 568 | (long-pointer [_] 569 | (long-pointer hptr)) 570 | (float-pointer [_] 571 | (float-pointer hptr)) 572 | (double-pointer [_] 573 | (double-pointer hptr)) 574 | Bytes 575 | (bytesize* [_] 576 | (bytesize hptr)) 577 | Entries 578 | (size* [_] 579 | (size* hptr)) 580 | (sizeof* [_] 581 | (.sizeof hptr)) 582 | Seqable 583 | (seq [_] 584 | (pointer-seq hptr)) 585 | Accessor 586 | (get-entry [_] 587 | (get-entry hptr)) 588 | (get-entry [_ i] 589 | (get-entry hptr i)) 590 | (put-entry! [this value] 591 | (put-entry! hptr value) 592 | this) 593 | (put-entry! [this i value] 594 | (put-entry! hptr i value) 595 | this) 596 | (get! [_ arr] 597 | (get! hptr arr) 598 | arr) 599 | (get! [_ arr offset length] 600 | (get! hptr arr offset length) 601 | arr) 602 | (put! [this obj] 603 | (put! hptr obj) 604 | this) 605 | (put! [_ obj offset length] 606 | (put! hptr obj offset length)) 607 | Parameter 608 | (set-parameter* [_ pp i] 609 | (put-entry! 
pp i (pointer (offset-address hptr))))
  Memcpy
  ;; Copies `byte-count` bytes from the device memory behind `src` into this pinned host
  ;; memory. The source must be the address of `src`; the previous code passed this
  ;; pointer's own address and ignored `src` altogether.
  (memcpy-host* [this src byte-count]
    (with-check (cudart/cuMemcpyDtoH hptr (cu-address* src) byte-count) this))
  (memcpy-host* [this src byte-count hstream]
    (with-check (cudart/cuMemcpyDtoHAsync hptr (cu-address* src) byte-count hstream) this))
  (memcpy* [this src byte-count]
    (cupointer-memcpy* this src byte-count))
  (memcpy* [this src byte-count hstream]
    (cupointer-memcpy* this src byte-count hstream)))

(defn mem-host-alloc*
  "Allocates `byte-size` bytes of page-locked memory, 'pinned' on the host, using raw integer `flags`.
  For available flags, see [[constants/mem-host-alloc-flags]]. The memory is not initialized.
  `byte-size` must be greater than `0`.
  See [cuMemHostAlloc](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html).
  "
  ([^long byte-size ^long flags]
   (let-release [p (byte-pointer nil)]
     (with-check (cudart/cuMemHostAlloc p byte-size flags)
       (->CUPinnedPtr (capacity! p byte-size) true free-pinned))))
  ([^long byte-size ^long flags pointer-type]
   (let-release [p (byte-pointer nil)]
     (with-check (cudart/cuMemHostAlloc p byte-size flags)
       (->CUPinnedPtr (pointer-type (capacity! p byte-size)) true free-pinned)))))

(defn mem-host-register*
  "Registers previously allocated host `Pointer` and pins it, using raw integer `flags`.
  When `pointer-type` is provided, the registered pointer is wrapped through that
  typed-pointer constructor.
  See [cuMemHostRegister](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html).
  "
  ([hptr ^long flags]
   (with-check (cudart/cuMemHostRegister hptr (bytesize hptr) flags)
     (->CUPinnedPtr hptr true unregister-pinned)))
  ([hptr ^long flags pointer-type]
   (with-check (cudart/cuMemHostRegister hptr (bytesize hptr) flags)
     ;; `hptr` is already allocated, so it is converted as it is. The previous code
     ;; handed the `size` function itself to `capacity!` instead of a number.
     ;; NOTE(review): assumes `pointer-type` keeps the extent of `hptr` - confirm.
     (let [tp (pointer-type hptr)]
       (->CUPinnedPtr tp true unregister-pinned)))))

(deftype CUMappedPtr [^Pointer hptr master]
  Object
  (hashCode [_]
    (hash hptr))
  (equals [this y]
    (and (instance? CUMappedPtr y) (= (cu-address* this) (cu-address* y))))
  (toString [this]
    (format "#MappedPtr[:cuda, 0x%x, %d bytes]" (cu-address* this) (bytesize hptr)))
  Releaseable
  (release [_]
    (locking hptr
      (when-not (null? hptr)
        ;; Only the owning (master) wrapper frees the host allocation.
        (when master
          (with-check (cudart/cuMemFreeHost (.position hptr 0))
            (release hptr))))
      true))
  Comonad
  (extract [_]
    (offset-address hptr))
  CUPointer
  (cu-address* [_]
    (offset-address hptr))
  (device? [_]
    false)
  PointerCreator
  (pointer* [_]
    hptr)
  (pointer* [_ i]
    (pointer hptr i))
  TypedPointerCreator
  (byte-pointer [_]
    (byte-pointer hptr))
  (clong-pointer [_]
    (clong-pointer hptr))
  (size-t-pointer [_]
    (clong-pointer hptr))
  (pointer-pointer [_]
    (pointer-pointer hptr))
  (char-pointer [_]
    (char-pointer hptr))
  (short-pointer [_]
    (short-pointer hptr))
  (int-pointer [_]
    (int-pointer hptr))
  (long-pointer [_]
    (long-pointer hptr))
  (float-pointer [_]
    (float-pointer hptr))
  (double-pointer [_]
    (double-pointer hptr))
  Bytes
  (bytesize* [_]
    (bytesize hptr))
  Entries
  (size* [_]
    (size* hptr))
  (sizeof* [_]
    (.sizeof hptr))
  Seqable
  (seq [_]
    (pointer-seq hptr))
  Accessor
  (get-entry [_]
    (get-entry hptr))
  (get-entry [_ i]
    (get-entry hptr i))
  (put-entry! [this value]
    (put-entry! hptr value)
    this)
  (put-entry! [this i value]
    (put-entry! hptr i value)
    this)
  (get! [_ arr]
    (get! hptr arr)
    arr)
  (get! [_ arr offset length]
    (get!
hptr arr offset length) 724 | arr) 725 | (put! [this obj] 726 | (put! hptr obj) 727 | this) 728 | (put! [_ obj offset length] 729 | (put! hptr obj offset length)) 730 | Parameter 731 | (set-parameter* [_ pp i] 732 | (put-entry! pp i (pointer (offset-address hptr)))) 733 | Memcpy 734 | (memcpy-host* [this src byte-count] 735 | (if (device? src) 736 | (with-check (cudart/cuMemcpy (offset-address hptr) (cu-address* src) byte-count) this) 737 | (cpp/memcpy! (safe (pointer src)) (extract hptr))) 738 | this) 739 | (memcpy-host* [this src byte-count hstream] 740 | (with-check 741 | (if (device? src) 742 | (cudart/cuMemcpyAsync (offset-address hptr) (cu-address* src) byte-count hstream) 743 | (cudart/cuMemcpyHtoDAsync (offset-address hptr) (safe (pointer src)) byte-count hstream)) 744 | this)) 745 | (memcpy* [this src byte-count] 746 | (cupointer-memcpy* this src byte-count)) 747 | (memcpy* [this src byte-count hstream] 748 | (cupointer-memcpy* this src byte-count hstream))) 749 | 750 | (defn mem-alloc-host* 751 | "Allocates `byte-size` bytes of page-locked memory, 'mapped' to the device. 752 | For available flags, see [constants/mem-host-alloc-flags] 753 | The memory is not initialized. `byte-size` must be greater than `0`. 754 | See [cuMemAllocHost](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html). 755 | " 756 | ([^long byte-size] 757 | (let-release [p (byte-pointer nil)] 758 | (with-check (cudart/cuMemAllocHost p byte-size) 759 | (->CUMappedPtr (capacity! p byte-size) true)))) 760 | ([^long byte-size pointer-type] 761 | (let-release [p (byte-pointer nil)] 762 | (with-check (cudart/cuMemAllocHost p byte-size) 763 | (->CUMappedPtr (pointer-type (capacity! p byte-size)) true))))) 764 | 765 | ;; =============== Host memory ================================= 766 | 767 | (extend-type Pointer 768 | CUPointer 769 | (cu-address* [this] 770 | (offset-address this)) 771 | (device? 
[_] 772 | false) 773 | Parameter 774 | (set-parameter* [parameter pp i] 775 | (put-entry! pp i (pointer (offset-address parameter)))) 776 | Memcpy 777 | (memcpy-host* ; cudaMemcpy/cudaMemcpyAsync take (dst, src, count, kind[, stream]) 778 | ([this src byte-count] 779 | (with-check 780 | (if (instance? Pointer src) 781 | (cudart/cudaMemcpy (extract this) (safe (pointer src)) byte-count cudart/cudaMemcpyDefault) 782 | (cudart/cuMemcpyDtoH (extract this) (cu-address* src) byte-count)) 783 | this)) 784 | ([this src byte-count hstream] 785 | (with-check 786 | (if (instance? Pointer src) 787 | (cudart/cudaMemcpyAsync (extract this) (safe (pointer src)) 788 | byte-count cudart/cudaMemcpyDefault hstream) 789 | (cudart/cuMemcpyDtoHAsync (extract this) (cu-address* src) byte-count hstream)) 790 | this))) 791 | (memcpy* 792 | ([this src byte-count] 793 | (with-check 794 | (if (instance? Pointer src) 795 | (cudart/cudaMemcpy (extract this) (safe (pointer src)) byte-count cudart/cudaMemcpyDefault) 796 | (cudart/cuMemcpy (offset-address (extract this)) (cu-address* src) byte-count)) 797 | this)) 798 | ([this src byte-count hstream] 799 | (with-check 800 | (if (instance? Pointer src) 801 | (cudart/cudaMemcpyAsync (extract this) (safe (pointer src)) 802 | byte-count cudart/cudaMemcpyDefault hstream) 803 | (cudart/cuMemcpyAsync (offset-address (extract this)) (cu-address* src) byte-count hstream)) 804 | this)))) 805 | 806 | ;; ================== Stream Management ====================================== 807 | 808 | (defn stream* 809 | "Create a stream using an optional `priority` and an integer `flag`. 810 | See [cuStreamCreate](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html) 811 | " 812 | ([^long flag] 813 | (let [res (CUstream_st.)] 814 | (with-check (cudart/cuStreamCreate res flag) res))) 815 | ([^long priority ^long flag] 816 | (let [res (CUstream_st.)] 817 | (with-check (cudart/cuStreamCreateWithPriority res flag priority) res)))) 818 | 819 | (defn ready* 820 | "Determines status (ready or not) of a compute stream or event.
821 | See [cuStreamQuery](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html), 822 | and [cuEventQuery](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html) 823 | " 824 | [obj] 825 | (condp instance? obj ; `case` does not evaluate its test constants, so class symbols never matched 826 | CUstream_st (cudart/cuStreamQuery obj) 827 | CUevent_st (cudart/cuEventQuery obj) 828 | cudart/CUDA_ERROR_NOT_READY)) 829 | 830 | (defrecord StreamCallbackInfo [status data]) 831 | 832 | (deftype StreamCallback [ch] 833 | IFn 834 | (invoke [_ _ status data] 835 | (go (>! ch (->StreamCallbackInfo (get cu-result-codes status status) (extract data))))) 836 | (applyTo [this xs] 837 | (AFn/applyToHelper this xs))) 838 | 839 | (defprotocol HostFn 840 | (host-fn* [type ch])) 841 | 842 | (extend-type KeywordPointer 843 | HostFn 844 | (host-fn* [_ ch] 845 | (fn [data] 846 | (go (>! ch (get-keyword (byte-pointer data))))))) 847 | 848 | (extend-type StringPointer 849 | HostFn 850 | (host-fn* [_ ch] 851 | (fn [data] 852 | (go (>! ch (get-string (byte-pointer data))))))) 853 | 854 | (extend-type Pointer 855 | HostFn 856 | (host-fn* [_ ch] 857 | (fn [data] 858 | (go (>! ch data))))) 859 | 860 | (defn add-host-fn* 861 | [^CUstream_st hstream ^IFn f ^Pointer data] 862 | (let-release [hostfn (CUHostFn. f)] 863 | (with-check (cudart/cuLaunchHostFunc hstream hostfn data) 864 | hstream))) 865 | 866 | (defn attach-mem* 867 | "Attach memory of `byte-size`, specified by an integer `flag` to a `hstream` asynchronously. 868 | See [cuStreamAttachMemAsync](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html). 869 | " 870 | ([^CUstream_st hstream mem byte-size flag] 871 | (with-check (cudart/cuStreamAttachMemAsync hstream mem byte-size flag) hstream))) 872 | 873 | ;; ================== Event Management ======================================= 874 | 875 | (defn event* 876 | "Creates an event specified by integer `flags`.
877 | See [cuEventCreate](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html) 878 | " 879 | [^long flags] 880 | (let [res (CUevent_st.)] 881 | (with-check (cudart/cuEventCreate res flags) res))) 882 | 883 | ;; ================== Peer Context Memory Access ============================= 884 | 885 | (defn can-access-peer* 886 | "Queries if a device may directly access a peer device's memory. 887 | See [cuDeviceCanAccessPeer](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html) 888 | " 889 | [^long dev ^long peer] 890 | (with-release [res (int-pointer 1)] 891 | (with-check (cudart/cuDeviceCanAccessPeer ^IntPointer res dev peer) 892 | (pos? (int (get-entry res 0)))))) 893 | 894 | (defn p2p-attribute* 895 | "Queries attributes of the link between two devices. 896 | See [cuDeviceGetP2PAttribute](http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html) 897 | " 898 | [^long dev ^long peer ^long attribute] 899 | (with-release [res (int-pointer 1)] ; release the native int buffer once the result has been read 900 | (with-check 901 | (cudart/cudaDeviceGetP2PAttribute ^IntPointer res attribute dev peer) 902 | (pos?
(int (get-entry res 0)))))) 903 | 904 | ;; ================ print-method ============================================ 905 | 906 | (defn format-pointer [title p ^java.io.Writer w] 907 | (.write w (format "#%s[:cuda, 0x%x]" title (address p)))) 908 | 909 | (defmethod print-method CUDevice [p ^java.io.Writer w] 910 | (.write w (str p))) 911 | 912 | (defmethod print-method CUctx_st [p w] 913 | (format-pointer "Context" p w)) 914 | 915 | (defmethod print-method CUstream_st [p w] 916 | (format-pointer "Stream" p w)) 917 | 918 | (defmethod print-method CUevent_st [p w] 919 | (format-pointer "Event" p w)) 920 | 921 | (defmethod print-method CUmod_st [p w] 922 | (format-pointer "Module" p w)) 923 | 924 | (defmethod print-method CUlinkState_st [p w] 925 | (format-pointer "LinkState" p w)) 926 | 927 | (defmethod print-method _nvrtcProgram [p w] 928 | (format-pointer "Program" p w)) 929 | 930 | (defmethod print-method CUDevicePtr [p w] 931 | (format-pointer "DevicePtr" p w)) 932 | 933 | (defmethod print-method CUPinnedPtr [p w] 934 | (format-pointer "PinnedPtr" p w)) 935 | 936 | (defmethod print-method CUMappedPtr [p w] 937 | (format-pointer "MappedPtr" p w)) 938 | -------------------------------------------------------------------------------- /src/clojure/uncomplicate/clojurecuda/internal/utils.clj: -------------------------------------------------------------------------------- 1 | ;; Copyright (c) Dragan Djuric. All rights reserved. 2 | ;; The use and distribution terms for this software are covered by the 3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later 4 | ;; which can be found in the file LICENSE at the root of this distribution. 5 | ;; By using this software in any fashion, you are agreeing to be bound by 6 | ;; the terms of this license. 7 | ;; You must not remove this notice, or any other, from this software. 
8 | 9 | (ns ^{:author "Dragan Djuric"} 10 | uncomplicate.clojurecuda.internal.utils 11 | "Utility functions used as helpers in other ClojureCUDA namespaces. 12 | The user of the ClojureCUDA library would probably not need to use 13 | any of the functions defined here." 14 | (:require [uncomplicate.commons.utils :as utils] 15 | [uncomplicate.clojurecuda.internal.constants :refer [cu-result-codes]]) 16 | (:import clojure.lang.ExceptionInfo)) 17 | 18 | ;; ============= Error Codes =================================================== 19 | 20 | (defn error 21 | "Converts a CUDA error code to an [ExceptionInfo](http://clojuredocs.org/clojure.core/ex-info) 22 | with richer, user-friendly information. 23 | Accepts a long `err-code` that should be one of the codes defined in CUDA standard, and an 24 | optional `details` argument that could be anything that you think is informative. 25 | 26 | Examples: 27 | (error 0) => an ExceptionInfo instance 28 | (error -5 {:comment \"Why here?\"}) => an ExceptionInfo instance 29 | " 30 | ([^long err-code details] 31 | (let [err (get cu-result-codes err-code err-code)] 32 | (ex-info (format "CUDA error: %s." err) 33 | {:name err :code err-code :type :cuda :details details}))) 34 | ([^long err-code] 35 | (error err-code nil))) 36 | 37 | (defmacro with-check 38 | "Evaluates `form` if `status` is zero (`:success`), otherwise throws 39 | an appropriate `ExceptionInfo` with decoded informative details. 40 | It helps with CUDA methods that return error codes directly, while 41 | returning computation results through side-effects in arguments.
42 | 43 | Example: 44 | (with-check (some-jcuda-call-that-returns-error-code) result) 45 | " 46 | ([status form] 47 | `(utils/with-check error ~status ~form)) 48 | ([status details form] 49 | `(let [status# ~status] 50 | (if (= 0 status#) 51 | ~form 52 | (throw (error status# ~details)))))) 53 | 54 | (defmacro maybe 55 | "Evaluates form in try/catch block; if a CUDA-related exception is caught, 56 | substitutes the result with the [ExceptionInfo](http://clojuredocs.org/clojure.core/ex-info) object." 57 | [form] 58 | `(try ~form 59 | (catch ExceptionInfo ex-info# 60 | (if (= :cuda (:type (ex-data ex-info#))) 61 | (:name (ex-data ex-info#)) 62 | (throw ex-info#))))) 63 | -------------------------------------------------------------------------------- /src/clojure/uncomplicate/clojurecuda/toolbox.clj: -------------------------------------------------------------------------------- 1 | ;; Copyright (c) Dragan Djuric. All rights reserved. 2 | ;; The use and distribution terms for this software are covered by the 3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later 4 | ;; which can be found in the file LICENSE at the root of this distribution. 5 | ;; By using this software in any fashion, you are agreeing to be bound by 6 | ;; the terms of this license. 7 | ;; You must not remove this notice, or any other, from this software. 8 | 9 | (ns ^{:author "Dragan Djuric"} 10 | uncomplicate.clojurecuda.toolbox 11 | "Various helpers that are not needed by ClojureCUDA itself, 12 | but may be very helpful in applications. See Neanderthal and Bayadera libraries 13 | for the examples of how to use them." 14 | (:require [uncomplicate.commons 15 | [core :refer [with-release]] 16 | [utils :refer [count-groups]]] 17 | [uncomplicate.clojure-cpp 18 | :refer [byte-pointer get-long get-int get-double get-float]] 19 | [uncomplicate.clojurecuda.core 20 | :refer [grid-1d grid-2d launch! memcpy-host! parameters set-parameter! 
set-parameter!]]) 21 | (:import org.bytedeco.javacpp.PointerPointer)) 22 | 23 | (defn launch-reduce! 24 | ([hstream main-kernel reduction-kernel main-params reduction-params n local-n] 25 | (let [main-params (if (instance? PointerPointer main-params) 26 | (set-parameter! main-params 0 n) 27 | (apply parameters n main-params)) 28 | reduction-params (if (instance? PointerPointer reduction-params) 29 | reduction-params 30 | (apply parameters Integer/MAX_VALUE reduction-params))] 31 | (launch! main-kernel (grid-1d n local-n) hstream main-params) 32 | (loop [global-size (count-groups local-n n)] 33 | (when (< 1 global-size) 34 | (launch! reduction-kernel (grid-1d global-size local-n) hstream 35 | (set-parameter! reduction-params 0 global-size)) 36 | (recur (count-groups local-n global-size)))) 37 | hstream)) 38 | ([hstream main-kernel reduction-kernel main-params reduction-params m n local-m local-n] 39 | (let [main-params (if (instance? PointerPointer main-params) 40 | (set-parameter! main-params 0 m n) 41 | (apply parameters m n main-params)) 42 | reduction-params (if (instance? PointerPointer reduction-params) 43 | reduction-params 44 | (apply parameters Integer/MAX_VALUE Integer/MAX_VALUE reduction-params))] 45 | (if (or (< 1 (long local-n)) (<= (long local-n) (long n))) 46 | (loop [hstream (launch! main-kernel (grid-2d m n local-m local-n) hstream main-params) 47 | global-size (count-groups local-n n)] 48 | (if (= 1 global-size) 49 | hstream 50 | (recur (launch! reduction-kernel (grid-2d m global-size local-m local-n) hstream 51 | (set-parameter! reduction-params 0 m global-size)) 52 | (count-groups local-n global-size)))) 53 | (throw (IllegalArgumentException. 54 | (format "local-n %d would cause infinite recursion for n:%d." local-n n))))))) 55 | 56 | (defn read-int 57 | (^long [cu-buf] 58 | (with-release [res (byte-pointer Integer/BYTES)] 59 | (memcpy-host! 
cu-buf res) 60 | (get-int res 0))) 61 | (^long [hstream cu-buf] 62 | (with-release [res (byte-pointer Integer/BYTES)] 63 | (memcpy-host! cu-buf res hstream) 64 | (get-int res 0)))) 65 | 66 | (defn read-long 67 | (^long [cu-buf] 68 | (with-release [res (byte-pointer Long/BYTES)] 69 | (memcpy-host! cu-buf res) 70 | (get-long res 0))) 71 | (^long [hstream cu-buf] 72 | (with-release [res (byte-pointer Long/BYTES)] 73 | (memcpy-host! cu-buf res hstream) 74 | (get-long res 0)))) 75 | 76 | (defn read-double 77 | (^double [cu-buf] 78 | (with-release [res (byte-pointer Double/BYTES)] 79 | (memcpy-host! cu-buf res) 80 | (get-double res 0))) 81 | (^double [hstream cu-buf] 82 | (with-release [res (byte-pointer Double/BYTES)] 83 | (memcpy-host! cu-buf res hstream) 84 | (get-double res 0)))) 85 | 86 | (defn read-float 87 | (^double [cu-buf] 88 | (with-release [res (byte-pointer Float/BYTES)] 89 | (memcpy-host! cu-buf res) 90 | (get-float res 0))) 91 | (^double [hstream cu-buf] 92 | (with-release [res (byte-pointer Float/BYTES)] 93 | (memcpy-host! cu-buf res hstream) 94 | (get-float res 0)))) 95 | -------------------------------------------------------------------------------- /src/cuda/uncomplicate/clojurecuda/include/jitify/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, NVIDIA Corporation 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 
15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /src/cuda/uncomplicate/clojurecuda/include/jitify/float.h: -------------------------------------------------------------------------------- 1 | #ifndef _float_h_ 2 | #define _float_h_ 3 | 4 | inline __host__ __device__ float jitify_int_as_float(int i) { 5 | union FloatInt { float f; int i; } fi; 6 | fi.i = i; 7 | return fi.f; 8 | } 9 | 10 | inline __host__ __device__ double jitify_longlong_as_double(long long i) { 11 | union DoubleLongLong { double f; long long i; } fi; 12 | fi.i = i; 13 | return fi.f; 14 | } 15 | 16 | #define FLT_RADIX 2 17 | #define FLT_MANT_DIG 24 18 | #define DBL_MANT_DIG 53 19 | #define FLT_DIG 6 20 | #define DBL_DIG 15 21 | #define FLT_MIN_EXP -125 22 | #define DBL_MIN_EXP -1021 23 | #define FLT_MIN_10_EXP -37 24 | #define DBL_MIN_10_EXP -307 25 | #define FLT_MAX_EXP 128 26 | #define DBL_MAX_EXP 1024 27 | #define FLT_MAX_10_EXP 38 28 | #define DBL_MAX_10_EXP 308 29 | #define FLT_MAX jitify_int_as_float(2139095039) 30 | #define DBL_MAX jitify_longlong_as_double(9218868437227405311) 31 | #define FLT_EPSILON jitify_int_as_float(872415232) 32 | #define DBL_EPSILON jitify_longlong_as_double(4372995238176751616) 33 | #define FLT_MIN jitify_int_as_float(8388608) 34 | #define DBL_MIN jitify_longlong_as_double(4503599627370496) 35 | #define FLT_ROUNDS 1 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/cuda/uncomplicate/clojurecuda/include/jitify/stddef.h: -------------------------------------------------------------------------------- 1 | #ifndef _stddef_h_ 2 | #define _stddef_h_ 3 | 4 | typedef unsigned long size_t; 5 | typedef signed long ptrdiff_t; 6 | 7 | #endif 8 | -------------------------------------------------------------------------------- /src/cuda/uncomplicate/clojurecuda/include/jitify/stdint.h: -------------------------------------------------------------------------------- 1 | #ifndef 
_stdint_h_ 2 | #define _stdint_h_ 3 | 4 | typedef signed char int8_t; 5 | typedef signed short int16_t; 6 | typedef signed int int32_t; 7 | typedef signed long long int64_t; 8 | typedef signed char int_fast8_t; 9 | typedef signed short int_fast16_t; 10 | typedef signed int int_fast32_t; 11 | typedef signed long long int_fast64_t; 12 | typedef signed char int_least8_t; 13 | typedef signed short int_least16_t; 14 | typedef signed int int_least32_t; 15 | typedef signed long long int_least64_t; 16 | typedef signed long long intmax_t; 17 | typedef signed long intptr_t; 18 | typedef unsigned char uint8_t; 19 | typedef unsigned short uint16_t; 20 | typedef unsigned int uint32_t; 21 | typedef unsigned long long uint64_t; 22 | typedef unsigned char uint_fast8_t; 23 | typedef unsigned short uint_fast16_t; 24 | typedef unsigned int uint_fast32_t; 25 | typedef unsigned long long uint_fast64_t; 26 | typedef unsigned char uint_least8_t; 27 | typedef unsigned short uint_least16_t; 28 | typedef unsigned int uint_least32_t; 29 | typedef unsigned long long uint_least64_t; 30 | typedef unsigned long long uintmax_t; 31 | typedef unsigned long uintptr_t; 32 | #define INT8_MIN SCHAR_MIN 33 | #define INT16_MIN SHRT_MIN 34 | #define INT32_MIN INT_MIN 35 | #define INT64_MIN LLONG_MIN 36 | #define INT8_MAX SCHAR_MAX 37 | #define INT16_MAX SHRT_MAX 38 | #define INT32_MAX INT_MAX 39 | #define INT64_MAX LLONG_MAX 40 | #define UINT8_MAX UCHAR_MAX 41 | #define UINT16_MAX USHRT_MAX 42 | #define UINT32_MAX UINT_MAX 43 | #define UINT64_MAX ULLONG_MAX 44 | #define INTPTR_MIN LONG_MIN 45 | #define INTMAX_MIN LLONG_MIN 46 | #define INTPTR_MAX LONG_MAX 47 | #define INTMAX_MAX LLONG_MAX 48 | #define UINTPTR_MAX ULONG_MAX 49 | #define UINTMAX_MAX ULLONG_MAX 50 | #define PTRDIFF_MIN INTPTR_MIN 51 | #define PTRDIFF_MAX INTPTR_MAX 52 | #define SIZE_MAX UINT64_MAX 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- 
/src/cuda/uncomplicate/clojurecuda/kernels/reduction.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | 3 | #ifndef REAL 4 | #define REAL float 5 | #endif 6 | 7 | #ifndef ACCUMULATOR 8 | #define ACCUMULATOR float 9 | #endif 10 | 11 | #ifndef WGS 12 | #define WGS 1024 13 | #endif 14 | 15 | #ifndef WGSm 16 | #define WGSm 64 17 | #endif 18 | 19 | #ifndef WGSn 20 | #define WGSn 16 21 | #endif 22 | 23 | // ================= Sum reduction ============================================= 24 | 25 | __device__ ACCUMULATOR block_reduction_sum (const ACCUMULATOR value) { 26 | 27 | const int local_id = threadIdx.x; 28 | 29 | __shared__ ACCUMULATOR lacc[WGS]; 30 | lacc[local_id] = value; 31 | 32 | __syncthreads(); 33 | 34 | ACCUMULATOR pacc = value; 35 | int i = blockDim.x; 36 | while (i > 0) { 37 | const bool include_odd = (i > ((i >> 1) << 1)) && (local_id == ((i >> 1) - 1)); 38 | i >>= 1; 39 | if (include_odd) { 40 | pacc += lacc[local_id + i + 1]; 41 | } 42 | if (local_id < i) { 43 | pacc += lacc[local_id + i]; 44 | lacc[local_id] = pacc; 45 | } 46 | __syncthreads(); 47 | } 48 | 49 | return lacc[0]; 50 | } 51 | 52 | __device__ ACCUMULATOR block_reduction_sum_2 (const ACCUMULATOR value) { 53 | 54 | const int local_row = threadIdx.x; 55 | const int local_col = threadIdx.y; 56 | const int local_m = blockDim.x; 57 | 58 | __shared__ ACCUMULATOR lacc[WGS]; 59 | lacc[local_row + local_col * local_m] = value; 60 | 61 | __syncthreads(); 62 | 63 | ACCUMULATOR pacc = value; 64 | int i = blockDim.y; 65 | while (i > 0) { 66 | const bool include_odd = (i > ((i >> 1) << 1)) && (local_col == ((i >> 1) - 1)); 67 | i >>= 1; 68 | if (include_odd) { 69 | pacc += lacc[local_row + (local_col + i + 1) * local_m]; 70 | } 71 | if (local_col < i) { 72 | pacc += lacc[local_row + (local_col + i) * local_m]; 73 | lacc[local_row + local_col * local_m] = pacc; 74 | } 75 | __syncthreads(); 76 | } 77 | 78 | return lacc[local_row]; 79 | 80 | } 81 | 82 | 
__global__ void sum_reduction(const int n, ACCUMULATOR* acc) { 83 | const int gid = blockIdx.x * blockDim.x + threadIdx.x; 84 | const ACCUMULATOR sum = block_reduction_sum( (gid < n) ? acc[gid] : (ACCUMULATOR)0); // typed zero: a 0.0 literal would promote the operand to double 85 | if (threadIdx.x == 0) { 86 | acc[blockIdx.x] = sum; 87 | } 88 | } 89 | // 2D reduction step: each block sums its blockDim.y values per x index (element i = m * gid_1 + gid_0); thread row 0 stores the partial sum. 90 | __global__ void sum_reduction_horizontal (const int m, const int n, ACCUMULATOR* acc) { 91 | const int gid_0 = blockIdx.x * blockDim.x + threadIdx.x; 92 | const int gid_1 = blockIdx.y * blockDim.y + threadIdx.y; 93 | const int i = m * gid_1 + gid_0; 94 | const bool valid = (gid_0 < m) && (gid_1 < n); 95 | const ACCUMULATOR sum = block_reduction_sum_2( (valid) ? acc[i] : (ACCUMULATOR)0); 96 | const bool write = valid && (threadIdx.y == 0); 97 | if (write) { 98 | acc[m * blockIdx.y + gid_0] = sum; 99 | } 100 | } 101 | // Same reduction step, but the input element is read at i = n * gid_0 + gid_1. 102 | __global__ void sum_reduction_vertical (const int m, const int n, ACCUMULATOR* acc) { 103 | const int gid_0 = blockIdx.x * blockDim.x + threadIdx.x; 104 | const int gid_1 = blockIdx.y * blockDim.y + threadIdx.y; 105 | const int i = n * gid_0 + gid_1; 106 | const bool valid = (gid_0 < m) && (gid_1 < n); 107 | const ACCUMULATOR sum = block_reduction_sum_2( (valid) ? acc[i] : (ACCUMULATOR)0); 108 | const bool write = valid && (threadIdx.y == 0); 109 | if (write) { 110 | acc[m * blockIdx.y + gid_0] = sum; 111 | } 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /src/java/uncomplicate/clojurecuda/internal/javacpp/CUHostFn.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) Dragan Djuric. All rights reserved. 2 | // The use and distribution terms for this software are covered by the 3 | // Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later 4 | // which can be found in the file LICENSE at the root of this distribution. 5 | // By using this software in any fashion, you are agreeing to be bound by 6 | // the terms of this license.
7 | // You must not remove this notice, or any other, from this software. 8 | 9 | package uncomplicate.clojurecuda.internal.javacpp; 10 | 11 | import clojure.lang.IFn; 12 | import org.bytedeco.javacpp.Pointer; 13 | import org.bytedeco.cuda.cudart.CUhostFn; 14 | import org.bytedeco.cuda.cudart.CUstream_st; 15 | 16 | 17 | public class CUHostFn extends CUhostFn { 18 | 19 | private IFn fun; 20 | 21 | public CUHostFn (IFn fun) { 22 | this.fun = fun; 23 | } 24 | 25 | public void call (Pointer userData) { 26 | fun.invoke(userData); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/java/uncomplicate/clojurecuda/internal/javacpp/CUStreamCallback.java: -------------------------------------------------------------------------------- 1 | // Copyright (c) Dragan Djuric. All rights reserved. 2 | // The use and distribution terms for this software are covered by the 3 | // Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later 4 | // which can be found in the file LICENSE at the root of this distribution. 5 | // By using this software in any fashion, you are agreeing to be bound by 6 | // the terms of this license. 7 | // You must not remove this notice, or any other, from this software. 
8 | 9 | package uncomplicate.clojurecuda.internal.javacpp; 10 | 11 | import clojure.lang.IFn; 12 | import org.bytedeco.javacpp.Pointer; 13 | import org.bytedeco.cuda.cudart.CUstreamCallback; 14 | import org.bytedeco.cuda.cudart.CUstream_st; 15 | 16 | 17 | public class CUStreamCallback extends CUstreamCallback { 18 | 19 | private IFn fun; 20 | 21 | public CUStreamCallback (IFn fun) { 22 | this.fun = fun; 23 | } 24 | 25 | public void call (CUstream_st hstream, int status, Pointer userData) { 26 | fun.invoke(hstream, status, userData); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /test/clojure/uncomplicate/clojurecuda/core_test.clj: -------------------------------------------------------------------------------- 1 | ;; Copyright (c) Dragan Djuric. All rights reserved. 2 | ;; The use and distribution terms for this software are covered by the 3 | ;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later 4 | ;; which can be found in the file LICENSE at the root of this distribution. 5 | ;; By using this software in any fashion, you are agreeing to be bound by 6 | ;; the terms of this license. 7 | ;; You must not remove this notice, or any other, from this software. 8 | 9 | (ns uncomplicate.clojurecuda.core-test 10 | (:require [midje.sweet :refer [facts => throws truthy]] 11 | [clojure.core.async :refer [chan true) 27 | 28 | (facts 29 | "Device tests." 
30 | (<= 0 (device-count)) => true 31 | (device 0) => truthy 32 | (device -1) => (throws ExceptionInfo) 33 | (device 33) => (throws ExceptionInfo) 34 | (device (pci-bus-id-string (device))) => (device)) 35 | 36 | ;; ===================== Context Management Tests ======================================= 37 | 38 | (facts 39 | "Context tests" 40 | (with-release [dev (device 0)] 41 | (let [ctx (context dev :sched-auto)] 42 | ctx => truthy 43 | (release ctx) => true 44 | (context dev :unknown) => (throws ExceptionInfo)) 45 | (let [ctx1 (context dev :sched-blocking-sync) 46 | ctx2 (context dev :sched-blocking-sync)] 47 | (with-context ctx1 48 | (with-context ctx2 49 | (current-context) => ctx2 50 | (do (pop-context!) (current-context)) => ctx1 51 | (current-context! ctx2) => ctx2 52 | (current-context) => ctx2 53 | (release ctx2) => true 54 | (release ctx2) => true))))) 55 | 56 | ;; =============== Module Management & Execution Control Tests ===================================== 57 | 58 | (facts 59 | "Test Parameters" 60 | (with-context (context (device)) 61 | (with-release [cnt 3 62 | extra 4 63 | gpu-a (mem-alloc-runtime (* Float/BYTES (+ cnt extra))) 64 | params (parameters cnt gpu-a)] 65 | (size params) => 2 66 | (get-entry (int-pointer (get-entry params 0))) => 3 67 | (get-entry (long-pointer (get-entry params 1))) => (cu-address* gpu-a) 68 | (address (pointer gpu-a)) => (cu-address* gpu-a) 69 | (address (pointer gpu-a 1)) => (inc (long (cu-address* gpu-a))) 70 | (address (position! (pointer gpu-a) 1)) => (cu-address* gpu-a)))) 71 | 72 | (let [program-source (slurp "test/cuda/uncomplicate/clojurecuda/kernels/test.cu") 73 | cnt 300 74 | extra 5] 75 | (with-context (context (device)) 76 | (with-release [prog (compile! 
(program program-source {"dummy" "placeholder"})) 77 | grid (grid-1d cnt (min 256 cnt))] 78 | (with-release [modl (module prog) 79 | fun (function modl "inc") 80 | strm (stream :non-blocking) 81 | host-a (float-pointer (+ cnt extra)) 82 | gpu-a (mem-alloc-runtime (* Float/BYTES (+ cnt extra)))] 83 | 84 | (facts 85 | "Test launch" 86 | (fill! host-a 0) 87 | (put-entry! host-a 0 1) 88 | (put-entry! host-a 10 100) 89 | (memcpy-host! host-a gpu-a strm) => gpu-a 90 | (launch! fun grid strm (parameters (int cnt) gpu-a)) => strm 91 | (synchronize! strm) => strm 92 | (memcpy-host! gpu-a host-a strm) => host-a 93 | (get-entry host-a 0) => 2.0 94 | (get-entry host-a 10) => 101.0 95 | (get-entry host-a (dec cnt)) => 1.0 96 | (get-entry host-a cnt) => 0.0 97 | (get-entry host-a (dec (+ cnt extra))) => 0.0)) 98 | 99 | (with-release [modl (module)] 100 | (facts 101 | "Test device globals" 102 | (load! modl prog) => modl 103 | (with-release [fun (function modl "constant_inc") 104 | gpu-a (global modl "gpu_a") 105 | constant-gpu-a (global modl "constant_gpu_a")] 106 | (pointer-seq (memcpy-host! gpu-a (float-pointer 3))) => (seq [1.0 2.0 3.0]) 107 | (memcpy! gpu-a constant-gpu-a) => constant-gpu-a 108 | (launch! fun (grid-1d 3) (parameters 3 gpu-a)) 109 | (pointer-seq (memcpy-host! constant-gpu-a (float-pointer 3))) => (seq [1.0 2.0 3.0]) 110 | (pointer-seq (memcpy-host! gpu-a (float-pointer 3))) => (seq [2.0 4.0 6.0]))))))) 111 | 112 | ;; =============== Stream Management Tests ============================================== 113 | 114 | (with-context (context (device 0) :map-host) 115 | 116 | (facts 117 | "Stream creation and memory copy tests." 118 | (with-release [strm (stream :non-blocking) 119 | cuda1 (mem-alloc-runtime Float/BYTES) 120 | cuda2 (mem-alloc-runtime Float/BYTES) 121 | host1 (float-array [173.0]) 122 | host2 (byte-pointer Float/BYTES)] 123 | (memcpy-host! host1 cuda1 strm) => cuda1 124 | (synchronize! strm) 125 | (memcpy! cuda1 cuda2) => cuda2 126 | (memcpy-host! 
cuda2 host2 strm) => host2
     (synchronize! strm)
     (get-float host2 0) => 173.0))

  ;; Releasing a stream or a buffer more than once must keep reporting success,
  ;; and a released buffer must be rejected by memcpy!.
  (facts
   "Stream and memory release."
   (with-release [strm (stream :non-blocking)
                  cuda (mem-alloc-runtime Float/BYTES)]
     (release strm) => true
     (release strm) => true
     (release cuda) => true
     (memcpy! cuda cuda) => (throws IllegalArgumentException)
     (release cuda) => true)))

(with-context (context (device 0) :map-host)
  (facts
   "Host functions."
   (let [ch (chan)]
     (with-release [strm (stream :non-blocking)
                    cuda1 (mem-alloc-runtime Float/BYTES)
                    cuda2 (mem-alloc-runtime Float/BYTES)
                    host1 (float-array [163.0])
                    host2 (float-pointer [12])
                    ch (chan)]
       (listen! strm ch :host)
       (memcpy-host! host1 cuda1 strm) => cuda1
       (memcpy! cuda1 cuda2 strm) => cuda2
       (synchronize! strm)
       (memcpy-host! cuda2 (float-array 1) strm) => (throws Exception)
       (get-entry (memcpy-host! cuda2 host2 strm) 0) => 163.0
       ;; Take the value that the listener registered above puts on the channel.
       ;; NOTE(review): assumes `<!!` (core.async) is referred in this namespace's
       ;; ns form, which is outside this excerpt - confirm.
       (<!! ch) => :host))))

;; =============== Memory Management Tests ==============================================

(with-release [dev (device 0)]
  (with-context (context dev :map-host)

    ;; A zero-sized driver allocation is rejected; a runtime allocation reports its byte size.
    (facts
     "mem-alloc-runtime tests."
     (mem-alloc-driver 0) => (throws ExceptionInfo)
     (with-release [buf (mem-alloc-runtime Float/BYTES)]
       (bytesize buf) => Float/BYTES))

    ;; One float travels host1 -> cuda1 -> cuda2 -> host2 and must arrive unchanged.
    (facts
     "Linear memory tests."
     (with-release [cuda1 (mem-alloc-runtime Float/BYTES)
                    cuda2 (mem-alloc-runtime Float/BYTES)
                    host1 (float-array [173.0])
                    host2 (byte-pointer Float/BYTES)]
       (memcpy-host! host1 cuda1) => cuda1
       (memcpy! cuda1 cuda2) => cuda2
       (memcpy-host! cuda2 host2) => host2
       (get-float host2 0) => 173.0))

    ;; Sub-regions expose windows of the 20-byte parent; releasing them must leave
    ;; the parent buffer readable.
    (facts
     "Linear memory sub-region tests."
     (with-release [cuda (mem-alloc-runtime 20)]
       (memcpy-host! (float-array [1 2 3 4 5]) cuda) => cuda
       (let-release [cuda1 (mem-sub-region cuda 0 8)
                     cuda2 (mem-sub-region cuda 8 12)]
         (mem-sub-region cuda 8 20) => (throws ExceptionInfo)
         (pointer-seq (memcpy-host! cuda1 (float-pointer 2))) => [1.0 2.0]
         (pointer-seq (memcpy-host! cuda2 (float-pointer 3))) => [3.0 4.0 5.0]
         (do (release cuda1)
             (release cuda2)
             (pointer-seq (memcpy-host! cuda (float-pointer 5))) => [1.0 2.0 3.0 4.0 5.0]))))

    ;; Typed runtime allocations; moving the pointer position of cuda2 redirects
    ;; where the following copy lands.
    (facts
     "Runtime cudaMalloc tests."
     (with-release [cuda1 (mem-alloc-runtime Float/BYTES :float)
                    cuda2 (mem-alloc-runtime (* 3 Float/BYTES) :float)
                    host1 (float-pointer [100.0])
                    host2 (mem-alloc-mapped Float/BYTES :float)
                    zero (mem-alloc-runtime 0)]
       zero => truthy
       (bytesize cuda1) => Float/BYTES
       (memcpy-host! host1 cuda1) => cuda1
       (synchronize!)
       (pointer-seq (memcpy-host! cuda1 (float-pointer 1))) => [100.0]
       (seq (memcpy! cuda1 host2)) => [100.0]
       (position! (pointer cuda2) 2)
       (.position (pointer cuda2)) => 2
       (memcpy! cuda1 cuda2) => cuda2
       (position! (pointer cuda2) 0)
       (memcpy-host! (float-pointer [200.0 300.0]) cuda2) => cuda2
       (pointer-seq (memcpy-host! cuda2 (float-pointer 3))) => [200.0 300.0 100.0]))

    ;; The value copied to the device (13) must survive the later host-side
    ;; overwrite (11) and come back through memcpy!.
    (facts
     "Pinned memory tests."
     (with-release [pinned-host (mem-alloc-pinned Float/BYTES :float :devicemap)
                    cuda1 (mem-alloc-runtime Float/BYTES)]
       (mem-alloc-pinned Float/BYTES :unknown) => (throws ExceptionInfo)
       (bytesize pinned-host) => Float/BYTES
       (put-entry! pinned-host 0 13)
       (memcpy-host! pinned-host cuda1) => cuda1
       (put-entry! pinned-host 0 11)
       (memcpy! cuda1 pinned-host) => pinned-host
       (synchronize!)
       (get-entry pinned-host 0) => 13.0
       (pointer-seq (memcpy-host! cuda1 (float-pointer 1))) => [13.0]))

    (facts
     "Mapped memory tests."
     (with-release [mapped-host (mem-alloc-mapped Float/BYTES :float)
                    cuda1 (mem-alloc-runtime Float/BYTES)
                    mapped-host2 (mem-alloc-mapped Float/BYTES :float)]
       (bytesize mapped-host) => Float/BYTES
       (put-entry! mapped-host 0 14.0)
       (memcpy-host! mapped-host cuda1) => cuda1
       (get-entry (memcpy-host! cuda1 (float-pointer 1)) 0) => 14.0
       (get-entry (memcpy! cuda1 mapped-host2)) => 14.0
       (synchronize!)
       (seq mapped-host2) => [14.0]))

    ;; Raw cuda-malloc pointers: offset copies through ptr, and cuda-free! twice
    ;; on the same pointer must still return it.
    (facts
     "CUDA Raw Runtime Pointer tests."
     (with-release [host1 (float-pointer [1 2 3 4])
                    cuda1 (cuda-malloc (* 4 Float/BYTES) :float)
                    cuda2 (cuda-malloc (* 3 Float/BYTES) :float)
                    host2 (float-pointer 4)
                    host3 (float-pointer 3)]
       (memcpy-to-device! host1 cuda1) => cuda1
       (memcpy! (ptr cuda1 2) (ptr cuda2 1))
       (synchronize!)
       (pointer-seq (memcpy-to-host! cuda1 host2)) => [1.0 2.0 3.0 4.0]
       (pointer-seq (memcpy-to-host! cuda2 host3)) => [0.0 3.0 4.0]
       (cuda-free! cuda1) => cuda1
       (cuda-free! cuda1) => cuda1
       (cuda-free! cuda2) => cuda2))

    ;; Offsetting by one element shrinks size by one and bytesize by one float.
    (facts
     "CUDA Raw Runtime Pointer arithmetic tests."
     (with-release [host1 (float-pointer [1 2 3 4])
                    cuda1 (cuda-malloc (* 4 Float/BYTES) :float)]
       (memcpy-to-device! host1 cuda1) => cuda1
       (pointer cuda1) => cuda1
       (pointer cuda1 0) => cuda1
       (size (pointer cuda1 1)) => (dec (size cuda1))
       (bytesize (pointer cuda1 1)) => (- (bytesize cuda1) Float/BYTES)
       (size (ptr cuda1 1)) => (dec (size cuda1))
       (bytesize (ptr cuda1 1)) => (- (bytesize cuda1) Float/BYTES)
       (cuda-free! cuda1) => cuda1))

    ;; memset! writes at the current pointer position: element 1 first, then element 0.
    (facts
     "cuda-malloc memset tests."
     (with-release [cuda1 (cuda-malloc (* 2 Integer/BYTES) :int)]
       (memcpy-to-device! (int-pointer [124 134]) cuda1) => cuda1
       (pointer-seq (memcpy-to-host! cuda1 (int-pointer 2))) => [124 134]
       (position! (pointer cuda1) 1)
       (memset! cuda1 (int 100) 1)
       (position! (pointer cuda1) 0)
       (pointer-seq (memcpy-to-host! cuda1 (int-pointer 2))) => [124 100]
       (memset! cuda1 (int 200) 1)
       (pointer-seq (memcpy-to-host! cuda1 (int-pointer 2))) => [200 100]))

    ;; Same scenario as above, for buffers obtained from mem-alloc-runtime.
    (facts
     "cuda-alloc-runtime memset tests."
     (with-release [cuda1 (mem-alloc-runtime (* 2 Integer/BYTES) :int)]
       (memcpy-host! (int-pointer [124 134]) cuda1) => cuda1
       (pointer-seq (memcpy-host! cuda1 (int-pointer 2))) => [124 134]
       (position! (pointer cuda1) 1)
       (memset! cuda1 (int 100) 1)
       (position! (pointer cuda1) 0)
       (pointer-seq (memcpy-host! cuda1 (int-pointer 2))) => [124 100]
       (memset! cuda1 (int 200) 1)
       (pointer-seq (memcpy-host! cuda1 (int-pointer 2))) => [200 100]))

    ;; Only runs when the device reports both managed memory and concurrent managed access.
    (when (and (info/managed-memory dev) (info/concurrent-managed-access dev))
      (facts
       "mem-alloc-driver tests."
       (with-release [host0 (float-pointer [15])
                      host1 (float-pointer 1)
                      cuda0 (mem-alloc-driver Float/BYTES :host)
                      cuda1 (mem-alloc-driver Float/BYTES :global)]

         (bytesize cuda0) => Float/BYTES
         (mem-alloc-driver Float/BYTES :unknown) => (throws ExceptionInfo)
         (memcpy-host! host0 cuda0) => cuda0
         (memcpy! cuda0 cuda1) => cuda1
         (memcpy-host! cuda1 host1) => host1
         (get-entry host1 0) => 15.0)))

    (when (info/managed-memory dev)
      (facts
       "mem-alloc-driver with globally shared attached memory tests."
       (with-release [host0 (float-pointer [16])
                      host1 (float-pointer 1)
                      cuda0 (mem-alloc-driver Float/BYTES :host)
                      cuda1 (mem-alloc-driver Float/BYTES :global)]
         (attach-mem! nil cuda0 Float/BYTES :global) => nil
         (bytesize cuda0) => Float/BYTES
         (memcpy-host! host0 cuda0) => cuda0
         (memcpy! cuda0 cuda1) => cuda1
         (memcpy-host! cuda1 host1) => host1
         (get-entry host1 0) => 16.0))
      (facts
       "mem-alloc-driver with attached memory tests."
       (with-release [host0 (float-pointer [17])
                      host1 (float-pointer 1)
                      cuda0 (mem-alloc-driver Float/BYTES :host)
                      cuda1 (mem-alloc-driver Float/BYTES :global)]
         (let [hstream (attach-mem! cuda0 Float/BYTES :single)]
           (bytesize cuda0) => Float/BYTES
           ;; Without concurrent managed access, the copy that does not name the
           ;; attached stream is expected to fail.
           (if (info/concurrent-managed-access dev)
             (memcpy-host! host0 cuda0) => cuda0
             (memcpy-host! host0 cuda0) => (throws ExceptionInfo))
           (memcpy-host! host0 cuda0 hstream) => cuda0
           (memcpy! cuda0 cuda1 hstream) => cuda1
           (memcpy-host! cuda1 host1 hstream) => host1
           (synchronize! hstream)
           (get-entry host1 0) => 17.0))))

    ;; Registers existing host buffers and copies between the registered views;
    ;; the value written into host0 must show up in host1.
    (facts
     "mem-alloc-registered tests."
     (with-release [host0 (byte-pointer Float/BYTES)
                    host1 (byte-pointer Float/BYTES)
                    cuda0 (mem-register-pinned! host0)
                    cuda1 (mem-register-pinned! host1)]

       (bytesize cuda0) => Float/BYTES
       (put-float! host0 0 44.0)
       (memcpy! cuda0 cuda1) => cuda1
       (get-float host1 0) => 44.0))))

;; ================= Peer Access Management Tests =====================================

(facts
 "Peer access tests (requires 2 devices)."
 (let [num-dev (device-count)
       devices (mapv device (range num-dev))
       ;; Unordered pairs of distinct device indices.
       combinations (set (for [x (range num-dev) y (range num-dev) :when (not= x y)] #{x y}))
       ;; Returns [dev-a dev-b] when the pair can access each other in both directions, else nil.
       p2p? (fn [num-pair] (let [[a b] (vec num-pair)
                                 dev-a (nth devices a)
                                 dev-b (nth devices b)]
                             (when (and (p2p-attribute dev-a dev-b :access-supported)
                                        (can-access-peer dev-a dev-b)
                                        (can-access-peer dev-b dev-a))
                               [dev-a dev-b])))]
   (if-let [[dev-a dev-b] (some p2p? combinations)]
     ;; A peer-capable pair exists: run the vector-add kernel in dev-a's context
     ;; with one operand allocated in dev-b's context.
     (let [program-source (slurp "test/cuda/examples/jnvrtc-vector-add.cu")
           ^:const vctr-len 3]
       (with-release [host-a (float-array [1 2 3])
                      host-b (float-array [2 3 4])
                      host-sum (float-array vctr-len)
                      ctx (context dev-a)
                      peer-ctx (context dev-b)]
         (in-context ctx
           (with-release [prog (compile! (program program-source))
                          m (module prog)
                          vector-add (function m "add")
                          gpu-a (mem-alloc-runtime (* Float/BYTES vctr-len))
                          gpu-a-sum (mem-alloc-runtime (* Float/BYTES vctr-len))
                          gpu-b (in-context peer-ctx (mem-alloc-runtime (* Float/BYTES vctr-len)))]
             ;; Disabling access that was never enabled must fail in both directions.
             (disable-peer-access! peer-ctx) => (throws ExceptionInfo)
             (in-context peer-ctx (disable-peer-access! ctx) => (throws ExceptionInfo))
             (memcpy-host! host-a gpu-a) => gpu-a
             (in-context peer-ctx (memcpy-host! host-b gpu-b) => gpu-b)
             (enable-peer-access! peer-ctx) => peer-ctx
             (in-context peer-ctx (enable-peer-access! ctx) => ctx)
             (launch! vector-add (grid-1d vctr-len) (parameters vctr-len gpu-a gpu-b gpu-a-sum))
             (synchronize!)
             (memcpy-host! gpu-a-sum host-sum) => (seq [3.0 5.0 7.0])
             (disable-peer-access! peer-ctx) => peer-ctx
             (in-context peer-ctx (disable-peer-access! ctx) => ctx)))))
     ;; No peer-capable pair: fall back to the self-peer checks, which need one device only.
     (when-let [dev (first devices)]
       (p2p-attribute dev dev :access-supported) => (throws ExceptionInfo)
       (can-access-peer dev dev) => false))))

;; Compiles the vector-add example and launches it on buffers from mem-alloc-runtime.
(facts
 "Runtime API Pointer kernel launch test"
 (let [dev (device 0)
       program-source (slurp "test/cuda/examples/jnvrtc-vector-add.cu")
       ^:const vctr-len 3]
   (with-release [host-a (float-pointer [1 2 3])
                  host-b (float-pointer [2 3 4])
                  host-sum (float-pointer vctr-len)
                  ctx (context dev)]
     (in-context ctx
       (with-release [prog (compile! (program program-source))
                      m (module prog)
                      vector-add (function m "add")
                      gpu-a (mem-alloc-runtime (* Float/BYTES vctr-len))
                      gpu-a-sum (mem-alloc-runtime (* Float/BYTES vctr-len))
                      gpu-b (mem-alloc-runtime (* Float/BYTES vctr-len))]
         (memcpy-host! host-a gpu-a) => gpu-a
         (memcpy-host! host-b gpu-b) => gpu-b
         (launch! vector-add (grid-1d vctr-len) (parameters vctr-len gpu-a gpu-b gpu-a-sum))
         (synchronize!)
         (pointer-seq (memcpy! gpu-a-sum host-sum)) => (seq [3.0 5.0 7.0]))))))

--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/examples/dynamic_parallelism_test.clj:
--------------------------------------------------------------------------------
;; Copyright (c) Dragan Djuric. All rights reserved.
;; The use and distribution terms for this software are covered by the
;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
;; which can be found in the file LICENSE at the root of this distribution.
;; By using this software in any fashion, you are agreeing to be bound by
;; the terms of this license.
;; You must not remove this notice, or any other, from this software.

(ns uncomplicate.clojurecuda.examples.dynamic-parallelism-test
  (:require [midje.sweet :refer [facts =>]]
            [clojure.java.io :refer [file]]
            [uncomplicate.commons.core :refer [with-release]]
            [uncomplicate.clojure-cpp :refer [float-pointer pointer-seq]]
            [uncomplicate.clojurecuda.core
             :refer [compile! context device function grid-1d init launch! link link-complete!
                     mem-alloc-runtime memcpy-host! module parameters program with-context]]))

(init)

;; Compiles the example as relocatable device code, links it against the device
;; runtime library, launches parentKernel and checks the values written to `data`.
(let [program-source (slurp "test/cuda/examples/dynamic-parallelism.cu")
      num-parent-threads 8
      num-child-threads 8
      num-elements (* num-parent-threads num-child-threads)]
  (with-context (context (device))
    (with-release [prog (compile! (program program-source)
                                  ["--relocatable-device-code=true" "-default-device"])
                   ;; NOTE(review): absolute, installation-specific library path.
                   linked-prog (link [[:library (file "/opt/cuda/lib64/libcudadevrt.a")]
                                      [:ptx prog]])
                   m (module (link-complete! linked-prog))
                   parent (function m "parentKernel")
                   data (mem-alloc-runtime (* Float/BYTES num-elements))]
      (facts
       "Dynamic parallelism JCuda example."
       (memcpy-host! (float-pointer num-elements) data)
       ;; NOTE(review): the requested extent is (2 * num-elements - 1), although
       ;; parentKernel indexes by threadIdx.x only - confirm this is intended.
       (launch! parent (grid-1d (+ num-elements num-elements (- 1)) num-parent-threads)
                (parameters num-elements data))
       (pointer-seq (memcpy-host! data (float-pointer num-elements)))
       => (map float (seq [0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7
                           1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7
                           2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7
                           3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7
                           4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7
                           5.0 5.1 5.2 5.3 5.4 5.5 5.6 5.7
                           6.0 6.1 6.2 6.3 6.4 6.5 6.6 6.7
                           7.0 7.1 7.2 7.3 7.4 7.5 7.6 7.7])))))))

--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/examples/vector_add_test.clj:
--------------------------------------------------------------------------------
;; Copyright (c) Dragan Djuric. All rights reserved.
;; The use and distribution terms for this software are covered by the
;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
;; which can be found in the file LICENSE at the root of this distribution.
;; By using this software in any fashion, you are agreeing to be bound by
;; the terms of this license.
;; You must not remove this notice, or any other, from this software.
(ns uncomplicate.clojurecuda.examples.vector-add-test
  (:require [midje.sweet :refer [facts =>]]
            [uncomplicate.commons.core :refer [with-release size]]
            [uncomplicate.clojure-cpp :refer [float-pointer pointer-seq]]
            [uncomplicate.clojurecuda.core
             :refer [compile! context device function grid-1d init launch! mem-alloc-driver
                     mem-alloc-pinned mem-alloc-runtime memcpy-host! module parameters program
                     synchronize! with-context]]))

(init)

;; Adds two 3-element vectors with the "add" kernel. Each operand comes from a
;; different allocator: runtime, driver, and pinned memory for the result.
(let [kernel-source (slurp "test/cuda/examples/jnvrtc-vector-add.cu")]
  (with-context (context (device))
    (with-release [compiled (compile! (program kernel-source))
                   modl (module compiled)
                   add-kernel (function modl "add")
                   host-a (float-pointer [1 2 3])
                   host-b (float-pointer [2 3 4])
                   host-sum (float-pointer 3)
                   gpu-a (mem-alloc-runtime (* Float/BYTES 3))
                   gpu-b (mem-alloc-driver (* Float/BYTES 3))
                   gpu-sum (mem-alloc-pinned (* Float/BYTES 3))]
      ;; The element count drives both the launch extent and the kernel's n argument.
      (let [n (size host-sum)]
        (facts
         "Vector add JCuda example."
         (memcpy-host! host-a gpu-a)
         (memcpy-host! host-b gpu-b)
         (launch! add-kernel (grid-1d n) (parameters n gpu-a gpu-b gpu-sum))
         (synchronize!)
         (pointer-seq (memcpy-host! gpu-sum host-sum)) => (seq [3.0 5.0 7.0]))))))

--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/info_test.clj:
--------------------------------------------------------------------------------
;; Copyright (c) Dragan Djuric. All rights reserved.
;; The use and distribution terms for this software are covered by the
;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
;; which can be found in the file LICENSE at the root of this distribution.
;; By using this software in any fashion, you are agreeing to be bound by
;; the terms of this license.
;; You must not remove this notice, or any other, from this software.
(ns uncomplicate.clojurecuda.info-test
  (:require [midje.sweet :refer [facts =>]]
            [uncomplicate.commons.core :refer [with-release info]]
            [uncomplicate.clojurecuda
             [core :refer [compile! context device function init module program stream with-context]]
             [info :refer [driver-version limit limit! stream-flag]]]
            [uncomplicate.clojurecuda.internal.constants :refer [stream-flags]]))

(init)

(facts
 "Driver info tests."
 (pos? (driver-version)) => true)

;; The expected entry counts below are pinned to the info maps built by the library.
(facts
 "Device info tests."
 (count (info (device 0))) => 83)

(with-release [ctx (context (device))]
  (facts
   "Context info tests."
   (count (info ctx)) => 13
   ;; A limit that was just set must read back with the same value.
   (limit! :stack-size 512) => 512
   (limit :stack-size) => 512))

(with-context (context (device))
  (with-release [hstream (stream :non-blocking)]
    (facts
     "Stream info tests."
     (count (info hstream)) => 2
     (stream-flag hstream) => (stream-flags :non-blocking)
     ;; This check has to sit inside the fact: placed after the closing parens,
     ;; the arrow is a stray top-level form and the :flag value is never verified.
     (:flag (info hstream)) => :non-blocking)))

(let [program-source (slurp "test/cuda/uncomplicate/clojurecuda/kernels/test.cu")]
  (with-context (context (device))
    (with-release [prog (compile! (program program-source))
                   modl (module prog)
                   fun (function modl "inc")]
      (facts
       "function info tests."
       (count (info fun)) => 7))))

--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/toolbox_test.clj:
--------------------------------------------------------------------------------
;; Copyright (c) Dragan Djuric. All rights reserved.
;; The use and distribution terms for this software are covered by the
;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
;; which can be found in the file LICENSE at the root of this distribution.
;; By using this software in any fashion, you are agreeing to be bound by
;; the terms of this license.
;; You must not remove this notice, or any other, from this software.

(ns uncomplicate.clojurecuda.toolbox-test
  (:require [midje.sweet :refer [facts => roughly]]
            [uncomplicate.commons
             [core :refer [with-release]]
             [utils :refer [count-groups]]]
            [uncomplicate.clojure-cpp :refer [float-pointer double-pointer pointer-seq]]
            [uncomplicate.clojurecuda
             [core :refer [compile! context device function init mem-alloc-runtime memcpy-host!
                           module program with-context]]
             [info :refer [max-block-dim-x]]
             [toolbox :refer [launch-reduce! read-double]]]))

(init)

;; Reduction tests over the values 0 .. cnt-1, viewed either as one vector of
;; cnt elements or as a cnt-m by cnt-n matrix. The expected total is
;; cnt * (cnt - 1) / 2 = 3926780329410.
(let [dev (device)
      cnt-m 311
      cnt-n 9011
      cnt (* cnt-m cnt-n)
      ;; The test kernels are compiled in one unit with the library's reduction source.
      program-source (str (slurp "src/cuda/uncomplicate/clojurecuda/kernels/reduction.cu") "\n"
                          (slurp "test/cuda/uncomplicate/clojurecuda/kernels/toolbox-test.cu"))]

  (with-context (context dev)
    ;; Kernel handles are looked up next to the test that uses them, so none are bound here.
    (with-release [wgs (max-block-dim-x dev)
                   prog (compile! (program program-source)
                                  ["-DREAL=float" "-DACCUMULATOR=double"
                                   (format "-DWGS=%d" wgs)])
                   modl (module prog)
                   data (float-pointer (range cnt))
                   cu-data (mem-alloc-runtime (* cnt Float/BYTES))]

      (memcpy-host! data cu-data)

      (let [acc-size (* Double/BYTES (max 1 (count-groups wgs cnt)))]
        (with-release [sum-reduction-kernel (function modl "sum_reduction")
                       sum-reduce-kernel (function modl "sum_reduce")
                       cu-acc (mem-alloc-runtime acc-size)]
          (facts
           "Test 1D reduction."
           (launch-reduce! nil sum-reduce-kernel sum-reduction-kernel [cu-acc cu-data] [cu-acc] cnt wgs)
           (read-double cu-acc) => 3926780329410.0)))

      (let [wgs-m 64
            wgs-n 16
            acc-size (* Double/BYTES (max 1 (* cnt-m (count-groups wgs-n cnt-n))))]
        ;; The host result buffer is released together with the device resources.
        (with-release [sum-reduction-horizontal (function modl "sum_reduction_horizontal")
                       sum-reduce-horizontal (function modl "sum_reduce_horizontal")
                       cu-acc (mem-alloc-runtime acc-size)
                       res (double-pointer cnt-m)]
          (facts
           "Test horizontal 2D reduction."
           (launch-reduce! nil sum-reduce-horizontal sum-reduction-horizontal
                           [cu-acc cu-data] [cu-acc] cnt-m cnt-n wgs-m wgs-n)
           (memcpy-host! cu-acc res)
           (apply + (pointer-seq res)) => (roughly 3.92678032941E12))))

      (let [wgs-m 64
            wgs-n 16
            acc-size (* Double/BYTES (max 1 (* cnt-n (count-groups wgs-m cnt-m))))]
        (with-release [sum-reduction-vertical (function modl "sum_reduction_vertical")
                       sum-reduce-vertical (function modl "sum_reduce_vertical")
                       cu-acc (mem-alloc-runtime acc-size)
                       res (double-pointer cnt-n)]
          (facts
           "Test vertical 2D reduction."
           (launch-reduce! nil sum-reduce-vertical sum-reduction-vertical
                           [cu-acc cu-data] [cu-acc] cnt-n cnt-m wgs-n wgs-m)
           (memcpy-host! cu-acc res)
           (apply + (pointer-seq res)) => (roughly 3.92678032941E12)))))))

--------------------------------------------------------------------------------
/test/clojure/uncomplicate/clojurecuda/utils_test.clj:
--------------------------------------------------------------------------------
;; Copyright (c) Dragan Djuric. All rights reserved.
;; The use and distribution terms for this software are covered by the
;; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) or later
;; which can be found in the file LICENSE at the root of this distribution.
;; By using this software in any fashion, you are agreeing to be bound by
;; the terms of this license.
;; You must not remove this notice, or any other, from this software.

(ns uncomplicate.clojurecuda.utils-test
  (:require [midje.sweet :refer [facts => throws]]
            [uncomplicate.clojurecuda.internal.utils :refer [error maybe with-check]]))

;; `error` builds an exception whose ex-data carries the numeric code, its name
;; (the raw number when the code is not recognised) and optional details.
(facts
 "error tests"

 (ex-data (error 0))
 => {:code 0, :details nil, :name :success :type :cuda}

 (ex-data (error -43))
 => {:code -43, :details nil, :name -43, :type :cuda}

 (ex-data (error 0 "Additional details"))
 => {:code 0, :details "Additional details", :name :success, :type :cuda})

;; `with-check` yields the given value when the status is 0 and throws otherwise.
(facts
 "with-check tests"
 (let [f (fn [x] (if x 0 -1))]
   (with-check (f 1) :success) => :success
   (with-check (f false) :success) => (throws clojure.lang.ExceptionInfo)))

(facts
 "maybe tests"
 (ex-data (maybe (throw (ex-info "Test Exception" {:data :test}))))
 => (throws clojure.lang.ExceptionInfo)

 (:type (ex-data (error -1 nil))) => :cuda)

--------------------------------------------------------------------------------
/test/cuda/examples/dynamic-parallelism.cu:
--------------------------------------------------------------------------------
/*
 * Created based on example from Marco Hutter:
 *
 * JCuda - Java bindings for NVIDIA CUDA
 *
 * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
 */

// Writes parentThreadIndex + 0.1 * threadIdx.x into data[threadIdx.x].
// `data` must hold at least blockDim.x floats; the kernel has no size parameter
// and therefore cannot check that itself.
extern "C"
__global__ void childKernel(unsigned int parentThreadIndex, float* data) {
    data[threadIdx.x] = parentThreadIndex + 0.1f * threadIdx.x;
}

// Every parent thread launches one child grid (1 block of 8 threads) over its own
// 8-float slice of `data`, starting at threadIdx.x * 8.
//   size - number of floats available in `data`
// Indexing uses threadIdx.x only, so every block of a multi-block launch addresses
// the same slices.
extern "C"
__global__ void parentKernel(unsigned int size, float *data) {
    // Threads per child grid, which is also the length of each parent thread's slice.
    const unsigned int childThreads = 8;
    const unsigned int offset = threadIdx.x * childThreads;
    // Skip the launch when the slice would run past the end of the buffer.
    if (offset + childThreads <= size) {
        childKernel<<<1, childThreads>>>(threadIdx.x, data + offset);
    }
}

--------------------------------------------------------------------------------
/test/cuda/examples/jnvrtc-vector-add.cu:
--------------------------------------------------------------------------------
/*
 * Created based on example from Marco Hutter:
| * 4 | * JCuda - Java bindings for NVIDIA CUDA 5 | * 6 | * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org 7 | */ 8 | 9 | extern "C" 10 | __global__ void add(int n, float *a, float *b, float *sum) { 11 | int i = blockIdx.x * blockDim.x + threadIdx.x; 12 | if (i < n) { 13 | sum[i] = a[i] + b[i]; 14 | } 15 | }; 16 | -------------------------------------------------------------------------------- /test/cuda/uncomplicate/clojurecuda/kernels/test.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | 3 | __global__ void inc (int n, float* a) { 4 | int i = blockIdx.x * blockDim.x + threadIdx.x; 5 | if (i < n) { 6 | a[i] += 1; 7 | } 8 | }; 9 | 10 | __device__ float gpu_a[] = {1.0, 2.0, 3.0}; 11 | 12 | __device__ __constant__ float constant_gpu_a[3]; 13 | 14 | __global__ void constant_inc (int n, float* a) { 15 | int i = blockIdx.x * blockDim.x + threadIdx.x; 16 | if (i < n) { 17 | a[i] += constant_gpu_a[i]; 18 | } 19 | }; 20 | 21 | } 22 | -------------------------------------------------------------------------------- /test/cuda/uncomplicate/clojurecuda/kernels/toolbox-test.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | 3 | __global__ void sum_reduce (const int n, ACCUMULATOR* acc, const REAL* x) { 4 | const int gid = blockIdx.x * blockDim.x + threadIdx.x; 5 | const ACCUMULATOR sum = block_reduction_sum( (gid < n) ? x[gid] : 0.0); 6 | if (threadIdx.x == 0) { 7 | acc[blockIdx.x] = sum; 8 | } 9 | }; 10 | 11 | __global__ void sum_reduce_horizontal (const int m, const int n, ACCUMULATOR* acc, const REAL* a) { 12 | const int gid_0 = blockIdx.x * blockDim.x + threadIdx.x; 13 | const int gid_1 = blockIdx.y * blockDim.y + threadIdx.y; 14 | const int i = m * gid_1 + gid_0; 15 | const bool valid = (gid_0 < m) && (gid_1 < n); 16 | const ACCUMULATOR sum = block_reduction_sum_2( (valid) ? 
a[i] : 0.0); 17 | const bool write = valid && (threadIdx.y == 0); 18 | if (write) { 19 | acc[m * blockIdx.y + gid_0] = sum; 20 | } 21 | } 22 | 23 | __global__ void sum_reduce_vertical (const int m, const int n, ACCUMULATOR* acc, const REAL* a) { 24 | const int gid_0 = blockIdx.x * blockDim.x + threadIdx.x; 25 | const int gid_1 = blockIdx.y * blockDim.y + threadIdx.y; 26 | const int i = n * gid_0 + gid_1; 27 | const bool valid = (gid_0 < m) && (gid_1 < n); 28 | const ACCUMULATOR sum = block_reduction_sum_2( (valid) ? a[i] : 0.0); 29 | const bool write = valid && (threadIdx.y == 0); 30 | if (write) { 31 | acc[m * blockIdx.y + gid_0] = sum; 32 | } 33 | } 34 | } 35 | --------------------------------------------------------------------------------