├── .github └── FUNDING.yml ├── .gitignore ├── .gitlab-ci.yml ├── CHANGELOG.md ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── LICENSE.md ├── README.md ├── accel-core ├── .cargo │ └── config ├── Cargo.toml ├── README.md ├── rust-toolchain └── src │ └── lib.rs ├── accel-derive ├── Cargo.toml ├── README.md ├── src │ ├── builder.rs │ ├── contexted.rs │ ├── host.rs │ ├── launchable.rs │ ├── lib.rs │ └── parser.rs └── tests │ ├── kernels │ ├── arguments.rs │ ├── dependencies.rs │ ├── dependencies_default.rs │ ├── dependencies_git.rs │ └── do_nothing.rs │ └── try_build.rs ├── accel ├── Cargo.toml ├── benches │ └── memcpy.rs ├── examples │ └── add.rs ├── src │ ├── block.rs │ ├── device.rs │ ├── error.rs │ ├── execution.rs │ ├── grid.rs │ ├── instruction.rs │ ├── lib.rs │ ├── linker.rs │ ├── memory │ │ ├── array.rs │ │ ├── device.rs │ │ ├── dimension.rs │ │ ├── info.rs │ │ ├── mod.rs │ │ ├── page_locked.rs │ │ ├── registered.rs │ │ ├── scalar.rs │ │ └── slice.rs │ ├── module.rs │ ├── profiler.rs │ └── stream.rs └── tests │ ├── argref.rs │ ├── data │ ├── Makefile │ ├── add.cu │ ├── add.cubin │ ├── add.ptx │ ├── sub.cu │ ├── sub.cubin │ └── sub.ptx │ ├── launch_async.rs │ ├── launch_async │ ├── mut_ref_fail.rs │ ├── mut_ref_fail.stderr │ └── mut_ref_success.rs │ ├── read_host_memory.rs │ └── slice.rs ├── diagrams ├── .gitignore ├── compile_flow.png ├── compile_flow.svg └── compile_flow.tex ├── docker ├── .gitignore ├── Makefile ├── README.md ├── centos.Dockerfile ├── cuda.conf └── ubuntu.Dockerfile ├── public └── index.html └── setup_nvptx_toolchain.sh /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [termoshtt] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | target/ 4 | *.rustfmt 5 | rusty-tags.* 6 | 7 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 8 | # More information here http://doc.crates.io/guide.html#cargotoml-vs-cargolock 9 | Cargo.lock 10 | 11 | # cargo fmt 12 | *.bk 13 | 14 | # generated PTX 15 | *.s 16 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: registry.gitlab.com/termoshtt/accel/ubuntu18.04-cuda10.2-nightly2020-05-01:master 2 | 3 | variables: 4 | AWS_DEFAULT_REGION: ap-northeast-1 5 | BUCKET_NAME: accel-gitlab-ci 6 | 7 | stages: 8 | - docker 9 | - test 10 | - bench 11 | - package 12 | - deploy 13 | 14 | test:cargo-clippy: 15 | stage: test 16 | script: 17 | - cargo clippy 18 | 19 | test:cargo-fmt: 20 | stage: test 21 | script: 22 | - cargo fmt -- --check 23 | 24 | .with_gpu: 25 | before_script: 26 | - nvidia-smi 27 | tags: 28 | - gpu 29 | only: 30 | - master 31 | - tags 32 | - /^gpu-.*/ 33 | 34 | test:accel: 35 | extends: .with_gpu 36 | stage: test 37 | script: 38 | - cargo test 39 | 40 | test:ignored: 41 | extends: .with_gpu 42 | stage: test 43 | script: 44 | - cd accel 45 | - cargo test -- --ignored 46 | allow_failure: true 47 | 48 | bench: 49 | extends: .with_gpu 50 | stage: bench 51 | script: 52 | - rm -rf accel/target/criterion 53 | - cargo bench 54 | - mv accel/target/criterion 
public/benchmark 55 | artifacts: 56 | paths: 57 | - public/benchmark 58 | only: 59 | variables: 60 | - $CI_COMMIT_MESSAGE =~ /\[bench\]/ 61 | - $CI_RUN_BENCHMARK 62 | 63 | changelog: 64 | image: debian 65 | stage: test 66 | before_script: 67 | - apt update 68 | - apt install -y git 69 | script: 70 | - test -n "$(git diff origin/master CHANGELOG.md)" 71 | except: 72 | - master 73 | 74 | package: 75 | stage: package 76 | script: 77 | # Document of accel, accel-derive 78 | - cargo doc --no-deps --document-private-items 79 | - mv target/doc public/accel 80 | # Document of accel-core 81 | - cd accel-core 82 | - cargo doc 83 | - mv ./target/nvptx64-nvidia-cuda/doc ../public/accel-core 84 | - cd - 85 | artifacts: 86 | paths: 87 | - public 88 | 89 | pages: 90 | stage: deploy 91 | dependencies: 92 | - package 93 | script: 94 | - find public 95 | artifacts: 96 | paths: 97 | - public 98 | only: 99 | - master 100 | 101 | .s3: 102 | image: python 103 | stage: deploy 104 | dependencies: 105 | - package 106 | before_script: 107 | - pip install awscli 108 | only: 109 | - master 110 | - tags 111 | - /^gpu-.*/ 112 | 113 | deploy_s3: 114 | extends: .s3 115 | script: 116 | - aws s3 cp public s3://${BUCKET_NAME}/${CI_COMMIT_REF_SLUG} --recursive --acl public-read 117 | environment: 118 | name: ${CI_COMMIT_REF_SLUG} 119 | url: https://${BUCKET_NAME}.s3-${AWS_DEFAULT_REGION}.amazonaws.com/${CI_COMMIT_REF_SLUG}/index.html 120 | on_stop: clean_s3 121 | 122 | clean_s3: 123 | extends: .s3 124 | script: 125 | - aws s3 rm s3://${BUCKET_NAME}/${CI_COMMIT_REF_SLUG} --recursive 126 | environment: 127 | name: ${CI_COMMIT_REF_SLUG} 128 | action: stop 129 | when: manual 130 | 131 | .build: 132 | image: docker:stable 133 | stage: docker 134 | services: 135 | - docker:dind 136 | before_script: 137 | - apk add make 138 | - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY 139 | script: 140 | - make -C docker ${CI_JOB_NAME} 141 | only: 142 | refs: 143 | - master 144 | - tags 145 | changes: 146 | - docker/* 147 | 148 | centos7-cuda10.0-nightly2020-01-02: 149 | extends: .build 150 | centos7-cuda10.1-nightly2020-01-02: 151 | extends: .build 152 | centos7-cuda10.2-nightly2020-01-02: 153 | extends: .build 154 | ubuntu18.04-cuda10.0-nightly2020-01-02: 155 | extends: .build 156 | ubuntu18.04-cuda10.1-nightly2020-01-02: 157 | extends: .build 158 | ubuntu18.04-cuda10.2-nightly2020-01-02: 159 | extends: .build 160 | centos7-cuda10.0-nightly2020-05-01: 161 | extends: .build 162 | centos7-cuda10.1-nightly2020-05-01: 163 | extends: .build 164 | centos7-cuda10.2-nightly2020-05-01: 165 | extends: .build 166 | ubuntu18.04-cuda10.0-nightly2020-05-01: 167 | extends: .build 168 | ubuntu18.04-cuda10.1-nightly2020-05-01: 169 | extends: .build 170 | ubuntu18.04-cuda10.2-nightly2020-05-01: 171 | extends: .build 172 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Unreleased (will be 0.4.0) 2 | =========================== 3 | 4 | ### Added 5 | 6 | - async/.await support 7 | - memcpy https://gitlab.com/termoshtt/accel/-/merge_requests/85 8 | - kernel launch https://gitlab.com/termoshtt/accel/-/merge_requests/88 9 | - `ContextRef` struct https://gitlab.com/termoshtt/accel/-/merge_requests/83 10 | - memcpy benchmark https://gitlab.com/termoshtt/accel/-/merge_requests/81 11 | 12 | ### Changed 13 | 14 | - `Memory` trait update 15 | - memcpy implementation uses unified addressing 
https://gitlab.com/termoshtt/accel/-/merge_requests/84 16 | - `Memset` trait is merged into `Memory` trait https://gitlab.com/termoshtt/accel/-/merge_requests/96 17 | - Kernel launch API changes, refactoring the `Launchable` and `DeviceSend` traits 18 | - reference support https://gitlab.com/termoshtt/accel/-/merge_requests/90 19 | - Host slice to device pointer conversion https://gitlab.com/termoshtt/accel/-/merge_requests/91 20 | - `module` sub-module split https://gitlab.com/termoshtt/accel/-/merge_requests/89 21 | - `#[kernel]` proc-macro works in accel crate https://gitlab.com/termoshtt/accel/-/merge_requests/97 22 | - Fixed spelling issues in Readme https://gitlab.com/termoshtt/accel/-/merge_requests/99 23 | 24 | ### Maintenance 25 | 26 | - Force write CHANGELOG on each merge request https://gitlab.com/termoshtt/accel/-/merge_requests/95 27 | 28 | 0.3.1 - 2020-05-25 29 | ------------------- 30 | 31 | - HotFix for `impl_array_scalar` macro https://gitlab.com/termoshtt/accel/-/issues/58 https://gitlab.com/termoshtt/accel/-/issues/59 https://gitlab.com/termoshtt/accel/-/merge_requests/80 32 | 33 | 0.3.0 - 2020-05-04 34 | =================== 35 | 36 | ### Added 37 | 38 | - RAII based Profiler API https://gitlab.com/termoshtt/accel/-/merge_requests/74 39 | - Registered Host memory https://gitlab.com/termoshtt/accel/-/merge_requests/73 40 | - Memcpy, Memset traits https://gitlab.com/termoshtt/accel/-/merge_requests/70 https://gitlab.com/termoshtt/accel/-/merge_requests/60 https://gitlab.com/termoshtt/accel/-/merge_requests/59 https://gitlab.com/termoshtt/accel/-/merge_requests/58 41 | - `Into<Grid>` and `Into<Block>` for primitive types https://gitlab.com/termoshtt/accel/-/merge_requests/55 42 | 43 | ### Changed 44 | 45 | - Use Rust nightly-2020-05-01 https://gitlab.com/termoshtt/accel/-/merge_requests/75 46 | - Build a container with nightly-2020-05-01 https://gitlab.com/termoshtt/accel/-/merge_requests/76 47 | - Switch to `nvidia/cuda:*-base` containers https://gitlab.com/termoshtt/accel/-/merge_requests/67 48 | - Use `Arc<Context>` instead of `&Context` https://gitlab.com/termoshtt/accel/-/merge_requests/66 49 | - Export `accel-derive::kernel` into `accel::` https://gitlab.com/termoshtt/accel/-/merge_requests/68 50 | - Do not `panic!` on `Drop` of CUDA bindings https://gitlab.com/termoshtt/accel/-/merge_requests/53 51 | 52 | ### Removed 53 | - Inconsistent f64 support https://gitlab.com/termoshtt/accel/-/merge_requests/71 54 | - `Launchable::stream_launch` because of its unsafety https://gitlab.com/termoshtt/accel/-/merge_requests/69 55 | 56 | ### Others 57 | 58 | - Add cargo-clippy and cargo-fmt tests on CI https://gitlab.com/termoshtt/accel/-/merge_requests/65 59 | 60 | 0.3.0-alpha.2 - 2020-04-06 61 | ---------------------------- 62 | 63 | - Minimum Supported Rust Version (MSRV) is now 1.42 64 | 65 | ### Without CUDA Runtime API 66 | 67 | - Rewrite using [CUDA Driver API](https://docs.nvidia.com/cuda/cuda-driver-api/index.html) https://gitlab.com/termoshtt/accel/-/issues/19 68 | - Explicit RAII handling of CUDA Context https://gitlab.com/termoshtt/accel/-/merge_requests/51 69 | - CUDA Managed memories 70 | - Device memory https://gitlab.com/termoshtt/accel/-/merge_requests/40 71 | - Page-locked host memory https://gitlab.com/termoshtt/accel/-/merge_requests/47 72 | - CUDA Stream / Event handlers https://gitlab.com/termoshtt/accel/-/merge_requests/52 73 | - Asynchronous Kernel launch 74 | 75 | ### alloc for device code 76 | 77 | - Global allocator using CUDA's malloc/free
https://gitlab.com/termoshtt/accel/-/merge_requests/26 78 | - `println!`, `assert_eq!` support https://gitlab.com/termoshtt/accel/-/merge_requests/25 79 | 80 | ### Move to GitLab 81 | 82 | - GitHub Actions has several problems 83 | - https://github.com/rust-accel/docker-action 84 | - https://github.com/rust-accel/container 85 | - GPU hosted runner for GitLab CI is now working on an instance managed by RICOS Co. Ltd. https://gitlab.com/termoshtt/accel/-/merge_requests/28 86 | 87 | 0.3.0-alpha.1 - 2020-01-12 88 | --------------------------- 89 | 90 | [Restart Accel Project!](https://github.com/rust-accel/accel/issues/64) 91 | 92 | ### Stable Rust 93 | 94 | Stabilize Host-side code, though device-side code still requires nightly. 95 | 96 | - Rust 2018 edition https://github.com/rust-accel/accel/pull/70 97 | - proc-macro has been stabilized as https://github.com/rust-accel/accel/pull/63 98 | - cargo check runs on stable Rust https://github.com/rust-accel/accel/pull/66 99 | 100 | ### Update dependencies 101 | 102 | - syn, quote, proc-macro2 1.0 https://github.com/rust-accel/accel/pull/67 103 | - rust-cuda/cuda-{runtime,driver}-sys 0.3.0-alpha.1 https://github.com/rust-accel/accel/pull/66 104 | 105 | ### rust-ptx-linker 106 | 107 | Linker flavor using rust-ptx-linker has been merged into rustc https://github.com/rust-lang/rust/pull/57937 108 | 109 | - Rewrite accel-derive with rust-ptx-linker https://github.com/rust-accel/accel/pull/71 110 | - archive [nvptx](https://github.com/rust-accel/nvptx) and other crates 111 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "accel", 4 | "accel-derive", 5 | ] 6 | 7 | exclude = [ 8 | "accel-core", 9 | ] 10 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Toshiki Teramura 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Licence 2 | -------- 3 | Dual-licensed to be compatible with the Rust project. 4 | 5 | - [Apache License, Version 2.0](./LICENSE-APACHE) 6 | - [the MIT license](./LICENSE-MIT) 7 | 8 | In addition, you must refer to the [End User License Agreement](https://docs.nvidia.com/cuda/eula/index.html) when using CUDA.
9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Accel: GPGPU Framework for Rust 2 | ================================ 3 | 4 | [![pipeline status](https://gitlab.com/termoshtt/accel/badges/master/pipeline.svg)](https://gitlab.com/termoshtt/accel/commits/master) 5 | 6 | |crate |crates.io |docs.rs |GitLab Pages | | 7 | |:-----------|:---------------------------------------------------------------------------|:----------------------------------------------------------------------|:-----------------------------------------------------------------------------|:------------------------------------------| 8 | |accel |[![Crate](http://meritbadge.herokuapp.com/accel)][crate/accel] |[![docs.rs](https://docs.rs/accel/badge.svg)][docs/accel] |[![cargo-doc](https://img.shields.io/badge/doc-master-blue)][dev/accel] |CUDA-based GPGPU framework | 9 | |accel-core |[![Crate](http://meritbadge.herokuapp.com/accel-core)][crate/accel-core] |[![docs.rs](https://docs.rs/accel-core/badge.svg)][docs/accel-core] |[![cargo-doc](https://img.shields.io/badge/doc-master-blue)][dev/accel-core] |Helper for writing device code | 10 | |accel-derive|[![Crate](http://meritbadge.herokuapp.com/accel-derive)][crate/accel-derive]|[![docs.rs](https://docs.rs/accel-derive/badge.svg)][docs/accel-derive]|[![cargo-doc](https://img.shields.io/badge/doc-master-blue)][dev/accel-derive]|Procedural macro for generating kernel code| 11 | 12 | [crate/accel]: https://crates.io/crates/accel/0.3.0 13 | [crate/accel-core]: https://crates.io/crates/accel-core/0.3.0 14 | [crate/accel-derive]: https://crates.io/crates/accel-derive/0.3.0 15 | 16 | [docs/accel]: https://docs.rs/accel/0.3.0 17 | [docs/accel-core]: https://docs.rs/accel-core/0.3.0 18 | [docs/accel-derive]: https://docs.rs/accel-derive/0.3.0 19 | 20 | [dev/accel]: https://termoshtt.gitlab.io/accel/accel/accel 21 | [dev/accel-core]: https://termoshtt.gitlab.io/accel/accel/accel_core 22 | [dev/accel-derive]: https://termoshtt.gitlab.io/accel/accel/accel_derive 23 | 24 | Requirements 25 | ------------ 26 | ![minimum supported rust version](https://img.shields.io/badge/rustc-1.42+-red.svg) 27 | 28 | - Minimum Supported Rust Version (MSRV) is 1.42.0 29 | - Install [CUDA](https://developer.nvidia.com/cuda-downloads) on your system 30 | - accel depends on the CUDA Driver API through [rust-cuda/cuda-sys](https://github.com/rust-cuda/cuda-sys) 31 | - accel does not depend on the CUDA Runtime API. This means that a compiled binary requires only `libcuda.so` at runtime, which is far lighter than the entire CUDA development toolkit. 32 | - Set up the NVPTX target of Rust 33 | - Install the `nightly-2020-05-01` toolchain with the `nvptx64-nvidia-cuda` target, and [rust-ptx-linker](https://github.com/denzp/rust-ptx-linker) 34 | - There is a [setup script](setup_nvptx_toolchain.sh) for them: 35 | 36 | ``` 37 | curl -sSL https://gitlab.com/termoshtt/accel/raw/master/setup_nvptx_toolchain.sh | bash 38 | ``` 39 | 40 | Or, you can use the [docker container](./docker) 41 |
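Example
--------

A minimal host-side program looks like the following sketch, assembled from the `#[kernel]` doctest in accel-derive and the launch/memory calls used in accel's own tests and benches; the pointer-based argument tuple (`as_ptr`/`as_mut_ptr` on `DeviceMemory`) is an assumption here:

```rust
use accel::*;

#[kernel]
unsafe fn add(a: *const f64, b: *const f64, c: *mut f64, n: usize) {
    let i = accel_core::index();
    if (i as usize) < n {
        *c.offset(i) = *a.offset(i) + *b.offset(i);
    }
}

fn main() -> error::Result<()> {
    let device = Device::nth(0)?;
    let ctx = device.create_context();

    let n = 32;
    let mut a = DeviceMemory::<f64>::zeros(&ctx, n);
    let mut b = DeviceMemory::<f64>::zeros(&ctx, n);
    let mut c = DeviceMemory::<f64>::zeros(&ctx, n);
    for i in 0..n {
        a[i] = i as f64;
        b[i] = 2.0 * i as f64;
    }

    // Launch one block of n threads; kernel arguments are passed as a tuple
    add(&ctx, Grid::x(1), Block::x(n as u32), (a.as_ptr(), b.as_ptr(), c.as_mut_ptr(), n))?;
    Ok(())
}
```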
42 | Limitations 43 | ------------ 44 | This project is still at an early stage. There are several limitations, as follows: 45 | 46 | - For the runtime on CPU 47 | - [Windows](https://gitlab.com/termoshtt/accel/-/issues/25) and macOS are not supported 48 | - [f64](https://gitlab.com/termoshtt/accel/-/issues/53) and [Complex number](https://gitlab.com/termoshtt/accel/-/issues/54) support is missing 49 | - [Texture/Surface object handling](https://gitlab.com/termoshtt/accel/-/issues/40) is missing 50 | - Async features based on CUDA Stream and Events are disabled until [async/.await support](https://gitlab.com/termoshtt/accel/-/issues/4) 51 | 52 | - For writing GPU kernel code 53 | - [libstd cannot be used in writing kernel](https://gitlab.com/termoshtt/accel/-/issues/38) 54 | - [Rust slice cannot be used in writing kernel](https://gitlab.com/termoshtt/accel/-/issues/7) 55 | - [Shared memory](https://gitlab.com/termoshtt/accel/-/issues/39) cannot be used 56 | 57 | Contribution 58 | ------------ 59 | This project is developed on [GitLab](https://gitlab.com/termoshtt/accel) and mirrored to [GitHub](https://github.com/rust-accel/accel). 60 | 61 | Sponsors 62 | -------- 63 | - [RICOS Co. Ltd](https://www.ricos.co.jp/) 64 | - GPU instances for CI and development 65 | 66 | Links 67 | ------ 68 | 69 | Projects which accel depends on: 70 | 71 | - [rust-cuda/cuda-sys](https://github.com/rust-cuda/cuda-sys): CUDA Runtime and Driver API binding to Rust 72 | - [denzp/rust-ptx-linker](https://github.com/denzp/rust-ptx-linker): Linker for PTX files generated by `rustc` 73 | 74 | Related Projects: 75 | 76 | - [rust-cuda/wg](https://github.com/rust-cuda/wg): Working group for Rust CUDA Team 77 | - [denzp/rust-ptx-builder](https://github.com/denzp/rust-ptx-builder): Another CUDA kernel builder from Rust crate 78 | - [bheisler/RustaCUDA](https://github.com/bheisler/RustaCUDA): Another CUDA-based Rust framework 79 | -------------------------------------------------------------------------------- /accel-core/.cargo/config: -------------------------------------------------------------------------------- 1 | [build] 2 | target = "nvptx64-nvidia-cuda" 3 | -------------------------------------------------------------------------------- /accel-core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "accel-core" 3 | version = "0.3.0" 4 | authors = ["Toshiki Teramura "] 5 | edition = "2018" 6 | 7 | description = "Support crate for writing GPGPU kernels using accel" 8 | documentation = "https://docs.rs/accel-core/" 9 | repository = "https://github.com/termoshtt/accel" 10 | keywords = ["GPGPU", "CUDA", "platform-intrinsic"] 11 | license = "MIT/Apache-2.0" 12 | readme = "README.md" 13 | categories = [] 14 | 15 | [package.metadata.docs.rs] 16 | targets = ["nvptx64-nvidia-cuda"] 17 | -------------------------------------------------------------------------------- /accel-core/README.md: -------------------------------------------------------------------------------- 1 | accel-core 2 | =========== 3 | 4 | [![Crate](http://meritbadge.herokuapp.com/accel-core)](https://crates.io/crates/accel-core) 5 | [![docs.rs](https://docs.rs/accel-core/badge.svg)](https://docs.rs/accel-core) 6 | 7 | Support crate for writing kernels 8 | -------------------------------------------------------------------------------- /accel-core/rust-toolchain: -------------------------------------------------------------------------------- 1 | nightly-2020-05-01 2 | -------------------------------------------------------------------------------- /accel-core/src/lib.rs:
-------------------------------------------------------------------------------- 1 | //! Support crate for writing GPU kernels in Rust 2 | //! 3 | //! - This crate works only for the `nvptx64-nvidia-cuda` target 4 | //! - There is no support of `libstd` for the `nvptx64-nvidia-cuda` target, 5 | //! i.e. you need to write `#![no_std]` Rust code. 6 | //! - The `alloc` crate is supported by `accel_core::PTXAllocator`, which utilizes CUDA malloc/free system-calls 7 | //! - You can use `println!` and `assert_eq!` through it. 8 | 9 | #![feature(stdsimd)] 10 | #![no_std] 11 | 12 | extern crate alloc; 13 | 14 | use alloc::alloc::*; 15 | use core::arch::nvptx; 16 | 17 | /// Memory allocator using CUDA malloc/free 18 | pub struct PTXAllocator; 19 | 20 | unsafe impl GlobalAlloc for PTXAllocator { 21 | unsafe fn alloc(&self, layout: Layout) -> *mut u8 { 22 | nvptx::malloc(layout.size()) as *mut u8 23 | } 24 | unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) { 25 | nvptx::free(ptr as *mut _); 26 | } 27 | } 28 | 29 | /// Alternative to [std::print!](https://doc.rust-lang.org/std/macro.print.html) using the CUDA `vprintf` system-call 30 | #[macro_export] 31 | macro_rules! print { 32 | ($($arg:tt)*) => { 33 | let msg = ::alloc::format!($($arg)*); 34 | unsafe { 35 | ::core::arch::nvptx::vprintf(msg.as_ptr(), ::core::ptr::null_mut()); 36 | } 37 | } 38 | } 39 | 40 | /// Alternative to [std::println!](https://doc.rust-lang.org/std/macro.println.html) using the CUDA `vprintf` system-call 41 | #[macro_export] 42 | macro_rules! println { 43 | () => ($crate::print!("\n")); 44 | ($fmt:expr) => ($crate::print!(concat!($fmt, "\n"))); 45 | ($fmt:expr, $($arg:tt)*) => ($crate::print!(concat!($fmt, "\n"), $($arg)*)); 46 | } 47 | 48 | /// Assertion in GPU kernels that two expressions are equal. 49 | /// 50 | /// If the assertion fails, the accel API will return [accel::error::AccelError::DeviceAssertionFailed](https://docs.rs/accel/0.3.0-alpha.2/accel/error/enum.AccelError.html#variant.DeviceAssertionFailed) 51 | #[macro_export] 52 | macro_rules! assert_eq { 53 | ($a:expr, $b:expr) => { 54 | if $a != $b { 55 | let msg = alloc::format!( 56 | "\nassertion failed: ({} == {})\nleft : {:?}\nright: {:?}", 57 | stringify!($a), 58 | stringify!($b), 59 | $a, 60 | $b 61 | ); 62 | unsafe { 63 | ::core::arch::nvptx::__assert_fail( 64 | msg.as_ptr(), 65 | file!().as_ptr(), 66 | line!(), 67 | // FIXME cannot get function name. 68 | // See https://github.com/rust-lang/rfcs/pull/2818 69 | "".as_ptr(), 70 | ) 71 | }; 72 | } 73 | }; 74 | } 75 | 76 | /// Assertion in GPU kernels that two expressions are not equal. 77 | /// 78 | /// If the assertion fails, the accel API will return [accel::error::AccelError::DeviceAssertionFailed](https://docs.rs/accel/0.3.0-alpha.2/accel/error/enum.AccelError.html#variant.DeviceAssertionFailed) 79 | #[macro_export] 80 | macro_rules! assert_ne { 81 | ($a:expr, $b:expr) => { 82 | if $a == $b { 83 | let msg = alloc::format!( 84 | "\nassertion failed: ({} != {})\nleft : {:?}\nright: {:?}", 85 | stringify!($a), 86 | stringify!($b), 87 | $a, 88 | $b 89 | ); 90 | unsafe { 91 | ::core::arch::nvptx::__assert_fail( 92 | msg.as_ptr(), 93 | file!().as_ptr(), 94 | line!(), 95 | // FIXME cannot get function name. 96 | // See https://github.com/rust-lang/rfcs/pull/2818 97 | "".as_ptr(), 98 | ) 99 | }; 100 | } 101 | }; 102 | } 103 | 104 | /// Dimensions specified at kernel launch 105 | pub struct Dim3 { 106 | pub x: i32, 107 | pub y: i32, 108 | pub z: i32, 109 | } 110 | 111 | /// Indices where the kernel code is running 112 | pub struct Idx3 { 113 | pub x: i32, 114 | pub y: i32, 115 | pub z: i32, 116 | } 117 | 118 | pub fn block_dim() -> Dim3 { 119 | unsafe { 120 | Dim3 { 121 | x: nvptx::_block_dim_x(), 122 | y: nvptx::_block_dim_y(), 123 | z: nvptx::_block_dim_z(), 124 | } 125 | } 126 | } 127 | 128 | pub fn block_idx() -> Idx3 { 129 | unsafe { 130 | Idx3 { 131 | x: nvptx::_block_idx_x(), 132 | y: nvptx::_block_idx_y(), 133 | z: nvptx::_block_idx_z(), 134 | } 135 | } 136 | } 137 | 138 | pub fn grid_dim() -> Dim3 { 139 | unsafe { 140 | Dim3 { 141 | x: nvptx::_grid_dim_x(), 142 | y: nvptx::_grid_dim_y(), 143 | z: nvptx::_grid_dim_z(), 144 | } 145 | } 146 | } 147 | 148 | pub fn thread_idx() -> Idx3 { 149 | unsafe { 150 | Idx3 { 151 | x: nvptx::_thread_idx_x(), 152 | y: nvptx::_thread_idx_y(), 153 | z: nvptx::_thread_idx_z(), 154 | } 155 | } 156 | } 157 | 158 | impl Dim3 { 159 | pub fn size(&self) -> i32 { 160 | self.x * self.y * self.z 161 | } 162 | } 163 | 164 | impl Idx3 { 165 | pub fn into_id(&self, dim: Dim3) -> i32 { 166 | self.x + self.y * dim.x + self.z * dim.x * dim.y 167 | } 168 | } 169 | 170 | pub fn index() -> isize { 171 | let block_id = block_idx().into_id(grid_dim()); 172 | let thread_id = thread_idx().into_id(block_dim()); 173 | (block_id + thread_id) as isize 174 | } 175 |
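Putting these helpers together, a device-side kernel typically computes the flattened global index and bounds-checks it before touching memory. A minimal sketch (the `scale` kernel and its arguments are hypothetical, following the pattern used in accel's own tests):

```rust
use accel_derive::kernel;

#[kernel]
unsafe fn scale(x: *mut f32, factor: f32, n: usize) {
    // index() flattens block and thread indices into a single 1-D id
    let i = accel_core::index();
    if (i as usize) < n {
        *x.offset(i) *= factor;
    }
}
```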
-------------------------------------------------------------------------------- /accel-derive/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "accel-derive" 3 | version = "0.3.0" 4 | authors = ["Toshiki Teramura "] 5 | edition = "2018" 6 | 7 | description = "Procedural macro for writing GPGPU kernels" 8 | documentation = "https://docs.rs/accel-derive/" 9 | repository = "https://github.com/termoshtt/accel" 10 | keywords = ["GPGPU", "CUDA", "proc-macro"] 11 | license = "MIT/Apache-2.0" 12 | readme = "README.md" 13 | categories = [] 14 | 15 | [lib] 16 | proc-macro = true 17 | 18 | [dependencies] 19 | proc-macro-crate = "0.1" 20 | proc-macro2 = "1.0.18" 21 | quote = "1.0.6" 22 | syn = { version = "1.0.30", features = ["full", "extra-traits"] } 23 | 24 | dirs = "2.0.2" 25 | maplit = "1.0.2" 26 | serde = { version = "1.0.111", features = ["derive"] } 27 | toml = "0.5.6" 28 | 29 | failure = "0.1.8" 30 | anyhow = "1.0.31" 31 | 32 | [dev-dependencies] 33 | trybuild = "1.0.27" 34 | accel = { version = "0.4.0-alpha.0", path = "../accel" } 35 | -------------------------------------------------------------------------------- /accel-derive/README.md: -------------------------------------------------------------------------------- 1 | accel-derive 2 | ============= 3 | 4 | [![Crate](http://meritbadge.herokuapp.com/accel-derive)](https://crates.io/crates/accel-derive) 5 | [![docs.rs](https://docs.rs/accel-derive/badge.svg)](https://docs.rs/accel-derive) 6 | 7 | Procedural-macro crate for `#[kernel]`. A `#[kernel]` function will be converted into two parts: 8 | 9 | - Device code, which will be compiled into PTX assembly 10 | - Host code, which calls the generated device code (the PTX asm) using the `accel::module` API 11 | 12 | ![Compile flow graph](../diagrams/compile_flow.png) 13 |
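On the host side, the macro output has the shape sketched below: a submodule holding the embedded PTX plus a caller function of the same name. This is a simplified rendering of what `impl_submodule` and `caller` in `src/host.rs` (further down) generate for a hypothetical one-argument kernel `add(n: usize)`; the PTX string is elided:

```rust
/// Auto-generated by accel-derive (sketch)
mod add {
    pub const PTX_STR: &'static str = "/* PTX compiled from the kernel body */";

    pub struct Module(accel::Module);

    impl Module {
        pub fn new(ctx: &accel::Context) -> accel::error::Result<Self> {
            Ok(Module(accel::Module::from_str(ctx, PTX_STR)?))
        }
    }

    impl<'arg> accel::execution::Launchable1<'arg> for Module {
        type Target1 = usize;
        fn get_kernel(&self) -> accel::error::Result<accel::Kernel> {
            Ok(self.0.get_kernel("add")?)
        }
    }
}

/// Caller with the same name as the kernel: loads the module and launches it
pub fn add<'arg, Arg1>(
    ctx: &accel::Context,
    grid: impl Into<accel::Grid>,
    block: impl Into<accel::Block>,
    args: (Arg1,),
) -> accel::error::Result<()>
where
    Arg1: accel::execution::DeviceSend,
{
    use accel::execution::Launchable1;
    let module = add::Module::new(ctx)?;
    module.launch(grid, block, args)?;
    Ok(())
}
```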
-------------------------------------------------------------------------------- /accel-derive/src/builder.rs: -------------------------------------------------------------------------------- 1 | use crate::parser::*; 2 | use failure::*; 3 | use quote::quote; 4 | use std::{ 5 | collections::{hash_map::DefaultHasher, HashMap}, 6 | env, fs, 7 | hash::*, 8 | io::{Read, Write}, 9 | path::*, 10 | process::Command, 11 | }; 12 | 13 | const NIGHTLY_VERSION: &str = "nightly-2020-05-01"; 14 | 15 | trait CheckRun { 16 | fn check_run(&mut self) -> Fallible<()>; 17 | } 18 | 19 | impl CheckRun for Command { 20 | fn check_run(&mut self) -> Fallible<()> { 21 | // Filter CARGO_* and OUT_DIR envs 22 | let filtered_env: HashMap<String, String> = env::vars() 23 | .filter(|&(ref k, _)| !(k.starts_with("CARGO") || k == "OUT_DIR")) 24 | .collect(); 25 | let output = self.env_clear().envs(&filtered_env).output()?; 26 | if !output.status.success() { 27 | println!("{}", std::str::from_utf8(&output.stdout)?); 28 | eprintln!("{}", std::str::from_utf8(&output.stderr)?); 29 | bail!("External command failed: {:?}", self); 30 | } 31 | Ok(()) 32 | } 33 | } 34 | 35 | /// Generate Rust code for nvptx64-nvidia-cuda target from tokens 36 | fn ptx_kernel(func: &syn::ItemFn) -> String { 37 | let vis = &func.vis; 38 | let ident = &func.sig.ident; 39 | let unsafety = &func.sig.unsafety; 40 | let block = &func.block; 41 | 42 | let fn_token = &func.sig.fn_token; 43 | let inputs = &func.sig.inputs; 44 | let output = &func.sig.output; 45 | 46 | let kernel = quote! { 47 | #![feature(abi_ptx, stdsimd, alloc_error_handler)] 48 | #![no_std] 49 | extern crate alloc; 50 | #[global_allocator] 51 | static _GLOBAL_ALLOCATOR: accel_core::PTXAllocator = accel_core::PTXAllocator; 52 | #[no_mangle] 53 | #vis #unsafety extern "ptx-kernel" #fn_token #ident(#inputs) #output #block 54 | #[panic_handler] 55 | fn panic(_info: &::core::panic::PanicInfo) -> ! { 56 | unsafe { ::core::arch::nvptx::trap() } 57 | } 58 | #[alloc_error_handler] 59 | fn alloc_error_handler(_: core::alloc::Layout) -> !
{ 60 | unsafe { ::core::arch::nvptx::trap() } 61 | } 62 | }; 63 | kernel.to_string() 64 | } 65 | 66 | fn calc_hash<T: Hash>(t: &T) -> u64 { 67 | let mut s = DefaultHasher::new(); 68 | t.hash(&mut s); 69 | s.finish() 70 | } 71 | 72 | fn project_id() -> String { 73 | let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap(); 74 | let hash = calc_hash(&manifest_dir); 75 | let stem = PathBuf::from(manifest_dir) 76 | .file_stem() 77 | .unwrap() 78 | .to_str() 79 | .unwrap() 80 | .to_string(); 81 | format!("{}-{:x}", stem, hash) 82 | } 83 | 84 | pub fn compile_tokens(func: &syn::ItemFn) -> Fallible<String> { 85 | let meta = MetaData::from_token(func)?; 86 | 87 | // Create crate 88 | let dir = dirs::cache_dir() 89 | .unwrap() 90 | .join("accel-derive") 91 | .join(project_id()) 92 | .join(meta.name()); 93 | fs::create_dir_all(dir.join("src"))?; 94 | 95 | // Generate lib.rs and write into a file 96 | let mut lib_rs = fs::File::create(dir.join("src/lib.rs"))?; 97 | lib_rs.write_all(ptx_kernel(func).as_bytes())?; 98 | lib_rs.sync_data()?; 99 | 100 | // Generate Cargo.toml 101 | let mut cargo_toml = fs::File::create(dir.join("Cargo.toml"))?; 102 | cargo_toml.write_all(toml::to_string(&meta)?.as_bytes())?; 103 | cargo_toml.sync_data()?; 104 | 105 | // Build 106 | Command::new("cargo") 107 | .args(&[&format!("+{}", NIGHTLY_VERSION), "fmt"]) 108 | .current_dir(&dir) 109 | .check_run()?; 110 | Command::new("cargo") 111 | .args(&[ 112 | &format!("+{}", NIGHTLY_VERSION), 113 | "build", 114 | "--release", 115 | "--target", 116 | "nvptx64-nvidia-cuda", 117 | ]) 118 | .current_dir(&dir) 119 | .check_run()?; 120 | 121 | // Read PTX file 122 | let mut ptx = fs::File::open(dir.join(format!( 123 | "target/nvptx64-nvidia-cuda/release/{}.ptx", 124 | meta.name() 125 | )))?; 126 | let mut buf = String::new(); 127 | ptx.read_to_string(&mut buf)?; 128 | Ok(buf) 129 | } 130 | 131 | #[cfg(test)] 132 | mod tests { 133 | use super::*; 134 | 135 | #[test] 136 | fn build_do_nothing() { 137 | let func = syn::parse_str("unsafe fn do_nothing() {}").unwrap(); 138 | let ptx = compile_tokens(&func).unwrap(); 139 | assert!(ptx.len() > 0); 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /accel-derive/src/contexted.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::*; 2 | use quote::quote; 3 | use syn::*; 4 | 5 | fn seek_context_ident(input: &DeriveInput) -> Ident { 6 | match &input.data { 7 | syn::Data::Struct(syn::DataStruct { fields, .. }) => match fields { 8 | Fields::Named(fields_named) => { 9 | for field in fields_named.named.iter() { 10 | let field = field.ident.clone().unwrap(); 11 | if field.to_string() == "context" || field.to_string() == "ctx" { 12 | return field; 13 | } 14 | } 15 | } 16 | _ => unreachable!("Must be named field"), 17 | }, 18 | _ => unreachable!("Must be a struct"), 19 | }; 20 | unreachable!("context or ctx not found") 21 | } 22 | 23 | pub fn contexted(input: DeriveInput) -> TokenStream { 24 | let name = &input.ident; 25 | let generics = &input.generics; 26 | let context_ident = seek_context_ident(&input); 27 | quote! { 28 | impl #generics Contexted for #name #generics { 29 | fn sync(&self) -> Result<()> { 30 | self.#context_ident.sync() 31 | } 32 | 33 | fn version(&self) -> Result<u32> { 34 | self.#context_ident.version() 35 | } 36 | 37 | fn guard(&self) -> Result<ContextGuard> { 38 | self.#context_ident.guard() 39 | } 40 | 41 | fn get_ref(&self) -> ContextRef { 42 | self.#context_ident.get_ref() 43 | } 44 | } 45 | } 46 | } 47 |
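The derive only requires that the annotated struct has a named field called `context` or `ctx`; every generated `Contexted` method delegates to that field. An illustrative (hypothetical) use, assuming `Context` and the `Contexted` trait are in scope:

```rust
use accel_derive::Contexted;

// `sync`, `version`, `guard` and `get_ref` are all forwarded to `ctx`.
#[derive(Contexted)]
pub struct Tracked {
    ctx: Context,
    label: String,
}
```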
-------------------------------------------------------------------------------- /accel-derive/src/host.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::{Span, TokenStream}; 2 | use quote::quote; 3 | 4 | /// Split out types from function definition 5 | /// 6 | /// - Reference types, e.g. `&i32`, will be modified into the lifetimed reference `&'arg i32` 7 | /// 8 | fn input_types(func: &syn::ItemFn) -> Vec<syn::Type> { 9 | func.sig 10 | .inputs 11 | .iter() 12 | .map(|arg| match arg { 13 | syn::FnArg::Typed(ref val) => { 14 | let mut ty = *val.ty.clone(); 15 | match &mut ty { 16 | syn::Type::Reference(re) => { 17 | re.lifetime = Some(syn::Lifetime::new("'arg", Span::call_site())) 18 | } 19 | _ => {} 20 | } 21 | ty 22 | } 23 | _ => panic!("Unsupported kernel input type signature"), 24 | }) 25 | .collect() 26 | } 27 | 28 | fn accel_path() -> String { 29 | if let Ok(name) = proc_macro_crate::crate_name("accel") { 30 | // accel exists as an external crate 31 | return name; 32 | } 33 | 34 | if std::env::var("CARGO_PKG_NAME").unwrap() == "accel" { 35 | // doctest in accel 36 | // 37 | // "--crate-type bin" should be specified for doctest 38 | let mut find_flag = false; 39 | for arg in std::env::args() { 40 | if arg == "--crate-type" { 41 | find_flag = true; 42 | } 43 | if find_flag { 44 | if arg == "bin" { 45 | return "accel".into(); 46 | } 47 | } 48 | } 49 | 50 | // in accel crate 51 | return "crate".into(); 52 | } 53 | unreachable!("Cannot determine accel crate name"); 54 | } 55 | 56 | fn impl_submodule(ptx_str: &str, func: &syn::ItemFn) -> TokenStream { 57 | let input_types = input_types(func); 58 | let accel = accel_path(); 59 | 60 | let launchable: syn::Path = syn::parse_str(&format!( 61 | "{}::execution::Launchable{}", 62 | accel, 63 | input_types.len() 64 | )) 65 | .unwrap(); 66 | 67 | let targets: Vec<syn::Ident> = (1..=input_types.len()) 68 | .into_iter() 69 | .map(|k| syn::Ident::new(&format!("Target{}", k), Span::call_site())) 70 | .collect(); 71 | 72 | let ident = &func.sig.ident; 73 | 74 | let accel = syn::Ident::new(&accel, Span::call_site()); 75 | let kernel_name = quote! { #ident }.to_string(); 76 | quote! { 77 | /// Auto-generated by accel-derive 78 | mod #ident { 79 | pub const PTX_STR: &'static str = #ptx_str; 80 | 81 | pub struct Module(#accel::Module); 82 | 83 | impl Module { 84 | pub fn new(ctx: &#accel::Context) -> #accel::error::Result<Self> { 85 | Ok(Module(#accel::Module::from_str(ctx, PTX_STR)?)) 86 | } 87 | } 88 | 89 | impl<'arg> #launchable <'arg> for Module { 90 | #( 91 | type #targets = #input_types; 92 | )* 93 | fn get_kernel(&self) -> #accel::error::Result<#accel::Kernel> { 94 | Ok(self.0.get_kernel(#kernel_name)?)
95 | } 96 | } 97 | } 98 | } 99 | 100 | fn caller(func: &syn::ItemFn) -> TokenStream { 101 | let accel = accel_path(); 102 | let vis = &func.vis; 103 | let ident = &func.sig.ident; 104 | let fn_token = &func.sig.fn_token; 105 | 106 | let input_types = input_types(func); 107 | 108 | let args_types: Vec<syn::Ident> = (1..=input_types.len()) 109 | .into_iter() 110 | .map(|k| syn::Ident::new(&format!("Arg{}", k), Span::call_site())) 111 | .collect(); 112 | 113 | let launchable: syn::Path = syn::parse_str(&format!( 114 | "{}::execution::Launchable{}", 115 | accel, 116 | input_types.len() 117 | )) 118 | .unwrap(); 119 | 120 | let accel = syn::Ident::new(&accel, Span::call_site()); 121 | 122 | quote! { 123 | #vis #fn_token #ident<'arg, #(#args_types),* >( 124 | ctx: &#accel::Context, 125 | grid: impl Into<#accel::Grid>, 126 | block: impl Into<#accel::Block>, 127 | args: (#(#args_types,)*) 128 | ) -> #accel::error::Result<()> 129 | where 130 | #( 131 | #args_types: #accel::execution::DeviceSend 132 | ),* 133 | { 134 | use #launchable; 135 | let module = #ident::Module::new(ctx)?; 136 | module.launch(grid, block, args)?; 137 | Ok(()) 138 | } 139 | } 140 | } 141 | 142 | pub fn func2caller(ptx_str: &str, func: &syn::ItemFn) -> TokenStream { 143 | let impl_submodule = impl_submodule(ptx_str, func); 144 | let caller = caller(func); 145 | quote! { 146 | #impl_submodule 147 | #caller 148 | } 149 | } 150 | 151 | #[cfg(test)] 152 | mod tests { 153 | use anyhow::Result; 154 | use std::{ 155 | io::Write, 156 | process::{Command, Stdio}, 157 | }; 158 | 159 | const TEST_KERNEL: &'static str = r#" 160 | fn kernel_name(arg1: i32, arg2: f64) {} 161 | "#; 162 | 163 | /// Format TokenStream by rustfmt 164 | /// 165 | /// This can test if the input TokenStream is valid in terms of rustfmt.
167 | fn pretty_print(tt: &impl ToString) -> Result<()> { 168 | let mut fmt = Command::new("rustfmt") 169 | .stdin(Stdio::piped()) 170 | .stdout(Stdio::piped()) 171 | .spawn()?; 172 | fmt.stdin 173 | .as_mut() 174 | .unwrap() 175 | .write(tt.to_string().as_bytes())?; 176 | let out = fmt.wait_with_output()?; 177 | println!("{}", String::from_utf8_lossy(&out.stdout)); 178 | Ok(()) 179 | } 180 | 181 | #[test] 182 | fn impl_submodule() -> Result<()> { 183 | let func: syn::ItemFn = syn::parse_str(TEST_KERNEL)?; 184 | let ts = super::impl_submodule("", &func); 185 | pretty_print(&ts)?; 186 | Ok(()) 187 | } 188 | 189 | #[test] 190 | fn caller() -> Result<()> { 191 | let func: syn::ItemFn = syn::parse_str(TEST_KERNEL)?; 192 | let ts = super::caller(&func); 193 | pretty_print(&ts)?; 194 | Ok(()) 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /accel-derive/src/launchable.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::{Span, TokenStream}; 2 | use quote::quote; 3 | pub fn generate(item: TokenStream) -> TokenStream { 4 | let literal: syn::LitInt = syn::parse2(item).unwrap(); 5 | let n: usize = literal.base10_parse().unwrap(); 6 | (0..=n) 7 | .into_iter() 8 | .map(|i| { 9 | let name = syn::Ident::new(&format!("Launchable{}", i), Span::call_site()); 10 | let targets: Vec<syn::Ident> = (1..=i) 11 | .into_iter() 12 | .map(|k| syn::Ident::new(&format!("Target{}", k), Span::call_site())) 13 | .collect(); 14 | let args_value: Vec<syn::Ident> = (1..=i) 15 | .into_iter() 16 | .map(|k| syn::Ident::new(&format!("arg{}", k), Span::call_site())) 17 | .collect(); 18 | let args_types: Vec<syn::Ident> = (1..=i) 19 | .into_iter() 20 | .map(|k| syn::Ident::new(&format!("Arg{}", k), Span::call_site())) 21 | .collect(); 22 | quote! { 23 | /// Launchable Kernel with N-arguments 24 | /// 25 | /// This is auto-generated by the `accel_derive::define_launchable!` proc-macro. 26 | /// See the [module level document](index.html) for detail. 27 | pub trait #name <'arg> { 28 | #( 29 | type #targets; 30 | )* 31 | fn get_kernel(&self) -> Result<Kernel>; 32 | fn launch<#(#args_types),*>( 33 | &self, 34 | grid: impl Into<Grid>, 35 | block: impl Into<Block>, 36 | (#(#args_value,)*): (#(#args_types,)*), 37 | ) -> Result<()> 38 | where 39 | #( 40 | #args_types: DeviceSend 41 | ),* 42 | { 43 | let grid = grid.into(); 44 | let block = block.into(); 45 | let kernel = self.get_kernel()?; 46 | let mut args = [#(#args_value.as_kernel_parameter()),*]; 47 | unsafe { 48 | contexted_call!( 49 | &kernel, 50 | cuLaunchKernel, 51 | kernel.func, 52 | grid.x, 53 | grid.y, 54 | grid.z, 55 | block.x, 56 | block.y, 57 | block.z, 58 | 0, /* FIXME: no shared memory */ 59 | null_mut(), /* use default stream */ 60 | args.as_mut_ptr(), 61 | null_mut() /* no extra */ 62 | )?; 63 | } 64 | kernel.sync()?; 65 | Ok(()) 66 | } 67 | 68 | fn launch_async<#(#args_types),*>( 69 | &self, 70 | grid: impl Into<Grid>, 71 | block: impl Into<Block>, 72 | (#(#args_value,)*): (#(#args_types,)*), 73 | ) -> ::futures::future::BoxFuture<'arg, Result<()>> 74 | where 75 | #( 76 | #args_types: DeviceSend + 'arg 77 | ),* 78 | { 79 | let grid = grid.into(); 80 | let block = block.into(); 81 | let kernel = self.get_kernel().unwrap(); 82 | let stream = stream::Stream::new(kernel.get_ref()); 83 | let mut args = [#(#args_value.as_kernel_parameter()),*]; 84 | unsafe { 85 | contexted_call!( 86 | &kernel, 87 | cuLaunchKernel, 88 | kernel.func, 89 | grid.x, 90 | grid.y, 91 | grid.z, 92 | block.x, 93 | block.y, 94 | block.z, 95 | 0, /* FIXME: no shared memory */ 96 | stream.stream, 97 | args.as_mut_ptr(), 98 | null_mut() /* no extra */ 99 | ) 100 | } 101 | .expect("Asynchronous kernel launch failed"); 102 | Box::pin(stream.into_future()) 103 | } 104 | } 105 | } 106 | }) 107 | .collect() 108 | } 109 |
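`generate` is driven by a single integer literal giving the maximum kernel arity. The actual invocation lives in accel's `execution` module, where the referenced items (`Kernel`, `Grid`, `Block`, `DeviceSend`, `contexted_call!`, ...) are in scope; the arity below is illustrative:

```rust
// Expands to traits Launchable0, Launchable1, ..., Launchable8,
// each providing a blocking `launch` and a future-returning `launch_async`.
define_launchable!(8);
```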
-------------------------------------------------------------------------------- /accel-derive/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![recursion_limit = "128"] 2 | 3 | //! Get compiled PTX as `String` 4 | //! ---------------------------- 5 | //! 6 | //! The proc-macro `#[kernel]` creates a submodule `add::` in addition to a function `add`. 7 | //! Kernel Rust code is compiled into a PTX string using rustc's `nvptx64-nvidia-cuda` toolchain. 8 | //! The generated PTX string is embedded into the proc-macro output as `{kernel_name}::PTX_STR`. 9 | //! 10 | //! ``` 11 | //! use accel_derive::kernel; 12 | //! 13 | //! #[kernel] 14 | //! unsafe fn add(a: *const f64, b: *const f64, c: *mut f64, n: usize) { 15 | //! let i = accel_core::index(); 16 | //! if (i as usize) < n { 17 | //! *c.offset(i) = *a.offset(i) + *b.offset(i); 18 | //! } 19 | //! } 20 | //! 21 | //! // PTX assembler code is embedded as `add::PTX_STR` 22 | //! println!("{}", add::PTX_STR); 23 | //!
``` 24 | 25 | mod builder; 26 | mod contexted; 27 | mod host; 28 | mod launchable; 29 | mod parser; 30 | 31 | use proc_macro::TokenStream; 32 | 33 | #[proc_macro_attribute] 34 | pub fn kernel(_attr: TokenStream, func: TokenStream) -> TokenStream { 35 | let func: syn::ItemFn = syn::parse(func).expect("Not a function"); 36 | let ptx_str = builder::compile_tokens(&func).expect("Failed to compile to PTX"); 37 | host::func2caller(&ptx_str, &func).into() 38 | } 39 | 40 | #[proc_macro_derive(Contexted)] 41 | pub fn contexted(input: TokenStream) -> TokenStream { 42 | contexted::contexted(syn::parse(input).unwrap()).into() 43 | } 44 | 45 | #[proc_macro] 46 | pub fn define_launchable(item: TokenStream) -> TokenStream { 47 | launchable::generate(item.into()).into() 48 | } 49 | -------------------------------------------------------------------------------- /accel-derive/src/parser.rs: -------------------------------------------------------------------------------- 1 | use failure::*; 2 | use maplit::hashmap; 3 | use quote::ToTokens; 4 | use serde::{Deserialize, Serialize}; 5 | use std::collections::HashMap; 6 | 7 | #[derive(Debug, Serialize)] 8 | pub struct MetaData { 9 | package: HashMap<&'static str, String>, 10 | lib: HashMap<&'static str, Vec<&'static str>>, 11 | dependencies: HashMap<String, Dependency>, 12 | } 13 | 14 | impl MetaData { 15 | fn new(name: &str) -> Self { 16 | MetaData { 17 | package: hashmap! { "version" => "0.0.0".into(), "name" => name.into(), "edition" => "2018".into() }, 18 | lib: hashmap! { "crate-type" => vec![ "cdylib" ] }, 19 | dependencies: HashMap::new(), 20 | } 21 | } 22 | 23 | pub fn name(&self) -> &str { 24 | &self.package["name"] 25 | } 26 | 27 | pub fn from_token(func: &syn::ItemFn) -> Fallible<Self> { 28 | let attrs = &func.attrs; 29 | let mut kernel_attrs = MetaData::new(&func.sig.ident.to_string()); 30 | for attr in attrs { 31 | let path = attr.path.to_token_stream().to_string(); 32 | match path.as_ref() { 33 | "dependencies" => { 34 | let dep = parse_dependency( 35 | attr.tokens 36 | .to_string() 37 | .trim_start_matches('(') 38 | .trim_end_matches(')'), 39 | )?; 40 | for (key, val) in dep { 41 | kernel_attrs.dependencies.insert(key, val); 42 | } 43 | } 44 | "name" => { 45 | let token = attr.tokens.to_string(); 46 | let name = token.trim_start_matches('(').trim_end_matches(')').trim(); 47 | kernel_attrs.package.insert("name", name.into()); 48 | } 49 | _ => { 50 | continue; 51 | } 52 | } 53 | } 54 | kernel_attrs 55 | .dependencies 56 | .entry("accel-core".into()) 57 | .or_insert_with(|| Dependency::Version("0.3.0-alpha.4".into())); 58 | Ok(kernel_attrs) 59 | } 60 | } 61 | 62 | // Should I use `cargo::core::dependency::Dependency`? 63 | // https://docs.rs/cargo/0.41.0/cargo/core/dependency/struct.Dependency.html 64 | #[derive(Debug, PartialEq, Serialize, Deserialize)] 65 | #[serde(untagged, deny_unknown_fields)] 66 | enum Dependency { 67 | Version(String), 68 | VersionTable { 69 | version: String, 70 | #[serde(default)] 71 | features: Vec<String>, 72 | }, 73 | Git { 74 | git: String, 75 | branch: Option<String>, 76 | tag: Option<String>, 77 | hash: Option<String>, 78 | #[serde(default)] 79 | features: Vec<String>, 80 | }, 81 | Path { 82 | path: String, 83 | #[serde(default)] 84 | features: Vec<String>, 85 | }, 86 | } 87 | 88 | fn parse_dependency(dep: &str) -> Fallible<HashMap<String, Dependency>> { 89 | Ok(toml::from_str(&dep.replace("\n", ""))?)
90 | } 91 | 92 | #[cfg(test)] 93 | mod tests { 94 | #[test] 95 | fn parse_dependency() { 96 | let map = super::parse_dependency(r#"accel-core = "0.1.1""#).unwrap(); 97 | dbg!(map); 98 | let map = super::parse_dependency(r#"accel-core = { version = "0.1.1" }"#).unwrap(); 99 | dbg!(map); 100 | 101 | let map = super::parse_dependency( 102 | r#"accel-core = { git = "https://github.com/rust-accel/accel" }"#, 103 | ) 104 | .unwrap(); 105 | dbg!(map); 106 | 107 | let map = super::parse_dependency( 108 | r#"accel-core = { git = "https://github.com/rust-accel/accel", branch = "master" }"#, 109 | ) 110 | .unwrap(); 111 | dbg!(map); 112 | 113 | // `git` is lacked 114 | assert!(super::parse_dependency(r#"accel-core = { branch = "master" }"#,).is_err()); 115 | 116 | // Unsupported tag 117 | assert!(super::parse_dependency( 118 | r#"accel-core = { git = "https://github.com/rust-accel/accel", homhom = "master" }"#, 119 | ) 120 | .is_err()); 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /accel-derive/tests/kernels/arguments.rs: -------------------------------------------------------------------------------- 1 | //! Testing launch arguments are correctly handled 2 | 3 | use accel::*; 4 | use accel_derive::kernel; 5 | use anyhow::Result; 6 | 7 | #[kernel] 8 | pub fn launch(i: i32) { 9 | accel_core::println!("i = {}", i); 10 | } 11 | 12 | fn test() -> Result<()> { 13 | let device = Device::nth(0)?; 14 | let ctx = device.create_context(); 15 | let i = 12; 16 | let grid = Grid::x(1); 17 | let block = Block::x(4); 18 | launch(&ctx, grid, block, (i,))?; 19 | Ok(()) 20 | } 21 | 22 | // Only check `test` can be compiled. not run here 23 | fn main() {} 24 | -------------------------------------------------------------------------------- /accel-derive/tests/kernels/dependencies.rs: -------------------------------------------------------------------------------- 1 | use accel_derive::kernel; 2 | 3 | #[kernel] 4 | #[dependencies("accel-core" = "0.3.0-alpha.4")] 5 | unsafe fn version() { 6 | let _i = accel_core::index(); 7 | } 8 | 9 | #[kernel] 10 | #[dependencies("accel-core" = { version = "0.3.0-alpha.4" })] 11 | unsafe fn version_table() { 12 | let _i = accel_core::index(); 13 | } 14 | 15 | fn main() {} 16 | -------------------------------------------------------------------------------- /accel-derive/tests/kernels/dependencies_default.rs: -------------------------------------------------------------------------------- 1 | use accel_derive::kernel; 2 | 3 | #[kernel] 4 | unsafe fn dependencies_default() { 5 | let _i = accel_core::index(); // accel-core exists 6 | } 7 | 8 | fn main() {} 9 | -------------------------------------------------------------------------------- /accel-derive/tests/kernels/dependencies_git.rs: -------------------------------------------------------------------------------- 1 | use accel_derive::kernel; 2 | 3 | #[kernel] 4 | #[dependencies("accel-core" = { git = "https://gitlab.com/termoshtt/accel" })] 5 | unsafe fn git() { 6 | let _i = accel_core::index(); 7 | } 8 | 9 | #[kernel] 10 | #[dependencies("accel-core" = { git = "https://gitlab.com/termoshtt/accel", branch = "master" })] 11 | unsafe fn git_branch() { 12 | let _i = accel_core::index(); 13 | } 14 | 15 | fn main() {} 16 | -------------------------------------------------------------------------------- /accel-derive/tests/kernels/do_nothing.rs: -------------------------------------------------------------------------------- 1 | use accel_derive::kernel; 2 | 3 | // Build test 4 | 
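// (Compile-pass test driven by trybuild from tests/try_build.rs; the kernel is
// compiled for the device but never launched.)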
#[kernel] 5 | unsafe fn do_nothing() {} 6 | 7 | fn main() {} 8 | -------------------------------------------------------------------------------- /accel-derive/tests/try_build.rs: -------------------------------------------------------------------------------- 1 | #[test] 2 | fn kernel_generate() { 3 | let t = trybuild::TestCases::new(); 4 | t.pass("tests/kernels/do_nothing.rs"); 5 | t.pass("tests/kernels/dependencies.rs"); 6 | t.pass("tests/kernels/dependencies_git.rs"); 7 | t.pass("tests/kernels/dependencies_default.rs"); 8 | t.pass("tests/kernels/arguments.rs"); 9 | } 10 | -------------------------------------------------------------------------------- /accel/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "accel" 3 | version = "0.4.0-alpha.0" 4 | authors = ["Toshiki Teramura "] 5 | edition = "2018" 6 | 7 | description = "GPGPU Framework for Rust" 8 | documentation = "https://docs.rs/accel" 9 | repository = "https://gitlab.com/termoshtt/accel" 10 | keywords = ["GPGPU", "CUDA"] 11 | license = "MIT/Apache-2.0" 12 | readme = "../README.md" 13 | categories = [] 14 | 15 | [dependencies] 16 | accel-derive = { version = "0.3.0", path = "../accel-derive" } 17 | bitflags = "1.2.1" 18 | cuda-driver-sys = "0.3.0" 19 | derive-new = "0.5.8" 20 | futures = "0.3.5" 21 | log = "0.4.8" 22 | num-derive = "0.3.0" 23 | num-traits = "0.2.11" 24 | paste = "0.1.15" 25 | thiserror = "1.0.19" 26 | tokio = { version = "0.2.21", features = ["blocking"] } 27 | 28 | [dev-dependencies] 29 | criterion = "0.3.2" 30 | tokio = { version = "0.2.21", features = ["full"] } 31 | trybuild = "1.0.27" 32 | 33 | [[bench]] 34 | name = "memcpy" 35 | harness = false 36 | -------------------------------------------------------------------------------- /accel/benches/memcpy.rs: -------------------------------------------------------------------------------- 1 | use accel::*; 2 | use criterion::*; 3 | 4 | fn h2d(c: &mut Criterion) { 5 | let device = Device::nth(0).unwrap(); 6 | let context = device.create_context(); 7 | let mut group = c.benchmark_group("h2d"); 8 | 9 | macro_rules! impl_HtoD { 10 | ($host:expr, $id:expr) => { 11 | let host = $host; 12 | let n = host.len(); 13 | let mut dev = DeviceMemory::zeros(&context, n); 14 | group.bench_with_input( 15 | BenchmarkId::new(&format!("direct_{}", $id), n), 16 | &n, 17 | |b, _| { 18 | b.iter(|| { 19 | for i in 0..n { 20 | dev[i] = host[i]; 21 | } 22 | }) 23 | }, 24 | ); 25 | group.bench_with_input( 26 | BenchmarkId::new(format!("memcpy_{}", $id), n), 27 | &n, 28 | |b, _| { 29 | b.iter(|| { 30 | dev.copy_from(&host); 31 | }) 32 | }, 33 | ); 34 | }; 35 | } 36 | 37 | for &n in &[1000, 10_000, 100_000] { 38 | // impl_HtoD!(vec![0_u32; n], "vec"); 39 | impl_HtoD!(PageLockedMemory::::zeros(&context, n), "page_locked"); 40 | let mut vec_tmp = vec![0_u32; n]; 41 | impl_HtoD!(RegisteredMemory::new(&context, &mut vec_tmp), "registered"); 42 | } 43 | } 44 | 45 | fn d2h(c: &mut Criterion) { 46 | let device = Device::nth(0).unwrap(); 47 | let context = device.create_context(); 48 | let mut group = c.benchmark_group("d2h"); 49 | 50 | macro_rules! 
impl_DtoH {
51 |         ($host:expr, $id:expr) => {
52 |             let mut host = $host;
53 |             let n = host.len();
54 |             let dev = DeviceMemory::zeros(&context, n);
55 |             group.bench_with_input(
56 |                 BenchmarkId::new(&format!("direct_{}", $id), n),
57 |                 &n,
58 |                 |b, _| {
59 |                     b.iter(|| {
60 |                         for i in 0..n {
61 |                             host[i] = dev[i];
62 |                         }
63 |                     })
64 |                 },
65 |             );
66 |             group.bench_with_input(
67 |                 BenchmarkId::new(format!("memcpy_{}", $id), n),
68 |                 &n,
69 |                 |b, _| {
70 |                     b.iter(|| {
71 |                         host.copy_from(&dev);
72 |                     })
73 |                 },
74 |             );
75 |         };
76 |     }
77 | 
78 |     for &n in &[1000, 10_000, 100_000] {
79 |         impl_DtoH!(vec![0_u32; n], "vec");
80 |         impl_DtoH!(PageLockedMemory::<u32>::zeros(&context, n), "page_locked");
81 |         let mut vec_tmp = vec![0_u32; n];
82 |         impl_DtoH!(RegisteredMemory::new(&context, &mut vec_tmp), "registered");
83 |     }
84 | }
85 | 
86 | criterion_group!(benches, h2d, d2h);
87 | criterion_main!(benches);
88 | 
--------------------------------------------------------------------------------
/accel/examples/add.rs:
--------------------------------------------------------------------------------
1 | use accel::*;
2 | 
3 | #[kernel]
4 | unsafe fn add(a: *const f32, b: *const f32, c: *mut f32, n: usize) {
5 |     let i = accel_core::index();
6 |     if (i as usize) < n {
7 |         *c.offset(i) = *a.offset(i) + *b.offset(i);
8 |     }
9 | }
10 | 
11 | fn main() -> error::Result<()> {
12 |     let device = Device::nth(0)?;
13 |     let ctx = device.create_context();
14 | 
15 |     let _pf = Profiler::start(&ctx);
16 | 
17 |     // Allocate memory on the GPU
18 |     let n = 1024;
19 |     let mut a = DeviceMemory::<f32>::zeros(&ctx, n);
20 |     let mut b = DeviceMemory::<f32>::zeros(&ctx, n);
21 |     let mut c = DeviceMemory::<f32>::zeros(&ctx, n);
22 | 
23 |     // Accessible from the CPU as a usual Rust slice (though this will be slow)
24 |     for i in 0..n {
25 |         a[i] = i as f32;
26 |         b[i] = 2.0 * i as f32;
27 |     }
28 | 
29 |     // Launch kernel synchronously
30 |     add(
31 |         &ctx,
32 |         1, /* grid */
33 |         n, /* block */
34 |         (a.as_ptr(), b.as_ptr(), c.as_mut_ptr(), n),
35 |     )
36 |     .expect("Kernel call failed");
37 | 
38 |     Ok(())
39 | }
40 | 
--------------------------------------------------------------------------------
/accel/src/block.rs:
--------------------------------------------------------------------------------
1 | use num_traits::ToPrimitive;
2 | 
3 | /// Size of Block (thread block) in [CUDA thread hierarchy]( http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programming-model )
4 | ///
5 | /// Every input integer or float is converted into `u32` using [ToPrimitive].
6 | /// If the conversion is impossible, e.g. for negative or too large integers, the conversion panics.
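/// For example, a negative input has no `u32` representation, so the
/// constructor panics (a doctest sketch of this failure mode):
///
/// ```should_panic
/// # use accel::*;
/// let _ = Block::x(-1); // ToPrimitive returns None, conversion panics
/// ```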
7 | /// 8 | /// [ToPrimitive]: https://docs.rs/num-traits/0.2.11/num_traits/cast/trait.ToPrimitive.html 9 | /// 10 | /// Examples 11 | /// -------- 12 | /// 13 | /// - Explicit creation 14 | /// 15 | /// ``` 16 | /// # use accel::*; 17 | /// let block1d = Block::x(64); 18 | /// assert_eq!(block1d.x, 64); 19 | /// 20 | /// let block2d = Block::xy(64, 128); 21 | /// assert_eq!(block2d.x, 64); 22 | /// assert_eq!(block2d.y, 128); 23 | /// 24 | /// let block3d = Block::xyz(64, 128, 256); 25 | /// assert_eq!(block3d.x, 64); 26 | /// assert_eq!(block3d.y, 128); 27 | /// assert_eq!(block3d.z, 256); 28 | /// ``` 29 | /// 30 | /// - From single integer (unsigned and signed) 31 | /// 32 | /// ``` 33 | /// # use accel::*; 34 | /// let block1d: Block = 64_usize.into(); 35 | /// assert_eq!(block1d.x, 64); 36 | /// 37 | /// let block1d: Block = 64_i32.into(); 38 | /// assert_eq!(block1d.x, 64); 39 | /// ``` 40 | /// 41 | /// - From tuple 42 | /// 43 | /// ``` 44 | /// # use accel::*; 45 | /// let block1d: Block = (64,).into(); 46 | /// assert_eq!(block1d.x, 64); 47 | /// 48 | /// let block2d: Block = (64, 128).into(); 49 | /// assert_eq!(block2d.x, 64); 50 | /// assert_eq!(block2d.y, 128); 51 | /// 52 | /// let block3d: Block = (64, 128, 256).into(); 53 | /// assert_eq!(block3d.x, 64); 54 | /// assert_eq!(block3d.y, 128); 55 | /// assert_eq!(block3d.z, 256); 56 | /// ``` 57 | #[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] 58 | pub struct Block { 59 | pub x: u32, 60 | pub y: u32, 61 | pub z: u32, 62 | } 63 | 64 | impl Block { 65 | /// 1D Block 66 | /// 67 | /// Panic 68 | /// ----- 69 | /// - If input values cannot convert to u32 70 | pub fn x(x: I) -> Self { 71 | Block { 72 | x: x.to_u32().expect("Cannot convert to u32"), 73 | y: 1, 74 | z: 1, 75 | } 76 | } 77 | 78 | /// 2D Block 79 | /// 80 | /// Panic 81 | /// ----- 82 | /// - If input values cannot convert to u32 83 | pub fn xy(x: I1, y: I2) -> Self { 84 | Block { 85 | x: x.to_u32().expect("Cannot convert to u32"), 86 | y: y.to_u32().expect("Cannot convert to u32"), 87 | z: 1, 88 | } 89 | } 90 | 91 | /// 3D Block 92 | /// 93 | /// Panic 94 | /// ----- 95 | /// - If input values cannot convert to u32 96 | pub fn xyz(x: I1, y: I2, z: I3) -> Self { 97 | Block { 98 | x: x.to_u32().expect("Cannot convert to u32"), 99 | y: y.to_u32().expect("Cannot convert to u32"), 100 | z: z.to_u32().expect("Cannot convert to u32"), 101 | } 102 | } 103 | } 104 | 105 | impl Into for (I,) { 106 | fn into(self) -> Block { 107 | Block::x(self.0) 108 | } 109 | } 110 | 111 | impl Into for (I1, I2) { 112 | fn into(self) -> Block { 113 | Block::xy(self.0, self.1) 114 | } 115 | } 116 | 117 | impl Into for (I1, I2, I3) { 118 | fn into(self) -> Block { 119 | Block::xyz(self.0, self.1, self.2) 120 | } 121 | } 122 | 123 | macro_rules! impl_into_block { 124 | ($integer:ty) => { 125 | impl Into for $integer { 126 | fn into(self) -> Block { 127 | Block::x(self) 128 | } 129 | } 130 | }; 131 | } 132 | 133 | impl_into_block!(u8); 134 | impl_into_block!(u16); 135 | impl_into_block!(u32); 136 | impl_into_block!(u64); 137 | impl_into_block!(u128); 138 | impl_into_block!(usize); 139 | impl_into_block!(i8); 140 | impl_into_block!(i16); 141 | impl_into_block!(i32); 142 | impl_into_block!(i64); 143 | impl_into_block!(i128); 144 | impl_into_block!(isize); 145 | -------------------------------------------------------------------------------- /accel/src/device.rs: -------------------------------------------------------------------------------- 1 | //! CUDA [Device] and [Context] 2 | //! 
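//! A minimal session (sketch): pick a device, then create a context on it.
//!
//! ```
//! # use accel::*;
//! let device = Device::nth(0).unwrap();
//! let _ctx = device.create_context();
//! ```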
3 | //! [Device]: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html
4 | //! [Context]: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html
5 | 
6 | use crate::{error::*, *};
7 | use cuda::*;
8 | use std::sync::{Arc, Once};
9 | 
10 | pub use accel_derive::Contexted;
11 | 
12 | /// Handler for device and its primary context
13 | #[derive(Debug, PartialEq, PartialOrd)]
14 | pub struct Device {
15 |     device: CUdevice,
16 | }
17 | 
18 | impl Device {
19 |     /// Initializer for CUDA Driver API
20 |     fn init() {
21 |         static DRIVER_API_INIT: Once = Once::new();
22 |         DRIVER_API_INIT.call_once(|| unsafe {
23 |             ffi_call!(cuda::cuInit, 0).expect("Initialization of CUDA Driver API failed");
24 |         });
25 |     }
26 | 
27 |     /// Get number of available GPUs
28 |     pub fn get_count() -> Result<usize> {
29 |         Self::init();
30 |         let mut count: i32 = 0;
31 |         unsafe {
32 |             ffi_call!(cuDeviceGetCount, &mut count as *mut i32)?;
33 |         }
34 |         Ok(count as usize)
35 |     }
36 | 
37 |     pub fn nth(id: usize) -> Result<Self> {
38 |         let count = Self::get_count()?;
39 |         if id >= count {
40 |             return Err(AccelError::DeviceNotFound { id, count });
41 |         }
42 |         let device = unsafe { ffi_new!(cuDeviceGet, id as i32)? };
43 |         Ok(Device { device })
44 |     }
45 | 
46 |     /// Get total memory of GPU
47 |     pub fn total_memory(&self) -> Result<usize> {
48 |         let mut mem = 0;
49 |         unsafe {
50 |             ffi_call!(cuDeviceTotalMem_v2, &mut mem as *mut _, self.device)?;
51 |         }
52 |         Ok(mem)
53 |     }
54 | 
55 |     /// Get name of GPU
56 |     pub fn get_name(&self) -> Result<String> {
57 |         let mut bytes: Vec<u8> = vec![0_u8; 1024];
58 |         unsafe {
59 |             ffi_call!(
60 |                 cuDeviceGetName,
61 |                 bytes.as_mut_ptr() as *mut i8,
62 |                 1024,
63 |                 self.device
64 |             )?;
65 |         }
66 |         Ok(String::from_utf8(bytes).expect("GPU name is not UTF8"))
67 |     }
68 | 
69 |     /// Create a new CUDA context on this device.
70 |     ///
71 |     /// ```
72 |     /// # use accel::*;
73 |     /// let device = Device::nth(0).unwrap();
74 |     /// let ctx = device.create_context();
75 |     /// ```
76 |     pub fn create_context(&self) -> Context {
77 |         let ptr = unsafe {
78 |             ffi_new!(
79 |                 cuCtxCreate_v2,
80 |                 CUctx_flags_enum::CU_CTX_SCHED_AUTO as u32,
81 |                 self.device
82 |             )
83 |         }
84 |         .expect("Failed to create a new context");
85 |         if ptr.is_null() {
86 |             panic!("Cannot create a new context");
87 |         }
88 |         let ptr_new = ctx_pop().unwrap();
89 |         assert_eq!(ptr, ptr_new);
90 |         Arc::new(ContextOwned { ptr })
91 |     }
92 | }
93 | 
94 | /// Push to the context stack of this thread
95 | fn ctx_push(ptr: CUcontext) -> Result<()> {
96 |     unsafe { ffi_call!(cuCtxPushCurrent_v2, ptr) }?;
97 |     Ok(())
98 | }
99 | 
100 | /// Pop from the context stack of this thread
101 | fn ctx_pop() -> Result<CUcontext> {
102 |     let ptr = unsafe { ffi_new!(cuCtxPopCurrent_v2) }?;
103 |     if ptr.is_null() {
104 |         panic!("No current context");
105 |     }
106 |     Ok(ptr)
107 | }
108 | 
109 | /// Get API version
110 | fn ctx_version(ptr: CUcontext) -> Result<u32> {
111 |     let mut version: u32 = 0;
112 |     unsafe { ffi_call!(cuCtxGetApiVersion, ptr, &mut version as *mut _) }?;
113 |     Ok(version)
114 | }
115 | 
116 | /// Block until all tasks in this context are complete.
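/// (Pushes `ptr` onto this thread's context stack, calls `cuCtxSynchronize`,
/// then pops and checks that the same context comes back.)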
117 | fn ctx_sync(ptr: CUcontext) -> Result<()> {
118 |     ctx_push(ptr)?;
119 |     unsafe { ffi_call!(cuCtxSynchronize) }?;
120 |     let ptr_new = ctx_pop()?;
121 |     assert_eq!(ptr, ptr_new);
122 |     Ok(())
123 | }
124 | 
125 | /// Object with CUDA context
126 | pub trait Contexted {
127 |     fn guard(&self) -> Result<ContextGuard>;
128 |     fn sync(&self) -> Result<()>;
129 |     fn version(&self) -> Result<u32>;
130 |     /// Get a reference
131 |     ///
132 |     /// This is **NOT** a Rust reference, i.e. you can drop the owned context while the reference exists.
133 |     /// The reference expires once the owned context is released, and using it afterwards causes a runtime error.
134 |     ///
135 |     fn get_ref(&self) -> ContextRef;
136 | }
137 | 
138 | /// Owned handler for CUDA context
139 | #[derive(Debug, PartialEq)]
140 | pub struct ContextOwned {
141 |     ptr: CUcontext,
142 | }
143 | 
144 | pub type Context = Arc<ContextOwned>;
145 | 
146 | impl Drop for ContextOwned {
147 |     fn drop(&mut self) {
148 |         if let Err(e) = unsafe { ffi_call!(cuCtxDestroy_v2, self.ptr) } {
149 |             log::error!("Context removal failed: {:?}", e);
150 |         }
151 |     }
152 | }
153 | 
154 | unsafe impl Send for ContextOwned {}
155 | unsafe impl Sync for ContextOwned {}
156 | 
157 | impl Contexted for Context {
158 |     fn sync(&self) -> Result<()> {
159 |         ctx_sync(self.ptr)
160 |     }
161 | 
162 |     fn version(&self) -> Result<u32> {
163 |         ctx_version(self.ptr)
164 |     }
165 | 
166 |     fn guard(&self) -> Result<ContextGuard> {
167 |         ctx_push(self.ptr)?;
168 |         Ok(ContextGuard { ptr: self.ptr })
169 |     }
170 | 
171 |     fn get_ref(&self) -> ContextRef {
172 |         ContextRef { ptr: self.ptr }
173 |     }
174 | }
175 | 
176 | /// Non-owned handler for CUDA context
177 | ///
178 | /// The validity of the reference is checked dynamically.
179 | /// CUDA APIs (e.g. [cuPointerGetAttribute]) allow us to get a pointer to a CUDA context,
180 | /// but its validity cannot be assured by the Rust lifetime system.
181 | ///
182 | /// [cuPointerGetAttribute]: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__UNIFIED.html#group__CUDA__UNIFIED_1g0c28ed0aff848042bc0533110e45820c
183 | ///
184 | #[derive(Debug, PartialEq, Clone, Copy)]
185 | pub struct ContextRef {
186 |     ptr: CUcontext,
187 | }
188 | 
189 | impl ContextRef {
190 |     pub(crate) fn from_ptr(ptr: CUcontext) -> Self {
191 |         assert!(!ptr.is_null());
192 |         ContextRef { ptr }
193 |     }
194 | }
195 | 
196 | unsafe impl Send for ContextRef {}
197 | unsafe impl Sync for ContextRef {}
198 | 
199 | impl Contexted for ContextRef {
200 |     fn sync(&self) -> Result<()> {
201 |         ctx_sync(self.ptr)
202 |     }
203 | 
204 |     fn version(&self) -> Result<u32> {
205 |         ctx_version(self.ptr)
206 |     }
207 | 
208 |     fn guard(&self) -> Result<ContextGuard> {
209 |         ctx_push(self.ptr)?;
210 |         Ok(ContextGuard { ptr: self.ptr })
211 |     }
212 | 
213 |     fn get_ref(&self) -> ContextRef {
214 |         self.clone()
215 |     }
216 | }
217 | 
218 | impl std::cmp::PartialEq<ContextRef> for ContextOwned {
219 |     fn eq(&self, ctx: &ContextRef) -> bool {
220 |         self.ptr == ctx.ptr
221 |     }
222 | }
223 | 
224 | impl std::cmp::PartialEq<ContextOwned> for ContextRef {
225 |     fn eq(&self, ctx: &ContextOwned) -> bool {
226 |         self.ptr == ctx.ptr
227 |     }
228 | }
229 | 
230 | /// RAII handler for using CUDA context
231 | ///
232 | /// As described in the [CUDA Programming Guide], a library using CUDA should push the context before using
233 | /// it, and then pop it.
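/// A typical use through `Contexted::guard` (sketch):
///
/// ```ignore
/// let _guard = ctx.guard()?; // pushes the context onto this thread's stack
/// // ... CUDA calls that need `ctx` to be current ...
/// // `_guard` drops here and pops the context again
/// ```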
234 | ///
235 | /// [CUDA Programming Guide]: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#context
236 | pub struct ContextGuard {
237 |     ptr: CUcontext,
238 | }
239 | 
240 | impl Drop for ContextGuard {
241 |     fn drop(&mut self) {
242 |         match ctx_pop() {
243 |             Ok(ptr) => {
244 |                 if ptr != self.ptr {
245 |                     log::error!("Popped context is different from the pushed one: {:?}", ptr);
246 |                 }
247 |             }
248 |             Err(e) => {
249 |                 log::error!("Failed to pop context: {}", e);
250 |             }
251 |         }
252 |     }
253 | }
254 | 
255 | #[cfg(test)]
256 | mod tests {
257 |     use super::*;
258 | 
259 |     #[test]
260 |     fn get_count() -> Result<()> {
261 |         Device::get_count()?;
262 |         Ok(())
263 |     }
264 | 
265 |     #[test]
266 |     fn get_zeroth() -> Result<()> {
267 |         Device::nth(0)?;
268 |         Ok(())
269 |     }
270 | 
271 |     #[test]
272 |     fn out_of_range() -> Result<()> {
273 |         assert!(Device::nth(129).is_err());
274 |         Ok(())
275 |     }
276 | 
277 |     #[test]
278 |     fn create() -> Result<()> {
279 |         let device = Device::nth(0)?;
280 |         let ctx = device.create_context();
281 |         dbg!(&ctx);
282 |         Ok(())
283 |     }
284 | 
285 |     #[should_panic]
286 |     #[test]
287 |     fn expired_context_ref() {
288 |         let device = Device::nth(0).unwrap();
289 |         let ctx = device.create_context();
290 |         let ctx_ref = ctx.get_ref();
291 |         drop(ctx);
292 |         let _version = ctx_ref.version().unwrap(); // ctx has expired
293 |     }
294 | 
295 |     #[should_panic]
296 |     #[test]
297 |     fn expired_contexted_call() {
298 |         let device = Device::nth(0).unwrap();
299 |         let ctx = device.create_context();
300 |         let ctx_ref = ctx.get_ref();
301 |         drop(ctx);
302 |         unsafe { contexted_call!(&ctx_ref, cuCtxSynchronize) }.unwrap();
303 |     }
304 | }
305 | 
--------------------------------------------------------------------------------
/accel/src/error.rs:
--------------------------------------------------------------------------------
1 | use cuda::cudaError_enum as DeviceError;
2 | use std::path::PathBuf;
3 | 
4 | pub type Result<T> = ::std::result::Result<T, AccelError>;
5 | 
6 | #[derive(thiserror::Error, Debug)]
7 | pub enum AccelError {
8 |     /// Raw errors originating from CUDA Device APIs
9 |     #[error("CUDA Device API Error: {api_name}, {error:?}")]
10 |     CUDAError {
11 |         api_name: String,
12 |         error: DeviceError,
13 |     },
14 | 
15 |     // Potentially not an error, but it is a bug if accel fails to capture it
16 |     #[error("Async operations issued previously have not completed yet")]
17 |     AsyncOperationNotReady,
18 | 
19 |     /// Error for user device code assertion
20 |     #[error("Assertion in device code has failed")]
21 |     DeviceAssertionFailed,
22 | 
23 |     #[error("No device found for given ID")]
24 |     DeviceNotFound { id: usize, count: usize },
25 | 
26 |     #[error("File not found: {path:?}")]
27 |     FileNotFound { path: PathBuf },
28 | 
29 |     #[error(transparent)]
30 |     AsyncTaskFailed(#[from] tokio::task::JoinError),
31 | }
32 | 
33 | /// Convert return code of CUDA Driver/Runtime API into Result
34 | pub(crate) fn check(error: DeviceError, api_name: &str) -> Result<()> {
35 |     match error {
36 |         DeviceError::CUDA_SUCCESS => Ok(()),
37 |         DeviceError::CUDA_ERROR_ASSERT => Err(AccelError::DeviceAssertionFailed),
38 |         DeviceError::CUDA_ERROR_NOT_READY => Err(AccelError::AsyncOperationNotReady),
39 |         _ => Err(AccelError::CUDAError {
40 |             api_name: api_name.into(),
41 |             error,
42 |         }),
43 |     }
44 | }
45 | 
46 | #[macro_export]
47 | macro_rules! ffi_call {
48 |     ($ffi:path $(,$args:expr)*) => {
49 |         {
50 |             $crate::error::check($ffi($($args),*), stringify!($ffi))
51 |         }
52 |     };
53 | }
54 | 
55 | #[macro_export]
56 | macro_rules! 
ffi_new { 57 | ($ffi:path $(,$args:expr)*) => { 58 | { 59 | let mut value = ::std::mem::MaybeUninit::uninit(); 60 | $crate::error::check($ffi(value.as_mut_ptr(), $($args),*), stringify!($ffi)).map(|_| value.assume_init()) 61 | } 62 | }; 63 | } 64 | 65 | #[macro_export] 66 | macro_rules! contexted_call { 67 | ($ctx:expr, $ffi:path $(,$args:expr)*) => { 68 | $crate::Contexted::guard($ctx).and_then(|_g| { $crate::ffi_call!($ffi $(,$args)*) }) 69 | }; 70 | } 71 | 72 | #[macro_export] 73 | macro_rules! contexted_new { 74 | ($ctx:expr, $ffi:path $(,$args:expr)*) => { 75 | $crate::Contexted::guard($ctx).and_then(|_g| { $crate::ffi_new!($ffi $(,$args)*) }) 76 | }; 77 | } 78 | -------------------------------------------------------------------------------- /accel/src/execution.rs: -------------------------------------------------------------------------------- 1 | //! Traits for CUDA Kernel launching 2 | //! 3 | //! Launchable traits 4 | //! ----------------- 5 | //! 6 | //! Launchable traits, i.e. `Launchable0`, `Launchable1`, ..., implement `launch` function which launches a kernel on device. 7 | //! 8 | //! ``` 9 | //! use accel::{*, error::Result}; 10 | //! 11 | //! // Trait for 2-arg kernel 12 | //! pub trait Launchable2 { 13 | //! // Type of arg1 on device 14 | //! type Target1; 15 | //! // Type of arg2 on device 16 | //! type Target2; 17 | //! 18 | //! // Launch kernel code on device 19 | //! fn launch< 20 | //! Arg1 /* Type of arg1 on host */, 21 | //! Arg2 /* Type of arg2 on host */ 22 | //! >( 23 | //! &self, 24 | //! grid: impl Into, 25 | //! block: impl Into, 26 | //! (arg1, arg2): (Arg1, Arg2) 27 | //! ) -> Result<()> 28 | //! where 29 | //! // Types on host and on device are bundled by DeviceSend trait 30 | //! Arg1: DeviceSend, 31 | //! Arg2: DeviceSend, 32 | //! { 33 | //! // default impl which uses crate-internal features 34 | //! todo!() // skip for document 35 | //! } 36 | //! 37 | //! // Specify entry point (see following example) 38 | //! fn get_kernel(&self) -> Result; 39 | //! } 40 | //! ``` 41 | //! 42 | //! These traits are generated by `accel_derive::define_launchable!` proc-macro. 43 | //! Launchable traits are specialized for N-args functions because it uses a tuple `(Arg1, Arg2, ..., ArgN)` 44 | //! for `launch` argument. 45 | //! [DeviceSend] trait specify how the host value is sent to device. 46 | //! 47 | //! One of Launchable traits will be implemented automatically by [accel::kernel] for an auto-generated [Module] struct: 48 | //! 49 | //! ``` 50 | //! #[accel::kernel] 51 | //! fn f(a: i32) {} 52 | //! ``` 53 | //! 54 | //! This simple definition will create a submodule `f` (same name of the function): 55 | //! 56 | //! ``` 57 | //! mod f { // same name sub-module 58 | //! 59 | //! pub const PTX_STR: &str = "{{ PTX string generated by rustc/nvptx64-nvidia-cuda }}"; 60 | //! 61 | //! // wrapper for implement one of Launchable traits 62 | //! pub struct Module(::accel::Module); 63 | //! 64 | //! // impl Launchable1 because number of arugment is 1 65 | //! impl ::accel::execution::Launchable1<'_> for Module { 66 | //! type Target1 = i32; // first argument of `f` 67 | //! 68 | //! // How to get kernel PTX code 69 | //! fn get_kernel(&self) -> ::accel::error::Result<::accel::Kernel> { 70 | //! self.0.get_kernel("f") 71 | //! } 72 | //! } 73 | //! } 74 | //! ``` 75 | //! 76 | //! For a function which takes N arguments, `Launchable{N}` will be implemented for corresponding module. 77 | //! Be sure that this sub-module will be generated where the `f` is defined. 78 | //! 
`get_kernel` and the default implementation of `launch` are separated to keep unsafe code inside this crate.
79 | //!
80 | //! [DeviceSend]: trait.DeviceSend.html
81 | //! [accel::kernel]: ../attr.kernel.html
82 | //! [Module]: ../module/struct.Module.html
83 | 
84 | use crate::{contexted_call, device::*, error::*, *};
85 | use cuda::*;
86 | use std::{ffi::*, ptr::null_mut};
87 | 
88 | /// Type which can be sent to the device
89 | pub trait DeviceSend {
90 |     /// Type on device
91 |     type Target;
92 |     fn as_kernel_parameter(&self) -> *mut c_void {
93 |         self as *const Self as *mut c_void
94 |     }
95 | }
96 | 
97 | impl<T> DeviceSend for *mut T {
98 |     type Target = Self;
99 | }
100 | 
101 | impl<T> DeviceSend for *const T {
102 |     type Target = Self;
103 | }
104 | 
105 | impl<'arg, T: Sized> DeviceSend for &'arg [T] {
106 |     type Target = *const T;
107 | }
108 | 
109 | impl<'arg, T: Sized> DeviceSend for &'arg mut [T] {
110 |     type Target = *mut T;
111 | }
112 | 
113 | macro_rules! impl_device_send {
114 |     ($pri:ty) => {
115 |         impl DeviceSend for $pri {
116 |             type Target = Self;
117 |         }
118 | 
119 |         impl<'arg> DeviceSend for &'arg $pri {
120 |             type Target = Self;
121 |         }
122 | 
123 |         impl<'arg> DeviceSend for &'arg mut $pri {
124 |             type Target = Self;
125 |         }
126 |     };
127 | }
128 | 
129 | impl_device_send!(bool);
130 | impl_device_send!(i8);
131 | impl_device_send!(i16);
132 | impl_device_send!(i32);
133 | impl_device_send!(i64);
134 | impl_device_send!(i128);
135 | impl_device_send!(isize);
136 | impl_device_send!(u8);
137 | impl_device_send!(u16);
138 | impl_device_send!(u32);
139 | impl_device_send!(u64);
140 | impl_device_send!(u128);
141 | impl_device_send!(usize);
142 | impl_device_send!(f32);
143 | impl_device_send!(f64);
144 | 
145 | accel_derive::define_launchable!(12 /* 0..=12 */);
146 | 
--------------------------------------------------------------------------------
/accel/src/grid.rs:
--------------------------------------------------------------------------------
1 | use num_traits::ToPrimitive;
2 | 
3 | /// Size of Grid (grid of blocks) in [CUDA thread hierarchy]( http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programming-model )
4 | ///
5 | /// Every input integer or float is converted into `u32` using [ToPrimitive].
6 | /// If the conversion is impossible, e.g. for negative or too large integers, the conversion panics.
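/// For example, a negative input has no `u32` representation, so the
/// constructor panics (a doctest sketch of this failure mode):
///
/// ```should_panic
/// # use accel::*;
/// let _ = Grid::x(-1); // ToPrimitive returns None, conversion panics
/// ```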
7 | /// 8 | /// [ToPrimitive]: https://docs.rs/num-traits/0.2.11/num_traits/cast/trait.ToPrimitive.html 9 | /// 10 | /// Examples 11 | /// -------- 12 | /// 13 | /// - Explicit creation 14 | /// 15 | /// ``` 16 | /// # use accel::*; 17 | /// let grid1d = Grid::x(64); 18 | /// assert_eq!(grid1d.x, 64); 19 | /// 20 | /// let grid2d = Grid::xy(64, 128); 21 | /// assert_eq!(grid2d.x, 64); 22 | /// assert_eq!(grid2d.y, 128); 23 | /// 24 | /// let grid3d = Grid::xyz(64, 128, 256); 25 | /// assert_eq!(grid3d.x, 64); 26 | /// assert_eq!(grid3d.y, 128); 27 | /// assert_eq!(grid3d.z, 256); 28 | /// ``` 29 | /// 30 | /// - From single integer (unsigned and signed) 31 | /// 32 | /// ``` 33 | /// # use accel::*; 34 | /// let grid1d: Grid = 64_usize.into(); 35 | /// assert_eq!(grid1d.x, 64); 36 | /// 37 | /// let grid1d: Grid = 64_i32.into(); 38 | /// assert_eq!(grid1d.x, 64); 39 | /// ``` 40 | /// 41 | /// - From tuple 42 | /// 43 | /// ``` 44 | /// # use accel::*; 45 | /// let grid1d: Grid = (64,).into(); 46 | /// assert_eq!(grid1d.x, 64); 47 | /// 48 | /// let grid2d: Grid = (64, 128).into(); 49 | /// assert_eq!(grid2d.x, 64); 50 | /// assert_eq!(grid2d.y, 128); 51 | /// 52 | /// let grid3d: Grid = (64, 128, 256).into(); 53 | /// assert_eq!(grid3d.x, 64); 54 | /// assert_eq!(grid3d.y, 128); 55 | /// assert_eq!(grid3d.z, 256); 56 | /// ``` 57 | #[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] 58 | pub struct Grid { 59 | pub x: u32, 60 | pub y: u32, 61 | pub z: u32, 62 | } 63 | 64 | impl Grid { 65 | /// 1D Grid 66 | /// 67 | /// Panic 68 | /// ----- 69 | /// - If input values cannot convert to u32 70 | pub fn x(x: I) -> Self { 71 | Grid { 72 | x: x.to_u32().expect("Cannot convert to u32"), 73 | y: 1, 74 | z: 1, 75 | } 76 | } 77 | 78 | /// 2D Grid 79 | /// 80 | /// Panic 81 | /// ----- 82 | /// - If input values cannot convert to u32 83 | pub fn xy(x: I1, y: I2) -> Self { 84 | Grid { 85 | x: x.to_u32().expect("Cannot convert to u32"), 86 | y: y.to_u32().expect("Cannot convert to u32"), 87 | z: 1, 88 | } 89 | } 90 | 91 | /// 3D Grid 92 | /// 93 | /// Panic 94 | /// ----- 95 | /// - If input values cannot convert to u32 96 | pub fn xyz(x: I1, y: I2, z: I3) -> Self { 97 | Grid { 98 | x: x.to_u32().expect("Cannot convert to u32"), 99 | y: y.to_u32().expect("Cannot convert to u32"), 100 | z: z.to_u32().expect("Cannot convert to u32"), 101 | } 102 | } 103 | } 104 | 105 | impl Into for (I,) { 106 | fn into(self) -> Grid { 107 | Grid::x(self.0) 108 | } 109 | } 110 | 111 | impl Into for (I1, I2) { 112 | fn into(self) -> Grid { 113 | Grid::xy(self.0, self.1) 114 | } 115 | } 116 | 117 | impl Into for (I1, I2, I3) { 118 | fn into(self) -> Grid { 119 | Grid::xyz(self.0, self.1, self.2) 120 | } 121 | } 122 | 123 | macro_rules! 
impl_into_grid { 124 | ($integer:ty) => { 125 | impl Into for $integer { 126 | fn into(self) -> Grid { 127 | Grid::x(self) 128 | } 129 | } 130 | }; 131 | } 132 | 133 | impl_into_grid!(u8); 134 | impl_into_grid!(u16); 135 | impl_into_grid!(u32); 136 | impl_into_grid!(u64); 137 | impl_into_grid!(u128); 138 | impl_into_grid!(usize); 139 | impl_into_grid!(i8); 140 | impl_into_grid!(i16); 141 | impl_into_grid!(i32); 142 | impl_into_grid!(i64); 143 | impl_into_grid!(i128); 144 | impl_into_grid!(isize); 145 | -------------------------------------------------------------------------------- /accel/src/instruction.rs: -------------------------------------------------------------------------------- 1 | use crate::{error::*, *}; 2 | use cuda::*; 3 | use std::{ffi::*, path::*}; 4 | 5 | /// Represent the resource of CUDA middle-IR (PTX/cubin) 6 | #[derive(Debug)] 7 | pub enum Instruction { 8 | PTX(CString), 9 | PTXFile(PathBuf), 10 | Cubin(Vec), 11 | CubinFile(PathBuf), 12 | } 13 | 14 | impl Instruction { 15 | /// Constructor for `Instruction::PTX` 16 | pub fn ptx(s: &str) -> Instruction { 17 | let ptx = CString::new(s).expect("Invalid PTX string"); 18 | Instruction::PTX(ptx) 19 | } 20 | 21 | /// Constructor for `Instruction::Cubin` 22 | pub fn cubin(sl: &[u8]) -> Instruction { 23 | Instruction::Cubin(sl.to_vec()) 24 | } 25 | 26 | /// Constructor for `Instruction::PTXFile` 27 | pub fn ptx_file(path: &Path) -> Result { 28 | if !path.exists() { 29 | return Err(AccelError::FileNotFound { 30 | path: path.to_owned(), 31 | }); 32 | } 33 | Ok(Instruction::PTXFile(path.to_owned())) 34 | } 35 | 36 | /// Constructor for `Instruction::CubinFile` 37 | pub fn cubin_file(path: &Path) -> Result { 38 | if !path.exists() { 39 | return Err(AccelError::FileNotFound { 40 | path: path.to_owned(), 41 | }); 42 | } 43 | Ok(Instruction::CubinFile(path.to_owned())) 44 | } 45 | } 46 | 47 | impl Instruction { 48 | /// Get type of PTX/cubin 49 | pub fn input_type(&self) -> CUjitInputType { 50 | match *self { 51 | Instruction::PTX(_) | Instruction::PTXFile(_) => CUjitInputType_enum::CU_JIT_INPUT_PTX, 52 | Instruction::Cubin(_) | Instruction::CubinFile(_) => { 53 | CUjitInputType_enum::CU_JIT_INPUT_CUBIN 54 | } 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /accel/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! GPGPU framework for Rust based on [CUDA Driver API] 2 | //! 3 | //! [CUDA Driver API]: https://docs.nvidia.com/cuda/cuda-driver-api/ 4 | //! 5 | //! Setup 6 | //! ----- 7 | //! Currently (0.3.0), accel works only on Linux system. Windows support will come in future release (0.3.x or 0.4~). 8 | //! 9 | //! 1. Install [CUDA](https://developer.nvidia.com/cuda-downloads) on your system 10 | //! 2. Setup Rust environement using rustup (Requires 1.42 or later) 11 | //! 3. Add `nvptx64-nvidia-cuda` target and install `ptx-linker`, or run 12 | //! 13 | //! ```shell 14 | //! curl -sSL https://gitlab.com/termoshtt/accel/raw/master/setup_nvptx_toolchain.sh | bash 15 | //! ``` 16 | //! 17 | //! Examples 18 | //! -------- 19 | //! accel works with stable Rust 20 | //! 21 | //! ```toml 22 | //! [dependencies] 23 | //! accel = "=0.3.0-alpha.2" 24 | //! ``` 25 | //! 26 | //! Do **NOT** add `accel-core` to `[dependencies]`. 27 | //! It will be linked automatically into the device code. 28 | //! 29 | //! ### Vector Add 30 | //! 31 | //! ``` 32 | //! use accel::*; 33 | //! 34 | //! #[kernel] 35 | //! 
unsafe fn add(a: *const f32, b: *const f32, c: *mut f32, n: usize) { 36 | //! let i = accel_core::index(); 37 | //! if (i as usize) < n { 38 | //! *c.offset(i) = *a.offset(i) + *b.offset(i); 39 | //! } 40 | //! } 41 | //! 42 | //! fn main() -> error::Result<()> { 43 | //! let device = Device::nth(0)?; 44 | //! let ctx = device.create_context(); 45 | //! 46 | //! // Allocate memories on GPU 47 | //! let n = 32; 48 | //! let mut a = DeviceMemory::::zeros(&ctx, n); 49 | //! let mut b = DeviceMemory::::zeros(&ctx, n); 50 | //! let mut c = DeviceMemory::::zeros(&ctx, n); 51 | //! 52 | //! // Accessible from CPU as usual Rust slice (though this will be slow) 53 | //! for i in 0..n { 54 | //! a[i] = i as f32; 55 | //! b[i] = 2.0 * i as f32; 56 | //! } 57 | //! println!("a = {:?}", a.as_slice()); 58 | //! println!("b = {:?}", b.as_slice()); 59 | //! 60 | //! // Launch kernel synchronously 61 | //! add(&ctx, 62 | //! 1 /* grid */, 63 | //! n /* block */, 64 | //! (a.as_ptr(), b.as_ptr(), c.as_mut_ptr(), n) 65 | //! ).expect("Kernel call failed"); 66 | //! 67 | //! println!("c = {:?}", c.as_slice()); 68 | //! Ok(()) 69 | //! } 70 | //! ``` 71 | //! 72 | //! ### Assertion on GPU 73 | //! 74 | //! ``` 75 | //! use accel::*; 76 | //! 77 | //! #[kernel] 78 | //! fn assert() { 79 | //! accel_core::assert_eq!(1 + 2, 4); // will fail 80 | //! } 81 | //! 82 | //! fn main() -> error::Result<()> { 83 | //! let device = Device::nth(0)?; 84 | //! let ctx = device.create_context(); 85 | //! let result = assert(&ctx, 1 /* grid */, 4 /* block */, ()); 86 | //! assert!(result.is_err()); // assertion failed 87 | //! Ok(()) 88 | //! } 89 | //! ``` 90 | //! 91 | //! ### Print from GPU 92 | //! 93 | //! ``` 94 | //! use accel::*; 95 | //! 96 | //! #[kernel] 97 | //! pub fn print() { 98 | //! let i = accel_core::index(); 99 | //! accel_core::println!("Hello from {}", i); 100 | //! } 101 | //! 102 | //! fn main() -> error::Result<()> { 103 | //! let device = Device::nth(0)?; 104 | //! let ctx = device.create_context(); 105 | //! print(&ctx, 1, 4, ())?; 106 | //! Ok(()) 107 | //! } 108 | //! ``` 109 | 110 | extern crate cuda_driver_sys as cuda; 111 | 112 | pub use accel_derive::kernel; 113 | 114 | pub mod device; 115 | pub mod error; 116 | pub mod execution; 117 | pub mod linker; 118 | pub mod memory; 119 | pub mod module; 120 | pub mod profiler; 121 | pub mod stream; 122 | 123 | mod block; 124 | mod grid; 125 | mod instruction; 126 | 127 | pub use block::Block; 128 | pub use device::*; 129 | pub use execution::*; 130 | pub use grid::Grid; 131 | pub use instruction::Instruction; 132 | pub use linker::*; 133 | pub use memory::*; 134 | pub use module::*; 135 | pub use profiler::*; 136 | pub use stream::*; 137 | 138 | #[cfg(test)] 139 | mod tests { 140 | /// Test accel_derive::kernel can be used in accel crate itself 141 | #[super::kernel] 142 | fn f() {} 143 | } 144 | -------------------------------------------------------------------------------- /accel/src/linker.rs: -------------------------------------------------------------------------------- 1 | //! 
CUDA JIT compiler and Linkers
2 | 
3 | use crate::{contexted_call, device::*, error::*, module::*, *};
4 | use cuda::*;
5 | use std::{
6 |     collections::HashMap,
7 |     ffi::{CStr, CString},
8 |     mem::MaybeUninit,
9 |     os::raw::c_void,
10 |     path::Path,
11 |     ptr::null_mut,
12 | };
13 | 
14 | // TODO
15 | #[derive(Debug, Clone)]
16 | pub struct LogBuffer {}
17 | 
18 | /// Configure generator for [CUjit_option] required in `cuLink*` APIs
19 | ///
20 | /// [CUjit_option]: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1g5527fa8030d5cabedc781a04dbd1997d
21 | #[derive(Debug, Clone, Default)]
22 | pub struct JITConfig {
23 |     /// CU_JIT_MAX_REGISTERS, Applies to compiler only
24 |     ///
25 |     /// - Max number of registers that a thread may use.
26 |     pub max_registers: Option<u32>,
27 | 
28 |     /// CU_JIT_THREADS_PER_BLOCK, Applies to compiler only
29 |     ///
30 |     /// - **IN**: Specifies minimum number of threads per block to target compilation for
31 |     /// - **OUT**: Returns the number of threads the compiler actually targeted.
32 |     ///   This restricts the resource utilization of the compiler (e.g. max registers) such that a block with the given number of threads should be able to launch based on register limitations.
33 |     ///
34 |     /// Note
35 |     /// ----
36 |     /// This option does not currently take into account any other resource limitations, such as shared memory utilization. Cannot be combined with CU_JIT_TARGET.
37 |     pub threads_per_block: Option<u32>,
38 | 
39 |     /// CU_JIT_WALL_TIME, Applies to compiler and linker
40 |     ///
41 |     /// - Overwrites the option value with the total wall clock time, in milliseconds, spent in the compiler and linker
42 |     /// - Option type: float
43 |     pub wall_time: Option<f32>,
44 | 
45 |     /// CU_JIT_INFO_LOG_BUFFER, Applies to compiler and linker
46 |     ///
47 |     /// - Pointer to a buffer in which to print any log messages that are informational in nature (the buffer size is specified via option CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)
48 |     ///
49 |     /// CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, Applies to compiler and linker
50 |     ///
51 |     /// - **IN**: Log buffer size in bytes. Log messages will be capped at this size (including null terminator)
52 |     /// - **OUT**: Amount of log buffer filled with messages
53 |     pub info_log_buffer: Option<LogBuffer>,
54 | 
55 |     /// CU_JIT_ERROR_LOG_BUFFER, Applies to compiler and linker
56 |     ///
57 |     /// - Pointer to a buffer in which to print any log messages that reflect errors (the buffer size is specified via option CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)
58 |     ///
59 |     /// CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, Applies to compiler and linker
60 |     ///
61 |     /// - **IN**: Log buffer size in bytes. Log messages will be capped at this size (including null terminator)
62 |     /// - **OUT**: Amount of log buffer filled with messages
63 |     pub error_log_buffer: Option<LogBuffer>,
64 | 
65 |     /// CU_JIT_OPTIMIZATION_LEVEL, Applies to compiler only
66 |     ///
67 |     /// - Level of optimizations to apply to generated code (0 - 4), with 4 being the default and highest level of optimizations.
68 |     pub optimization_level: Option<u32>,
69 | 
70 |     /// CU_JIT_TARGET_FROM_CUCONTEXT, Applies to compiler and linker
71 |     ///
72 |     /// - No option value required. Determines the target based on the current attached context (default)
73 |     pub target_from_cucontext: Option<()>,
74 | 
75 |     /// CU_JIT_TARGET, Applies to compiler and linker
76 |     ///
77 |     /// - Target is chosen based on supplied CUjit_target. Cannot be combined with CU_JIT_THREADS_PER_BLOCK.
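    /// - (Illustrative sketch: e.g. `Some(CUjit_target_enum::CU_TARGET_COMPUTE_70)` would pin
    ///   the target to sm_70; the exact variant name is assumed from `cuda-driver-sys`.)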
78 |     pub target: Option<CUjit_target>,
79 | 
80 |     /// CU_JIT_FALLBACK_STRATEGY, Applies to compiler only
81 |     ///
82 |     /// - Specifies choice of fallback strategy if matching cubin is not found. Choice is based on supplied CUjit_fallback.
83 |     ///   This option cannot be used with cuLink* APIs as the linker requires exact matches.
84 |     pub fallback_strategy: Option<CUjit_fallback>,
85 | 
86 |     /// CU_JIT_GENERATE_DEBUG_INFO, Applies to compiler and linker
87 |     ///
88 |     /// - Specifies whether to create debug information in output (-g) (0: false, default)
89 |     pub generate_debug_info: Option<bool>,
90 | 
91 |     /// CU_JIT_LOG_VERBOSE, Applies to compiler and linker
92 |     ///
93 |     /// - Generate verbose log messages (0: false, default)
94 |     pub log_verbose: Option<bool>,
95 | 
96 |     /// CU_JIT_GENERATE_LINE_INFO, Applies to compiler only
97 |     ///
98 |     /// - Generate line number information (-lineinfo) (0: false, default)
99 |     pub generate_line_info: Option<bool>,
100 | 
101 |     /// CU_JIT_CACHE_MODE, Applies to compiler only
102 |     ///
103 |     /// - Specifies whether to enable caching explicitly (-dlcm). Choice is based on supplied CUjit_cacheMode_enum.
104 |     pub cache_mode: Option<CUjit_cacheMode>,
105 | 
106 |     /// CU_JIT_NEW_SM3X_OPT
107 |     ///
108 |     /// - The below jit options are used for internal purposes only, in this version of CUDA
109 |     pub new_sm3x_opt: Option<bool>,
110 | 
111 |     /// CU_JIT_FAST_COMPILE
112 |     pub fast_compile: bool,
113 | 
114 |     /// CU_JIT_GLOBAL_SYMBOL_NAMES, Applies to dynamic linker only
115 |     ///
116 |     /// - Array of device symbol names that will be relocated to the corresponding host addresses stored in CU_JIT_GLOBAL_SYMBOL_ADDRESSES.
117 |     ///   Must contain CU_JIT_GLOBAL_SYMBOL_COUNT entries. When loading a device module, the driver will relocate all encountered unresolved symbols to the host addresses.
118 |     ///   It is only allowed to register symbols that correspond to unresolved global variables. It is illegal to register the same device symbol at multiple addresses.
119 |     ///
120 |     /// CU_JIT_GLOBAL_SYMBOL_ADDRESSES, Applies to dynamic linker only
121 |     ///
122 |     /// - Array of host addresses that will be used to relocate corresponding device symbols stored in CU_JIT_GLOBAL_SYMBOL_NAMES.
123 |     ///   Must contain CU_JIT_GLOBAL_SYMBOL_COUNT entries.
124 |     ///
125 |     /// CU_JIT_GLOBAL_SYMBOL_COUNT, Applies to dynamic linker only
126 |     ///
127 |     /// - Number of entries in CU_JIT_GLOBAL_SYMBOL_NAMES and CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.
128 |     pub global_symbol: HashMap<String, *mut c_void>,
129 | }
130 | 
131 | impl JITConfig {
132 |     /// Pack the configuration into a C API compatible format
133 |     fn pack(&mut self) -> (u32, Vec<CUjit_option>, Vec<*mut c_void>) {
134 |         let mut opt_keys = Vec::new();
135 |         let mut opt_values = Vec::new();
136 | 
137 |         macro_rules! 
check_option { 138 | ( $tag:ident, $opt_name:ident) => { 139 | if let Some($opt_name) = self.$opt_name.as_ref() { 140 | opt_keys.push(CUjit_option::$tag); 141 | opt_values.push($opt_name as *const _ as *mut c_void); 142 | } 143 | }; 144 | } 145 | check_option!(CU_JIT_MAX_REGISTERS, max_registers); 146 | check_option!(CU_JIT_THREADS_PER_BLOCK, threads_per_block); 147 | check_option!(CU_JIT_WALL_TIME, wall_time); 148 | check_option!(CU_JIT_OPTIMIZATION_LEVEL, optimization_level); 149 | check_option!(CU_JIT_TARGET, target); 150 | check_option!(CU_JIT_FALLBACK_STRATEGY, fallback_strategy); 151 | check_option!(CU_JIT_GENERATE_DEBUG_INFO, generate_debug_info); 152 | check_option!(CU_JIT_LOG_VERBOSE, log_verbose); 153 | check_option!(CU_JIT_GENERATE_LINE_INFO, generate_line_info); 154 | check_option!(CU_JIT_CACHE_MODE, cache_mode); 155 | check_option!(CU_JIT_NEW_SM3X_OPT, new_sm3x_opt); 156 | 157 | if self.fast_compile { 158 | opt_keys.push(CUjit_option::CU_JIT_FAST_COMPILE); 159 | opt_values.push(&self.fast_compile as *const bool as *mut c_void); 160 | } 161 | 162 | if let Some(_info_log_buffer) = self.info_log_buffer.as_mut() { 163 | unimplemented!("Log for JIT is not supported yet"); 164 | } 165 | 166 | if let Some(_error_log_buffer) = self.error_log_buffer.as_mut() { 167 | unimplemented!("Log for JIT is not supported yet"); 168 | } 169 | 170 | if !self.global_symbol.is_empty() { 171 | unimplemented!("GLOBAL_SYMBOL flags are not supported yet"); 172 | } 173 | assert_eq!(opt_keys.len(), opt_values.len()); 174 | (opt_keys.len() as u32, opt_keys, opt_values) 175 | } 176 | } 177 | 178 | /// Consuming builder for cubin from PTX and cubins 179 | #[derive(accel_derive::Contexted)] 180 | pub struct Linker { 181 | state: CUlinkState, 182 | cfg: JITConfig, 183 | ctx: Context, 184 | } 185 | 186 | impl Drop for Linker { 187 | fn drop(&mut self) { 188 | if let Err(e) = unsafe { contexted_call!(self, cuLinkDestroy, self.state) } { 189 | log::error!("Failed to release Linker: {:?}", e) 190 | } 191 | } 192 | } 193 | 194 | impl Linker { 195 | /// Create a new Linker 196 | pub fn create(ctx: &Context, mut cfg: JITConfig) -> Result { 197 | let (n, mut opt, mut opts) = cfg.pack(); 198 | let state = unsafe { 199 | let mut state = MaybeUninit::uninit(); 200 | contexted_call!( 201 | ctx, 202 | cuLinkCreate_v2, 203 | n, 204 | opt.as_mut_ptr(), 205 | opts.as_mut_ptr(), 206 | state.as_mut_ptr() 207 | )?; 208 | state.assume_init() 209 | }; 210 | Ok(Linker { 211 | state, 212 | cfg, 213 | ctx: ctx.clone(), 214 | }) 215 | } 216 | 217 | /// Wrapper of cuLinkAddData 218 | unsafe fn add_data(mut self, input_type: CUjitInputType, data: &[u8]) -> Result { 219 | let (nopts, mut opts, mut opt_vals) = self.cfg.pack(); 220 | let name = CString::new("").unwrap(); 221 | contexted_call!( 222 | &self, 223 | cuLinkAddData_v2, 224 | self.state, 225 | input_type, 226 | data.as_ptr() as *mut _, 227 | data.len(), 228 | name.as_ptr(), 229 | nopts, 230 | opts.as_mut_ptr(), 231 | opt_vals.as_mut_ptr() 232 | )?; 233 | Ok(self) 234 | } 235 | 236 | /// Wrapper of cuLinkAddFile 237 | unsafe fn add_file(mut self, input_type: CUjitInputType, path: &Path) -> Result { 238 | let filename = CString::new(path.to_str().unwrap()).expect("Invalid file path"); 239 | let (nopts, mut opts, mut opt_vals) = self.cfg.pack(); 240 | contexted_call!( 241 | &self, 242 | cuLinkAddFile_v2, 243 | self.state, 244 | input_type, 245 | filename.as_ptr(), 246 | nopts, 247 | opts.as_mut_ptr(), 248 | opt_vals.as_mut_ptr() 249 | )?; 250 | Ok(self) 251 | } 252 | 253 | /// Add a 
resouce into the linker stack. 254 | pub fn add(self, data: &Instruction) -> Result { 255 | Ok(match *data { 256 | Instruction::PTX(ref ptx) => unsafe { 257 | let cstr = CString::new(ptx.as_bytes()).expect("Invalid PTX String"); 258 | self.add_data(data.input_type(), cstr.as_bytes_with_nul())? 259 | }, 260 | Instruction::Cubin(ref bin) => unsafe { self.add_data(data.input_type(), &bin)? }, 261 | Instruction::PTXFile(ref path) | Instruction::CubinFile(ref path) => unsafe { 262 | self.add_file(data.input_type(), path)? 263 | }, 264 | }) 265 | } 266 | 267 | /// Wrapper of cuLinkComplete 268 | /// 269 | /// LinkComplete returns a reference to cubin, 270 | /// which is managed by LinkState. 271 | /// Use owned strategy to avoid considering lifetime. 272 | pub fn complete(self) -> Result { 273 | let mut cb = null_mut(); 274 | unsafe { 275 | contexted_call!( 276 | &self, 277 | cuLinkComplete, 278 | self.state, 279 | &mut cb as *mut _, 280 | null_mut() 281 | )?; 282 | Ok(Instruction::cubin(CStr::from_ptr(cb as _).to_bytes())) 283 | } 284 | } 285 | } 286 | 287 | /// Link PTX/cubin into a module 288 | pub fn link(ctx: &Context, data: &[Instruction], opt: JITConfig) -> Result { 289 | let mut l = Linker::create(&ctx, opt)?; 290 | for d in data { 291 | l = l.add(d)?; 292 | } 293 | let cubin = l.complete()?; 294 | Module::load(ctx, &cubin) 295 | } 296 | 297 | #[cfg(test)] 298 | mod tests { 299 | use super::*; 300 | 301 | #[test] 302 | fn create() -> Result<()> { 303 | let device = Device::nth(0)?; 304 | let ctx = device.create_context(); 305 | let _linker = Linker::create(&ctx, JITConfig::default())?; 306 | Ok(()) 307 | } 308 | 309 | #[test] 310 | fn ptx_file() -> Result<()> { 311 | let device = Device::nth(0)?; 312 | let ctx = device.create_context(); 313 | let linker = Linker::create(&ctx, JITConfig::default())?; 314 | let data = Instruction::ptx_file(Path::new("tests/data/add.ptx"))?; 315 | linker.add(&data)?; 316 | Ok(()) 317 | } 318 | 319 | #[test] 320 | fn linking() -> Result<()> { 321 | let device = Device::nth(0)?; 322 | let ctx = device.create_context(); 323 | 324 | let data_add = Instruction::ptx_file(Path::new("tests/data/add.ptx"))?; 325 | let data_sub = Instruction::ptx_file(Path::new("tests/data/sub.ptx"))?; 326 | let _module = Linker::create(&ctx, JITConfig::default())? 327 | .add(&data_add)? 328 | .add(&data_sub)? 329 | .complete()?; 330 | Ok(()) 331 | } 332 | 333 | #[ignore] // FIXME Causes CUDA_ERROR_NO_BINARY_FOR_GPU 334 | #[test] 335 | fn cubin_file() -> Result<()> { 336 | let device = Device::nth(0)?; 337 | let ctx = device.create_context(); 338 | let linker = Linker::create(&ctx, JITConfig::default())?; 339 | let data = Instruction::cubin_file(Path::new("tests/data/add.cubin"))?; 340 | linker.add(&data)?; 341 | Ok(()) 342 | } 343 | } 344 | -------------------------------------------------------------------------------- /accel/src/memory/array.rs: -------------------------------------------------------------------------------- 1 | //! CUDA [Array] and [Texture], [Surface] Objects 2 | //! 3 | //! [Array]: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-arrays 4 | //! [Texture]: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html#group__CUDA__TEXOBJECT 5 | //! 
[Surface]: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html#group__CUDA__SURFOBJECT 6 | 7 | use crate::{contexted_call, contexted_new, device::Contexted, error::Result, *}; 8 | use cuda::*; 9 | use futures::future::BoxFuture; 10 | use num_traits::ToPrimitive; 11 | use std::marker::PhantomData; 12 | 13 | pub use cuda::CUDA_ARRAY3D_DESCRIPTOR as Descriptor; 14 | 15 | #[derive(Debug, Contexted)] 16 | pub struct Array { 17 | array: CUarray, 18 | dim: Dim, 19 | context: Context, 20 | phantom: PhantomData, 21 | } 22 | 23 | unsafe impl Send for Array {} 24 | unsafe impl Sync for Array {} 25 | 26 | impl Drop for Array { 27 | fn drop(&mut self) { 28 | if let Err(e) = unsafe { contexted_call!(self, cuArrayDestroy, self.array) } { 29 | log::error!("Failed to cleanup array: {:?}", e); 30 | } 31 | } 32 | } 33 | 34 | impl Array { 35 | /// Get dimension 36 | pub fn dim(&self) -> &Dim { 37 | &self.dim 38 | } 39 | } 40 | 41 | impl Memory for Array { 42 | type Elem = T; 43 | fn head_addr(&self) -> *const T { 44 | self.array as _ 45 | } 46 | fn head_addr_mut(&mut self) -> *mut T { 47 | self.array as _ 48 | } 49 | 50 | fn num_elem(&self) -> usize { 51 | self.dim.len() 52 | } 53 | 54 | fn memory_type(&self) -> MemoryType { 55 | MemoryType::Array 56 | } 57 | 58 | fn set(&mut self, value: Self::Elem) { 59 | // FIXME CUDA does not have memcpy for array. This is easy but too expensive alternative way 60 | let src = PageLockedMemory::from_elem(&self.context, self.dim.len(), value); 61 | self.copy_from(&src); 62 | } 63 | } 64 | 65 | fn memcpy3d_param_h2a( 66 | src: &[T], 67 | dst: &mut Array, 68 | ) -> CUDA_MEMCPY3D { 69 | let dim = dst.dim; 70 | CUDA_MEMCPY3D { 71 | srcMemoryType: CUmemorytype_enum::CU_MEMORYTYPE_UNIFIED, 72 | srcDevice: src.as_ptr() as CUdeviceptr, 73 | 74 | dstMemoryType: CUmemorytype_enum::CU_MEMORYTYPE_ARRAY, 75 | dstArray: dst.array, 76 | 77 | WidthInBytes: dim.width() * T::size_of() * dim.num_channels().to_usize().unwrap(), 78 | Height: dim.height(), 79 | Depth: dim.depth(), 80 | 81 | ..Default::default() 82 | } 83 | } 84 | 85 | impl Memcpy<[T]> for Array { 86 | fn copy_from(&mut self, src: &[T]) { 87 | assert_ne!(self.head_addr(), src.head_addr()); 88 | assert_eq!(self.num_elem(), src.num_elem()); 89 | unsafe { contexted_call!(self, cuMemcpy3D_v2, &memcpy3d_param_h2a(src, self)) } 90 | .expect("memcpy into array failed"); 91 | } 92 | 93 | fn copy_from_async<'a>(&'a mut self, src: &'a [T]) -> BoxFuture<'a, ()> { 94 | assert_ne!(self.head_addr(), src.head_addr()); 95 | assert_eq!(self.num_elem(), src.num_elem()); 96 | let stream = stream::Stream::new(self.context.get_ref()); 97 | unsafe { 98 | contexted_call!( 99 | self, 100 | cuMemcpy3DAsync_v2, 101 | &memcpy3d_param_h2a(src, self), 102 | stream.stream 103 | ) 104 | } 105 | .expect("memcpy into array failed"); 106 | Box::pin(async { stream.into_future().await.expect("async memcpy failed") }) 107 | } 108 | } 109 | 110 | fn memcpy3d_param_a2h( 111 | src: &Array, 112 | dst: &mut [T], 113 | ) -> CUDA_MEMCPY3D { 114 | let dim = src.dim; 115 | CUDA_MEMCPY3D { 116 | srcMemoryType: CUmemorytype_enum::CU_MEMORYTYPE_ARRAY, 117 | srcArray: src.array, 118 | 119 | dstMemoryType: CUmemorytype_enum::CU_MEMORYTYPE_UNIFIED, 120 | dstDevice: dst.as_mut_ptr() as CUdeviceptr, 121 | 122 | WidthInBytes: dim.width() * T::size_of() * dim.num_channels().to_usize().unwrap(), 123 | Height: dim.height(), 124 | Depth: dim.depth(), 125 | 126 | ..Default::default() 127 | } 128 | } 129 | 130 | impl Memcpy> for [T] { 131 | fn copy_from(&mut self, 
src: &Array) { 132 | assert_ne!(self.head_addr(), src.head_addr()); 133 | assert_eq!(self.num_elem(), src.num_elem()); 134 | unsafe { contexted_call!(src, cuMemcpy3D_v2, &memcpy3d_param_a2h(src, self)) } 135 | .expect("memcpy from array failed"); 136 | } 137 | 138 | fn copy_from_async<'a>(&'a mut self, src: &'a Array) -> BoxFuture<'a, ()> { 139 | assert_ne!(self.head_addr(), src.head_addr()); 140 | assert_eq!(self.num_elem(), src.num_elem()); 141 | let stream = stream::Stream::new(src.context.get_ref()); 142 | unsafe { 143 | contexted_call!( 144 | src, 145 | cuMemcpy3DAsync_v2, 146 | &memcpy3d_param_a2h(src, self), 147 | stream.stream 148 | ) 149 | } 150 | .expect("memcpy from array failed"); 151 | Box::pin(async { stream.into_future().await.expect("async memcpy failed") }) 152 | } 153 | } 154 | 155 | macro_rules! impl_memcpy_array { 156 | ($t:path) => { 157 | impl Memcpy> for $t { 158 | fn copy_from(&mut self, src: &Array) { 159 | self.as_mut_slice().copy_from(src); 160 | } 161 | fn copy_from_async<'a>(&'a mut self, src: &'a Array) -> BoxFuture<'a, ()> { 162 | self.as_mut_slice().copy_from_async(src) 163 | } 164 | } 165 | 166 | impl Memcpy<$t> for Array { 167 | fn copy_from(&mut self, src: &$t) { 168 | self.copy_from(src.as_slice()); 169 | } 170 | fn copy_from_async<'a>(&'a mut self, src: &'a $t) -> BoxFuture<'a, ()> { 171 | self.copy_from_async(src.as_slice()) 172 | } 173 | } 174 | }; 175 | } 176 | 177 | impl_memcpy_array!(DeviceMemory::); 178 | impl_memcpy_array!(PageLockedMemory::); 179 | impl_memcpy_array!(RegisteredMemory::<'_, T>); 180 | 181 | impl Allocatable for Array { 182 | type Shape = Dim; 183 | unsafe fn uninitialized(context: &Context, dim: Dim) -> Self { 184 | let desc = dim.as_descriptor::(); 185 | let array = 186 | contexted_new!(context, cuArray3DCreate_v2, &desc).expect("Cannot create a new array"); 187 | Array { 188 | array, 189 | dim, 190 | context: context.clone(), 191 | phantom: PhantomData, 192 | } 193 | } 194 | } 195 | 196 | #[cfg(test)] 197 | mod tests { 198 | use super::*; 199 | use crate::device::*; 200 | 201 | #[test] 202 | fn new_1d() -> Result<()> { 203 | let device = Device::nth(0)?; 204 | let context = device.create_context(); 205 | let _array1: Array = Array::zeros(&context, 10.into()); 206 | let _array2: Array = Array::zeros(&context, (10,).into()); 207 | Ok(()) 208 | } 209 | 210 | #[test] 211 | fn new_2d() -> Result<()> { 212 | let device = Device::nth(0)?; 213 | let context = device.create_context(); 214 | let _array: Array = Array::zeros(&context, (10, 12).into()); 215 | Ok(()) 216 | } 217 | 218 | #[test] 219 | fn new_3d() -> Result<()> { 220 | let device = Device::nth(0)?; 221 | let context = device.create_context(); 222 | let _array: Array = Array::zeros(&context, (10, 12, 8).into()); 223 | Ok(()) 224 | } 225 | 226 | #[test] 227 | fn new_1d_layered() -> Result<()> { 228 | let device = Device::nth(0)?; 229 | let context = device.create_context(); 230 | let _array: Array = Array::zeros(&context, (10, 12).into()); 231 | Ok(()) 232 | } 233 | 234 | #[test] 235 | fn new_2d_layered() -> Result<()> { 236 | let device = Device::nth(0)?; 237 | let context = device.create_context(); 238 | let _array: Array = Array::zeros(&context, (10, 12, 8).into()); 239 | Ok(()) 240 | } 241 | 242 | #[test] 243 | fn memcpy_h2a2h_1d() -> Result<()> { 244 | let device = Device::nth(0)?; 245 | let context = device.create_context(); 246 | let n = 10; 247 | let src = PageLockedMemory::from_elem(&context, n, 2_u32); 248 | let mut dst = PageLockedMemory::zeros(&context, n); 249 | 
let mut array = unsafe { Array::<u32, Ix1>::uninitialized(&context, n.into()) }; 250 | array.copy_from(&src); 251 | dst.copy_from(&array); 252 | dbg!(dst.as_slice()); 253 | for i in 0..n { 254 | assert_eq!(dst[i], 2_u32); 255 | } 256 | Ok(()) 257 | } 258 | 259 | #[test] 260 | fn memcpy_d2a2d_2d() -> Result<()> { 261 | let device = Device::nth(0)?; 262 | let context = device.create_context(); 263 | let n = 3; 264 | let m = 4; 265 | let src = DeviceMemory::from_elem(&context, n * m, 2_u32); 266 | let mut dst = DeviceMemory::zeros(&context, n * m); 267 | let mut array = unsafe { Array::<u32, Ix2>::uninitialized(&context, (n, m).into()) }; 268 | array.copy_from(&src); 269 | dst.copy_from(&array); 270 | dbg!(dst.as_slice()); 271 | for i in 0..n * m { 272 | assert_eq!(dst[i], 2_u32); 273 | } 274 | Ok(()) 275 | } 276 | 277 | #[test] 278 | fn memcpy_h2a2h_2d() -> Result<()> { 279 | let device = Device::nth(0)?; 280 | let context = device.create_context(); 281 | let n = 3; 282 | let m = 4; 283 | let src = PageLockedMemory::from_elem(&context, n * m, 2_u32); 284 | let mut dst = PageLockedMemory::zeros(&context, n * m); 285 | let mut array = unsafe { Array::<u32, Ix2>::uninitialized(&context, (n, m).into()) }; 286 | array.copy_from(&src); 287 | dst.copy_from(&array); 288 | dbg!(dst.as_slice()); 289 | for i in 0..n * m { 290 | assert_eq!(dst[i], 2_u32); 291 | } 292 | Ok(()) 293 | } 294 | 295 | #[test] 296 | fn memcpy_d2a2d_1d() -> Result<()> { 297 | let device = Device::nth(0)?; 298 | let context = device.create_context(); 299 | let n = 3; 300 | let m = 4; 301 | let src = DeviceMemory::from_elem(&context, n * m, 2_u32); 302 | let mut dst = DeviceMemory::zeros(&context, n * m); 303 | let mut array = unsafe { Array::<u32, Ix1>::uninitialized(&context, (n * m).into()) }; 304 | array.copy_from(&src); 305 | dst.copy_from(&array); 306 | dbg!(dst.as_slice()); 307 | for i in 0..n * m { 308 | assert_eq!(dst[i], 2_u32); 309 | } 310 | Ok(()) 311 | } 312 | #[test] 313 | fn memcpy_h2a2h_3d() -> Result<()> { 314 | let device = Device::nth(0)?; 315 | let context = device.create_context(); 316 | let n = 3; 317 | let m = 4; 318 | let l = 2; 319 | let src = PageLockedMemory::from_elem(&context, n * m * l, 2_u32); 320 | let mut dst = PageLockedMemory::zeros(&context, n * m * l); 321 | let mut array = unsafe { Array::<u32, Ix3>::uninitialized(&context, (n, m, l).into()) }; 322 | array.copy_from(&src); 323 | dst.copy_from(&array); 324 | dbg!(dst.as_slice()); 325 | for i in 0..n * m * l { 326 | assert_eq!(dst[i], 2_u32); 327 | } 328 | Ok(()) 329 | } 330 | 331 | #[test] 332 | fn memcpy_d2a2d_3d() -> Result<()> { 333 | let device = Device::nth(0)?; 334 | let context = device.create_context(); 335 | let n = 3; 336 | let m = 4; 337 | let l = 2; 338 | let src = DeviceMemory::from_elem(&context, n * m * l, 2_u32); 339 | let mut dst = DeviceMemory::zeros(&context, n * m * l); 340 | let mut array = unsafe { Array::<u32, Ix3>::uninitialized(&context, (n, m, l).into()) }; 341 | array.copy_from(&src); 342 | dst.copy_from(&array); 343 | dbg!(dst.as_slice()); 344 | for i in 0..n * m * l { 345 | assert_eq!(dst[i], 2_u32); 346 | } 347 | Ok(()) 348 | } 349 | 350 | #[test] 351 | fn memcpy_h2a2h_1dlayer() -> Result<()> { 352 | let device = Device::nth(0)?; 353 | let context = device.create_context(); 354 | let n = 3; 355 | let m = 4; 356 | let src = PageLockedMemory::from_elem(&context, n * m, 2_u32); 357 | let mut dst = PageLockedMemory::zeros(&context, n * m); 358 | let mut array = unsafe { Array::<u32, Ix1Layered>::uninitialized(&context, (n, m).into()) }; 359 | array.copy_from(&src); 360 | dst.copy_from(&array); 361 | 
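// For layered arrays the descriptor's `Depth` field counts layers (see
// `ArrayFlag::LAYERED` in dimension.rs), so this `Ix1Layered` spec with
// width = n and depth = m holds n * m elements in total.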
dbg!(dst.as_slice()); 362 | for i in 0..n * m { 363 | assert_eq!(dst[i], 2_u32); 364 | } 365 | Ok(()) 366 | } 367 | 368 | #[test] 369 | fn memcpy_d2a2d_1dlayer() -> Result<()> { 370 | let device = Device::nth(0)?; 371 | let context = device.create_context(); 372 | let n = 3; 373 | let m = 4; 374 | let src = DeviceMemory::from_elem(&context, n * m, 2_u32); 375 | let mut dst = DeviceMemory::zeros(&context, n * m); 376 | let mut array = unsafe { Array::<u32, Ix1Layered>::uninitialized(&context, (n, m).into()) }; 377 | array.copy_from(&src); 378 | dst.copy_from(&array); 379 | dbg!(dst.as_slice()); 380 | for i in 0..n * m { 381 | assert_eq!(dst[i], 2_u32); 382 | } 383 | Ok(()) 384 | } 385 | 386 | #[test] 387 | fn memcpy_h2a2h_2dlayer() -> Result<()> { 388 | let device = Device::nth(0)?; 389 | let context = device.create_context(); 390 | let n = 3; 391 | let m = 4; 392 | let l = 2; 393 | let src = PageLockedMemory::from_elem(&context, n * m * l, 2_u32); 394 | let mut dst = PageLockedMemory::zeros(&context, n * m * l); 395 | let mut array = 396 | unsafe { Array::<u32, Ix2Layered>::uninitialized(&context, (n, m, l).into()) }; 397 | array.copy_from(&src); 398 | dst.copy_from(&array); 399 | dbg!(dst.as_slice()); 400 | for i in 0..n * m * l { 401 | assert_eq!(dst[i], 2_u32); 402 | } 403 | Ok(()) 404 | } 405 | 406 | #[test] 407 | fn memcpy_d2a2d_2dlayer() -> Result<()> { 408 | let device = Device::nth(0)?; 409 | let context = device.create_context(); 410 | let n = 3; 411 | let m = 4; 412 | let l = 2; 413 | let src = DeviceMemory::from_elem(&context, n * m * l, 2_u32); 414 | let mut dst = DeviceMemory::zeros(&context, n * m * l); 415 | let mut array = 416 | unsafe { Array::<u32, Ix2Layered>::uninitialized(&context, (n, m, l).into()) }; 417 | array.copy_from(&src); 418 | dst.copy_from(&array); 419 | dbg!(dst.as_slice()); 420 | for i in 0..n * m * l { 421 | assert_eq!(dst[i], 2_u32); 422 | } 423 | Ok(()) 424 | } 425 | } 426 | -------------------------------------------------------------------------------- /accel/src/memory/device.rs: -------------------------------------------------------------------------------- 1 | //! Device and Host memory handlers 2 | 3 | use super::*; 4 | use crate::{error::*, *}; 5 | use cuda::*; 6 | use std::{ 7 | fmt, 8 | marker::PhantomData, 9 | ops::{Deref, DerefMut}, 10 | }; 11 | 12 | use cuda::CUmemAttach_flags_enum as AttachFlag; 13 | 14 | /// Memory allocated on the device. 
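///
/// A minimal usage sketch (assuming a CUDA-capable device 0 is available), in the
/// same doctest style as the examples in `memory/mod.rs`:
///
/// ```
/// # use accel::*;
/// let device = Device::nth(0).unwrap();
/// let ctx = device.create_context();
/// let mem = DeviceMemory::<i32>::from_elem(&ctx, 12, 1);
/// assert_eq!(mem[0], 1); // managed memory is also readable from the host
/// ```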
15 | #[derive(Contexted)] 16 | pub struct DeviceMemory<T> { 17 | ptr: CUdeviceptr, 18 | size: usize, 19 | context: Context, 20 | phantom: PhantomData<T>, 21 | } 22 | 23 | unsafe impl<T> Sync for DeviceMemory<T> {} 24 | unsafe impl<T> Send for DeviceMemory<T> {} 25 | 26 | impl<T> Drop for DeviceMemory<T> { 27 | fn drop(&mut self) { 28 | if let Err(e) = unsafe { contexted_call!(self, cuMemFree_v2, self.ptr) } { 29 | log::error!("Failed to free device memory: {:?}", e); 30 | } 31 | } 32 | } 33 | 34 | impl<T: Scalar> fmt::Debug for DeviceMemory<T> { 35 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 36 | f.debug_struct("DeviceMemory") 37 | .field("context", &self.context) 38 | .field("data", &self.as_slice()) 39 | .finish() 40 | } 41 | } 42 | 43 | impl<T> Deref for DeviceMemory<T> { 44 | type Target = [T]; 45 | fn deref(&self) -> &[T] { 46 | unsafe { std::slice::from_raw_parts(self.ptr as _, self.size) } 47 | } 48 | } 49 | 50 | impl<T> DerefMut for DeviceMemory<T> { 51 | fn deref_mut(&mut self) -> &mut [T] { 52 | unsafe { std::slice::from_raw_parts_mut(self.ptr as _, self.size) } 53 | } 54 | } 55 | 56 | impl<T: Scalar> PartialEq for DeviceMemory<T> { 57 | fn eq(&self, other: &Self) -> bool { 58 | // FIXME should be tested on device 59 | self.as_slice().eq(other.as_slice()) 60 | } 61 | } 62 | 63 | impl<T: Scalar> PartialEq<[T]> for DeviceMemory<T> { 64 | fn eq(&self, other: &[T]) -> bool { 65 | // FIXME should be tested on device 66 | self.as_slice().eq(other) 67 | } 68 | } 69 | 70 | impl<T: Scalar> Memory for DeviceMemory<T> { 71 | type Elem = T; 72 | fn head_addr(&self) -> *const T { 73 | self.ptr as _ 74 | } 75 | 76 | fn head_addr_mut(&mut self) -> *mut T { 77 | self.ptr as _ 78 | } 79 | 80 | fn num_elem(&self) -> usize { 81 | self.size 82 | } 83 | 84 | fn memory_type(&self) -> MemoryType { 85 | MemoryType::Device 86 | } 87 | 88 | fn set(&mut self, value: T) { 89 | match T::size_of() { 90 | 1 => unsafe { 91 | contexted_call!( 92 | self, 93 | cuMemsetD8_v2, 94 | self.head_addr_mut() as CUdeviceptr, 95 | value.to_le_u8().unwrap(), 96 | self.num_elem() 97 | ) 98 | } 99 | .expect("memset failed for 8-bit scalar"), 100 | 2 => unsafe { 101 | contexted_call!( 102 | self, 103 | cuMemsetD16_v2, 104 | self.head_addr_mut() as CUdeviceptr, 105 | value.to_le_u16().unwrap(), 106 | self.num_elem() 107 | ) 108 | } 109 | .expect("memset failed for 16-bit scalar"), 110 | 4 => unsafe { 111 | contexted_call!( 112 | self, 113 | cuMemsetD32_v2, 114 | self.head_addr_mut() as CUdeviceptr, 115 | value.to_le_u32().unwrap(), 116 | self.num_elem() 117 | ) 118 | } 119 | .expect("memset failed for 32-bit scalar"), 120 | _ => { 121 | unimplemented!("memset for Device memory is only supported for 8/16/32-bit scalars") 122 | } 123 | } 124 | } 125 | } 126 | 127 | impl<T: Scalar> Continuous for DeviceMemory<T> { 128 | fn as_slice(&self) -> &[T] { 129 | self 130 | } 131 | fn as_mut_slice(&mut self) -> &mut [T] { 132 | self 133 | } 134 | } 135 | 136 | impl<T: Scalar> Allocatable for DeviceMemory<T> { 137 | type Shape = usize; 138 | unsafe fn uninitialized(context: &Context, size: usize) -> Self { 139 | assert!(size > 0, "Zero-sized malloc is forbidden"); 140 | let ptr = contexted_new!( 141 | context, 142 | cuMemAllocManaged, 143 | size * std::mem::size_of::<T>(), 144 | AttachFlag::CU_MEM_ATTACH_GLOBAL as u32 145 | ) 146 | .expect("Cannot allocate device memory"); 147 | DeviceMemory { 148 | ptr, 149 | size, 150 | context: context.clone(), 151 | phantom: PhantomData, 152 | } 153 | } 154 | } 155 | 156 | impl<'arg, T: Scalar> DeviceSend for &'arg DeviceMemory<T> { 157 | type Target = *const T; 158 | fn as_kernel_parameter(&self) -> *mut 
c_void { 159 | &self.ptr as *const CUdeviceptr as *mut c_void 160 | } 161 | } 162 | 163 | impl<'arg, T: Scalar> DeviceSend for &'arg mut DeviceMemory<T> { 164 | type Target = *mut T; 165 | fn as_kernel_parameter(&self) -> *mut c_void { 166 | &self.ptr as *const CUdeviceptr as *mut c_void 167 | } 168 | } 169 | 170 | #[cfg(test)] 171 | mod tests { 172 | use super::*; 173 | 174 | #[test] 175 | fn as_mut_slice() -> Result<()> { 176 | let device = Device::nth(0)?; 177 | let context = device.create_context(); 178 | let mut mem = DeviceMemory::<i32>::zeros(&context, 12); 179 | let sl = mem.as_mut_slice(); 180 | sl[0] = 3; // test if accessible from host 181 | assert_eq!(sl.num_elem(), 12); 182 | Ok(()) 183 | } 184 | 185 | #[should_panic(expected = "Zero-sized malloc is forbidden")] 186 | #[test] 187 | fn device_new_zero() { 188 | let device = Device::nth(0).unwrap(); 189 | let context = device.create_context(); 190 | let _a = DeviceMemory::<i32>::zeros(&context, 0); 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /accel/src/memory/dimension.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | use derive_new::new; 3 | use num_derive::{FromPrimitive, ToPrimitive}; 4 | use num_traits::{ToPrimitive, Zero}; 5 | use std::{fmt::Debug, ops::Add}; 6 | 7 | pub use cuda::CUDA_ARRAY3D_DESCRIPTOR as Descriptor; 8 | 9 | /// This specifies the number of packed elements per "CUDA array element". 10 | /// 11 | /// - The CUDA array element approach is useful e.g. for the [RGBA color model], 12 | /// which stores 4 values at each pixel of an image. 13 | /// - For example, when `T=f32` and `NumChannels::Two`, 14 | /// a "CUDA array element" is 64 bits wide, packing two 32-bit float values. 15 | /// - We call `T` the element type, although a "CUDA array element" actually represents `[T; num_channels]`. 16 | /// `Memory::num_elem()` returns how many `T` exist in this array. 
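/// - A worked example of the arithmetic (following `Ix1::len()` and the
///   `WidthInBytes` computation in `array.rs`): with `T=f32`, `NumChannels::Four`
///   and `width = 10`, one "CUDA array element" is 4 * 4 = 16 bytes, `len()` is
///   10 * 4 = 40 elements of `T`, and a memcpy covers 10 * 4 * 4 = 160 bytes.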
17 | /// 18 | /// [RGBA color model]: https://en.wikipedia.org/wiki/RGBA_color_model 19 | #[repr(u32)] 20 | #[derive(Debug, Clone, Copy, PartialEq, PartialOrd, FromPrimitive, ToPrimitive)] 21 | pub enum NumChannels { 22 | /// Single element in each "CUDA Array element" 23 | One = 1, 24 | /// Two scalars in each CUDA Array element 25 | Two = 2, 26 | /// Four scalars in each CUDA Array element 27 | Four = 4, 28 | } 29 | 30 | impl Default for NumChannels { 31 | fn default() -> Self { 32 | NumChannels::One 33 | } 34 | } 35 | 36 | pub trait Dimension: Zero + Debug + Clone + Copy + PartialEq + Send + Sync + 'static { 37 | fn as_descriptor(&self) -> Descriptor; 38 | 39 | /// Number of elements 40 | fn len(&self) -> usize; 41 | 42 | /// Get number of element `T` in each "CUDA Array element" 43 | fn num_channels(&self) -> NumChannels; 44 | 45 | fn width(&self) -> usize { 46 | self.as_descriptor::().Width 47 | } 48 | 49 | fn height(&self) -> usize { 50 | std::cmp::max(self.as_descriptor::().Height, 1) 51 | } 52 | 53 | fn depth(&self) -> usize { 54 | std::cmp::max(self.as_descriptor::().Depth, 1) 55 | } 56 | } 57 | 58 | /// Spec of 1D Array 59 | #[derive(Debug, Clone, Copy, PartialEq, new)] 60 | pub struct Ix1 { 61 | pub width: usize, 62 | #[new(default)] 63 | pub num_channels: NumChannels, 64 | } 65 | 66 | impl From for Ix1 { 67 | fn from(width: usize) -> Ix1 { 68 | Ix1 { 69 | width, 70 | num_channels: NumChannels::One, 71 | } 72 | } 73 | } 74 | 75 | impl From<(usize,)> for Ix1 { 76 | fn from((width,): (usize,)) -> Ix1 { 77 | Ix1 { 78 | width, 79 | num_channels: NumChannels::One, 80 | } 81 | } 82 | } 83 | 84 | impl Add for Ix1 { 85 | type Output = Self; 86 | fn add(self, other: Self) -> Self { 87 | assert_eq!(self.num_channels, other.num_channels); 88 | Self { 89 | width: self.width + other.width, 90 | num_channels: self.num_channels, 91 | } 92 | } 93 | } 94 | 95 | impl Zero for Ix1 { 96 | fn zero() -> Self { 97 | Ix1::new(0) 98 | } 99 | 100 | fn is_zero(&self) -> bool { 101 | self.len() == 0 102 | } 103 | } 104 | 105 | impl Dimension for Ix1 { 106 | fn as_descriptor(&self) -> Descriptor { 107 | Descriptor { 108 | Width: self.width, 109 | Height: 0, 110 | Depth: 0, 111 | NumChannels: self.num_channels.to_u32().unwrap(), 112 | Flags: ArrayFlag::empty().bits(), 113 | Format: T::format(), 114 | } 115 | } 116 | 117 | fn len(&self) -> usize { 118 | self.width * self.num_channels.to_usize().unwrap() 119 | } 120 | 121 | fn num_channels(&self) -> NumChannels { 122 | self.num_channels 123 | } 124 | } 125 | 126 | /// Spec of 2D Array 127 | #[derive(Debug, Clone, Copy, PartialEq, new)] 128 | pub struct Ix2 { 129 | pub width: usize, 130 | pub height: usize, 131 | #[new(default)] 132 | pub num_channels: NumChannels, 133 | } 134 | 135 | impl From<(usize, usize)> for Ix2 { 136 | fn from((width, height): (usize, usize)) -> Ix2 { 137 | Ix2 { 138 | width, 139 | height, 140 | num_channels: NumChannels::One, 141 | } 142 | } 143 | } 144 | 145 | impl Add for Ix2 { 146 | type Output = Self; 147 | fn add(self, other: Self) -> Self { 148 | assert_eq!(self.num_channels, other.num_channels); 149 | Self { 150 | width: self.width + other.width, 151 | height: self.height + other.height, 152 | num_channels: self.num_channels, 153 | } 154 | } 155 | } 156 | 157 | impl Zero for Ix2 { 158 | fn zero() -> Self { 159 | Ix2::new(0, 0) 160 | } 161 | 162 | fn is_zero(&self) -> bool { 163 | self.len() == 0 164 | } 165 | } 166 | 167 | impl Dimension for Ix2 { 168 | fn as_descriptor(&self) -> Descriptor { 169 | Descriptor { 170 | Width: 
self.width, 171 | Height: self.height, 172 | Depth: 0, 173 | NumChannels: self.num_channels.to_u32().unwrap(), 174 | Flags: ArrayFlag::empty().bits(), 175 | Format: T::format(), 176 | } 177 | } 178 | 179 | fn len(&self) -> usize { 180 | self.width * self.height * self.num_channels.to_usize().unwrap() 181 | } 182 | 183 | fn num_channels(&self) -> NumChannels { 184 | self.num_channels 185 | } 186 | } 187 | 188 | /// Spec of 3D Array 189 | #[derive(Debug, Clone, Copy, PartialEq, new)] 190 | pub struct Ix3 { 191 | pub width: usize, 192 | pub height: usize, 193 | pub depth: usize, 194 | #[new(default)] 195 | pub num_channels: NumChannels, 196 | } 197 | 198 | impl From<(usize, usize, usize)> for Ix3 { 199 | fn from((width, height, depth): (usize, usize, usize)) -> Ix3 { 200 | Ix3 { 201 | width, 202 | height, 203 | depth, 204 | num_channels: NumChannels::One, 205 | } 206 | } 207 | } 208 | 209 | impl Add for Ix3 { 210 | type Output = Self; 211 | fn add(self, other: Self) -> Self { 212 | assert_eq!(self.num_channels, other.num_channels); 213 | Self { 214 | width: self.width + other.width, 215 | height: self.height + other.height, 216 | depth: self.depth + other.depth, 217 | num_channels: self.num_channels, 218 | } 219 | } 220 | } 221 | 222 | impl Zero for Ix3 { 223 | fn zero() -> Self { 224 | Ix3::new(0, 0, 0) 225 | } 226 | 227 | fn is_zero(&self) -> bool { 228 | self.len() == 0 229 | } 230 | } 231 | 232 | impl Dimension for Ix3 { 233 | fn as_descriptor(&self) -> Descriptor { 234 | Descriptor { 235 | Width: self.width, 236 | Height: self.height, 237 | Depth: self.depth, 238 | NumChannels: self.num_channels.to_u32().unwrap(), 239 | Flags: ArrayFlag::empty().bits(), 240 | Format: T::format(), 241 | } 242 | } 243 | 244 | fn len(&self) -> usize { 245 | self.width * self.height * self.depth * self.num_channels().to_usize().unwrap() 246 | } 247 | 248 | fn num_channels(&self) -> NumChannels { 249 | self.num_channels 250 | } 251 | } 252 | 253 | /// Spec of Layered 1D Array 254 | #[derive(Debug, Clone, Copy, PartialEq, new)] 255 | pub struct Ix1Layered { 256 | /// Width of each layer 257 | pub width: usize, 258 | /// Depth of layer 259 | pub depth: usize, 260 | #[new(default)] 261 | pub num_channels: NumChannels, 262 | } 263 | 264 | impl From<(usize, usize)> for Ix1Layered { 265 | fn from((width, depth): (usize, usize)) -> Ix1Layered { 266 | Ix1Layered { 267 | width, 268 | depth, 269 | num_channels: NumChannels::One, 270 | } 271 | } 272 | } 273 | 274 | impl Add for Ix1Layered { 275 | type Output = Self; 276 | fn add(self, other: Self) -> Self { 277 | assert_eq!(self.num_channels, other.num_channels); 278 | Self { 279 | width: self.width + other.width, 280 | depth: self.depth + other.depth, 281 | num_channels: self.num_channels, 282 | } 283 | } 284 | } 285 | 286 | impl Zero for Ix1Layered { 287 | fn zero() -> Self { 288 | Self::new(0, 0) 289 | } 290 | 291 | fn is_zero(&self) -> bool { 292 | self.len() == 0 293 | } 294 | } 295 | 296 | impl Dimension for Ix1Layered { 297 | fn as_descriptor(&self) -> Descriptor { 298 | Descriptor { 299 | Width: self.width, 300 | Height: 0, 301 | Depth: self.depth, 302 | NumChannels: self.num_channels.to_u32().unwrap(), 303 | Flags: ArrayFlag::LAYERED.bits(), 304 | Format: T::format(), 305 | } 306 | } 307 | 308 | fn len(&self) -> usize { 309 | self.width * self.depth * self.num_channels.to_usize().unwrap() 310 | } 311 | 312 | fn num_channels(&self) -> NumChannels { 313 | self.num_channels 314 | } 315 | } 316 | 317 | /// Spec of Layered 2D Array 318 | #[derive(Debug, Clone, Copy, 
PartialEq, new)] 319 | pub struct Ix2Layered { 320 | /// Width of each layer 321 | pub width: usize, 322 | /// height of each layer 323 | pub height: usize, 324 | /// Depth of layer 325 | pub depth: usize, 326 | #[new(default)] 327 | pub num_channels: NumChannels, 328 | } 329 | 330 | impl From<(usize, usize, usize)> for Ix2Layered { 331 | fn from((width, height, depth): (usize, usize, usize)) -> Ix2Layered { 332 | Ix2Layered { 333 | width, 334 | height, 335 | depth, 336 | num_channels: NumChannels::One, 337 | } 338 | } 339 | } 340 | 341 | impl Add for Ix2Layered { 342 | type Output = Self; 343 | fn add(self, other: Self) -> Self { 344 | assert_eq!(self.num_channels, other.num_channels); 345 | Self { 346 | width: self.width + other.width, 347 | height: self.height + other.height, 348 | depth: self.depth + other.depth, 349 | num_channels: self.num_channels, 350 | } 351 | } 352 | } 353 | 354 | impl Zero for Ix2Layered { 355 | fn zero() -> Self { 356 | Self::new(0, 0, 0) 357 | } 358 | 359 | fn is_zero(&self) -> bool { 360 | self.len() == 0 361 | } 362 | } 363 | 364 | impl Dimension for Ix2Layered { 365 | fn as_descriptor(&self) -> Descriptor { 366 | Descriptor { 367 | Width: self.width, 368 | Height: self.height, 369 | Depth: self.depth, 370 | NumChannels: self.num_channels.to_u32().unwrap(), 371 | Flags: ArrayFlag::LAYERED.bits(), 372 | Format: T::format(), 373 | } 374 | } 375 | 376 | fn len(&self) -> usize { 377 | self.width * self.height * self.depth * self.num_channels.to_usize().unwrap() 378 | } 379 | 380 | fn num_channels(&self) -> NumChannels { 381 | self.num_channels 382 | } 383 | } 384 | 385 | bitflags::bitflags! { 386 | pub struct ArrayFlag: u32 { 387 | /// If set, the CUDA array is a collection of layers, where each layer is either a 1D or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number of layers, not the depth of a 3D array. 388 | const LAYERED = 0x01; 389 | /// This flag must be set in order to bind a surface reference to the CUDA array 390 | const SURFACE_LDST = 0x02; 391 | /// If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The width of such a CUDA array must be equal to its height, and Depth must be six. If CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps and Depth must be a multiple of six. 392 | const CUBEMAP = 0x04; 393 | /// This flag must be set in order to perform texture gather operations on a CUDA array. 394 | const TEXTURE_GATHER = 0x08; 395 | /// This flag if set indicates that the CUDA array is a DEPTH_TEXTURE. 
396 | const DEPTH_TEXTURE = 0x10; 397 | /// This flag indicates that the CUDA array may be bound as a color target in an external graphics API 398 | const COLOR_ATTACHMENT = 0x20; 399 | } 400 | } 401 | -------------------------------------------------------------------------------- /accel/src/memory/info.rs: -------------------------------------------------------------------------------- 1 | use crate::{contexted_call, device::*}; 2 | use cuda::*; 3 | 4 | /// Total and Free memory size of the device (in bytes) 5 | #[derive(Debug, Clone, Copy, PartialEq)] 6 | struct MemoryInfo { 7 | free: usize, 8 | total: usize, 9 | } 10 | 11 | impl MemoryInfo { 12 | fn get(ctx: Context) -> Self { 13 | let mut free = 0; 14 | let mut total = 0; 15 | unsafe { 16 | contexted_call!( 17 | &ctx, 18 | cuMemGetInfo_v2, 19 | &mut free as *mut usize, 20 | &mut total as *mut usize 21 | ) 22 | } 23 | .expect("Cannot get memory info"); 24 | MemoryInfo { free, total } 25 | } 26 | } 27 | 28 | /// Get total memory size in bytes of the current device 29 | /// 30 | /// Panic 31 | /// ------ 32 | /// - when given context is not current 33 | pub fn total_memory(ctx: Context) -> usize { 34 | MemoryInfo::get(ctx).total 35 | } 36 | 37 | /// Get free memory size in bytes of the current device 38 | /// 39 | /// Panic 40 | /// ------ 41 | /// - when given context is not current 42 | pub fn free_memory(ctx: Context) -> usize { 43 | MemoryInfo::get(ctx).free 44 | } 45 | 46 | #[cfg(test)] 47 | mod tests { 48 | use super::*; 49 | use crate::error::*; 50 | 51 | #[test] 52 | fn info() -> Result<()> { 53 | let device = Device::nth(0)?; 54 | let ctx = device.create_context(); 55 | let mem_info = MemoryInfo::get(ctx); 56 | dbg!(&mem_info); 57 | assert!(mem_info.free > 0); 58 | assert!(mem_info.total > mem_info.free); 59 | Ok(()) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /accel/src/memory/mod.rs: -------------------------------------------------------------------------------- 1 | //! Memory management 2 | //! 3 | //! Unified address 4 | //! --------------- 5 | //! 6 | //! - All memories are mapped into a single 64bit memory space 7 | //! - We can get where the pointed memory exists from its value. 8 | //! 9 | //! Memory Types 10 | //! ------------ 11 | //! 12 | //! |name | where exists | From Host | From Device | As slice | Description | 13 | //! |:--------------------|:------------:|:---------:|:-----------:|:--------:|:-----------------------------------------------------------------------| 14 | //! | (usual) Host memory | Host | ✓ | - | ✓ | allocated by usual manner, e.g. `vec![0; n]` | 15 | //! | [RegisteredMemory] | Host | ✓ | ✓ | ✓ | A host memory registered into CUDA memory management system | 16 | //! | [PageLockedMemory] | Host | ✓ | ✓ | ✓ | OS memory paging is disabled for accelerating memory transfer | 17 | //! | [DeviceMemory] | Device | ✓ | ✓ | ✓ | allocated on device as a single span | 18 | //! | [Array] | Device | ✓ | ✓ | - | properly aligned memory on device for using Texture and Surface memory | 19 | //! 20 | //! Traits 21 | //! ------- 22 | //! 23 | //! |traits |`[T]`|[RegisteredMemory]|[PageLockedMemory]|[DeviceMemory]|[Array]| Description | 24 | //! |:------------|:---:|:----------------:|:----------------:|:------------:|:-----:|:-------------------------------------------| 25 | //! |[Memory] | ✓ | ✓ | ✓ | ✓ | ✓ | Has Unified address and element size | 26 | //! |[Contexted] | - | ✓ | ✓ | ✓ | ✓ | with CUDA Context | 27 | //! 
|[Continuous] | ✓ | ✓ | ✓ | ✓ | - | Can be treated as a Rust slice | 28 | //! |[Allocatable]| - | - | ✓ | ✓ | ✓ | Newly allocatable with its shape and value | 29 | //! 30 | //! [RegisteredMemory]: ./struct.RegisteredMemory.html 31 | //! [PageLockedMemory]: ./struct.PageLockedMemory.html 32 | //! [DeviceMemory]: ./struct.DeviceMemory.html 33 | //! [Array]: ./struct.Array.html 34 | //! 35 | //! [Memory]: ./trait.Memory.html 36 | //! [Memset]: ./trait.Memset.html 37 | //! [Contexted]: ../device/trait.Contexted.html 38 | //! [Continuous]: ./trait.Continuous.html 39 | //! [Allocatable]: ./trait.Allocatable.html 40 | 41 | mod array; 42 | mod device; 43 | mod dimension; 44 | mod info; 45 | mod page_locked; 46 | mod registered; 47 | mod scalar; 48 | mod slice; 49 | 50 | pub use array::*; 51 | pub use device::*; 52 | pub use dimension::*; 53 | pub use info::*; 54 | pub use page_locked::*; 55 | pub use registered::*; 56 | pub use scalar::*; 57 | 58 | use crate::*; 59 | use cuda::*; 60 | use futures::future::BoxFuture; 61 | use num_traits::Zero; 62 | use std::{ffi::c_void, mem::MaybeUninit}; 63 | 64 | /// Memory type 65 | /// 66 | /// Because of [unified addressing], we can get the memory type after casted into slice: 67 | /// 68 | /// - [DeviceMemory] 69 | /// 70 | /// ``` 71 | /// # use accel::{*, memory::*}; 72 | /// # let device = Device::nth(0).unwrap(); 73 | /// # let ctx = device.create_context(); 74 | /// let mem = DeviceMemory::::zeros(&ctx, 12); 75 | /// let sl = mem.as_slice(); 76 | /// assert_eq!(sl.memory_type(), MemoryType::Device); 77 | /// ``` 78 | /// 79 | /// - [PageLockedMemory] 80 | /// 81 | /// ``` 82 | /// # use accel::{*, memory::*}; 83 | /// # let device = Device::nth(0).unwrap(); 84 | /// # let ctx = device.create_context(); 85 | /// let mem = PageLockedMemory::::zeros(&ctx, 12); 86 | /// let sl = mem.as_slice(); 87 | /// assert_eq!(sl.memory_type(), MemoryType::PageLocked); 88 | /// ``` 89 | /// 90 | /// - [RegisteredMemory] 91 | /// - Be sure that [RegisteredMemory] and [PageLockedMemory] are indistinguishable 92 | /// 93 | /// ``` 94 | /// # use accel::{*, memory::*}; 95 | /// # let device = Device::nth(0).unwrap(); 96 | /// # let ctx = device.create_context(); 97 | /// let mut a = vec![0_i32; 12]; 98 | /// let mem = RegisteredMemory::::new(&ctx, &mut a); 99 | /// let sl = mem.as_slice(); 100 | /// assert_eq!(sl.memory_type(), MemoryType::PageLocked); 101 | /// ``` 102 | /// 103 | /// - [Array] cannot be casted into a slice 104 | /// 105 | /// [unified addressing]: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__UNIFIED.html#group__CUDA__UNIFIED 106 | /// [Array]: ./struct.Array.html 107 | /// [DeviceMemory]: ./struct.DeviceMemory.html 108 | /// [RegisteredMemory]: ./struct.RegisteredMemory.html 109 | /// [PageLockedMemory]: ./struct.PageLockedMemory.html 110 | /// 111 | #[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] 112 | pub enum MemoryType { 113 | /// Host memory **not** managed by CUDA memory system 114 | Host, 115 | /// Host memory managed by CUDA memory system, i.e. 116 | /// [RegisteredMemory](./struct.RegisteredMemory.html), and 117 | /// [PageLockedMemory](./struct.PageLockedMemory.html) 118 | PageLocked, 119 | /// Device memory 120 | Device, 121 | /// Array memory 122 | Array, 123 | } 124 | 125 | /// Has unique head address and allocated size. 
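///
/// A short sketch of the accessors below, using the `impl Memory for [T]` from
/// `memory/slice.rs` (plain host slices need no GPU for these calls):
///
/// ```
/// # use accel::{*, memory::*};
/// let a = vec![0_u32; 12];
/// assert_eq!(a.as_slice().num_elem(), 12);
/// assert_eq!(a.as_slice().head_addr(), a.as_ptr());
/// ```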
126 | pub trait Memory { 127 | /// Scalar type of each element 128 | type Elem: Scalar; 129 | 130 | /// Get head address of the memory as a const pointer 131 | fn head_addr(&self) -> *const Self::Elem; 132 | 133 | /// Get head address of the memory as a mutable pointer 134 | fn head_addr_mut(&mut self) -> *mut Self::Elem; 135 | 136 | /// Number of elements 137 | fn num_elem(&self) -> usize; 138 | 139 | /// Get memory type; see [MemoryType](./enum.MemoryType.html) for details. 140 | fn memory_type(&self) -> MemoryType; 141 | 142 | /// Set all elements by `value` 143 | /// 144 | /// Examples 145 | /// --------- 146 | /// 147 | /// - Set `i32` 148 | /// 149 | /// ``` 150 | /// # use accel::*; 151 | /// # let device = Device::nth(0).unwrap(); 152 | /// # let ctx = device.create_context(); 153 | /// let mut mem = DeviceMemory::<i32>::zeros(&ctx, 12); 154 | /// mem.set(1234); 155 | /// for &val in mem.as_slice() { 156 | /// assert_eq!(val, 1234); 157 | /// } 158 | /// ``` 159 | /// 160 | /// - Set `f32` 161 | /// - Note that `f64` is not supported yet, because CUDA has no 64-bit memset. 162 | /// 163 | /// ``` 164 | /// # use accel::*; 165 | /// # let device = Device::nth(0).unwrap(); 166 | /// # let ctx = device.create_context(); 167 | /// let mut mem = DeviceMemory::<f32>::zeros(&ctx, 12); 168 | /// mem.set(1.0); 169 | /// for &val in mem.as_slice() { 170 | /// assert_eq!(val, 1.0); 171 | /// } 172 | /// ``` 173 | /// 174 | /// - For host memory, `set` is equivalent to `mem.iter_mut().for_each(|v| *v = value)` 175 | /// 176 | /// ``` 177 | /// # use accel::*; 178 | /// # let device = Device::nth(0).unwrap(); 179 | /// # let ctx = device.create_context(); 180 | /// let mut mem = PageLockedMemory::<i32>::zeros(&ctx, 12); 181 | /// mem.set(1234); 182 | /// for &val in mem.as_slice() { 183 | /// assert_eq!(val, 1234); 184 | /// } 185 | /// ``` 186 | fn set(&mut self, value: Self::Elem); 187 | } 188 | 189 | /// Copy data from one to another 190 | pub trait Memcpy<Target: Memory<Elem = Self::Elem> + ?Sized>: Memory { 191 | /// Examples 192 | /// --------- 193 | /// 194 | /// - memcpy from page-locked host memory to device memory 195 | /// 196 | /// ``` 197 | /// # use accel::*; 198 | /// # let device = Device::nth(0).unwrap(); 199 | /// # let ctx = device.create_context(); 200 | /// let mut dest = DeviceMemory::<i32>::zeros(&ctx, 12); 201 | /// let src = PageLockedMemory::<i32>::zeros(&ctx, 12); 202 | /// dest.copy_from(&src); 203 | /// ``` 204 | /// 205 | /// - memcpy from device memory to page-locked host memory 206 | /// 207 | /// ``` 208 | /// # use accel::*; 209 | /// # let device = Device::nth(0).unwrap(); 210 | /// # let ctx = device.create_context(); 211 | /// let mut dest = PageLockedMemory::<i32>::zeros(&ctx, 12); 212 | /// let src = DeviceMemory::<i32>::zeros(&ctx, 12); 213 | /// dest.copy_from(&src); 214 | /// ``` 215 | /// 216 | /// - memcpy from device to device 217 | /// 218 | /// ``` 219 | /// # use accel::*; 220 | /// # let device = Device::nth(0).unwrap(); 221 | /// # let ctx = device.create_context(); 222 | /// let mut dest = DeviceMemory::<i32>::zeros(&ctx, 12); 223 | /// let src = DeviceMemory::<i32>::zeros(&ctx, 12); 224 | /// dest.copy_from(&src); 225 | /// ``` 226 | /// 227 | /// - memcpy from Rust slice to device memory 228 | /// 229 | /// ``` 230 | /// # use accel::*; 231 | /// # use std::ops::DerefMut; 232 | /// # let device = Device::nth(0).unwrap(); 233 | /// # let ctx = device.create_context(); 234 | /// let mut dest = DeviceMemory::<i32>::zeros(&ctx, 12); 235 | /// let src = vec![0_i32; 12]; 236 | /// dest.copy_from(src.as_slice()); // requires 
explicit cast to slice 237 | /// ``` 238 | /// 239 | /// - memcpy from device memory to Rust slice 240 | /// 241 | /// ``` 242 | /// # use accel::*; 243 | /// # let device = Device::nth(0).unwrap(); 244 | /// # let ctx = device.create_context(); 245 | /// let mut dest = vec![0_i32; 12]; 246 | /// let src = DeviceMemory::::zeros(&ctx, 12); 247 | /// dest.copy_from(&src); 248 | /// ``` 249 | /// 250 | /// - Cannot copy between different types 251 | /// 252 | /// ```compile_fail 253 | /// # use accel::*; 254 | /// # let device = Device::nth(0).unwrap(); 255 | /// # let ctx = device.create_context(); 256 | /// let mut dest = DeviceMemory::::zeros(&ctx, 12); 257 | /// let src = PageLockedMemory::::zeros(&ctx, 12); 258 | /// dest.copy_from(&src); // compile fail 259 | /// ``` 260 | /// 261 | /// - Panics if sizes are different 262 | /// 263 | /// ```should_panic 264 | /// # use accel::*; 265 | /// # let device = Device::nth(0).unwrap(); 266 | /// # let ctx = device.create_context(); 267 | /// let mut dest = DeviceMemory::::zeros(&ctx, 24); 268 | /// let src = PageLockedMemory::::zeros(&ctx, 12); 269 | /// dest.copy_from(&src); // will panic 270 | /// ``` 271 | /// 272 | /// Panics 273 | /// ------- 274 | /// 275 | /// - if `self` and `src` are identical 276 | /// - if sizes of memory mismatch 277 | /// 278 | fn copy_from(&mut self, source: &Target); 279 | 280 | /// Copy data in async manner 281 | /// 282 | /// ``` 283 | /// use accel::*; 284 | /// 285 | /// #[tokio::main] 286 | /// async fn main() { 287 | /// let device = Device::nth(0).unwrap(); 288 | /// let ctx = device.create_context(); 289 | /// let mut dest = DeviceMemory::::zeros(&ctx, 12); 290 | /// let src = PageLockedMemory::::zeros(&ctx, 12); 291 | /// dest.copy_from_async(&src).await; 292 | /// } 293 | /// ``` 294 | /// 295 | /// - Arrays are captured until await: 296 | /// 297 | /// ``` 298 | /// # use accel::*; 299 | /// # #[tokio::main] 300 | /// # async fn main() { 301 | /// # let device = Device::nth(0).unwrap(); 302 | /// # let ctx = device.create_context(); 303 | /// # let mut dest = DeviceMemory::::zeros(&ctx, 12); 304 | /// # let src = PageLockedMemory::::zeros(&ctx, 12); 305 | /// let future = dest.copy_from_async(&src); 306 | /// println!("src[0] = {}", src[0]); // Source is always accessible as usual &-reference 307 | /// future.await; 308 | /// # } 309 | /// ``` 310 | /// 311 | /// ```compile_fail 312 | /// # use accel::*; 313 | /// # #[tokio::main] 314 | /// # async fn main() { 315 | /// # let device = Device::nth(0).unwrap(); 316 | /// # let ctx = device.create_context(); 317 | /// # let mut dest = DeviceMemory::::zeros(&ctx, 12); 318 | /// # let src = PageLockedMemory::::zeros(&ctx, 12); 319 | /// let future = dest.copy_from_async(&src); 320 | /// println!("dest[0] = {}", dest[0]); // Destination is not accessible until .await 321 | /// future.await; 322 | /// # } 323 | /// ``` 324 | fn copy_from_async<'a>(&'a mut self, src: &'a Target) -> BoxFuture<'a, ()>; 325 | } 326 | 327 | /// Allocatable memories with CUDA context 328 | pub trait Allocatable: Contexted + Memory + Sized { 329 | /// Shape for initialization 330 | type Shape: Zero; 331 | 332 | /// Allocate a memory without initialization 333 | /// 334 | /// Safety 335 | /// ------ 336 | /// - Cause undefined behavior when read before write 337 | /// 338 | /// Panic 339 | /// ------ 340 | /// - if shape is zero 341 | unsafe fn uninitialized(ctx: &Context, shape: Self::Shape) -> Self; 342 | 343 | /// uniformly initialized 344 | /// 345 | /// Panic 346 | /// ------ 
347 | /// - if shape is zero 348 | fn from_elem(ctx: &Context, shape: Self::Shape, elem: Self::Elem) -> Self { 349 | let mut mem = unsafe { Self::uninitialized(ctx, shape) }; 350 | mem.set(elem); 351 | mem 352 | } 353 | 354 | /// uniformly initialized by zero 355 | /// 356 | /// Panic 357 | /// ------ 358 | /// - if shape is zero 359 | fn zeros(ctx: &Context, shape: Self::Shape) -> Self { 360 | Self::from_elem(ctx, shape, ::zero()) 361 | } 362 | } 363 | 364 | /// Memory which has continuous 1D index, i.e. can be treated as a Rust slice 365 | pub trait Continuous: Memory { 366 | fn as_slice(&self) -> &[Self::Elem]; 367 | fn as_mut_slice(&mut self) -> &mut [Self::Elem]; 368 | } 369 | -------------------------------------------------------------------------------- /accel/src/memory/page_locked.rs: -------------------------------------------------------------------------------- 1 | //! Device and Host memory handlers 2 | 3 | use super::*; 4 | use crate::{error::Result, *}; 5 | use cuda::*; 6 | use std::{ 7 | fmt, 8 | ops::{Deref, DerefMut}, 9 | }; 10 | 11 | /// Host memory as page-locked. 12 | /// 13 | /// Allocating excessive amounts of pinned memory may degrade system performance, 14 | /// since it reduces the amount of memory available to the system for paging. 15 | /// As a result, this function is best used sparingly to allocate staging areas for data exchange between host and device. 16 | /// 17 | /// See also [cuMemAllocHost]. 18 | /// 19 | /// [cuMemAllocHost]: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0 20 | #[derive(Contexted)] 21 | pub struct PageLockedMemory { 22 | ptr: *mut T, 23 | size: usize, 24 | context: Context, 25 | } 26 | 27 | unsafe impl Sync for PageLockedMemory {} 28 | unsafe impl Send for PageLockedMemory {} 29 | 30 | impl Drop for PageLockedMemory { 31 | fn drop(&mut self) { 32 | if let Err(e) = unsafe { contexted_call!(self, cuMemFreeHost, self.ptr as *mut _) } { 33 | log::error!("Cannot free page-locked memory: {:?}", e); 34 | } 35 | } 36 | } 37 | 38 | impl fmt::Debug for PageLockedMemory { 39 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 40 | f.debug_struct("PageLockedMemory") 41 | .field("context", &self.context) 42 | .field("data", &self.as_slice()) 43 | .finish() 44 | } 45 | } 46 | 47 | impl Deref for PageLockedMemory { 48 | type Target = [T]; 49 | fn deref(&self) -> &[T] { 50 | unsafe { std::slice::from_raw_parts(self.ptr as _, self.size) } 51 | } 52 | } 53 | 54 | impl DerefMut for PageLockedMemory { 55 | fn deref_mut(&mut self) -> &mut [T] { 56 | unsafe { std::slice::from_raw_parts_mut(self.ptr, self.size) } 57 | } 58 | } 59 | 60 | impl PartialEq for PageLockedMemory { 61 | fn eq(&self, other: &Self) -> bool { 62 | self.as_slice().eq(other.as_slice()) 63 | } 64 | } 65 | 66 | impl PartialEq<[T]> for PageLockedMemory { 67 | fn eq(&self, other: &[T]) -> bool { 68 | self.as_slice().eq(other) 69 | } 70 | } 71 | 72 | impl Memory for PageLockedMemory { 73 | type Elem = T; 74 | fn head_addr(&self) -> *const T { 75 | self.ptr as _ 76 | } 77 | 78 | fn head_addr_mut(&mut self) -> *mut T { 79 | self.ptr as _ 80 | } 81 | 82 | fn num_elem(&self) -> usize { 83 | self.size 84 | } 85 | 86 | fn memory_type(&self) -> MemoryType { 87 | MemoryType::PageLocked 88 | } 89 | 90 | fn set(&mut self, value: Self::Elem) { 91 | self.iter_mut().for_each(|v| *v = value); 92 | } 93 | } 94 | 95 | impl Continuous for PageLockedMemory { 96 | fn as_slice(&self) -> &[T] { 97 | self 98 | } 99 | fn 
as_mut_slice(&mut self) -> &mut [T] { 100 | self 101 | } 102 | } 103 | 104 | impl Allocatable for PageLockedMemory { 105 | type Shape = usize; 106 | unsafe fn uninitialized(context: &Context, size: usize) -> Self { 107 | assert!(size > 0, "Zero-sized malloc is forbidden"); 108 | let ptr = contexted_new!(context, cuMemAllocHost_v2, size * std::mem::size_of::()) 109 | .expect("Cannot allocate page-locked memory"); 110 | Self { 111 | ptr: ptr as *mut T, 112 | size, 113 | context: context.clone(), 114 | } 115 | } 116 | } 117 | 118 | impl<'arg, T: Scalar> DeviceSend for &'arg PageLockedMemory { 119 | type Target = *const T; 120 | fn as_kernel_parameter(&self) -> *mut c_void { 121 | &self.ptr as *const *mut T as *mut c_void 122 | } 123 | } 124 | 125 | impl<'arg, T: Scalar> DeviceSend for &'arg mut PageLockedMemory { 126 | type Target = *mut T; 127 | fn as_kernel_parameter(&self) -> *mut c_void { 128 | &self.ptr as *const *mut T as *mut c_void 129 | } 130 | } 131 | 132 | #[cfg(test)] 133 | mod tests { 134 | use super::*; 135 | 136 | #[test] 137 | fn as_mut_slice() -> Result<()> { 138 | let device = Device::nth(0)?; 139 | let context = device.create_context(); 140 | let mut mem = PageLockedMemory::::zeros(&context, 12); 141 | let sl = mem.as_mut_slice(); 142 | 143 | sl[0] = 3; // test if accessible 144 | assert_eq!(sl.num_elem(), 12); 145 | Ok(()) 146 | } 147 | 148 | #[should_panic(expected = "Zero-sized malloc is forbidden")] 149 | #[test] 150 | fn page_locked_new_zero() { 151 | let device = Device::nth(0).unwrap(); 152 | let context = device.create_context(); 153 | let _a = PageLockedMemory::::zeros(&context, 0); 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /accel/src/memory/registered.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | use crate::{error::Result, *}; 3 | use cuda::*; 4 | use std::{ 5 | ffi::c_void, 6 | ops::{Deref, DerefMut}, 7 | }; 8 | 9 | #[derive(Contexted, Debug)] 10 | pub struct RegisteredMemory<'a, T> { 11 | context: Context, 12 | data: &'a mut [T], 13 | } 14 | 15 | unsafe impl Sync for RegisteredMemory<'_, T> {} 16 | unsafe impl Send for RegisteredMemory<'_, T> {} 17 | 18 | impl Deref for RegisteredMemory<'_, T> { 19 | type Target = [T]; 20 | fn deref(&self) -> &[T] { 21 | self.data 22 | } 23 | } 24 | 25 | impl DerefMut for RegisteredMemory<'_, T> { 26 | fn deref_mut(&mut self) -> &mut [T] { 27 | self.data 28 | } 29 | } 30 | 31 | impl PartialEq for RegisteredMemory<'_, T> { 32 | fn eq(&self, other: &Self) -> bool { 33 | self.as_slice().eq(other.as_slice()) 34 | } 35 | } 36 | 37 | impl PartialEq<[T]> for RegisteredMemory<'_, T> { 38 | fn eq(&self, other: &[T]) -> bool { 39 | self.as_slice().eq(other) 40 | } 41 | } 42 | 43 | impl Drop for RegisteredMemory<'_, T> { 44 | fn drop(&mut self) { 45 | if let Err(e) = unsafe { 46 | contexted_call!( 47 | &self.context, 48 | cuMemHostUnregister, 49 | self.data.as_mut_ptr() as *mut c_void 50 | ) 51 | } { 52 | log::error!("Failed to unregister memory: {:?}", e); 53 | } 54 | } 55 | } 56 | 57 | impl<'a, T: Scalar> RegisteredMemory<'a, T> { 58 | pub fn new(context: &Context, data: &'a mut [T]) -> Self { 59 | unsafe { 60 | contexted_call!( 61 | context, 62 | cuMemHostRegister_v2, 63 | data.as_mut_ptr() as *mut c_void, 64 | data.len() * T::size_of(), 65 | 0 66 | ) 67 | } 68 | .expect("Failed to register host memory into CUDA memory system"); 69 | Self { 70 | context: context.clone(), 71 | data, 72 | } 73 | } 74 | } 75 | 76 | impl 
Memory for RegisteredMemory<'_, T> { 77 | type Elem = T; 78 | 79 | fn head_addr(&self) -> *const T { 80 | self.data.as_ptr() 81 | } 82 | 83 | fn head_addr_mut(&mut self) -> *mut T { 84 | self.data.as_mut_ptr() 85 | } 86 | 87 | fn num_elem(&self) -> usize { 88 | self.data.len() 89 | } 90 | 91 | fn memory_type(&self) -> MemoryType { 92 | MemoryType::Host 93 | } 94 | 95 | fn set(&mut self, value: Self::Elem) { 96 | self.iter_mut().for_each(|v| *v = value); 97 | } 98 | } 99 | 100 | impl Continuous for RegisteredMemory<'_, T> { 101 | fn as_slice(&self) -> &[T] { 102 | self 103 | } 104 | fn as_mut_slice(&mut self) -> &mut [T] { 105 | self 106 | } 107 | } 108 | 109 | impl<'arg, 'a: 'arg, T: Scalar> DeviceSend for &'arg RegisteredMemory<'a, T> { 110 | type Target = *const T; 111 | fn as_kernel_parameter(&self) -> *mut c_void { 112 | self.data.as_kernel_parameter() 113 | } 114 | } 115 | 116 | impl<'arg, 'a: 'arg, T: Scalar> DeviceSend for &'arg mut RegisteredMemory<'a, T> { 117 | type Target = *mut T; 118 | fn as_kernel_parameter(&self) -> *mut c_void { 119 | self.data.as_kernel_parameter() 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /accel/src/memory/scalar.rs: -------------------------------------------------------------------------------- 1 | pub use cuda::CUarray_format as ArrayFormatTag; 2 | use num_traits::Num; 3 | 4 | pub trait Scalar: Num + std::fmt::Debug + Copy + Send + Sync { 5 | fn format() -> ArrayFormatTag; 6 | 7 | fn size_of() -> usize { 8 | std::mem::size_of::() 9 | } 10 | 11 | /// Get little endian format in u8 12 | fn to_le_u8(self) -> Option { 13 | assert_ne!(Self::size_of(), u8::size_of()); 14 | None 15 | } 16 | 17 | /// Get little endian format in u16 18 | fn to_le_u16(self) -> Option { 19 | assert_ne!(Self::size_of(), u16::size_of()); 20 | None 21 | } 22 | 23 | /// Get little endian format in u32 24 | fn to_le_u32(self) -> Option { 25 | assert_ne!(Self::size_of(), u32::size_of()); 26 | None 27 | } 28 | } 29 | 30 | macro_rules! impl_array_scalar { 31 | ($scalar:ty, $le:ty, $format:ident) => { 32 | impl Scalar for $scalar { 33 | fn format() -> ArrayFormatTag { 34 | ArrayFormatTag::$format 35 | } 36 | paste::item! 
{ 37 | fn [< to_le_ $le >](self) -> Option<$le> { 38 | assert_eq!(Self::size_of(), <$le>::size_of()); 39 | Some(<$le>::from_le_bytes(self.to_le_bytes())) 40 | } 41 | } 42 | } 43 | }; 44 | } 45 | 46 | impl_array_scalar!(u8, u8, CU_AD_FORMAT_UNSIGNED_INT8); 47 | impl_array_scalar!(u16, u16, CU_AD_FORMAT_UNSIGNED_INT16); 48 | impl_array_scalar!(u32, u32, CU_AD_FORMAT_UNSIGNED_INT32); 49 | impl_array_scalar!(i8, u8, CU_AD_FORMAT_SIGNED_INT8); 50 | impl_array_scalar!(i16, u16, CU_AD_FORMAT_SIGNED_INT16); 51 | impl_array_scalar!(i32, u32, CU_AD_FORMAT_SIGNED_INT32); 52 | // FIXME f16 is not supported yet 53 | // impl_array_scalar!(f16, u16, CU_AD_FORMAT_HALF); 54 | impl_array_scalar!(f32, u32, CU_AD_FORMAT_FLOAT); 55 | -------------------------------------------------------------------------------- /accel/src/memory/slice.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | /// Typed wrapper of cuPointerGetAttribute 4 | fn get_attr(ptr: *const T, attr: CUpointer_attribute) -> error::Result { 5 | let mut data = MaybeUninit::::uninit(); 6 | unsafe { 7 | ffi_call!( 8 | cuPointerGetAttribute, 9 | data.as_mut_ptr() as *mut c_void, 10 | attr, 11 | ptr as CUdeviceptr 12 | )?; 13 | Ok(data.assume_init()) 14 | } 15 | } 16 | 17 | /// Determine actual memory type dynamically 18 | /// 19 | /// Because `Continuous` memories can be treated as a slice, 20 | /// input slice may represents any type of memory. 21 | fn memory_type(ptr: *const T) -> MemoryType { 22 | match get_attr(ptr, CUpointer_attribute::CU_POINTER_ATTRIBUTE_MEMORY_TYPE) { 23 | Ok(CUmemorytype_enum::CU_MEMORYTYPE_HOST) => MemoryType::PageLocked, 24 | Ok(CUmemorytype_enum::CU_MEMORYTYPE_DEVICE) => MemoryType::Device, 25 | Ok(CUmemorytype_enum::CU_MEMORYTYPE_ARRAY) => MemoryType::Array, 26 | Ok(CUmemorytype_enum::CU_MEMORYTYPE_UNIFIED) => { 27 | unreachable!("CU_POINTER_ATTRIBUTE_MEMORY_TYPE never be UNIFED") 28 | } 29 | Err(_) => { 30 | // unmanaged by CUDA memory system, i.e. 
host memory 31 | MemoryType::Host 32 | } 33 | } 34 | } 35 | 36 | fn get_context<T>(ptr: *const T) -> Option<ContextRef> { 37 | let ptr = 38 | get_attr::<_, CUcontext>(ptr, CUpointer_attribute::CU_POINTER_ATTRIBUTE_CONTEXT).ok()?; 39 | Some(ContextRef::from_ptr(ptr)) 40 | } 41 | 42 | impl<T: Scalar> Memory for [T] { 43 | type Elem = T; 44 | fn head_addr(&self) -> *const T { 45 | self.as_ptr() 46 | } 47 | 48 | fn head_addr_mut(&mut self) -> *mut T { 49 | self.as_mut_ptr() 50 | } 51 | 52 | fn num_elem(&self) -> usize { 53 | self.len() 54 | } 55 | 56 | fn memory_type(&self) -> MemoryType { 57 | memory_type(self.as_ptr()) 58 | } 59 | 60 | fn set(&mut self, value: T) { 61 | for val in self { 62 | *val = value; 63 | } 64 | } 65 | } 66 | 67 | impl<T: Scalar> Memcpy<[T]> for [T] { 68 | fn copy_from(&mut self, src: &[T]) { 69 | assert_ne!(self.head_addr(), src.head_addr()); 70 | assert_eq!(self.num_elem(), src.num_elem()); 71 | if let Some(ctx) = get_context(self.head_addr()).or_else(|| get_context(src.head_addr())) { 72 | unsafe { 73 | contexted_call!( 74 | &ctx, 75 | cuMemcpy, 76 | self.head_addr_mut() as CUdeviceptr, 77 | src.as_ptr() as CUdeviceptr, 78 | self.num_elem() * T::size_of() 79 | ) 80 | } 81 | .unwrap() 82 | } else { 83 | self.copy_from_slice(src); 84 | } 85 | } 86 | 87 | fn copy_from_async<'a>(&'a mut self, src: &'a [T]) -> BoxFuture<'a, ()> { 88 | assert_ne!(self.head_addr(), src.head_addr()); 89 | assert_eq!(self.num_elem(), src.num_elem()); 90 | let ctx1 = get_context(self.head_addr()); 91 | let ctx2 = get_context(src.head_addr()); 92 | if let Some(ctx) = ctx1.or(ctx2) { 93 | let stream = stream::Stream::new(ctx); 94 | let byte_count = self.len() * std::mem::size_of::<T>(); 95 | unsafe { 96 | contexted_call!( 97 | &ctx, 98 | cuMemcpyAsync, // cuMemcpyAsync takes (dst, src, ByteCount, hStream): destination first 99 | self.as_mut_ptr() as CUdeviceptr, 100 | src.as_ptr() as CUdeviceptr, 101 | byte_count, 102 | stream.stream 103 | ) 104 | } 105 | .expect("Failed to start async memcpy"); 106 | Box::pin(async { 107 | stream 108 | .into_future() 109 | .await 110 | .expect("Async memcpy thread failed") 111 | }) 112 | } else { 113 | self.copy_from_slice(src); 114 | Box::pin(async {}) 115 | } 116 | } 117 | } 118 | 119 | macro_rules! impl_memcpy_slice { 120 | ($t:path) => { 121 | impl<T: Scalar> Memcpy<[T]> for $t { 122 | fn copy_from(&mut self, src: &[T]) { 123 | self.as_mut_slice().copy_from(src); 124 | } 125 | fn copy_from_async<'a>(&'a mut self, src: &'a [T]) -> BoxFuture<'a, ()> { 126 | self.as_mut_slice().copy_from_async(src) 127 | } 128 | } 129 | 130 | impl<T: Scalar> Memcpy<$t> for [T] { 131 | fn copy_from(&mut self, src: &$t) { 132 | self.copy_from(src.as_slice()); 133 | } 134 | fn copy_from_async<'a>(&'a mut self, src: &'a $t) -> BoxFuture<'a, ()> { 135 | self.copy_from_async(src.as_slice()) 136 | } 137 | } 138 | }; 139 | } 140 | 141 | impl_memcpy_slice!(DeviceMemory::<T>); 142 | impl_memcpy_slice!(PageLockedMemory::<T>); 143 | impl_memcpy_slice!(RegisteredMemory::<'_, T>); 144 | 145 | macro_rules! 
impl_memcpy { 146 | ($from:path, $to:path) => { 147 | impl Memcpy<$from> for $to { 148 | fn copy_from(&mut self, src: &$from) { 149 | self.as_mut_slice().copy_from(src.as_slice()); 150 | } 151 | fn copy_from_async<'a>(&'a mut self, src: &'a $from) -> BoxFuture<'a, ()> { 152 | self.as_mut_slice().copy_from_async(src.as_slice()) 153 | } 154 | } 155 | }; 156 | } 157 | 158 | impl_memcpy!(DeviceMemory::, DeviceMemory::); 159 | impl_memcpy!(DeviceMemory::, RegisteredMemory::<'_, T>); 160 | impl_memcpy!(DeviceMemory::, PageLockedMemory::); 161 | impl_memcpy!(PageLockedMemory::, DeviceMemory::); 162 | impl_memcpy!(PageLockedMemory::, RegisteredMemory::<'_, T>); 163 | impl_memcpy!(PageLockedMemory::, PageLockedMemory::); 164 | impl_memcpy!(RegisteredMemory::<'_, T>, DeviceMemory::); 165 | impl_memcpy!(RegisteredMemory::<'_, T>, RegisteredMemory::<'_, T>); 166 | impl_memcpy!(RegisteredMemory::<'_, T>, PageLockedMemory::); 167 | 168 | impl Continuous for [T] { 169 | fn as_slice(&self) -> &[Self::Elem] { 170 | self 171 | } 172 | 173 | fn as_mut_slice(&mut self) -> &mut [Self::Elem] { 174 | self 175 | } 176 | } 177 | 178 | #[cfg(test)] 179 | mod tests { 180 | use super::*; 181 | 182 | #[test] 183 | fn memory_type_host_vec() -> error::Result<()> { 184 | let a = vec![0_u32; 12]; 185 | assert_eq!(a.as_slice().memory_type(), MemoryType::Host); 186 | assert_eq!(a.as_slice().num_elem(), 12); 187 | Ok(()) 188 | } 189 | 190 | #[test] 191 | fn memory_type_host_vec_with_context() -> error::Result<()> { 192 | let device = Device::nth(0)?; 193 | let _ctx = device.create_context(); 194 | let a = vec![0_u32; 12]; 195 | assert_eq!(a.as_slice().memory_type(), MemoryType::Host); 196 | assert_eq!(a.as_slice().num_elem(), 12); 197 | Ok(()) 198 | } 199 | 200 | #[test] 201 | fn restore_context() -> error::Result<()> { 202 | let device = Device::nth(0)?; 203 | let ctx = device.create_context(); 204 | let a = PageLockedMemory::::zeros(&ctx, 12); 205 | let ctx_ptr = get_context(a.head_addr()).unwrap(); 206 | assert_eq!(*ctx, ctx_ptr); 207 | Ok(()) 208 | } 209 | 210 | #[tokio::test] 211 | async fn memcpy_async_host() { 212 | let a = vec![1_u32; 12]; 213 | let mut b1 = vec![0_u32; 12]; 214 | let mut b2 = vec![0_u32; 12]; 215 | let mut b3 = vec![0_u32; 12]; 216 | let fut1 = b1.copy_from_async(a.as_slice()); 217 | let fut2 = b2.copy_from_async(a.as_slice()); 218 | let fut3 = b3.copy_from_async(a.as_slice()); 219 | fut3.await; 220 | fut2.await; 221 | fut1.await; 222 | assert_eq!(a, b1); 223 | assert_eq!(a, b2); 224 | assert_eq!(a, b3); 225 | } 226 | 227 | #[tokio::test] 228 | async fn memcpy_async_d2h() { 229 | let device = Device::nth(0).unwrap(); 230 | let ctx = device.create_context(); 231 | let a = DeviceMemory::from_elem(&ctx, 12, 1_u32); 232 | let mut b1 = vec![0_u32; 12]; 233 | let mut b2 = vec![0_u32; 12]; 234 | let mut b3 = vec![0_u32; 12]; 235 | let fut1 = b1.copy_from_async(&a); 236 | let fut2 = b2.copy_from_async(&a); 237 | let fut3 = b3.copy_from_async(&a); 238 | fut3.await; 239 | fut2.await; 240 | fut1.await; 241 | assert_eq!(a.as_slice(), b1.as_slice()); 242 | assert_eq!(a.as_slice(), b2.as_slice()); 243 | assert_eq!(a.as_slice(), b3.as_slice()); 244 | } 245 | 246 | #[tokio::test] 247 | async fn memcpy_async_h2d() { 248 | let device = Device::nth(0).unwrap(); 249 | let ctx = device.create_context(); 250 | let a = PageLockedMemory::from_elem(&ctx, 12, 1_u32); 251 | let mut b1 = DeviceMemory::from_elem(&ctx, 12, 0_u32); 252 | let mut b2 = DeviceMemory::from_elem(&ctx, 12, 0_u32); 253 | let mut b3 = 
DeviceMemory::from_elem(&ctx, 12, 0_u32); 254 | let fut1 = b1.copy_from_async(&a); 255 | let fut2 = b2.copy_from_async(&a); 256 | let fut3 = b3.copy_from_async(&a); 257 | fut3.await; 258 | fut2.await; 259 | fut1.await; 260 | assert_eq!(a.as_slice(), b1.as_slice()); 261 | assert_eq!(a.as_slice(), b2.as_slice()); 262 | assert_eq!(a.as_slice(), b3.as_slice()); 263 | } 264 | 265 | #[tokio::test] 266 | async fn memcpy_async_d2d() { 267 | let device = Device::nth(0).unwrap(); 268 | let ctx = device.create_context(); 269 | let a = DeviceMemory::from_elem(&ctx, 12, 1_u32); 270 | let mut b1 = DeviceMemory::from_elem(&ctx, 12, 0_u32); 271 | let mut b2 = DeviceMemory::from_elem(&ctx, 12, 0_u32); 272 | let mut b3 = DeviceMemory::from_elem(&ctx, 12, 0_u32); 273 | let fut1 = b1.copy_from_async(&a); 274 | let fut2 = b2.copy_from_async(&a); 275 | let fut3 = b3.copy_from_async(&a); 276 | fut3.await; 277 | fut2.await; 278 | fut1.await; 279 | assert_eq!(a.as_slice(), b1.as_slice()); 280 | assert_eq!(a.as_slice(), b2.as_slice()); 281 | assert_eq!(a.as_slice(), b3.as_slice()); 282 | } 283 | } 284 | -------------------------------------------------------------------------------- /accel/src/module.rs: -------------------------------------------------------------------------------- 1 | //! CUDA Module (i.e. loaded PTX or cubin) 2 | 3 | use crate::{contexted_call, contexted_new, device::*, error::*, *}; 4 | use cuda::*; 5 | use std::ffi::*; 6 | 7 | /// CUDA Kernel function 8 | #[derive(Debug)] 9 | pub struct Kernel<'module> { 10 | pub(crate) func: CUfunction, 11 | module: &'module Module, 12 | } 13 | 14 | impl Contexted for Kernel<'_> { 15 | fn sync(&self) -> Result<()> { 16 | self.module.context.sync() 17 | } 18 | 19 | fn version(&self) -> Result { 20 | self.module.context.version() 21 | } 22 | 23 | fn guard(&self) -> Result { 24 | self.module.context.guard() 25 | } 26 | 27 | fn get_ref(&self) -> ContextRef { 28 | self.module.get_ref() 29 | } 30 | } 31 | 32 | /// OOP-like wrapper of `cuModule*` APIs 33 | #[derive(Debug, Contexted)] 34 | pub struct Module { 35 | module: CUmodule, 36 | context: Context, 37 | } 38 | 39 | impl Drop for Module { 40 | fn drop(&mut self) { 41 | if let Err(e) = unsafe { contexted_call!(&self.context, cuModuleUnload, self.module) } { 42 | log::error!("Failed to unload module: {:?}", e); 43 | } 44 | } 45 | } 46 | 47 | impl Module { 48 | /// integrated loader of Instruction 49 | pub fn load(context: &Context, data: &Instruction) -> Result { 50 | match *data { 51 | Instruction::PTX(ref ptx) => { 52 | let module = 53 | unsafe { contexted_new!(context, cuModuleLoadData, ptx.as_ptr() as *const _)? }; 54 | Ok(Module { 55 | module, 56 | context: context.clone(), 57 | }) 58 | } 59 | Instruction::Cubin(ref bin) => { 60 | let module = 61 | unsafe { contexted_new!(context, cuModuleLoadData, bin.as_ptr() as *const _)? }; 62 | Ok(Module { 63 | module, 64 | context: context.clone(), 65 | }) 66 | } 67 | Instruction::PTXFile(ref path) | Instruction::CubinFile(ref path) => { 68 | let filename = CString::new(path.to_str().unwrap()).expect("Invalid Path"); 69 | let module = unsafe { contexted_new!(context, cuModuleLoad, filename.as_ptr())? 
}; 70 | Ok(Module { 71 | module, 72 | context: context.clone(), 73 | }) 74 | } 75 | } 76 | } 77 | 78 | pub fn from_str(context: &Context, ptx: &str) -> Result { 79 | let data = Instruction::ptx(ptx); 80 | Self::load(context, &data) 81 | } 82 | 83 | /// Wrapper of `cuModuleGetFunction` 84 | pub fn get_kernel(&self, name: &str) -> Result { 85 | let name = CString::new(name).expect("Invalid Kernel name"); 86 | let func = 87 | unsafe { contexted_new!(self, cuModuleGetFunction, self.module, name.as_ptr()) }?; 88 | Ok(Kernel { func, module: self }) 89 | } 90 | } 91 | 92 | #[cfg(test)] 93 | mod tests { 94 | use super::*; 95 | 96 | #[test] 97 | fn load_do_nothing() -> Result<()> { 98 | // generated by do_nothing example in accel-derive 99 | let ptx = r#" 100 | .version 3.2 101 | .target sm_30 102 | .address_size 64 103 | .visible .entry do_nothing() 104 | { 105 | ret; 106 | } 107 | "#; 108 | let device = Device::nth(0)?; 109 | let ctx = device.create_context(); 110 | let _mod = Module::from_str(&ctx, ptx)?; 111 | Ok(()) 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /accel/src/profiler.rs: -------------------------------------------------------------------------------- 1 | //! Profiling GPU kernels and host CUDA API calls 2 | 3 | use crate::*; 4 | use cuda::*; 5 | 6 | /// RAII handler for nvprof profiling 7 | /// 8 | /// - Profiling starts by `Profiler::start`, and stops by `Drop` of `Profiler`. 9 | /// - Unified memory profiling is not supported. You must add an option `--unified-memory-profiling off` to `nvprof` command. 10 | /// ```shell 11 | /// $ nvprof --unified-memory-profiling off ./target/release/examples/add 12 | /// ``` 13 | /// - You will find more options at [nvprof user's guide](https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) 14 | pub struct Profiler { 15 | ctx: Context, 16 | } 17 | 18 | impl Drop for Profiler { 19 | fn drop(&mut self) { 20 | if let Err(e) = unsafe { contexted_call!(&self.ctx, cuProfilerStop) } { 21 | log::error!("Failed to stop profiling: {:?}", e); 22 | } 23 | } 24 | } 25 | 26 | impl Profiler { 27 | pub fn start(ctx: &Context) -> Self { 28 | unsafe { contexted_call!(ctx, cuProfilerStart) }.expect("Profiler has already started"); 29 | Self { ctx: ctx.clone() } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /accel/src/stream.rs: -------------------------------------------------------------------------------- 1 | use crate::{contexted_call, contexted_new, device::*, error::*}; 2 | use cuda::*; 3 | use std::future::Future; 4 | 5 | /// Handler for non-blocking CUDA Stream 6 | #[derive(Debug, Contexted)] 7 | pub struct Stream { 8 | pub(crate) stream: CUstream, 9 | context: ContextRef, 10 | } 11 | 12 | unsafe impl Sync for Stream {} 13 | unsafe impl Send for Stream {} 14 | 15 | impl Drop for Stream { 16 | fn drop(&mut self) { 17 | if let Err(e) = unsafe { contexted_call!(self, cuStreamDestroy_v2, self.stream) } { 18 | log::error!("Failed to delete CUDA stream: {:?}", e); 19 | } 20 | } 21 | } 22 | 23 | impl Stream { 24 | /// Create a new non-blocking CUDA stream on the current context 25 | pub fn new(context: ContextRef) -> Self { 26 | let stream = unsafe { 27 | contexted_new!( 28 | &context, 29 | cuStreamCreate, 30 | CUstream_flags::CU_STREAM_NON_BLOCKING as u32 31 | ) 32 | } 33 | .expect("Failed to create CUDA stream"); 34 | Stream { context, stream } 35 | } 36 | 37 | /// Check all tasks in this stream have been completed 38 | pub fn 
-------------------------------------------------------------------------------- /accel/src/stream.rs: -------------------------------------------------------------------------------- 1 | use crate::{contexted_call, contexted_new, device::*, error::*}; 2 | use cuda::*; 3 | use std::future::Future; 4 | 5 | /// Handler for non-blocking CUDA Stream 6 | #[derive(Debug, Contexted)] 7 | pub struct Stream { 8 | pub(crate) stream: CUstream, 9 | context: ContextRef, 10 | } 11 | 12 | unsafe impl Sync for Stream {} 13 | unsafe impl Send for Stream {} 14 | 15 | impl Drop for Stream { 16 | fn drop(&mut self) { 17 | if let Err(e) = unsafe { contexted_call!(self, cuStreamDestroy_v2, self.stream) } { 18 | log::error!("Failed to delete CUDA stream: {:?}", e); 19 | } 20 | } 21 | } 22 | 23 | impl Stream { 24 | /// Create a new non-blocking CUDA stream on the current context 25 | pub fn new(context: ContextRef) -> Self { 26 | let stream = unsafe { 27 | contexted_new!( 28 | &context, 29 | cuStreamCreate, 30 | CUstream_flags::CU_STREAM_NON_BLOCKING as u32 31 | ) 32 | } 33 | .expect("Failed to create CUDA stream"); 34 | Stream { context, stream } 35 | } 36 | 37 | /// Check whether all tasks in this stream have completed 38 | pub fn query(&self) -> bool { 39 | match unsafe { contexted_call!(self, cuStreamQuery, self.stream) } { 40 | Ok(_) => true, 41 | Err(AccelError::AsyncOperationNotReady) => false, 42 | Err(e) => panic!("Unexpected error during cuStreamQuery: {:?}", e), 43 | } 44 | } 45 | 46 | /// Wait until all tasks in this stream have completed 47 | pub fn sync(&self) -> Result<()> { 48 | unsafe { contexted_call!(self, cuStreamSynchronize, self.stream) }?; 49 | Ok(()) 50 | } 51 | 52 | /// Consume this stream and convert it into a `Future` 53 | pub fn into_future(self) -> impl Future<Output = Result<()>> + Send { 54 | async { tokio::task::spawn_blocking(move || self.sync()).await? } 55 | } 56 | 57 | /// Wait on an event to synchronize with another stream 58 | pub fn wait_event(&mut self, event: &Event) { 59 | unsafe { contexted_call!(self, cuStreamWaitEvent, self.stream, event.event, 0) } 60 | .expect("Failed to make the stream wait on a CUDA event"); 61 | } 62 | } 63 | 64 | #[derive(Contexted)] 65 | pub struct Event { 66 | event: CUevent, 67 | context: ContextRef, 68 | } 69 | 70 | unsafe impl Sync for Event {} 71 | unsafe impl Send for Event {} 72 | 73 | impl Drop for Event { 74 | fn drop(&mut self) { 75 | if let Err(e) = unsafe { contexted_call!(self, cuEventDestroy_v2, self.event) } { 76 | log::error!("Failed to delete CUDA event: {:?}", e); 77 | } 78 | } 79 | } 80 | 81 | impl Event { 82 | pub fn new(context: ContextRef) -> Self { 83 | let event = unsafe { 84 | contexted_new!( 85 | &context, 86 | cuEventCreate, 87 | CUevent_flags_enum::CU_EVENT_BLOCKING_SYNC as u32 88 | ) 89 | } 90 | .expect("Failed to create CUDA event"); 91 | Event { context, event } 92 | } 93 | 94 | pub fn record(&mut self, stream: &mut Stream) { 95 | unsafe { contexted_call!(self, cuEventRecord, self.event, stream.stream) } 96 | .expect("Failed to record CUDA event"); 97 | } 98 | 99 | /// Query whether the event has occurred; returns true if it has 100 | pub fn query(&self) -> bool { 101 | match unsafe { contexted_call!(self, cuEventQuery, self.event) } { 102 | Ok(_) => true, 103 | Err(AccelError::AsyncOperationNotReady) => false, 104 | Err(e) => panic!("Unexpected error during cuEventQuery: {:?}", e), 105 | } 106 | } 107 | 108 | /// Block until the event occurs 109 | pub fn sync(&self) -> Result<()> { 110 | unsafe { contexted_call!(self, cuEventSynchronize, self.event) }?; 111 | Ok(()) 112 | } 113 | } 114 | 115 | #[cfg(test)] 116 | mod tests { 117 | use super::*; 118 | 119 | #[test] 120 | fn new() -> Result<()> { 121 | let device = Device::nth(0)?; 122 | let context = device.create_context(); 123 | let _st = Stream::new(context.get_ref()); 124 | Ok(()) 125 | } 126 | 127 | #[test] 128 | fn trivial_sync() -> Result<()> { 129 | let device = Device::nth(0)?; 130 | let context = device.create_context(); 131 | let mut stream = Stream::new(context.get_ref()); 132 | let mut event = Event::new(context.get_ref()); 133 | event.record(&mut stream); 134 | // nothing to wait for yet 135 | event.sync()?; 136 | stream.sync()?; 137 | Ok(()) 138 | } 139 | } 140 |
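// Illustrative cross-stream synchronization with `Stream` and `Event`
// (an editorial sketch, not a file in this repository; it assumes both types
// are re-exported from the crate root like the items used in the tests above).
use accel::*;

fn main() -> error::Result<()> {
    let device = Device::nth(0)?;
    let context = device.create_context();
    let mut producer = Stream::new(context.get_ref());
    let mut consumer = Stream::new(context.get_ref());
    let mut event = Event::new(context.get_ref());

    // ... enqueue work on `producer` ...
    event.record(&mut producer); // mark this point in `producer`
    consumer.wait_event(&event); // `consumer` will not run past this point until the event occurs
    // ... enqueue dependent work on `consumer` ...

    consumer.sync()?; // block the host until `consumer` drains
    Ok(())
}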
-------------------------------------------------------------------------------- /accel/tests/argref.rs: -------------------------------------------------------------------------------- 1 | use accel::*; 2 | 3 | #[kernel] 4 | fn f(a: &i32, b: &mut i32) { 5 | if accel_core::index() == 0 { 6 | *b = *a; 7 | } 8 | } 9 | 10 | #[test] 11 | fn mut_ref_dev() -> error::Result<()> { 12 | let device = Device::nth(0)?; 13 | let ctx = device.create_context(); 14 | let mut a = DeviceMemory::<i32>::zeros(&ctx, 1); 15 | let mut b = DeviceMemory::<i32>::zeros(&ctx, 1); 16 | a[0] = 1; 17 | f(&ctx, 1, 1, (&a[0], &mut b[0]))?; 18 | assert_eq!(a, b); 19 | Ok(()) 20 | } 21 | 22 | #[test] 23 | fn mut_ref_host() -> error::Result<()> { 24 | let device = Device::nth(0)?; 25 | let ctx = device.create_context(); 26 | let mut a = PageLockedMemory::<i32>::zeros(&ctx, 1); 27 | let mut b = PageLockedMemory::<i32>::zeros(&ctx, 1); 28 | a[0] = 1; 29 | f(&ctx, 1, 1, (&a[0], &mut b[0]))?; 30 | assert_eq!(a, b); 31 | Ok(()) 32 | } 33 | -------------------------------------------------------------------------------- /accel/tests/data/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all clean 2 | 3 | all: add.ptx add.cubin sub.ptx sub.cubin 4 | 5 | clean: 6 | rm *.ptx *.cubin 7 | 8 | %.ptx: %.cu 9 | nvcc -ptx $< 10 | 11 | %.cubin: %.cu 12 | nvcc -cubin $< 13 | -------------------------------------------------------------------------------- /accel/tests/data/add.cu: -------------------------------------------------------------------------------- 1 | __global__ void add(const int a[], const int b[], int c[]) { 2 | int i = blockDim.x * blockIdx.x + threadIdx.x; 3 | c[i] = a[i] + b[i]; 4 | } 5 | -------------------------------------------------------------------------------- /accel/tests/data/add.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/termoshtt/accel/2ee324f6f9cd35832ef3d74a3c0f9191958d0289/accel/tests/data/add.cubin -------------------------------------------------------------------------------- /accel/tests/data/add.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-27506705 5 | // Cuda compilation tools, release 10.2, V10.2.89 6 | // Based on LLVM 3.4svn 7 | // 8 | 9 | .version 6.5 10 | .target sm_30 11 | .address_size 64 12 | 13 | // .globl _Z3addPKiS0_Pi 14 | 15 | .visible .entry _Z3addPKiS0_Pi( 16 | .param .u64 _Z3addPKiS0_Pi_param_0, 17 | .param .u64 _Z3addPKiS0_Pi_param_1, 18 | .param .u64 _Z3addPKiS0_Pi_param_2 19 | ) 20 | { 21 | .reg .b32 %r<8>; 22 | .reg .b64 %rd<11>; 23 | 24 | 25 | ld.param.u64 %rd1, [_Z3addPKiS0_Pi_param_0]; 26 | ld.param.u64 %rd2, [_Z3addPKiS0_Pi_param_1]; 27 | ld.param.u64 %rd3, [_Z3addPKiS0_Pi_param_2]; 28 | cvta.to.global.u64 %rd4, %rd3; 29 | cvta.to.global.u64 %rd5, %rd2; 30 | cvta.to.global.u64 %rd6, %rd1; 31 | mov.u32 %r1, %ntid.x; 32 | mov.u32 %r2, %ctaid.x; 33 | mov.u32 %r3, %tid.x; 34 | mad.lo.s32 %r4, %r2, %r1, %r3; 35 | mul.wide.s32 %rd7, %r4, 4; 36 | add.s64 %rd8, %rd6, %rd7; 37 | ld.global.u32 %r5, [%rd8]; 38 | add.s64 %rd9, %rd5, %rd7; 39 | ld.global.u32 %r6, [%rd9]; 40 | add.s32 %r7, %r6, %r5; 41 | add.s64 %rd10, %rd4, %rd7; 42 | st.global.u32 [%rd10], %r7; 43 | ret; 44 | } 45 | 46 | 47 | -------------------------------------------------------------------------------- /accel/tests/data/sub.cu: -------------------------------------------------------------------------------- 1 | __global__ void sub(const int a[], const int b[], int c[]) { 2 | int i = blockDim.x * blockIdx.x + threadIdx.x; 3 | c[i] = a[i] - b[i]; 4 | } 5 | 6 | -------------------------------------------------------------------------------- /accel/tests/data/sub.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/termoshtt/accel/2ee324f6f9cd35832ef3d74a3c0f9191958d0289/accel/tests/data/sub.cubin
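// The cubin/PTX fixtures above are produced by the Makefile (`nvcc -ptx` /
// `nvcc -cubin`). A sketch of loading one of them through the API shown in
// accel/src/module.rs (editorial illustration, not a test in this repository;
// the include path assumes a source file under accel/tests/):
use accel::*;

fn main() -> error::Result<()> {
    let device = Device::nth(0)?;
    let ctx = device.create_context();
    let module = Module::from_str(&ctx, include_str!("data/add.ptx"))?;
    // nvcc emits C++-mangled names: `add(const int*, const int*, int*)`
    // appears in add.ptx as `_Z3addPKiS0_Pi`.
    let _kernel = module.get_kernel("_Z3addPKiS0_Pi")?;
    Ok(())
}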
-------------------------------------------------------------------------------- /accel/tests/data/sub.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-27506705 5 | // Cuda compilation tools, release 10.2, V10.2.89 6 | // Based on LLVM 3.4svn 7 | // 8 | 9 | .version 6.5 10 | .target sm_30 11 | .address_size 64 12 | 13 | // .globl _Z3subPKiS0_Pi 14 | 15 | .visible .entry _Z3subPKiS0_Pi( 16 | .param .u64 _Z3subPKiS0_Pi_param_0, 17 | .param .u64 _Z3subPKiS0_Pi_param_1, 18 | .param .u64 _Z3subPKiS0_Pi_param_2 19 | ) 20 | { 21 | .reg .b32 %r<8>; 22 | .reg .b64 %rd<11>; 23 | 24 | 25 | ld.param.u64 %rd1, [_Z3subPKiS0_Pi_param_0]; 26 | ld.param.u64 %rd2, [_Z3subPKiS0_Pi_param_1]; 27 | ld.param.u64 %rd3, [_Z3subPKiS0_Pi_param_2]; 28 | cvta.to.global.u64 %rd4, %rd3; 29 | cvta.to.global.u64 %rd5, %rd2; 30 | cvta.to.global.u64 %rd6, %rd1; 31 | mov.u32 %r1, %ntid.x; 32 | mov.u32 %r2, %ctaid.x; 33 | mov.u32 %r3, %tid.x; 34 | mad.lo.s32 %r4, %r2, %r1, %r3; 35 | mul.wide.s32 %rd7, %r4, 4; 36 | add.s64 %rd8, %rd6, %rd7; 37 | ld.global.u32 %r5, [%rd8]; 38 | add.s64 %rd9, %rd5, %rd7; 39 | ld.global.u32 %r6, [%rd9]; 40 | sub.s32 %r7, %r5, %r6; 41 | add.s64 %rd10, %rd4, %rd7; 42 | st.global.u32 [%rd10], %r7; 43 | ret; 44 | } 45 | 46 | 47 | -------------------------------------------------------------------------------- /accel/tests/launch_async.rs: -------------------------------------------------------------------------------- 1 | #[test] 2 | fn launch_async_build_test() { 3 | let t = trybuild::TestCases::new(); 4 | t.compile_fail("tests/launch_async/mut_ref_fail.rs"); 5 | t.pass("tests/launch_async/mut_ref_success.rs"); 6 | } 7 | -------------------------------------------------------------------------------- /accel/tests/launch_async/mut_ref_fail.rs: -------------------------------------------------------------------------------- 1 | use accel::*; 2 | 3 | #[kernel] 4 | unsafe fn add(a: *const u32, b: *const u32, c: *mut u32, n: usize) { 5 | let i = accel_core::index(); 6 | if (i as usize) < n { 7 | *c.offset(i) = *a.offset(i) + *b.offset(i); 8 | } 9 | } 10 | 11 | #[tokio::main] 12 | async fn main() -> error::Result<()> { 13 | let device = Device::nth(0)?; 14 | let ctx = device.create_context(); 15 | let n = 16; 16 | let mut a = DeviceMemory::<u32>::zeros(&ctx, n); 17 | let mut b = DeviceMemory::<u32>::zeros(&ctx, n); 18 | let mut c = DeviceMemory::<u32>::zeros(&ctx, n); 19 | 20 | for i in 0..n { 21 | a[i] = i as u32; 22 | b[i] = 2 * i as u32; 23 | } 24 | 25 | let md = add::Module::new(&ctx)?; 26 | let future = md.launch_async(1, n, (&a, &b, &mut c, n)); 27 | 28 | println!("{:?}", c); // cannot be borrowed 29 | future.await?; 30 | 31 | Ok(()) 32 | } 33 | -------------------------------------------------------------------------------- /accel/tests/launch_async/mut_ref_fail.stderr: -------------------------------------------------------------------------------- 1 | error[E0502]: cannot borrow `c` as immutable because it is also borrowed as mutable 2 | --> $DIR/mut_ref_fail.rs:28:22 3 | | 4 | 26 | let future = md.launch_async(1, n, (&a, &b, &mut c, n)); 5 | | ------ mutable borrow occurs here 6 | 27 | 7 | 28 | println!("{:?}", c); // cannot be borrowed 8 | | ^ immutable borrow occurs here 9 | 29 | future.await?; 10 | | ------ mutable borrow later used here 11 |
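// Why mut_ref_fail.rs is rejected: `launch_async` captures `&mut c` inside the
// returned future, so the mutable borrow of `c` lives until the future is
// consumed. Printing `c` before `.await` therefore overlaps the mutable borrow
// with an immutable one. Awaiting first releases the borrow, which is exactly
// what mut_ref_success.rs below does:
//
//     let future = md.launch_async(1, n, (&a, &b, &mut c, n));
//     future.await?;       // mutable borrow of `c` ends here
//     println!("{:?}", c); // fine afterwards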
-------------------------------------------------------------------------------- /accel/tests/launch_async/mut_ref_success.rs: -------------------------------------------------------------------------------- 1 | use accel::*; 2 | 3 | #[kernel] 4 | unsafe fn add(a: *const u32, b: *const u32, c: *mut u32, n: usize) { 5 | let i = accel_core::index(); 6 | if (i as usize) < n { 7 | *c.offset(i) = *a.offset(i) + *b.offset(i); 8 | } 9 | } 10 | 11 | #[tokio::main] 12 | async fn main() -> error::Result<()> { 13 | let device = Device::nth(0)?; 14 | let ctx = device.create_context(); 15 | let n = 16; 16 | let mut a = DeviceMemory::<u32>::zeros(&ctx, n); 17 | let mut b = DeviceMemory::<u32>::zeros(&ctx, n); 18 | let mut c = DeviceMemory::<u32>::zeros(&ctx, n); 19 | 20 | for i in 0..n { 21 | a[i] = i as u32; 22 | b[i] = 2 * i as u32; 23 | } 24 | 25 | let md = add::Module::new(&ctx)?; 26 | let future = md.launch_async(1, n, (&a, &b, &mut c, n)); 27 | future.await?; 28 | for i in 0..n { 29 | assert_eq!(c[i], 3 * i as u32); // can be borrowed 30 | } 31 | 32 | Ok(()) 33 | } 34 | -------------------------------------------------------------------------------- /accel/tests/read_host_memory.rs: -------------------------------------------------------------------------------- 1 | use accel::*; 2 | 3 | #[kernel] 4 | pub unsafe fn read_host_memory(a: *const i32) { 5 | let i = accel_core::index() as isize; 6 | accel_core::println!("a[{}] = {}", i, unsafe { *(a.offset(i)) }); 7 | } 8 | 9 | #[test] 10 | fn page_locked() -> error::Result<()> { 11 | let device = Device::nth(0)?; 12 | let ctx = device.create_context(); 13 | 14 | let mut a = PageLockedMemory::zeros(&ctx, 4); 15 | a[0] = 0; 16 | a[1] = 1; 17 | a[2] = 2; 18 | a[3] = 3; 19 | read_host_memory(&ctx, 1, 4, (a.as_ptr(),))?; 20 | Ok(()) 21 | } 22 | 23 | #[test] 24 | fn registered() -> error::Result<()> { 25 | let device = Device::nth(0)?; 26 | let ctx = device.create_context(); 27 | 28 | let mut a = vec![0; 4]; 29 | let mut mem = RegisteredMemory::new(&ctx, &mut a); 30 | mem[0] = 0; 31 | mem[1] = 1; 32 | mem[2] = 2; 33 | mem[3] = 3; 34 | read_host_memory(&ctx, 1, 4, (mem.as_ptr(),))?; 35 | Ok(()) 36 | } 37 | -------------------------------------------------------------------------------- /accel/tests/slice.rs: -------------------------------------------------------------------------------- 1 | use accel::*; 2 | 3 | #[kernel] 4 | unsafe fn set1(a: *mut i32, n: usize) { 5 | let i = accel_core::index(); 6 | if i < n as isize { 7 | *a.offset(i) = 1; 8 | } 9 | } 10 | 11 | #[test] 12 | fn slice_to_pointer_host() -> error::Result<()> { 13 | let device = Device::nth(0)?; 14 | let ctx = device.create_context(); 15 | let n = 12; 16 | let mut a = PageLockedMemory::<i32>::zeros(&ctx, n); 17 | set1(&ctx, 1, n, (&mut a, n))?; 18 | assert_eq!(a.as_slice(), vec![1_i32; n].as_slice()); 19 | Ok(()) 20 | } 21 | 22 | #[test] 23 | fn slice_to_pointer_dev() -> error::Result<()> { 24 | let device = Device::nth(0)?; 25 | let ctx = device.create_context(); 26 | let n = 12; 27 | let mut a = DeviceMemory::<i32>::zeros(&ctx, n); 28 | set1(&ctx, 1, n, (&mut a, n))?; 29 | assert_eq!(a.as_slice(), vec![1_i32; n].as_slice()); 30 | Ok(()) 31 | } 32 | 33 | #[test] 34 | fn slice_to_pointer_registered() -> error::Result<()> { 35 | let device = Device::nth(0)?; 36 | let ctx = device.create_context(); 37 | let n = 12; 38 | let mut v = vec![0_i32; n]; 39 | let mut a = RegisteredMemory::new(&ctx, &mut v); 40 | set1(&ctx, 1, n, (&mut a, n))?; 41 | assert_eq!(a.as_slice(), vec![1_i32; n].as_slice()); 42 | Ok(()) 43 | } 44 |
-------------------------------------------------------------------------------- /diagrams/.gitignore: -------------------------------------------------------------------------------- 1 | *.aux 2 | *.log 3 | *.pdf 4 | *.bbl 5 | *.blg 6 | *.nav 7 | *.out 8 | *.snm 9 | *.toc 10 | *.dvi 11 | *.bb 12 | *.xbb 13 | *.fls 14 | *.fdb_latexmk 15 | *.synctex.gz 16 | -------------------------------------------------------------------------------- /diagrams/compile_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/termoshtt/accel/2ee324f6f9cd35832ef3d74a3c0f9191958d0289/diagrams/compile_flow.png -------------------------------------------------------------------------------- /diagrams/compile_flow.tex: -------------------------------------------------------------------------------- 1 | \documentclass{standalone} 2 | 3 | \usepackage{listings} 4 | \usepackage{tikz} 5 | \usetikzlibrary{positioning,fit,calc} 6 | 7 | \begin{document} 8 | \begin{tikzpicture}[ 9 | node distance=7mm, 10 | title/.style={font=\fontsize{6}{6}\color{black!50}\ttfamily}, 11 | typetag/.style={rectangle, draw=black!50, font=\scriptsize\ttfamily, anchor=west}, 12 | arrow-annotate/.style={midway, font=\fontsize{6}{6}\color{blue!70}\ttfamily} 13 | ] 14 | \node (main) [title] {main.rs}; 15 | \node (proc-macro) [below=of main.west, typetag, xshift=2mm, text width=6cm] { 16 | \#[accel::kernel] 17 | \parbox{8cm}{fn add(a: *const f32, b: *const f32, c: *mut f32)} 18 | }; 19 | \node (main-ptx) [below=of proc-macro.west, xshift=35mm, yshift=-1cm, typetag, text width=42mm] { 20 | mod add \{ 21 | \parbox{5cm}{const PTX\_STR = "\{Generated PTX\}";} 22 | \} 23 | }; 24 | \node (main-caller) [below=of proc-macro.west, yshift=-25mm, typetag, text width=6cm] { 25 | fn add(context, grid, block, \&(a, b, c)) \{ \} 26 | }; 27 | \draw[->] (proc-macro) -- (main-ptx) node[arrow-annotate, right] {Helper sub-module}; 28 | \draw[->] (proc-macro) -- (main-caller) node[arrow-annotate, left] {GPU Kernel caller}; 29 | \node[draw=black!50, fit={(main) (proc-macro) (main-ptx) (main-caller)}] {}; 30 | 31 | \node (ptx-builder-title) at (9cm, 0) [title, right, text width=25mm] {accel-derive/add crate}; 32 | \node (lib) [below=of ptx-builder-title.west, typetag, xshift=2mm] {lib.rs}; 33 | \node (toml) [below=of lib.west, typetag] {Cargo.toml}; 34 | \node (ptx-builder) [draw=black!50, fit={(ptx-builder-title) (lib) (toml)}] {}; 35 | 36 | \draw[->] (proc-macro) -- (ptx-builder) node[arrow-annotate, above] {Create on \$HOME/.cache}; 37 | 38 | \node (ptx-title) at (9cm, -28mm) [title, right, text width=25mm] {Generated PTX}; 39 | \node (ptx-add) [below=of ptx-title.west, typetag, xshift=2mm, yshift=-3mm, text width=2cm] { 40 | .entry add( 41 | \parbox{17mm}{.param .u64 a}, 42 | \parbox{17mm}{.param .u64 b}, 43 | \parbox{17mm}{.param .u64 c} 44 | ) 45 | }; 46 | \node (ptx) [draw=black!50, fit={(ptx-title) (ptx-add)}] {}; 47 | 48 | \draw[->] (ptx-builder) -- (ptx) node[arrow-annotate, right] {nvptx64-nvidia-cuda target}; 49 | \draw[->] (ptx) -- (main-ptx) node[arrow-annotate, below left]{embedded as String}; 50 | 51 | \end{tikzpicture} 52 | \end{document} 53 |
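// The pipeline drawn in compile_flow.tex, sketched in code (a simplified
// editorial illustration; the exact output of the `#[kernel]` macro in
// accel-derive differs in detail). A kernel definition such as
//
//     #[kernel]
//     unsafe fn add(a: *const f32, b: *const f32, c: *mut f32) { /* ... */ }
//
// is written out as a helper crate under $HOME/.cache, built for the
// nvptx64-nvidia-cuda target, and the resulting PTX is embedded back into the
// host crate roughly as
//
//     mod add {
//         pub const PTX_STR: &str = "{generated PTX}";
//         // plus a loader, cf. `add::Module::new(&ctx)` in the launch_async tests
//     }
//
// together with a host-side caller of the same name,
// `fn add(ctx, grid, block, (a, b, c)) -> error::Result<()>`.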
-------------------------------------------------------------------------------- /docker/.gitignore: -------------------------------------------------------------------------------- 1 | ubuntu*-cuda*-nightly*.Dockerfile 2 | centos*-cuda*-nightly*.Dockerfile -------------------------------------------------------------------------------- /docker/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_VERSIONS := 10.0 10.1 10.2 2 | NIGHTLY_VERSIONS := 2020-01-02 2020-05-01 3 | 4 | CI_REGISTRY_IMAGE ?= registry.gitlab.com/termoshtt/accel 5 | CI_COMMIT_REF_SLUG ?= manual 6 | 7 | define ubuntu 8 | ubuntu$(1)-cuda$(2)-nightly$(3): 9 | sed -e "s/UBUNTU_VERSION/$(1)/" \ 10 | -e "s/CUDA_VERSION/$(2)/" \ 11 | -e "s/NIGHTLY_VERSION/$(3)/" \ 12 | < ubuntu.Dockerfile \ 13 | > $$@.Dockerfile 14 | docker build -f $$@.Dockerfile -t $(CI_REGISTRY_IMAGE)/$$@:$(CI_COMMIT_REF_SLUG) . 15 | docker push $(CI_REGISTRY_IMAGE)/$$@:$(CI_COMMIT_REF_SLUG) 16 | endef 17 | 18 | define centos 19 | centos$(1)-cuda$(2)-nightly$(3): 20 | sed -e "s/CENTOS_VERSION/$(1)/" \ 21 | -e "s/CUDA_VERSION/$(2)/" \ 22 | -e "s/NIGHTLY_VERSION/$(3)/" \ 23 | < centos.Dockerfile \ 24 | > $$@.Dockerfile 25 | docker build -f $$@.Dockerfile -t $(CI_REGISTRY_IMAGE)/$$@:$(CI_COMMIT_REF_SLUG) . 26 | docker push $(CI_REGISTRY_IMAGE)/$$@:$(CI_COMMIT_REF_SLUG) 27 | endef 28 | 29 | .PHONY: all clean 30 | 31 | all: $(foreach NIGHTLY_VERSION,$(NIGHTLY_VERSIONS), \ 32 | $(foreach CUDA_VERSION,$(CUDA_VERSIONS),\ 33 | ubuntu18.04-cuda$(CUDA_VERSION)-nightly$(NIGHTLY_VERSION) \ 34 | centos6-cuda$(CUDA_VERSION)-nightly$(NIGHTLY_VERSION) \ 35 | centos7-cuda$(CUDA_VERSION)-nightly$(NIGHTLY_VERSION) \ 36 | ) \ 37 | ) 38 | 39 | $(foreach NIGHTLY_VERSION,$(NIGHTLY_VERSIONS), \ 40 | $(foreach CUDA_VERSION,$(CUDA_VERSIONS), \ 41 | $(eval $(call ubuntu,18.04,$(CUDA_VERSION),$(NIGHTLY_VERSION))) \ 42 | $(eval $(call centos,6,$(CUDA_VERSION),$(NIGHTLY_VERSION))) \ 43 | $(eval $(call centos,7,$(CUDA_VERSION),$(NIGHTLY_VERSION))) \ 44 | ) \ 45 | ) 46 | 47 | clean: 48 | rm -rf *-cuda*-nightly*.Dockerfile -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | rust-cuda containers 2 | ===================== 3 | 4 | Docker containers including 5 | 6 | - CUDA based on [nvidia/cuda](https://hub.docker.com/r/nvidia/cuda/) containers 7 | - NVPTX target for Rust 8 | 9 | ``` 10 | docker run -it --rm registry.gitlab.com/termoshtt/accel/ubuntu18.04-cuda10.2:master 11 | ``` 12 | 13 | See also https://gitlab.com/termoshtt/accel/container_registry 14 | 15 | Supported Platforms 16 | ------------------ 17 | 18 | |CUDA | Ubuntu 18.04 | Ubuntu 16.04 | RedHat UBI8 | RedHat UBI7 | CentOS 7 | CentOS 6 | 19 | |:---:|:------------:|:------------:|:-----------:|:-----------:|:--------:|:--------:| 20 | |10.2 | ✔️ | ✔️ | | | ✔️ | ✔️ | 21 | |10.1 | ✔️ | ✔️ | | | ✔️ | ✔️ | 22 | |10.0 | ✔️ | ✔️ | - | - | ✔️ | ✔️ | 23 | |9.2 | ✔️ | ✔️ | - | - | ✔️ | ✔️ | 24 | |9.1 | - | ✔️ | - | - | ✔️ | ✔️ | 25 | |9.0 | - | ✔️ | - | - | ✔️ | ✔️ | 26 | |8.0 | - | ✔️ | - | - | ✔️ | ✔️ | 27 | 28 | - https://gitlab.com/nvidia/container-images/cuda/blob/master/doc/supported-tags.md 29 | - https://gitlab.com/nvidia/container-images/cuda/blob/master/doc/unsupported-tags.md 30 | -------------------------------------------------------------------------------- /docker/centos.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:CUDA_VERSION-base-centosCENTOS_VERSION 2 | 3 | COPY cuda.conf /etc/ld.so.conf.d 4 | RUN ldconfig 5 | ENV LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs 6 | 7 | RUN yum install -y gcc && yum clean all 8 | 9 | RUN curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain 1.42.0 10 | ENV PATH /root/.cargo/bin:$PATH 11 | 12 | RUN cargo install ptx-linker 13 | RUN rustup toolchain add 
nightly-NIGHTLY_VERSION 14 | RUN rustup target add nvptx64-nvidia-cuda --toolchain nightly-NIGHTLY_VERSION 15 | 16 | RUN rustup component add rustfmt clippy 17 | -------------------------------------------------------------------------------- /docker/cuda.conf: -------------------------------------------------------------------------------- 1 | /usr/local/cuda/lib64 2 | /usr/local/cuda/lib64/stubs 3 | -------------------------------------------------------------------------------- /docker/ubuntu.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:CUDA_VERSION-base-ubuntuUBUNTU_VERSION 2 | 3 | COPY cuda.conf /etc/ld.so.conf.d 4 | RUN ldconfig 5 | ENV LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs 6 | 7 | RUN apt-get update \ 8 | && apt-get install -y curl gcc \ 9 | && apt-get clean \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | RUN curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain 1.42.0 13 | ENV PATH /root/.cargo/bin:$PATH 14 | 15 | RUN cargo install ptx-linker 16 | RUN rustup toolchain add nightly-NIGHTLY_VERSION 17 | RUN rustup target add nvptx64-nvidia-cuda --toolchain nightly-NIGHTLY_VERSION 18 | 19 | RUN rustup component add rustfmt clippy 20 | -------------------------------------------------------------------------------- /public/index.html: -------------------------------------------------------------------------------- 1 | <!DOCTYPE html> 2 | <html> 3 | <head> 4 | <title>CI artifact of termoshtt/accel project</title> 5 | </head> 6 | <body> 7 | <ul> 8 | <li><a href="accel/accel/index.html">accel</a></li> 9 | <li><a href="accel/accel_derive/index.html">accel-derive</a></li> 10 | <li><a href="accel-core/accel_core/index.html">accel-core</a></li> 11 | <li><a href="benchmark/report/index.html">benchmark</a></li> 12 | </ul> 13 | </body> 14 | </html> 15 | -------------------------------------------------------------------------------- /setup_nvptx_toolchain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xue 3 | 4 | NIGHTLY=nightly-2020-05-01 5 | rustup toolchain add ${NIGHTLY} 6 | rustup target add nvptx64-nvidia-cuda --toolchain ${NIGHTLY} 7 | cargo install ptx-linker -f 8 | --------------------------------------------------------------------------------