├── .github
    └── FUNDING.yml
├── .gitignore
├── .gitlab-ci.yml
├── CHANGELOG.md
├── Cargo.toml
├── LICENSE-APACHE
├── LICENSE-MIT
├── LICENSE.md
├── README.md
├── accel-core
    ├── .cargo
    │   └── config
    ├── Cargo.toml
    ├── README.md
    ├── rust-toolchain
    └── src
    │   └── lib.rs
├── accel-derive
    ├── Cargo.toml
    ├── README.md
    ├── src
    │   ├── builder.rs
    │   ├── contexted.rs
    │   ├── host.rs
    │   ├── launchable.rs
    │   ├── lib.rs
    │   └── parser.rs
    └── tests
    │   ├── kernels
    │       ├── arguments.rs
    │       ├── dependencies.rs
    │       ├── dependencies_default.rs
    │       ├── dependencies_git.rs
    │       └── do_nothing.rs
    │   └── try_build.rs
├── accel
    ├── Cargo.toml
    ├── benches
    │   └── memcpy.rs
    ├── examples
    │   └── add.rs
    ├── src
    │   ├── block.rs
    │   ├── device.rs
    │   ├── error.rs
    │   ├── execution.rs
    │   ├── grid.rs
    │   ├── instruction.rs
    │   ├── lib.rs
    │   ├── linker.rs
    │   ├── memory
    │   │   ├── array.rs
    │   │   ├── device.rs
    │   │   ├── dimension.rs
    │   │   ├── info.rs
    │   │   ├── mod.rs
    │   │   ├── page_locked.rs
    │   │   ├── registered.rs
    │   │   ├── scalar.rs
    │   │   └── slice.rs
    │   ├── module.rs
    │   ├── profiler.rs
    │   └── stream.rs
    └── tests
    │   ├── argref.rs
    │   ├── data
    │       ├── Makefile
    │       ├── add.cu
    │       ├── add.cubin
    │       ├── add.ptx
    │       ├── sub.cu
    │       ├── sub.cubin
    │       └── sub.ptx
    │   ├── launch_async.rs
    │   ├── launch_async
    │       ├── mut_ref_fail.rs
    │       ├── mut_ref_fail.stderr
    │       └── mut_ref_success.rs
    │   ├── read_host_memory.rs
    │   └── slice.rs
├── diagrams
    ├── .gitignore
    ├── compile_flow.png
    ├── compile_flow.svg
    └── compile_flow.tex
├── docker
    ├── .gitignore
    ├── Makefile
    ├── README.md
    ├── centos.Dockerfile
    ├── cuda.conf
    └── ubuntu.Dockerfile
├── public
    └── index.html
└── setup_nvptx_toolchain.sh


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 | 
3 | github: [termoshtt] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Generated by Cargo
 2 | # will have compiled files and executables
 3 | target/
 4 | *.rustfmt
 5 | rusty-tags.*
 6 | 
 7 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
 8 | # More information here http://doc.crates.io/guide.html#cargotoml-vs-cargolock
 9 | Cargo.lock
10 | 
11 | # cargo fmt
12 | *.bk
13 | 
14 | # generated PTX
15 | *.s
16 | 


--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
  1 | image: registry.gitlab.com/termoshtt/accel/ubuntu18.04-cuda10.2-nightly2020-05-01:master
  2 | 
  3 | variables:
  4 |   AWS_DEFAULT_REGION: ap-northeast-1
  5 |   BUCKET_NAME: accel-gitlab-ci
  6 | 
  7 | stages:
  8 |   - docker
  9 |   - test
 10 |   - bench
 11 |   - package
 12 |   - deploy
 13 | 
 14 | test:cargo-clippy:
 15 |   stage: test
 16 |   script:
 17 |     - cargo clippy
 18 | 
 19 | test:cargo-fmt:
 20 |   stage: test
 21 |   script:
 22 |     - cargo fmt -- --check
 23 | 
 24 | .with_gpu:
 25 |   before_script:
 26 |     - nvidia-smi
 27 |   tags:
 28 |     - gpu
 29 |   only:
 30 |     - master
 31 |     - tags
 32 |     - /^gpu-.*/
 33 | 
 34 | test:accel:
 35 |   extends: .with_gpu
 36 |   stage: test
 37 |   script:
 38 |     - cargo test
 39 | 
 40 | test:ignored:
 41 |   extends: .with_gpu
 42 |   stage: test
 43 |   script:
 44 |     - cd accel
 45 |     - cargo test -- --ignored
 46 |   allow_failure: true
 47 | 
 48 | bench:
 49 |   extends: .with_gpu
 50 |   stage: bench
 51 |   script:
 52 |     - rm -rf accel/target/criterion
 53 |     - cargo bench
 54 |     - mv accel/target/criterion public/benchmark
 55 |   artifacts:
 56 |     paths:
 57 |       - public/benchmark
 58 |   only:
 59 |     variables:
 60 |       - $CI_COMMIT_MESSAGE =~ /\[bench\]/
 61 |       - $CI_RUN_BENCHMARK
 62 | 
 63 | changelog:
 64 |   image: debian
 65 |   stage: test
 66 |   before_script:
 67 |     - apt update
 68 |     - apt install -y git
 69 |   script:
 70 |     - test -n "$(git diff origin/master CHANGELOG.md)"
 71 |   except:
 72 |     - master
 73 | 
 74 | package:
 75 |   stage: package
 76 |   script:
 77 |     # Document of accel, accel-derive
 78 |     - cargo doc --no-deps --document-private-items
 79 |     - mv target/doc public/accel
 80 |     # Document of accel-core
 81 |     - cd accel-core
 82 |     - cargo doc
 83 |     - mv ./target/nvptx64-nvidia-cuda/doc ../public/accel-core
 84 |     - cd -
 85 |   artifacts:
 86 |     paths:
 87 |       - public
 88 | 
 89 | pages:
 90 |   stage: deploy
 91 |   dependencies:
 92 |     - package
 93 |   script:
 94 |     - find public
 95 |   artifacts:
 96 |     paths:
 97 |       - public
 98 |   only:
 99 |     - master
100 | 
101 | .s3:
102 |   image: python
103 |   stage: deploy
104 |   dependencies:
105 |     - package
106 |   before_script:
107 |     - pip install awscli
108 |   only:
109 |     - master
110 |     - tags
111 |     - /^gpu-.*/
112 | 
113 | deploy_s3:
114 |   extends: .s3
115 |   script:
116 |     - aws s3 cp public s3://${BUCKET_NAME}/${CI_COMMIT_REF_SLUG} --recursive --acl public-read
117 |   environment:
118 |     name: ${CI_COMMIT_REF_SLUG}
119 |     url: https://${BUCKET_NAME}.s3-${AWS_DEFAULT_REGION}.amazonaws.com/${CI_COMMIT_REF_SLUG}/index.html
120 |     on_stop: clean_s3
121 | 
122 | clean_s3:
123 |   extends: .s3
124 |   script:
125 |     - aws s3 rm s3://${BUCKET_NAME}/${CI_COMMIT_REF_SLUG} --recursive
126 |   environment:
127 |     name: ${CI_COMMIT_REF_SLUG}
128 |     action: stop
129 |   when: manual
130 | 
131 | .build:
132 |   image: docker:stable
133 |   stage: docker
134 |   services:
135 |     - docker:dind
136 |   before_script:
137 |     - apk add make
138 |     - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY
139 |   script:
140 |     - make -C docker ${CI_JOB_NAME}
141 |   only:
142 |     refs:
143 |       - master
144 |       - tags
145 |     changes:
146 |       - docker/*
147 | 
148 | centos7-cuda10.0-nightly2020-01-02:
149 |   extends: .build
150 | centos7-cuda10.1-nightly2020-01-02:
151 |   extends: .build
152 | centos7-cuda10.2-nightly2020-01-02:
153 |   extends: .build
154 | ubuntu18.04-cuda10.0-nightly2020-01-02:
155 |   extends: .build
156 | ubuntu18.04-cuda10.1-nightly2020-01-02:
157 |   extends: .build
158 | ubuntu18.04-cuda10.2-nightly2020-01-02:
159 |   extends: .build
160 | centos7-cuda10.0-nightly2020-05-01:
161 |   extends: .build
162 | centos7-cuda10.1-nightly2020-05-01:
163 |   extends: .build
164 | centos7-cuda10.2-nightly2020-05-01:
165 |   extends: .build
166 | ubuntu18.04-cuda10.0-nightly2020-05-01:
167 |   extends: .build
168 | ubuntu18.04-cuda10.1-nightly2020-05-01:
169 |   extends: .build
170 | ubuntu18.04-cuda10.2-nightly2020-05-01:
171 |   extends: .build
172 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | Unreleased (will be 0.4.0)
  2 | ===========================
  3 | 
  4 | ### Added
  5 | 
  6 | - async/.await support
  7 |   - memcpy https://gitlab.com/termoshtt/accel/-/merge_requests/85
  8 |   - kernel launch https://gitlab.com/termoshtt/accel/-/merge_requests/88
  9 | - `ContextRef` struct https://gitlab.com/termoshtt/accel/-/merge_requests/83
 10 | - memcpy benchmark https://gitlab.com/termoshtt/accel/-/merge_requests/81
 11 | 
 12 | ### Changed
 13 | 
 14 | - `Memory` trait update
 15 |   - memcpy implementation uses unified addressing https://gitlab.com/termoshtt/accel/-/merge_requests/84
 16 |   - `Memset` trait is merged into `Memory` trait https://gitlab.com/termoshtt/accel/-/merge_requests/96
 17 | - Kernel launch APIs changes, refactoring `Launchable` and `DeviceSend` traits
 18 |   - reference support https://gitlab.com/termoshtt/accel/-/merge_requests/90
 19 |   - Host slice to device pointer conversion https://gitlab.com/termoshtt/accel/-/merge_requests/91
 20 | - `module` sub-module split https://gitlab.com/termoshtt/accel/-/merge_requests/89
 21 | - `#[kernel]` proc-macro works in accel crate https://gitlab.com/termoshtt/accel/-/merge_requests/97
 22 | - Fixed spelling issues in Readme https://gitlab.com/termoshtt/accel/-/merge_requests/99
 23 | 
 24 | ### Maintenance
 25 | 
 26 | - Force write CHANGELOG on each merge requests https://gitlab.com/termoshtt/accel/-/merge_requests/95
 27 | 
 28 | 0.3.1 - 2020-05-25
 29 | -------------------
 30 | 
 31 | - HotFix for `impl_array_scalar` macro https://gitlab.com/termoshtt/accel/-/issues/58 https://gitlab.com/termoshtt/accel/-/issues/59 https://gitlab.com/termoshtt/accel/-/merge_requests/80
 32 | 
 33 | 0.3.0 - 2020-05-04
 34 | ===================
 35 | 
 36 | ### Added
 37 | 
 38 | - RAII based Profiler API https://gitlab.com/termoshtt/accel/-/merge_requests/74
 39 | - Registered Host memory https://gitlab.com/termoshtt/accel/-/merge_requests/73
 40 | - Memcpy, Memset traits https://gitlab.com/termoshtt/accel/-/merge_requests/70 https://gitlab.com/termoshtt/accel/-/merge_requests/60 https://gitlab.com/termoshtt/accel/-/merge_requests/59 https://gitlab.com/termoshtt/accel/-/merge_requests/58
 41 | - `Into<Block>` and `Into<Grid>` for primitive types https://gitlab.com/termoshtt/accel/-/merge_requests/55
 42 | 
 43 | ### Changed
 44 | 
 45 | - Use Rust nightly-2020-05-01 https://gitlab.com/termoshtt/accel/-/merge_requests/75
 46 |   - Build a container with nightly-2020-05-01 https://gitlab.com/termoshtt/accel/-/merge_requests/76
 47 |   - Switch to `nvidia/cuda:*-base` containers https://gitlab.com/termoshtt/accel/-/merge_requests/67
 48 | - Use `Arc<Context>` instead of `&Context` https://gitlab.com/termoshtt/accel/-/merge_requests/66
 49 | - Export `accel-derive::kernel` into `accel::` https://gitlab.com/termoshtt/accel/-/merge_requests/68
 50 | - Do not `panic!` on `Drop` of CUDA bindings https://gitlab.com/termoshtt/accel/-/merge_requests/53
 51 | 
 52 | ### Removed
 53 | - Inconsistent f64 support https://gitlab.com/termoshtt/accel/-/merge_requests/71
 54 | - `Launchable::stream_launch` because of its unsafety https://gitlab.com/termoshtt/accel/-/merge_requests/69
 55 | 
 56 | ### Others
 57 | 
 58 | - Add cargo-clippy and cargo-fmt tests on CI https://gitlab.com/termoshtt/accel/-/merge_requests/65
 59 | 
 60 | 0.3.0-alpha.2 - 2020-04-06
 61 | ----------------------------
 62 | 
 63 | - Minimum Supported Rust version to be 1.42
 64 | 
 65 | ### Without CUDA Runtime API
 66 | 
 67 | - Rewrite using [CUDA Driver API](https://docs.nvidia.com/cuda/cuda-driver-api/index.html) https://gitlab.com/termoshtt/accel/-/issues/19
 68 | - Explicit RAII handling of CUDA Context https://gitlab.com/termoshtt/accel/-/merge_requests/51
 69 | - CUDA Managed memories
 70 |   - Device memory https://gitlab.com/termoshtt/accel/-/merge_requests/40
 71 |   - Page-locked host memory https://gitlab.com/termoshtt/accel/-/merge_requests/47
 72 | - CUDA Stream / Event handlers https://gitlab.com/termoshtt/accel/-/merge_requests/52
 73 |     - Asynchronous Kernel launch
 74 | 
 75 | ### alloc for device code
 76 | 
 77 | - Global allocator using CUDA's malloc/free https://gitlab.com/termoshtt/accel/-/merge_requests/26
 78 | - `println!`, `assert_eq!` support https://gitlab.com/termoshtt/accel/-/merge_requests/25
 79 | 
 80 | ### Move to GitLab
 81 | 
 82 | - GitHub Actions has several problems
 83 |   - https://github.com/rust-accel/docker-action
 84 |   - https://github.com/rust-accel/container
 85 | - GPU hosted runner for GitLab CI is now working on an instance managed by RICOS Co. Ltd. https://gitlab.com/termoshtt/accel/-/merge_requests/28
 86 | 
 87 | 0.3.0-alpha.1 - 2020-01-12
 88 | ---------------------------
 89 | 
 90 | [Restart Accel Project!](https://github.com/rust-accel/accel/issues/64)
 91 | 
 92 | ### Stable Rust
 93 | 
 94 | Stabilize Host-side code, though device-side code still requires nightly.
 95 | 
 96 | - Rust 2018 edition https://github.com/rust-accel/accel/pull/70
 97 | - proc-macro has been stabilized as https://github.com/rust-accel/accel/pull/63
 98 | - cargo check runs on stable Rust https://github.com/rust-accel/accel/pull/66
 99 | 
100 | ### Update dependencies
101 | 
102 | - syn, quote, proc-macro2 1.0 https://github.com/rust-accel/accel/pull/67
103 | - rust-cuda/cuda-{runtime,driver}-sys 0.3.0-alpha.1 https://github.com/rust-accel/accel/pull/66
104 | 
105 | ### rust-ptx-linker
106 | 
107 | Linker flavor using rust-ptx-linker has been merged into rustc https://github.com/rust-lang/rust/pull/57937
108 | 
109 | - Rewrite accel-derive with rust-ptx-linker https://github.com/rust-accel/accel/pull/71
110 | - archive [nvptx](https://github.com/rust-accel/nvptx) and other crates
111 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [workspace]
 2 | members = [
 3 |   "accel",
 4 |   "accel-derive",
 5 | ]
 6 | 
 7 | exclude = [
 8 | "accel-core",
 9 | ]
10 | 


--------------------------------------------------------------------------------
/LICENSE-APACHE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Toshiki Teramura
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Licence
2 | --------
3 | Dual-licensed to be compatible with the Rust project.
4 | 
5 | - [Apache License, Version 2.0](./LICENSE-APACHE)
6 | - [the MIT license](./LICENSE-MIT)
7 | 
8 | In addition, you must refer [End User License Agreement](https://docs.nvidia.com/cuda/eula/index.html) for using CUDA.
9 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Accel: GPGPU Framework for Rust
 2 | ================================
 3 | 
 4 | [![pipeline status](https://gitlab.com/termoshtt/accel/badges/master/pipeline.svg)](https://gitlab.com/termoshtt/accel/commits/master)
 5 | 
 6 | |crate       |crates.io                                                                   |docs.rs                                                                |GitLab Pages                                                                  |                                           |
 7 | |:-----------|:---------------------------------------------------------------------------|:----------------------------------------------------------------------|:-----------------------------------------------------------------------------|:------------------------------------------|
 8 | |accel       |[![Crate](http://meritbadge.herokuapp.com/accel)][crate/accel]              |[![docs.rs](https://docs.rs/accel/badge.svg)][docs/accel]              |[![cargo-doc](https://img.shields.io/badge/doc-master-blue)][dev/accel]       |CUDA-based GPGPU framework                 |
 9 | |accel-core  |[![Crate](http://meritbadge.herokuapp.com/accel-core)][crate/accel-core]    |[![docs.rs](https://docs.rs/accel-core/badge.svg)][docs/accel-core]    |[![cargo-doc](https://img.shields.io/badge/doc-master-blue)][dev/accel-core]  |Helper for writing device code             |
10 | |accel-derive|[![Crate](http://meritbadge.herokuapp.com/accel-derive)][crate/accel-derive]|[![docs.rs](https://docs.rs/accel-derive/badge.svg)][docs/accel-derive]|[![cargo-doc](https://img.shields.io/badge/doc-master-blue)][dev/accel-derive]|Procedural macro for generating kernel code|
11 | 
12 | [crate/accel]:        https://crates.io/crates/accel/0.3.0
13 | [crate/accel-core]:   https://crates.io/crates/accel-core/0.3.0
14 | [crate/accel-derive]: https://crates.io/crates/accel-derive/0.3.0
15 | 
16 | [docs/accel]:        https://docs.rs/accel/0.3.0
17 | [docs/accel-core]:   https://docs.rs/accel-core/0.3.0
18 | [docs/accel-derive]: https://docs.rs/accel-derive/0.3.0
19 | 
20 | [dev/accel]:        https://termoshtt.gitlab.io/accel/accel/accel
21 | [dev/accel-core]:   https://termoshtt.gitlab.io/accel/accel/accel_core
22 | [dev/accel-derive]: https://termoshtt.gitlab.io/accel/accel/accel_derive
23 | 
24 | Requirements
25 | ------------
26 | ![minimum supported rust version](https://img.shields.io/badge/rustc-1.42+-red.svg)
27 | 
28 | - Minimum Supported Rust Version (MSRV) is 1.42.0
29 | - Install [CUDA](https://developer.nvidia.com/cuda-downloads) on your system
30 |   - accel depends on CUDA Device APIs through [rust-cuda/cuda-sys](https://github.com/rust-cuda/cuda-sys)
31 |   - accel does not depend on CUDA Runtime APIs. It means that a compiled binary requires only `libcuda.so` at runtime, which is far lighter than entire CUDA development toolkit.
32 | - Setup NVPTX target of Rust
33 |   - Install `nightly-2020-05-01` toolchain with  `nvptx64-nvidia-cuda` target, and [rust-ptx-linker](https://github.com/denzp/rust-ptx-linker)
34 |   - There is an [setup script](setup_nvptx_toolchain.sh) for them:
35 | 
36 | ```
37 | curl -sSL https://gitlab.com/termoshtt/accel/raw/master/setup_nvptx_toolchain.sh | bash
38 | ```
39 | 
40 | Or, you can use [docker container](./docker)
41 | 
42 | Limitations
43 | ------------
44 | This project is still in early stage. There are several limitations as following:
45 | 
46 | - For runtime on CPU
47 |   - [Windows](https://gitlab.com/termoshtt/accel/-/issues/25) and macOS are not supported
48 |   - [f64](https://gitlab.com/termoshtt/accel/-/issues/53) and [Complex number](https://gitlab.com/termoshtt/accel/-/issues/54) supports are missing
49 |   - [Texture/Surface object handling](https://gitlab.com/termoshtt/accel/-/issues/40) is missing
50 |   - Async features based on CUDA Stream and Events are disabled until [async/.await support](https://gitlab.com/termoshtt/accel/-/issues/4)
51 | 
52 | - For writing GPU kernel code
53 |   - [libstd cannot be used in writing kernel](https://gitlab.com/termoshtt/accel/-/issues/38)
54 |   - [Rust slice cannot be used in writing kernel](https://gitlab.com/termoshtt/accel/-/issues/7)
55 |   - [Shared memory](https://gitlab.com/termoshtt/accel/-/issues/39) cannot be used
56 | 
57 | Contribution
58 | ------------
59 | This project is developed on [GitLab](https://gitlab.com/termoshtt/accel) and mirrored to [GitHub](https://github.com/rust-accel/accel).
60 | 
61 | Sponsors
62 | --------
63 | - [RICOS Co. Ltd](https://www.ricos.co.jp/)
64 |   - GPU instances for CI and development
65 | 
66 | Links
67 | ------
68 | 
69 | Projects which accel depends on:
70 | 
71 | - [rust-cuda/cuda-sys](https://github.com/rust-cuda/cuda-sys): CUDA Runtime and Driver API binding to Rust
72 | - [denzp/rust-ptx-linker](https://github.com/denzp/rust-ptx-linker): Linker for PTX files generated by `rustc`
73 | 
74 | Related Projects:
75 | 
76 | - [rust-cuda/wg](https://github.com/rust-cuda/wg): Working group for Rust CUDA Team
77 | - [denzp/rust-ptx-builder](https://github.com/denzp/rust-ptx-builder): Another CUDA kernel builder from Rust crate
78 | - [bheisler/RustaCUDA](https://github.com/bheisler/RustaCUDA): Another CUDA-based Rust framework
79 | 


--------------------------------------------------------------------------------
/accel-core/.cargo/config:
--------------------------------------------------------------------------------
1 | [build]
2 | target = "nvptx64-nvidia-cuda"
3 | 


--------------------------------------------------------------------------------
/accel-core/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "accel-core"
 3 | version = "0.3.0"
 4 | authors = ["Toshiki Teramura <toshiki.teramura@gmail.com>"]
 5 | edition = "2018"
 6 | 
 7 | description   = "Support crate for writing GPGPU kernels using accel"
 8 | documentation = "https://docs.rs/accel-core/"
 9 | repository    = "https://github.com/termoshtt/accel"
10 | keywords      = ["GPGPU", "CUDA", "platform-intrinsic"]
11 | license       = "MIT/Apache-2.0"
12 | readme        = "README.md"
13 | categories    = []
14 | 
15 | [package.metadata.docs.rs]
16 | targets = ["nvptx64-nvidia-cuda"]
17 | 


--------------------------------------------------------------------------------
/accel-core/README.md:
--------------------------------------------------------------------------------
1 | accel-core
2 | ===========
3 | 
4 | [![Crate](http://meritbadge.herokuapp.com/accel-core)](https://crates.io/crates/accel-core)
5 | [![docs.rs](https://docs.rs/accel-core/badge.svg)](https://docs.rs/accel-core)
6 | 
7 | Support crate for writing kernels
8 | 


--------------------------------------------------------------------------------
/accel-core/rust-toolchain:
--------------------------------------------------------------------------------
1 | nightly-2020-05-01
2 | 


--------------------------------------------------------------------------------
/accel-core/src/lib.rs:
--------------------------------------------------------------------------------
  1 | //! Support crate for writting GPU kernel in Rust
  2 | //!
  3 | //! - This crate works only for `nvptx64-nvidia-cuda` target
  4 | //! - There is no support of `libstd` for `nvptx64-nvidia-cuda` target,
  5 | //!   i.e. You need to write `#![no_std]` Rust code.
  6 | //! - `alloc` crate is supported by `accel_core::PTXAllocator` which utilizes CUDA malloc/free system-calls
  7 | //!   - You can use `println!` and `assert_eq!` throught it.
  8 | 
  9 | #![feature(stdsimd)]
 10 | #![no_std]
 11 | 
 12 | extern crate alloc;
 13 | 
 14 | use alloc::alloc::*;
 15 | use core::arch::nvptx;
 16 | 
 17 | /// Memory allocator using CUDA malloc/free
 18 | pub struct PTXAllocator;
 19 | 
 20 | unsafe impl GlobalAlloc for PTXAllocator {
 21 |     unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
 22 |         nvptx::malloc(layout.size()) as *mut u8
 23 |     }
 24 |     unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) {
 25 |         nvptx::free(ptr as *mut _);
 26 |     }
 27 | }
 28 | 
 29 | /// Alternative of [std::print!](https://doc.rust-lang.org/std/macro.print.html) using CUDA `vprintf` system-call
 30 | #[macro_export]
 31 | macro_rules! print {
 32 |     ($($arg:tt)*) => {
 33 |         let msg = ::alloc::format!($($arg)*);
 34 |         unsafe {
 35 |             ::core::arch::nvptx::vprintf(msg.as_ptr(), ::core::ptr::null_mut());
 36 |         }
 37 |     }
 38 | }
 39 | 
 40 | /// Alternative of [std::println!](https://doc.rust-lang.org/std/macro.println.html) using CUDA `vprintf` system-call
 41 | #[macro_export]
 42 | macro_rules! println {
 43 |     () => ($crate::print!("\n"));
 44 |     ($fmt:expr) => ($crate::print!(concat!($fmt, "\n")));
 45 |     ($fmt:expr, $($arg:tt)*) => ($crate::print!(concat!($fmt, "\n"), $($arg)*));
 46 | }
 47 | 
 48 | /// Assertion in GPU kernel for two expressions are equal.
 49 | ///
 50 | /// If assertion failed, accel API will return [accel::error::AccelError::DeviceAssertionFailed](https://docs.rs/accel/0.3.0-alpha.2/accel/error/enum.AccelError.html#variant.DeviceAssertionFailed)
 51 | #[macro_export]
 52 | macro_rules! assert_eq {
 53 |     ($a:expr, $b:expr) => {
 54 |         if $a != $b {
 55 |             let msg = alloc::format!(
 56 |                 "\nassertion failed: ({} == {})\nleft : {:?}\nright: {:?}",
 57 |                 stringify!($a),
 58 |                 stringify!($b),
 59 |                 $a,
 60 |                 $b
 61 |             );
 62 |             unsafe {
 63 |                 ::core::arch::nvptx::__assert_fail(
 64 |                     msg.as_ptr(),
 65 |                     file!().as_ptr(),
 66 |                     line!(),
 67 |                     // FIXME cannot get function name.
 68 |                     // See https://github.com/rust-lang/rfcs/pull/2818
 69 |                     "".as_ptr(),
 70 |                 )
 71 |             };
 72 |         }
 73 |     };
 74 | }
 75 | 
 76 | /// Assertion in GPU kernel for two expressions are not equal.
 77 | ///
 78 | /// If assertion failed, accel API will return [accel::error::AccelError::DeviceAssertionFailed](https://docs.rs/accel/0.3.0-alpha.2/accel/error/enum.AccelError.html#variant.DeviceAssertionFailed)
 79 | #[macro_export]
 80 | macro_rules! assert_ne {
 81 |     ($a:expr, $b:expr) => {
 82 |         if $a == $b {
 83 |             let msg = alloc::format!(
 84 |                 "\nassertion failed: ({} != {})\nleft : {:?}\nright: {:?}",
 85 |                 stringify!($a),
 86 |                 stringify!($b),
 87 |                 $a,
 88 |                 $b
 89 |             );
 90 |             unsafe {
 91 |                 ::core::arch::nvptx::__assert_fail(
 92 |                     msg.as_ptr(),
 93 |                     file!().as_ptr(),
 94 |                     line!(),
 95 |                     // FIXME cannot get function name.
 96 |                     // See https://github.com/rust-lang/rfcs/pull/2818
 97 |                     "".as_ptr(),
 98 |                 )
 99 |             };
100 |         }
101 |     };
102 | }
103 | 
104 | /// Dimension specified in kernel launching
105 | pub struct Dim3 {
106 |     pub x: i32,
107 |     pub y: i32,
108 |     pub z: i32,
109 | }
110 | 
111 | /// Indices where the kernel code running on
112 | pub struct Idx3 {
113 |     pub x: i32,
114 |     pub y: i32,
115 |     pub z: i32,
116 | }
117 | 
118 | pub fn block_dim() -> Dim3 {
119 |     unsafe {
120 |         Dim3 {
121 |             x: nvptx::_block_dim_x(),
122 |             y: nvptx::_block_dim_y(),
123 |             z: nvptx::_block_dim_z(),
124 |         }
125 |     }
126 | }
127 | 
128 | pub fn block_idx() -> Idx3 {
129 |     unsafe {
130 |         Idx3 {
131 |             x: nvptx::_block_idx_x(),
132 |             y: nvptx::_block_idx_y(),
133 |             z: nvptx::_block_idx_z(),
134 |         }
135 |     }
136 | }
137 | 
138 | pub fn grid_dim() -> Dim3 {
139 |     unsafe {
140 |         Dim3 {
141 |             x: nvptx::_grid_dim_x(),
142 |             y: nvptx::_grid_dim_y(),
143 |             z: nvptx::_grid_dim_z(),
144 |         }
145 |     }
146 | }
147 | 
148 | pub fn thread_idx() -> Idx3 {
149 |     unsafe {
150 |         Idx3 {
151 |             x: nvptx::_thread_idx_x(),
152 |             y: nvptx::_thread_idx_y(),
153 |             z: nvptx::_thread_idx_z(),
154 |         }
155 |     }
156 | }
157 | 
158 | impl Dim3 {
159 |     pub fn size(&self) -> i32 {
160 |         (self.x * self.y * self.z)
161 |     }
162 | }
163 | 
164 | impl Idx3 {
165 |     pub fn into_id(&self, dim: Dim3) -> i32 {
166 |         self.x + self.y * dim.x + self.z * dim.x * dim.y
167 |     }
168 | }
169 | 
170 | pub fn index() -> isize {
171 |     let block_id = block_idx().into_id(grid_dim());
172 |     let thread_id = thread_idx().into_id(block_dim());
173 |     (block_id + thread_id) as isize
174 | }
175 | 


--------------------------------------------------------------------------------
/accel-derive/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "accel-derive"
 3 | version = "0.3.0"
 4 | authors = ["Toshiki Teramura <toshiki.teramura@gmail.com>"]
 5 | edition = "2018"
 6 | 
 7 | description   = "Procedual macro for writing GPGPU kernel"
 8 | documentation = "https://docs.rs/accel-derive/"
 9 | repository    = "https://github.com/termoshtt/accel"
10 | keywords      = ["GPGPU", "CUDA", "proc-macro"]
11 | license       = "MIT/Apache-2.0"
12 | readme        = "README.md"
13 | categories    = []
14 | 
15 | [lib]
16 | proc-macro = true
17 | 
18 | [dependencies]
19 | proc-macro-crate = "0.1"
20 | proc-macro2 = "1.0.18"
21 | quote = "1.0.6"
22 | syn = { version = "1.0.30", features = ["full", "extra-traits"] }
23 | 
24 | dirs = "2.0.2"
25 | maplit = "1.0.2"
26 | serde = { version = "1.0.111", features = ["derive"] }
27 | toml = "0.5.6"
28 | 
29 | failure = "0.1.8"
30 | anyhow = "1.0.31"
31 | 
32 | [dev-dependencies]
33 | trybuild = "1.0.27"
34 | accel = { version = "0.4.0-alpha.0", path = "../accel" }
35 | 


--------------------------------------------------------------------------------
/accel-derive/README.md:
--------------------------------------------------------------------------------
 1 | accel-derive
 2 | =============
 3 | 
 4 | [![Crate](http://meritbadge.herokuapp.com/accel-derive)](https://crates.io/crates/accel-derive)
 5 | [![docs.rs](https://docs.rs/accel-derive/badge.svg)](https://docs.rs/accel-derive)
 6 | 
 7 | Procedural-macro crate for `#[kernel]`. `#[kernel]` function will be converted to two part:
 8 | 
 9 | - Device code will be compiled into PTX assembler
10 | - Host code which call the generated device code (PTX asm) using `accel::module` API
11 | 
12 | ![Compile flow graph](../diagrams/compile_flow.png)
13 | 


--------------------------------------------------------------------------------
/accel-derive/src/builder.rs:
--------------------------------------------------------------------------------
  1 | use crate::parser::*;
  2 | use failure::*;
  3 | use quote::quote;
  4 | use std::{
  5 |     collections::{hash_map::DefaultHasher, HashMap},
  6 |     env, fs,
  7 |     hash::*,
  8 |     io::{Read, Write},
  9 |     path::*,
 10 |     process::Command,
 11 | };
 12 | 
 13 | const NIGHTLY_VERSION: &str = "nightly-2020-05-01";
 14 | 
 15 | trait CheckRun {
 16 |     fn check_run(&mut self) -> Fallible<()>;
 17 | }
 18 | 
 19 | impl CheckRun for Command {
 20 |     fn check_run(&mut self) -> Fallible<()> {
 21 |         // Filter CARGO_* and OUT_DIR envs
 22 |         let filtered_env: HashMap<String, String> = env::vars()
 23 |             .filter(|&(ref k, _)| !(k.starts_with("CARGO") || k == "OUT_DIR"))
 24 |             .collect();
 25 |         let output = self.env_clear().envs(&filtered_env).output()?;
 26 |         if !output.status.success() {
 27 |             println!("{}", std::str::from_utf8(&output.stdout)?);
 28 |             eprintln!("{}", std::str::from_utf8(&output.stderr)?);
 29 |             bail!("External command failed: {:?}", self);
 30 |         }
 31 |         Ok(())
 32 |     }
 33 | }
 34 | 
 35 | /// Generate Rust code for nvptx64-nvidia-cuda target from tokens
 36 | fn ptx_kernel(func: &syn::ItemFn) -> String {
 37 |     let vis = &func.vis;
 38 |     let ident = &func.sig.ident;
 39 |     let unsafety = &func.sig.unsafety;
 40 |     let block = &func.block;
 41 | 
 42 |     let fn_token = &func.sig.fn_token;
 43 |     let inputs = &func.sig.inputs;
 44 |     let output = &func.sig.output;
 45 | 
 46 |     let kernel = quote! {
 47 |         #![feature(abi_ptx, stdsimd, alloc_error_handler)]
 48 |         #![no_std]
 49 |         extern crate alloc;
 50 |         #[global_allocator]
 51 |         static _GLOBAL_ALLOCATOR: accel_core::PTXAllocator = accel_core::PTXAllocator;
 52 |         #[no_mangle]
 53 |         #vis #unsafety extern "ptx-kernel" #fn_token #ident(#inputs) #output #block
 54 |         #[panic_handler]
 55 |         fn panic(_info: &::core::panic::PanicInfo) -> ! {
 56 |             unsafe { ::core::arch::nvptx::trap() }
 57 |         }
 58 |         #[alloc_error_handler]
 59 |         fn alloc_error_handler(_: core::alloc::Layout) -> ! {
 60 |             unsafe { ::core::arch::nvptx::trap() }
 61 |         }
 62 |     };
 63 |     kernel.to_string()
 64 | }
 65 | 
 66 | fn calc_hash<T: Hash>(t: &T) -> u64 {
 67 |     let mut s = DefaultHasher::new();
 68 |     t.hash(&mut s);
 69 |     s.finish()
 70 | }
 71 | 
 72 | fn project_id() -> String {
 73 |     let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
 74 |     let hash = calc_hash(&manifest_dir);
 75 |     let stem = PathBuf::from(manifest_dir)
 76 |         .file_stem()
 77 |         .unwrap()
 78 |         .to_str()
 79 |         .unwrap()
 80 |         .to_string();
 81 |     format!("{}-{:x}", stem, hash)
 82 | }
 83 | 
 84 | pub fn compile_tokens(func: &syn::ItemFn) -> Fallible<String> {
 85 |     let meta = MetaData::from_token(func)?;
 86 | 
 87 |     // Create crate
 88 |     let dir = dirs::cache_dir()
 89 |         .unwrap()
 90 |         .join("accel-derive")
 91 |         .join(project_id())
 92 |         .join(meta.name());
 93 |     fs::create_dir_all(dir.join("src"))?;
 94 | 
 95 |     // Generate lib.rs and write into a file
 96 |     let mut lib_rs = fs::File::create(dir.join("src/lib.rs"))?;
 97 |     lib_rs.write_all(ptx_kernel(func).as_bytes())?;
 98 |     lib_rs.sync_data()?;
 99 | 
100 |     // Generate Cargo.toml
101 |     let mut cargo_toml = fs::File::create(dir.join("Cargo.toml"))?;
102 |     cargo_toml.write_all(toml::to_string(&meta)?.as_bytes())?;
103 |     cargo_toml.sync_data()?;
104 | 
105 |     // Build
106 |     Command::new("cargo")
107 |         .args(&[&format!("+{}", NIGHTLY_VERSION), "fmt"])
108 |         .current_dir(&dir)
109 |         .check_run()?;
110 |     Command::new("cargo")
111 |         .args(&[
112 |             &format!("+{}", NIGHTLY_VERSION),
113 |             "build",
114 |             "--release",
115 |             "--target",
116 |             "nvptx64-nvidia-cuda",
117 |         ])
118 |         .current_dir(&dir)
119 |         .check_run()?;
120 | 
121 |     // Read PTX file
122 |     let mut ptx = fs::File::open(dir.join(format!(
123 |         "target/nvptx64-nvidia-cuda/release/{}.ptx",
124 |         meta.name()
125 |     )))?;
126 |     let mut buf = String::new();
127 |     ptx.read_to_string(&mut buf)?;
128 |     Ok(buf)
129 | }
130 | 
131 | #[cfg(test)]
132 | mod tests {
133 |     use super::*;
134 | 
135 |     #[test]
136 |     fn build_do_nothing() {
137 |         let func = syn::parse_str("unsafe fn do_nothing() {}").unwrap();
138 |         let ptx = compile_tokens(&func).unwrap();
139 |         assert!(ptx.len() > 0);
140 |     }
141 | }
142 | 


--------------------------------------------------------------------------------
/accel-derive/src/contexted.rs:
--------------------------------------------------------------------------------
 1 | use proc_macro2::*;
 2 | use quote::quote;
 3 | use syn::*;
 4 | 
 5 | fn seek_context_ident(input: &DeriveInput) -> Ident {
 6 |     match &input.data {
 7 |         syn::Data::Struct(syn::DataStruct { fields, .. }) => match fields {
 8 |             Fields::Named(fields_named) => {
 9 |                 for field in fields_named.named.iter() {
10 |                     let field = field.ident.clone().unwrap();
11 |                     if field.to_string() == "context" || field.to_string() == "ctx" {
12 |                         return field;
13 |                     }
14 |                 }
15 |             }
16 |             _ => unreachable!("Must be named field"),
17 |         },
18 |         _ => unreachable!("Must be a struct"),
19 |     };
20 |     unreachable!("context or ctx not found")
21 | }
22 | 
23 | pub fn contexted(input: DeriveInput) -> TokenStream {
24 |     let name = &input.ident;
25 |     let generics = &input.generics;
26 |     let context_ident = seek_context_ident(&input);
27 |     quote! {
28 |         impl #generics Contexted for #name #generics {
29 |             fn sync(&self) -> Result<()> {
30 |                 self.#context_ident.sync()
31 |             }
32 | 
33 |             fn version(&self) -> Result<u32> {
34 |                 self.#context_ident.version()
35 |             }
36 | 
37 |             fn guard(&self) -> Result<ContextGuard> {
38 |                 self.#context_ident.guard()
39 |             }
40 | 
41 |             fn get_ref(&self) -> ContextRef {
42 |                 self.#context_ident.get_ref()
43 |             }
44 |         }
45 |     }
46 | }
47 | 


--------------------------------------------------------------------------------
/accel-derive/src/host.rs:
--------------------------------------------------------------------------------
  1 | use proc_macro2::{Span, TokenStream};
  2 | use quote::quote;
  3 | 
  4 | /// Split out types from function definition
  5 | ///
  6 | /// - Reference type, e.g. `&i32` will be modified into lifetimed reference `&'arg i32`
  7 | ///
  8 | fn input_types(func: &syn::ItemFn) -> Vec<syn::Type> {
  9 |     func.sig
 10 |         .inputs
 11 |         .iter()
 12 |         .map(|arg| match arg {
 13 |             syn::FnArg::Typed(ref val) => {
 14 |                 let mut ty = *val.ty.clone();
 15 |                 match &mut ty {
 16 |                     syn::Type::Reference(re) => {
 17 |                         re.lifetime = Some(syn::Lifetime::new("'arg", Span::call_site()))
 18 |                     }
 19 |                     _ => {}
 20 |                 }
 21 |                 ty
 22 |             }
 23 |             _ => panic!("Unsupported kernel input type sigunature"),
 24 |         })
 25 |         .collect()
 26 | }
 27 | 
 28 | fn accel_path() -> String {
 29 |     if let Ok(name) = proc_macro_crate::crate_name("accel") {
 30 |         // accel exists as an external crate
 31 |         return name;
 32 |     }
 33 | 
 34 |     if std::env::var("CARGO_PKG_NAME").unwrap() == "accel" {
 35 |         // doctest in accel
 36 |         //
 37 |         // "--crate-type bin" should be specified for doctest
 38 |         let mut find_flag = false;
 39 |         for arg in std::env::args() {
 40 |             if arg == "--crate-type" {
 41 |                 find_flag = true;
 42 |             }
 43 |             if find_flag {
 44 |                 if arg == "bin" {
 45 |                     return "accel".into();
 46 |                 }
 47 |             }
 48 |         }
 49 | 
 50 |         // in accel crate
 51 |         return "crate".into();
 52 |     }
 53 |     unreachable!("Cannot determine accel crate name");
 54 | }
 55 | 
 56 | fn impl_submodule(ptx_str: &str, func: &syn::ItemFn) -> TokenStream {
 57 |     let input_types = input_types(func);
 58 |     let accel = accel_path();
 59 | 
 60 |     let launchable: syn::Path = syn::parse_str(&format!(
 61 |         "{}::execution::Launchable{}",
 62 |         accel,
 63 |         input_types.len()
 64 |     ))
 65 |     .unwrap();
 66 | 
 67 |     let targets: Vec<syn::Ident> = (1..=input_types.len())
 68 |         .into_iter()
 69 |         .map(|k| syn::Ident::new(&format!("Target{}", k), Span::call_site()))
 70 |         .collect();
 71 | 
 72 |     let ident = &func.sig.ident;
 73 | 
 74 |     let accel = syn::Ident::new(&accel, Span::call_site());
 75 |     let kernel_name = quote! { #ident }.to_string();
 76 |     quote! {
 77 |         /// Auto-generated by accel-derive
 78 |         mod #ident {
 79 |             pub const PTX_STR: &'static str = #ptx_str;
 80 | 
 81 |             pub struct Module(#accel::Module);
 82 | 
 83 |             impl Module {
 84 |                 pub fn new(ctx: &#accel::Context) -> #accel::error::Result<Self> {
 85 |                     Ok(Module(#accel::Module::from_str(ctx, PTX_STR)?))
 86 |                 }
 87 |             }
 88 | 
 89 |             impl<'arg> #launchable <'arg> for Module {
 90 |                 #(
 91 |                     type #targets = #input_types;
 92 |                 )*
 93 |                 fn get_kernel(&self) -> #accel::error::Result<#accel::Kernel> {
 94 |                     Ok(self.0.get_kernel(#kernel_name)?)
 95 |                 }
 96 |             }
 97 |         }
 98 |     }
 99 | }
100 | 
101 | fn caller(func: &syn::ItemFn) -> TokenStream {
102 |     let accel = accel_path();
103 |     let vis = &func.vis;
104 |     let ident = &func.sig.ident;
105 |     let fn_token = &func.sig.fn_token;
106 | 
107 |     let input_types = input_types(func);
108 | 
109 |     let args_types: Vec<syn::Ident> = (1..=input_types.len())
110 |         .into_iter()
111 |         .map(|k| syn::Ident::new(&format!("Arg{}", k), Span::call_site()))
112 |         .collect();
113 | 
114 |     let launchable: syn::Path = syn::parse_str(&format!(
115 |         "{}::execution::Launchable{}",
116 |         accel,
117 |         input_types.len()
118 |     ))
119 |     .unwrap();
120 | 
121 |     let accel = syn::Ident::new(&accel, Span::call_site());
122 | 
123 |     quote! {
124 |         #vis #fn_token #ident<'arg, #(#args_types),* >(
125 |             ctx: &#accel::Context,
126 |             grid: impl Into<#accel::Grid>,
127 |             block: impl Into<#accel::Block>,
128 |             args: (#(#args_types,)*)
129 |         ) -> #accel::error::Result<()>
130 |         where
131 |             #(
132 |                 #args_types: #accel::execution::DeviceSend<Target = #input_types>
133 |             ),*
134 |         {
135 |             use #launchable;
136 |             let module = #ident::Module::new(ctx)?;
137 |             module.launch(grid, block, args)?;
138 |             Ok(())
139 |         }
140 |     }
141 | }
142 | 
143 | pub fn func2caller(ptx_str: &str, func: &syn::ItemFn) -> TokenStream {
144 |     let impl_submodule = impl_submodule(ptx_str, func);
145 |     let caller = caller(func);
146 |     quote! {
147 |         #impl_submodule
148 |         #caller
149 |     }
150 | }
151 | 
152 | #[cfg(test)]
153 | mod tests {
154 |     use anyhow::Result;
155 |     use std::{
156 |         io::Write,
157 |         process::{Command, Stdio},
158 |     };
159 | 
160 |     const TEST_KERNEL: &'static str = r#"
161 |     fn kernel_name(arg1: i32, arg2: f64) {}
162 |     "#;
163 | 
164 |     /// Format TokenStream by rustfmt
165 |     ///
166 |     /// This can test if the input TokenStream is valid in terms of rustfmt.
167 |     fn pretty_print(tt: &impl ToString) -> Result<()> {
168 |         let mut fmt = Command::new("rustfmt")
169 |             .stdin(Stdio::piped())
170 |             .stdout(Stdio::piped())
171 |             .spawn()?;
172 |         fmt.stdin
173 |             .as_mut()
174 |             .unwrap()
175 |             .write(tt.to_string().as_bytes())?;
176 |         let out = fmt.wait_with_output()?;
177 |         println!("{}", String::from_utf8_lossy(&out.stdout));
178 |         Ok(())
179 |     }
180 | 
181 |     #[test]
182 |     fn impl_submodule() -> Result<()> {
183 |         let func: syn::ItemFn = syn::parse_str(TEST_KERNEL)?;
184 |         let ts = super::impl_submodule("", &func);
185 |         pretty_print(&ts)?;
186 |         Ok(())
187 |     }
188 | 
189 |     #[test]
190 |     fn caller() -> Result<()> {
191 |         let func: syn::ItemFn = syn::parse_str(TEST_KERNEL)?;
192 |         let ts = super::caller(&func);
193 |         pretty_print(&ts)?;
194 |         Ok(())
195 |     }
196 | }
197 | 


--------------------------------------------------------------------------------
/accel-derive/src/launchable.rs:
--------------------------------------------------------------------------------
  1 | use proc_macro2::{Span, TokenStream};
  2 | use quote::quote;
  3 | pub fn generate(item: TokenStream) -> TokenStream {
  4 |     let literal: syn::LitInt = syn::parse2(item).unwrap();
  5 |     let n: usize = literal.base10_parse().unwrap();
  6 |     (0..=n)
  7 |         .into_iter()
  8 |         .map(|i| {
  9 |             let name = syn::Ident::new(&format!("Launchable{}", i), Span::call_site());
 10 |             let targets: Vec<syn::Ident> = (1..=i)
 11 |                 .into_iter()
 12 |                 .map(|k| syn::Ident::new(&format!("Target{}", k), Span::call_site()))
 13 |                 .collect();
 14 |             let args_value: Vec<syn::Ident> = (1..=i)
 15 |                 .into_iter()
 16 |                 .map(|k| syn::Ident::new(&format!("arg{}", k), Span::call_site()))
 17 |                 .collect();
 18 |             let args_types: Vec<syn::Ident> = (1..=i)
 19 |                 .into_iter()
 20 |                 .map(|k| syn::Ident::new(&format!("Arg{}", k), Span::call_site()))
 21 |                 .collect();
 22 |             quote! {
 23 |                 /// Launchable Kernel with N-arguments
 24 |                 ///
 25 |                 /// This is auto-generated by `accel_derive::define_launchable!` proc-macro.
 26 |                 /// See [module level document](index.html) for detail.
 27 |                 pub trait #name <'arg> {
 28 |                     #(
 29 |                         type #targets;
 30 |                     )*
 31 |                     fn get_kernel(&self) -> Result<Kernel>;
 32 |                     fn launch<#(#args_types),*>(
 33 |                         &self,
 34 |                         grid: impl Into<Grid>,
 35 |                         block: impl Into<Block>,
 36 |                         (#(#args_value,)*): (#(#args_types,)*),
 37 |                     ) -> Result<()>
 38 |                     where
 39 |                         #(
 40 |                             #args_types: DeviceSend<Target = Self::#targets>
 41 |                         ),*
 42 |                     {
 43 |                         let grid = grid.into();
 44 |                         let block = block.into();
 45 |                         let kernel = self.get_kernel()?;
 46 |                         let mut args = [#(#args_value.as_kernel_parameter()),*];
 47 |                         unsafe {
 48 |                             contexted_call!(
 49 |                                 &kernel,
 50 |                                 cuLaunchKernel,
 51 |                                 kernel.func,
 52 |                                 grid.x,
 53 |                                 grid.y,
 54 |                                 grid.z,
 55 |                                 block.x,
 56 |                                 block.y,
 57 |                                 block.z,
 58 |                                 0,          /* FIXME: no shared memory */
 59 |                                 null_mut(), /* use default stream */
 60 |                                 args.as_mut_ptr(),
 61 |                                 null_mut() /* no extra */
 62 |                             )?;
 63 |                         }
 64 |                         kernel.sync()?;
 65 |                         Ok(())
 66 |                     }
 67 | 
 68 |                     fn launch_async<#(#args_types),*>(
 69 |                         &self,
 70 |                         grid: impl Into<Grid>,
 71 |                         block: impl Into<Block>,
 72 |                         (#(#args_value,)*): (#(#args_types,)*),
 73 |                     ) -> ::futures::future::BoxFuture<'arg, Result<()>>
 74 |                     where
 75 |                         #(
 76 |                             #args_types: DeviceSend<Target = Self::#targets> + 'arg
 77 |                         ),*
 78 |                     {
 79 |                         let grid = grid.into();
 80 |                         let block = block.into();
 81 |                         let kernel = self.get_kernel().unwrap();
 82 |                         let stream = stream::Stream::new(kernel.get_ref());
 83 |                         let mut args = [#(#args_value.as_kernel_parameter()),*];
 84 |                         unsafe {
 85 |                             contexted_call!(
 86 |                                 &kernel,
 87 |                                 cuLaunchKernel,
 88 |                                 kernel.func,
 89 |                                 grid.x,
 90 |                                 grid.y,
 91 |                                 grid.z,
 92 |                                 block.x,
 93 |                                 block.y,
 94 |                                 block.z,
 95 |                                 0, /* FIXME: no shared memory */
 96 |                                 stream.stream,
 97 |                                 args.as_mut_ptr(),
 98 |                                 null_mut() /* no extra */
 99 |                             )
100 |                         }
101 |                         .expect("Asynchronous kernel launch has been failed");
102 |                         Box::pin(stream.into_future())
103 |                     }
104 |                 }
105 |             }
106 |         })
107 |         .collect()
108 | }
109 | 


--------------------------------------------------------------------------------
/accel-derive/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![recursion_limit = "128"]
 2 | 
 3 | //! Get compiled PTX as `String`
 4 | //! ----------------------------
 5 | //!
 6 | //! The proc-macro `#[kernel]` creates a submodule `add::` in addition to a function `add`.
 7 | //! Kernel Rust code is compiled into PTX string using rustc's `nvptx64-nvidia-cuda` toolchain.
 8 | //! Generated PTX string is embedded into proc-macro output as `{kernel_name}::PTX_STR`.
 9 | //!
10 | //! ```
11 | //! use accel_derive::kernel;
12 | //!
13 | //! #[kernel]
14 | //! unsafe fn add(a: *const f64, b: *const f64, c: *mut f64, n: usize) {
15 | //!     let i = accel_core::index();
16 | //!     if (i as usize) < n {
17 | //!         *c.offset(i) = *a.offset(i) + *b.offset(i);
18 | //!     }
19 | //! }
20 | //!
21 | //! // PTX assembler code is embedded as `add::PTX_STR`
22 | //! println!("{}", add::PTX_STR);
23 | //! ```
24 | 
25 | mod builder;
26 | mod contexted;
27 | mod host;
28 | mod launchable;
29 | mod parser;
30 | 
31 | use proc_macro::TokenStream;
32 | 
33 | #[proc_macro_attribute]
34 | pub fn kernel(_attr: TokenStream, func: TokenStream) -> TokenStream {
35 |     let func: syn::ItemFn = syn::parse(func).expect("Not a function");
36 |     let ptx_str = builder::compile_tokens(&func).expect("Failed to compile to PTX");
37 |     host::func2caller(&ptx_str, &func).into()
38 | }
39 | 
40 | #[proc_macro_derive(Contexted)]
41 | pub fn contexted(input: TokenStream) -> TokenStream {
42 |     contexted::contexted(syn::parse(input).unwrap()).into()
43 | }
44 | 
45 | #[proc_macro]
46 | pub fn define_launchable(item: TokenStream) -> TokenStream {
47 |     launchable::generate(item.into()).into()
48 | }
49 | 


--------------------------------------------------------------------------------
/accel-derive/src/parser.rs:
--------------------------------------------------------------------------------
  1 | use failure::*;
  2 | use maplit::hashmap;
  3 | use quote::ToTokens;
  4 | use serde::{Deserialize, Serialize};
  5 | use std::collections::HashMap;
  6 | 
  7 | #[derive(Debug, Serialize)]
  8 | pub struct MetaData {
  9 |     package: HashMap<&'static str, String>,
 10 |     lib: HashMap<&'static str, Vec<&'static str>>,
 11 |     dependencies: HashMap<String, Depenency>,
 12 | }
 13 | 
 14 | impl MetaData {
 15 |     fn new(name: &str) -> Self {
 16 |         MetaData {
 17 |             package: hashmap! { "version" => "0.0.0".into(), "name" => name.into(), "edition" => "2018".into() },
 18 |             lib: hashmap! { "crate-type" => vec![ "cdylib" ] },
 19 |             dependencies: HashMap::new(),
 20 |         }
 21 |     }
 22 | 
 23 |     pub fn name(&self) -> &str {
 24 |         &self.package["name"]
 25 |     }
 26 | 
 27 |     pub fn from_token(func: &syn::ItemFn) -> Fallible<Self> {
 28 |         let attrs = &func.attrs;
 29 |         let mut kernel_attrs = MetaData::new(&func.sig.ident.to_string());
 30 |         for attr in attrs {
 31 |             let path = attr.path.to_token_stream().to_string();
 32 |             match path.as_ref() {
 33 |                 "dependencies" => {
 34 |                     let dep = parse_dependency(
 35 |                         attr.tokens
 36 |                             .to_string()
 37 |                             .trim_start_matches('(')
 38 |                             .trim_end_matches(')'),
 39 |                     )?;
 40 |                     for (key, val) in dep {
 41 |                         kernel_attrs.dependencies.insert(key, val);
 42 |                     }
 43 |                 }
 44 |                 "name" => {
 45 |                     let token = attr.tokens.to_string();
 46 |                     let name = token.trim_start_matches('(').trim_end_matches(')').trim();
 47 |                     kernel_attrs.package.insert("name", name.into());
 48 |                 }
 49 |                 _ => {
 50 |                     continue;
 51 |                 }
 52 |             }
 53 |         }
 54 |         kernel_attrs
 55 |             .dependencies
 56 |             .entry("accel-core".into())
 57 |             .or_insert_with(|| Depenency::Version("0.3.0-alpha.4".into()));
 58 |         Ok(kernel_attrs)
 59 |     }
 60 | }
 61 | 
 62 | // Should I use `cargo::core::dependency::Depenency`?
 63 | // https://docs.rs/cargo/0.41.0/cargo/core/dependency/struct.Dependency.html
 64 | #[derive(Debug, PartialEq, Serialize, Deserialize)]
 65 | #[serde(untagged, deny_unknown_fields)]
 66 | enum Depenency {
 67 |     Version(String),
 68 |     VersionTable {
 69 |         version: String,
 70 |         #[serde(default)]
 71 |         features: Vec<String>,
 72 |     },
 73 |     Git {
 74 |         git: String,
 75 |         branch: Option<String>,
 76 |         tag: Option<String>,
 77 |         hash: Option<String>,
 78 |         #[serde(default)]
 79 |         features: Vec<String>,
 80 |     },
 81 |     Path {
 82 |         path: String,
 83 |         #[serde(default)]
 84 |         features: Vec<String>,
 85 |     },
 86 | }
 87 | 
 88 | fn parse_dependency(dep: &str) -> Fallible<HashMap<String, Depenency>> {
 89 |     Ok(toml::from_str(&dep.replace("\n", ""))?)
 90 | }
 91 | 
 92 | #[cfg(test)]
 93 | mod tests {
 94 |     #[test]
 95 |     fn parse_dependency() {
 96 |         let map = super::parse_dependency(r#"accel-core = "0.1.1""#).unwrap();
 97 |         dbg!(map);
 98 |         let map = super::parse_dependency(r#"accel-core = { version = "0.1.1" }"#).unwrap();
 99 |         dbg!(map);
100 | 
101 |         let map = super::parse_dependency(
102 |             r#"accel-core = { git = "https://github.com/rust-accel/accel" }"#,
103 |         )
104 |         .unwrap();
105 |         dbg!(map);
106 | 
107 |         let map = super::parse_dependency(
108 |             r#"accel-core = { git = "https://github.com/rust-accel/accel", branch = "master" }"#,
109 |         )
110 |         .unwrap();
111 |         dbg!(map);
112 | 
113 |         // `git` is lacked
114 |         assert!(super::parse_dependency(r#"accel-core = { branch = "master" }"#,).is_err());
115 | 
116 |         // Unsupported tag
117 |         assert!(super::parse_dependency(
118 |             r#"accel-core = { git = "https://github.com/rust-accel/accel", homhom = "master" }"#,
119 |         )
120 |         .is_err());
121 |     }
122 | }
123 | 


--------------------------------------------------------------------------------
/accel-derive/tests/kernels/arguments.rs:
--------------------------------------------------------------------------------
 1 | //! Testing launch arguments are correctly handled
 2 | 
 3 | use accel::*;
 4 | use accel_derive::kernel;
 5 | use anyhow::Result;
 6 | 
 7 | #[kernel]
 8 | pub fn launch(i: i32) {
 9 |     accel_core::println!("i = {}", i);
10 | }
11 | 
12 | fn test() -> Result<()> {
13 |     let device = Device::nth(0)?;
14 |     let ctx = device.create_context();
15 |     let i = 12;
16 |     let grid = Grid::x(1);
17 |     let block = Block::x(4);
18 |     launch(&ctx, grid, block, (i,))?;
19 |     Ok(())
20 | }
21 | 
22 | // Only check `test` can be compiled. not run here
23 | fn main() {}
24 | 


--------------------------------------------------------------------------------
/accel-derive/tests/kernels/dependencies.rs:
--------------------------------------------------------------------------------
 1 | use accel_derive::kernel;
 2 | 
 3 | #[kernel]
 4 | #[dependencies("accel-core" = "0.3.0-alpha.4")]
 5 | unsafe fn version() {
 6 |     let _i = accel_core::index();
 7 | }
 8 | 
 9 | #[kernel]
10 | #[dependencies("accel-core" = { version = "0.3.0-alpha.4" })]
11 | unsafe fn version_table() {
12 |     let _i = accel_core::index();
13 | }
14 | 
15 | fn main() {}
16 | 


--------------------------------------------------------------------------------
/accel-derive/tests/kernels/dependencies_default.rs:
--------------------------------------------------------------------------------
1 | use accel_derive::kernel;
2 | 
3 | #[kernel]
4 | unsafe fn dependencies_default() {
5 |     let _i = accel_core::index(); // accel-core exists
6 | }
7 | 
8 | fn main() {}
9 | 


--------------------------------------------------------------------------------
/accel-derive/tests/kernels/dependencies_git.rs:
--------------------------------------------------------------------------------
 1 | use accel_derive::kernel;
 2 | 
 3 | #[kernel]
 4 | #[dependencies("accel-core" = { git = "https://gitlab.com/termoshtt/accel" })]
 5 | unsafe fn git() {
 6 |     let _i = accel_core::index();
 7 | }
 8 | 
 9 | #[kernel]
10 | #[dependencies("accel-core" = { git = "https://gitlab.com/termoshtt/accel", branch = "master" })]
11 | unsafe fn git_branch() {
12 |     let _i = accel_core::index();
13 | }
14 | 
15 | fn main() {}
16 | 


--------------------------------------------------------------------------------
/accel-derive/tests/kernels/do_nothing.rs:
--------------------------------------------------------------------------------
1 | use accel_derive::kernel;
2 | 
3 | // Build test
4 | #[kernel]
5 | unsafe fn do_nothing() {}
6 | 
7 | fn main() {}
8 | 


--------------------------------------------------------------------------------
/accel-derive/tests/try_build.rs:
--------------------------------------------------------------------------------
 1 | #[test]
 2 | fn kernel_generate() {
 3 |     let t = trybuild::TestCases::new();
 4 |     t.pass("tests/kernels/do_nothing.rs");
 5 |     t.pass("tests/kernels/dependencies.rs");
 6 |     t.pass("tests/kernels/dependencies_git.rs");
 7 |     t.pass("tests/kernels/dependencies_default.rs");
 8 |     t.pass("tests/kernels/arguments.rs");
 9 | }
10 | 


--------------------------------------------------------------------------------
/accel/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "accel"
 3 | version = "0.4.0-alpha.0"
 4 | authors = ["Toshiki Teramura <toshiki.teramura@gmail.com>"]
 5 | edition = "2018"
 6 | 
 7 | description   = "GPGPU Framework for Rust"
 8 | documentation = "https://docs.rs/accel"
 9 | repository    = "https://gitlab.com/termoshtt/accel"
10 | keywords      = ["GPGPU", "CUDA"]
11 | license       = "MIT/Apache-2.0"
12 | readme        = "../README.md"
13 | categories    = []
14 | 
15 | [dependencies]
16 | accel-derive = { version = "0.3.0", path = "../accel-derive" }
17 | bitflags = "1.2.1"
18 | cuda-driver-sys = "0.3.0"
19 | derive-new = "0.5.8"
20 | futures = "0.3.5"
21 | log = "0.4.8"
22 | num-derive = "0.3.0"
23 | num-traits = "0.2.11"
24 | paste = "0.1.15"
25 | thiserror = "1.0.19"
26 | tokio = { version = "0.2.21", features = ["blocking"] }
27 | 
28 | [dev-dependencies]
29 | criterion = "0.3.2"
30 | tokio = { version = "0.2.21", features = ["full"] }
31 | trybuild = "1.0.27"
32 | 
33 | [[bench]]
34 | name = "memcpy"
35 | harness = false
36 | 


--------------------------------------------------------------------------------
/accel/benches/memcpy.rs:
--------------------------------------------------------------------------------
 1 | use accel::*;
 2 | use criterion::*;
 3 | 
 4 | fn h2d(c: &mut Criterion) {
 5 |     let device = Device::nth(0).unwrap();
 6 |     let context = device.create_context();
 7 |     let mut group = c.benchmark_group("h2d");
 8 | 
 9 |     macro_rules! impl_HtoD {
10 |         ($host:expr, $id:expr) => {
11 |             let host = $host;
12 |             let n = host.len();
13 |             let mut dev = DeviceMemory::zeros(&context, n);
14 |             group.bench_with_input(
15 |                 BenchmarkId::new(&format!("direct_{}", $id), n),
16 |                 &n,
17 |                 |b, _| {
18 |                     b.iter(|| {
19 |                         for i in 0..n {
20 |                             dev[i] = host[i];
21 |                         }
22 |                     })
23 |                 },
24 |             );
25 |             group.bench_with_input(
26 |                 BenchmarkId::new(format!("memcpy_{}", $id), n),
27 |                 &n,
28 |                 |b, _| {
29 |                     b.iter(|| {
30 |                         dev.copy_from(&host);
31 |                     })
32 |                 },
33 |             );
34 |         };
35 |     }
36 | 
37 |     for &n in &[1000, 10_000, 100_000] {
38 |         // impl_HtoD!(vec![0_u32; n], "vec");
39 |         impl_HtoD!(PageLockedMemory::<u32>::zeros(&context, n), "page_locked");
40 |         let mut vec_tmp = vec![0_u32; n];
41 |         impl_HtoD!(RegisteredMemory::new(&context, &mut vec_tmp), "registered");
42 |     }
43 | }
44 | 
45 | fn d2h(c: &mut Criterion) {
46 |     let device = Device::nth(0).unwrap();
47 |     let context = device.create_context();
48 |     let mut group = c.benchmark_group("d2h");
49 | 
50 |     macro_rules! impl_DtoH {
51 |         ($host:expr, $id:expr) => {
52 |             let mut host = $host;
53 |             let n = host.len();
54 |             let dev = DeviceMemory::zeros(&context, n);
55 |             group.bench_with_input(
56 |                 BenchmarkId::new(&format!("direct_{}", $id), n),
57 |                 &n,
58 |                 |b, _| {
59 |                     b.iter(|| {
60 |                         for i in 0..n {
61 |                             host[i] = dev[i];
62 |                         }
63 |                     })
64 |                 },
65 |             );
66 |             group.bench_with_input(
67 |                 BenchmarkId::new(format!("memcpy_{}", $id), n),
68 |                 &n,
69 |                 |b, _| {
70 |                     b.iter(|| {
71 |                         host.copy_from(&dev);
72 |                     })
73 |                 },
74 |             );
75 |         };
76 |     }
77 | 
78 |     for &n in &[1000, 10_000, 100_000] {
79 |         impl_DtoH!(vec![0_u32; n], "vec");
80 |         impl_DtoH!(PageLockedMemory::<u32>::zeros(&context, n), "page_locked");
81 |         let mut vec_tmp = vec![0_u32; n];
82 |         impl_DtoH!(RegisteredMemory::new(&context, &mut vec_tmp), "registered");
83 |     }
84 | }
85 | 
86 | criterion_group!(benches, h2d, d2h);
87 | criterion_main!(benches);
88 | 


--------------------------------------------------------------------------------
/accel/examples/add.rs:
--------------------------------------------------------------------------------
 1 | use accel::*;
 2 | 
 3 | #[kernel]
 4 | unsafe fn add(a: *const f32, b: *const f32, c: *mut f32, n: usize) {
 5 |     let i = accel_core::index();
 6 |     if (i as usize) < n {
 7 |         *c.offset(i) = *a.offset(i) + *b.offset(i);
 8 |     }
 9 | }
10 | 
11 | fn main() -> error::Result<()> {
12 |     let device = Device::nth(0)?;
13 |     let ctx = device.create_context();
14 | 
15 |     let _pf = Profiler::start(&ctx);
16 | 
17 |     // Allocate memories on GPU
18 |     let n = 1024;
19 |     let mut a = DeviceMemory::<f32>::zeros(&ctx, n);
20 |     let mut b = DeviceMemory::<f32>::zeros(&ctx, n);
21 |     let mut c = DeviceMemory::<f32>::zeros(&ctx, n);
22 | 
23 |     // Accessible from CPU as usual Rust slice (though this will be slow)
24 |     for i in 0..n {
25 |         a[i] = i as f32;
26 |         b[i] = 2.0 * i as f32;
27 |     }
28 | 
29 |     // Launch kernel synchronously
30 |     add(
31 |         &ctx,
32 |         1, /* grid */
33 |         n, /* block */
34 |         (a.as_ptr(), b.as_ptr(), c.as_mut_ptr(), n),
35 |     )
36 |     .expect("Kernel call failed");
37 | 
38 |     Ok(())
39 | }
40 | 


--------------------------------------------------------------------------------
/accel/src/block.rs:
--------------------------------------------------------------------------------
  1 | use num_traits::ToPrimitive;
  2 | 
  3 | /// Size of Block (thread block) in [CUDA thread hierarchy]( http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programming-model )
  4 | ///
  5 | /// Every input integer and float convert into `u32` using [ToPrimitive].
  6 | /// If the conversion is impossible, e.g. negative or too large integers, the conversion will panics.
  7 | ///
  8 | /// [ToPrimitive]: https://docs.rs/num-traits/0.2.11/num_traits/cast/trait.ToPrimitive.html
  9 | ///
 10 | /// Examples
 11 | /// --------
 12 | ///
 13 | /// - Explicit creation
 14 | ///
 15 | /// ```
 16 | /// # use accel::*;
 17 | /// let block1d = Block::x(64);
 18 | /// assert_eq!(block1d.x, 64);
 19 | ///
 20 | /// let block2d = Block::xy(64, 128);
 21 | /// assert_eq!(block2d.x, 64);
 22 | /// assert_eq!(block2d.y, 128);
 23 | ///
 24 | /// let block3d = Block::xyz(64, 128, 256);
 25 | /// assert_eq!(block3d.x, 64);
 26 | /// assert_eq!(block3d.y, 128);
 27 | /// assert_eq!(block3d.z, 256);
 28 | /// ```
 29 | ///
 30 | /// - From single integer (unsigned and signed)
 31 | ///
 32 | /// ```
 33 | /// # use accel::*;
 34 | /// let block1d: Block = 64_usize.into();
 35 | /// assert_eq!(block1d.x, 64);
 36 | ///
 37 | /// let block1d: Block = 64_i32.into();
 38 | /// assert_eq!(block1d.x, 64);
 39 | /// ```
 40 | ///
 41 | /// - From tuple
 42 | ///
 43 | /// ```
 44 | /// # use accel::*;
 45 | /// let block1d: Block = (64,).into();
 46 | /// assert_eq!(block1d.x, 64);
 47 | ///
 48 | /// let block2d: Block = (64, 128).into();
 49 | /// assert_eq!(block2d.x, 64);
 50 | /// assert_eq!(block2d.y, 128);
 51 | ///
 52 | /// let block3d: Block = (64, 128, 256).into();
 53 | /// assert_eq!(block3d.x, 64);
 54 | /// assert_eq!(block3d.y, 128);
 55 | /// assert_eq!(block3d.z, 256);
 56 | /// ```
 57 | #[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
 58 | pub struct Block {
 59 |     pub x: u32,
 60 |     pub y: u32,
 61 |     pub z: u32,
 62 | }
 63 | 
 64 | impl Block {
 65 |     /// 1D Block
 66 |     ///
 67 |     /// Panic
 68 |     /// -----
 69 |     /// - If input values cannot convert to u32
 70 |     pub fn x<I: ToPrimitive>(x: I) -> Self {
 71 |         Block {
 72 |             x: x.to_u32().expect("Cannot convert to u32"),
 73 |             y: 1,
 74 |             z: 1,
 75 |         }
 76 |     }
 77 | 
 78 |     /// 2D Block
 79 |     ///
 80 |     /// Panic
 81 |     /// -----
 82 |     /// - If input values cannot convert to u32
 83 |     pub fn xy<I1: ToPrimitive, I2: ToPrimitive>(x: I1, y: I2) -> Self {
 84 |         Block {
 85 |             x: x.to_u32().expect("Cannot convert to u32"),
 86 |             y: y.to_u32().expect("Cannot convert to u32"),
 87 |             z: 1,
 88 |         }
 89 |     }
 90 | 
 91 |     /// 3D Block
 92 |     ///
 93 |     /// Panic
 94 |     /// -----
 95 |     /// - If input values cannot convert to u32
 96 |     pub fn xyz<I1: ToPrimitive, I2: ToPrimitive, I3: ToPrimitive>(x: I1, y: I2, z: I3) -> Self {
 97 |         Block {
 98 |             x: x.to_u32().expect("Cannot convert to u32"),
 99 |             y: y.to_u32().expect("Cannot convert to u32"),
100 |             z: z.to_u32().expect("Cannot convert to u32"),
101 |         }
102 |     }
103 | }
104 | 
105 | impl<I: ToPrimitive> Into<Block> for (I,) {
106 |     fn into(self) -> Block {
107 |         Block::x(self.0)
108 |     }
109 | }
110 | 
111 | impl<I1: ToPrimitive, I2: ToPrimitive> Into<Block> for (I1, I2) {
112 |     fn into(self) -> Block {
113 |         Block::xy(self.0, self.1)
114 |     }
115 | }
116 | 
117 | impl<I1: ToPrimitive, I2: ToPrimitive, I3: ToPrimitive> Into<Block> for (I1, I2, I3) {
118 |     fn into(self) -> Block {
119 |         Block::xyz(self.0, self.1, self.2)
120 |     }
121 | }
122 | 
123 | macro_rules! impl_into_block {
124 |     ($integer:ty) => {
125 |         impl Into<Block> for $integer {
126 |             fn into(self) -> Block {
127 |                 Block::x(self)
128 |             }
129 |         }
130 |     };
131 | }
132 | 
133 | impl_into_block!(u8);
134 | impl_into_block!(u16);
135 | impl_into_block!(u32);
136 | impl_into_block!(u64);
137 | impl_into_block!(u128);
138 | impl_into_block!(usize);
139 | impl_into_block!(i8);
140 | impl_into_block!(i16);
141 | impl_into_block!(i32);
142 | impl_into_block!(i64);
143 | impl_into_block!(i128);
144 | impl_into_block!(isize);
145 | 


--------------------------------------------------------------------------------
/accel/src/device.rs:
--------------------------------------------------------------------------------
  1 | //! CUDA [Device] and [Context]
  2 | //!
  3 | //! [Device]:  https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html
  4 | //! [Context]: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html
  5 | 
  6 | use crate::{error::*, *};
  7 | use cuda::*;
  8 | use std::sync::{Arc, Once};
  9 | 
 10 | pub use accel_derive::Contexted;
 11 | 
 12 | /// Handler for device and its primary context
 13 | #[derive(Debug, PartialEq, PartialOrd)]
 14 | pub struct Device {
 15 |     device: CUdevice,
 16 | }
 17 | 
 18 | impl Device {
 19 |     /// Initializer for CUDA Driver API
 20 |     fn init() {
 21 |         static DRIVER_API_INIT: Once = Once::new();
 22 |         DRIVER_API_INIT.call_once(|| unsafe {
 23 |             ffi_call!(cuda::cuInit, 0).expect("Initialization of CUDA Driver API failed");
 24 |         });
 25 |     }
 26 | 
 27 |     /// Get number of available GPUs
 28 |     pub fn get_count() -> Result<usize> {
 29 |         Self::init();
 30 |         let mut count: i32 = 0;
 31 |         unsafe {
 32 |             ffi_call!(cuDeviceGetCount, &mut count as *mut i32)?;
 33 |         }
 34 |         Ok(count as usize)
 35 |     }
 36 | 
 37 |     pub fn nth(id: usize) -> Result<Self> {
 38 |         let count = Self::get_count()?;
 39 |         if id >= count {
 40 |             return Err(AccelError::DeviceNotFound { id, count });
 41 |         }
 42 |         let device = unsafe { ffi_new!(cuDeviceGet, id as i32)? };
 43 |         Ok(Device { device })
 44 |     }
 45 | 
 46 |     /// Get total memory of GPU
 47 |     pub fn total_memory(&self) -> Result<usize> {
 48 |         let mut mem = 0;
 49 |         unsafe {
 50 |             ffi_call!(cuDeviceTotalMem_v2, &mut mem as *mut _, self.device)?;
 51 |         }
 52 |         Ok(mem)
 53 |     }
 54 | 
 55 |     /// Get name of GPU
 56 |     pub fn get_name(&self) -> Result<String> {
 57 |         let mut bytes: Vec<u8> = vec![0_u8; 1024];
 58 |         unsafe {
 59 |             ffi_call!(
 60 |                 cuDeviceGetName,
 61 |                 bytes.as_mut_ptr() as *mut i8,
 62 |                 1024,
 63 |                 self.device
 64 |             )?;
 65 |         }
 66 |         Ok(String::from_utf8(bytes).expect("GPU name is not UTF8"))
 67 |     }
 68 | 
 69 |     /// Create a new CUDA context on this device.
 70 |     ///
 71 |     /// ```
 72 |     /// # use accel::*;
 73 |     /// let device = Device::nth(0).unwrap();
 74 |     /// let ctx = device.create_context();
 75 |     /// ```
 76 |     pub fn create_context(&self) -> Context {
 77 |         let ptr = unsafe {
 78 |             ffi_new!(
 79 |                 cuCtxCreate_v2,
 80 |                 CUctx_flags_enum::CU_CTX_SCHED_AUTO as u32,
 81 |                 self.device
 82 |             )
 83 |         }
 84 |         .expect("Failed to create a new context");
 85 |         if ptr.is_null() {
 86 |             panic!("Cannot crate a new context");
 87 |         }
 88 |         let ptr_new = ctx_pop().unwrap();
 89 |         assert_eq!(ptr, ptr_new);
 90 |         Arc::new(ContextOwned { ptr })
 91 |     }
 92 | }
 93 | 
 94 | /// Push to the context stack of this thread
 95 | fn ctx_push(ptr: CUcontext) -> Result<()> {
 96 |     unsafe { ffi_call!(cuCtxPushCurrent_v2, ptr) }?;
 97 |     Ok(())
 98 | }
 99 | 
100 | /// Pop from the context stack of this thread
101 | fn ctx_pop() -> Result<CUcontext> {
102 |     let ptr = unsafe { ffi_new!(cuCtxPopCurrent_v2) }?;
103 |     if ptr.is_null() {
104 |         panic!("No current context");
105 |     }
106 |     Ok(ptr)
107 | }
108 | 
109 | /// Get API version
110 | fn ctx_version(ptr: CUcontext) -> Result<u32> {
111 |     let mut version: u32 = 0;
112 |     unsafe { ffi_call!(cuCtxGetApiVersion, ptr, &mut version as *mut _) }?;
113 |     Ok(version)
114 | }
115 | 
116 | /// Block until all tasks in this context to be complete.
117 | fn ctx_sync(ptr: CUcontext) -> Result<()> {
118 |     ctx_push(ptr)?;
119 |     unsafe { ffi_call!(cuCtxSynchronize) }?;
120 |     let ptr_new = ctx_pop()?;
121 |     assert_eq!(ptr, ptr_new);
122 |     Ok(())
123 | }
124 | 
125 | /// Object with CUDA context
126 | pub trait Contexted {
127 |     fn guard(&self) -> Result<ContextGuard>;
128 |     fn sync(&self) -> Result<()>;
129 |     fn version(&self) -> Result<u32>;
130 |     /// Get a reference
131 |     ///
132 |     /// This is **NOT** a Rust reference, i.e. you can drop owned context while the reference exists.
133 |     /// The reference becomes expired after owned context is released, and it will cause a runtime error.
134 |     ///
135 |     fn get_ref(&self) -> ContextRef;
136 | }
137 | 
138 | /// Owend handler for CUDA context
139 | #[derive(Debug, PartialEq)]
140 | pub struct ContextOwned {
141 |     ptr: CUcontext,
142 | }
143 | 
144 | pub type Context = Arc<ContextOwned>;
145 | 
146 | impl Drop for ContextOwned {
147 |     fn drop(&mut self) {
148 |         if let Err(e) = unsafe { ffi_call!(cuCtxDestroy_v2, self.ptr) } {
149 |             log::error!("Context remove failed: {:?}", e);
150 |         }
151 |     }
152 | }
153 | 
154 | unsafe impl Send for ContextOwned {}
155 | unsafe impl Sync for ContextOwned {}
156 | 
157 | impl Contexted for Context {
158 |     fn sync(&self) -> Result<()> {
159 |         ctx_sync(self.ptr)
160 |     }
161 | 
162 |     fn version(&self) -> Result<u32> {
163 |         ctx_version(self.ptr)
164 |     }
165 | 
166 |     fn guard(&self) -> Result<ContextGuard> {
167 |         ctx_push(self.ptr)?;
168 |         Ok(ContextGuard { ptr: self.ptr })
169 |     }
170 | 
171 |     fn get_ref(&self) -> ContextRef {
172 |         ContextRef { ptr: self.ptr }
173 |     }
174 | }
175 | 
176 | /// Non-Owend handler for CUDA context
177 | ///
178 | /// The validity of reference is checked dynamically.
179 | /// CUDA APIs (e.g. [cuPointerGetAttribute]) allow us to get a pointer to CUDA context,
180 | /// but its validity cannot be assured by Rust lifetime system.
181 | ///
182 | /// [cuPointerGetAttribute]: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__UNIFIED.html#group__CUDA__UNIFIED_1g0c28ed0aff848042bc0533110e45820c
183 | ///
184 | #[derive(Debug, PartialEq, Clone, Copy)]
185 | pub struct ContextRef {
186 |     ptr: CUcontext,
187 | }
188 | 
189 | impl ContextRef {
190 |     pub(crate) fn from_ptr(ptr: CUcontext) -> Self {
191 |         assert!(!ptr.is_null());
192 |         ContextRef { ptr }
193 |     }
194 | }
195 | 
196 | unsafe impl Send for ContextRef {}
197 | unsafe impl Sync for ContextRef {}
198 | 
199 | impl Contexted for ContextRef {
200 |     fn sync(&self) -> Result<()> {
201 |         ctx_sync(self.ptr)
202 |     }
203 | 
204 |     fn version(&self) -> Result<u32> {
205 |         ctx_version(self.ptr)
206 |     }
207 | 
208 |     fn guard(&self) -> Result<ContextGuard> {
209 |         ctx_push(self.ptr)?;
210 |         Ok(ContextGuard { ptr: self.ptr })
211 |     }
212 | 
213 |     fn get_ref(&self) -> ContextRef {
214 |         self.clone()
215 |     }
216 | }
217 | 
218 | impl std::cmp::PartialEq<ContextRef> for ContextOwned {
219 |     fn eq(&self, ctx: &ContextRef) -> bool {
220 |         self.ptr == ctx.ptr
221 |     }
222 | }
223 | 
224 | impl std::cmp::PartialEq<ContextOwned> for ContextRef {
225 |     fn eq(&self, ctx: &ContextOwned) -> bool {
226 |         self.ptr == ctx.ptr
227 |     }
228 | }
229 | 
230 | /// RAII handler for using CUDA context
231 | ///
232 | /// As described in [CUDA Programming Guide], library using CUDA should push context before using
233 | /// it, and then pop it.
234 | ///
235 | /// [CUDA Programming Guide]: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#context
236 | pub struct ContextGuard {
237 |     ptr: CUcontext,
238 | }
239 | 
240 | impl Drop for ContextGuard {
241 |     fn drop(&mut self) {
242 |         match ctx_pop() {
243 |             Ok(ptr) => {
244 |                 if ptr != self.ptr {
245 |                     log::error!("Poped context is different from pushed: {:?}", ptr);
246 |                 }
247 |             }
248 |             Err(e) => {
249 |                 log::error!("Failed to pop context: {}", e);
250 |             }
251 |         }
252 |     }
253 | }
254 | 
255 | #[cfg(test)]
256 | mod tests {
257 |     use super::*;
258 | 
259 |     #[test]
260 |     fn get_count() -> Result<()> {
261 |         Device::get_count()?;
262 |         Ok(())
263 |     }
264 | 
265 |     #[test]
266 |     fn get_zeroth() -> Result<()> {
267 |         Device::nth(0)?;
268 |         Ok(())
269 |     }
270 | 
271 |     #[test]
272 |     fn out_of_range() -> Result<()> {
273 |         assert!(Device::nth(129).is_err());
274 |         Ok(())
275 |     }
276 | 
277 |     #[test]
278 |     fn create() -> Result<()> {
279 |         let device = Device::nth(0)?;
280 |         let ctx = device.create_context();
281 |         dbg!(&ctx);
282 |         Ok(())
283 |     }
284 | 
285 |     #[should_panic]
286 |     #[test]
287 |     fn expired_context_ref() {
288 |         let device = Device::nth(0).unwrap();
289 |         let ctx = device.create_context();
290 |         let ctx_ref = ctx.get_ref();
291 |         drop(ctx);
292 |         let _version = ctx_ref.version().unwrap(); // ctx has been expired
293 |     }
294 | 
295 |     #[should_panic]
296 |     #[test]
297 |     fn expired_contexted_call() {
298 |         let device = Device::nth(0).unwrap();
299 |         let ctx = device.create_context();
300 |         let ctx_ref = ctx.get_ref();
301 |         drop(ctx);
302 |         unsafe { contexted_call!(&ctx_ref, cuCtxSynchronize) }.unwrap();
303 |     }
304 | }
305 | 


--------------------------------------------------------------------------------
/accel/src/error.rs:
--------------------------------------------------------------------------------
 1 | use cuda::cudaError_enum as DeviceError;
 2 | use std::path::PathBuf;
 3 | 
 4 | pub type Result<T> = ::std::result::Result<T, AccelError>;
 5 | 
 6 | #[derive(thiserror::Error, Debug)]
 7 | pub enum AccelError {
 8 |     /// Raw errors originates from CUDA Device APIs
 9 |     #[error("CUDA Device API Error: {api_name}, {error:?}")]
10 |     CUDAError {
11 |         api_name: String,
12 |         error: DeviceError,
13 |     },
14 | 
15 |     // This is not an error potentially, but it should be a bug if not captured by accel
16 |     #[error("Async operations issues previously have not completed yet")]
17 |     AsyncOperationNotReady,
18 | 
19 |     /// Error for user device code assertion
20 |     #[error("Assertion in device code has failed")]
21 |     DeviceAssertionFailed,
22 | 
23 |     #[error("No device found for given ID")]
24 |     DeviceNotFound { id: usize, count: usize },
25 | 
26 |     #[error("File not found: {path:?}")]
27 |     FileNotFound { path: PathBuf },
28 | 
29 |     #[error(transparent)]
30 |     AsyncTaskFailed(#[from] tokio::task::JoinError),
31 | }
32 | 
33 | /// Convert return code of CUDA Driver/Runtime API into Result
34 | pub(crate) fn check(error: DeviceError, api_name: &str) -> Result<()> {
35 |     match error {
36 |         DeviceError::CUDA_SUCCESS => Ok(()),
37 |         DeviceError::CUDA_ERROR_ASSERT => Err(AccelError::DeviceAssertionFailed),
38 |         DeviceError::CUDA_ERROR_NOT_READY => Err(AccelError::AsyncOperationNotReady),
39 |         _ => Err(AccelError::CUDAError {
40 |             api_name: api_name.into(),
41 |             error,
42 |         }),
43 |     }
44 | }
45 | 
46 | #[macro_export]
47 | macro_rules! ffi_call {
48 |     ($ffi:path $(,$args:expr)*) => {
49 |         {
50 |             $crate::error::check($ffi($($args),*), stringify!($ffi))
51 |         }
52 |     };
53 | }
54 | 
55 | #[macro_export]
56 | macro_rules! ffi_new {
57 |     ($ffi:path $(,$args:expr)*) => {
58 |         {
59 |             let mut value = ::std::mem::MaybeUninit::uninit();
60 |             $crate::error::check($ffi(value.as_mut_ptr(), $($args),*), stringify!($ffi)).map(|_| value.assume_init())
61 |         }
62 |     };
63 | }
64 | 
65 | #[macro_export]
66 | macro_rules! contexted_call {
67 |     ($ctx:expr, $ffi:path $(,$args:expr)*) => {
68 |         $crate::Contexted::guard($ctx).and_then(|_g| { $crate::ffi_call!($ffi $(,$args)*) })
69 |     };
70 | }
71 | 
72 | #[macro_export]
73 | macro_rules! contexted_new {
74 |     ($ctx:expr, $ffi:path $(,$args:expr)*) => {
75 |         $crate::Contexted::guard($ctx).and_then(|_g| { $crate::ffi_new!($ffi $(,$args)*) })
76 |     };
77 | }
78 | 


--------------------------------------------------------------------------------
/accel/src/execution.rs:
--------------------------------------------------------------------------------
  1 | //! Traits for CUDA Kernel launching
  2 | //!
  3 | //! Launchable traits
  4 | //! -----------------
  5 | //!
  6 | //! Launchable traits, i.e. `Launchable0`, `Launchable1`, ..., implement `launch` function which launches a kernel on device.
  7 | //!
  8 | //! ```
  9 | //! use accel::{*, error::Result};
 10 | //!
 11 | //! // Trait for 2-arg kernel
 12 | //! pub trait Launchable2 {
 13 | //!     // Type of arg1 on device
 14 | //!     type Target1;
 15 | //!     // Type of arg2 on device
 16 | //!     type Target2;
 17 | //!
 18 | //!     // Launch kernel code on device
 19 | //!     fn launch<
 20 | //!         Arg1 /* Type of arg1 on host */,
 21 | //!         Arg2 /* Type of arg2 on host */
 22 | //!     >(
 23 | //!         &self,
 24 | //!         grid:  impl Into<Grid>,
 25 | //!         block: impl Into<Block>,
 26 | //!         (arg1, arg2): (Arg1, Arg2)
 27 | //!     ) -> Result<()>
 28 | //!     where
 29 | //!         // Types on host and on device are bundled by DeviceSend trait
 30 | //!         Arg1: DeviceSend<Target=Self::Target1>,
 31 | //!         Arg2: DeviceSend<Target=Self::Target2>,
 32 | //!     {
 33 | //!         // default impl which uses crate-internal features
 34 | //!         todo!() // skip for document
 35 | //!     }
 36 | //!
 37 | //!     // Specify entry point (see following example)
 38 | //!     fn get_kernel(&self) -> Result<Kernel>;
 39 | //! }
 40 | //! ```
 41 | //!
 42 | //! These traits are generated by `accel_derive::define_launchable!` proc-macro.
 43 | //! Launchable traits are specialized for N-args functions because it uses a tuple `(Arg1, Arg2, ..., ArgN)`
 44 | //! for `launch` argument.
 45 | //! [DeviceSend] trait specify how the host value is sent to device.
 46 | //!
 47 | //! One of Launchable traits will be implemented automatically by [accel::kernel] for an auto-generated [Module] struct:
 48 | //!
 49 | //! ```
 50 | //! #[accel::kernel]
 51 | //! fn f(a: i32) {}
 52 | //! ```
 53 | //!
 54 | //! This simple definition will create a submodule `f` (same name of the function):
 55 | //!
 56 | //! ```
 57 | //! mod f { // same name sub-module
 58 | //!
 59 | //!     pub const PTX_STR: &str = "{{ PTX string generated by rustc/nvptx64-nvidia-cuda }}";
 60 | //!
 61 | //!     // wrapper for implement one of Launchable traits
 62 | //!     pub struct Module(::accel::Module);
 63 | //!
 64 | //!     // impl Launchable1 because number of arugment is 1
 65 | //!     impl ::accel::execution::Launchable1<'_> for Module {
 66 | //!         type Target1 = i32; // first argument of `f`
 67 | //!
 68 | //!         // How to get kernel PTX code
 69 | //!         fn get_kernel(&self) -> ::accel::error::Result<::accel::Kernel> {
 70 | //!             self.0.get_kernel("f")
 71 | //!         }
 72 | //!     }
 73 | //! }
 74 | //! ```
 75 | //!
 76 | //! For a function which takes N arguments, `Launchable{N}` will be implemented for corresponding module.
 77 | //! Be sure that this sub-module will be generated where the `f` is defined.
 78 | //! `get_kernel` and default implementation of `launch` are separated to keep unsafe codes in this crate.
 79 | //!
 80 | //! [DeviceSend]: trait.DeviceSend.html
 81 | //! [accel::kernel]: ../attr.kernel.html
 82 | //! [Module]: ../module/struct.Module.html
 83 | 
 84 | use crate::{contexted_call, device::*, error::*, *};
 85 | use cuda::*;
 86 | use std::{ffi::*, ptr::null_mut};
 87 | 
 88 | /// Type which can be sent to device
 89 | pub trait DeviceSend {
 90 |     /// Type on device
 91 |     type Target;
 92 |     fn as_kernel_parameter(&self) -> *mut c_void {
 93 |         self as *const Self as *mut c_void
 94 |     }
 95 | }
 96 | 
 97 | impl<T: Sized> DeviceSend for *mut T {
 98 |     type Target = Self;
 99 | }
100 | 
101 | impl<T: Sized> DeviceSend for *const T {
102 |     type Target = Self;
103 | }
104 | 
105 | impl<'arg, T: Sized> DeviceSend for &'arg [T] {
106 |     type Target = *const T;
107 | }
108 | 
109 | impl<'arg, T: Sized> DeviceSend for &'arg mut [T] {
110 |     type Target = *mut T;
111 | }
112 | 
113 | macro_rules! impl_device_send {
114 |     ($pri:ty) => {
115 |         impl DeviceSend for $pri {
116 |             type Target = Self;
117 |         }
118 | 
119 |         impl<'arg> DeviceSend for &'arg $pri {
120 |             type Target = Self;
121 |         }
122 | 
123 |         impl<'arg> DeviceSend for &'arg mut $pri {
124 |             type Target = Self;
125 |         }
126 |     };
127 | }
128 | 
129 | impl_device_send!(bool);
130 | impl_device_send!(i8);
131 | impl_device_send!(i16);
132 | impl_device_send!(i32);
133 | impl_device_send!(i64);
134 | impl_device_send!(i128);
135 | impl_device_send!(isize);
136 | impl_device_send!(u8);
137 | impl_device_send!(u16);
138 | impl_device_send!(u32);
139 | impl_device_send!(u64);
140 | impl_device_send!(u128);
141 | impl_device_send!(usize);
142 | impl_device_send!(f32);
143 | impl_device_send!(f64);
144 | 
145 | accel_derive::define_launchable!(12 /* 0..=12 */);
146 | 


--------------------------------------------------------------------------------
/accel/src/grid.rs:
--------------------------------------------------------------------------------
  1 | use num_traits::ToPrimitive;
  2 | 
  3 | /// Size of Grid (grid of blocks) in [CUDA thread hierarchy]( http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programming-model )
  4 | ///
  5 | /// Every input integer and float convert into `u32` using [ToPrimitive].
  6 | /// If the conversion is impossible, e.g. negative or too large integers, the conversion will panics.
  7 | ///
  8 | /// [ToPrimitive]: https://docs.rs/num-traits/0.2.11/num_traits/cast/trait.ToPrimitive.html
  9 | ///
 10 | /// Examples
 11 | /// --------
 12 | ///
 13 | /// - Explicit creation
 14 | ///
 15 | /// ```
 16 | /// # use accel::*;
 17 | /// let grid1d = Grid::x(64);
 18 | /// assert_eq!(grid1d.x, 64);
 19 | ///
 20 | /// let grid2d = Grid::xy(64, 128);
 21 | /// assert_eq!(grid2d.x, 64);
 22 | /// assert_eq!(grid2d.y, 128);
 23 | ///
 24 | /// let grid3d = Grid::xyz(64, 128, 256);
 25 | /// assert_eq!(grid3d.x, 64);
 26 | /// assert_eq!(grid3d.y, 128);
 27 | /// assert_eq!(grid3d.z, 256);
 28 | /// ```
 29 | ///
 30 | /// - From single integer (unsigned and signed)
 31 | ///
 32 | /// ```
 33 | /// # use accel::*;
 34 | /// let grid1d: Grid = 64_usize.into();
 35 | /// assert_eq!(grid1d.x, 64);
 36 | ///
 37 | /// let grid1d: Grid = 64_i32.into();
 38 | /// assert_eq!(grid1d.x, 64);
 39 | /// ```
 40 | ///
 41 | /// - From tuple
 42 | ///
 43 | /// ```
 44 | /// # use accel::*;
 45 | /// let grid1d: Grid = (64,).into();
 46 | /// assert_eq!(grid1d.x, 64);
 47 | ///
 48 | /// let grid2d: Grid = (64, 128).into();
 49 | /// assert_eq!(grid2d.x, 64);
 50 | /// assert_eq!(grid2d.y, 128);
 51 | ///
 52 | /// let grid3d: Grid = (64, 128, 256).into();
 53 | /// assert_eq!(grid3d.x, 64);
 54 | /// assert_eq!(grid3d.y, 128);
 55 | /// assert_eq!(grid3d.z, 256);
 56 | /// ```
 57 | #[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
 58 | pub struct Grid {
 59 |     pub x: u32,
 60 |     pub y: u32,
 61 |     pub z: u32,
 62 | }
 63 | 
 64 | impl Grid {
 65 |     /// 1D Grid
 66 |     ///
 67 |     /// Panic
 68 |     /// -----
 69 |     /// - If input values cannot convert to u32
 70 |     pub fn x<I: ToPrimitive>(x: I) -> Self {
 71 |         Grid {
 72 |             x: x.to_u32().expect("Cannot convert to u32"),
 73 |             y: 1,
 74 |             z: 1,
 75 |         }
 76 |     }
 77 | 
 78 |     /// 2D Grid
 79 |     ///
 80 |     /// Panic
 81 |     /// -----
 82 |     /// - If input values cannot convert to u32
 83 |     pub fn xy<I1: ToPrimitive, I2: ToPrimitive>(x: I1, y: I2) -> Self {
 84 |         Grid {
 85 |             x: x.to_u32().expect("Cannot convert to u32"),
 86 |             y: y.to_u32().expect("Cannot convert to u32"),
 87 |             z: 1,
 88 |         }
 89 |     }
 90 | 
 91 |     /// 3D Grid
 92 |     ///
 93 |     /// Panic
 94 |     /// -----
 95 |     /// - If input values cannot convert to u32
 96 |     pub fn xyz<I1: ToPrimitive, I2: ToPrimitive, I3: ToPrimitive>(x: I1, y: I2, z: I3) -> Self {
 97 |         Grid {
 98 |             x: x.to_u32().expect("Cannot convert to u32"),
 99 |             y: y.to_u32().expect("Cannot convert to u32"),
100 |             z: z.to_u32().expect("Cannot convert to u32"),
101 |         }
102 |     }
103 | }
104 | 
105 | impl<I: ToPrimitive> Into<Grid> for (I,) {
106 |     fn into(self) -> Grid {
107 |         Grid::x(self.0)
108 |     }
109 | }
110 | 
111 | impl<I1: ToPrimitive, I2: ToPrimitive> Into<Grid> for (I1, I2) {
112 |     fn into(self) -> Grid {
113 |         Grid::xy(self.0, self.1)
114 |     }
115 | }
116 | 
117 | impl<I1: ToPrimitive, I2: ToPrimitive, I3: ToPrimitive> Into<Grid> for (I1, I2, I3) {
118 |     fn into(self) -> Grid {
119 |         Grid::xyz(self.0, self.1, self.2)
120 |     }
121 | }
122 | 
123 | macro_rules! impl_into_grid {
124 |     ($integer:ty) => {
125 |         impl Into<Grid> for $integer {
126 |             fn into(self) -> Grid {
127 |                 Grid::x(self)
128 |             }
129 |         }
130 |     };
131 | }
132 | 
133 | impl_into_grid!(u8);
134 | impl_into_grid!(u16);
135 | impl_into_grid!(u32);
136 | impl_into_grid!(u64);
137 | impl_into_grid!(u128);
138 | impl_into_grid!(usize);
139 | impl_into_grid!(i8);
140 | impl_into_grid!(i16);
141 | impl_into_grid!(i32);
142 | impl_into_grid!(i64);
143 | impl_into_grid!(i128);
144 | impl_into_grid!(isize);
145 | 


--------------------------------------------------------------------------------
/accel/src/instruction.rs:
--------------------------------------------------------------------------------
 1 | use crate::{error::*, *};
 2 | use cuda::*;
 3 | use std::{ffi::*, path::*};
 4 | 
 5 | /// Represent the resource of CUDA middle-IR (PTX/cubin)
 6 | #[derive(Debug)]
 7 | pub enum Instruction {
 8 |     PTX(CString),
 9 |     PTXFile(PathBuf),
10 |     Cubin(Vec<u8>),
11 |     CubinFile(PathBuf),
12 | }
13 | 
14 | impl Instruction {
15 |     /// Constructor for `Instruction::PTX`
16 |     pub fn ptx(s: &str) -> Instruction {
17 |         let ptx = CString::new(s).expect("Invalid PTX string");
18 |         Instruction::PTX(ptx)
19 |     }
20 | 
21 |     /// Constructor for `Instruction::Cubin`
22 |     pub fn cubin(sl: &[u8]) -> Instruction {
23 |         Instruction::Cubin(sl.to_vec())
24 |     }
25 | 
26 |     /// Constructor for `Instruction::PTXFile`
27 |     pub fn ptx_file(path: &Path) -> Result<Self> {
28 |         if !path.exists() {
29 |             return Err(AccelError::FileNotFound {
30 |                 path: path.to_owned(),
31 |             });
32 |         }
33 |         Ok(Instruction::PTXFile(path.to_owned()))
34 |     }
35 | 
36 |     /// Constructor for `Instruction::CubinFile`
37 |     pub fn cubin_file(path: &Path) -> Result<Self> {
38 |         if !path.exists() {
39 |             return Err(AccelError::FileNotFound {
40 |                 path: path.to_owned(),
41 |             });
42 |         }
43 |         Ok(Instruction::CubinFile(path.to_owned()))
44 |     }
45 | }
46 | 
47 | impl Instruction {
48 |     /// Get type of PTX/cubin
49 |     pub fn input_type(&self) -> CUjitInputType {
50 |         match *self {
51 |             Instruction::PTX(_) | Instruction::PTXFile(_) => CUjitInputType_enum::CU_JIT_INPUT_PTX,
52 |             Instruction::Cubin(_) | Instruction::CubinFile(_) => {
53 |                 CUjitInputType_enum::CU_JIT_INPUT_CUBIN
54 |             }
55 |         }
56 |     }
57 | }
58 | 


--------------------------------------------------------------------------------
/accel/src/lib.rs:
--------------------------------------------------------------------------------
  1 | //! GPGPU framework for Rust based on [CUDA Driver API]
  2 | //!
  3 | //! [CUDA Driver API]: https://docs.nvidia.com/cuda/cuda-driver-api/
  4 | //!
  5 | //! Setup
  6 | //! -----
  7 | //! Currently (0.3.0), accel works only on Linux system. Windows support will come in future release (0.3.x or 0.4~).
  8 | //!
  9 | //! 1. Install [CUDA](https://developer.nvidia.com/cuda-downloads) on your system
 10 | //! 2. Setup Rust environement using rustup (Requires 1.42 or later)
 11 | //! 3. Add `nvptx64-nvidia-cuda` target and install `ptx-linker`, or run
 12 | //!
 13 | //!     ```shell
 14 | //!     curl -sSL https://gitlab.com/termoshtt/accel/raw/master/setup_nvptx_toolchain.sh | bash
 15 | //!     ```
 16 | //!
 17 | //! Examples
 18 | //! --------
 19 | //! accel works with stable Rust
 20 | //!
 21 | //! ```toml
 22 | //! [dependencies]
 23 | //! accel = "=0.3.0-alpha.2"
 24 | //! ```
 25 | //!
 26 | //! Do **NOT** add `accel-core` to `[dependencies]`.
 27 | //! It will be linked automatically into the device code.
 28 | //!
 29 | //! ### Vector Add
 30 | //!
 31 | //! ```
 32 | //! use accel::*;
 33 | //!
 34 | //! #[kernel]
 35 | //! unsafe fn add(a: *const f32, b: *const f32, c: *mut f32, n: usize) {
 36 | //!     let i = accel_core::index();
 37 | //!     if (i as usize) < n {
 38 | //!         *c.offset(i) = *a.offset(i) + *b.offset(i);
 39 | //!     }
 40 | //! }
 41 | //!
 42 | //! fn main() -> error::Result<()> {
 43 | //!     let device = Device::nth(0)?;
 44 | //!     let ctx = device.create_context();
 45 | //!
 46 | //!     // Allocate memories on GPU
 47 | //!     let n = 32;
 48 | //!     let mut a = DeviceMemory::<f32>::zeros(&ctx, n);
 49 | //!     let mut b = DeviceMemory::<f32>::zeros(&ctx, n);
 50 | //!     let mut c = DeviceMemory::<f32>::zeros(&ctx, n);
 51 | //!
 52 | //!     // Accessible from CPU as usual Rust slice (though this will be slow)
 53 | //!     for i in 0..n {
 54 | //!         a[i] = i as f32;
 55 | //!         b[i] = 2.0 * i as f32;
 56 | //!     }
 57 | //!     println!("a = {:?}", a.as_slice());
 58 | //!     println!("b = {:?}", b.as_slice());
 59 | //!
 60 | //!     // Launch kernel synchronously
 61 | //!     add(&ctx,
 62 | //!         1 /* grid */,
 63 | //!         n /* block */,
 64 | //!         (a.as_ptr(), b.as_ptr(), c.as_mut_ptr(), n)
 65 | //!     ).expect("Kernel call failed");
 66 | //!
 67 | //!     println!("c = {:?}", c.as_slice());
 68 | //!     Ok(())
 69 | //! }
 70 | //! ```
 71 | //!
 72 | //! ### Assertion on GPU
 73 | //!
 74 | //! ```
 75 | //! use accel::*;
 76 | //!
 77 | //! #[kernel]
 78 | //! fn assert() {
 79 | //!     accel_core::assert_eq!(1 + 2, 4);  // will fail
 80 | //! }
 81 | //!
 82 | //! fn main() -> error::Result<()> {
 83 | //!     let device = Device::nth(0)?;
 84 | //!     let ctx = device.create_context();
 85 | //!     let result = assert(&ctx, 1 /* grid */, 4 /* block */, ());
 86 | //!     assert!(result.is_err()); // assertion failed
 87 | //!     Ok(())
 88 | //! }
 89 | //! ```
 90 | //!
 91 | //! ### Print from GPU
 92 | //!
 93 | //! ```
 94 | //! use accel::*;
 95 | //!
 96 | //! #[kernel]
 97 | //! pub fn print() {
 98 | //!     let i = accel_core::index();
 99 | //!     accel_core::println!("Hello from {}", i);
100 | //! }
101 | //!
102 | //! fn main() -> error::Result<()> {
103 | //!     let device = Device::nth(0)?;
104 | //!     let ctx = device.create_context();
105 | //!     print(&ctx, 1, 4, ())?;
106 | //!     Ok(())
107 | //! }
108 | //! ```
109 | 
110 | extern crate cuda_driver_sys as cuda;
111 | 
112 | pub use accel_derive::kernel;
113 | 
114 | pub mod device;
115 | pub mod error;
116 | pub mod execution;
117 | pub mod linker;
118 | pub mod memory;
119 | pub mod module;
120 | pub mod profiler;
121 | pub mod stream;
122 | 
123 | mod block;
124 | mod grid;
125 | mod instruction;
126 | 
127 | pub use block::Block;
128 | pub use device::*;
129 | pub use execution::*;
130 | pub use grid::Grid;
131 | pub use instruction::Instruction;
132 | pub use linker::*;
133 | pub use memory::*;
134 | pub use module::*;
135 | pub use profiler::*;
136 | pub use stream::*;
137 | 
138 | #[cfg(test)]
139 | mod tests {
140 |     /// Test accel_derive::kernel can be used in accel crate itself
141 |     #[super::kernel]
142 |     fn f() {}
143 | }
144 | 


--------------------------------------------------------------------------------
/accel/src/linker.rs:
--------------------------------------------------------------------------------
  1 | //! CUDA JIT compiler and Linkers
  2 | 
  3 | use crate::{contexted_call, device::*, error::*, module::*, *};
  4 | use cuda::*;
  5 | use std::{
  6 |     collections::HashMap,
  7 |     ffi::{CStr, CString},
  8 |     mem::MaybeUninit,
  9 |     os::raw::c_void,
 10 |     path::Path,
 11 |     ptr::null_mut,
 12 | };
 13 | 
 14 | // TODO
 15 | #[derive(Debug, Clone)]
 16 | pub struct LogBuffer {}
 17 | 
 18 | /// Configure generator for [CUjit_option] required in `cuLink*` APIs
 19 | ///
 20 | /// [CUjit_option]: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1g5527fa8030d5cabedc781a04dbd1997d
 21 | #[derive(Debug, Clone, Default)]
 22 | pub struct JITConfig {
 23 |     /// CU_JIT_MAX_REGISTERS, Applies to compiler only
 24 |     ///
 25 |     /// - Max number of registers that a thread may use.
 26 |     pub max_registers: Option<u32>,
 27 | 
 28 |     /// CU_JIT_THREADS_PER_BLOCK, Applies to compiler only
 29 |     ///
 30 |     /// - **IN**: Specifies minimum number of threads per block to target compilation for
 31 |     /// - **OUT**: Returns the number of threads the compiler actually targeted.
 32 |     ///   This restricts the resource utilization fo the compiler (e.g. max registers) such that a block with the given number of threads should be able to launch based on register limitations.
 33 |     ///
 34 |     /// Note
 35 |     /// ----
 36 |     /// This option does not currently take into account any other resource limitations, such as shared memory utilization. Cannot be combined with CU_JIT_TARGET.
 37 |     pub threads_per_block: Option<u32>,
 38 | 
 39 |     /// CU_JIT_WALL_TIME, Applies to compiler and linker
 40 |     ///
 41 |     /// - Overwrites the option value with the total wall clock time, in milliseconds, spent in the compiler and linker
 42 |     /// - Option type: float
 43 |     pub wall_time: Option<f32>,
 44 | 
 45 |     /// CU_JIT_INFO_LOG_BUFFER, Applies to compiler and linker
 46 |     ///
 47 |     /// - Pointer to a buffer in which to print any log messages that are informational in nature (the buffer size is specified via option CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)
 48 |     ///
 49 |     /// CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, Applies to compiler and linker
 50 |     ///
 51 |     /// - **IN**: Log buffer size in bytes. Log messages will be capped at this size (including null terminator)
 52 |     /// - **OUT**: Amount of log buffer filled with messages
 53 |     pub info_log_buffer: Option<LogBuffer>,
 54 | 
 55 |     /// CU_JIT_ERROR_LOG_BUFFER, Applies to compiler and linker
 56 |     ///
 57 |     /// - Pointer to a buffer in which to print any log messages that reflect errors (the buffer size is specified via option CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)
 58 |     ///
 59 |     /// CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, Applies to compiler and linker
 60 |     ///
 61 |     /// - **IN**: Log buffer size in bytes. Log messages will be capped at this size (including null terminator)
 62 |     /// - **OUT**: Amount of log buffer filled with messages
 63 |     pub error_log_buffer: Option<LogBuffer>,
 64 | 
 65 |     /// CU_JIT_OPTIMIZATION_LEVEL, Applies to compiler only
 66 |     ///
 67 |     /// - Level of optimizations to apply to generated code (0 - 4), with 4 being the default and highest level of optimizations.
 68 |     pub optimization_level: Option<u32>,
 69 | 
 70 |     /// CU_JIT_TARGET_FROM_CUCONTEXT, Applies to compiler and linker
 71 |     ///
 72 |     /// - No option value required. Determines the target based on the current attached context (default)
 73 |     pub target_from_cucontext: Option<()>,
 74 | 
 75 |     /// CU_JIT_TARGET, Applies to compiler and linker
 76 |     ///
 77 |     /// - Target is chosen based on supplied CUjit_target. Cannot be combined with CU_JIT_THREADS_PER_BLOCK.
 78 |     pub target: Option<CUjit_target>,
 79 | 
 80 |     /// CU_JIT_FALLBACK_STRATEGY, Applies to compiler only
 81 |     ///
 82 |     /// - Specifies choice of fallback strategy if matching cubin is not found. Choice is based on supplied CUjit_fallback.
 83 |     ///   This option cannot be used with cuLink* APIs as the linker requires exact matches.
 84 |     pub fallback_strategy: Option<CUjit_fallback>,
 85 | 
 86 |     /// CU_JIT_GENERATE_DEBUG_INFO, Applies to compiler and linker
 87 |     ///
 88 |     /// - Specifies whether to create debug information in output (-g) (0: false, default)
 89 |     pub generate_debug_info: Option<i32>,
 90 | 
 91 |     /// CU_JIT_LOG_VERBOSE, Applies to compiler and linker
 92 |     ///
 93 |     /// - Generate verbose log messages (0: false, default)
 94 |     pub log_verbose: Option<i32>,
 95 | 
 96 |     /// CU_JIT_GENERATE_LINE_INFO, Applies to compiler only
 97 |     ///
 98 |     /// - Generate line number information (-lineinfo) (0: false, default)
 99 |     pub generate_line_info: Option<i32>,
100 | 
101 |     /// CU_JIT_CACHE_MODE, Applies to compiler only
102 |     ///
103 |     /// - Specifies whether to enable caching explicitly (-dlcm) Choice is based on supplied CUjit_cacheMode_enum.
104 |     pub cache_mode: Option<CUjit_cacheMode_enum>,
105 | 
106 |     /// CU_JIT_NEW_SM3X_OPT
107 |     ///
108 |     /// - The below jit options are used for internal purposes only, in this version of CUDA
109 |     pub new_sm3x_opt: Option<u32>,
110 | 
111 |     /// CU_JIT_FAST_COMPILE
112 |     pub fast_compile: bool,
113 | 
114 |     /// CU_JIT_GLOBAL_SYMBOL_NAMES, Applies to dynamic linker only
115 |     ///
116 |     /// - Array of device symbol names that will be relocated to the corresponing host addresses stored in CU_JIT_GLOBAL_SYMBOL_ADDRESSES.
117 |     ///   Must contain CU_JIT_GLOBAL_SYMBOL_COUNT entries. When loding a device module, driver will relocate all encountered unresolved symbols to the host addresses.
118 |     ///   It is only allowed to register symbols that correspond to unresolved global variables. It is illegal to register the same device symbol at multiple addresses.
119 |     ///
120 |     /// CU_JIT_GLOBAL_SYMBOL_ADDRESSES, Applies to dynamic linker only
121 |     ///
122 |     /// - Array of host addresses that will be used to relocate corresponding device symbols stored in CU_JIT_GLOBAL_SYMBOL_NAMES.
123 |     ///   Must contain CU_JIT_GLOBAL_SYMBOL_COUNT entries.
124 |     ///
125 |     /// CU_JIT_GLOBAL_SYMBOL_COUNT, Applies to dynamic linker only
126 |     ///
127 |     /// - Number of entries in CU_JIT_GLOBAL_SYMBOL_NAMES and CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.
128 |     pub global_symbol: HashMap<CString, *mut c_void>,
129 | }
130 | 
131 | impl JITConfig {
132 |     /// Pack configure into C API compatible format
133 |     fn pack(&mut self) -> (u32, Vec<CUjit_option>, Vec<*mut c_void>) {
134 |         let mut opt_keys = Vec::new();
135 |         let mut opt_values = Vec::new();
136 | 
137 |         macro_rules! check_option {
138 |             ( $tag:ident, $opt_name:ident) => {
139 |                 if let Some($opt_name) = self.$opt_name.as_ref() {
140 |                     opt_keys.push(CUjit_option::$tag);
141 |                     opt_values.push($opt_name as *const _ as *mut c_void);
142 |                 }
143 |             };
144 |         }
145 |         check_option!(CU_JIT_MAX_REGISTERS, max_registers);
146 |         check_option!(CU_JIT_THREADS_PER_BLOCK, threads_per_block);
147 |         check_option!(CU_JIT_WALL_TIME, wall_time);
148 |         check_option!(CU_JIT_OPTIMIZATION_LEVEL, optimization_level);
149 |         check_option!(CU_JIT_TARGET, target);
150 |         check_option!(CU_JIT_FALLBACK_STRATEGY, fallback_strategy);
151 |         check_option!(CU_JIT_GENERATE_DEBUG_INFO, generate_debug_info);
152 |         check_option!(CU_JIT_LOG_VERBOSE, log_verbose);
153 |         check_option!(CU_JIT_GENERATE_LINE_INFO, generate_line_info);
154 |         check_option!(CU_JIT_CACHE_MODE, cache_mode);
155 |         check_option!(CU_JIT_NEW_SM3X_OPT, new_sm3x_opt);
156 | 
157 |         if self.fast_compile {
158 |             opt_keys.push(CUjit_option::CU_JIT_FAST_COMPILE);
159 |             opt_values.push(&self.fast_compile as *const bool as *mut c_void);
160 |         }
161 | 
162 |         if let Some(_info_log_buffer) = self.info_log_buffer.as_mut() {
163 |             unimplemented!("Log for JIT is not supported yet");
164 |         }
165 | 
166 |         if let Some(_error_log_buffer) = self.error_log_buffer.as_mut() {
167 |             unimplemented!("Log for JIT is not supported yet");
168 |         }
169 | 
170 |         if !self.global_symbol.is_empty() {
171 |             unimplemented!("GLOBAL_SYMBOL flags are not supported yet");
172 |         }
173 |         assert_eq!(opt_keys.len(), opt_values.len());
174 |         (opt_keys.len() as u32, opt_keys, opt_values)
175 |     }
176 | }
177 | 
178 | /// Consuming builder for cubin from PTX and cubins
179 | #[derive(accel_derive::Contexted)]
180 | pub struct Linker {
181 |     state: CUlinkState,
182 |     cfg: JITConfig,
183 |     ctx: Context,
184 | }
185 | 
186 | impl Drop for Linker {
187 |     fn drop(&mut self) {
188 |         if let Err(e) = unsafe { contexted_call!(self, cuLinkDestroy, self.state) } {
189 |             log::error!("Failed to release Linker: {:?}", e)
190 |         }
191 |     }
192 | }
193 | 
194 | impl Linker {
195 |     /// Create a new Linker
196 |     pub fn create(ctx: &Context, mut cfg: JITConfig) -> Result<Self> {
197 |         let (n, mut opt, mut opts) = cfg.pack();
198 |         let state = unsafe {
199 |             let mut state = MaybeUninit::uninit();
200 |             contexted_call!(
201 |                 ctx,
202 |                 cuLinkCreate_v2,
203 |                 n,
204 |                 opt.as_mut_ptr(),
205 |                 opts.as_mut_ptr(),
206 |                 state.as_mut_ptr()
207 |             )?;
208 |             state.assume_init()
209 |         };
210 |         Ok(Linker {
211 |             state,
212 |             cfg,
213 |             ctx: ctx.clone(),
214 |         })
215 |     }
216 | 
217 |     /// Wrapper of cuLinkAddData
218 |     unsafe fn add_data(mut self, input_type: CUjitInputType, data: &[u8]) -> Result<Self> {
219 |         let (nopts, mut opts, mut opt_vals) = self.cfg.pack();
220 |         let name = CString::new("").unwrap();
221 |         contexted_call!(
222 |             &self,
223 |             cuLinkAddData_v2,
224 |             self.state,
225 |             input_type,
226 |             data.as_ptr() as *mut _,
227 |             data.len(),
228 |             name.as_ptr(),
229 |             nopts,
230 |             opts.as_mut_ptr(),
231 |             opt_vals.as_mut_ptr()
232 |         )?;
233 |         Ok(self)
234 |     }
235 | 
236 |     /// Wrapper of cuLinkAddFile
237 |     unsafe fn add_file(mut self, input_type: CUjitInputType, path: &Path) -> Result<Self> {
238 |         let filename = CString::new(path.to_str().unwrap()).expect("Invalid file path");
239 |         let (nopts, mut opts, mut opt_vals) = self.cfg.pack();
240 |         contexted_call!(
241 |             &self,
242 |             cuLinkAddFile_v2,
243 |             self.state,
244 |             input_type,
245 |             filename.as_ptr(),
246 |             nopts,
247 |             opts.as_mut_ptr(),
248 |             opt_vals.as_mut_ptr()
249 |         )?;
250 |         Ok(self)
251 |     }
252 | 
253 |     /// Add a resouce into the linker stack.
254 |     pub fn add(self, data: &Instruction) -> Result<Self> {
255 |         Ok(match *data {
256 |             Instruction::PTX(ref ptx) => unsafe {
257 |                 let cstr = CString::new(ptx.as_bytes()).expect("Invalid PTX String");
258 |                 self.add_data(data.input_type(), cstr.as_bytes_with_nul())?
259 |             },
260 |             Instruction::Cubin(ref bin) => unsafe { self.add_data(data.input_type(), &bin)? },
261 |             Instruction::PTXFile(ref path) | Instruction::CubinFile(ref path) => unsafe {
262 |                 self.add_file(data.input_type(), path)?
263 |             },
264 |         })
265 |     }
266 | 
267 |     /// Wrapper of cuLinkComplete
268 |     ///
269 |     /// LinkComplete returns a reference to cubin,
270 |     /// which is managed by LinkState.
271 |     /// Use owned strategy to avoid considering lifetime.
272 |     pub fn complete(self) -> Result<Instruction> {
273 |         let mut cb = null_mut();
274 |         unsafe {
275 |             contexted_call!(
276 |                 &self,
277 |                 cuLinkComplete,
278 |                 self.state,
279 |                 &mut cb as *mut _,
280 |                 null_mut()
281 |             )?;
282 |             Ok(Instruction::cubin(CStr::from_ptr(cb as _).to_bytes()))
283 |         }
284 |     }
285 | }
286 | 
287 | /// Link PTX/cubin into a module
288 | pub fn link(ctx: &Context, data: &[Instruction], opt: JITConfig) -> Result<Module> {
289 |     let mut l = Linker::create(&ctx, opt)?;
290 |     for d in data {
291 |         l = l.add(d)?;
292 |     }
293 |     let cubin = l.complete()?;
294 |     Module::load(ctx, &cubin)
295 | }
296 | 
297 | #[cfg(test)]
298 | mod tests {
299 |     use super::*;
300 | 
301 |     #[test]
302 |     fn create() -> Result<()> {
303 |         let device = Device::nth(0)?;
304 |         let ctx = device.create_context();
305 |         let _linker = Linker::create(&ctx, JITConfig::default())?;
306 |         Ok(())
307 |     }
308 | 
309 |     #[test]
310 |     fn ptx_file() -> Result<()> {
311 |         let device = Device::nth(0)?;
312 |         let ctx = device.create_context();
313 |         let linker = Linker::create(&ctx, JITConfig::default())?;
314 |         let data = Instruction::ptx_file(Path::new("tests/data/add.ptx"))?;
315 |         linker.add(&data)?;
316 |         Ok(())
317 |     }
318 | 
319 |     #[test]
320 |     fn linking() -> Result<()> {
321 |         let device = Device::nth(0)?;
322 |         let ctx = device.create_context();
323 | 
324 |         let data_add = Instruction::ptx_file(Path::new("tests/data/add.ptx"))?;
325 |         let data_sub = Instruction::ptx_file(Path::new("tests/data/sub.ptx"))?;
326 |         let _module = Linker::create(&ctx, JITConfig::default())?
327 |             .add(&data_add)?
328 |             .add(&data_sub)?
329 |             .complete()?;
330 |         Ok(())
331 |     }
332 | 
333 |     #[ignore] // FIXME Causes CUDA_ERROR_NO_BINARY_FOR_GPU
334 |     #[test]
335 |     fn cubin_file() -> Result<()> {
336 |         let device = Device::nth(0)?;
337 |         let ctx = device.create_context();
338 |         let linker = Linker::create(&ctx, JITConfig::default())?;
339 |         let data = Instruction::cubin_file(Path::new("tests/data/add.cubin"))?;
340 |         linker.add(&data)?;
341 |         Ok(())
342 |     }
343 | }
344 | 


--------------------------------------------------------------------------------
/accel/src/memory/array.rs:
--------------------------------------------------------------------------------
  1 | //! CUDA [Array] and [Texture], [Surface] Objects
  2 | //!
  3 | //! [Array]:   https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-arrays
  4 | //! [Texture]: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html#group__CUDA__TEXOBJECT
  5 | //! [Surface]: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html#group__CUDA__SURFOBJECT
  6 | 
  7 | use crate::{contexted_call, contexted_new, device::Contexted, error::Result, *};
  8 | use cuda::*;
  9 | use futures::future::BoxFuture;
 10 | use num_traits::ToPrimitive;
 11 | use std::marker::PhantomData;
 12 | 
 13 | pub use cuda::CUDA_ARRAY3D_DESCRIPTOR as Descriptor;
 14 | 
 15 | #[derive(Debug, Contexted)]
 16 | pub struct Array<T, Dim> {
 17 |     array: CUarray,
 18 |     dim: Dim,
 19 |     context: Context,
 20 |     phantom: PhantomData<T>,
 21 | }
 22 | 
 23 | unsafe impl<T, Dim> Send for Array<T, Dim> {}
 24 | unsafe impl<T, Dim> Sync for Array<T, Dim> {}
 25 | 
 26 | impl<T, Dim> Drop for Array<T, Dim> {
 27 |     fn drop(&mut self) {
 28 |         if let Err(e) = unsafe { contexted_call!(self, cuArrayDestroy, self.array) } {
 29 |             log::error!("Failed to cleanup array: {:?}", e);
 30 |         }
 31 |     }
 32 | }
 33 | 
 34 | impl<T: Scalar, Dim: Dimension> Array<T, Dim> {
 35 |     /// Get dimension
 36 |     pub fn dim(&self) -> &Dim {
 37 |         &self.dim
 38 |     }
 39 | }
 40 | 
 41 | impl<T: Scalar, Dim: Dimension> Memory for Array<T, Dim> {
 42 |     type Elem = T;
 43 |     fn head_addr(&self) -> *const T {
 44 |         self.array as _
 45 |     }
 46 |     fn head_addr_mut(&mut self) -> *mut T {
 47 |         self.array as _
 48 |     }
 49 | 
 50 |     fn num_elem(&self) -> usize {
 51 |         self.dim.len()
 52 |     }
 53 | 
 54 |     fn memory_type(&self) -> MemoryType {
 55 |         MemoryType::Array
 56 |     }
 57 | 
 58 |     fn set(&mut self, value: Self::Elem) {
 59 |         // FIXME CUDA does not have memcpy for array. This is easy but too expensive alternative way
 60 |         let src = PageLockedMemory::from_elem(&self.context, self.dim.len(), value);
 61 |         self.copy_from(&src);
 62 |     }
 63 | }
 64 | 
 65 | fn memcpy3d_param_h2a<T: Scalar, Dim: Dimension>(
 66 |     src: &[T],
 67 |     dst: &mut Array<T, Dim>,
 68 | ) -> CUDA_MEMCPY3D {
 69 |     let dim = dst.dim;
 70 |     CUDA_MEMCPY3D {
 71 |         srcMemoryType: CUmemorytype_enum::CU_MEMORYTYPE_UNIFIED,
 72 |         srcDevice: src.as_ptr() as CUdeviceptr,
 73 | 
 74 |         dstMemoryType: CUmemorytype_enum::CU_MEMORYTYPE_ARRAY,
 75 |         dstArray: dst.array,
 76 | 
 77 |         WidthInBytes: dim.width() * T::size_of() * dim.num_channels().to_usize().unwrap(),
 78 |         Height: dim.height(),
 79 |         Depth: dim.depth(),
 80 | 
 81 |         ..Default::default()
 82 |     }
 83 | }
 84 | 
 85 | impl<T: Scalar, Dim: Dimension> Memcpy<[T]> for Array<T, Dim> {
 86 |     fn copy_from(&mut self, src: &[T]) {
 87 |         assert_ne!(self.head_addr(), src.head_addr());
 88 |         assert_eq!(self.num_elem(), src.num_elem());
 89 |         unsafe { contexted_call!(self, cuMemcpy3D_v2, &memcpy3d_param_h2a(src, self)) }
 90 |             .expect("memcpy into array failed");
 91 |     }
 92 | 
 93 |     fn copy_from_async<'a>(&'a mut self, src: &'a [T]) -> BoxFuture<'a, ()> {
 94 |         assert_ne!(self.head_addr(), src.head_addr());
 95 |         assert_eq!(self.num_elem(), src.num_elem());
 96 |         let stream = stream::Stream::new(self.context.get_ref());
 97 |         unsafe {
 98 |             contexted_call!(
 99 |                 self,
100 |                 cuMemcpy3DAsync_v2,
101 |                 &memcpy3d_param_h2a(src, self),
102 |                 stream.stream
103 |             )
104 |         }
105 |         .expect("memcpy into array failed");
106 |         Box::pin(async { stream.into_future().await.expect("async memcpy failed") })
107 |     }
108 | }
109 | 
110 | fn memcpy3d_param_a2h<T: Scalar, Dim: Dimension>(
111 |     src: &Array<T, Dim>,
112 |     dst: &mut [T],
113 | ) -> CUDA_MEMCPY3D {
114 |     let dim = src.dim;
115 |     CUDA_MEMCPY3D {
116 |         srcMemoryType: CUmemorytype_enum::CU_MEMORYTYPE_ARRAY,
117 |         srcArray: src.array,
118 | 
119 |         dstMemoryType: CUmemorytype_enum::CU_MEMORYTYPE_UNIFIED,
120 |         dstDevice: dst.as_mut_ptr() as CUdeviceptr,
121 | 
122 |         WidthInBytes: dim.width() * T::size_of() * dim.num_channels().to_usize().unwrap(),
123 |         Height: dim.height(),
124 |         Depth: dim.depth(),
125 | 
126 |         ..Default::default()
127 |     }
128 | }
129 | 
130 | impl<T: Scalar, Dim: Dimension> Memcpy<Array<T, Dim>> for [T] {
131 |     fn copy_from(&mut self, src: &Array<T, Dim>) {
132 |         assert_ne!(self.head_addr(), src.head_addr());
133 |         assert_eq!(self.num_elem(), src.num_elem());
134 |         unsafe { contexted_call!(src, cuMemcpy3D_v2, &memcpy3d_param_a2h(src, self)) }
135 |             .expect("memcpy from array failed");
136 |     }
137 | 
138 |     fn copy_from_async<'a>(&'a mut self, src: &'a Array<T, Dim>) -> BoxFuture<'a, ()> {
139 |         assert_ne!(self.head_addr(), src.head_addr());
140 |         assert_eq!(self.num_elem(), src.num_elem());
141 |         let stream = stream::Stream::new(src.context.get_ref());
142 |         unsafe {
143 |             contexted_call!(
144 |                 src,
145 |                 cuMemcpy3DAsync_v2,
146 |                 &memcpy3d_param_a2h(src, self),
147 |                 stream.stream
148 |             )
149 |         }
150 |         .expect("memcpy from array failed");
151 |         Box::pin(async { stream.into_future().await.expect("async memcpy failed") })
152 |     }
153 | }
154 | 
155 | macro_rules! impl_memcpy_array {
156 |     ($t:path) => {
157 |         impl<T: Scalar, Dim: Dimension> Memcpy<Array<T, Dim>> for $t {
158 |             fn copy_from(&mut self, src: &Array<T, Dim>) {
159 |                 self.as_mut_slice().copy_from(src);
160 |             }
161 |             fn copy_from_async<'a>(&'a mut self, src: &'a Array<T, Dim>) -> BoxFuture<'a, ()> {
162 |                 self.as_mut_slice().copy_from_async(src)
163 |             }
164 |         }
165 | 
166 |         impl<T: Scalar, Dim: Dimension> Memcpy<$t> for Array<T, Dim> {
167 |             fn copy_from(&mut self, src: &$t) {
168 |                 self.copy_from(src.as_slice());
169 |             }
170 |             fn copy_from_async<'a>(&'a mut self, src: &'a $t) -> BoxFuture<'a, ()> {
171 |                 self.copy_from_async(src.as_slice())
172 |             }
173 |         }
174 |     };
175 | }
176 | 
177 | impl_memcpy_array!(DeviceMemory::<T>);
178 | impl_memcpy_array!(PageLockedMemory::<T>);
179 | impl_memcpy_array!(RegisteredMemory::<'_, T>);
180 | 
181 | impl<T: Scalar, Dim: Dimension> Allocatable for Array<T, Dim> {
182 |     type Shape = Dim;
183 |     unsafe fn uninitialized(context: &Context, dim: Dim) -> Self {
184 |         let desc = dim.as_descriptor::<T>();
185 |         let array =
186 |             contexted_new!(context, cuArray3DCreate_v2, &desc).expect("Cannot create a new array");
187 |         Array {
188 |             array,
189 |             dim,
190 |             context: context.clone(),
191 |             phantom: PhantomData,
192 |         }
193 |     }
194 | }
195 | 
196 | #[cfg(test)]
197 | mod tests {
198 |     use super::*;
199 |     use crate::device::*;
200 | 
201 |     #[test]
202 |     fn new_1d() -> Result<()> {
203 |         let device = Device::nth(0)?;
204 |         let context = device.create_context();
205 |         let _array1: Array<f32, Ix1> = Array::zeros(&context, 10.into());
206 |         let _array2: Array<f32, Ix1> = Array::zeros(&context, (10,).into());
207 |         Ok(())
208 |     }
209 | 
210 |     #[test]
211 |     fn new_2d() -> Result<()> {
212 |         let device = Device::nth(0)?;
213 |         let context = device.create_context();
214 |         let _array: Array<f32, Ix2> = Array::zeros(&context, (10, 12).into());
215 |         Ok(())
216 |     }
217 | 
218 |     #[test]
219 |     fn new_3d() -> Result<()> {
220 |         let device = Device::nth(0)?;
221 |         let context = device.create_context();
222 |         let _array: Array<f32, Ix3> = Array::zeros(&context, (10, 12, 8).into());
223 |         Ok(())
224 |     }
225 | 
226 |     #[test]
227 |     fn new_1d_layered() -> Result<()> {
228 |         let device = Device::nth(0)?;
229 |         let context = device.create_context();
230 |         let _array: Array<f32, Ix1Layered> = Array::zeros(&context, (10, 12).into());
231 |         Ok(())
232 |     }
233 | 
234 |     #[test]
235 |     fn new_2d_layered() -> Result<()> {
236 |         let device = Device::nth(0)?;
237 |         let context = device.create_context();
238 |         let _array: Array<f32, Ix2Layered> = Array::zeros(&context, (10, 12, 8).into());
239 |         Ok(())
240 |     }
241 | 
242 |     #[test]
243 |     fn memcpy_h2a2h_1d() -> Result<()> {
244 |         let device = Device::nth(0)?;
245 |         let context = device.create_context();
246 |         let n = 10;
247 |         let src = PageLockedMemory::from_elem(&context, n, 2_u32);
248 |         let mut dst = PageLockedMemory::zeros(&context, n);
249 |         let mut array = unsafe { Array::<u32, Ix1>::uninitialized(&context, n.into()) };
250 |         array.copy_from(&src);
251 |         dst.copy_from(&array);
252 |         dbg!(dst.as_slice());
253 |         for i in 0..n {
254 |             assert_eq!(dst[i], 2_u32);
255 |         }
256 |         Ok(())
257 |     }
258 | 
259 |     #[test]
260 |     fn memcpy_d2a2d_2d() -> Result<()> {
261 |         let device = Device::nth(0)?;
262 |         let context = device.create_context();
263 |         let n = 3;
264 |         let m = 4;
265 |         let src = DeviceMemory::from_elem(&context, n * m, 2_u32);
266 |         let mut dst = DeviceMemory::zeros(&context, n * m);
267 |         let mut array = unsafe { Array::<u32, Ix2>::uninitialized(&context, (n, m).into()) };
268 |         array.copy_from(&src);
269 |         dst.copy_from(&array);
270 |         dbg!(dst.as_slice());
271 |         for i in 0..n * m {
272 |             assert_eq!(dst[i], 2_u32);
273 |         }
274 |         Ok(())
275 |     }
276 | 
277 |     #[test]
278 |     fn memcpy_h2a2h_2d() -> Result<()> {
279 |         let device = Device::nth(0)?;
280 |         let context = device.create_context();
281 |         let n = 3;
282 |         let m = 4;
283 |         let src = PageLockedMemory::from_elem(&context, n * m, 2_u32);
284 |         let mut dst = PageLockedMemory::zeros(&context, n * m);
285 |         let mut array = unsafe { Array::<u32, Ix2>::uninitialized(&context, (n, m).into()) };
286 |         array.copy_from(&src);
287 |         dst.copy_from(&array);
288 |         dbg!(dst.as_slice());
289 |         for i in 0..n {
290 |             assert_eq!(dst[i], 2_u32);
291 |         }
292 |         Ok(())
293 |     }
294 | 
295 |     #[test]
296 |     fn memcpy_d2a2d_1d() -> Result<()> {
297 |         let device = Device::nth(0)?;
298 |         let context = device.create_context();
299 |         let n = 3;
300 |         let m = 4;
301 |         let src = DeviceMemory::from_elem(&context, n * m, 2_u32);
302 |         let mut dst = DeviceMemory::zeros(&context, n * m);
303 |         let mut array = unsafe { Array::<u32, Ix2>::uninitialized(&context, (n, m).into()) };
304 |         array.copy_from(&src);
305 |         dst.copy_from(&array);
306 |         dbg!(dst.as_slice());
307 |         for i in 0..n {
308 |             assert_eq!(dst[i], 2_u32);
309 |         }
310 |         Ok(())
311 |     }
312 |     #[test]
313 |     fn memcpy_h2a2h_3d() -> Result<()> {
314 |         let device = Device::nth(0)?;
315 |         let context = device.create_context();
316 |         let n = 3;
317 |         let m = 4;
318 |         let l = 2;
319 |         let src = PageLockedMemory::from_elem(&context, n * m * l, 2_u32);
320 |         let mut dst = PageLockedMemory::zeros(&context, n * m * l);
321 |         let mut array = unsafe { Array::<u32, Ix3>::uninitialized(&context, (n, m, l).into()) };
322 |         array.copy_from(&src);
323 |         dst.copy_from(&array);
324 |         dbg!(dst.as_slice());
325 |         for i in 0..n {
326 |             assert_eq!(dst[i], 2_u32);
327 |         }
328 |         Ok(())
329 |     }
330 | 
331 |     #[test]
332 |     fn memcpy_d2a2d_3d() -> Result<()> {
333 |         let device = Device::nth(0)?;
334 |         let context = device.create_context();
335 |         let n = 3;
336 |         let m = 4;
337 |         let l = 2;
338 |         let src = DeviceMemory::from_elem(&context, n * l * m, 2_u32);
339 |         let mut dst = DeviceMemory::zeros(&context, n * l * m);
340 |         let mut array = unsafe { Array::<u32, Ix3>::uninitialized(&context, (n, m, l).into()) };
341 |         array.copy_from(&src);
342 |         dst.copy_from(&array);
343 |         dbg!(dst.as_slice());
344 |         for i in 0..n {
345 |             assert_eq!(dst[i], 2_u32);
346 |         }
347 |         Ok(())
348 |     }
349 | 
350 |     #[test]
351 |     fn memcpy_h2a2h_1dlayer() -> Result<()> {
352 |         let device = Device::nth(0)?;
353 |         let context = device.create_context();
354 |         let n = 3;
355 |         let m = 4;
356 |         let src = PageLockedMemory::from_elem(&context, n * m, 2_u32);
357 |         let mut dst = PageLockedMemory::zeros(&context, n * m);
358 |         let mut array = unsafe { Array::<u32, Ix1Layered>::uninitialized(&context, (n, m).into()) };
359 |         array.copy_from(&src);
360 |         dst.copy_from(&array);
361 |         dbg!(dst.as_slice());
362 |         for i in 0..n {
363 |             assert_eq!(dst[i], 2_u32);
364 |         }
365 |         Ok(())
366 |     }
367 | 
368 |     #[test]
369 |     fn memcpy_d2a2d_1dlayer() -> Result<()> {
370 |         let device = Device::nth(0)?;
371 |         let context = device.create_context();
372 |         let n = 3;
373 |         let m = 4;
374 |         let src = DeviceMemory::from_elem(&context, n * m, 2_u32);
375 |         let mut dst = DeviceMemory::zeros(&context, n * m);
376 |         let mut array = unsafe { Array::<u32, Ix1Layered>::uninitialized(&context, (n, m).into()) };
377 |         array.copy_from(&src);
378 |         dst.copy_from(&array);
379 |         dbg!(dst.as_slice());
380 |         for i in 0..n {
381 |             assert_eq!(dst[i], 2_u32);
382 |         }
383 |         Ok(())
384 |     }
385 | 
386 |     #[test]
387 |     fn memcpy_h2a2h_2dlayer() -> Result<()> {
388 |         let device = Device::nth(0)?;
389 |         let context = device.create_context();
390 |         let n = 3;
391 |         let m = 4;
392 |         let l = 2;
393 |         let src = PageLockedMemory::from_elem(&context, n * m * l, 2_u32);
394 |         let mut dst = PageLockedMemory::zeros(&context, n * m * l);
395 |         let mut array =
396 |             unsafe { Array::<u32, Ix2Layered>::uninitialized(&context, (n, m, l).into()) };
397 |         array.copy_from(&src);
398 |         dst.copy_from(&array);
399 |         dbg!(dst.as_slice());
400 |         for i in 0..n {
401 |             assert_eq!(dst[i], 2_u32);
402 |         }
403 |         Ok(())
404 |     }
405 | 
406 |     #[test]
407 |     fn memcpy_d2a2d_2dlayer() -> Result<()> {
408 |         let device = Device::nth(0)?;
409 |         let context = device.create_context();
410 |         let n = 3;
411 |         let m = 4;
412 |         let l = 2;
413 |         let src = DeviceMemory::from_elem(&context, n * m * l, 2_u32);
414 |         let mut dst = DeviceMemory::zeros(&context, n * m * l);
415 |         let mut array =
416 |             unsafe { Array::<u32, Ix2Layered>::uninitialized(&context, (n, m, l).into()) };
417 |         array.copy_from(&src);
418 |         dst.copy_from(&array);
419 |         dbg!(dst.as_slice());
420 |         for i in 0..n {
421 |             assert_eq!(dst[i], 2_u32);
422 |         }
423 |         Ok(())
424 |     }
425 | }
426 | 


--------------------------------------------------------------------------------
/accel/src/memory/device.rs:
--------------------------------------------------------------------------------
  1 | //! Device and Host memory handlers
  2 | 
  3 | use super::*;
  4 | use crate::{error::*, *};
  5 | use cuda::*;
  6 | use std::{
  7 |     fmt,
  8 |     marker::PhantomData,
  9 |     ops::{Deref, DerefMut},
 10 | };
 11 | 
 12 | use cuda::CUmemAttach_flags_enum as AttachFlag;
 13 | 
 14 | /// Memory allocated on the device.
 15 | #[derive(Contexted)]
 16 | pub struct DeviceMemory<T> {
 17 |     ptr: CUdeviceptr,
 18 |     size: usize,
 19 |     context: Context,
 20 |     phantom: PhantomData<T>,
 21 | }
 22 | 
 23 | unsafe impl<T> Sync for DeviceMemory<T> {}
 24 | unsafe impl<T> Send for DeviceMemory<T> {}
 25 | 
 26 | impl<T> Drop for DeviceMemory<T> {
 27 |     fn drop(&mut self) {
 28 |         if let Err(e) = unsafe { contexted_call!(self, cuMemFree_v2, self.ptr) } {
 29 |             log::error!("Failed to free device memory: {:?}", e);
 30 |         }
 31 |     }
 32 | }
 33 | 
 34 | impl<T: Scalar> fmt::Debug for DeviceMemory<T> {
 35 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 36 |         f.debug_struct("DeviceMemory")
 37 |             .field("context", &self.context)
 38 |             .field("data", &self.as_slice())
 39 |             .finish()
 40 |     }
 41 | }
 42 | 
 43 | impl<T> Deref for DeviceMemory<T> {
 44 |     type Target = [T];
 45 |     fn deref(&self) -> &[T] {
 46 |         unsafe { std::slice::from_raw_parts(self.ptr as _, self.size) }
 47 |     }
 48 | }
 49 | 
 50 | impl<T> DerefMut for DeviceMemory<T> {
 51 |     fn deref_mut(&mut self) -> &mut [T] {
 52 |         unsafe { std::slice::from_raw_parts_mut(self.ptr as _, self.size) }
 53 |     }
 54 | }
 55 | 
 56 | impl<T: Scalar> PartialEq for DeviceMemory<T> {
 57 |     fn eq(&self, other: &Self) -> bool {
 58 |         // FIXME should be tested on device
 59 |         self.as_slice().eq(other.as_slice())
 60 |     }
 61 | }
 62 | 
 63 | impl<T: Scalar> PartialEq<[T]> for DeviceMemory<T> {
 64 |     fn eq(&self, other: &[T]) -> bool {
 65 |         // FIXME should be tested on device
 66 |         self.as_slice().eq(other)
 67 |     }
 68 | }
 69 | 
 70 | impl<T: Scalar> Memory for DeviceMemory<T> {
 71 |     type Elem = T;
 72 |     fn head_addr(&self) -> *const T {
 73 |         self.ptr as _
 74 |     }
 75 | 
 76 |     fn head_addr_mut(&mut self) -> *mut T {
 77 |         self.ptr as _
 78 |     }
 79 | 
 80 |     fn num_elem(&self) -> usize {
 81 |         self.size
 82 |     }
 83 | 
 84 |     fn memory_type(&self) -> MemoryType {
 85 |         MemoryType::Device
 86 |     }
 87 | 
 88 |     fn set(&mut self, value: T) {
 89 |         match T::size_of() {
 90 |             1 => unsafe {
 91 |                 contexted_call!(
 92 |                     self,
 93 |                     cuMemsetD8_v2,
 94 |                     self.head_addr_mut() as CUdeviceptr,
 95 |                     value.to_le_u8().unwrap(),
 96 |                     self.num_elem()
 97 |                 )
 98 |             }
 99 |             .expect("memset failed for 8-bit scalar"),
100 |             2 => unsafe {
101 |                 contexted_call!(
102 |                     self,
103 |                     cuMemsetD16_v2,
104 |                     self.head_addr_mut() as CUdeviceptr,
105 |                     value.to_le_u16().unwrap(),
106 |                     self.num_elem()
107 |                 )
108 |             }
109 |             .expect("memset failed for 16-bit scalar"),
110 |             4 => unsafe {
111 |                 contexted_call!(
112 |                     self,
113 |                     cuMemsetD32_v2,
114 |                     self.head_addr_mut() as CUdeviceptr,
115 |                     value.to_le_u32().unwrap(),
116 |                     self.num_elem()
117 |                 )
118 |             }
119 |             .expect("memset failed for 64-bit scalar"),
120 |             _ => {
121 |                 unimplemented!("memset for Device memory is only supported for 8/16/32-bit scalars")
122 |             }
123 |         }
124 |     }
125 | }
126 | 
127 | impl<T: Scalar> Continuous for DeviceMemory<T> {
128 |     fn as_slice(&self) -> &[T] {
129 |         self
130 |     }
131 |     fn as_mut_slice(&mut self) -> &mut [T] {
132 |         self
133 |     }
134 | }
135 | 
136 | impl<T: Scalar> Allocatable for DeviceMemory<T> {
137 |     type Shape = usize;
138 |     unsafe fn uninitialized(context: &Context, size: usize) -> Self {
139 |         assert!(size > 0, "Zero-sized malloc is forbidden");
140 |         let ptr = contexted_new!(
141 |             context,
142 |             cuMemAllocManaged,
143 |             size * std::mem::size_of::<T>(),
144 |             AttachFlag::CU_MEM_ATTACH_GLOBAL as u32
145 |         )
146 |         .expect("Cannot allocate device memory");
147 |         DeviceMemory {
148 |             ptr,
149 |             size,
150 |             context: context.clone(),
151 |             phantom: PhantomData,
152 |         }
153 |     }
154 | }
155 | 
156 | impl<'arg, T: Scalar> DeviceSend for &'arg DeviceMemory<T> {
157 |     type Target = *const T;
158 |     fn as_kernel_parameter(&self) -> *mut c_void {
159 |         &self.ptr as *const CUdeviceptr as *mut c_void
160 |     }
161 | }
162 | 
163 | impl<'arg, T: Scalar> DeviceSend for &'arg mut DeviceMemory<T> {
164 |     type Target = *mut T;
165 |     fn as_kernel_parameter(&self) -> *mut c_void {
166 |         &self.ptr as *const CUdeviceptr as *mut c_void
167 |     }
168 | }
169 | 
170 | #[cfg(test)]
171 | mod tests {
172 |     use super::*;
173 | 
174 |     #[test]
175 |     fn as_mut_slice() -> Result<()> {
176 |         let device = Device::nth(0)?;
177 |         let context = device.create_context();
178 |         let mut mem = DeviceMemory::<i32>::zeros(&context, 12);
179 |         let sl = mem.as_mut_slice();
180 |         sl[0] = 3; // test if accessible from host
181 |         assert_eq!(sl.num_elem(), 12);
182 |         Ok(())
183 |     }
184 | 
185 |     #[should_panic(expected = "Zero-sized malloc is forbidden")]
186 |     #[test]
187 |     fn device_new_zero() {
188 |         let device = Device::nth(0).unwrap();
189 |         let context = device.create_context();
190 |         let _a = DeviceMemory::<i32>::zeros(&context, 0);
191 |     }
192 | }
193 | 


--------------------------------------------------------------------------------
/accel/src/memory/dimension.rs:
--------------------------------------------------------------------------------
  1 | use crate::*;
  2 | use derive_new::new;
  3 | use num_derive::{FromPrimitive, ToPrimitive};
  4 | use num_traits::{ToPrimitive, Zero};
  5 | use std::{fmt::Debug, ops::Add};
  6 | 
  7 | pub use cuda::CUDA_ARRAY3D_DESCRIPTOR as Descriptor;
  8 | 
  9 | /// This specifies the number of packed elements per "CUDA array element".
 10 | ///
 11 | /// - The CUDA array element approach is useful e.g. for [RGBA color model],
 12 | ///   which has 4 values at each point of figures.
 13 | /// - For example, When `T=f32` and `NumChannels::Two`,
 14 | ///   the size of "CUDA array element" is 64bit as packed two 32bit float values.
 15 | /// - We call `T` element, although "CUDA array element" represents `[T; num_channels]`.
 16 | ///   `Memory::num_elem()` returns how many `T` exists in this array.
 17 | ///
 18 | /// [RGBA color model]: https://en.wikipedia.org/wiki/RGBA_color_model
 19 | #[repr(u32)]
 20 | #[derive(Debug, Clone, Copy, PartialEq, PartialOrd, FromPrimitive, ToPrimitive)]
 21 | pub enum NumChannels {
 22 |     /// Single element in each "CUDA Array element"
 23 |     One = 1,
 24 |     /// Two scalars in each CUDA Array element
 25 |     Two = 2,
 26 |     /// Four scalars in each CUDA Array element
 27 |     Four = 4,
 28 | }
 29 | 
 30 | impl Default for NumChannels {
 31 |     fn default() -> Self {
 32 |         NumChannels::One
 33 |     }
 34 | }
 35 | 
 36 | pub trait Dimension: Zero + Debug + Clone + Copy + PartialEq + Send + Sync + 'static {
 37 |     fn as_descriptor<T: Scalar>(&self) -> Descriptor;
 38 | 
 39 |     /// Number of elements
 40 |     fn len(&self) -> usize;
 41 | 
 42 |     /// Get number of element `T` in each "CUDA Array element"
 43 |     fn num_channels(&self) -> NumChannels;
 44 | 
 45 |     fn width(&self) -> usize {
 46 |         self.as_descriptor::<u32>().Width
 47 |     }
 48 | 
 49 |     fn height(&self) -> usize {
 50 |         std::cmp::max(self.as_descriptor::<u32>().Height, 1)
 51 |     }
 52 | 
 53 |     fn depth(&self) -> usize {
 54 |         std::cmp::max(self.as_descriptor::<u32>().Depth, 1)
 55 |     }
 56 | }
 57 | 
 58 | /// Spec of 1D Array
 59 | #[derive(Debug, Clone, Copy, PartialEq, new)]
 60 | pub struct Ix1 {
 61 |     pub width: usize,
 62 |     #[new(default)]
 63 |     pub num_channels: NumChannels,
 64 | }
 65 | 
 66 | impl From<usize> for Ix1 {
 67 |     fn from(width: usize) -> Ix1 {
 68 |         Ix1 {
 69 |             width,
 70 |             num_channels: NumChannels::One,
 71 |         }
 72 |     }
 73 | }
 74 | 
 75 | impl From<(usize,)> for Ix1 {
 76 |     fn from((width,): (usize,)) -> Ix1 {
 77 |         Ix1 {
 78 |             width,
 79 |             num_channels: NumChannels::One,
 80 |         }
 81 |     }
 82 | }
 83 | 
 84 | impl Add for Ix1 {
 85 |     type Output = Self;
 86 |     fn add(self, other: Self) -> Self {
 87 |         assert_eq!(self.num_channels, other.num_channels);
 88 |         Self {
 89 |             width: self.width + other.width,
 90 |             num_channels: self.num_channels,
 91 |         }
 92 |     }
 93 | }
 94 | 
 95 | impl Zero for Ix1 {
 96 |     fn zero() -> Self {
 97 |         Ix1::new(0)
 98 |     }
 99 | 
100 |     fn is_zero(&self) -> bool {
101 |         self.len() == 0
102 |     }
103 | }
104 | 
105 | impl Dimension for Ix1 {
106 |     fn as_descriptor<T: Scalar>(&self) -> Descriptor {
107 |         Descriptor {
108 |             Width: self.width,
109 |             Height: 0,
110 |             Depth: 0,
111 |             NumChannels: self.num_channels.to_u32().unwrap(),
112 |             Flags: ArrayFlag::empty().bits(),
113 |             Format: T::format(),
114 |         }
115 |     }
116 | 
117 |     fn len(&self) -> usize {
118 |         self.width * self.num_channels.to_usize().unwrap()
119 |     }
120 | 
121 |     fn num_channels(&self) -> NumChannels {
122 |         self.num_channels
123 |     }
124 | }
125 | 
126 | /// Spec of 2D Array
127 | #[derive(Debug, Clone, Copy, PartialEq, new)]
128 | pub struct Ix2 {
129 |     pub width: usize,
130 |     pub height: usize,
131 |     #[new(default)]
132 |     pub num_channels: NumChannels,
133 | }
134 | 
135 | impl From<(usize, usize)> for Ix2 {
136 |     fn from((width, height): (usize, usize)) -> Ix2 {
137 |         Ix2 {
138 |             width,
139 |             height,
140 |             num_channels: NumChannels::One,
141 |         }
142 |     }
143 | }
144 | 
145 | impl Add for Ix2 {
146 |     type Output = Self;
147 |     fn add(self, other: Self) -> Self {
148 |         assert_eq!(self.num_channels, other.num_channels);
149 |         Self {
150 |             width: self.width + other.width,
151 |             height: self.height + other.height,
152 |             num_channels: self.num_channels,
153 |         }
154 |     }
155 | }
156 | 
157 | impl Zero for Ix2 {
158 |     fn zero() -> Self {
159 |         Ix2::new(0, 0)
160 |     }
161 | 
162 |     fn is_zero(&self) -> bool {
163 |         self.len() == 0
164 |     }
165 | }
166 | 
167 | impl Dimension for Ix2 {
168 |     fn as_descriptor<T: Scalar>(&self) -> Descriptor {
169 |         Descriptor {
170 |             Width: self.width,
171 |             Height: self.height,
172 |             Depth: 0,
173 |             NumChannels: self.num_channels.to_u32().unwrap(),
174 |             Flags: ArrayFlag::empty().bits(),
175 |             Format: T::format(),
176 |         }
177 |     }
178 | 
179 |     fn len(&self) -> usize {
180 |         self.width * self.height * self.num_channels.to_usize().unwrap()
181 |     }
182 | 
183 |     fn num_channels(&self) -> NumChannels {
184 |         self.num_channels
185 |     }
186 | }
187 | 
188 | /// Spec of 3D Array
189 | #[derive(Debug, Clone, Copy, PartialEq, new)]
190 | pub struct Ix3 {
191 |     pub width: usize,
192 |     pub height: usize,
193 |     pub depth: usize,
194 |     #[new(default)]
195 |     pub num_channels: NumChannels,
196 | }
197 | 
198 | impl From<(usize, usize, usize)> for Ix3 {
199 |     fn from((width, height, depth): (usize, usize, usize)) -> Ix3 {
200 |         Ix3 {
201 |             width,
202 |             height,
203 |             depth,
204 |             num_channels: NumChannels::One,
205 |         }
206 |     }
207 | }
208 | 
209 | impl Add for Ix3 {
210 |     type Output = Self;
211 |     fn add(self, other: Self) -> Self {
212 |         assert_eq!(self.num_channels, other.num_channels);
213 |         Self {
214 |             width: self.width + other.width,
215 |             height: self.height + other.height,
216 |             depth: self.depth + other.depth,
217 |             num_channels: self.num_channels,
218 |         }
219 |     }
220 | }
221 | 
222 | impl Zero for Ix3 {
223 |     fn zero() -> Self {
224 |         Ix3::new(0, 0, 0)
225 |     }
226 | 
227 |     fn is_zero(&self) -> bool {
228 |         self.len() == 0
229 |     }
230 | }
231 | 
232 | impl Dimension for Ix3 {
233 |     fn as_descriptor<T: Scalar>(&self) -> Descriptor {
234 |         Descriptor {
235 |             Width: self.width,
236 |             Height: self.height,
237 |             Depth: self.depth,
238 |             NumChannels: self.num_channels.to_u32().unwrap(),
239 |             Flags: ArrayFlag::empty().bits(),
240 |             Format: T::format(),
241 |         }
242 |     }
243 | 
244 |     fn len(&self) -> usize {
245 |         self.width * self.height * self.depth * self.num_channels().to_usize().unwrap()
246 |     }
247 | 
248 |     fn num_channels(&self) -> NumChannels {
249 |         self.num_channels
250 |     }
251 | }
252 | 
253 | /// Spec of Layered 1D Array
254 | #[derive(Debug, Clone, Copy, PartialEq, new)]
255 | pub struct Ix1Layered {
256 |     /// Width of each layer
257 |     pub width: usize,
258 |     /// Depth of layer
259 |     pub depth: usize,
260 |     #[new(default)]
261 |     pub num_channels: NumChannels,
262 | }
263 | 
264 | impl From<(usize, usize)> for Ix1Layered {
265 |     fn from((width, depth): (usize, usize)) -> Ix1Layered {
266 |         Ix1Layered {
267 |             width,
268 |             depth,
269 |             num_channels: NumChannels::One,
270 |         }
271 |     }
272 | }
273 | 
274 | impl Add for Ix1Layered {
275 |     type Output = Self;
276 |     fn add(self, other: Self) -> Self {
277 |         assert_eq!(self.num_channels, other.num_channels);
278 |         Self {
279 |             width: self.width + other.width,
280 |             depth: self.depth + other.depth,
281 |             num_channels: self.num_channels,
282 |         }
283 |     }
284 | }
285 | 
286 | impl Zero for Ix1Layered {
287 |     fn zero() -> Self {
288 |         Self::new(0, 0)
289 |     }
290 | 
291 |     fn is_zero(&self) -> bool {
292 |         self.len() == 0
293 |     }
294 | }
295 | 
296 | impl Dimension for Ix1Layered {
297 |     fn as_descriptor<T: Scalar>(&self) -> Descriptor {
298 |         Descriptor {
299 |             Width: self.width,
300 |             Height: 0,
301 |             Depth: self.depth,
302 |             NumChannels: self.num_channels.to_u32().unwrap(),
303 |             Flags: ArrayFlag::LAYERED.bits(),
304 |             Format: T::format(),
305 |         }
306 |     }
307 | 
308 |     fn len(&self) -> usize {
309 |         self.width * self.depth * self.num_channels.to_usize().unwrap()
310 |     }
311 | 
312 |     fn num_channels(&self) -> NumChannels {
313 |         self.num_channels
314 |     }
315 | }
316 | 
317 | /// Spec of Layered 2D Array
318 | #[derive(Debug, Clone, Copy, PartialEq, new)]
319 | pub struct Ix2Layered {
320 |     /// Width of each layer
321 |     pub width: usize,
322 |     /// height of each layer
323 |     pub height: usize,
324 |     /// Depth of layer
325 |     pub depth: usize,
326 |     #[new(default)]
327 |     pub num_channels: NumChannels,
328 | }
329 | 
330 | impl From<(usize, usize, usize)> for Ix2Layered {
331 |     fn from((width, height, depth): (usize, usize, usize)) -> Ix2Layered {
332 |         Ix2Layered {
333 |             width,
334 |             height,
335 |             depth,
336 |             num_channels: NumChannels::One,
337 |         }
338 |     }
339 | }
340 | 
341 | impl Add for Ix2Layered {
342 |     type Output = Self;
343 |     fn add(self, other: Self) -> Self {
344 |         assert_eq!(self.num_channels, other.num_channels);
345 |         Self {
346 |             width: self.width + other.width,
347 |             height: self.height + other.height,
348 |             depth: self.depth + other.depth,
349 |             num_channels: self.num_channels,
350 |         }
351 |     }
352 | }
353 | 
354 | impl Zero for Ix2Layered {
355 |     fn zero() -> Self {
356 |         Self::new(0, 0, 0)
357 |     }
358 | 
359 |     fn is_zero(&self) -> bool {
360 |         self.len() == 0
361 |     }
362 | }
363 | 
364 | impl Dimension for Ix2Layered {
365 |     fn as_descriptor<T: Scalar>(&self) -> Descriptor {
366 |         Descriptor {
367 |             Width: self.width,
368 |             Height: self.height,
369 |             Depth: self.depth,
370 |             NumChannels: self.num_channels.to_u32().unwrap(),
371 |             Flags: ArrayFlag::LAYERED.bits(),
372 |             Format: T::format(),
373 |         }
374 |     }
375 | 
376 |     fn len(&self) -> usize {
377 |         self.width * self.height * self.depth * self.num_channels.to_usize().unwrap()
378 |     }
379 | 
380 |     fn num_channels(&self) -> NumChannels {
381 |         self.num_channels
382 |     }
383 | }
384 | 
385 | bitflags::bitflags! {
386 |     pub struct ArrayFlag: u32 {
387 |         /// If set, the CUDA array is a collection of layers, where each layer is either a 1D or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number of layers, not the depth of a 3D array.
388 |         const LAYERED = 0x01;
389 |         /// This flag must be set in order to bind a surface reference to the CUDA array
390 |         const SURFACE_LDST = 0x02;
391 |         /// If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The width of such a CUDA array must be equal to its height, and Depth must be six. If CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps and Depth must be a multiple of six.
392 |         const CUBEMAP = 0x04;
393 |         /// This flag must be set in order to perform texture gather operations on a CUDA array.
394 |         const TEXTURE_GATHER = 0x08;
395 |         /// This flag if set indicates that the CUDA array is a DEPTH_TEXTURE.
396 |         const DEPTH_TEXTURE = 0x10;
397 |         /// This flag indicates that the CUDA array may be bound as a color target in an external graphics API
398 |         const COLOR_ATTACHMENT = 0x20;
399 |     }
400 | }
401 | 


--------------------------------------------------------------------------------
/accel/src/memory/info.rs:
--------------------------------------------------------------------------------
 1 | use crate::{contexted_call, device::*};
 2 | use cuda::*;
 3 | 
 4 | /// Total and Free memory size of the device (in bytes)
 5 | #[derive(Debug, Clone, Copy, PartialEq)]
 6 | struct MemoryInfo {
 7 |     free: usize,
 8 |     total: usize,
 9 | }
10 | 
11 | impl MemoryInfo {
12 |     fn get(ctx: Context) -> Self {
13 |         let mut free = 0;
14 |         let mut total = 0;
15 |         unsafe {
16 |             contexted_call!(
17 |                 &ctx,
18 |                 cuMemGetInfo_v2,
19 |                 &mut free as *mut usize,
20 |                 &mut total as *mut usize
21 |             )
22 |         }
23 |         .expect("Cannot get memory info");
24 |         MemoryInfo { free, total }
25 |     }
26 | }
27 | 
28 | /// Get total memory size in bytes of the current device
29 | ///
30 | /// Panic
31 | /// ------
32 | /// - when given context is not current
33 | pub fn total_memory(ctx: Context) -> usize {
34 |     MemoryInfo::get(ctx).total
35 | }
36 | 
37 | /// Get free memory size in bytes of the current device
38 | ///
39 | /// Panic
40 | /// ------
41 | /// - when given context is not current
42 | pub fn free_memory(ctx: Context) -> usize {
43 |     MemoryInfo::get(ctx).free
44 | }
45 | 
46 | #[cfg(test)]
47 | mod tests {
48 |     use super::*;
49 |     use crate::error::*;
50 | 
51 |     #[test]
52 |     fn info() -> Result<()> {
53 |         let device = Device::nth(0)?;
54 |         let ctx = device.create_context();
55 |         let mem_info = MemoryInfo::get(ctx);
56 |         dbg!(&mem_info);
57 |         assert!(mem_info.free > 0);
58 |         assert!(mem_info.total > mem_info.free);
59 |         Ok(())
60 |     }
61 | }
62 | 


--------------------------------------------------------------------------------
/accel/src/memory/mod.rs:
--------------------------------------------------------------------------------
  1 | //! Memory management
  2 | //!
  3 | //! Unified address
  4 | //! ---------------
  5 | //!
  6 | //! - All memories are mapped into a single 64bit memory space
  7 | //! - We can get where the pointed memory exists from its value.
  8 | //!
  9 | //! Memory Types
 10 | //! ------------
 11 | //!
 12 | //! |name                 | where exists | From Host | From Device | As slice | Description                                                            |
 13 | //! |:--------------------|:------------:|:---------:|:-----------:|:--------:|:-----------------------------------------------------------------------|
 14 | //! | (usual) Host memory | Host         | ✓         |  -          |  ✓       | allocated by usual manner, e.g. `vec![0; n]`                           |
 15 | //! | [RegisteredMemory]  | Host         | ✓         |  ✓          |  ✓       | A host memory registered into CUDA memory management system            |
 16 | //! | [PageLockedMemory]  | Host         | ✓         |  ✓          |  ✓       | OS memory paging is disabled for accelerating memory transfer          |
 17 | //! | [DeviceMemory]      | Device       | ✓         |  ✓          |  ✓       | allocated on device as a single span                                   |
 18 | //! | [Array]             | Device       | ✓         |  ✓          |  -       | properly aligned memory on device for using Texture and Surface memory |
 19 | //!
 20 | //! Traits
 21 | //! -------
 22 | //!
 23 | //! |traits       |`[T]`|[RegisteredMemory]|[PageLockedMemory]|[DeviceMemory]|[Array]| Description                                |
 24 | //! |:------------|:---:|:----------------:|:----------------:|:------------:|:-----:|:-------------------------------------------|
 25 | //! |[Memory]     | ✓   | ✓                | ✓                | ✓            | ✓     | Has Unified address and element size       |
 26 | //! |[Contexted]  | -   | ✓                | ✓                | ✓            | ✓     | with CUDA Context                          |
 27 | //! |[Continuous] | ✓   | ✓                | ✓                | ✓            | -     | Can be treated as a Rust slice             |
 28 | //! |[Allocatable]| -   | -                | ✓                | ✓            | ✓     | Newly allocatable with its shape and value |
 29 | //!
 30 | //! [RegisteredMemory]: ./struct.RegisteredMemory.html
 31 | //! [PageLockedMemory]: ./struct.PageLockedMemory.html
 32 | //! [DeviceMemory]: ./struct.DeviceMemory.html
 33 | //! [Array]: ./struct.Array.html
 34 | //!
 35 | //! [Memory]: ./trait.Memory.html
 36 | //! [Memset]: ./trait.Memset.html
 37 | //! [Contexted]: ../device/trait.Contexted.html
 38 | //! [Continuous]: ./trait.Continuous.html
 39 | //! [Allocatable]: ./trait.Allocatable.html
 40 | 
 41 | mod array;
 42 | mod device;
 43 | mod dimension;
 44 | mod info;
 45 | mod page_locked;
 46 | mod registered;
 47 | mod scalar;
 48 | mod slice;
 49 | 
 50 | pub use array::*;
 51 | pub use device::*;
 52 | pub use dimension::*;
 53 | pub use info::*;
 54 | pub use page_locked::*;
 55 | pub use registered::*;
 56 | pub use scalar::*;
 57 | 
 58 | use crate::*;
 59 | use cuda::*;
 60 | use futures::future::BoxFuture;
 61 | use num_traits::Zero;
 62 | use std::{ffi::c_void, mem::MaybeUninit};
 63 | 
 64 | /// Memory type
 65 | ///
 66 | /// Because of [unified addressing], we can get the memory type after casted into slice:
 67 | ///
 68 | /// - [DeviceMemory]
 69 | ///
 70 | /// ```
 71 | /// # use accel::{*, memory::*};
 72 | /// # let device = Device::nth(0).unwrap();
 73 | /// # let ctx = device.create_context();
 74 | /// let mem = DeviceMemory::<i32>::zeros(&ctx, 12);
 75 | /// let sl = mem.as_slice();
 76 | /// assert_eq!(sl.memory_type(), MemoryType::Device);
 77 | /// ```
 78 | ///
 79 | /// - [PageLockedMemory]
 80 | ///
 81 | /// ```
 82 | /// # use accel::{*, memory::*};
 83 | /// # let device = Device::nth(0).unwrap();
 84 | /// # let ctx = device.create_context();
 85 | /// let mem = PageLockedMemory::<i32>::zeros(&ctx, 12);
 86 | /// let sl = mem.as_slice();
 87 | /// assert_eq!(sl.memory_type(), MemoryType::PageLocked);
 88 | /// ```
 89 | ///
 90 | /// - [RegisteredMemory]
 91 | ///   - Be sure that [RegisteredMemory] and [PageLockedMemory] are indistinguishable
 92 | ///
 93 | /// ```
 94 | /// # use accel::{*, memory::*};
 95 | /// # let device = Device::nth(0).unwrap();
 96 | /// # let ctx = device.create_context();
 97 | /// let mut a = vec![0_i32; 12];
 98 | /// let mem = RegisteredMemory::<i32>::new(&ctx, &mut a);
 99 | /// let sl = mem.as_slice();
100 | /// assert_eq!(sl.memory_type(), MemoryType::PageLocked);
101 | /// ```
102 | ///
103 | /// - [Array] cannot be casted into a slice
104 | ///
105 | /// [unified addressing]: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__UNIFIED.html#group__CUDA__UNIFIED
106 | /// [Array]: ./struct.Array.html
107 | /// [DeviceMemory]: ./struct.DeviceMemory.html
108 | /// [RegisteredMemory]: ./struct.RegisteredMemory.html
109 | /// [PageLockedMemory]: ./struct.PageLockedMemory.html
110 | ///
111 | #[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
112 | pub enum MemoryType {
113 |     /// Host memory **not** managed by CUDA memory system
114 |     Host,
115 |     /// Host memory managed by CUDA memory system, i.e.
116 |     /// [RegisteredMemory](./struct.RegisteredMemory.html), and
117 |     /// [PageLockedMemory](./struct.PageLockedMemory.html)
118 |     PageLocked,
119 |     /// Device memory
120 |     Device,
121 |     /// Array memory
122 |     Array,
123 | }
124 | 
125 | /// Has unique head address and allocated size.
126 | pub trait Memory {
127 |     /// Scalar type of each element
128 |     type Elem: Scalar;
129 | 
130 |     /// Get head address of the memory as a const pointer
131 |     fn head_addr(&self) -> *const Self::Elem;
132 | 
133 |     /// Get head address of the memory as a mutable pointer
134 |     fn head_addr_mut(&mut self) -> *mut Self::Elem;
135 | 
136 |     /// Number of elements
137 |     fn num_elem(&self) -> usize;
138 | 
139 |     /// Get memory type, See [MemoryType](./enum.MemoryType.html) for detail.
140 |     fn memory_type(&self) -> MemoryType;
141 | 
142 |     /// Set all elements by `value`
143 |     ///
144 |     /// Examples
145 |     /// ---------
146 |     ///
147 |     /// - Set `i32`
148 |     ///
149 |     /// ```
150 |     /// # use accel::*;
151 |     /// # let device = Device::nth(0).unwrap();
152 |     /// # let ctx = device.create_context();
153 |     /// let mut mem = DeviceMemory::<i32>::zeros(&ctx, 12);
154 |     /// mem.set(1234);
155 |     /// for &val in mem.as_slice() {
156 |     ///   assert_eq!(val, 1234);
157 |     /// }
158 |     /// ```
159 |     ///
160 |     /// - Set `f32`
161 |     ///   - Be sure that `f64` is not supported yet because CUDA does not support 64-bit memset.
162 |     ///
163 |     /// ```
164 |     /// # use accel::*;
165 |     /// # let device = Device::nth(0).unwrap();
166 |     /// # let ctx = device.create_context();
167 |     /// let mut mem = DeviceMemory::<f32>::zeros(&ctx, 12);
168 |     /// mem.set(1.0);
169 |     /// for &val in mem.as_slice() {
170 |     ///   assert_eq!(val, 1.0);
171 |     /// }
172 |     /// ```
173 |     ///
174 |     /// - Set for host memory equals to `mem.iter_mut().for_each(|v| *v = value)`
175 |     ///
176 |     /// ```
177 |     /// # use accel::*;
178 |     /// # let device = Device::nth(0).unwrap();
179 |     /// # let ctx = device.create_context();
180 |     /// let mut mem = PageLockedMemory::<i32>::zeros(&ctx, 12);
181 |     /// mem.set(1234);
182 |     /// for &val in mem.as_slice() {
183 |     ///   assert_eq!(val, 1234);
184 |     /// }
185 |     /// ```
186 |     fn set(&mut self, value: Self::Elem);
187 | }
188 | 
189 | /// Copy data from one to another
190 | pub trait Memcpy<Target: Memory<Elem = Self::Elem> + ?Sized>: Memory {
191 |     /// Examples
192 |     /// ---------
193 |     ///
194 |     /// - memcpy from page-locked host memory to device memory
195 |     ///
196 |     /// ```
197 |     /// # use accel::*;
198 |     /// # let device = Device::nth(0).unwrap();
199 |     /// # let ctx = device.create_context();
200 |     /// let mut dest = DeviceMemory::<i32>::zeros(&ctx, 12);
201 |     /// let src = PageLockedMemory::<i32>::zeros(&ctx, 12);
202 |     /// dest.copy_from(&src);
203 |     /// ```
204 |     ///
205 |     /// - memcpy from device memory to page-locked host memory
206 |     ///
207 |     /// ```
208 |     /// # use accel::*;
209 |     /// # let device = Device::nth(0).unwrap();
210 |     /// # let ctx = device.create_context();
211 |     /// let mut dest = PageLockedMemory::<i32>::zeros(&ctx, 12);
212 |     /// let src = DeviceMemory::<i32>::zeros(&ctx, 12);
213 |     /// dest.copy_from(&src);
214 |     /// ```
215 |     ///
216 |     /// - memcpy from device to device
217 |     ///
218 |     /// ```
219 |     /// # use accel::*;
220 |     /// # let device = Device::nth(0).unwrap();
221 |     /// # let ctx = device.create_context();
222 |     /// let mut dest = DeviceMemory::<i32>::zeros(&ctx, 12);
223 |     /// let src = DeviceMemory::<i32>::zeros(&ctx, 12);
224 |     /// dest.copy_from(&src);
225 |     /// ```
226 |     ///
227 |     /// - memcpy from Rust slice to device memory
228 |     ///
229 |     /// ```
230 |     /// # use accel::*;
231 |     /// # use std::ops::DerefMut;
232 |     /// # let device = Device::nth(0).unwrap();
233 |     /// # let ctx = device.create_context();
234 |     /// let mut dest = DeviceMemory::<i32>::zeros(&ctx, 12);
235 |     /// let src = vec![0_i32; 12];
236 |     /// dest.copy_from(src.as_slice()); // requires explicit cast to slice
237 |     /// ```
238 |     ///
239 |     /// - memcpy from device memory to Rust slice
240 |     ///
241 |     /// ```
242 |     /// # use accel::*;
243 |     /// # let device = Device::nth(0).unwrap();
244 |     /// # let ctx = device.create_context();
245 |     /// let mut dest = vec![0_i32; 12];
246 |     /// let src = DeviceMemory::<i32>::zeros(&ctx, 12);
247 |     /// dest.copy_from(&src);
248 |     /// ```
249 |     ///
250 |     /// - Cannot copy between different types
251 |     ///
252 |     /// ```compile_fail
253 |     /// # use accel::*;
254 |     /// # let device = Device::nth(0).unwrap();
255 |     /// # let ctx = device.create_context();
256 |     /// let mut dest = DeviceMemory::<i64>::zeros(&ctx, 12);
257 |     /// let src = PageLockedMemory::<i32>::zeros(&ctx, 12);
258 |     /// dest.copy_from(&src); // compile fail
259 |     /// ```
260 |     ///
261 |     /// - Panics if sizes are different
262 |     ///
263 |     /// ```should_panic
264 |     /// # use accel::*;
265 |     /// # let device = Device::nth(0).unwrap();
266 |     /// # let ctx = device.create_context();
267 |     /// let mut dest = DeviceMemory::<i32>::zeros(&ctx, 24);
268 |     /// let src = PageLockedMemory::<i32>::zeros(&ctx, 12);
269 |     /// dest.copy_from(&src); // will panic
270 |     /// ```
271 |     ///
272 |     /// Panics
273 |     /// -------
274 |     ///
275 |     /// - if `self` and `src` are identical
276 |     /// - if sizes of memory mismatch
277 |     ///
278 |     fn copy_from(&mut self, source: &Target);
279 | 
280 |     /// Copy data in async manner
281 |     ///
282 |     /// ```
283 |     /// use accel::*;
284 |     ///
285 |     /// #[tokio::main]
286 |     /// async fn main() {
287 |     ///   let device = Device::nth(0).unwrap();
288 |     ///   let ctx = device.create_context();
289 |     ///   let mut dest = DeviceMemory::<f32>::zeros(&ctx, 12);
290 |     ///   let src = PageLockedMemory::<f32>::zeros(&ctx, 12);
291 |     ///   dest.copy_from_async(&src).await;
292 |     /// }
293 |     /// ```
294 |     ///
295 |     /// - Arrays are captured until await:
296 |     ///
297 |     /// ```
298 |     /// # use accel::*;
299 |     /// # #[tokio::main]
300 |     /// # async fn main() {
301 |     /// # let device = Device::nth(0).unwrap();
302 |     /// # let ctx = device.create_context();
303 |     /// # let mut dest = DeviceMemory::<f32>::zeros(&ctx, 12);
304 |     /// # let src = PageLockedMemory::<f32>::zeros(&ctx, 12);
305 |     /// let future = dest.copy_from_async(&src);
306 |     /// println!("src[0] = {}", src[0]);  // Source is always accessible as usual &-reference
307 |     /// future.await;
308 |     /// # }
309 |     /// ```
310 |     ///
311 |     /// ```compile_fail
312 |     /// # use accel::*;
313 |     /// # #[tokio::main]
314 |     /// # async fn main() {
315 |     /// # let device = Device::nth(0).unwrap();
316 |     /// # let ctx = device.create_context();
317 |     /// # let mut dest = DeviceMemory::<f32>::zeros(&ctx, 12);
318 |     /// # let src = PageLockedMemory::<f32>::zeros(&ctx, 12);
319 |     /// let future = dest.copy_from_async(&src);
320 |     /// println!("dest[0] = {}", dest[0]);  // Destination is not accessible until .await
321 |     /// future.await;
322 |     /// # }
323 |     /// ```
324 |     fn copy_from_async<'a>(&'a mut self, src: &'a Target) -> BoxFuture<'a, ()>;
325 | }
326 | 
327 | /// Allocatable memories with CUDA context
328 | pub trait Allocatable: Contexted + Memory + Sized {
329 |     /// Shape for initialization
330 |     type Shape: Zero;
331 | 
332 |     /// Allocate a memory without initialization
333 |     ///
334 |     /// Safety
335 |     /// ------
336 |     /// - Cause undefined behavior when read before write
337 |     ///
338 |     /// Panic
339 |     /// ------
340 |     /// - if shape is zero
341 |     unsafe fn uninitialized(ctx: &Context, shape: Self::Shape) -> Self;
342 | 
343 |     /// uniformly initialized
344 |     ///
345 |     /// Panic
346 |     /// ------
347 |     /// - if shape is zero
348 |     fn from_elem(ctx: &Context, shape: Self::Shape, elem: Self::Elem) -> Self {
349 |         let mut mem = unsafe { Self::uninitialized(ctx, shape) };
350 |         mem.set(elem);
351 |         mem
352 |     }
353 | 
354 |     /// uniformly initialized by zero
355 |     ///
356 |     /// Panic
357 |     /// ------
358 |     /// - if shape is zero
359 |     fn zeros(ctx: &Context, shape: Self::Shape) -> Self {
360 |         Self::from_elem(ctx, shape, <Self::Elem as Zero>::zero())
361 |     }
362 | }
363 | 
364 | /// Memory which has continuous 1D index, i.e. can be treated as a Rust slice
365 | pub trait Continuous: Memory {
366 |     fn as_slice(&self) -> &[Self::Elem];
367 |     fn as_mut_slice(&mut self) -> &mut [Self::Elem];
368 | }
369 | 


--------------------------------------------------------------------------------
/accel/src/memory/page_locked.rs:
--------------------------------------------------------------------------------
  1 | //! Device and Host memory handlers
  2 | 
  3 | use super::*;
  4 | use crate::{error::Result, *};
  5 | use cuda::*;
  6 | use std::{
  7 |     fmt,
  8 |     ops::{Deref, DerefMut},
  9 | };
 10 | 
 11 | /// Host memory as page-locked.
 12 | ///
 13 | /// Allocating excessive amounts of pinned memory may degrade system performance,
 14 | /// since it reduces the amount of memory available to the system for paging.
 15 | /// As a result, this function is best used sparingly to allocate staging areas for data exchange between host and device.
 16 | ///
 17 | /// See also [cuMemAllocHost].
 18 | ///
 19 | /// [cuMemAllocHost]: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
 20 | #[derive(Contexted)]
 21 | pub struct PageLockedMemory<T> {
 22 |     ptr: *mut T,
 23 |     size: usize,
 24 |     context: Context,
 25 | }
 26 | 
 27 | unsafe impl<T> Sync for PageLockedMemory<T> {}
 28 | unsafe impl<T> Send for PageLockedMemory<T> {}
 29 | 
 30 | impl<T> Drop for PageLockedMemory<T> {
 31 |     fn drop(&mut self) {
 32 |         if let Err(e) = unsafe { contexted_call!(self, cuMemFreeHost, self.ptr as *mut _) } {
 33 |             log::error!("Cannot free page-locked memory: {:?}", e);
 34 |         }
 35 |     }
 36 | }
 37 | 
 38 | impl<T: Scalar> fmt::Debug for PageLockedMemory<T> {
 39 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 40 |         f.debug_struct("PageLockedMemory")
 41 |             .field("context", &self.context)
 42 |             .field("data", &self.as_slice())
 43 |             .finish()
 44 |     }
 45 | }
 46 | 
 47 | impl<T> Deref for PageLockedMemory<T> {
 48 |     type Target = [T];
 49 |     fn deref(&self) -> &[T] {
 50 |         unsafe { std::slice::from_raw_parts(self.ptr as _, self.size) }
 51 |     }
 52 | }
 53 | 
 54 | impl<T> DerefMut for PageLockedMemory<T> {
 55 |     fn deref_mut(&mut self) -> &mut [T] {
 56 |         unsafe { std::slice::from_raw_parts_mut(self.ptr, self.size) }
 57 |     }
 58 | }
 59 | 
 60 | impl<T: Scalar> PartialEq for PageLockedMemory<T> {
 61 |     fn eq(&self, other: &Self) -> bool {
 62 |         self.as_slice().eq(other.as_slice())
 63 |     }
 64 | }
 65 | 
 66 | impl<T: Scalar> PartialEq<[T]> for PageLockedMemory<T> {
 67 |     fn eq(&self, other: &[T]) -> bool {
 68 |         self.as_slice().eq(other)
 69 |     }
 70 | }
 71 | 
 72 | impl<T: Scalar> Memory for PageLockedMemory<T> {
 73 |     type Elem = T;
 74 |     fn head_addr(&self) -> *const T {
 75 |         self.ptr as _
 76 |     }
 77 | 
 78 |     fn head_addr_mut(&mut self) -> *mut T {
 79 |         self.ptr as _
 80 |     }
 81 | 
 82 |     fn num_elem(&self) -> usize {
 83 |         self.size
 84 |     }
 85 | 
 86 |     fn memory_type(&self) -> MemoryType {
 87 |         MemoryType::PageLocked
 88 |     }
 89 | 
 90 |     fn set(&mut self, value: Self::Elem) {
 91 |         self.iter_mut().for_each(|v| *v = value);
 92 |     }
 93 | }
 94 | 
 95 | impl<T: Scalar> Continuous for PageLockedMemory<T> {
 96 |     fn as_slice(&self) -> &[T] {
 97 |         self
 98 |     }
 99 |     fn as_mut_slice(&mut self) -> &mut [T] {
100 |         self
101 |     }
102 | }
103 | 
104 | impl<T: Scalar> Allocatable for PageLockedMemory<T> {
105 |     type Shape = usize;
106 |     unsafe fn uninitialized(context: &Context, size: usize) -> Self {
107 |         assert!(size > 0, "Zero-sized malloc is forbidden");
108 |         let ptr = contexted_new!(context, cuMemAllocHost_v2, size * std::mem::size_of::<T>())
109 |             .expect("Cannot allocate page-locked memory");
110 |         Self {
111 |             ptr: ptr as *mut T,
112 |             size,
113 |             context: context.clone(),
114 |         }
115 |     }
116 | }
117 | 
118 | impl<'arg, T: Scalar> DeviceSend for &'arg PageLockedMemory<T> {
119 |     type Target = *const T;
120 |     fn as_kernel_parameter(&self) -> *mut c_void {
121 |         &self.ptr as *const *mut T as *mut c_void
122 |     }
123 | }
124 | 
125 | impl<'arg, T: Scalar> DeviceSend for &'arg mut PageLockedMemory<T> {
126 |     type Target = *mut T;
127 |     fn as_kernel_parameter(&self) -> *mut c_void {
128 |         &self.ptr as *const *mut T as *mut c_void
129 |     }
130 | }
131 | 
132 | #[cfg(test)]
133 | mod tests {
134 |     use super::*;
135 | 
136 |     #[test]
137 |     fn as_mut_slice() -> Result<()> {
138 |         let device = Device::nth(0)?;
139 |         let context = device.create_context();
140 |         let mut mem = PageLockedMemory::<i32>::zeros(&context, 12);
141 |         let sl = mem.as_mut_slice();
142 | 
143 |         sl[0] = 3; // test if accessible
144 |         assert_eq!(sl.num_elem(), 12);
145 |         Ok(())
146 |     }
147 | 
148 |     #[should_panic(expected = "Zero-sized malloc is forbidden")]
149 |     #[test]
150 |     fn page_locked_new_zero() {
151 |         let device = Device::nth(0).unwrap();
152 |         let context = device.create_context();
153 |         let _a = PageLockedMemory::<i32>::zeros(&context, 0);
154 |     }
155 | }
156 | 


--------------------------------------------------------------------------------
/accel/src/memory/registered.rs:
--------------------------------------------------------------------------------
  1 | use super::*;
  2 | use crate::{error::Result, *};
  3 | use cuda::*;
  4 | use std::{
  5 |     ffi::c_void,
  6 |     ops::{Deref, DerefMut},
  7 | };
  8 | 
  9 | #[derive(Contexted, Debug)]
 10 | pub struct RegisteredMemory<'a, T> {
 11 |     context: Context,
 12 |     data: &'a mut [T],
 13 | }
 14 | 
 15 | unsafe impl<T> Sync for RegisteredMemory<'_, T> {}
 16 | unsafe impl<T> Send for RegisteredMemory<'_, T> {}
 17 | 
 18 | impl<T> Deref for RegisteredMemory<'_, T> {
 19 |     type Target = [T];
 20 |     fn deref(&self) -> &[T] {
 21 |         self.data
 22 |     }
 23 | }
 24 | 
 25 | impl<T> DerefMut for RegisteredMemory<'_, T> {
 26 |     fn deref_mut(&mut self) -> &mut [T] {
 27 |         self.data
 28 |     }
 29 | }
 30 | 
 31 | impl<T: Scalar> PartialEq for RegisteredMemory<'_, T> {
 32 |     fn eq(&self, other: &Self) -> bool {
 33 |         self.as_slice().eq(other.as_slice())
 34 |     }
 35 | }
 36 | 
 37 | impl<T: Scalar> PartialEq<[T]> for RegisteredMemory<'_, T> {
 38 |     fn eq(&self, other: &[T]) -> bool {
 39 |         self.as_slice().eq(other)
 40 |     }
 41 | }
 42 | 
 43 | impl<T> Drop for RegisteredMemory<'_, T> {
 44 |     fn drop(&mut self) {
 45 |         if let Err(e) = unsafe {
 46 |             contexted_call!(
 47 |                 &self.context,
 48 |                 cuMemHostUnregister,
 49 |                 self.data.as_mut_ptr() as *mut c_void
 50 |             )
 51 |         } {
 52 |             log::error!("Failed to unregister memory: {:?}", e);
 53 |         }
 54 |     }
 55 | }
 56 | 
 57 | impl<'a, T: Scalar> RegisteredMemory<'a, T> {
 58 |     pub fn new(context: &Context, data: &'a mut [T]) -> Self {
 59 |         unsafe {
 60 |             contexted_call!(
 61 |                 context,
 62 |                 cuMemHostRegister_v2,
 63 |                 data.as_mut_ptr() as *mut c_void,
 64 |                 data.len() * T::size_of(),
 65 |                 0
 66 |             )
 67 |         }
 68 |         .expect("Failed to register host memory into CUDA memory system");
 69 |         Self {
 70 |             context: context.clone(),
 71 |             data,
 72 |         }
 73 |     }
 74 | }
 75 | 
 76 | impl<T: Scalar> Memory for RegisteredMemory<'_, T> {
 77 |     type Elem = T;
 78 | 
 79 |     fn head_addr(&self) -> *const T {
 80 |         self.data.as_ptr()
 81 |     }
 82 | 
 83 |     fn head_addr_mut(&mut self) -> *mut T {
 84 |         self.data.as_mut_ptr()
 85 |     }
 86 | 
 87 |     fn num_elem(&self) -> usize {
 88 |         self.data.len()
 89 |     }
 90 | 
 91 |     fn memory_type(&self) -> MemoryType {
 92 |         MemoryType::Host
 93 |     }
 94 | 
 95 |     fn set(&mut self, value: Self::Elem) {
 96 |         self.iter_mut().for_each(|v| *v = value);
 97 |     }
 98 | }
 99 | 
100 | impl<T: Scalar> Continuous for RegisteredMemory<'_, T> {
101 |     fn as_slice(&self) -> &[T] {
102 |         self
103 |     }
104 |     fn as_mut_slice(&mut self) -> &mut [T] {
105 |         self
106 |     }
107 | }
108 | 
109 | impl<'arg, 'a: 'arg, T: Scalar> DeviceSend for &'arg RegisteredMemory<'a, T> {
110 |     type Target = *const T;
111 |     fn as_kernel_parameter(&self) -> *mut c_void {
112 |         self.data.as_kernel_parameter()
113 |     }
114 | }
115 | 
116 | impl<'arg, 'a: 'arg, T: Scalar> DeviceSend for &'arg mut RegisteredMemory<'a, T> {
117 |     type Target = *mut T;
118 |     fn as_kernel_parameter(&self) -> *mut c_void {
119 |         self.data.as_kernel_parameter()
120 |     }
121 | }
122 | 


--------------------------------------------------------------------------------
/accel/src/memory/scalar.rs:
--------------------------------------------------------------------------------
 1 | pub use cuda::CUarray_format as ArrayFormatTag;
 2 | use num_traits::Num;
 3 | 
 4 | pub trait Scalar: Num + std::fmt::Debug + Copy + Send + Sync {
 5 |     fn format() -> ArrayFormatTag;
 6 | 
 7 |     fn size_of() -> usize {
 8 |         std::mem::size_of::<Self>()
 9 |     }
10 | 
11 |     /// Get little endian format in u8
12 |     fn to_le_u8(self) -> Option<u8> {
13 |         assert_ne!(Self::size_of(), u8::size_of());
14 |         None
15 |     }
16 | 
17 |     /// Get little endian format in u16
18 |     fn to_le_u16(self) -> Option<u16> {
19 |         assert_ne!(Self::size_of(), u16::size_of());
20 |         None
21 |     }
22 | 
23 |     /// Get little endian format in u32
24 |     fn to_le_u32(self) -> Option<u32> {
25 |         assert_ne!(Self::size_of(), u32::size_of());
26 |         None
27 |     }
28 | }
29 | 
30 | macro_rules! impl_array_scalar {
31 |     ($scalar:ty, $le:ty, $format:ident) => {
32 |         impl Scalar for $scalar {
33 |             fn format() -> ArrayFormatTag {
34 |                 ArrayFormatTag::$format
35 |             }
36 |             paste::item! {
37 |                 fn [< to_le_ $le >](self) -> Option<$le> {
38 |                     assert_eq!(Self::size_of(), <$le>::size_of());
39 |                     Some(<$le>::from_le_bytes(self.to_le_bytes()))
40 |                 }
41 |             }
42 |         }
43 |     };
44 | }
45 | 
46 | impl_array_scalar!(u8, u8, CU_AD_FORMAT_UNSIGNED_INT8);
47 | impl_array_scalar!(u16, u16, CU_AD_FORMAT_UNSIGNED_INT16);
48 | impl_array_scalar!(u32, u32, CU_AD_FORMAT_UNSIGNED_INT32);
49 | impl_array_scalar!(i8, u8, CU_AD_FORMAT_SIGNED_INT8);
50 | impl_array_scalar!(i16, u16, CU_AD_FORMAT_SIGNED_INT16);
51 | impl_array_scalar!(i32, u32, CU_AD_FORMAT_SIGNED_INT32);
52 | // FIXME f16 is not supported yet
53 | // impl_array_scalar!(f16, u16, CU_AD_FORMAT_HALF);
54 | impl_array_scalar!(f32, u32, CU_AD_FORMAT_FLOAT);
55 | 


--------------------------------------------------------------------------------
/accel/src/memory/slice.rs:
--------------------------------------------------------------------------------
  1 | use super::*;
  2 | 
  3 | /// Typed wrapper of cuPointerGetAttribute
  4 | fn get_attr<T, Attr>(ptr: *const T, attr: CUpointer_attribute) -> error::Result<Attr> {
  5 |     let mut data = MaybeUninit::<Attr>::uninit();
  6 |     unsafe {
  7 |         ffi_call!(
  8 |             cuPointerGetAttribute,
  9 |             data.as_mut_ptr() as *mut c_void,
 10 |             attr,
 11 |             ptr as CUdeviceptr
 12 |         )?;
 13 |         Ok(data.assume_init())
 14 |     }
 15 | }
 16 | 
 17 | /// Determine actual memory type dynamically
 18 | ///
 19 | /// Because `Continuous` memories can be treated as a slice,
 20 | /// input slice may represents any type of memory.
 21 | fn memory_type<T>(ptr: *const T) -> MemoryType {
 22 |     match get_attr(ptr, CUpointer_attribute::CU_POINTER_ATTRIBUTE_MEMORY_TYPE) {
 23 |         Ok(CUmemorytype_enum::CU_MEMORYTYPE_HOST) => MemoryType::PageLocked,
 24 |         Ok(CUmemorytype_enum::CU_MEMORYTYPE_DEVICE) => MemoryType::Device,
 25 |         Ok(CUmemorytype_enum::CU_MEMORYTYPE_ARRAY) => MemoryType::Array,
 26 |         Ok(CUmemorytype_enum::CU_MEMORYTYPE_UNIFIED) => {
 27 |             unreachable!("CU_POINTER_ATTRIBUTE_MEMORY_TYPE never be UNIFED")
 28 |         }
 29 |         Err(_) => {
 30 |             // unmanaged by CUDA memory system, i.e. host memory
 31 |             MemoryType::Host
 32 |         }
 33 |     }
 34 | }
 35 | 
 36 | fn get_context<T>(ptr: *const T) -> Option<ContextRef> {
 37 |     let ptr =
 38 |         get_attr::<_, CUcontext>(ptr, CUpointer_attribute::CU_POINTER_ATTRIBUTE_CONTEXT).ok()?;
 39 |     Some(ContextRef::from_ptr(ptr))
 40 | }
 41 | 
 42 | impl<T: Scalar> Memory for [T] {
 43 |     type Elem = T;
 44 |     fn head_addr(&self) -> *const T {
 45 |         self.as_ptr()
 46 |     }
 47 | 
 48 |     fn head_addr_mut(&mut self) -> *mut T {
 49 |         self.as_mut_ptr()
 50 |     }
 51 | 
 52 |     fn num_elem(&self) -> usize {
 53 |         self.len()
 54 |     }
 55 | 
 56 |     fn memory_type(&self) -> MemoryType {
 57 |         memory_type(self.as_ptr())
 58 |     }
 59 | 
 60 |     fn set(&mut self, value: T) {
 61 |         for val in self {
 62 |             *val = value;
 63 |         }
 64 |     }
 65 | }
 66 | 
 67 | impl<T: Scalar> Memcpy<[T]> for [T] {
 68 |     fn copy_from(&mut self, src: &[T]) {
 69 |         assert_ne!(self.head_addr(), src.head_addr());
 70 |         assert_eq!(self.num_elem(), src.num_elem());
 71 |         if let Some(ctx) = get_context(self.head_addr()).or_else(|| get_context(src.head_addr())) {
 72 |             unsafe {
 73 |                 contexted_call!(
 74 |                     &ctx,
 75 |                     cuMemcpy,
 76 |                     self.head_addr_mut() as CUdeviceptr,
 77 |                     src.as_ptr() as CUdeviceptr,
 78 |                     self.num_elem() * T::size_of()
 79 |                 )
 80 |             }
 81 |             .unwrap()
 82 |         } else {
 83 |             self.copy_from_slice(src);
 84 |         }
 85 |     }
 86 | 
 87 |     fn copy_from_async<'a>(&'a mut self, src: &'a [T]) -> BoxFuture<'a, ()> {
 88 |         assert_ne!(self.head_addr(), src.head_addr());
 89 |         assert_eq!(self.num_elem(), src.num_elem());
 90 |         let ctx1 = get_context(self.head_addr());
 91 |         let ctx2 = get_context(src.head_addr());
 92 |         if let Some(ctx) = ctx1.or(ctx2) {
 93 |             let stream = stream::Stream::new(ctx);
 94 |             let byte_count = self.len() * std::mem::size_of::<T>();
 95 |             unsafe {
 96 |                 contexted_call!(
 97 |                     &ctx,
 98 |                     cuMemcpyAsync,
 99 |                     src.as_ptr() as CUdeviceptr,
100 |                     self.as_mut_ptr() as CUdeviceptr,
101 |                     byte_count,
102 |                     stream.stream
103 |                 )
104 |             }
105 |             .expect("Failed to start async memcpy");
106 |             Box::pin(async {
107 |                 stream
108 |                     .into_future()
109 |                     .await
110 |                     .expect("Async memcpy thread failed")
111 |             })
112 |         } else {
113 |             self.copy_from_slice(src);
114 |             Box::pin(async {})
115 |         }
116 |     }
117 | }
118 | 
119 | macro_rules! impl_memcpy_slice {
120 |     ($t:path) => {
121 |         impl<T: Scalar> Memcpy<[T]> for $t {
122 |             fn copy_from(&mut self, src: &[T]) {
123 |                 self.as_mut_slice().copy_from(src);
124 |             }
125 |             fn copy_from_async<'a>(&'a mut self, src: &'a [T]) -> BoxFuture<'a, ()> {
126 |                 self.as_mut_slice().copy_from_async(src)
127 |             }
128 |         }
129 | 
130 |         impl<T: Scalar> Memcpy<$t> for [T] {
131 |             fn copy_from(&mut self, src: &$t) {
132 |                 self.copy_from(src.as_slice());
133 |             }
134 |             fn copy_from_async<'a>(&'a mut self, src: &'a $t) -> BoxFuture<'a, ()> {
135 |                 self.copy_from_async(src.as_slice())
136 |             }
137 |         }
138 |     };
139 | }
140 | 
141 | impl_memcpy_slice!(DeviceMemory::<T>);
142 | impl_memcpy_slice!(PageLockedMemory::<T>);
143 | impl_memcpy_slice!(RegisteredMemory::<'_, T>);
144 | 
145 | macro_rules! impl_memcpy {
146 |     ($from:path, $to:path) => {
147 |         impl<T: Scalar> Memcpy<$from> for $to {
148 |             fn copy_from(&mut self, src: &$from) {
149 |                 self.as_mut_slice().copy_from(src.as_slice());
150 |             }
151 |             fn copy_from_async<'a>(&'a mut self, src: &'a $from) -> BoxFuture<'a, ()> {
152 |                 self.as_mut_slice().copy_from_async(src.as_slice())
153 |             }
154 |         }
155 |     };
156 | }
157 | 
158 | impl_memcpy!(DeviceMemory::<T>, DeviceMemory::<T>);
159 | impl_memcpy!(DeviceMemory::<T>, RegisteredMemory::<'_, T>);
160 | impl_memcpy!(DeviceMemory::<T>, PageLockedMemory::<T>);
161 | impl_memcpy!(PageLockedMemory::<T>, DeviceMemory::<T>);
162 | impl_memcpy!(PageLockedMemory::<T>, RegisteredMemory::<'_, T>);
163 | impl_memcpy!(PageLockedMemory::<T>, PageLockedMemory::<T>);
164 | impl_memcpy!(RegisteredMemory::<'_, T>, DeviceMemory::<T>);
165 | impl_memcpy!(RegisteredMemory::<'_, T>, RegisteredMemory::<'_, T>);
166 | impl_memcpy!(RegisteredMemory::<'_, T>, PageLockedMemory::<T>);
167 | 
168 | impl<T: Scalar> Continuous for [T] {
169 |     fn as_slice(&self) -> &[Self::Elem] {
170 |         self
171 |     }
172 | 
173 |     fn as_mut_slice(&mut self) -> &mut [Self::Elem] {
174 |         self
175 |     }
176 | }
177 | 
178 | #[cfg(test)]
179 | mod tests {
180 |     use super::*;
181 | 
182 |     #[test]
183 |     fn memory_type_host_vec() -> error::Result<()> {
184 |         let a = vec![0_u32; 12];
185 |         assert_eq!(a.as_slice().memory_type(), MemoryType::Host);
186 |         assert_eq!(a.as_slice().num_elem(), 12);
187 |         Ok(())
188 |     }
189 | 
190 |     #[test]
191 |     fn memory_type_host_vec_with_context() -> error::Result<()> {
192 |         let device = Device::nth(0)?;
193 |         let _ctx = device.create_context();
194 |         let a = vec![0_u32; 12];
195 |         assert_eq!(a.as_slice().memory_type(), MemoryType::Host);
196 |         assert_eq!(a.as_slice().num_elem(), 12);
197 |         Ok(())
198 |     }
199 | 
200 |     #[test]
201 |     fn restore_context() -> error::Result<()> {
202 |         let device = Device::nth(0)?;
203 |         let ctx = device.create_context();
204 |         let a = PageLockedMemory::<i32>::zeros(&ctx, 12);
205 |         let ctx_ptr = get_context(a.head_addr()).unwrap();
206 |         assert_eq!(*ctx, ctx_ptr);
207 |         Ok(())
208 |     }
209 | 
210 |     #[tokio::test]
211 |     async fn memcpy_async_host() {
212 |         let a = vec![1_u32; 12];
213 |         let mut b1 = vec![0_u32; 12];
214 |         let mut b2 = vec![0_u32; 12];
215 |         let mut b3 = vec![0_u32; 12];
216 |         let fut1 = b1.copy_from_async(a.as_slice());
217 |         let fut2 = b2.copy_from_async(a.as_slice());
218 |         let fut3 = b3.copy_from_async(a.as_slice());
219 |         fut3.await;
220 |         fut2.await;
221 |         fut1.await;
222 |         assert_eq!(a, b1);
223 |         assert_eq!(a, b2);
224 |         assert_eq!(a, b3);
225 |     }
226 | 
227 |     #[tokio::test]
228 |     async fn memcpy_async_d2h() {
229 |         let device = Device::nth(0).unwrap();
230 |         let ctx = device.create_context();
231 |         let a = DeviceMemory::from_elem(&ctx, 12, 1_u32);
232 |         let mut b1 = vec![0_u32; 12];
233 |         let mut b2 = vec![0_u32; 12];
234 |         let mut b3 = vec![0_u32; 12];
235 |         let fut1 = b1.copy_from_async(&a);
236 |         let fut2 = b2.copy_from_async(&a);
237 |         let fut3 = b3.copy_from_async(&a);
238 |         fut3.await;
239 |         fut2.await;
240 |         fut1.await;
241 |         assert_eq!(a.as_slice(), b1.as_slice());
242 |         assert_eq!(a.as_slice(), b2.as_slice());
243 |         assert_eq!(a.as_slice(), b3.as_slice());
244 |     }
245 | 
246 |     #[tokio::test]
247 |     async fn memcpy_async_h2d() {
248 |         let device = Device::nth(0).unwrap();
249 |         let ctx = device.create_context();
250 |         let a = PageLockedMemory::from_elem(&ctx, 12, 1_u32);
251 |         let mut b1 = DeviceMemory::from_elem(&ctx, 12, 0_u32);
252 |         let mut b2 = DeviceMemory::from_elem(&ctx, 12, 0_u32);
253 |         let mut b3 = DeviceMemory::from_elem(&ctx, 12, 0_u32);
254 |         let fut1 = b1.copy_from_async(&a);
255 |         let fut2 = b2.copy_from_async(&a);
256 |         let fut3 = b3.copy_from_async(&a);
257 |         fut3.await;
258 |         fut2.await;
259 |         fut1.await;
260 |         assert_eq!(a.as_slice(), b1.as_slice());
261 |         assert_eq!(a.as_slice(), b2.as_slice());
262 |         assert_eq!(a.as_slice(), b3.as_slice());
263 |     }
264 | 
265 |     #[tokio::test]
266 |     async fn memcpy_async_d2d() {
267 |         let device = Device::nth(0).unwrap();
268 |         let ctx = device.create_context();
269 |         let a = DeviceMemory::from_elem(&ctx, 12, 1_u32);
270 |         let mut b1 = DeviceMemory::from_elem(&ctx, 12, 0_u32);
271 |         let mut b2 = DeviceMemory::from_elem(&ctx, 12, 0_u32);
272 |         let mut b3 = DeviceMemory::from_elem(&ctx, 12, 0_u32);
273 |         let fut1 = b1.copy_from_async(&a);
274 |         let fut2 = b2.copy_from_async(&a);
275 |         let fut3 = b3.copy_from_async(&a);
276 |         fut3.await;
277 |         fut2.await;
278 |         fut1.await;
279 |         assert_eq!(a.as_slice(), b1.as_slice());
280 |         assert_eq!(a.as_slice(), b2.as_slice());
281 |         assert_eq!(a.as_slice(), b3.as_slice());
282 |     }
283 | }
284 | 


--------------------------------------------------------------------------------
/accel/src/module.rs:
--------------------------------------------------------------------------------
  1 | //! CUDA Module (i.e. loaded PTX or cubin)
  2 | 
  3 | use crate::{contexted_call, contexted_new, device::*, error::*, *};
  4 | use cuda::*;
  5 | use std::ffi::*;
  6 | 
  7 | /// CUDA Kernel function
  8 | #[derive(Debug)]
  9 | pub struct Kernel<'module> {
 10 |     pub(crate) func: CUfunction,
 11 |     module: &'module Module,
 12 | }
 13 | 
 14 | impl Contexted for Kernel<'_> {
 15 |     fn sync(&self) -> Result<()> {
 16 |         self.module.context.sync()
 17 |     }
 18 | 
 19 |     fn version(&self) -> Result<u32> {
 20 |         self.module.context.version()
 21 |     }
 22 | 
 23 |     fn guard(&self) -> Result<ContextGuard> {
 24 |         self.module.context.guard()
 25 |     }
 26 | 
 27 |     fn get_ref(&self) -> ContextRef {
 28 |         self.module.get_ref()
 29 |     }
 30 | }
 31 | 
 32 | /// OOP-like wrapper of `cuModule*` APIs
 33 | #[derive(Debug, Contexted)]
 34 | pub struct Module {
 35 |     module: CUmodule,
 36 |     context: Context,
 37 | }
 38 | 
 39 | impl Drop for Module {
 40 |     fn drop(&mut self) {
 41 |         if let Err(e) = unsafe { contexted_call!(&self.context, cuModuleUnload, self.module) } {
 42 |             log::error!("Failed to unload module: {:?}", e);
 43 |         }
 44 |     }
 45 | }
 46 | 
 47 | impl Module {
 48 |     /// integrated loader of Instruction
 49 |     pub fn load(context: &Context, data: &Instruction) -> Result<Self> {
 50 |         match *data {
 51 |             Instruction::PTX(ref ptx) => {
 52 |                 let module =
 53 |                     unsafe { contexted_new!(context, cuModuleLoadData, ptx.as_ptr() as *const _)? };
 54 |                 Ok(Module {
 55 |                     module,
 56 |                     context: context.clone(),
 57 |                 })
 58 |             }
 59 |             Instruction::Cubin(ref bin) => {
 60 |                 let module =
 61 |                     unsafe { contexted_new!(context, cuModuleLoadData, bin.as_ptr() as *const _)? };
 62 |                 Ok(Module {
 63 |                     module,
 64 |                     context: context.clone(),
 65 |                 })
 66 |             }
 67 |             Instruction::PTXFile(ref path) | Instruction::CubinFile(ref path) => {
 68 |                 let filename = CString::new(path.to_str().unwrap()).expect("Invalid Path");
 69 |                 let module = unsafe { contexted_new!(context, cuModuleLoad, filename.as_ptr())? };
 70 |                 Ok(Module {
 71 |                     module,
 72 |                     context: context.clone(),
 73 |                 })
 74 |             }
 75 |         }
 76 |     }
 77 | 
 78 |     pub fn from_str(context: &Context, ptx: &str) -> Result<Self> {
 79 |         let data = Instruction::ptx(ptx);
 80 |         Self::load(context, &data)
 81 |     }
 82 | 
 83 |     /// Wrapper of `cuModuleGetFunction`
 84 |     pub fn get_kernel(&self, name: &str) -> Result<Kernel> {
 85 |         let name = CString::new(name).expect("Invalid Kernel name");
 86 |         let func =
 87 |             unsafe { contexted_new!(self, cuModuleGetFunction, self.module, name.as_ptr()) }?;
 88 |         Ok(Kernel { func, module: self })
 89 |     }
 90 | }
 91 | 
 92 | #[cfg(test)]
 93 | mod tests {
 94 |     use super::*;
 95 | 
 96 |     #[test]
 97 |     fn load_do_nothing() -> Result<()> {
 98 |         // generated by do_nothing example in accel-derive
 99 |         let ptx = r#"
100 |         .version 3.2
101 |         .target sm_30
102 |         .address_size 64
103 |         .visible .entry do_nothing()
104 |         {
105 |           ret;
106 |         }
107 |         "#;
108 |         let device = Device::nth(0)?;
109 |         let ctx = device.create_context();
110 |         let _mod = Module::from_str(&ctx, ptx)?;
111 |         Ok(())
112 |     }
113 | }
114 | 


--------------------------------------------------------------------------------
/accel/src/profiler.rs:
--------------------------------------------------------------------------------
 1 | //! Profiling GPU kernels and host CUDA API calls
 2 | 
 3 | use crate::*;
 4 | use cuda::*;
 5 | 
 6 | /// RAII handler for nvprof profiling
 7 | ///
 8 | /// - Profiling starts by `Profiler::start`, and stops by `Drop` of `Profiler`.
 9 | /// - Unified memory profiling is not supported. You must add an option `--unified-memory-profiling off` to `nvprof` command.
10 | ///   ```shell
11 | ///   $ nvprof --unified-memory-profiling off ./target/release/examples/add
12 | ///   ```
13 | /// - You will find more options at [nvprof user's guide](https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview)
14 | pub struct Profiler {
15 |     ctx: Context,
16 | }
17 | 
18 | impl Drop for Profiler {
19 |     fn drop(&mut self) {
20 |         if let Err(e) = unsafe { contexted_call!(&self.ctx, cuProfilerStop) } {
21 |             log::error!("Failed to stop profiling: {:?}", e);
22 |         }
23 |     }
24 | }
25 | 
26 | impl Profiler {
27 |     pub fn start(ctx: &Context) -> Self {
28 |         unsafe { contexted_call!(ctx, cuProfilerStart) }.expect("Profiler has already started");
29 |         Self { ctx: ctx.clone() }
30 |     }
31 | }
32 | 


--------------------------------------------------------------------------------
/accel/src/stream.rs:
--------------------------------------------------------------------------------
  1 | use crate::{contexted_call, contexted_new, device::*, error::*};
  2 | use cuda::*;
  3 | use std::future::Future;
  4 | 
  5 | /// Handler for non-blocking CUDA Stream
  6 | #[derive(Debug, Contexted)]
  7 | pub struct Stream {
  8 |     pub(crate) stream: CUstream,
  9 |     context: ContextRef,
 10 | }
 11 | 
 12 | unsafe impl Sync for Stream {}
 13 | unsafe impl Send for Stream {}
 14 | 
 15 | impl Drop for Stream {
 16 |     fn drop(&mut self) {
 17 |         if let Err(e) = unsafe { contexted_call!(self, cuStreamDestroy_v2, self.stream) } {
 18 |             log::error!("Failed to delete CUDA stream: {:?}", e);
 19 |         }
 20 |     }
 21 | }
 22 | 
 23 | impl Stream {
 24 |     /// Create a new non-blocking CUDA stream on the current context
 25 |     pub fn new(context: ContextRef) -> Self {
 26 |         let stream = unsafe {
 27 |             contexted_new!(
 28 |                 &context,
 29 |                 cuStreamCreate,
 30 |                 CUstream_flags::CU_STREAM_NON_BLOCKING as u32
 31 |             )
 32 |         }
 33 |         .expect("Failed to create CUDA stream");
 34 |         Stream { context, stream }
 35 |     }
 36 | 
 37 |     /// Check all tasks in this stream have been completed
 38 |     pub fn query(&self) -> bool {
 39 |         match unsafe { contexted_call!(self, cuStreamQuery, self.stream) } {
 40 |             Ok(_) => true,
 41 |             Err(AccelError::AsyncOperationNotReady) => false,
 42 |             Err(e) => panic!("Unknown error is happened while cuStreamQuery: {:?}", e),
 43 |         }
 44 |     }
 45 | 
 46 |     /// Wait until all tasks in this stream have been completed
 47 |     pub fn sync(&self) -> Result<()> {
 48 |         unsafe { contexted_call!(self, cuStreamSynchronize, self.stream) }?;
 49 |         Ok(())
 50 |     }
 51 | 
 52 |     /// Consume and convert into a Future
 53 |     pub fn into_future(self) -> impl Future<Output = Result<()>> + Send {
 54 |         async { tokio::task::spawn_blocking(move || self.sync()).await? }
 55 |     }
 56 | 
 57 |     /// Wait event to sync another stream
 58 |     pub fn wait_event(&mut self, event: &Event) {
 59 |         unsafe { contexted_call!(self, cuStreamWaitEvent, self.stream, event.event, 0) }
 60 |             .expect("Failed to register an CUDA event waiting on CUDA stream");
 61 |     }
 62 | }
 63 | 
 64 | #[derive(Contexted)]
 65 | pub struct Event {
 66 |     event: CUevent,
 67 |     context: ContextRef,
 68 | }
 69 | 
 70 | unsafe impl Sync for Event {}
 71 | unsafe impl Send for Event {}
 72 | 
 73 | impl Drop for Event {
 74 |     fn drop(&mut self) {
 75 |         if let Err(e) = unsafe { contexted_call!(self, cuEventDestroy_v2, self.event) } {
 76 |             log::error!("Failed to delete CUDA event: {:?}", e);
 77 |         }
 78 |     }
 79 | }
 80 | 
 81 | impl Event {
 82 |     pub fn new(context: ContextRef) -> Self {
 83 |         let event = unsafe {
 84 |             contexted_new!(
 85 |                 &context,
 86 |                 cuEventCreate,
 87 |                 CUevent_flags_enum::CU_EVENT_BLOCKING_SYNC as u32
 88 |             )
 89 |         }
 90 |         .expect("Failed to create CUDA event");
 91 |         Event { context, event }
 92 |     }
 93 | 
 94 |     pub fn record(&mut self, stream: &mut Stream) {
 95 |         unsafe { contexted_call!(self, cuEventRecord, self.event, stream.stream) }
 96 |             .expect("Failed to set event record");
 97 |     }
 98 | 
 99 |     /// Query if the event has occur, returns true if already occurs
100 |     pub fn query(&self) -> bool {
101 |         match unsafe { contexted_call!(self, cuEventQuery, self.event) } {
102 |             Ok(_) => true,
103 |             Err(AccelError::AsyncOperationNotReady) => false,
104 |             Err(e) => panic!("Unknown error occurs while cuEventQuery: {:?}", e),
105 |         }
106 |     }
107 | 
108 |     /// Wait until the event occurs with blocking
109 |     pub fn sync(&self) -> Result<()> {
110 |         unsafe { contexted_call!(self, cuEventSynchronize, self.event) }?;
111 |         Ok(())
112 |     }
113 | }
114 | 
115 | #[cfg(test)]
116 | mod tests {
117 |     use super::*;
118 | 
119 |     #[test]
120 |     fn new() -> Result<()> {
121 |         let device = Device::nth(0)?;
122 |         let context = device.create_context();
123 |         let _st = Stream::new(context.get_ref());
124 |         Ok(())
125 |     }
126 | 
127 |     #[test]
128 |     fn trivial_sync() -> Result<()> {
129 |         let device = Device::nth(0)?;
130 |         let context = device.create_context();
131 |         let mut stream = Stream::new(context.get_ref());
132 |         let mut event = Event::new(context.get_ref());
133 |         event.record(&mut stream);
134 |         // nothing to be waited
135 |         event.sync()?;
136 |         stream.sync()?;
137 |         Ok(())
138 |     }
139 | }
140 | 


--------------------------------------------------------------------------------
/accel/tests/argref.rs:
--------------------------------------------------------------------------------
 1 | use accel::*;
 2 | 
 3 | #[kernel]
 4 | fn f(a: &i32, b: &mut i32) {
 5 |     if accel_core::index() == 0 {
 6 |         *b = *a;
 7 |     }
 8 | }
 9 | 
10 | #[test]
11 | fn mut_ref_dev() -> error::Result<()> {
12 |     let device = Device::nth(0)?;
13 |     let ctx = device.create_context();
14 |     let mut a = DeviceMemory::<i32>::zeros(&ctx, 1);
15 |     let mut b = DeviceMemory::<i32>::zeros(&ctx, 1);
16 |     a[0] = 1;
17 |     f(&ctx, 1, 1, (&a[0], &mut b[0]))?;
18 |     assert_eq!(a, b);
19 |     Ok(())
20 | }
21 | 
22 | #[test]
23 | fn mut_ref_host() -> error::Result<()> {
24 |     let device = Device::nth(0)?;
25 |     let ctx = device.create_context();
26 |     let mut a = PageLockedMemory::<i32>::zeros(&ctx, 1);
27 |     let mut b = PageLockedMemory::<i32>::zeros(&ctx, 1);
28 |     a[0] = 1;
29 |     f(&ctx, 1, 1, (&a[0], &mut b[0]))?;
30 |     assert_eq!(a, b);
31 |     Ok(())
32 | }
33 | 


--------------------------------------------------------------------------------
/accel/tests/data/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: all clean
 2 | 
 3 | all: add.ptx add.cubin sub.ptx sub.cubin
 4 | 
 5 | clean:
 6 | 	rm *.ptx *.cubin
 7 | 
 8 | %.ptx: %.cu
 9 | 	nvcc -ptx $<
10 | 
11 | %.cubin: %.cu
12 | 	nvcc -cubin $<
13 | 


--------------------------------------------------------------------------------
/accel/tests/data/add.cu:
--------------------------------------------------------------------------------
1 | __global__ void add(const int a[], const int b[], int c[]) {
2 |     int i = blockDim.x * blockIdx.x + threadIdx.x;
3 |     c[i] = a[i] + b[i];
4 | }
5 | 


--------------------------------------------------------------------------------
/accel/tests/data/add.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/termoshtt/accel/2ee324f6f9cd35832ef3d74a3c0f9191958d0289/accel/tests/data/add.cubin


--------------------------------------------------------------------------------
/accel/tests/data/add.ptx:
--------------------------------------------------------------------------------
 1 | //
 2 | // Generated by NVIDIA NVVM Compiler
 3 | //
 4 | // Compiler Build ID: CL-27506705
 5 | // Cuda compilation tools, release 10.2, V10.2.89
 6 | // Based on LLVM 3.4svn
 7 | //
 8 | 
 9 | .version 6.5
10 | .target sm_30
11 | .address_size 64
12 | 
13 | 	// .globl	_Z3addPKiS0_Pi
14 | 
15 | .visible .entry _Z3addPKiS0_Pi(
16 | 	.param .u64 _Z3addPKiS0_Pi_param_0,
17 | 	.param .u64 _Z3addPKiS0_Pi_param_1,
18 | 	.param .u64 _Z3addPKiS0_Pi_param_2
19 | )
20 | {
21 | 	.reg .b32 	%r<8>;
22 | 	.reg .b64 	%rd<11>;
23 | 
24 | 
25 | 	ld.param.u64 	%rd1, [_Z3addPKiS0_Pi_param_0];
26 | 	ld.param.u64 	%rd2, [_Z3addPKiS0_Pi_param_1];
27 | 	ld.param.u64 	%rd3, [_Z3addPKiS0_Pi_param_2];
28 | 	cvta.to.global.u64 	%rd4, %rd3;
29 | 	cvta.to.global.u64 	%rd5, %rd2;
30 | 	cvta.to.global.u64 	%rd6, %rd1;
31 | 	mov.u32 	%r1, %ntid.x;
32 | 	mov.u32 	%r2, %ctaid.x;
33 | 	mov.u32 	%r3, %tid.x;
34 | 	mad.lo.s32 	%r4, %r2, %r1, %r3;
35 | 	mul.wide.s32 	%rd7, %r4, 4;
36 | 	add.s64 	%rd8, %rd6, %rd7;
37 | 	ld.global.u32 	%r5, [%rd8];
38 | 	add.s64 	%rd9, %rd5, %rd7;
39 | 	ld.global.u32 	%r6, [%rd9];
40 | 	add.s32 	%r7, %r6, %r5;
41 | 	add.s64 	%rd10, %rd4, %rd7;
42 | 	st.global.u32 	[%rd10], %r7;
43 | 	ret;
44 | }
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/accel/tests/data/sub.cu:
--------------------------------------------------------------------------------
1 | __global__ void sub(const int a[], const int b[], int c[]) {
2 |     int i = blockDim.x * blockIdx.x + threadIdx.x;
3 |     c[i] = a[i] - b[i];
4 | }
5 | 
6 | 


--------------------------------------------------------------------------------
/accel/tests/data/sub.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/termoshtt/accel/2ee324f6f9cd35832ef3d74a3c0f9191958d0289/accel/tests/data/sub.cubin


--------------------------------------------------------------------------------
/accel/tests/data/sub.ptx:
--------------------------------------------------------------------------------
 1 | //
 2 | // Generated by NVIDIA NVVM Compiler
 3 | //
 4 | // Compiler Build ID: CL-27506705
 5 | // Cuda compilation tools, release 10.2, V10.2.89
 6 | // Based on LLVM 3.4svn
 7 | //
 8 | 
 9 | .version 6.5
10 | .target sm_30
11 | .address_size 64
12 | 
13 | 	// .globl	_Z3subPKiS0_Pi
14 | 
15 | .visible .entry _Z3subPKiS0_Pi(
16 | 	.param .u64 _Z3subPKiS0_Pi_param_0,
17 | 	.param .u64 _Z3subPKiS0_Pi_param_1,
18 | 	.param .u64 _Z3subPKiS0_Pi_param_2
19 | )
20 | {
21 | 	.reg .b32 	%r<8>;
22 | 	.reg .b64 	%rd<11>;
23 | 
24 | 
25 | 	ld.param.u64 	%rd1, [_Z3subPKiS0_Pi_param_0];
26 | 	ld.param.u64 	%rd2, [_Z3subPKiS0_Pi_param_1];
27 | 	ld.param.u64 	%rd3, [_Z3subPKiS0_Pi_param_2];
28 | 	cvta.to.global.u64 	%rd4, %rd3;
29 | 	cvta.to.global.u64 	%rd5, %rd2;
30 | 	cvta.to.global.u64 	%rd6, %rd1;
31 | 	mov.u32 	%r1, %ntid.x;
32 | 	mov.u32 	%r2, %ctaid.x;
33 | 	mov.u32 	%r3, %tid.x;
34 | 	mad.lo.s32 	%r4, %r2, %r1, %r3;
35 | 	mul.wide.s32 	%rd7, %r4, 4;
36 | 	add.s64 	%rd8, %rd6, %rd7;
37 | 	ld.global.u32 	%r5, [%rd8];
38 | 	add.s64 	%rd9, %rd5, %rd7;
39 | 	ld.global.u32 	%r6, [%rd9];
40 | 	sub.s32 	%r7, %r5, %r6;
41 | 	add.s64 	%rd10, %rd4, %rd7;
42 | 	st.global.u32 	[%rd10], %r7;
43 | 	ret;
44 | }
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/accel/tests/launch_async.rs:
--------------------------------------------------------------------------------
1 | #[test]
2 | fn launch_async_build_test() {
3 |     let t = trybuild::TestCases::new();
4 |     t.compile_fail("tests/launch_async/mut_ref_fail.rs");
5 |     t.pass("tests/launch_async/mut_ref_success.rs");
6 | }
7 | 


--------------------------------------------------------------------------------
/accel/tests/launch_async/mut_ref_fail.rs:
--------------------------------------------------------------------------------
 1 | use accel::*;
 2 | 
 3 | #[kernel]
 4 | unsafe fn add(a: *const u32, b: *const u32, c: *mut u32, n: usize) {
 5 |     let i = accel_core::index();
 6 |     if (i as usize) < n {
 7 |         *c.offset(i) = *a.offset(i) + *b.offset(i);
 8 |     }
 9 | }
10 | 
11 | #[tokio::main]
12 | async fn main() -> error::Result<()> {
13 |     let device = Device::nth(0)?;
14 |     let ctx = device.create_context();
15 |     let n = 16;
16 |     let mut a = DeviceMemory::<u32>::zeros(&ctx, n);
17 |     let mut b = DeviceMemory::<u32>::zeros(&ctx, n);
18 |     let mut c = DeviceMemory::<u32>::zeros(&ctx, n);
19 | 
20 |     for i in 0..n {
21 |         a[i] = i as u32;
22 |         b[i] = 2 * i as u32;
23 |     }
24 | 
25 |     let md = add::Module::new(&ctx)?;
26 |     let future = md.launch_async(1, n, (&a, &b, &mut c, n));
27 | 
28 |     println!("{:?}", c); // cannot be borrow
29 |     future.await?;
30 | 
31 |     Ok(())
32 | }
33 | 


--------------------------------------------------------------------------------
/accel/tests/launch_async/mut_ref_fail.stderr:
--------------------------------------------------------------------------------
 1 | error[E0502]: cannot borrow `c` as immutable because it is also borrowed as mutable
 2 |   --> $DIR/mut_ref_fail.rs:28:22
 3 |    |
 4 | 26 |     let future = md.launch_async(1, n, (&a, &b, &mut c, n));
 5 |    |                                                 ------ mutable borrow occurs here
 6 | 27 |
 7 | 28 |     println!("{:?}", c); // cannot be borrow
 8 |    |                      ^ immutable borrow occurs here
 9 | 29 |     future.await?;
10 |    |     ------ mutable borrow later used here
11 | 


--------------------------------------------------------------------------------
/accel/tests/launch_async/mut_ref_success.rs:
--------------------------------------------------------------------------------
 1 | use accel::*;
 2 | 
 3 | #[kernel]
 4 | unsafe fn add(a: *const u32, b: *const u32, c: *mut u32, n: usize) {
 5 |     let i = accel_core::index();
 6 |     if (i as usize) < n {
 7 |         *c.offset(i) = *a.offset(i) + *b.offset(i);
 8 |     }
 9 | }
10 | 
11 | #[tokio::main]
12 | async fn main() -> error::Result<()> {
13 |     let device = Device::nth(0)?;
14 |     let ctx = device.create_context();
15 |     let n = 16;
16 |     let mut a = DeviceMemory::<u32>::zeros(&ctx, n);
17 |     let mut b = DeviceMemory::<u32>::zeros(&ctx, n);
18 |     let mut c = DeviceMemory::<u32>::zeros(&ctx, n);
19 | 
20 |     for i in 0..n {
21 |         a[i] = i as u32;
22 |         b[i] = 2 * i as u32;
23 |     }
24 | 
25 |     let md = add::Module::new(&ctx)?;
26 |     let future = md.launch_async(1, n, (&a, &b, &mut c, n));
27 |     future.await?;
28 |     for i in 0..n {
29 |         assert_eq!(c[i], 3 * i as u32); // can be borrow
30 |     }
31 | 
32 |     Ok(())
33 | }
34 | 


--------------------------------------------------------------------------------
/accel/tests/read_host_memory.rs:
--------------------------------------------------------------------------------
 1 | use accel::*;
 2 | 
 3 | #[kernel]
 4 | pub unsafe fn read_host_memory(a: *const i32) {
 5 |     let i = accel_core::index() as isize;
 6 |     accel_core::println!("a[{}] = {}", i, unsafe { *(a.offset(i)) });
 7 | }
 8 | 
 9 | #[test]
10 | fn page_locked() -> error::Result<()> {
11 |     let device = Device::nth(0)?;
12 |     let ctx = device.create_context();
13 | 
14 |     let mut a = PageLockedMemory::zeros(&ctx, 4);
15 |     a[0] = 0;
16 |     a[1] = 1;
17 |     a[2] = 2;
18 |     a[3] = 3;
19 |     read_host_memory(&ctx, 1, 4, (a.as_ptr(),))?;
20 |     Ok(())
21 | }
22 | 
23 | #[test]
24 | fn registered() -> error::Result<()> {
25 |     let device = Device::nth(0)?;
26 |     let ctx = device.create_context();
27 | 
28 |     let mut a = vec![0; 4];
29 |     let mut mem = RegisteredMemory::new(&ctx, &mut a);
30 |     mem[0] = 0;
31 |     mem[1] = 1;
32 |     mem[2] = 2;
33 |     mem[3] = 3;
34 |     read_host_memory(&ctx, 1, 4, (mem.as_ptr(),))?;
35 |     Ok(())
36 | }
37 | 


--------------------------------------------------------------------------------
/accel/tests/slice.rs:
--------------------------------------------------------------------------------
 1 | use accel::*;
 2 | 
 3 | #[kernel]
 4 | unsafe fn set1(a: *mut i32, n: usize) {
 5 |     let i = accel_core::index();
 6 |     if i < n as isize {
 7 |         *a.offset(i) = 1;
 8 |     }
 9 | }
10 | 
11 | #[test]
12 | fn slice_to_pointer_host() -> error::Result<()> {
13 |     let device = Device::nth(0)?;
14 |     let ctx = device.create_context();
15 |     let n = 12;
16 |     let mut a = PageLockedMemory::<i32>::zeros(&ctx, n);
17 |     set1(&ctx, 1, n, (&mut a, n))?;
18 |     assert_eq!(a.as_slice(), vec![1_i32; n].as_slice());
19 |     Ok(())
20 | }
21 | 
22 | #[test]
23 | fn slice_to_pointer_dev() -> error::Result<()> {
24 |     let device = Device::nth(0)?;
25 |     let ctx = device.create_context();
26 |     let n = 12;
27 |     let mut a = DeviceMemory::<i32>::zeros(&ctx, n);
28 |     set1(&ctx, 1, n, (&mut a, n))?;
29 |     assert_eq!(a.as_slice(), vec![1_i32; n].as_slice());
30 |     Ok(())
31 | }
32 | 
33 | #[test]
34 | fn slice_to_pointer_registered() -> error::Result<()> {
35 |     let device = Device::nth(0)?;
36 |     let ctx = device.create_context();
37 |     let n = 12;
38 |     let mut v = vec![0_i32; n];
39 |     let mut a = RegisteredMemory::new(&ctx, &mut v);
40 |     set1(&ctx, 1, n, (&mut a, n))?;
41 |     assert_eq!(a.as_slice(), vec![1_i32; n].as_slice());
42 |     Ok(())
43 | }
44 | 


--------------------------------------------------------------------------------
/diagrams/.gitignore:
--------------------------------------------------------------------------------
 1 | *.aux
 2 | *.log
 3 | *.pdf
 4 | *.bbl
 5 | *.blg
 6 | *.nav
 7 | *.out
 8 | *.snm
 9 | *.toc
10 | *.dvi
11 | *.bb
12 | *.xbb
13 | *.fls
14 | *.fdb_latexmk
15 | *.synctex.gz
16 | 


--------------------------------------------------------------------------------
/diagrams/compile_flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/termoshtt/accel/2ee324f6f9cd35832ef3d74a3c0f9191958d0289/diagrams/compile_flow.png


--------------------------------------------------------------------------------
/diagrams/compile_flow.tex:
--------------------------------------------------------------------------------
 1 | \documentclass{standalone}
 2 | 
 3 | \usepackage{listings}
 4 | \usepackage{tikz}
 5 | \usetikzlibrary{positioning,fit,calc}
 6 | 
 7 | \begin{document}
 8 | \begin{tikzpicture}[
 9 |   node distance=7mm,
10 |   title/.style={font=\fontsize{6}{6}\color{black!50}\ttfamily},
11 |   typetag/.style={rectangle, draw=black!50, font=\scriptsize\ttfamily, anchor=west},
12 |   arrow-annotate/.style={midway, font=\fontsize{6}{6}\color{blue!70}\ttfamily}
13 | ]
14 |   \node (main) [title] {main.rs};
15 |   \node (proc-macro) [below=of main.west, typetag, xshift=2mm, text width=6cm] {
16 |     \#[accel::kernel]
17 |     \parbox{8cm}{fn add(a: *const f32, b: *const f32, c: *mut f32)}
18 |   };
19 |   \node (main-ptx) [below=of proc-macro.west, xshift=35mm, yshift=-1cm, typetag, text width=42mm] {
20 |       mod add \{
21 |         \parbox{5cm}{const PTX\_STR = "\{Generated PTX\}";}
22 |       \}
23 |   };
24 |   \node (main-caller) [below=of proc-macro.west, yshift=-25mm, typetag, text width=6cm] {
25 |       fn add(context, grid, block, \&(a, b, c)) \{ \}
26 |   };
27 |   \draw[->] (proc-macro) -- (main-ptx) node[arrow-annotate, right] {Helper sub-module};
28 |   \draw[->] (proc-macro) -- (main-caller) node[arrow-annotate, left] {GPU Kernel caller};
29 |   \node[draw=black!50, fit={(main) (proc-macro) (main-ptx) (main-caller)}] {};
30 | 
31 |   \node (ptx-builder-title) at (9cm, 0) [title, right, text width=25mm] {accel-derive/add crate};
32 |   \node (lib)  [below=of ptx-builder-title.west, typetag, xshift=2mm] {lib.rs};
33 |   \node (toml) [below=of lib.west, typetag] {Cargo.toml};
34 |   \node (ptx-builder) [draw=black!50, fit={(ptx-builder-title) (lib) (toml)}] {};
35 | 
36 |   \draw[->] (proc-macro) -- (ptx-builder) node[arrow-annotate, above] {Create on \$HOME/.cache};
37 | 
38 |   \node (ptx-title) at (9cm, -28mm) [title, right, text width=25mm] {Generated PTX};
39 |   \node (ptx-add)  [below=of ptx-title.west, typetag, xshift=2mm, yshift=-3mm, text width=2cm] {
40 |     .entry add(
41 |     \parbox{17mm}{.param .u64 a},
42 |     \parbox{17mm}{.param .u64 b},
43 |     \parbox{17mm}{.param .u64 c}
44 |     )
45 |   };
46 |   \node (ptx) [draw=black!50, fit={(ptx-title) (ptx-add)}] {};
47 | 
48 |   \draw[->] (ptx-builder) -- (ptx) node[arrow-annotate, right] {nvptx64-nvidia-cuda target};
49 |   \draw[->] (ptx) -- (main-ptx) node[arrow-annotate, below left]{embedded as String};
50 | 
51 | \end{tikzpicture}
52 | \end{document}
53 | 


--------------------------------------------------------------------------------
/docker/.gitignore:
--------------------------------------------------------------------------------
1 | ubuntu*-cuda*-nightly*.Dockerfile
2 | centos*-cuda*-nightly*.Dockerfile


--------------------------------------------------------------------------------
/docker/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_VERSIONS    := 10.0 10.1 10.2
 2 | NIGHTLY_VERSIONS := 2020-01-02 2020-05-01
 3 | 
 4 | CI_REGISTRY_IMAGE  ?= registry.gitlab.com/termoshtt/accel
 5 | CI_COMMIT_REF_SLUG ?= manual
 6 | 
 7 | define ubuntu
 8 | ubuntu$(1)-cuda$(2)-nightly$(3):
 9 | 	sed -e "s/UBUNTU_VERSION/$(1)/"  \
10 | 	    -e "s/CUDA_VERSION/$(2)/"    \
11 | 	    -e "s/NIGHTLY_VERSION/$(3)/" \
12 | 	    < ubuntu.Dockerfile          \
13 | 	    > $$@.Dockerfile
14 | 	docker build -f $$@.Dockerfile -t $(CI_REGISTRY_IMAGE)/$$@:$(CI_COMMIT_REF_SLUG) .
15 | 	docker push $(CI_REGISTRY_IMAGE)/$$@:$(CI_COMMIT_REF_SLUG)
16 | endef
17 | 
18 | define centos
19 | centos$(1)-cuda$(2)-nightly$(3):
20 | 	sed -e "s/CENTOS_VERSION/$(1)/"  \
21 | 	    -e "s/CUDA_VERSION/$(2)/"    \
22 | 	    -e "s/NIGHTLY_VERSION/$(3)/" \
23 | 	    < centos.Dockerfile          \
24 | 	    > $$@.Dockerfile
25 | 	docker build -f $$@.Dockerfile -t $(CI_REGISTRY_IMAGE)/$$@:$(CI_COMMIT_REF_SLUG) .
26 | 	docker push $(CI_REGISTRY_IMAGE)/$$@:$(CI_COMMIT_REF_SLUG)
27 | endef
28 | 
29 | .PHONY: clean
30 | 
31 | all: $(foreach NIGHTLY_VERSION,$(NIGHTLY_VERSIONS), \
32 |        $(foreach CUDA_VERSION,$(CUDA_VERSIONS),\
33 |          ubuntu18.04-cuda$(CUDA_VERSION)-nightly$(NIGHTLY_VERSION) \
34 |          centos6-cuda$(CUDA_VERSION)-nightly$(NIGHTLY_VERSION) \
35 |          centos7-cuda$(CUDA_VERSION)-nightly$(NIGHTLY_VERSION) \
36 |        ) \
37 |      )
38 | 
39 | $(foreach NIGHTLY_VERSION,$(NIGHTLY_VERSIONS), \
40 |   $(foreach CUDA_VERSION,$(CUDA_VERSIONS),     \
41 |     $(eval $(call ubuntu,18.04,$(CUDA_VERSION),$(NIGHTLY_VERSION))) \
42 |     $(eval $(call centos,6,$(CUDA_VERSION),$(NIGHTLY_VERSION))) \
43 |     $(eval $(call centos,7,$(CUDA_VERSION),$(NIGHTLY_VERSION))) \
44 |   ) \
45 | )
46 | 
47 | clean:
48 | 	rm -rf *-cuda*-nightly*.Dockerfile


--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
 1 | rust-cuda containers
 2 | =====================
 3 | 
 4 | Docker container including
 5 | 
 6 | - CUDA based on [nvidia/cuda](https://hub.docker.com/r/nvidia/cuda/) containers
 7 | - NVPTX target for Rust
 8 | 
 9 | ```
10 | docker run -it --rm registry.gitlab.com/termoshtt/accel/ubuntu18.04-cuda10.2:master
11 | ```
12 | 
13 | See also https://gitlab.com/termoshtt/accel/container_registry
14 | 
15 | Supported Platforms
16 | ------------------
17 | 
18 | |CUDA | Ubuntu 18.04 | Ubuntu 16.04 | RedHat UBI8 | RedHat UBI7 | CentOS 7 | CentOS 6 |
19 | |:---:|:------------:|:------------:|:-----------:|:-----------:|:--------:|:--------:|
20 | |10.2 | ✔️            | ✔️            |             |             | ✔️        | ✔️        |
21 | |10.1 | ✔️            | ✔️            |             |             | ✔️        | ✔️        |
22 | |10.0 | ✔️            | ✔️            | -           | -           | ✔️        | ✔️        |
23 | |9.2  | ✔️            | ✔️            | -           | -           | ✔️        | ✔️        |
24 | |9.1  | -            | ✔️            | -           | -           | ✔️        | ✔️        |
25 | |9.0  | -            | ✔️            | -           | -           | ✔️        | ✔️        |
26 | |8.0  | -            | ✔️            | -           | -           | ✔️        | ✔️        |
27 | 
28 | - https://gitlab.com/nvidia/container-images/cuda/blob/master/doc/supported-tags.md
29 | - https://gitlab.com/nvidia/container-images/cuda/blob/master/doc/unsupported-tags.md
30 | 


--------------------------------------------------------------------------------
/docker/centos.Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:CUDA_VERSION-base-centosCENTOS_VERSION
 2 | 
 3 | COPY cuda.conf /etc/ld.so.conf.d
 4 | RUN ldconfig
 5 | ENV LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs
 6 | 
 7 | RUN yum install -y gcc && yum clean all
 8 | 
 9 | RUN curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain 1.42.0
10 | ENV PATH /root/.cargo/bin:$PATH
11 | 
12 | RUN cargo install ptx-linker
13 | RUN rustup toolchain add nightly-NIGHTLY_VERSION
14 | RUN rustup target add nvptx64-nvidia-cuda --toolchain nightly-NIGHTLY_VERSION
15 | 
16 | RUN rustup component add rustfmt clippy
17 | 


--------------------------------------------------------------------------------
/docker/cuda.conf:
--------------------------------------------------------------------------------
1 | /usr/local/cuda/lib64
2 | /usr/local/cuda/lib64/stubs
3 | 


--------------------------------------------------------------------------------
/docker/ubuntu.Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:CUDA_VERSION-base-ubuntuUBUNTU_VERSION
 2 | 
 3 | COPY cuda.conf /etc/ld.so.conf.d
 4 | RUN ldconfig
 5 | ENV LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs
 6 | 
 7 | RUN apt-get update \
 8 |  && apt-get install -y curl gcc \
 9 |  && apt-get clean \
10 |  && rm -rf /var/lib/apt/lists/*
11 | 
12 | RUN curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain 1.42.0
13 | ENV PATH /root/.cargo/bin:$PATH
14 | 
15 | RUN cargo install ptx-linker
16 | RUN rustup toolchain add nightly-NIGHTLY_VERSION
17 | RUN rustup target add nvptx64-nvidia-cuda --toolchain nightly-NIGHTLY_VERSION
18 | 
19 | RUN rustup component add rustfmt clippy
20 | 


--------------------------------------------------------------------------------
/public/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 |   <head>
 4 |     <title>CI artifact of termoshtt/accel project</title>
 5 |   </head>
 6 |   <body>
 7 |     <ul>
 8 |       <li><a href="accel/accel/index.html">Document for accel and accel-derive crates</a></li>
 9 |       <li><a href="accel-core/accel_core/index.html">Document for accel-core crate</a></li>
10 |       <li><a href="benchmark/report/index.html">Benchmark Result</a></li>
11 |     </ul>
12 |   </body>
13 | </html>
14 | 
15 | 


--------------------------------------------------------------------------------
/setup_nvptx_toolchain.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -xue
3 | 
4 | NIGHTLY=nightly-2020-05-01
5 | rustup toolchain add ${NIGHTLY}
6 | rustup target add nvptx64-nvidia-cuda --toolchain ${NIGHTLY}
7 | cargo install ptx-linker -f
8 | 


--------------------------------------------------------------------------------