├── JCudaSamples
│   ├── localMavenRepository
│   │   ├── de
│   │   │   └── javagl
│   │   │       └── matrixmarketreader
│   │   │           ├── maven-metadata-local.xml.md5
│   │   │           ├── maven-metadata-local.xml.sha1
│   │   │           ├── 0.0.1-SNAPSHOT
│   │   │           │   ├── maven-metadata-local.xml.md5
│   │   │           │   ├── maven-metadata-local.xml.sha1
│   │   │           │   ├── matrixmarketreader-0.0.1-SNAPSHOT.jar.md5
│   │   │           │   ├── matrixmarketreader-0.0.1-SNAPSHOT.pom.md5
│   │   │           │   ├── matrixmarketreader-0.0.1-SNAPSHOT.jar.sha1
│   │   │           │   ├── matrixmarketreader-0.0.1-SNAPSHOT.pom.sha1
│   │   │           │   ├── matrixmarketreader-0.0.1-SNAPSHOT.jar
│   │   │           │   ├── matrixmarketreader-0.0.1-SNAPSHOT.pom
│   │   │           │   └── maven-metadata-local.xml
│   │   │           └── maven-metadata-local.xml
│   │   └── org
│   │       └── jcuda
│   │           └── jcuda-matrix-utils
│   │               ├── maven-metadata-local.xml.md5
│   │               ├── maven-metadata-local.xml.sha1
│   │               ├── 0.0.1-SNAPSHOT
│   │               │   ├── maven-metadata-local.xml.md5
│   │               │   ├── maven-metadata-local.xml.sha1
│   │               │   ├── jcuda-matrix-utils-0.0.1-SNAPSHOT.jar.md5
│   │               │   ├── jcuda-matrix-utils-0.0.1-SNAPSHOT.pom.md5
│   │               │   ├── jcuda-matrix-utils-0.0.1-SNAPSHOT.jar.sha1
│   │               │   ├── jcuda-matrix-utils-0.0.1-SNAPSHOT.pom.sha1
│   │               │   ├── jcuda-matrix-utils-0.0.1-SNAPSHOT.jar
│   │               │   ├── jcuda-matrix-utils-0.0.1-SNAPSHOT.pom
│   │               │   └── maven-metadata-local.xml
│   │               └── maven-metadata-local.xml
│   ├── src
│   │   └── main
│   │       ├── resources
│   │       │   ├── data
│   │       │   │   ├── driver
│   │       │   │   │   └── gl
│   │       │   │   │       └── Bucky.raw
│   │       │   │   └── jcudnn
│   │       │   │       └── mnist
│   │       │   │           ├── conv1.bin
│   │       │   │           ├── conv2.bin
│   │       │   │           ├── ip1.bin
│   │       │   │           ├── ip2.bin
│   │       │   │           ├── conv1.bias.bin
│   │       │   │           ├── conv2.bias.bin
│   │       │   │           ├── five_28x28.pgm
│   │       │   │           ├── ip1.bias.bin
│   │       │   │           ├── ip2.bias.bin
│   │       │   │           ├── one_28x28.pgm
│   │       │   │           └── three_28x28.pgm
│   │       │   └── kernels
│   │       │       ├── JCudaVectorAddKernel.cu
│   │       │       ├── JCudaConstantMemoryKernel.cu
│   │       │       ├── JCudaDriverSimpleGLKernel.cu
│   │       │       ├── JCudaVectorAddKernel.ptx
│   │       │       ├── JCudaDynamicParallelismKernel.cu
│   │       │       ├── JCudaAllocationInKernelKernel.cu
│   │       │       ├── JCudaReductionKernel.cu
│   │       │       └── JCudaDriverVolumeRendererKernel.cu
│   │       └── java
│   │           └── jcuda
│   │               ├── runtime
│   │               │   └── samples
│   │               │       ├── JCudaPrintDeviceInfo.java
│   │               │       ├── JCudaRuntimeUnifiedMemory.java
│   │               │       ├── JCudaRuntimeBasicStreamCallback.java
│   │               │       ├── JCudaRuntimeMappedMemory.java
│   │               │       └── JCudaRuntimeMemoryBandwidths.java
│   │               ├── jcufft
│   │               │   └── samples
│   │               │       └── JCufftSample.java
│   │               ├── driver
│   │               │   ├── samples
│   │               │   │   ├── JCudaDriverHostFunction.java
│   │               │   │   ├── JCudaDriverUnifiedMemory.java
│   │               │   │   ├── JCudaDriverBasicStreamCallback.java
│   │               │   │   ├── JCudaConstantMemoryExample.java
│   │               │   │   ├── JCudaDynamicParallelism.java
│   │               │   │   ├── JCudaVectorAdd.java
│   │               │   │   ├── JCudaAllocationInKernel.java
│   │               │   │   ├── JCudaDriverStreamCallbacks.java
│   │               │   │   └── JCudaReduction.java
│   │               │   └── gl
│   │               │       └── samples
│   │               │           └── SimpleInteraction.java
│   │               ├── jcurand
│   │               │   └── samples
│   │               │       └── JCurandSample.java
│   │               ├── vec
│   │               │   └── samples
│   │               │       ├── VecFloatSample.java
│   │               │       └── VecDoubleSample.java
│   │               ├── jcublas
│   │               │   └── samples
│   │               │       ├── JCublas2Sample.java
│   │               │       ├── JCublas2PointerModes.java
│   │               │       ├── JCublas2SgemmExSample.java
│   │               │       ├── JCublas2SgemmBatched.java
│   │               │       └── JCublas2MatrixInvert.java
│   │               ├── nvrtc
│   │               │   └── samples
│   │               │       ├── JNvrtcLoweredNames.java
│   │               │       └── JNvrtcVectorAdd.java
│   │               ├── jcudnn
│   │               │   └── samples
│   │               │       └── JCudnnMnistUtils.java
│   │               └── samples
│   │                   └── utils
│   │                       └── JCudaSamplesUtils.java
│   └── pom.xml
├── .gitignore
├── README.md
└── LICENSE

-------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/maven-metadata-local.xml.md5: -------------------------------------------------------------------------------- 1 | 
b95802f5bafd13d9521ada1c42de69e2 -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/maven-metadata-local.xml.md5: -------------------------------------------------------------------------------- 1 | 2428239c37417b89acedb85528a29f5b -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/maven-metadata-local.xml.sha1: -------------------------------------------------------------------------------- 1 | ed0459380fe56fe148a632e471cff8f0fb588178 -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/maven-metadata-local.xml.sha1: -------------------------------------------------------------------------------- 1 | 191b56ff1714aa6441f7a527520804eabed1d6ad -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/maven-metadata-local.xml.md5: -------------------------------------------------------------------------------- 1 | 3a733cd451cda335f5aca40b4f25e8a0 -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/maven-metadata-local.xml.md5: -------------------------------------------------------------------------------- 1 | 107587a329bcfa73a6b8b73c68ae5686 -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/maven-metadata-local.xml.sha1: -------------------------------------------------------------------------------- 1 | bfa5c79df415720103ace72adf1945b933e568c0 -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/maven-metadata-local.xml.sha1: -------------------------------------------------------------------------------- 1 | 886fc25d151db52534eed818de5f8b4f97b36932 -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/matrixmarketreader-0.0.1-SNAPSHOT.jar.md5: -------------------------------------------------------------------------------- 1 | f39ac4ec4d2d50fe7bf1e5d12ab8479c -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/matrixmarketreader-0.0.1-SNAPSHOT.pom.md5: -------------------------------------------------------------------------------- 1 | b15aaba1771436557698778d6c01f4d0 -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/jcuda-matrix-utils-0.0.1-SNAPSHOT.jar.md5: -------------------------------------------------------------------------------- 1 | b1d419efac0e2b1f577f242812d403cb -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/jcuda-matrix-utils-0.0.1-SNAPSHOT.pom.md5: -------------------------------------------------------------------------------- 1 | 1386ca6a00c80b800e756af4c53467a2 -------------------------------------------------------------------------------- 
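The `.md5` and `.sha1` files above are the standard Maven checksum companions: each holds the hex digest of the artifact of the same name, which Maven compares against a freshly computed digest when resolving from this local repository. A minimal sketch of that check (the file names are taken from this repository; `MessageDigest` is standard JDK API):

```java
import java.nio.file.Files;
import java.nio.file.Paths;
import java.security.MessageDigest;

public class ChecksumCheck
{
    public static void main(String[] args) throws Exception
    {
        // Digest the artifact itself
        byte[] jar = Files.readAllBytes(
            Paths.get("matrixmarketreader-0.0.1-SNAPSHOT.jar"));
        byte[] digest = MessageDigest.getInstance("MD5").digest(jar);

        // Render the digest as lowercase hex, as stored in the .md5 file
        StringBuilder hex = new StringBuilder();
        for (byte b : digest)
        {
            hex.append(String.format("%02x", b));
        }

        // Compare against the recorded checksum
        String expected = new String(Files.readAllBytes(
            Paths.get("matrixmarketreader-0.0.1-SNAPSHOT.jar.md5"))).trim();
        System.out.println(hex.toString().equals(expected)
            ? "Checksum OK" : "Checksum MISMATCH");
    }
}
```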
/JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/matrixmarketreader-0.0.1-SNAPSHOT.jar.sha1: -------------------------------------------------------------------------------- 1 | 8791d1ae86e4678241a6eb752ee1616b9ba51a7d -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/matrixmarketreader-0.0.1-SNAPSHOT.pom.sha1: -------------------------------------------------------------------------------- 1 | 5326dc181c8e65a7f315cd538df661c30008b08d -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/jcuda-matrix-utils-0.0.1-SNAPSHOT.jar.sha1: -------------------------------------------------------------------------------- 1 | 152ba788a1f28b730efb6fef722718b66877e4dd -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/jcuda-matrix-utils-0.0.1-SNAPSHOT.pom.sha1: -------------------------------------------------------------------------------- 1 | badc33ecf009d2e6360f7c96c3152ea4cf3be920 -------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/data/driver/gl/Bucky.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/driver/gl/Bucky.raw -------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/data/jcudnn/mnist/conv1.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/conv1.bin -------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/data/jcudnn/mnist/conv2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/conv2.bin -------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/data/jcudnn/mnist/ip1.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/ip1.bin -------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/data/jcudnn/mnist/ip2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/ip2.bin -------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/data/jcudnn/mnist/conv1.bias.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/conv1.bias.bin -------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/data/jcudnn/mnist/conv2.bias.bin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/conv2.bias.bin -------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/data/jcudnn/mnist/five_28x28.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/five_28x28.pgm -------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/data/jcudnn/mnist/ip1.bias.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/ip1.bias.bin -------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/data/jcudnn/mnist/ip2.bias.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/ip2.bias.bin -------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/data/jcudnn/mnist/one_28x28.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/one_28x28.pgm -------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/data/jcudnn/mnist/three_28x28.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/three_28x28.pgm -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | JCudaVec/target/ 2 | /JCudaSamples/cudnn64_5.dll 3 | /JCudaSamples/.settings 4 | /JCudaSamples/.classpath 5 | /JCudaSamples/.project 6 | /JCudaSamples/target 7 | /JCudaSamples/src/main/resources/kernels/*.cubin 8 | /JCudaSamples/src/main/resources/kernels/*.ptx 9 | -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/matrixmarketreader-0.0.1-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/matrixmarketreader-0.0.1-SNAPSHOT.jar -------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/jcuda-matrix-utils-0.0.1-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/jcuda-matrix-utils-0.0.1-SNAPSHOT.jar -------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/kernels/JCudaVectorAddKernel.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void add(int n, float *a, float *b, float *sum) 3 | { 4 | 
int i = blockIdx.x * blockDim.x + threadIdx.x; 5 | if (i < n) 6 | { 7 | sum[i] = a[i] + b[i]; 8 | } 9 | }
-------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/maven-metadata-local.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <metadata> 3 | <groupId>de.javagl</groupId> 4 | <artifactId>matrixmarketreader</artifactId> 5 | <versioning> 6 | <versions> 7 | <version>0.0.1-SNAPSHOT</version> 8 | </versions> 9 | <lastUpdated>20161010163950</lastUpdated> 10 | </versioning> 11 | </metadata> 12 | 
-------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/maven-metadata-local.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <metadata> 3 | <groupId>org.jcuda</groupId> 4 | <artifactId>jcuda-matrix-utils</artifactId> 5 | <versioning> 6 | <versions> 7 | <version>0.0.1-SNAPSHOT</version> 8 | </versions> 9 | <lastUpdated>20190308184031</lastUpdated> 10 | </versioning> 11 | </metadata> 12 | 
-------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/kernels/JCudaConstantMemoryKernel.cu: -------------------------------------------------------------------------------- 1 | #define CONSTANT_MEMORY_SIZE 100 2 | __constant__ float constantMemoryData[CONSTANT_MEMORY_SIZE]; 3 | 4 | extern "C" 5 | __global__ void constantMemoryKernel(float* array, int size) 6 | { 7 | int index = blockIdx.x * blockDim.x + threadIdx.x; 8 | if (index < size && index < CONSTANT_MEMORY_SIZE) { 9 | array[index] = constantMemoryData[index]; 10 | } 11 | } 12 | 
-------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/matrixmarketreader-0.0.1-SNAPSHOT.pom: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 3 | xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4 | <modelVersion>4.0.0</modelVersion> 5 | <groupId>de.javagl</groupId> 6 | <artifactId>matrixmarketreader</artifactId> 7 | <version>0.0.1-SNAPSHOT</version> 8 | <description>POM was created from install:install-file</description> 9 | </project> 10 | 
-------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/jcuda-matrix-utils-0.0.1-SNAPSHOT.pom: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 3 | xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4 | <modelVersion>4.0.0</modelVersion> 5 | <groupId>org.jcuda</groupId> 6 | <artifactId>jcuda-matrix-utils</artifactId> 7 | <version>0.0.1-SNAPSHOT</version> 8 | <description>POM was created from install:install-file</description> 9 | </project> 10 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # jcuda-samples 2 | 3 | This repository contains samples for the JCuda libraries. 4 | 5 | **Note:** Some of the samples require third-party libraries, JCuda 6 | libraries that are not part of the [`jcuda-main`](https://github.com/jcuda/jcuda-main) 7 | package (for example, [`JCudaVec`](https://github.com/jcuda/jcuda-vec) or 8 | [`JCudnn`](https://github.com/jcuda/jcudnn)), or utility libraries 9 | that are not available in Maven Central. In order to compile these 10 | samples, additional setup steps may be necessary. The main goal 11 | of this repository is to collect and maintain the samples in a 12 | form that allows them to serve as a collection of snippets that 13 | can easily be copied and pasted into one's own projects to get started.
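As a rough sketch of the pattern that most of these samples share (assuming only the core `jcuda` artifact from `jcuda-main` on the classpath), a minimal program looks like this:

```java
import jcuda.runtime.JCuda;

public class MinimalJCuda
{
    public static void main(String[] args)
    {
        // Report CUDA errors as exceptions instead of return codes
        JCuda.setExceptionsEnabled(true);

        int deviceCount[] = { 0 };
        JCuda.cudaGetDeviceCount(deviceCount);
        System.out.println("Found " + deviceCount[0] + " CUDA devices");
    }
}
```

The class name `MinimalJCuda` is only illustrative; the call pattern itself is the one used by the samples in this repository.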
14 | 15 | 16 | 
-------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/maven-metadata-local.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <metadata> 3 | <groupId>de.javagl</groupId> 4 | <artifactId>matrixmarketreader</artifactId> 5 | <version>0.0.1-SNAPSHOT</version> 6 | <versioning> 7 | <snapshot> 8 | <localCopy>true</localCopy> 9 | </snapshot> 10 | <lastUpdated>20161010163950</lastUpdated> 11 | <snapshotVersions> 12 | <snapshotVersion> 13 | <extension>jar</extension> 14 | <value>0.0.1-SNAPSHOT</value> 15 | <updated>20161010163950</updated> 16 | </snapshotVersion> 17 | <snapshotVersion> 18 | <extension>pom</extension> 19 | <value>0.0.1-SNAPSHOT</value> 20 | <updated>20161010155311</updated> 21 | </snapshotVersion> 22 | </snapshotVersions> 23 | </versioning> 24 | </metadata> 25 | 
-------------------------------------------------------------------------------- /JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/maven-metadata-local.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <metadata> 3 | <groupId>org.jcuda</groupId> 4 | <artifactId>jcuda-matrix-utils</artifactId> 5 | <version>0.0.1-SNAPSHOT</version> 6 | <versioning> 7 | <snapshot> 8 | <localCopy>true</localCopy> 9 | </snapshot> 10 | <lastUpdated>20190308184031</lastUpdated> 11 | <snapshotVersions> 12 | <snapshotVersion> 13 | <extension>jar</extension> 14 | <value>0.0.1-SNAPSHOT</value> 15 | <updated>20190308184031</updated> 16 | </snapshotVersion> 17 | <snapshotVersion> 18 | <extension>pom</extension> 19 | <value>0.0.1-SNAPSHOT</value> 20 | <updated>20161010175417</updated> 21 | </snapshotVersion> 22 | </snapshotVersions> 23 | </versioning> 24 | </metadata> 25 | 
-------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/kernels/JCudaDriverSimpleGLKernel.cu: -------------------------------------------------------------------------------- 1 | // Taken from the NVIDIA "2_Graphics\simpleGL" sample: 2 | 3 | // A kernel that modifies the z-coordinates of a rectangular 4 | // grid of vertices, based on a time value, so that they 5 | // form an animated sine wave 6 | 7 | extern "C" 8 | __global__ void simple_vbo_kernel( 9 | float4 *pos, unsigned int width, unsigned int height, float time) 10 | { 11 | unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; 12 | unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; 13 | 14 | // calculate uv coordinates 15 | float u = x / (float) width; 16 | float v = y / (float) height; 17 | u = u*2.0f - 1.0f; 18 | v = v*2.0f - 1.0f; 19 | 20 | // calculate simple sine wave pattern 21 | float freq = 4.0f; 22 | float w = sinf(u*freq + time) * cosf(v*freq + time) * 0.5f; 23 | 24 | // write output vertex 25 | pos[y*width+x] = make_float4(u, w, v, 1.0f); 26 | } 27 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2008-2016 Marco Hutter - http://www.jcuda.org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | 23 | 
-------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/runtime/samples/JCudaPrintDeviceInfo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2018 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.runtime.samples; 7 | 8 | import static jcuda.runtime.JCuda.cudaGetDeviceCount; 9 | import static jcuda.runtime.JCuda.cudaGetDeviceProperties; 10 | 11 | import jcuda.runtime.JCuda; 12 | import jcuda.runtime.cudaDeviceProp; 13 | 14 | /** 15 | * A sample that prints information about all available CUDA devices 16 | */ 17 | public class JCudaPrintDeviceInfo 18 | { 19 | public static void main(String[] args) 20 | { 21 | JCuda.setExceptionsEnabled(true); 22 | int deviceCount[] = { 0 }; 23 | cudaGetDeviceCount(deviceCount); 24 | System.out.println("Found " + deviceCount[0] + " devices"); 25 | for (int device = 0; device < deviceCount[0]; device++) 26 | { 27 | System.out.println("Properties of device " + device + ":"); 28 | cudaDeviceProp deviceProperties = new cudaDeviceProp(); 29 | cudaGetDeviceProperties(deviceProperties, device); 30 | System.out.println(deviceProperties.toFormattedString()); 31 | } 32 | 33 | } 34 | } 35 | 
-------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/kernels/JCudaVectorAddKernel.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-20732876 5 | // Cuda compilation tools, release 8.0, V8.0.26 6 | // Based on LLVM 3.4svn 7 | // 8 | 9 | .version 5.0 10 | .target sm_20 11 | .address_size 64 12 | 13 | // .globl add 14 | 15 | .visible .entry add( 16 | .param .u32 add_param_0, 17 | .param .u64 add_param_1, 18 | .param .u64 add_param_2, 19 | .param .u64 add_param_3 20 | ) 21 | { 22 | .reg .pred %p<2>; 23 | .reg .f32 %f<4>; 24 | .reg .b32 %r<6>; 25 | .reg .b64 %rd<11>; 26 | 27 | 28 | ld.param.u32 %r2, [add_param_0]; 29 | ld.param.u64 %rd1, [add_param_1]; 30 | ld.param.u64 %rd2, [add_param_2]; 31 | ld.param.u64 %rd3, [add_param_3]; 32 | mov.u32 %r3, %ctaid.x; 33 | mov.u32 %r4, %ntid.x; 34 | mov.u32 %r5, %tid.x; 35 | mad.lo.s32 %r1, %r4, %r3, %r5; 36 | setp.ge.s32 %p1, %r1, %r2; 37 | @%p1 bra BB0_2; 38 | 39 | cvta.to.global.u64 %rd4, %rd1; 40 | mul.wide.s32 %rd5, %r1, 4; 41 | add.s64 %rd6, %rd4, %rd5; 42 | cvta.to.global.u64 %rd7, %rd2; 43 | add.s64 %rd8, %rd7, %rd5; 44 | ld.global.f32 %f1, [%rd8]; 45 | ld.global.f32 %f2, [%rd6]; 46 | add.f32 %f3, %f2, %f1; 47 | cvta.to.global.u64 %rd9, %rd3; 48 | add.s64 %rd10, %rd9, %rd5; 49 | st.global.f32 [%rd10], %f3; 50 | 51 | BB0_2: 52 | ret; 53 | } 54 | 55 | 56 | 
-------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/kernels/JCudaDynamicParallelismKernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org 5 | */ 6 | #include <stdio.h> 7 | 8 | // A simple example of using dynamic parallelism. 
This kernel can 9 | // be compiled into an object file by calling 10 | // 11 | // nvcc -dc -arch=sm_52 JCudaDynamicParallelismKernel.cu -o JCudaDynamicParallelismKernel.o 12 | // 13 | // The resulting object file can be linked into a CUBIN file with 14 | // 15 | // nvcc -dlink -arch=sm_52 -cubin JCudaDynamicParallelismKernel.o -o JCudaDynamicParallelismKernel.cubin 16 | // 17 | // Alternatively, both steps can be taken at once, by calling 18 | // 19 | // nvcc -dlink -arch=sm_52 -cubin -c JCudaDynamicParallelismKernel.cu -o JCudaDynamicParallelismKernel.cubin 20 | // 21 | // The architecture (here, sm_52) must match the architecture of 22 | // the target device. 23 | 24 | extern "C" 25 | __global__ void childKernel(unsigned int parentThreadIndex, float* data) 26 | { 27 | printf("Parent thread index: %d, child thread index: %d\n", 28 | parentThreadIndex, threadIdx.x); 29 | data[threadIdx.x] = parentThreadIndex + 0.1f * threadIdx.x; 30 | } 31 | 32 | extern "C" 33 | __global__ void parentKernel(unsigned int size, float *data) 34 | { 35 | childKernel<<<1, 8>>>(threadIdx.x, data + threadIdx.x * 8); 36 | cudaDeviceSynchronize(); 37 | __syncthreads(); 38 | } 39 | 
-------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/kernels/JCudaAllocationInKernelKernel.cu: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | 3 | extern "C" 4 | __global__ void allocatingKernel(void** devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice) 5 | { 6 | int thread = threadIdx.x + blockDim.x * blockIdx.x; 7 | 8 | short* devicePointerAllocatedOnDevice = (short*) malloc(3 * sizeof(short)); 9 | printf("In thread %d allocated %p\n", thread, devicePointerAllocatedOnDevice); 10 | for(int i=0; i < 3; i++) 11 | { 12 | devicePointerAllocatedOnDevice[i] = thread * 10 + i; 13 | } 14 | devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice[thread] = 15 | devicePointerAllocatedOnDevice; 16 | } 17 | 18 | extern "C" 19 | __global__ void copyingKernel( 20 | void** devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice, 21 | void** devicePointerAllocatedOnHostToDevicePointersAllocatedOnHost) 22 | { 23 | int thread = threadIdx.x + blockDim.x * blockIdx.x; 24 | 25 | short* devicePointerAllocatedOnDevice = (short*)devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice[thread]; 26 | short* devicePointerAllocatedOnHost = (short*)devicePointerAllocatedOnHostToDevicePointersAllocatedOnHost[thread]; 27 | 28 | printf("In thread %d copy from %p to %p\n", thread, devicePointerAllocatedOnDevice, devicePointerAllocatedOnHost); 29 | 30 | for(int i=0; i < 3; i++) 31 | { 32 | devicePointerAllocatedOnHost[i] = devicePointerAllocatedOnDevice[i]; 33 | } 34 | } 35 | 36 | extern "C" 37 | __global__ void freeingKernel( 38 | void** devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice) 39 | { 40 | int thread = threadIdx.x + blockDim.x * blockIdx.x; 41 | 42 | short* devicePointerAllocatedOnDevice = (short*)devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice[thread]; 43 | 44 | printf("In thread %d free %p\n", thread, devicePointerAllocatedOnDevice); 45 | 46 | free(devicePointerAllocatedOnDevice); 47 | } 48 | 49 | 
-------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/jcufft/samples/JCufftSample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 
2008-2016 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.jcufft.samples; 7 | 8 | import static jcuda.jcufft.JCufft.CUFFT_FORWARD; 9 | import static jcuda.jcufft.JCufft.cufftDestroy; 10 | import static jcuda.jcufft.JCufft.cufftExecC2C; 11 | import static jcuda.jcufft.JCufft.cufftPlan1d; 12 | 13 | import org.jtransforms.fft.FloatFFT_1D; 14 | 15 | import jcuda.jcufft.cufftHandle; 16 | import jcuda.jcufft.cufftType; 17 | import jcuda.samples.utils.JCudaSamplesUtils; 18 | 19 | /** 20 | * This is a sample class that performs a 1D Complex-To-Complex 21 | * forward FFT with JCufft, and compares the result to the 22 | * reference computed with JTransforms. 23 | */ 24 | class JCufftSample 25 | { 26 | public static void main(String args[]) 27 | { 28 | testC2C1D(1<<20); 29 | } 30 | 31 | /** 32 | * Test the 1D C2C transform with the given size. 33 | * 34 | * @param size The size of the transform 35 | */ 36 | private static void testC2C1D(int size) 37 | { 38 | System.out.println("Creating input data..."); 39 | float input[] = JCudaSamplesUtils.createRandomFloatData(size * 2); 40 | 41 | System.out.println("Performing 1D C2C transform with JTransforms..."); 42 | float outputJTransforms[] = input.clone(); 43 | FloatFFT_1D fft = new FloatFFT_1D(size); 44 | fft.complexForward(outputJTransforms); 45 | 46 | System.out.println("Performing 1D C2C transform with JCufft..."); 47 | float outputJCufft[] = input.clone(); 48 | cufftHandle plan = new cufftHandle(); 49 | cufftPlan1d(plan, size, cufftType.CUFFT_C2C, 1); 50 | cufftExecC2C(plan, outputJCufft, outputJCufft, CUFFT_FORWARD); 51 | cufftDestroy(plan); 52 | 53 | boolean passed = JCudaSamplesUtils.equalByNorm( 54 | outputJTransforms, outputJCufft); 55 | System.out.println("testC2C1D " + (passed ? "PASSED" : "FAILED")); 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/driver/samples/JCudaDriverHostFunction.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2017 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.driver.samples; 7 | 8 | import static jcuda.driver.JCudaDriver.cuCtxCreate; 9 | import static jcuda.driver.JCudaDriver.cuCtxDestroy; 10 | import static jcuda.driver.JCudaDriver.cuDeviceGet; 11 | import static jcuda.driver.JCudaDriver.cuInit; 12 | import static jcuda.driver.JCudaDriver.cuLaunchHostFunc; 13 | import static jcuda.driver.JCudaDriver.cuStreamCreate; 14 | import static jcuda.driver.JCudaDriver.cuStreamSynchronize; 15 | 16 | import jcuda.driver.CUcontext; 17 | import jcuda.driver.CUdevice; 18 | import jcuda.driver.CUhostFn; 19 | import jcuda.driver.CUstream; 20 | import jcuda.driver.JCudaDriver; 21 | 22 | /** 23 | * An example showing how to call a host function via the driver API 24 | */ 25 | public class JCudaDriverHostFunction 26 | { 27 | /** 28 | * Entry point 29 | * 30 | * @param args Not used 31 | */ 32 | public static void main(String[] args) 33 | { 34 | // Default initialization 35 | JCudaDriver.setExceptionsEnabled(true); 36 | cuInit(0); 37 | CUcontext context = new CUcontext(); 38 | CUdevice device = new CUdevice(); 39 | cuDeviceGet(device, 0); 40 | cuCtxCreate(context, 0, device); 41 | 42 | // Create a stream 43 | CUstream stream = new CUstream(); 44 | cuStreamCreate(stream, 0); 45 | 46 | // Define a host function and launch it 47 | CUhostFn fn = new CUhostFn() 48 | { 49 | @Override 50 | public void call(Object 
userData) 51 | { 52 | System.out.println("Called with " + userData); 53 | } 54 | }; 55 | cuLaunchHostFunc(stream, fn, "Example user object"); 56 | 57 | // Wait for the stream to finish 58 | cuStreamSynchronize(stream); 59 | 60 | // Clean up 61 | cuCtxDestroy(context); 62 | 63 | System.out.println("Done"); 64 | } 65 | } 66 | 67 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/jcurand/samples/JCurandSample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.jcurand.samples; 7 | 8 | import static jcuda.jcurand.JCurand.curandCreateGenerator; 9 | import static jcuda.jcurand.JCurand.curandDestroyGenerator; 10 | import static jcuda.jcurand.JCurand.curandGenerateUniform; 11 | import static jcuda.jcurand.JCurand.curandSetPseudoRandomGeneratorSeed; 12 | import static jcuda.jcurand.curandRngType.CURAND_RNG_PSEUDO_DEFAULT; 13 | import static jcuda.runtime.JCuda.cudaFree; 14 | import static jcuda.runtime.JCuda.cudaMalloc; 15 | import static jcuda.runtime.JCuda.cudaMemcpy; 16 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; 17 | 18 | import java.util.Arrays; 19 | 20 | import jcuda.Pointer; 21 | import jcuda.Sizeof; 22 | import jcuda.jcurand.JCurand; 23 | import jcuda.jcurand.curandGenerator; 24 | import jcuda.runtime.JCuda; 25 | 26 | /** 27 | * A small sample application showing how to use JCurand.
28 | *
29 | * This is a direct port of the NVIDIA CURAND documentation example. 30 | */ 31 | public class JCurandSample 32 | { 33 | public static void main(String args[]) 34 | { 35 | // Enable exceptions and omit all subsequent error checks 36 | JCuda.setExceptionsEnabled(true); 37 | JCurand.setExceptionsEnabled(true); 38 | 39 | int n = 100; 40 | curandGenerator generator = new curandGenerator(); 41 | 42 | // Allocate n floats on host 43 | float hostData[] = new float[n]; 44 | 45 | // Allocate n floats on device 46 | Pointer deviceData = new Pointer(); 47 | cudaMalloc(deviceData, n * Sizeof.FLOAT); 48 | 49 | // Create pseudo-random number generator 50 | curandCreateGenerator(generator, CURAND_RNG_PSEUDO_DEFAULT); 51 | 52 | // Set seed 53 | curandSetPseudoRandomGeneratorSeed(generator, 1234); 54 | 55 | // Generate n floats on device 56 | curandGenerateUniform(generator, deviceData, n); 57 | 58 | // Copy device memory to host 59 | cudaMemcpy(Pointer.to(hostData), deviceData, 60 | n * Sizeof.FLOAT, cudaMemcpyDeviceToHost); 61 | 62 | // Show result 63 | System.out.println(Arrays.toString(hostData)); 64 | 65 | // Cleanup 66 | curandDestroyGenerator(generator); 67 | cudaFree(deviceData); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/kernels/JCudaReductionKernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA driver and runtime API 3 | * http://www.jcuda.org 4 | * 5 | * 6 | * This code is based on the NVIDIA 'reduction' CUDA sample, 7 | * Copyright 1993-2010 NVIDIA Corporation. 8 | */ 9 | extern "C" 10 | __global__ void reduce(float *g_idata, float *g_odata, unsigned int n) 11 | { 12 | extern __shared__ float sdata[]; 13 | 14 | // perform first level of reduction, 15 | // reading from global memory, writing to shared memory 16 | unsigned int tid = threadIdx.x; 17 | unsigned int i = blockIdx.x*blockDim.x*2 + threadIdx.x; 18 | unsigned int gridSize = blockDim.x*2*gridDim.x; 19 | 20 | float mySum = 0; 21 | 22 | // we reduce multiple elements per thread. The number is determined by the 23 | // number of active thread blocks (via gridDim). More blocks will result 24 | // in a larger gridSize and therefore fewer elements per thread 25 | while (i < n) 26 | { 27 | mySum += g_idata[i]; 28 | // ensure we don't read out of bounds 29 | if (i + blockDim.x < n) 30 | mySum += g_idata[i+blockDim.x]; 31 | i += gridSize; 32 | } 33 | 34 | // each thread puts its local sum into shared memory 35 | sdata[tid] = mySum; 36 | __syncthreads(); 37 | 38 | 39 | // do reduction in shared mem 40 | if (blockDim.x >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); } 41 | if (blockDim.x >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); } 42 | if (blockDim.x >= 128) { if (tid < 64) { sdata[tid] = mySum = mySum + sdata[tid + 64]; } __syncthreads(); } 43 | 44 | if (tid < 32) 45 | { 46 | // now that we are using warp-synchronous programming (below) 47 | // we need to declare our shared memory volatile so that the compiler 48 | // doesn't reorder stores to it and induce incorrect behavior. 
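// (Added note, not part of the original NVIDIA code: this implicit
// warp-synchronous idiom also assumes that the 32 threads of a warp
// execute in lockstep. From the Volta architecture onwards that is no
// longer guaranteed, and an explicit __syncwarp() would be required
// between the steps below.)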
49 | volatile float* smem = sdata; 50 | if (blockDim.x >= 64) { smem[tid] = mySum = mySum + smem[tid + 32]; } 51 | if (blockDim.x >= 32) { smem[tid] = mySum = mySum + smem[tid + 16]; } 52 | if (blockDim.x >= 16) { smem[tid] = mySum = mySum + smem[tid + 8]; } 53 | if (blockDim.x >= 8) { smem[tid] = mySum = mySum + smem[tid + 4]; } 54 | if (blockDim.x >= 4) { smem[tid] = mySum = mySum + smem[tid + 2]; } 55 | if (blockDim.x >= 2) { smem[tid] = mySum = mySum + smem[tid + 1]; } 56 | } 57 | 58 | // write result for this block to global mem 59 | if (tid == 0) 60 | g_odata[blockIdx.x] = sdata[0]; 61 | } 62 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/runtime/samples/JCudaRuntimeUnifiedMemory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.runtime.samples; 7 | 8 | import static jcuda.jcublas.JCublas2.cublasCreate; 9 | import static jcuda.jcublas.JCublas2.cublasSdot; 10 | import static jcuda.runtime.JCuda.cudaDeviceGetAttribute; 11 | import static jcuda.runtime.JCuda.cudaMallocManaged; 12 | import static jcuda.runtime.JCuda.cudaMemAttachGlobal; 13 | import static jcuda.runtime.JCuda.cudaMemAttachHost; 14 | import static jcuda.runtime.JCuda.cudaStreamAttachMemAsync; 15 | import static jcuda.runtime.JCuda.cudaStreamSynchronize; 16 | import static jcuda.runtime.cudaDeviceAttr.cudaDevAttrManagedMemory; 17 | 18 | import java.nio.ByteBuffer; 19 | import java.nio.ByteOrder; 20 | import java.nio.FloatBuffer; 21 | 22 | import jcuda.Pointer; 23 | import jcuda.Sizeof; 24 | import jcuda.jcublas.JCublas; 25 | import jcuda.jcublas.cublasHandle; 26 | import jcuda.runtime.JCuda; 27 | 28 | /** 29 | * An example showing how to use Unified / Managed memory with the 30 | * JCuda Runtime API 31 | */ 32 | public class JCudaRuntimeUnifiedMemory 33 | { 34 | public static void main(String[] args) 35 | { 36 | JCuda.setExceptionsEnabled(true); 37 | JCublas.setExceptionsEnabled(true); 38 | 39 | // Check if the device supports managed memory 40 | int supported[] = { 0 }; 41 | cudaDeviceGetAttribute(supported, cudaDevAttrManagedMemory, 0); 42 | if (supported[0] == 0) 43 | { 44 | System.err.println("Device does not support managed memory"); 45 | return; 46 | } 47 | 48 | // Allocate managed memory that is accessible to the host 49 | int n = 10; 50 | long size = n * Sizeof.FLOAT; 51 | Pointer p = new Pointer(); 52 | cudaMallocManaged(p, size, cudaMemAttachHost); 53 | 54 | // Obtain the byte buffer from the pointer. This is supported only 55 | // for memory that was allocated to be accessible on the host: 56 | ByteBuffer bb = p.getByteBuffer(0, size); 57 | 58 | System.out.println("Buffer on host side: " + bb); 59 | 60 | // Fill the buffer with sample data 61 | FloatBuffer fb = bb.order(ByteOrder.nativeOrder()).asFloatBuffer(); 62 | for (int i = 0; i < n; i++) 63 | { 64 | fb.put(i, i); 65 | } 66 | 67 | // Make the buffer accessible to all devices 68 | cudaStreamAttachMemAsync(null, p, 0, cudaMemAttachGlobal); 69 | cudaStreamSynchronize(null); 70 | 71 | // Use the pointer in a device operation (here, a dot product with 72 | // JCublas, for example). The data that was filled in by the host 73 | // will now be used by the device. 
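// (Added note: in a complete application, the CUBLAS handle and the
// managed allocation would also be released again, e.g. via
// cublasDestroy(handle) and cudaFree(p); this sample omits the cleanup
// for brevity.)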
74 | cublasHandle handle = new cublasHandle(); 75 | cublasCreate(handle); 76 | float result[] = { -1.0f }; 77 | cublasSdot(handle, n, p, 1, p, 1, Pointer.to(result)); 78 | System.out.println("Result: " + result[0]); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/runtime/samples/JCudaRuntimeBasicStreamCallback.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2017 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.runtime.samples; 7 | 8 | import static jcuda.runtime.JCuda.cudaFree; 9 | import static jcuda.runtime.JCuda.cudaMalloc; 10 | import static jcuda.runtime.JCuda.cudaMemcpyAsync; 11 | import static jcuda.runtime.JCuda.cudaStreamAddCallback; 12 | import static jcuda.runtime.JCuda.cudaStreamCreate; 13 | import static jcuda.runtime.JCuda.cudaStreamSynchronize; 14 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; 15 | 16 | import jcuda.Pointer; 17 | import jcuda.Sizeof; 18 | import jcuda.runtime.JCuda; 19 | import jcuda.runtime.cudaStreamCallback; 20 | import jcuda.runtime.cudaStream_t; 21 | 22 | /** 23 | * A very basic example / test for the stream callback functionality in the 24 | * JCuda Runtime API 25 | */ 26 | public class JCudaRuntimeBasicStreamCallback 27 | { 28 | /** 29 | * Entry point of this program 30 | * 31 | * @param args Not used 32 | */ 33 | public static void main(String[] args) 34 | { 35 | JCuda.setExceptionsEnabled(true); 36 | 37 | // The stream on which the callbacks will be registered. 38 | // When this is "null", then it is the default stream. 39 | cudaStream_t stream = null; 40 | 41 | boolean useDefaultStream = true; 42 | useDefaultStream = false; 43 | if (!useDefaultStream) 44 | { 45 | stream = new cudaStream_t(); 46 | cudaStreamCreate(stream); 47 | } 48 | System.out.println("Using stream " + stream); 49 | 50 | // Define the callback 51 | cudaStreamCallback callback = new cudaStreamCallback() 52 | { 53 | @Override 54 | public void call(cudaStream_t stream, int status, Object userData) 55 | { 56 | System.out.println("Callback called"); 57 | System.out.println(" stream : " + stream); 58 | System.out.println(" status : " + status); 59 | System.out.println(" userData: " + userData); 60 | System.out.println(" thread : " + Thread.currentThread()); 61 | } 62 | }; 63 | 64 | // Create some dummy data on the host, and copy it to the 65 | // device asynchronously 66 | int n = 100000; 67 | float hostData[] = new float[n]; 68 | Pointer deviceData = new Pointer(); 69 | cudaMalloc(deviceData, n * Sizeof.FLOAT); 70 | cudaMemcpyAsync(deviceData, Pointer.to(hostData), 71 | n * Sizeof.FLOAT, cudaMemcpyHostToDevice, stream); 72 | 73 | // Add the callback to the stream that carries the copy operation 74 | Object userData = "Example user data"; 75 | cudaStreamAddCallback(stream, callback, userData, 0); 76 | 77 | // Wait until the stream is finished 78 | cudaStreamSynchronize(stream); 79 | 80 | // Clean up 81 | cudaFree(deviceData); 82 | 83 | System.out.println("Done"); 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/driver/samples/JCudaDriverUnifiedMemory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org 5 | */ 6 | 
package jcuda.driver.samples; 7 | 8 | import static jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY; 9 | import static jcuda.driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL; 10 | import static jcuda.driver.CUmemAttach_flags.CU_MEM_ATTACH_HOST; 11 | import static jcuda.driver.JCudaDriver.cuCtxCreate; 12 | import static jcuda.driver.JCudaDriver.cuDeviceGet; 13 | import static jcuda.driver.JCudaDriver.cuDeviceGetAttribute; 14 | import static jcuda.driver.JCudaDriver.cuInit; 15 | import static jcuda.driver.JCudaDriver.cuMemAllocManaged; 16 | import static jcuda.driver.JCudaDriver.cuStreamAttachMemAsync; 17 | import static jcuda.driver.JCudaDriver.cuStreamSynchronize; 18 | import static jcuda.jcublas.JCublas2.cublasCreate; 19 | import static jcuda.jcublas.JCublas2.cublasSdot; 20 | 21 | import java.nio.ByteBuffer; 22 | import java.nio.ByteOrder; 23 | import java.nio.FloatBuffer; 24 | 25 | import jcuda.Pointer; 26 | import jcuda.Sizeof; 27 | import jcuda.driver.CUcontext; 28 | import jcuda.driver.CUdevice; 29 | import jcuda.driver.CUdeviceptr; 30 | import jcuda.driver.JCudaDriver; 31 | import jcuda.jcublas.JCublas; 32 | import jcuda.jcublas.cublasHandle; 33 | 34 | /** 35 | * An example showing how to use Unified / Managed memory with the 36 | * JCuda Driver API 37 | */ 38 | public class JCudaDriverUnifiedMemory 39 | { 40 | public static void main(String[] args) 41 | { 42 | JCudaDriver.setExceptionsEnabled(true); 43 | JCublas.setExceptionsEnabled(true); 44 | 45 | // Initialize the driver and create a context for the first device. 46 | cuInit(0); 47 | CUdevice device = new CUdevice(); 48 | cuDeviceGet(device, 0); 49 | CUcontext context = new CUcontext(); 50 | cuCtxCreate(context, 0, device); 51 | 52 | // Check if the device supports managed memory 53 | int supported[] = { 0 }; 54 | cuDeviceGetAttribute(supported, 55 | CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, device); 56 | if (supported[0] == 0) 57 | { 58 | System.err.println("Device does not support managed memory"); 59 | return; 60 | } 61 | 62 | // Allocate managed memory that is accessible to the host 63 | int n = 10; 64 | long size = n * Sizeof.FLOAT; 65 | CUdeviceptr p = new CUdeviceptr(); 66 | cuMemAllocManaged(p, size, CU_MEM_ATTACH_HOST); 67 | 68 | // Obtain the byte buffer from the pointer. This is supported only 69 | // for memory that was allocated to be accessible on the host: 70 | ByteBuffer bb = p.getByteBuffer(0, size); 71 | 72 | System.out.println("Buffer on host side: " + bb); 73 | 74 | // Fill the buffer with sample data 75 | FloatBuffer fb = bb.order(ByteOrder.nativeOrder()).asFloatBuffer(); 76 | for (int i = 0; i < n; i++) 77 | { 78 | fb.put(i, i); 79 | } 80 | 81 | // Make the buffer accessible to all devices 82 | cuStreamAttachMemAsync(null, p, 0, CU_MEM_ATTACH_GLOBAL); 83 | cuStreamSynchronize(null); 84 | 85 | // Use the pointer in a device operation (here, a dot product with 86 | // JCublas, for example). The data that was filled in by the host 87 | // will now be used by the device. 
88 | cublasHandle handle = new cublasHandle(); 89 | cublasCreate(handle); 90 | float result[] = { -1.0f }; 91 | cublasSdot(handle, n, p, 1, p, 1, Pointer.to(result)); 92 | System.out.println("Result: " + result[0]); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/driver/samples/JCudaDriverBasicStreamCallback.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2017 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.driver.samples; 7 | 8 | import static jcuda.driver.JCudaDriver.cuCtxCreate; 9 | import static jcuda.driver.JCudaDriver.cuCtxDestroy; 10 | import static jcuda.driver.JCudaDriver.cuDeviceGet; 11 | import static jcuda.driver.JCudaDriver.cuInit; 12 | import static jcuda.driver.JCudaDriver.cuMemAlloc; 13 | import static jcuda.driver.JCudaDriver.cuMemFree; 14 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoDAsync; 15 | import static jcuda.driver.JCudaDriver.cuStreamAddCallback; 16 | import static jcuda.driver.JCudaDriver.cuStreamCreate; 17 | import static jcuda.driver.JCudaDriver.cuStreamSynchronize; 18 | 19 | import jcuda.Pointer; 20 | import jcuda.Sizeof; 21 | import jcuda.driver.CUcontext; 22 | import jcuda.driver.CUdevice; 23 | import jcuda.driver.CUdeviceptr; 24 | import jcuda.driver.CUstream; 25 | import jcuda.driver.CUstreamCallback; 26 | import jcuda.driver.JCudaDriver; 27 | 28 | /** 29 | * A very basic example / test for the stream callback functionality in the 30 | * JCuda Driver API 31 | */ 32 | public class JCudaDriverBasicStreamCallback 33 | { 34 | /** 35 | * Entry point of this program 36 | * 37 | * @param args Not used 38 | */ 39 | public static void main(String[] args) 40 | { 41 | JCudaDriver.setExceptionsEnabled(true); 42 | 43 | // Default initialization 44 | cuInit(0); 45 | CUcontext context = new CUcontext(); 46 | CUdevice device = new CUdevice(); 47 | cuDeviceGet(device, 0); 48 | cuCtxCreate(context, 0, device); 49 | 50 | // The stream on which the callbacks will be registered. 51 | // When this is "null", then it is the default stream. 
52 | CUstream stream = null; 53 | 54 | boolean useDefaultStream = true; 55 | useDefaultStream = false; 56 | if (!useDefaultStream) 57 | { 58 | stream = new CUstream(); 59 | cuStreamCreate(stream, 0); 60 | } 61 | System.out.println("Using stream " + stream); 62 | 63 | // Define the callback 64 | CUstreamCallback callback = new CUstreamCallback() 65 | { 66 | @Override 67 | public void call(CUstream hStream, int status, Object userData) 68 | { 69 | System.out.println("Callback called"); 70 | System.out.println(" stream : " + hStream); 71 | System.out.println(" status : " + status); 72 | System.out.println(" userData: " + userData); 73 | System.out.println(" thread : " + Thread.currentThread()); 74 | } 75 | }; 76 | 77 | // Create some dummy data on the host, and copy it to the 78 | // device asynchronously 79 | int n = 100000; 80 | float hostData[] = new float[n]; 81 | CUdeviceptr deviceData = new CUdeviceptr(); 82 | cuMemAlloc(deviceData, n * Sizeof.FLOAT); 83 | cuMemcpyHtoDAsync(deviceData, Pointer.to(hostData), 84 | n * Sizeof.FLOAT, stream); 85 | 86 | // Add the callback to the stream that carries the copy operation 87 | Object userData = "Example user data"; 88 | cuStreamAddCallback(stream, callback, userData, 0); 89 | 90 | // Wait until the stream is finished 91 | cuStreamSynchronize(stream); 92 | 93 | // Clean up 94 | cuMemFree(deviceData); 95 | cuCtxDestroy(context); 96 | 97 | System.out.println("Done"); 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/vec/samples/VecFloatSample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCudaVec - Vector operations for JCuda 3 | * http://www.jcuda.org 4 | * 5 | * Copyright (c) 2013-2015 Marco Hutter - http://www.jcuda.org 6 | */ 7 | package jcuda.vec.samples; 8 | 9 | import static jcuda.driver.JCudaDriver.cuCtxCreate; 10 | import static jcuda.driver.JCudaDriver.cuDeviceGet; 11 | import static jcuda.driver.JCudaDriver.cuInit; 12 | import static jcuda.driver.JCudaDriver.cuMemAlloc; 13 | import static jcuda.driver.JCudaDriver.cuMemFree; 14 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH; 15 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoD; 16 | import jcuda.Pointer; 17 | import jcuda.Sizeof; 18 | import jcuda.driver.CUcontext; 19 | import jcuda.driver.CUdevice; 20 | import jcuda.driver.CUdeviceptr; 21 | import jcuda.driver.JCudaDriver; 22 | import jcuda.vec.VecFloat; 23 | 24 | /** 25 | * A sample showing how to use the JCuda vector library 26 | */ 27 | public class VecFloatSample 28 | { 29 | public static void main(String[] args) 30 | { 31 | // Enable exceptions and omit all subsequent error checks 32 | JCudaDriver.setExceptionsEnabled(true); 33 | 34 | // Initialize the driver and create a context for the first device. 
35 | cuInit(0); 36 | CUdevice device = new CUdevice(); 37 | cuDeviceGet(device, 0); 38 | CUcontext context = new CUcontext(); 39 | cuCtxCreate(context, 0, device); 40 | 41 | // Afterwards, initialize the vector library, which will 42 | // attach to the current context 43 | VecFloat.init(); 44 | 45 | // Allocate and fill the host input data 46 | int n = 50000; 47 | float hostX[] = new float[n]; 48 | float hostY[] = new float[n]; 49 | for(int i = 0; i < n; i++) 50 | { 51 | hostX[i] = (float)i; 52 | hostY[i] = (float)i; 53 | } 54 | 55 | // Allocate the device pointers, and copy the 56 | // host input data to the device 57 | CUdeviceptr deviceX = new CUdeviceptr(); 58 | cuMemAlloc(deviceX, n * Sizeof.FLOAT); 59 | cuMemcpyHtoD(deviceX, Pointer.to(hostX), n * Sizeof.FLOAT); 60 | 61 | CUdeviceptr deviceY = new CUdeviceptr(); 62 | cuMemAlloc(deviceY, n * Sizeof.FLOAT); 63 | cuMemcpyHtoD(deviceY, Pointer.to(hostY), n * Sizeof.FLOAT); 64 | 65 | CUdeviceptr deviceResult = new CUdeviceptr(); 66 | cuMemAlloc(deviceResult, n * Sizeof.FLOAT); 67 | 68 | // Perform the vector operations 69 | VecFloat.cos(n, deviceX, deviceX); // x = cos(x) 70 | VecFloat.mul(n, deviceX, deviceX, deviceX); // x = x*x 71 | VecFloat.sin(n, deviceY, deviceY); // y = sin(y) 72 | VecFloat.mul(n, deviceY, deviceY, deviceY); // y = y*y 73 | VecFloat.add(n, deviceResult, deviceX, deviceY); // result = x+y 74 | 75 | // Allocate host output memory and copy the device output 76 | // to the host. 77 | float hostResult[] = new float[n]; 78 | cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, n * Sizeof.FLOAT); 79 | 80 | // Verify the result 81 | boolean passed = true; 82 | for(int i = 0; i < n; i++) 83 | { 84 | float expected = (float)( 85 | Math.cos(hostX[i])*Math.cos(hostX[i])+ 86 | Math.sin(hostY[i])*Math.sin(hostY[i])); 87 | if (Math.abs(hostResult[i] - expected) > 1e-5) 88 | { 89 | System.out.println( 90 | "At index "+i+ " found "+hostResult[i]+ 91 | " but expected "+expected); 92 | passed = false; 93 | break; 94 | } 95 | } 96 | System.out.println("Test "+(passed?"PASSED":"FAILED")); 97 | 98 | // Clean up. 
99 | cuMemFree(deviceX); 100 | cuMemFree(deviceY); 101 | cuMemFree(deviceResult); 102 | VecFloat.shutdown(); 103 | } 104 | 105 | } 106 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/vec/samples/VecDoubleSample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCudaVec - Vector operations for JCuda 3 | * http://www.jcuda.org 4 | * 5 | * Copyright (c) 2013-2015 Marco Hutter - http://www.jcuda.org 6 | */ 7 | package jcuda.vec.samples; 8 | 9 | import static jcuda.driver.JCudaDriver.cuCtxCreate; 10 | import static jcuda.driver.JCudaDriver.cuDeviceGet; 11 | import static jcuda.driver.JCudaDriver.cuInit; 12 | import static jcuda.driver.JCudaDriver.cuMemAlloc; 13 | import static jcuda.driver.JCudaDriver.cuMemFree; 14 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH; 15 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoD; 16 | import jcuda.Pointer; 17 | import jcuda.Sizeof; 18 | import jcuda.driver.CUcontext; 19 | import jcuda.driver.CUdevice; 20 | import jcuda.driver.CUdeviceptr; 21 | import jcuda.driver.JCudaDriver; 22 | import jcuda.vec.VecDouble; 23 | import jcuda.vec.VecFloat; 24 | 25 | /** 26 | * A sample showing how to use the JCuda vector library 27 | */ 28 | public class VecDoubleSample 29 | { 30 | public static void main(String[] args) 31 | { 32 | // Enable exceptions and omit all subsequent error checks 33 | JCudaDriver.setExceptionsEnabled(true); 34 | 35 | // Initialize the driver and create a context for the first device. 36 | cuInit(0); 37 | CUdevice device = new CUdevice(); 38 | cuDeviceGet(device, 0); 39 | CUcontext context = new CUcontext(); 40 | cuCtxCreate(context, 0, device); 41 | 42 | // Afterwards, initialize the vector library, which will 43 | // attach to the current context 44 | VecDouble.init(); 45 | 46 | // Allocate and fill the host input data 47 | int n = 50000; 48 | double hostX[] = new double[n]; 49 | double hostY[] = new double[n]; 50 | for(int i = 0; i < n; i++) 51 | { 52 | hostX[i] = (double)i; 53 | hostY[i] = (double)i; 54 | } 55 | 56 | // Allocate the device pointers, and copy the 57 | // host input data to the device 58 | CUdeviceptr deviceX = new CUdeviceptr(); 59 | cuMemAlloc(deviceX, n * Sizeof.DOUBLE); 60 | cuMemcpyHtoD(deviceX, Pointer.to(hostX), n * Sizeof.DOUBLE); 61 | 62 | CUdeviceptr deviceY = new CUdeviceptr(); 63 | cuMemAlloc(deviceY, n * Sizeof.DOUBLE); 64 | cuMemcpyHtoD(deviceY, Pointer.to(hostY), n * Sizeof.DOUBLE); 65 | 66 | CUdeviceptr deviceResult = new CUdeviceptr(); 67 | cuMemAlloc(deviceResult, n * Sizeof.DOUBLE); 68 | 69 | // Perform the vector operations 70 | VecDouble.cos(n, deviceX, deviceX); // x = cos(x) 71 | VecDouble.mul(n, deviceX, deviceX, deviceX); // x = x*x 72 | VecDouble.sin(n, deviceY, deviceY); // y = sin(y) 73 | VecDouble.mul(n, deviceY, deviceY, deviceY); // y = y*y 74 | VecDouble.add(n, deviceResult, deviceX, deviceY); // result = x+y 75 | 76 | // Allocate host output memory and copy the device output 77 | // to the host. 
78 | double hostResult[] = new double[n]; 79 | cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, n * Sizeof.DOUBLE); 80 | 81 | // Verify the result 82 | boolean passed = true; 83 | for(int i = 0; i < n; i++) 84 | { 85 | double expected = 86 | Math.cos(hostX[i])*Math.cos(hostX[i])+ 87 | Math.sin(hostY[i])*Math.sin(hostY[i]); 88 | if (Math.abs(hostResult[i] - expected) > 1e-14) 89 | { 90 | System.out.println( 91 | "At index "+i+ " found "+hostResult[i]+ 92 | " but expected "+expected); 93 | passed = false; 94 | break; 95 | } 96 | } 97 | System.out.println("Test "+(passed?"PASSED":"FAILED")); 98 | 99 | // Clean up. 100 | cuMemFree(deviceX); 101 | cuMemFree(deviceY); 102 | cuMemFree(deviceResult); 103 | VecDouble.shutdown(); 104 | } 105 | 106 | } 107 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/runtime/samples/JCudaRuntimeMappedMemory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.runtime.samples; 7 | 8 | import static jcuda.jcublas.JCublas2.cublasCreate; 9 | import static jcuda.jcublas.JCublas2.cublasDestroy; 10 | import static jcuda.jcublas.JCublas2.cublasSscal; 11 | import static jcuda.runtime.JCuda.cudaDeviceMapHost; 12 | import static jcuda.runtime.JCuda.cudaDeviceSynchronize; 13 | import static jcuda.runtime.JCuda.cudaFreeHost; 14 | import static jcuda.runtime.JCuda.cudaGetDeviceProperties; 15 | import static jcuda.runtime.JCuda.cudaHostAlloc; 16 | import static jcuda.runtime.JCuda.cudaHostAllocMapped; 17 | import static jcuda.runtime.JCuda.cudaHostGetDevicePointer; 18 | import static jcuda.runtime.JCuda.cudaSetDeviceFlags; 19 | 20 | import java.nio.ByteBuffer; 21 | import java.nio.ByteOrder; 22 | import java.nio.FloatBuffer; 23 | 24 | import jcuda.Pointer; 25 | import jcuda.Sizeof; 26 | import jcuda.jcublas.JCublas2; 27 | import jcuda.jcublas.cublasHandle; 28 | import jcuda.runtime.JCuda; 29 | import jcuda.runtime.cudaDeviceProp; 30 | 31 | /** 32 | * An example showing how to use mapped memory in JCuda. Host memory is 33 | * allocated and mapped to the device. There, it is modified with a 34 | * runtime library function (CUBLAS, for example), which then 35 | * effectively writes to host memory. 
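 * Note that each device-side access to mapped memory travels over the
 * host interconnect, so this zero-copy approach mainly pays off for
 * data that is read or written only once.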
36 | */ 37 | public class JCudaRuntimeMappedMemory 38 | { 39 | /** 40 | * Entry point of this sample 41 | * 42 | * @param args Not used 43 | */ 44 | public static void main(String args[]) 45 | { 46 | // Enable exceptions to quickly be informed about errors in this test 47 | JCuda.setExceptionsEnabled(true); 48 | JCublas2.setExceptionsEnabled(true); 49 | 50 | // Check if the device supports mapped host memory 51 | cudaDeviceProp deviceProperties = new cudaDeviceProp(); 52 | cudaGetDeviceProperties(deviceProperties, 0); 53 | if (deviceProperties.canMapHostMemory == 0) 54 | { 55 | System.err.println("This device can not map host memory"); 56 | System.err.println(deviceProperties.toFormattedString()); 57 | return; 58 | } 59 | 60 | // Set the flag indicating that mapped memory will be used 61 | cudaSetDeviceFlags(cudaDeviceMapHost); 62 | 63 | // Allocate mappable host memory 64 | int n = 5; 65 | Pointer hostPointer = new Pointer(); 66 | cudaHostAlloc(hostPointer, n * Sizeof.FLOAT, cudaHostAllocMapped); 67 | 68 | // Create a device pointer mapping the host memory 69 | Pointer devicePointer = new Pointer(); 70 | cudaHostGetDevicePointer(devicePointer, hostPointer, 0); 71 | 72 | // Obtain a ByteBuffer for accessing the data in the host 73 | // pointer. Modifications in this ByteBuffer will be 74 | // visible in the device memory. 75 | ByteBuffer byteBuffer = hostPointer.getByteBuffer(0, n * Sizeof.FLOAT); 76 | 77 | // Set the byte order of the ByteBuffer 78 | byteBuffer.order(ByteOrder.nativeOrder()); 79 | 80 | // For convenience, view the ByteBuffer as a FloatBuffer 81 | // and fill it with some sample data 82 | FloatBuffer floatBuffer = byteBuffer.asFloatBuffer(); 83 | System.out.print("Input : "); 84 | for (int i = 0; i < n; i++) 85 | { 86 | floatBuffer.put(i, (float) i); 87 | System.out.print(floatBuffer.get(i) + ", "); 88 | } 89 | System.out.println(); 90 | 91 | // Apply a CUBLAS routine to the device pointer. This will 92 | // modify the host data, which was mapped to the device. 93 | cublasHandle handle = new cublasHandle(); 94 | cublasCreate(handle); 95 | Pointer two = Pointer.to(new float[] { 2.0f }); 96 | cublasSscal(handle, n, two, devicePointer, 1); 97 | cublasDestroy(handle); 98 | cudaDeviceSynchronize(); 99 | 100 | // Print the contents of the host memory after the 101 | // modification via the mapped pointer. 
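// (What to expect here, assuming the cublasSscal call above scales by
// 2.0f: the buffer should now contain 0.0, 2.0, 4.0, 6.0, 8.0, although
// no explicit cudaMemcpy was issued. A minimal host-side check, as a
// sketch, could be:
//
//     for (int i = 0; i < n; i++)
//     {
//         assert Math.abs(floatBuffer.get(i) - 2.0f * i) < 1e-6f;
//     }
// )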
102 | System.out.print("Output: "); 103 | for (int i = 0; i < n; i++) 104 | { 105 | System.out.print(floatBuffer.get(i) + ", "); 106 | } 107 | System.out.println(); 108 | 109 | // Clean up 110 | cudaFreeHost(hostPointer); 111 | } 112 | } -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/driver/samples/JCudaConstantMemoryExample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2018 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.driver.samples; 7 | import static jcuda.driver.JCudaDriver.cuCtxCreate; 8 | import static jcuda.driver.JCudaDriver.cuCtxSynchronize; 9 | import static jcuda.driver.JCudaDriver.cuDeviceGet; 10 | import static jcuda.driver.JCudaDriver.cuInit; 11 | import static jcuda.driver.JCudaDriver.cuLaunchKernel; 12 | import static jcuda.driver.JCudaDriver.cuMemAlloc; 13 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH; 14 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoD; 15 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction; 16 | import static jcuda.driver.JCudaDriver.*; 17 | 18 | import java.io.IOException; 19 | import java.util.Arrays; 20 | 21 | import jcuda.Pointer; 22 | import jcuda.Sizeof; 23 | import jcuda.driver.CUcontext; 24 | import jcuda.driver.CUdevice; 25 | import jcuda.driver.CUdeviceptr; 26 | import jcuda.driver.CUfunction; 27 | import jcuda.driver.CUmodule; 28 | import jcuda.driver.JCudaDriver; 29 | import jcuda.samples.utils.JCudaSamplesUtils; 30 | 31 | /** 32 | * An example showing how to use constant memory in kernels.
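 * <p>
 * A sketch of the kernel side (the actual JCudaConstantMemoryKernel.cu is
 * not part of this listing; the array name has to match the one that is
 * passed to cuModuleGetGlobal below, and the array size is an assumption):
 * <pre><code>
 * __constant__ float constantMemoryData[256];
 * extern "C" __global__ void constantMemoryKernel(float *data, int n)
 * {
 *     int i = blockIdx.x * blockDim.x + threadIdx.x;
 *     if (n > i) data[i] = constantMemoryData[i];
 * }
 * </code></pre>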
33 | */ 34 | public class JCudaConstantMemoryExample 35 | { 36 | public static void main(String[] args) throws IOException 37 | { 38 | // Enable exceptions and omit all subsequent error checks 39 | JCudaDriver.setExceptionsEnabled(true); 40 | 41 | // Initialize the driver and create a context for the first device. 42 | cuInit(0); 43 | CUdevice device = new CUdevice(); 44 | cuDeviceGet(device, 0); 45 | CUcontext context = new CUcontext(); 46 | cuCtxCreate(context, 0, device); 47 | 48 | // Create the PTX file by calling the NVCC 49 | String ptxFileName = JCudaSamplesUtils.preparePtxFile( 50 | "src/main/resources/kernels/JCudaConstantMemoryKernel.cu"); 51 | 52 | // Load the PTX file. 53 | CUmodule module = new CUmodule(); 54 | cuModuleLoad(module, ptxFileName); 55 | 56 | // Obtain the pointer to the constant memory, and print some info 57 | CUdeviceptr constantMemoryPointer = new CUdeviceptr(); 58 | long constantMemorySizeArray[] = { 0 }; 59 | cuModuleGetGlobal(constantMemoryPointer, constantMemorySizeArray, 60 | module, "constantMemoryData"); 61 | int constantMemorySize = (int)constantMemorySizeArray[0]; 62 | 63 | System.out.println("constantMemoryPointer: " + constantMemoryPointer); 64 | System.out.println("constantMemorySize: " + constantMemorySize); 65 | 66 | // Copy some host data to the constant memory 67 | int numElements = constantMemorySize / Sizeof.FLOAT; 68 | float hostData[] = new float[numElements]; 69 | for (int i = 0; i < numElements; i++) 70 | { 71 | hostData[i] = i; 72 | } 73 | cuMemcpyHtoD(constantMemoryPointer, 74 | Pointer.to(hostData), constantMemorySize); 75 | 76 | // Now use the constant memory in the kernel call: 77 | 78 | // Obtain a function pointer to the "constantMemoryKernel" function. 79 | CUfunction kernel = new CUfunction(); 80 | cuModuleGetFunction(kernel, module, "constantMemoryKernel"); 81 | 82 | // Allocate some device memory 83 | CUdeviceptr deviceData = new CUdeviceptr(); 84 | cuMemAlloc(deviceData, constantMemorySize); 85 | 86 | // Set up the kernel parameters 87 | Pointer kernelParameters = Pointer.to( 88 | Pointer.to(deviceData), 89 | Pointer.to(new int[]{numElements}) 90 | ); 91 | 92 | // Launch the kernel 93 | int blockSizeX = numElements; 94 | int gridSizeX = 1; 95 | cuLaunchKernel(kernel, 96 | gridSizeX, 1, 1, 97 | blockSizeX, 1, 1, 98 | 0, null, 99 | kernelParameters, null 100 | ); 101 | cuCtxSynchronize(); 102 | 103 | // Copy the result back to the host, and verify that it is 104 | // the same that was copied to the constant memory 105 | float hostResult[] = new float[numElements]; 106 | cuMemcpyDtoH(Pointer.to(hostResult), deviceData, constantMemorySize); 107 | 108 | boolean passed = Arrays.equals(hostData, hostResult); 109 | System.out.println("Test " + (passed ? 
"PASSED" : "FAILED")); 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/jcublas/samples/JCublas2Sample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.jcublas.samples; 7 | 8 | import static jcuda.jcublas.JCublas2.cublasCreate; 9 | import static jcuda.jcublas.JCublas2.cublasDestroy; 10 | import static jcuda.jcublas.JCublas2.cublasGetVector; 11 | import static jcuda.jcublas.JCublas2.cublasSetVector; 12 | import static jcuda.jcublas.JCublas2.cublasSgemm; 13 | import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N; 14 | import static jcuda.runtime.JCuda.cudaFree; 15 | import static jcuda.runtime.JCuda.cudaMalloc; 16 | 17 | import jcuda.Pointer; 18 | import jcuda.Sizeof; 19 | import jcuda.jcublas.JCublas2; 20 | import jcuda.jcublas.cublasHandle; 21 | import jcuda.samples.utils.JCudaSamplesUtils; 22 | 23 | /** 24 | * This is a sample class demonstrating the application of JCublas2 for 25 | * performing a BLAS 'sgemm' operation, i.e. for computing the matrix
26 | * C = alpha * A * B + beta * C
27 | * for single-precision floating point values alpha and beta, and matrices
28 | * A, B and C of size 1000x1000.
29 | */
30 | public class JCublas2Sample
31 | {
32 |     public static void main(String args[])
33 |     {
34 |         JCublas2.setExceptionsEnabled(true);
35 |         testSgemm(1000);
36 |     }
37 | 
38 |     /**
39 |      * Test the JCublas sgemm operation for matrices of size n x n
40 |      *
41 |      * @param n The matrix size
42 |      */
43 |     public static void testSgemm(int n)
44 |     {
45 |         float alpha = 0.3f;
46 |         float beta = 0.7f;
47 |         int nn = n * n;
48 | 
49 |         System.out.println("Creating input data...");
50 |         float h_A[] = JCudaSamplesUtils.createRandomFloatData(nn);
51 |         float h_B[] = JCudaSamplesUtils.createRandomFloatData(nn);
52 |         float h_C[] = JCudaSamplesUtils.createRandomFloatData(nn);
53 |         float h_C_ref[] = h_C.clone();
54 | 
55 |         System.out.println("Performing Sgemm with Java...");
56 |         sgemmJava(n, alpha, h_A, h_B, beta, h_C_ref);
57 | 
58 |         System.out.println("Performing Sgemm with JCublas...");
59 |         sgemmJCublas(n, alpha, h_A, h_B, beta, h_C);
60 | 
61 |         boolean passed = JCudaSamplesUtils.equalByNorm(h_C, h_C_ref);
62 |         System.out.println("testSgemm " + (passed ? "PASSED" : "FAILED"));
63 |     }
64 | 
65 |     /**
66 |      * Implementation of sgemm using JCublas
67 |      */
68 |     private static void sgemmJCublas(
69 |         int n, float alpha, float A[], float B[], float beta, float C[])
70 |     {
71 |         int nn = n * n;
72 | 
73 |         // Create a CUBLAS handle
74 |         cublasHandle handle = new cublasHandle();
75 |         cublasCreate(handle);
76 | 
77 |         // Allocate memory on the device
78 |         Pointer d_A = new Pointer();
79 |         Pointer d_B = new Pointer();
80 |         Pointer d_C = new Pointer();
81 |         cudaMalloc(d_A, nn * Sizeof.FLOAT);
82 |         cudaMalloc(d_B, nn * Sizeof.FLOAT);
83 |         cudaMalloc(d_C, nn * Sizeof.FLOAT);
84 | 
85 |         // Copy the memory from the host to the device
86 |         cublasSetVector(nn, Sizeof.FLOAT, Pointer.to(A), 1, d_A, 1);
87 |         cublasSetVector(nn, Sizeof.FLOAT, Pointer.to(B), 1, d_B, 1);
88 |         cublasSetVector(nn, Sizeof.FLOAT, Pointer.to(C), 1, d_C, 1);
89 | 
90 |         // Execute sgemm
91 |         Pointer pAlpha = Pointer.to(new float[] { alpha });
92 |         Pointer pBeta = Pointer.to(new float[] { beta });
93 |         cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, pAlpha, d_A, n,
94 |             d_B, n, pBeta, d_C, n);
95 | 
96 |         // Copy the result from the device to the host
97 |         cublasGetVector(nn, Sizeof.FLOAT, d_C, 1, Pointer.to(C), 1);
98 | 
99 |         // Clean up
100 |         cudaFree(d_A);
101 |         cudaFree(d_B);
102 |         cudaFree(d_C);
103 |         cublasDestroy(handle);
104 |     }
105 | 
106 |     /**
107 |      * Simple implementation of sgemm, using plain Java
108 |      */
109 |     private static void sgemmJava(
110 |         int n, float alpha, float A[], float B[], float beta, float C[])
111 |     {
112 |         for (int i = 0; i < n; ++i)
113 |         {
114 |             for (int j = 0; j < n; ++j)
115 |             {
116 |                 float prod = 0;
117 |                 for (int k = 0; k < n; ++k)
118 |                 {
119 |                     prod += A[k * n + i] * B[j * n + k];
120 |                 }
121 |                 C[j * n + i] = alpha * prod + beta * C[j * n + i];
122 |             }
123 |         }
124 |     }
125 | 
126 | }
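Note: both implementations above use column-major storage, as CUBLAS does: element (i, j) of an n x n matrix is at index j * n + i. This is why the plain-Java reference accesses A[k * n + i] and B[j * n + k], and agrees with cublasSgemm without any transposition.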
-------------------------------------------------------------------------------- /JCudaSamples/pom.xml: --------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2 |     xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |     <modelVersion>4.0.0</modelVersion>
4 | 
5 |     <groupId>org.jcuda</groupId>
6 |     <artifactId>jcuda-samples</artifactId>
7 |     <version>0.0.0-SNAPSHOT</version>
8 | 
9 |     <properties>
10 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
11 |         <jcuda.jcudaVersion>11.2.0</jcuda.jcudaVersion>
12 |     </properties>
13 | 
14 |     <name>JCudaSamples</name>
15 |     <description>Samples for JCuda</description>
16 |     <url>http://www.jcuda.org</url>
17 | 
18 |     <licenses>
19 |         <license>
20 |             <name>MIT</name>
21 |             <url>http://jcuda.org/License.txt</url>
22 |         </license>
23 |     </licenses>
24 | 
25 |     <developers>
26 |         <developer>
27 |             <name>Marco Hutter</name>
28 |             <email>jcuda@jcuda.org</email>
29 |             <roles>
30 |                 <role>developer</role>
31 |             </roles>
32 |         </developer>
33 |     </developers>
34 | 
35 |     <repositories>
36 |         <repository>
37 |             <id>localMavenRepository</id>
38 |             <url>file://${basedir}/localMavenRepository</url>
39 |         </repository>
40 |     </repositories>
41 | 
42 |     <dependencies>
43 |         <dependency>
44 |             <groupId>org.jcuda</groupId>
45 |             <artifactId>jcuda</artifactId>
46 |             <version>${jcuda.jcudaVersion}</version>
47 |         </dependency>
48 |         <dependency>
49 |             <groupId>org.jcuda</groupId>
50 |             <artifactId>jcublas</artifactId>
51 |             <version>${jcuda.jcudaVersion}</version>
52 |         </dependency>
53 |         <dependency>
54 |             <groupId>org.jcuda</groupId>
55 |             <artifactId>jcufft</artifactId>
56 |             <version>${jcuda.jcudaVersion}</version>
57 |         </dependency>
58 |         <dependency>
59 |             <groupId>org.jcuda</groupId>
60 |             <artifactId>jcurand</artifactId>
61 |             <version>${jcuda.jcudaVersion}</version>
62 |         </dependency>
63 |         <dependency>
64 |             <groupId>org.jcuda</groupId>
65 |             <artifactId>jcusparse</artifactId>
66 |             <version>${jcuda.jcudaVersion}</version>
67 |         </dependency>
68 |         <dependency>
69 |             <groupId>org.jcuda</groupId>
70 |             <artifactId>jcusolver</artifactId>
71 |             <version>${jcuda.jcudaVersion}</version>
72 |         </dependency>
73 |         <dependency>
74 |             <groupId>org.jcuda</groupId>
75 |             <artifactId>jcudnn</artifactId>
76 |             <version>${jcuda.jcudaVersion}</version>
77 |         </dependency>
78 |         <dependency>
79 |             <groupId>org.jcuda</groupId>
80 |             <artifactId>jcuda-vec</artifactId>
81 |             <version>0.0.2</version>
82 |         </dependency>
83 |         <dependency>
84 |             <groupId>de.javagl</groupId>
85 |             <artifactId>matrixmarketreader</artifactId>
86 |             <version>0.0.1-SNAPSHOT</version>
87 |         </dependency>
88 |         <dependency>
89 |             <groupId>org.jcuda</groupId>
90 |             <artifactId>jcuda-matrix-utils</artifactId>
91 |             <version>0.0.1-SNAPSHOT</version>
92 |         </dependency>
93 |         <dependency>
94 |             <groupId>com.github.wendykierp</groupId>
95 |             <artifactId>JTransforms</artifactId>
96 |             <version>3.1</version>
97 |             <classifier>with-dependencies</classifier>
98 |         </dependency>
99 |         <dependency>
100 |             <groupId>org.jogamp.gluegen</groupId>
101 |             <artifactId>gluegen-rt-main</artifactId>
102 |             <version>2.3.2</version>
103 |         </dependency>
104 |         <dependency>
105 |             <groupId>org.jogamp.jogl</groupId>
106 |             <artifactId>jogl-all-main</artifactId>
107 |             <version>2.3.2</version>
108 |         </dependency>
109 |         <dependency>
110 |             <groupId>org.lwjgl.lwjgl</groupId>
111 |             <artifactId>lwjgl</artifactId>
112 |             <version>2.9.3</version>
113 |         </dependency>
114 |     </dependencies>
115 | 
116 |     <build>
117 |         <plugins>
118 |             <plugin>
119 |                 <artifactId>maven-compiler-plugin</artifactId>
120 |                 <version>2.3.2</version>
121 |                 <configuration>
122 |                     <source>1.6</source>
123 |                     <target>1.6</target>
124 |                 </configuration>
125 |             </plugin>
126 |         </plugins>
127 |     </build>
128 | 
129 | </project>
-------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/driver/samples/JCudaDynamicParallelism.java: --------------------------------------------------------------------------------
1 | /*
2 |  * JCuda - Java bindings for NVIDIA CUDA
3 |  *
4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
5 |  */
6 | package jcuda.driver.samples;
7 | 
8 | import static jcuda.driver.JCudaDriver.cuCtxCreate;
9 | import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
10 | import static jcuda.driver.JCudaDriver.cuDeviceGet;
11 | import static jcuda.driver.JCudaDriver.cuInit;
12 | import static jcuda.driver.JCudaDriver.cuLaunchKernel;
13 | import static jcuda.driver.JCudaDriver.cuMemAlloc;
14 | import static jcuda.driver.JCudaDriver.cuMemFree;
15 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
16 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
17 | import static jcuda.driver.JCudaDriver.cuModuleLoad;
18 | 
19 | import java.util.Arrays;
20 | 
21 | import jcuda.Pointer;
22 | import jcuda.Sizeof;
23 | import jcuda.driver.CUcontext;
24 | import jcuda.driver.CUdevice;
25 | import jcuda.driver.CUdeviceptr;
26 | import jcuda.driver.CUfunction;
27 | import jcuda.driver.CUmodule;
28 | import jcuda.driver.JCudaDriver;
29 | import jcuda.samples.utils.JCudaSamplesUtils;
30 | 
31 | /**
32 |  * A simple example showing how a kernel with dynamic parallelism
33 |  * can be loaded from a CUBIN file and launched.
34 |  */
35 | public class JCudaDynamicParallelism
36 | {
37 |     public static void main(String[] args)
38 |     {
39 |         JCudaDriver.setExceptionsEnabled(true);
40 | 
41 |         // Initialize a context for the first device
42 |         cuInit(0);
43 |         CUcontext context = new CUcontext();
44 |         CUdevice device = new CUdevice();
45 |         cuDeviceGet(device, 0);
46 |         cuCtxCreate(context, 0, device);
47 | 
48 |         // Create the CUBIN file by calling the NVCC.
49 |         // See the prepareDefaultCubinFile method for the details about
50 |         // the NVCC parameters that are used here.
51 |         String cubinFileName = JCudaSamplesUtils.prepareDefaultCubinFile(
52 |             "src/main/resources/kernels/JCudaDynamicParallelismKernel.cu");
53 | 
54 |         // Load the CUBIN file
55 |         CUmodule module = new CUmodule();
56 |         cuModuleLoad(module, cubinFileName);
57 | 
58 |         // Obtain a function pointer to the "parentKernel" function.
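        // A sketch of what the kernel file is expected to contain (the
        // actual JCudaDynamicParallelismKernel.cu is not part of this
        // listing; the signatures are assumptions, derived from the
        // launch and verification code below):
        //
        //     extern "C" __global__ void childKernel(float *data, int parent)
        //     {
        //         data[parent * blockDim.x + threadIdx.x] =
        //             parent + 0.1f * threadIdx.x;
        //     }
        //
        //     extern "C" __global__ void parentKernel(int n, float *data)
        //     {
        //         childKernel<<<1, 8>>>(data, threadIdx.x);
        //     }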
59 |         CUfunction function = new CUfunction();
60 |         cuModuleGetFunction(function, module, "parentKernel");
61 | 
62 |         // Define the nesting structure.
63 |         //
64 |         // NOTE: The number of child threads MUST match the value that
65 |         // is used in the kernel, for the childKernel<<<1, 8>>> call!
66 |         //
67 |         int numParentThreads = 8;
68 |         int numChildThreads = 8;
69 | 
70 |         // Allocate the device data that will be filled by the kernel
71 |         int numElements = numParentThreads * numChildThreads;
72 |         CUdeviceptr deviceData = new CUdeviceptr();
73 |         cuMemAlloc(deviceData, numElements * Sizeof.FLOAT);
74 | 
75 |         // Set up the kernel parameters: A pointer to an array
76 |         // of pointers which point to the actual values.
77 |         Pointer kernelParameters = Pointer.to(
78 |             Pointer.to(new int[] { numElements }),
79 |             Pointer.to(deviceData)
80 |         );
81 | 
82 |         // Call the kernel function, with one parent thread per child block
83 |         int blockSizeX = numParentThreads;
84 |         int gridSizeX = (numParentThreads + blockSizeX - 1) / blockSizeX;
85 |         cuLaunchKernel(function,
86 |             gridSizeX, 1, 1,       // Grid dimension
87 |             blockSizeX, 1, 1,      // Block dimension
88 |             0, null,               // Shared memory size and stream
89 |             kernelParameters, null // Kernel- and extra parameters
90 |         );
91 |         cuCtxSynchronize();
92 | 
93 |         // Copy the device data to the host
94 |         float hostData[] = new float[numElements];
95 |         // (No initialization of hostData is required here:
96 |         // the following cuMemcpyDtoH call overwrites every
97 |         // element with the data that was computed by the
98 |         // kernels on the device)
99 |         cuMemcpyDtoH(Pointer.to(hostData),
100 |             deviceData, numElements * Sizeof.FLOAT);
101 | 
102 |         // Compare the host data with the expected values
103 |         float hostDataRef[] = new float[numElements];
104 |         for(int i = 0; i < numParentThreads; i++)
105 |         {
106 |             for (int j=0; j < numChildThreads; j++)
107 |             {
108 |                 hostDataRef[i * numChildThreads + j] = i + 0.1f * j;
109 |             }
110 |         }
111 |         System.out.println("Result: "+Arrays.toString(hostData));
112 |         boolean passed = Arrays.equals(hostData, hostDataRef);
113 |         System.out.println(passed ? "PASSED" : "FAILED");
114 | 
115 |         // Clean up.
116 |         cuMemFree(deviceData);
117 |     }
118 | }
119 | 
120 | 
-------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/jcublas/samples/JCublas2PointerModes.java: --------------------------------------------------------------------------------
1 | /*
2 |  * JCuda - Java bindings for NVIDIA CUDA
3 |  *
4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
5 |  */
6 | package jcuda.jcublas.samples;
7 | 
8 | import static jcuda.jcublas.JCublas2.cublasCreate;
9 | import static jcuda.jcublas.JCublas2.cublasDestroy;
10 | import static jcuda.jcublas.JCublas2.cublasSdot;
11 | import static jcuda.jcublas.JCublas2.cublasSetPointerMode;
12 | import static jcuda.jcublas.cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE;
13 | import static jcuda.jcublas.cublasPointerMode.CUBLAS_POINTER_MODE_HOST;
14 | import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
15 | import static jcuda.runtime.JCuda.cudaFree;
16 | import static jcuda.runtime.JCuda.cudaMalloc;
17 | import static jcuda.runtime.JCuda.cudaMemcpy;
18 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
19 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
20 | 
21 | import java.util.Arrays;
22 | 
23 | import jcuda.Pointer;
24 | import jcuda.Sizeof;
25 | import jcuda.jcublas.JCublas2;
26 | import jcuda.jcublas.cublasHandle;
27 | import jcuda.runtime.JCuda;
28 | 
29 | 
30 | /**
31 |  * A sample demonstrating the different pointer modes for CUBLAS 2.
32 | * With CUBLAS 2, functions may receive pointers as arguments which are 33 | * either used as input parameters or will store results. These pointers 34 | * may either be pointers to host or to device memory. This sample shows 35 | * how to obtain the result of a 'dot' operation in host- or device 36 | * memory. 37 | */ 38 | public class JCublas2PointerModes 39 | { 40 | /** 41 | * Entry point of this sample 42 | * 43 | * @param args Not used 44 | */ 45 | public static void main(String[] args) 46 | { 47 | // Enable exceptions and omit subsequent error checks 48 | JCublas2.setExceptionsEnabled(true); 49 | JCuda.setExceptionsEnabled(true); 50 | 51 | // Create the input data: A vector containing the 52 | // value 1.0 exactly n times. 53 | int n = 1000000; 54 | float hostData[] = new float[n]; 55 | Arrays.fill(hostData, 1.0f); 56 | 57 | // Allocate device memory, and copy the input data to the device 58 | Pointer deviceData = new Pointer(); 59 | cudaMalloc(deviceData, n * Sizeof.FLOAT); 60 | cudaMemcpy(deviceData, Pointer.to(hostData), n * Sizeof.FLOAT, 61 | cudaMemcpyHostToDevice); 62 | 63 | // Create a CUBLAS handle 64 | cublasHandle handle = new cublasHandle(); 65 | cublasCreate(handle); 66 | 67 | 68 | // Execute the 'dot' function in HOST pointer mode: 69 | // The result will be written to a pointer that 70 | // points to host memory. 71 | 72 | // Set the pointer mode to HOST 73 | cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST); 74 | 75 | // Prepare the pointer for the result in HOST memory 76 | float hostResult[] = { -1.0f }; 77 | Pointer hostResultPointer = Pointer.to(hostResult); 78 | 79 | // Execute the 'dot' function 80 | long beforeHostCall = System.nanoTime(); 81 | cublasSdot(handle, n, deviceData, 1, deviceData, 1, hostResultPointer); 82 | long afterHostCall = System.nanoTime(); 83 | 84 | // Print the result and timing information 85 | double hostDuration = (afterHostCall - beforeHostCall) / 1e6; 86 | System.out.println("Host call duration: " + hostDuration + " ms"); 87 | System.out.println("Result: " + hostResult[0]); 88 | 89 | 90 | // Execute the 'dot' function in DEVICE pointer mode: 91 | // The result will be written to a pointer that 92 | // points to device memory. 
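        // (Device pointer mode is useful when the result is consumed by
        // subsequent operations on the GPU: the call returns without
        // waiting for the result, and no host round trip is required.)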
93 | 94 | // Set the pointer mode to DEVICE 95 | cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); 96 | 97 | // Prepare the pointer for the result in DEVICE memory 98 | Pointer deviceResultPointer = new Pointer(); 99 | cudaMalloc(deviceResultPointer, Sizeof.FLOAT); 100 | 101 | // Execute the 'dot' function 102 | long beforeDeviceCall = System.nanoTime(); 103 | cublasSdot(handle, n, deviceData, 1, deviceData, 1, 104 | deviceResultPointer); 105 | long afterDeviceCall = System.nanoTime(); 106 | 107 | // Synchronize in order to wait for the result to 108 | // be available (note that this is done implicitly 109 | // when cudaMemcpy is called) 110 | cudaDeviceSynchronize(); 111 | long afterDeviceSync = System.nanoTime(); 112 | 113 | // Copy the result from the device to the host 114 | float deviceResult[] = { -1.0f }; 115 | cudaMemcpy(Pointer.to(deviceResult), deviceResultPointer, 116 | Sizeof.FLOAT, cudaMemcpyDeviceToHost); 117 | 118 | // Print the result and timing information 119 | double deviceCallDuration = (afterDeviceCall - beforeDeviceCall) / 1e6; 120 | double deviceFullDuration = (afterDeviceSync - beforeDeviceCall) / 1e6; 121 | System.out .println( 122 | "Device call duration: " + deviceCallDuration + " ms"); 123 | System.out.println( 124 | "Device full duration: " + deviceFullDuration + " ms"); 125 | System.out.println("Result: " + deviceResult[0]); 126 | 127 | // Clean up 128 | cudaFree(deviceData); 129 | cublasDestroy(handle); 130 | } 131 | 132 | 133 | } 134 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/jcublas/samples/JCublas2SgemmExSample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.jcublas.samples; 7 | 8 | import static jcuda.cudaDataType.CUDA_R_32F; 9 | import static jcuda.jcublas.JCublas2.cublasCreate; 10 | import static jcuda.jcublas.JCublas2.cublasDestroy; 11 | import static jcuda.jcublas.JCublas2.cublasGemmEx; 12 | import static jcuda.jcublas.JCublas2.cublasGetVector; 13 | import static jcuda.jcublas.JCublas2.cublasSetVector; 14 | import static jcuda.jcublas.cublasGemmAlgo.CUBLAS_GEMM_ALGO0; 15 | import static jcuda.jcublas.cublasGemmAlgo.CUBLAS_GEMM_ALGO2; 16 | import static jcuda.jcublas.cublasGemmAlgo.CUBLAS_GEMM_ALGO4; 17 | import static jcuda.jcublas.cublasGemmAlgo.CUBLAS_GEMM_ALGO5; 18 | import static jcuda.jcublas.cublasGemmAlgo.CUBLAS_GEMM_ALGO6; 19 | import static jcuda.jcublas.cublasGemmAlgo.CUBLAS_GEMM_ALGO7; 20 | import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N; 21 | import static jcuda.runtime.JCuda.cudaDeviceSynchronize; 22 | import static jcuda.runtime.JCuda.cudaFree; 23 | import static jcuda.runtime.JCuda.cudaMalloc; 24 | 25 | import java.util.Arrays; 26 | import java.util.List; 27 | 28 | import jcuda.Pointer; 29 | import jcuda.Sizeof; 30 | import jcuda.jcublas.JCublas2; 31 | import jcuda.jcublas.cublasHandle; 32 | import jcuda.samples.utils.JCudaSamplesUtils; 33 | 34 | /** 35 | * This is a sample class demonstrating the application of JCublas2 for 36 | * performing a BLAS 'sgemm' operation, i.e. for computing the matrix
37 | * C = alpha * A * B + beta * C
38 |  * for single-precision floating point values alpha and beta, and matrices
39 |  * A, B and C, using the extended CUBLAS GEMM function
40 |  */
41 | public class JCublas2SgemmExSample
42 | {
43 |     public static void main(String args[])
44 |     {
45 |         JCublas2.setExceptionsEnabled(true);
46 |         testSgemm(2000);
47 |     }
48 | 
49 |     // The list of CUBLAS GEMM algorithms to use. Note that the set of
50 |     // supported algorithms will likely depend on the platform, the
51 |     // size of the matrix, and other factors.
52 |     private static final List<Integer> GEMM_ALGORITHMS = Arrays.asList(
53 |         CUBLAS_GEMM_ALGO2,
54 |         CUBLAS_GEMM_ALGO4,
55 |         CUBLAS_GEMM_ALGO5,
56 |         CUBLAS_GEMM_ALGO6,
57 |         CUBLAS_GEMM_ALGO7
58 |     );
59 |     private static int GEMM_ALGO = CUBLAS_GEMM_ALGO0;
60 | 
61 |     /**
62 |      * Test the JCublas sgemm operation for matrices of size n x n
63 |      *
64 |      * @param n The matrix size
65 |      */
66 |     public static void testSgemm(int n)
67 |     {
68 |         float alpha = 0.3f;
69 |         float beta = 0.7f;
70 |         int nn = n * n;
71 | 
72 |         System.out.println("Creating input data...");
73 |         float h_A[] = JCudaSamplesUtils.createRandomFloatData(nn);
74 |         float h_B[] = JCudaSamplesUtils.createRandomFloatData(nn);
75 |         float h_C[] = JCudaSamplesUtils.createRandomFloatData(nn);
76 | 
77 |         System.out.println("Performing Sgemm with JCublas...");
78 |         for (int i : GEMM_ALGORITHMS)
79 |         {
80 |             GEMM_ALGO = i;
81 |             try
82 |             {
83 |                 sgemmJCublas(n, alpha, h_A, h_B, beta, h_C);
84 |             }
85 |             catch (Exception e)
86 |             {
87 |                 e.printStackTrace();
88 |             }
89 |         }
90 | 
91 |     }
92 | 
93 |     /**
94 |      * Implementation of sgemm using JCublas
95 |      */
96 |     private static void sgemmJCublas(
97 |         int n, float alpha, float A[], float B[], float beta, float C[])
98 |     {
99 |         int nn = n * n;
100 | 
101 |         // Create a CUBLAS handle
102 |         cublasHandle handle = new cublasHandle();
103 |         cublasCreate(handle);
104 | 
105 |         // Allocate memory on the device
106 |         Pointer d_A = new Pointer();
107 |         Pointer d_B = new Pointer();
108 |         Pointer d_C = new Pointer();
109 |         cudaMalloc(d_A, nn * Sizeof.FLOAT);
110 |         cudaMalloc(d_B, nn * Sizeof.FLOAT);
111 |         cudaMalloc(d_C, nn * Sizeof.FLOAT);
112 | 
113 |         // Copy the memory from the host to the device
114 |         cublasSetVector(nn, Sizeof.FLOAT, Pointer.to(A), 1, d_A, 1);
115 |         cublasSetVector(nn, Sizeof.FLOAT, Pointer.to(B), 1, d_B, 1);
116 |         cublasSetVector(nn, Sizeof.FLOAT, Pointer.to(C), 1, d_C, 1);
117 | 
118 |         // Execute sgemm
119 |         Pointer pAlpha = Pointer.to(new float[] { alpha });
120 |         Pointer pBeta = Pointer.to(new float[] { beta });
121 | 
122 |         long before = System.nanoTime();
123 | 
124 |         cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
125 |             pAlpha, d_A, CUDA_R_32F, n, d_B, CUDA_R_32F, n,
126 |             pBeta, d_C, CUDA_R_32F, n, CUDA_R_32F, GEMM_ALGO);
127 | 
128 |         cudaDeviceSynchronize();
129 | 
130 |         long after = System.nanoTime();
131 |         double durationMs = (after - before) / 1e6;
132 |         System.out.println(
133 |             "Algorithm " + GEMM_ALGO + " took " + durationMs + " ms");
134 | 
135 |         // Copy the result from the device to the host
136 |         cublasGetVector(nn, Sizeof.FLOAT, d_C, 1, Pointer.to(C), 1);
137 | 
138 |         // Clean up
139 |         cudaFree(d_A);
140 |         cudaFree(d_B);
141 |         cudaFree(d_C);
142 |         cublasDestroy(handle);
143 |     }
144 | 
145 | }
-------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/driver/samples/JCudaVectorAdd.java: --------------------------------------------------------------------------------
1 | /*
2 |  * JCuda - Java bindings for NVIDIA CUDA
3 |  *
4 |  * Copyright 2008-2016
Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.driver.samples; 7 | 8 | import static jcuda.driver.JCudaDriver.cuCtxCreate; 9 | import static jcuda.driver.JCudaDriver.cuCtxSynchronize; 10 | import static jcuda.driver.JCudaDriver.cuDeviceGet; 11 | import static jcuda.driver.JCudaDriver.cuInit; 12 | import static jcuda.driver.JCudaDriver.cuLaunchKernel; 13 | import static jcuda.driver.JCudaDriver.cuMemAlloc; 14 | import static jcuda.driver.JCudaDriver.cuMemFree; 15 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH; 16 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoD; 17 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction; 18 | import static jcuda.driver.JCudaDriver.cuModuleLoad; 19 | 20 | import java.io.IOException; 21 | 22 | import jcuda.Pointer; 23 | import jcuda.Sizeof; 24 | import jcuda.driver.CUcontext; 25 | import jcuda.driver.CUdevice; 26 | import jcuda.driver.CUdeviceptr; 27 | import jcuda.driver.CUfunction; 28 | import jcuda.driver.CUmodule; 29 | import jcuda.driver.JCudaDriver; 30 | import jcuda.samples.utils.JCudaSamplesUtils; 31 | 32 | /** 33 | * This is a sample class demonstrating how to use the JCuda driver 34 | * bindings to load and execute a CUDA vector addition kernel. 35 | * The sample reads a CUDA file, compiles it to a PTX file 36 | * using NVCC, loads the PTX file as a module and executes 37 | * the kernel function. 38 | */ 39 | public class JCudaVectorAdd 40 | { 41 | /** 42 | * Entry point of this sample 43 | * 44 | * @param args Not used 45 | * @throws IOException If an IO error occurs 46 | */ 47 | public static void main(String args[]) throws IOException 48 | { 49 | // Enable exceptions and omit all subsequent error checks 50 | JCudaDriver.setExceptionsEnabled(true); 51 | 52 | // Create the PTX file by calling the NVCC 53 | String ptxFileName = JCudaSamplesUtils.preparePtxFile( 54 | "src/main/resources/kernels/JCudaVectorAddKernel.cu"); 55 | 56 | // Initialize the driver and create a context for the first device. 57 | cuInit(0); 58 | CUdevice device = new CUdevice(); 59 | cuDeviceGet(device, 0); 60 | CUcontext context = new CUcontext(); 61 | cuCtxCreate(context, 0, device); 62 | 63 | // Load the ptx file. 64 | CUmodule module = new CUmodule(); 65 | cuModuleLoad(module, ptxFileName); 66 | 67 | // Obtain a function pointer to the "add" function. 68 | CUfunction function = new CUfunction(); 69 | cuModuleGetFunction(function, module, "add"); 70 | 71 | int numElements = 1024; 72 | 73 | // Allocate and fill the host input data 74 | float hostInputA[] = new float[numElements]; 75 | float hostInputB[] = new float[numElements]; 76 | for(int i = 0; i < numElements; i++) 77 | { 78 | hostInputA[i] = (float)i; 79 | hostInputB[i] = (float)i; 80 | } 81 | 82 | // Allocate the device input data, and copy the 83 | // host input data to the device 84 | CUdeviceptr deviceInputA = new CUdeviceptr(); 85 | cuMemAlloc(deviceInputA, numElements * Sizeof.FLOAT); 86 | cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA), 87 | numElements * Sizeof.FLOAT); 88 | CUdeviceptr deviceInputB = new CUdeviceptr(); 89 | cuMemAlloc(deviceInputB, numElements * Sizeof.FLOAT); 90 | cuMemcpyHtoD(deviceInputB, Pointer.to(hostInputB), 91 | numElements * Sizeof.FLOAT); 92 | 93 | // Allocate device output memory 94 | CUdeviceptr deviceOutput = new CUdeviceptr(); 95 | cuMemAlloc(deviceOutput, numElements * Sizeof.FLOAT); 96 | 97 | // Set up the kernel parameters: A pointer to an array 98 | // of pointers which point to the actual values. 
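        // (This mirrors the void** kernelParams argument of cuLaunchKernel
        // in the CUDA driver API: each argument value is reached through
        // one level of indirection, so the int is wrapped in a one-element
        // array, and each CUdeviceptr is wrapped in another Pointer.)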
99 | Pointer kernelParameters = Pointer.to( 100 | Pointer.to(new int[]{numElements}), 101 | Pointer.to(deviceInputA), 102 | Pointer.to(deviceInputB), 103 | Pointer.to(deviceOutput) 104 | ); 105 | 106 | // Call the kernel function. 107 | int blockSizeX = 256; 108 | int gridSizeX = (int)Math.ceil((double)numElements / blockSizeX); 109 | cuLaunchKernel(function, 110 | gridSizeX, 1, 1, // Grid dimension 111 | blockSizeX, 1, 1, // Block dimension 112 | 0, null, // Shared memory size and stream 113 | kernelParameters, null // Kernel- and extra parameters 114 | ); 115 | cuCtxSynchronize(); 116 | 117 | // Allocate host output memory and copy the device output 118 | // to the host. 119 | float hostOutput[] = new float[numElements]; 120 | cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, 121 | numElements * Sizeof.FLOAT); 122 | 123 | // Verify the result 124 | boolean passed = true; 125 | for(int i = 0; i < numElements; i++) 126 | { 127 | float expected = i+i; 128 | if (Math.abs(hostOutput[i] - expected) > 1e-5) 129 | { 130 | System.out.println( 131 | "At index "+i+ " found "+hostOutput[i]+ 132 | " but expected "+expected); 133 | passed = false; 134 | break; 135 | } 136 | } 137 | System.out.println("Test "+(passed?"PASSED":"FAILED")); 138 | 139 | // Clean up. 140 | cuMemFree(deviceInputA); 141 | cuMemFree(deviceInputB); 142 | cuMemFree(deviceOutput); 143 | } 144 | 145 | 146 | } 147 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/resources/kernels/JCudaDriverVolumeRendererKernel.cu: -------------------------------------------------------------------------------- 1 | // Note: This file is basically the same as in the original NVIDIA CUDA 2 | // "volumeRender" sample, with minor modifications: 3 | // - The host functions and other parts that are not used 4 | // here have been omitted 5 | // - The render function is declared as 6 | // extern "C" 7 | // so that it keeps its original name 8 | 9 | /* 10 | * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. 11 | * 12 | * Please refer to the NVIDIA end user license agreement (EULA) associated 13 | * with this source code for terms and conditions that govern your use of 14 | * this software. Any use, reproduction, disclosure, or distribution of 15 | * this software and related documentation outside the terms of the EULA 16 | * is strictly prohibited. 
17 |  *
18 |  */
19 | 
20 | // Simple 3D volume renderer
21 | 
22 | #ifndef _VOLUMERENDER_KERNEL_CU_
23 | #define _VOLUMERENDER_KERNEL_CU_
24 | 
25 | #include "helper_math.h"
26 | 
27 | typedef unsigned int uint;
28 | typedef unsigned char uchar;
29 | 
30 | cudaArray *d_volumeArray = 0;
31 | cudaArray *d_transferFuncArray;
32 | 
33 | typedef unsigned char VolumeType;
34 | //typedef unsigned short VolumeType;
35 | 
36 | texture<VolumeType, 3, cudaReadModeNormalizedFloat> tex;         // 3D texture
37 | texture<float4, 1, cudaReadModeElementType>         transferTex; // 1D transfer function texture
38 | 
39 | typedef struct
40 | {
41 |     float4 m[3];
42 | } float3x4;
43 | 
44 | __constant__ float3x4 c_invViewMatrix;  // inverse view matrix
45 | 
46 | struct Ray
47 | {
48 |     float3 o;   // origin
49 |     float3 d;   // direction
50 | };
51 | 
52 | // intersect ray with a box
53 | // http://www.siggraph.org/education/materials/HyperGraph/raytrace/rtinter3.htm
54 | 
55 | __device__
56 | int intersectBox(Ray r, float3 boxmin, float3 boxmax, float *tnear, float *tfar)
57 | {
58 |     // compute intersection of ray with all six bbox planes
59 |     float3 invR = make_float3(1.0f) / r.d;
60 |     float3 tbot = invR * (boxmin - r.o);
61 |     float3 ttop = invR * (boxmax - r.o);
62 | 
63 |     // re-order intersections to find smallest and largest on each axis
64 |     float3 tmin = fminf(ttop, tbot);
65 |     float3 tmax = fmaxf(ttop, tbot);
66 | 
67 |     // find the largest tmin and the smallest tmax
68 |     float largest_tmin = fmaxf(fmaxf(tmin.x, tmin.y), fmaxf(tmin.x, tmin.z));
69 |     float smallest_tmax = fminf(fminf(tmax.x, tmax.y), fminf(tmax.x, tmax.z));
70 | 
71 |     *tnear = largest_tmin;
72 |     *tfar = smallest_tmax;
73 | 
74 |     return smallest_tmax > largest_tmin;
75 | }
76 | 
77 | // transform vector by matrix (no translation)
78 | __device__
79 | float3 mul(const float3x4 &M, const float3 &v)
80 | {
81 |     float3 r;
82 |     r.x = dot(v, make_float3(M.m[0]));
83 |     r.y = dot(v, make_float3(M.m[1]));
84 |     r.z = dot(v, make_float3(M.m[2]));
85 |     return r;
86 | }
87 | 
88 | // transform vector by matrix with translation
89 | __device__
90 | float4 mul(const float3x4 &M, const float4 &v)
91 | {
92 |     float4 r;
93 |     r.x = dot(v, M.m[0]);
94 |     r.y = dot(v, M.m[1]);
95 |     r.z = dot(v, M.m[2]);
96 |     r.w = 1.0f;
97 |     return r;
98 | }
99 | 
100 | __device__ uint rgbaFloatToInt(float4 rgba)
101 | {
102 |     rgba.x = __saturatef(rgba.x);   // clamp to [0.0, 1.0]
103 |     rgba.y = __saturatef(rgba.y);
104 |     rgba.z = __saturatef(rgba.z);
105 |     rgba.w = __saturatef(rgba.w);
106 |     return (uint(rgba.w*255)<<24) | (uint(rgba.z*255)<<16) | (uint(rgba.y*255)<<8) | uint(rgba.x*255);
107 | }
108 | 
109 | extern "C"
110 | __global__ void
111 | d_render(uint *d_output, uint imageW, uint imageH,
112 |          float density, float brightness,
113 |          float transferOffset, float transferScale)
114 | {
115 |     const int maxSteps = 500;
116 |     const float tstep = 0.01f;
117 |     const float opacityThreshold = 0.95f;
118 |     const float3 boxMin = make_float3(-1.0f, -1.0f, -1.0f);
119 |     const float3 boxMax = make_float3(1.0f, 1.0f, 1.0f);
120 | 
121 |     uint x = blockIdx.x*blockDim.x + threadIdx.x;
122 |     uint y = blockIdx.y*blockDim.y + threadIdx.y;
123 | 
124 |     if ((x >= imageW) || (y >= imageH)) return;
125 | 
126 |     float u = (x / (float) imageW)*2.0f-1.0f;
127 |     float v = (y / (float) imageH)*2.0f-1.0f;
128 | 
129 |     // calculate eye ray in world space
130 |     Ray eyeRay;
131 |     eyeRay.o = make_float3(mul(c_invViewMatrix, make_float4(0.0f, 0.0f, 0.0f, 1.0f)));
132 |     eyeRay.d = normalize(make_float3(u, v, -2.0f));
133 |     eyeRay.d = mul(c_invViewMatrix, eyeRay.d);
134 | 
135 |     // find intersection with box
136 |     float tnear, tfar;
137 |     int hit = intersectBox(eyeRay, boxMin, boxMax, &tnear, &tfar);
138 | 
139 |     if (!hit) return;
140 | 
141 |     if (tnear < 0.0f) tnear = 0.0f;     // clamp to near plane
142 | 
143 |     // march along ray from front to back, accumulating color
144 |     float4 sum = make_float4(0.0f);
145 |     float t = tnear;
146 |     float3 pos = eyeRay.o + eyeRay.d*tnear;
147 |     float3 step = eyeRay.d*tstep;
148 | 
149 |     for (int i=0; i<maxSteps; i++)
150 |     {
151 |         // read from 3D texture
152 |         // remap position to [0, 1] coordinates
153 |         float sample = tex3D(tex, pos.x*0.5f+0.5f, pos.y*0.5f+0.5f, pos.z*0.5f+0.5f);
154 |         //sample *= 64.0f;    // scale for 10-bit data
155 | 
156 |         // lookup in transfer function texture
157 |         float4 col = tex1D(transferTex, (sample-transferOffset)*transferScale);
158 |         col.w *= density;
159 | 
160 |         // "under" operator for back-to-front blending
161 |         //sum = lerp(sum, col, col.w);
162 | 
163 |         // pre-multiply alpha
164 |         col.x *= col.w;
165 |         col.y *= col.w;
166 |         col.z *= col.w;
167 |         // "over" operator for front-to-back blending
168 |         sum = sum + col*(1.0f - sum.w);
169 | 
170 |         // exit early if opaque
171 |         if (sum.w > opacityThreshold)
172 |             break;
173 | 
174 |         t += tstep;
175 | 
176 |         if (t > tfar) break;
177 | 
178 |         pos += step;
179 |     }
180 | 
181 |     sum *= brightness;
182 | 
183 |     // write output color
184 |     d_output[y*imageW + x] = rgbaFloatToInt(sum);
185 | }
186 | 
187 | 
188 | #endif // #ifndef _VOLUMERENDER_KERNEL_CU_
189 | 
-------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/jcublas/samples/JCublas2SgemmBatched.java: --------------------------------------------------------------------------------
1 | /*
2 |  * JCuda - Java bindings for NVIDIA CUDA
3 |  *
4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
5 |  */
6 | package jcuda.jcublas.samples;
7 | 
8 | import static jcuda.jcublas.JCublas2.cublasCreate;
9 | import static jcuda.jcublas.JCublas2.cublasDestroy;
10 | import static jcuda.jcublas.JCublas2.cublasSgemmBatched;
11 | import static jcuda.runtime.JCuda.cudaFree;
12 | import static jcuda.runtime.JCuda.cudaMalloc;
13 | import static jcuda.runtime.JCuda.cudaMemcpy;
14 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
15 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
16 | 
17 | import jcuda.Pointer;
18 | import jcuda.Sizeof;
19 | import jcuda.jcublas.JCublas2;
20 | import jcuda.jcublas.cublasHandle;
21 | import jcuda.jcublas.cublasOperation;
22 | import jcuda.runtime.JCuda;
23 | import jcuda.samples.utils.JCudaSamplesUtils;
24 | 
25 | /**
26 |  * This is a sample class demonstrating the application of JCublas2 for
27 |  * performing a batched BLAS 'sgemm' operation, i.e. for computing the
28 |  * multiple matrices
29 | * C = alpha * A * B + beta * C
30 | * for single-precision floating point values alpha and beta, and matrices 31 | * A, B and C 32 | */ 33 | class JCublas2SgemmBatched 34 | { 35 | public static void main(String[] args) 36 | { 37 | JCublas2.setExceptionsEnabled(true); 38 | JCuda.setExceptionsEnabled(true); 39 | testSgemmBatched(10, 100); 40 | } 41 | 42 | public static boolean testSgemmBatched(int b, int n) 43 | { 44 | System.out.println("Testing Sgemm with " + b + " batches of size " + n); 45 | 46 | float alpha = 0.3f; 47 | float beta = 0.7f; 48 | int nn = n * n; 49 | 50 | float h_A[][] = new float[b][]; 51 | float h_B[][] = new float[b][]; 52 | float h_C[][] = new float[b][]; 53 | float h_C_ref[][] = new float[b][]; 54 | for (int i = 0; i < b; i++) 55 | { 56 | h_A[i] = JCudaSamplesUtils.createRandomFloatData(nn); 57 | h_B[i] = JCudaSamplesUtils.createRandomFloatData(nn); 58 | h_C[i] = JCudaSamplesUtils.createRandomFloatData(nn); 59 | h_C_ref[i] = h_C[i].clone(); 60 | } 61 | 62 | System.out.println("Performing Sgemm with Java..."); 63 | sgemmJava(n, alpha, h_A, h_B, beta, h_C_ref); 64 | 65 | System.out.println("Performing Sgemm with JCublas2..."); 66 | sgemmBatchedJCublas2(n, alpha, h_A, h_B, beta, h_C); 67 | 68 | // Print the test results 69 | boolean passed = true; 70 | for (int i = 0; i < b; i++) 71 | { 72 | passed &= JCudaSamplesUtils.equalByNorm(h_C[i], h_C_ref[i]); 73 | } 74 | System.out.println(String.format("testSgemm %s", 75 | passed ? "PASSED" : "FAILED")); 76 | return passed; 77 | } 78 | 79 | static void sgemmBatchedJCublas2(int n, float alpha, 80 | float h_A[][], float h_B[][], float beta, float h_C[][]) 81 | { 82 | int nn = n * n; 83 | int b = h_A.length; 84 | Pointer[] h_Aarray = new Pointer[b]; 85 | Pointer[] h_Barray = new Pointer[b]; 86 | Pointer[] h_Carray = new Pointer[b]; 87 | for (int i = 0; i < b; i++) 88 | { 89 | h_Aarray[i] = new Pointer(); 90 | h_Barray[i] = new Pointer(); 91 | h_Carray[i] = new Pointer(); 92 | cudaMalloc(h_Aarray[i], nn * Sizeof.FLOAT); 93 | cudaMalloc(h_Barray[i], nn * Sizeof.FLOAT); 94 | cudaMalloc(h_Carray[i], nn * Sizeof.FLOAT); 95 | cudaMemcpy(h_Aarray[i], Pointer.to(h_A[i]), 96 | nn * Sizeof.FLOAT, cudaMemcpyHostToDevice); 97 | cudaMemcpy(h_Barray[i], Pointer.to(h_B[i]), 98 | nn * Sizeof.FLOAT, cudaMemcpyHostToDevice); 99 | cudaMemcpy(h_Carray[i], Pointer.to(h_C[i]), 100 | nn * Sizeof.FLOAT, cudaMemcpyHostToDevice); 101 | } 102 | Pointer d_Aarray = new Pointer(); 103 | Pointer d_Barray = new Pointer(); 104 | Pointer d_Carray = new Pointer(); 105 | cudaMalloc(d_Aarray, b * Sizeof.POINTER); 106 | cudaMalloc(d_Barray, b * Sizeof.POINTER); 107 | cudaMalloc(d_Carray, b * Sizeof.POINTER); 108 | cudaMemcpy(d_Aarray, Pointer.to(h_Aarray), 109 | b * Sizeof.POINTER, cudaMemcpyHostToDevice); 110 | cudaMemcpy(d_Barray, Pointer.to(h_Barray), 111 | b * Sizeof.POINTER, cudaMemcpyHostToDevice); 112 | cudaMemcpy(d_Carray, Pointer.to(h_Carray), 113 | b * Sizeof.POINTER, cudaMemcpyHostToDevice); 114 | 115 | cublasHandle handle = new cublasHandle(); 116 | cublasCreate(handle); 117 | 118 | cublasSgemmBatched( 119 | handle, 120 | cublasOperation.CUBLAS_OP_N, 121 | cublasOperation.CUBLAS_OP_N, 122 | n, n, n, 123 | Pointer.to(new float[]{ alpha }), 124 | d_Aarray, n, d_Barray, n, 125 | Pointer.to(new float[]{ beta }), 126 | d_Carray, n, b); 127 | 128 | for (int i = 0; i < b; i++) 129 | { 130 | cudaMemcpy(Pointer.to(h_C[i]), h_Carray[i], 131 | nn * Sizeof.FLOAT, cudaMemcpyDeviceToHost); 132 | cudaFree(h_Aarray[i]); 133 | cudaFree(h_Barray[i]); 134 | cudaFree(h_Carray[i]); 135 | } 136 | 
cudaFree(d_Aarray); 137 | cudaFree(d_Barray); 138 | cudaFree(d_Carray); 139 | cublasDestroy(handle); 140 | 141 | } 142 | 143 | static void sgemmJava(int n, float alpha, 144 | float A[][], float B[][], float beta, float C[][]) 145 | { 146 | for (int i = 0; i < A.length; i++) 147 | { 148 | sgemmJava(n, alpha, A[i], B[i], beta, C[i]); 149 | } 150 | } 151 | 152 | static void sgemmJava(int n, float alpha, 153 | float A[], float B[], float beta, float C[]) 154 | { 155 | for (int i = 0; i < n; ++i) 156 | { 157 | for (int j = 0; j < n; ++j) 158 | { 159 | float prod = 0; 160 | for (int k = 0; k < n; ++k) 161 | { 162 | prod += A[k * n + i] * B[j * n + k]; 163 | } 164 | C[j * n + i] = alpha * prod + beta * C[j * n + i]; 165 | } 166 | } 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/nvrtc/samples/JNvrtcLoweredNames.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA driver and runtime API 3 | * http://www.jcuda.org 4 | * 5 | * Copyright 2016 Marco Hutter - http://www.jcuda.org 6 | */ 7 | 8 | package jcuda.nvrtc.samples; 9 | 10 | import static jcuda.driver.JCudaDriver.cuCtxCreate; 11 | import static jcuda.driver.JCudaDriver.cuCtxSynchronize; 12 | import static jcuda.driver.JCudaDriver.cuDeviceGet; 13 | import static jcuda.driver.JCudaDriver.cuInit; 14 | import static jcuda.driver.JCudaDriver.cuLaunchKernel; 15 | import static jcuda.driver.JCudaDriver.cuMemAlloc; 16 | import static jcuda.driver.JCudaDriver.cuMemFree; 17 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH; 18 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction; 19 | import static jcuda.driver.JCudaDriver.cuModuleLoadData; 20 | import static jcuda.nvrtc.JNvrtc.nvrtcAddNameExpression; 21 | import static jcuda.nvrtc.JNvrtc.nvrtcCompileProgram; 22 | import static jcuda.nvrtc.JNvrtc.nvrtcCreateProgram; 23 | import static jcuda.nvrtc.JNvrtc.nvrtcDestroyProgram; 24 | import static jcuda.nvrtc.JNvrtc.nvrtcGetLoweredName; 25 | import static jcuda.nvrtc.JNvrtc.nvrtcGetPTX; 26 | import static jcuda.nvrtc.JNvrtc.nvrtcGetProgramLog; 27 | 28 | import java.util.Arrays; 29 | import java.util.List; 30 | 31 | import jcuda.Pointer; 32 | import jcuda.Sizeof; 33 | import jcuda.driver.CUcontext; 34 | import jcuda.driver.CUdevice; 35 | import jcuda.driver.CUdeviceptr; 36 | import jcuda.driver.CUfunction; 37 | import jcuda.driver.CUmodule; 38 | import jcuda.driver.JCudaDriver; 39 | import jcuda.nvrtc.JNvrtc; 40 | import jcuda.nvrtc.nvrtcProgram; 41 | 42 | /** 43 | * An example showing how to obtain the mangled names from kernels that 44 | * are compiled with the NVRTC at runtime 45 | */ 46 | public class JNvrtcLoweredNames 47 | { 48 | /** 49 | * The source code of the program that contains different global 50 | * functions and function templates. 
51 |      * (Taken from the NVIDIA NVRTC User Guide)
52 |      */
53 |     private static String programSourceCode =
54 |         "static __global__ void f1(int *result) { *result = 10; }" + "\n" +
55 |         "namespace N1 {" + "\n" +
56 |         "    namespace N2 {" + "\n" +
57 |         "        __global__ void f2(int *result) { *result = 20; }" + "\n" +
58 |         "    }" + "\n" +
59 |         "}" + "\n" +
60 |         "template<typename T>" + "\n" +
61 |         "__global__ void f3(int *result) { *result = sizeof(T); }" + "\n";
62 | 
63 |     /**
64 |      * Entry point of this sample
65 |      *
66 |      * @param args Not used
67 |      */
68 |     public static void main(String[] args)
69 |     {
70 |         // Enable exceptions and omit all subsequent error checks
71 |         JCudaDriver.setExceptionsEnabled(true);
72 |         JNvrtc.setExceptionsEnabled(true);
73 | 
74 |         // Initialize the driver and create a context for the first device.
75 |         cuInit(0);
76 |         CUdevice device = new CUdevice();
77 |         cuDeviceGet(device, 0);
78 |         CUcontext context = new CUcontext();
79 |         cuCtxCreate(context, 0, device);
80 | 
81 |         // Use the NVRTC to create a program
82 |         nvrtcProgram program = new nvrtcProgram();
83 |         nvrtcCreateProgram(program, programSourceCode, null, 0, null, null);
84 | 
85 |         // Add the name expressions that refer to the global functions
86 |         // and template instantiations
87 |         List<String> functionNameExpressions = Arrays.asList(
88 |             "&f1",
89 |             "N1::N2::f2",
90 |             "f3<int>",
91 |             "f3<double>"
92 |         );
93 |         for (String functionNameExpression : functionNameExpressions)
94 |         {
95 |             nvrtcAddNameExpression(program, functionNameExpression);
96 |         }
97 |         List<Integer> expectedResults = Arrays.asList(10, 20, 4, 8);
98 | 
99 |         // Compile the program
100 |         nvrtcCompileProgram(program, 0, null);
101 | 
102 |         // Print the compilation log (for the case there are any warnings)
103 |         String programLog[] = new String[1];
104 |         nvrtcGetProgramLog(program, programLog);
105 |         System.out.println("Program compilation log:\n" + programLog[0]);
106 | 
107 |         // Obtain the PTX ("CUDA Assembler") code of the compiled program
108 |         String[] ptx = new String[1];
109 |         nvrtcGetPTX(program, ptx);
110 | 
111 |         // Create a CUDA module from the PTX code
112 |         CUmodule module = new CUmodule();
113 |         cuModuleLoadData(module, ptx[0]);
114 | 
115 |         // Allocate the output memory on the device
116 |         CUdeviceptr dResult = new CUdeviceptr();
117 |         cuMemAlloc(dResult, Sizeof.INT);
118 | 
119 |         // For each function name expression, obtain the lowered (mangled)
120 |         // function name and print it
121 |         boolean passed = true;
122 |         for (int i = 0; i < functionNameExpressions.size(); i++)
123 |         {
124 |             // Obtain the lowered name. Note that this must be called
125 |             // BEFORE the program is destroyed!
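            // (For example, "N1::N2::f2" is lowered to a mangled name
            // like "_ZN2N12N22f2EPi"; the exact string is
            // compiler-specific, which is why it has to be queried here
            // instead of being hard-coded.)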
126 | String functionNameExpression = functionNameExpressions.get(i); 127 | String loweredName[] = { null }; 128 | nvrtcGetLoweredName(program, functionNameExpression, loweredName); 129 | 130 | System.out.println( 131 | "Lowered name for " + functionNameExpression 132 | + " is " + loweredName[0]); 133 | 134 | // Obtain the function pointer to the function from the module, 135 | // using the lowered name 136 | CUfunction function = new CUfunction(); 137 | cuModuleGetFunction(function, module, loweredName[0]); 138 | 139 | // Call the kernel function 140 | Pointer kernelParameters = Pointer.to(Pointer.to(dResult)); 141 | cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, null, 142 | kernelParameters, null); 143 | cuCtxSynchronize(); 144 | 145 | // Copy the result back to the host, and verify it 146 | int hResult[] = { 0 }; 147 | cuMemcpyDtoH(Pointer.to(hResult), dResult, Sizeof.INT); 148 | 149 | System.out.println("Result: " + hResult[0]); 150 | 151 | int expectedResult = expectedResults.get(i); 152 | passed &= (expectedResult == hResult[0]); 153 | } 154 | 155 | System.out.println("Test " + (passed ? "PASSED" : "FAILED")); 156 | 157 | // Clean up. 158 | nvrtcDestroyProgram(program); 159 | cuMemFree(dResult); 160 | 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/nvrtc/samples/JNvrtcVectorAdd.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA driver and runtime API 3 | * http://www.jcuda.org 4 | * 5 | * Copyright 2016 Marco Hutter - http://www.jcuda.org 6 | */ 7 | 8 | package jcuda.nvrtc.samples; 9 | 10 | import static jcuda.driver.JCudaDriver.cuCtxCreate; 11 | import static jcuda.driver.JCudaDriver.cuCtxSynchronize; 12 | import static jcuda.driver.JCudaDriver.cuDeviceGet; 13 | import static jcuda.driver.JCudaDriver.cuInit; 14 | import static jcuda.driver.JCudaDriver.cuLaunchKernel; 15 | import static jcuda.driver.JCudaDriver.cuMemAlloc; 16 | import static jcuda.driver.JCudaDriver.cuMemFree; 17 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH; 18 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoD; 19 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction; 20 | import static jcuda.driver.JCudaDriver.cuModuleLoadData; 21 | import static jcuda.nvrtc.JNvrtc.nvrtcCompileProgram; 22 | import static jcuda.nvrtc.JNvrtc.nvrtcCreateProgram; 23 | import static jcuda.nvrtc.JNvrtc.nvrtcDestroyProgram; 24 | import static jcuda.nvrtc.JNvrtc.nvrtcGetPTX; 25 | import static jcuda.nvrtc.JNvrtc.nvrtcGetProgramLog; 26 | import jcuda.Pointer; 27 | import jcuda.Sizeof; 28 | import jcuda.driver.CUcontext; 29 | import jcuda.driver.CUdevice; 30 | import jcuda.driver.CUdeviceptr; 31 | import jcuda.driver.CUfunction; 32 | import jcuda.driver.CUmodule; 33 | import jcuda.driver.JCudaDriver; 34 | import jcuda.nvrtc.JNvrtc; 35 | import jcuda.nvrtc.nvrtcProgram; 36 | 37 | /** 38 | * An example showing how to use the NVRTC (NVIDIA Runtime Compiler) API 39 | * to compile CUDA kernel code at runtime. 40 | */ 41 | public class JNvrtcVectorAdd 42 | { 43 | /** 44 | * The source code of the program that will be compiled at runtime: 45 | * A simple vector addition kernel. 46 | * 47 | * Note: The function should be declared as 48 | * extern "C" 49 | * to make sure that it can be found under the given name. 
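     * <p>
     * (Without the extern "C" declaration, the C++ compiler would mangle
     * the function name, and the cuModuleGetFunction call below would not
     * find it under the plain name "add". The JNvrtcLoweredNames sample
     * shows how such mangled names can be obtained.)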
50 |      */
51 |     private static String programSourceCode =
52 |         "extern \"C\"" + "\n" +
53 |         "__global__ void add(int n, float *a, float *b, float *sum)" + "\n" +
54 |         "{" + "\n" +
55 |         "    int i = blockIdx.x * blockDim.x + threadIdx.x;" + "\n" +
56 |         "    if (i < n)" + "\n" +
57 |         "    {" + "\n" +
58 |         "        sum[i] = a[i] + b[i];" + "\n" +
59 |         "    }" + "\n" +
60 |         "}" + "\n";
61 | 
62 |     /**
63 |      * Entry point of this sample
64 |      *
65 |      * @param args Not used
66 |      */
67 |     public static void main(String[] args)
68 |     {
69 |         // Enable exceptions and omit all subsequent error checks
70 |         JCudaDriver.setExceptionsEnabled(true);
71 |         JNvrtc.setExceptionsEnabled(true);
72 | 
73 |         // Initialize the driver and create a context for the first device.
74 |         cuInit(0);
75 |         CUdevice device = new CUdevice();
76 |         cuDeviceGet(device, 0);
77 |         CUcontext context = new CUcontext();
78 |         cuCtxCreate(context, 0, device);
79 | 
80 |         // Use the NVRTC to create a program by compiling the source code
81 |         nvrtcProgram program = new nvrtcProgram();
82 |         nvrtcCreateProgram(program, programSourceCode, null, 0, null, null);
83 |         nvrtcCompileProgram(program, 0, null);
84 | 
85 |         // Print the compilation log (for the case there are any warnings)
86 |         String programLog[] = new String[1];
87 |         nvrtcGetProgramLog(program, programLog);
88 |         System.out.println("Program compilation log:\n" + programLog[0]);
89 | 
90 |         // Obtain the PTX ("CUDA Assembler") code of the compiled program
91 |         String[] ptx = new String[1];
92 |         nvrtcGetPTX(program, ptx);
93 |         nvrtcDestroyProgram(program);
94 | 
95 |         // Create a CUDA module from the PTX code
96 |         CUmodule module = new CUmodule();
97 |         cuModuleLoadData(module, ptx[0]);
98 | 
99 |         // Obtain the function pointer to the "add" function from the module
100 |         CUfunction function = new CUfunction();
101 |         cuModuleGetFunction(function, module, "add");
102 | 
103 |         int numElements = 256 * 100;
104 | 
105 |         // Allocate and fill the host input data
106 |         float hostInputA[] = new float[numElements];
107 |         float hostInputB[] = new float[numElements];
108 |         for(int i = 0; i < numElements; i++)
109 |         {
110 |             hostInputA[i] = (float)i;
111 |             hostInputB[i] = (float)i;
112 |         }
113 | 
114 |         // Allocate the device input data, and copy the
115 |         // host input data to the device
116 |         CUdeviceptr deviceInputA = new CUdeviceptr();
117 |         cuMemAlloc(deviceInputA, numElements * Sizeof.FLOAT);
118 |         cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA),
119 |             numElements * Sizeof.FLOAT);
120 |         CUdeviceptr deviceInputB = new CUdeviceptr();
121 |         cuMemAlloc(deviceInputB, numElements * Sizeof.FLOAT);
122 |         cuMemcpyHtoD(deviceInputB, Pointer.to(hostInputB),
123 |             numElements * Sizeof.FLOAT);
124 | 
125 |         // Allocate device output memory
126 |         CUdeviceptr deviceOutput = new CUdeviceptr();
127 |         cuMemAlloc(deviceOutput, numElements * Sizeof.FLOAT);
128 | 
129 |         // Set up the kernel parameters: A pointer to an array
130 |         // of pointers which point to the actual values.
131 |         Pointer kernelParameters = Pointer.to(
132 |             Pointer.to(new int[]{numElements}),
133 |             Pointer.to(deviceInputA),
134 |             Pointer.to(deviceInputB),
135 |             Pointer.to(deviceOutput)
136 |         );
137 | 
138 |         // Call the kernel function
139 |         int blockSizeX = 256;
140 |         int gridSizeX = (numElements + blockSizeX - 1) / blockSizeX;
141 |         cuLaunchKernel(function,
142 |             gridSizeX, 1, 1,       // Grid dimension
143 |             blockSizeX, 1, 1,      // Block dimension
144 |             0, null,               // Shared memory size and stream
145 |             kernelParameters, null // Kernel- and extra parameters
146 |         );
147 |         cuCtxSynchronize();
148 | 
149 |         // Allocate host output memory and copy the device output
150 |         // to the host.
151 |         float hostOutput[] = new float[numElements];
152 |         cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput,
153 |             numElements * Sizeof.FLOAT);
154 | 
155 |         // Verify the result
156 |         boolean passed = true;
157 |         for(int i = 0; i < numElements; i++)
158 |         {
159 |             float expected = i+i;
160 |             if (Math.abs(hostOutput[i] - expected) > 1e-5)
161 |             {
162 |                 System.out.println(
163 |                     "At index "+i+ " found "+hostOutput[i]+
164 |                     " but expected "+expected);
165 |                 passed = false;
166 |                 break;
167 |             }
168 |         }
169 |         System.out.println("Test "+(passed?"PASSED":"FAILED"));
170 | 
171 |         // Clean up.
172 |         cuMemFree(deviceInputA);
173 |         cuMemFree(deviceInputB);
174 |         cuMemFree(deviceOutput);
175 | 
176 |     }
177 | }
178 | 
-------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/driver/samples/JCudaAllocationInKernel.java: --------------------------------------------------------------------------------
1 | /*
2 |  * JCuda - Java bindings for NVIDIA CUDA
3 |  *
4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
5 |  */
6 | package jcuda.driver.samples;
7 | import static jcuda.driver.JCudaDriver.cuCtxCreate;
8 | import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
9 | import static jcuda.driver.JCudaDriver.cuDeviceGet;
10 | import static jcuda.driver.JCudaDriver.cuInit;
11 | import static jcuda.driver.JCudaDriver.cuLaunchKernel;
12 | import static jcuda.driver.JCudaDriver.cuMemAlloc;
13 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
14 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
15 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
16 | import static jcuda.driver.JCudaDriver.cuModuleLoad;
17 | 
18 | import java.io.IOException;
19 | import java.util.Arrays;
20 | 
21 | import jcuda.Pointer;
22 | import jcuda.Sizeof;
23 | import jcuda.driver.CUcontext;
24 | import jcuda.driver.CUdevice;
25 | import jcuda.driver.CUdeviceptr;
26 | import jcuda.driver.CUfunction;
27 | import jcuda.driver.CUmodule;
28 | import jcuda.driver.JCudaDriver;
29 | import jcuda.samples.utils.JCudaSamplesUtils;
30 | 
31 | /**
32 |  * An example showing how to allocate memory in kernels.
33 | *
34 | * Kernels may allocate memory, using the standard malloc and 35 | * free functions. When used inside a kernel, these functions 36 | * will allocate device memory. This device memory can NOT be used in 37 | * host functions (not even the ones that operate on device memory!). 38 | * The device memory that was allocated on the device is thus not compatible 39 | * with the device memory that was allocated on the host. 40 | * See http://stackoverflow.com/a/13043240 for details.
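 * <p>
 * A sketch of the kernels that this sample loads (the actual
 * JCudaAllocationInKernelKernel.cu is not part of this listing; the
 * signatures are assumptions derived from the host code below, and a
 * copyingKernel moves the data between such device-allocated and
 * host-allocated memory):
 * <pre><code>
 * extern "C" __global__ void allocatingKernel(short **pointers)
 * {
 *     pointers[threadIdx.x] = (short*)malloc(3 * sizeof(short));
 * }
 * extern "C" __global__ void freeingKernel(short **pointers)
 * {
 *     free(pointers[threadIdx.x]);
 * }
 * </code></pre>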
41 | *
42 | * This example shows how to allocate, use and free memory in kernels. The 43 | * usage pattern shown here does not necessarily make any sense, but it points 44 | * out the difference between device memory allocated on the host, and device 45 | * memory allocated on the device, using overly elaborate variable names. 46 | */ 47 | public class JCudaAllocationInKernel 48 | { 49 | public static void main(String[] args) throws IOException 50 | { 51 | // Enable exceptions and omit all subsequent error checks 52 | JCudaDriver.setExceptionsEnabled(true); 53 | 54 | // Initialize the driver and create a context for the first device. 55 | cuInit(0); 56 | CUdevice device = new CUdevice(); 57 | cuDeviceGet(device, 0); 58 | CUcontext context = new CUcontext(); 59 | cuCtxCreate(context, 0, device); 60 | 61 | // Create the PTX file by calling the NVCC 62 | String ptxFileName = JCudaSamplesUtils.preparePtxFile( 63 | "src/main/resources/kernels/JCudaAllocationInKernelKernel.cu"); 64 | 65 | // Load the PTX file. 66 | CUmodule module = new CUmodule(); 67 | cuModuleLoad(module, ptxFileName); 68 | 69 | // Obtain a function pointer to the "allocatingKernel" function. 70 | CUfunction allocatingKernel = new CUfunction(); 71 | cuModuleGetFunction(allocatingKernel, module, "allocatingKernel"); 72 | 73 | // Obtain a function pointer to the "copyingKernel" function. 74 | CUfunction copyingKernel = new CUfunction(); 75 | cuModuleGetFunction(copyingKernel, module, "copyingKernel"); 76 | 77 | // Obtain a function pointer to the "freeingKernel" function. 78 | CUfunction freeingKernel = new CUfunction(); 79 | cuModuleGetFunction(freeingKernel, module, "freeingKernel"); 80 | 81 | int numThreads = 4; 82 | 83 | // NOTE: This must match the value in the kernels! 84 | int numberOfShortsAllocatedInKernel = 3; 85 | 86 | // What will arrive in the allocating kernel: A device pointer that is 87 | // allocated on the host. Each element of this "array" will afterwards 88 | // contain a device pointer that was allocated on the device. 89 | CUdeviceptr devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice = 90 | new CUdeviceptr(); 91 | cuMemAlloc(devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice, 92 | numThreads * Sizeof.POINTER); 93 | 94 | // The parameter for the allocating kernel: 95 | // A pointer to a pointer that points to the device pointer that 96 | // was allocated on the host, and points to the device pointers 97 | // that will be allocated on the device. Yeah. 98 | Pointer allocatingKernelParameters = Pointer.to( 99 | Pointer.to(devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice) 100 | ); 101 | 102 | // Launch the allocating kernel 103 | int blockSizeX = numThreads; 104 | int gridSizeX = 1; 105 | cuLaunchKernel(allocatingKernel, 106 | gridSizeX, 1, 1, 107 | blockSizeX, 1, 1, 108 | 0, null, 109 | allocatingKernelParameters, null 110 | ); 111 | cuCtxSynchronize(); 112 | 113 | // Create the (host) array of device pointers that are allocated on 114 | // the host 115 | CUdeviceptr devicePointersAllocatedOnHost[] = 116 | new CUdeviceptr[numThreads]; 117 | for (int i=0; i 31 | *
32 | * This test computes the bandwidth of the data transfer from the host to
33 | * the device for different host memory types:
34 | * <ul>
35 | *   <li>
36 | *     Host data that is allocated as pinned memory
37 | *     (using cudaHostAlloc)
38 | *   </li>
39 | *   <li>
40 | *     Host data that is stored in pageable memory (comparable to
41 | *     malloc in C),
42 | *     <ul>
43 | *       <li>in a Java array</li>
44 | *       <li>in a direct buffer</li>
45 | *     </ul>
46 | *   </li>
47 | * </ul>
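* <p>
* A note on the units reported by this test: computeBandwidth measures
* the transfer rate in bytes/ms and divides it by 1024 to obtain "MB/s".
* Strictly, bytes/ms / 1024 equals (bytes/s) / 1024000, which overstates
* binary MB/s (bytes/s / 1048576) by about 2.4 percent; the exact value
* would be (bytes/ms * 1000) / (1024 * 1024).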
48 | */ 49 | public class JCudaRuntimeMemoryBandwidths 50 | { 51 | /** 52 | * Memory modes for the host memory 53 | */ 54 | enum HostMemoryMode 55 | { 56 | /** 57 | * Pinned host memory, allocated with cudaHostAlloc 58 | */ 59 | PINNED, 60 | 61 | /** 62 | * Pageable memory in form of a Pointer.to(array) 63 | */ 64 | PAGEABLE_ARRAY, 65 | 66 | /** 67 | * Pageable memory in form of a Pointer.to(directBuffer) 68 | */ 69 | PAGEABLE_DIRECT_BUFFER, 70 | } 71 | 72 | /** 73 | * Entry point of this sample 74 | * 75 | * @param args Not used 76 | */ 77 | public static void main(String[] args) 78 | { 79 | int device = 0; 80 | cudaSetDevice(device); 81 | 82 | int hostAllocFlags = cudaHostAllocWriteCombined; 83 | run(HostMemoryMode.PINNED, hostAllocFlags); 84 | run(HostMemoryMode.PAGEABLE_ARRAY, hostAllocFlags); 85 | run(HostMemoryMode.PAGEABLE_DIRECT_BUFFER, hostAllocFlags); 86 | 87 | System.out.println("Done"); 88 | } 89 | 90 | 91 | /** 92 | * Run the computation of the bandwidth for copying host memory to the 93 | * device, using various memory block sizes, and print the results 94 | * 95 | * @param hostMemoryMode The {@link HostMemoryMode} 96 | * @param hostAllocFlags The flags for cudaHostAlloc 97 | */ 98 | static void run(HostMemoryMode hostMemoryMode, int hostAllocFlags) 99 | { 100 | int minExponent = 10; 101 | int maxExponent = 28; 102 | int count = maxExponent - minExponent; 103 | int memorySizes[] = new int[count]; 104 | float bandwidths[] = new float[memorySizes.length]; 105 | 106 | System.out.print("Running with " + hostMemoryMode); 107 | for (int i = 0; i < count; i++) 108 | { 109 | System.out.print("."); 110 | memorySizes[i] = (1 << minExponent + i); 111 | float bandwidth = computeBandwidth( 112 | hostMemoryMode, hostAllocFlags, memorySizes[i]); 113 | bandwidths[i] = bandwidth; 114 | } 115 | System.out.println(); 116 | 117 | System.out.println("Bandwidths for " + hostMemoryMode); 118 | for (int i = 0; i < memorySizes.length; i++) 119 | { 120 | String s = String.format("%10d", memorySizes[i]); 121 | String b = String.format(Locale.ENGLISH, "%5.3f", bandwidths[i]); 122 | System.out.println(s + " bytes : " + b + " MB/s"); 123 | } 124 | System.out.println("\n"); 125 | } 126 | 127 | 128 | /** 129 | * Compute the bandwidth in MB per second for copying data from the 130 | * host to the device 131 | * 132 | * @param hostMemoryMode The {@link HostMemoryMode} 133 | * @param hostAllocFlags The flags for the cudaHostAlloc call 134 | * @param memorySizes The memory sizes, in bytes 135 | * @param bandwidths Will store the bandwidth, in MB per second 136 | */ 137 | static void computeBandwidths( 138 | HostMemoryMode hostMemoryMode, int hostAllocFlags, 139 | int memorySizes[], float bandwidths[]) 140 | { 141 | for (int i = 0; i < memorySizes.length; i++) 142 | { 143 | int memorySize = memorySizes[i]; 144 | float bandwidth = computeBandwidth( 145 | hostMemoryMode, hostAllocFlags, memorySize); 146 | bandwidths[i] = bandwidth; 147 | } 148 | } 149 | 150 | /** 151 | * Compute the bandwidth in MB per second for copying data from the 152 | * host to the device 153 | * 154 | * @param hostMemoryMode The {@link HostMemoryMode} 155 | * @param hostAllocFlags The flags for the cudaHostAlloc call 156 | * @param memorySize The memory size, in bytes 157 | * @return The bandwidth, in MB per second 158 | */ 159 | static float computeBandwidth( 160 | HostMemoryMode hostMemoryMode, int hostAllocFlags, int memorySize) 161 | { 162 | // Initialize the host memory 163 | Pointer hostData = null; 164 | ByteBuffer hostDataBuffer = 
null; 165 | if (hostMemoryMode == HostMemoryMode.PINNED) 166 | { 167 | // Allocate pinned (page-locked) host memory 168 | hostData = new Pointer(); 169 | cudaHostAlloc(hostData, memorySize, hostAllocFlags); 170 | hostDataBuffer = hostData.getByteBuffer(0, memorySize); 171 | } 172 | else if (hostMemoryMode == HostMemoryMode.PAGEABLE_ARRAY) 173 | { 174 | // The host memory is pageable and stored in a Java array 175 | byte array[] = new byte[memorySize]; 176 | hostDataBuffer = ByteBuffer.wrap(array); 177 | hostData = Pointer.to(array); 178 | } 179 | else 180 | { 181 | // The host memory is pageable and stored in a direct byte buffer 182 | hostDataBuffer = ByteBuffer.allocateDirect(memorySize); 183 | hostData = Pointer.to(hostDataBuffer); 184 | } 185 | 186 | // Fill the memory with arbitrary data 187 | for (int i = 0; i < memorySize; i++) 188 | { 189 | hostDataBuffer.put(i, (byte) i); 190 | } 191 | 192 | // Allocate device memory 193 | Pointer deviceData = new Pointer(); 194 | cudaMalloc(deviceData, memorySize); 195 | 196 | final int runs = 10; 197 | float bandwidth = computeBandwidth( 198 | deviceData, hostData, cudaMemcpyHostToDevice, memorySize, runs); 199 | 200 | // Clean up 201 | if (hostMemoryMode == HostMemoryMode.PINNED) 202 | { 203 | cudaFreeHost(hostData); 204 | } 205 | cudaFree(deviceData); 206 | return bandwidth; 207 | } 208 | 209 | 210 | /** 211 | * Compute the bandwidth in MB per second for copying data from the 212 | * given source pointer to the given destination pointer 213 | * 214 | * @param dstData The destination pointer 215 | * @param srcData The source pointer 216 | * @param memcopyKind The cudaMemcpyKind. Must match the types 217 | * of the source and destination pointers! 218 | * @param memSize The memory size, in bytes 219 | * @param runs The number of times that the copying operation 220 | * should be repeated 221 | * @return The bandwidth in MB per second 222 | */ 223 | static float computeBandwidth( 224 | Pointer dstData, Pointer srcData, 225 | int memcopyKind, int memSize, int runs) 226 | { 227 | // Initialize the events for the time measure 228 | cudaEvent_t start = new cudaEvent_t(); 229 | cudaEvent_t stop = new cudaEvent_t(); 230 | cudaEventCreate(start); 231 | cudaEventCreate(stop); 232 | 233 | // Perform the specified number of copying operations 234 | cudaEventRecord(start, null); 235 | for (int i = 0; i < runs; i++) 236 | { 237 | cudaMemcpyAsync(dstData, srcData, memSize, memcopyKind, null); 238 | } 239 | cudaEventRecord(stop, null); 240 | cudaDeviceSynchronize(); 241 | 242 | // Compute the elapsed time and bandwidth 243 | // in MB per second 244 | float elapsedTimeMsArray[] = { Float.NaN }; 245 | cudaEventElapsedTime(elapsedTimeMsArray, start, stop); 246 | float elapsedTimeMs = elapsedTimeMsArray[0]; 247 | float bandwidthInBytesPerMs = ((float) memSize * runs) / elapsedTimeMs; 248 | float bandwidth = bandwidthInBytesPerMs / 1024; 249 | 250 | // Clean up 251 | cudaEventDestroy(stop); 252 | cudaEventDestroy(start); 253 | return bandwidth; 254 | } 255 | } -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/driver/gl/samples/SimpleInteraction.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.driver.gl.samples; 7 | 8 | import java.awt.Point; 9 | import java.awt.event.MouseEvent; 10 | import 
java.awt.event.MouseMotionListener; 11 | import java.awt.event.MouseWheelEvent; 12 | import java.awt.event.MouseWheelListener; 13 | import java.util.Arrays; 14 | 15 | /** 16 | * A class encapsulating a VERY simple mouse interaction for the GL samples: 17 | * It offers a {@link #getMouseControl()} that may be attached as a 18 | * MouseMotionListener and MouseWheelListener to an arbitrary component, 19 | * and methods to obtain a {@link #getModelviewMatrix() model-view} and 20 | * {@link #getProjectionMatrix() projection} matrix. 21 | */ 22 | class SimpleInteraction 23 | { 24 | /** 25 | * The translation in X-direction 26 | */ 27 | private float translationX = 0; 28 | 29 | /** 30 | * The translation in Y-direction 31 | */ 32 | private float translationY = 0; 33 | 34 | /** 35 | * The translation in Z-direction 36 | */ 37 | private float translationZ = -4; 38 | 39 | /** 40 | * The rotation about the X-axis, in degrees 41 | */ 42 | private float rotationDegX = 40; 43 | 44 | /** 45 | * The rotation about the Y-axis, in degrees 46 | */ 47 | private float rotationDegY = 30; 48 | 49 | /** 50 | * The current projection matrix 51 | */ 52 | private float projectionMatrix[] = new float[16]; 53 | 54 | /** 55 | * The current modelview matrix 56 | */ 57 | private float modelviewMatrix[] = new float[16]; 58 | 59 | /** 60 | * Inner class encapsulating the MouseMotionListener and 61 | * MouseWheelListener for the interaction 62 | */ 63 | class MouseControl implements MouseMotionListener, MouseWheelListener 64 | { 65 | private Point previousMousePosition = new Point(); 66 | 67 | @Override 68 | public void mouseDragged(MouseEvent e) 69 | { 70 | int dx = e.getX() - previousMousePosition.x; 71 | int dy = e.getY() - previousMousePosition.y; 72 | 73 | // If the left button is held down, move the object 74 | if ((e.getModifiersEx() & MouseEvent.BUTTON1_DOWN_MASK) == 75 | MouseEvent.BUTTON1_DOWN_MASK) 76 | { 77 | translationX += dx / 100.0f; 78 | translationY -= dy / 100.0f; 79 | } 80 | 81 | // If the right button is held down, rotate the object 82 | else if ((e.getModifiersEx() & MouseEvent.BUTTON3_DOWN_MASK) == 83 | MouseEvent.BUTTON3_DOWN_MASK) 84 | { 85 | rotationDegX += dy; 86 | rotationDegY += dx; 87 | } 88 | previousMousePosition = e.getPoint(); 89 | updateModelviewMatrix(); 90 | } 91 | 92 | @Override 93 | public void mouseMoved(MouseEvent e) 94 | { 95 | previousMousePosition = e.getPoint(); 96 | } 97 | 98 | @Override 99 | public void mouseWheelMoved(MouseWheelEvent e) 100 | { 101 | // Translate along the Z-axis 102 | translationZ += e.getWheelRotation() * 0.25f; 103 | previousMousePosition = e.getPoint(); 104 | updateModelviewMatrix(); 105 | } 106 | } 107 | 108 | /** 109 | * The mouse control 110 | */ 111 | private final MouseControl mouseControl; 112 | 113 | /** 114 | * Default constructor 115 | */ 116 | SimpleInteraction() 117 | { 118 | this.mouseControl = new MouseControl(); 119 | updateModelviewMatrix(); 120 | } 121 | 122 | /** 123 | * Returns the mouse control that may be attached to a component 124 | * as a MouseMotionListener and MouseWheelListener 125 | * 126 | * @return The mouse control 127 | */ 128 | MouseControl getMouseControl() 129 | { 130 | return mouseControl; 131 | } 132 | 133 | /** 134 | * Update the modelview matrix depending on the 135 | * current translation and rotation 136 | */ 137 | private void updateModelviewMatrix() 138 | { 139 | float m0[] = translation(translationX, translationY, translationZ); 140 | float m1[] = rotationX(rotationDegX); 141 | float m2[] = 
rotationY(rotationDegY); 142 | modelviewMatrix = multiply(multiply(m1,m2), m0); 143 | } 144 | 145 | /** 146 | * Update the projection matrix for the given screen width and height 147 | * 148 | * @param w The width 149 | * @param h The height 150 | */ 151 | void updateProjectionMatrix(int w, int h) 152 | { 153 | float aspect = (float) w / h; 154 | projectionMatrix = perspective(50, aspect, 0.1f, 100.0f); 155 | } 156 | 157 | /** 158 | * Returns a reference to the modelview matrix 159 | * 160 | * @return The matrix 161 | */ 162 | float[] getModelviewMatrix() 163 | { 164 | return modelviewMatrix; 165 | } 166 | 167 | /** 168 | * Returns a reference to the projection matrix 169 | * 170 | * @return The matrix 171 | */ 172 | float[] getProjectionMatrix() 173 | { 174 | return projectionMatrix; 175 | } 176 | 177 | /** 178 | * Returns the rotation around the x-axis, in degrees 179 | * 180 | * @return The rotation 181 | */ 182 | float getRotationDegX() 183 | { 184 | return rotationDegX; 185 | } 186 | 187 | /** 188 | * Returns the rotation around the y-axis, in degrees 189 | * 190 | * @return The rotation 191 | */ 192 | float getRotationDegY() 193 | { 194 | return rotationDegY; 195 | } 196 | 197 | /** 198 | * Returns the translation along the x-axis 199 | * 200 | * @return The translation 201 | */ 202 | float getTranslationX() 203 | { 204 | return translationX; 205 | } 206 | 207 | /** 208 | * Returns the translation along the y-axis 209 | * 210 | * @return The translation 211 | */ 212 | float getTranslationY() 213 | { 214 | return translationY; 215 | } 216 | 217 | /** 218 | * Returns the translation along the z-axis 219 | * 220 | * @return The translation 221 | */ 222 | float getTranslationZ() 223 | { 224 | return translationZ; 225 | } 226 | 227 | /** 228 | * Helper method that creates a perspective matrix 229 | * @param fovy The fov in y-direction, in degrees 230 | * 231 | * @param aspect The aspect ratio 232 | * @param zNear The near clipping plane 233 | * @param zFar The far clipping plane 234 | * @return A perspective matrix 235 | */ 236 | private static float[] perspective( 237 | float fovy, float aspect, float zNear, float zFar) 238 | { 239 | float radians = (float)Math.toRadians(fovy / 2); 240 | float deltaZ = zFar - zNear; 241 | float sine = (float)Math.sin(radians); 242 | if ((deltaZ == 0) || (sine == 0) || (aspect == 0)) 243 | { 244 | return identity(); 245 | } 246 | float cotangent = (float)Math.cos(radians) / sine; 247 | float m[] = identity(); 248 | m[0*4+0] = cotangent / aspect; 249 | m[1*4+1] = cotangent; 250 | m[2*4+2] = -(zFar + zNear) / deltaZ; 251 | m[2*4+3] = -1; 252 | m[3*4+2] = -2 * zNear * zFar / deltaZ; 253 | m[3*4+3] = 0; 254 | return m; 255 | } 256 | 257 | /** 258 | * Creates an identity matrix 259 | * 260 | * @return An identity matrix 261 | */ 262 | private static float[] identity() 263 | { 264 | float m[] = new float[16]; 265 | Arrays.fill(m, 0); 266 | m[0] = m[5] = m[10] = m[15] = 1.0f; 267 | return m; 268 | } 269 | 270 | /** 271 | * Multiplies the given matrices and returns the result 272 | * 273 | * @param m0 The first matrix 274 | * @param m1 The second matrix 275 | * @return The product m0*m1 276 | */ 277 | private static float[] multiply(float m0[], float m1[]) 278 | { 279 | float m[] = new float[16]; 280 | for (int x=0; x < 4; x++) 281 | { 282 | for(int y=0; y < 4; y++) 283 | { 284 | m[x*4 + y] = 285 | m0[x*4+0] * m1[y+ 0] + 286 | m0[x*4+1] * m1[y+ 4] + 287 | m0[x*4+2] * m1[y+ 8] + 288 | m0[x*4+3] * m1[y+12]; 289 | } 290 | } 291 | return m; 292 | } 293 | 294 | /** 
295 | * Creates a translation matrix 296 | * 297 | * @param x The x translation 298 | * @param y The y translation 299 | * @param z The z translation 300 | * @return A translation matrix 301 | */ 302 | private static float[] translation(float x, float y, float z) 303 | { 304 | float m[] = identity(); 305 | m[12] = x; 306 | m[13] = y; 307 | m[14] = z; 308 | return m; 309 | } 310 | 311 | /** 312 | * Creates a matrix describing a rotation around the x-axis 313 | * 314 | * @param angleDeg The rotation angle, in degrees 315 | * @return The rotation matrix 316 | */ 317 | private static float[] rotationX(float angleDeg) 318 | { 319 | float m[] = identity(); 320 | float angleRad = (float)Math.toRadians(angleDeg); 321 | float ca = (float)Math.cos(angleRad); 322 | float sa = (float)Math.sin(angleRad); 323 | m[ 5] = ca; 324 | m[ 6] = sa; 325 | m[ 9] = -sa; 326 | m[10] = ca; 327 | return m; 328 | } 329 | 330 | /** 331 | * Creates a matrix describing a rotation around the y-axis 332 | * 333 | * @param angleDeg The rotation angle, in degrees 334 | * @return The rotation matrix 335 | */ 336 | private static float[] rotationY(float angleDeg) 337 | { 338 | float m[] = identity(); 339 | float angleRad = (float)Math.toRadians(angleDeg); 340 | float ca = (float)Math.cos(angleRad); 341 | float sa = (float)Math.sin(angleRad); 342 | m[ 0] = ca; 343 | m[ 2] = -sa; 344 | m[ 8] = sa; 345 | m[10] = ca; 346 | return m; 347 | } 348 | 349 | 350 | 351 | } 352 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/jcublas/samples/JCublas2MatrixInvert.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.jcublas.samples; 7 | 8 | import static jcuda.jcublas.JCublas2.cublasCreate; 9 | import static jcuda.jcublas.JCublas2.cublasDestroy; 10 | import static jcuda.jcublas.JCublas2.cublasGetMatrix; 11 | import static jcuda.jcublas.JCublas2.cublasGetVector; 12 | import static jcuda.jcublas.JCublas2.cublasIsamax; 13 | import static jcuda.jcublas.JCublas2.cublasSetMatrix; 14 | import static jcuda.jcublas.JCublas2.cublasSetVector; 15 | import static jcuda.jcublas.JCublas2.cublasSgemm; 16 | import static jcuda.jcublas.JCublas2.cublasSgemv; 17 | import static jcuda.jcublas.JCublas2.cublasSger; 18 | import static jcuda.jcublas.JCublas2.cublasSscal; 19 | import static jcuda.jcublas.JCublas2.cublasSswap; 20 | import static jcuda.jcublas.JCublas2.cublasStrmv; 21 | import static jcuda.jcublas.cublasFillMode.CUBLAS_FILL_MODE_UPPER; 22 | import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N; 23 | import static jcuda.runtime.JCuda.cudaFree; 24 | import static jcuda.runtime.JCuda.cudaMalloc; 25 | import static jcuda.runtime.JCuda.cudaMemcpy; 26 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice; 27 | 28 | import jcuda.Pointer; 29 | import jcuda.Sizeof; 30 | import jcuda.jcublas.cublasHandle; 31 | import jcuda.samples.utils.JCudaSamplesUtils; 32 | 33 | /** 34 | * Example of a matrix inversion using JCublas2. 
35 | */ 36 | public class JCublas2MatrixInvert 37 | { 38 | /** 39 | * Entry point of this sample 40 | * 41 | * @param args Not used 42 | */ 43 | public static void main(String[] args) 44 | { 45 | // Create a CUBLAS handle 46 | cublasHandle handle = new cublasHandle(); 47 | cublasCreate(handle); 48 | 49 | // Create the input matrix 50 | int size = 7; 51 | float A[] = JCudaSamplesUtils.createRandomFloatData(size * size); 52 | 53 | // Invert the matrix 54 | float invA[] = A.clone(); 55 | invertMatrix(handle, size, invA); 56 | 57 | // Compute A*invA, which should yield the identity matrix 58 | float identity[] = new float[size * size]; 59 | multiply(handle, size, A, invA, identity); 60 | 61 | // Print the results 62 | System.out.println("A:"); 63 | System.out.println(JCudaSamplesUtils.toString2D(A, size)); 64 | System.out.println("invA:"); 65 | System.out.println(JCudaSamplesUtils.toString2D(invA, size)); 66 | System.out.println("identity:"); 67 | System.out.println(JCudaSamplesUtils.toString2D(identity, size)); 68 | 69 | // Verify the result 70 | boolean passed = true; 71 | final float epsilon = 1e-5f; 72 | for (int i = 0; i < size; i++) 73 | { 74 | for (int j = 0; j < size; j++) 75 | { 76 | int index = i * size + j; 77 | float value = identity[index]; 78 | if (i == j) 79 | { 80 | passed &= Math.abs(value - 1.0f) <= epsilon; 81 | } 82 | else 83 | { 84 | passed &= Math.abs(value) <= epsilon; 85 | } 86 | } 87 | } 88 | System.out.println((passed ? "PASSED" : "FAILED")); 89 | 90 | // Clean up 91 | cublasDestroy(handle); 92 | } 93 | 94 | /** 95 | * Copies the given n x n matrix into device memory, inverts it by calling 96 | * {@link #invertMatrix(cublasHandle, int, Pointer)}, and copies it back 97 | * into the given array. 98 | * 99 | * @param handle The CUBLAS handle 100 | * @param n The size of the matrix 101 | * @param A The matrix 102 | */ 103 | public static void invertMatrix(cublasHandle handle, int n, float A[]) 104 | { 105 | Pointer dA = new Pointer(); 106 | cudaMalloc(dA, n * n * Sizeof.FLOAT); 107 | cublasSetMatrix(n, n, Sizeof.FLOAT, Pointer.to(A), n, dA, n); 108 | 109 | invertMatrix(handle, n, dA); 110 | 111 | cublasGetMatrix(n, n, Sizeof.FLOAT, dA, n, Pointer.to(A), n); 112 | cudaFree(dA); 113 | } 114 | 115 | /** 116 | * Invert the n x n matrix that is given in device memory. 117 | * 118 | * @param n The size of the matrix 119 | * @param dA The matrix 120 | */ 121 | public static void invertMatrix(cublasHandle handle, int n, Pointer dA) 122 | { 123 | // Perform LU factorization 124 | int[] pivots = cudaSgetrfSquare(handle, n, dA); 125 | 126 | // Perform inversion on factorized matrix 127 | cudaSgetri(handle, n, dA, pivots); 128 | } 129 | 130 | /** 131 | * Convenience method that returns a pointer with the given offset (in 132 | * number of 4-byte float elements) from the given pointer. 133 | * 134 | * @param p The pointer 135 | * @param floatOffset The offset, in number of float elements 136 | * @return The new pointer 137 | */ 138 | private static Pointer at(Pointer p, int floatOffset) 139 | { 140 | return p.withByteOffset(floatOffset * Sizeof.FLOAT); 141 | } 142 | 143 | /** 144 | * cudaSgetrf performs an in-place LU factorization on a square matrix. 
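* It computes a factorization of the form P*A = L*U with partial
* pivoting, where the factors L and U overwrite A in place (the unit
* diagonal of L is not stored).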
145 | * Uses the unblocked BLAS2 approach 146 | * 147 | * @param n The matrix size 148 | * @param dA The pointer to the matrix (in device memory) 149 | * @return The pivots 150 | */ 151 | private static int[] cudaSgetrfSquare( 152 | cublasHandle handle, int n, Pointer dA) 153 | { 154 | int[] pivots = new int[n]; 155 | for (int i = 0; i < n; i++) 156 | { 157 | pivots[i] = i; 158 | } 159 | 160 | Pointer minusOne = Pointer.to(new float[] { -1.0f }); 161 | float[] factor = { 0.0f }; 162 | Pointer pFactor = Pointer.to(factor); 163 | for (int i = 0; i < n - 1; i++) 164 | { 165 | Pointer offset = at(dA, i * n + i); 166 | 167 | int max[] = { 0 }; 168 | cublasIsamax(handle, n - i, offset, 1, Pointer.to(max)); 169 | int pivot = i - 1 + max[0]; 170 | if (pivot != i) 171 | { 172 | pivots[i] = pivot; 173 | cublasSswap(handle, n, at(dA, pivot), n, at(dA, i), n); 174 | } 175 | 176 | cublasGetVector(1, Sizeof.FLOAT, offset, 1, pFactor, 1); 177 | factor[0] = 1 / factor[0]; 178 | cublasSscal(handle, n - i - 1, pFactor, at(offset, 1), 1); 179 | cublasSger(handle, n - i - 1, n - i - 1, minusOne, at(offset, 1), 180 | 1, at(offset, n), n, at(offset, n + 1), n); 181 | } 182 | return pivots; 183 | } 184 | 185 | /*** 186 | * cudaSgetri Computes the inverse of an LU-factorized square matrix 187 | * 188 | * @param n The matrix size 189 | * @param dA The matrix in device memory 190 | * @param pivots The pivots 191 | */ 192 | private static void cudaSgetri( 193 | cublasHandle handle, int n, Pointer dA, int[] pivots) 194 | { 195 | // Perform inv(U) 196 | cudaStrtri(handle, n, dA); 197 | 198 | // Solve inv(A)*L = inv(U) 199 | Pointer dWork = new Pointer(); 200 | cudaMalloc(dWork, (n - 1) * Sizeof.FLOAT); 201 | 202 | Pointer zero = Pointer.to(new float[]{ 0.0f }); 203 | Pointer one = Pointer.to(new float[]{ 1.0f }); 204 | Pointer minusOne = Pointer.to(new float[]{ -1.0f }); 205 | for (int i = n - 1; i > 0; i--) 206 | { 207 | Pointer offset = at(dA, ((i - 1) * n + i)); 208 | cudaMemcpy(dWork, offset, (n - 1) * Sizeof.FLOAT, 209 | cudaMemcpyDeviceToDevice); 210 | cublasSscal(handle, n - i, zero, offset, 1); 211 | cublasSgemv(handle, CUBLAS_OP_N, n, n - i, minusOne, 212 | at(dA, i * n), n, dWork, 1, one, at(dA, ((i - 1) * n)), 1); 213 | } 214 | 215 | cudaFree(dWork); 216 | 217 | // Pivot back to original order 218 | for (int i = n - 1; i >= 0; i--) 219 | { 220 | if (i != pivots[i]) 221 | { 222 | cublasSswap(handle, n, at(dA, i * n), 1, 223 | at(dA, pivots[i] * n), 1); 224 | } 225 | } 226 | 227 | } 228 | 229 | /*** 230 | * cudaStrtri Computes the inverse of an upper triangular matrix in place 231 | * Uses the unblocked BLAS2 approach 232 | * 233 | * @param n The size of the matrix 234 | * @param dA The matrix 235 | */ 236 | private static void cudaStrtri(cublasHandle handle, int n, Pointer dA) 237 | { 238 | float[] factor = { 0.0f }; 239 | Pointer pFactor = Pointer.to(factor); 240 | for (int i = 0; i < n; i++) 241 | { 242 | Pointer offset = at(dA, i * n); 243 | cublasGetVector(1, Sizeof.FLOAT, at(offset, i), 1, pFactor, 1); 244 | factor[0] = 1 / factor[0]; 245 | cublasSetVector(1, Sizeof.FLOAT, pFactor, 1, at(offset, i), 1); 246 | 247 | factor[0] = -factor[0]; 248 | cublasStrmv(handle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, 249 | CUBLAS_OP_N, i, dA, n, offset, 1); 250 | cublasSscal(handle, i, pFactor, offset, 1); 251 | } 252 | } 253 | 254 | // === Utility methods for this sample ==================================== 255 | 256 | /** 257 | * Multiplies the matrices A and B and writes the result into C. 
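* (This is a single cublasSgemm call that evaluates
* C = alpha*A*B + beta*C with alpha = 1 and beta = 0,
* on column-major data.)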
258 | * 259 | * @param size The size of the matrices 260 | * @param A Matrix A 261 | * @param B Matrix B 262 | * @param C Matrix C 263 | */ 264 | private static void multiply(cublasHandle handle, int size, float A[], 265 | float B[], float C[]) 266 | { 267 | Pointer dA = new Pointer(); 268 | Pointer dB = new Pointer(); 269 | Pointer dC = new Pointer(); 270 | 271 | cudaMalloc(dA, size * size * Sizeof.FLOAT); 272 | cudaMalloc(dB, size * size * Sizeof.FLOAT); 273 | cudaMalloc(dC, size * size * Sizeof.FLOAT); 274 | cublasSetVector(size * size, Sizeof.FLOAT, Pointer.to(A), 1, dA, 1); 275 | cublasSetVector(size * size, Sizeof.FLOAT, Pointer.to(B), 1, dB, 1); 276 | 277 | Pointer zero = Pointer.to(new float[]{ 0.0f }); 278 | Pointer one = Pointer.to(new float[]{ 1.0f }); 279 | cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size, one, 280 | dA, size, dB, size, zero, dC, size); 281 | 282 | cublasGetVector(size * size, Sizeof.FLOAT, dC, 1, Pointer.to(C), 1); 283 | cudaFree(dA); 284 | cudaFree(dB); 285 | cudaFree(dC); 286 | } 287 | 288 | } -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/jcudnn/samples/JCudnnMnistUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2020 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.jcudnn.samples; 7 | 8 | import static jcuda.runtime.JCuda.cudaDeviceReset; 9 | import static jcuda.runtime.JCuda.cudaDeviceSynchronize; 10 | import static jcuda.runtime.JCuda.cudaMalloc; 11 | import static jcuda.runtime.JCuda.cudaMemcpy; 12 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; 13 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; 14 | 15 | import java.io.ByteArrayOutputStream; 16 | import java.io.DataInputStream; 17 | import java.io.File; 18 | import java.io.FileInputStream; 19 | import java.io.IOException; 20 | import java.io.InputStream; 21 | import java.nio.ByteBuffer; 22 | import java.nio.ByteOrder; 23 | import java.nio.FloatBuffer; 24 | 25 | import jcuda.CudaException; 26 | import jcuda.Pointer; 27 | import jcuda.Sizeof; 28 | import jcuda.jcudnn.cudnnDataType; 29 | 30 | /** 31 | * Utility methods for the JCudnnMnist sample. These are mainly file IO 32 | * methods for the sample files that contain the binary data of the 33 | * trained network, and the images. 
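* <p>
* The images are binary PGM (P5) files whose 8 bit gray values are
* normalized to the range [0,1]: each pixel byte b is mapped to
* ((b & 0xff) / 255.0), so for example the byte value 0xff becomes 1.0.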
34 | */ 35 | class JCudnnMnistUtils 36 | { 37 | static Pointer readBinaryFileAsDeviceDataUnchecked( 38 | String fileName, int dataType) 39 | { 40 | if (dataType == cudnnDataType.CUDNN_DATA_FLOAT) 41 | { 42 | float data[] = readBinaryFileAsFloatsUnchecked(fileName); 43 | return createDevicePointer(data); 44 | } 45 | if (dataType == cudnnDataType.CUDNN_DATA_DOUBLE) 46 | { 47 | float data[] = readBinaryFileAsFloatsUnchecked(fileName); 48 | double doubleData[] = toDouble(data); 49 | return createDevicePointer(doubleData); 50 | } 51 | throw new IllegalArgumentException( 52 | "Invalid data type: " + cudnnDataType.stringFor(dataType)); 53 | } 54 | 55 | private static float[] readBinaryFileAsFloatsUnchecked(String fileName) 56 | { 57 | try 58 | { 59 | return readBinaryFileAsFloats(fileName); 60 | } 61 | catch (IOException e) 62 | { 63 | cudaDeviceReset(); 64 | throw new CudaException("Could not read input file", e); 65 | } 66 | } 67 | 68 | private static float[] readBinaryFileAsFloats(String fileName) 69 | throws IOException 70 | { 71 | FileInputStream fis = new FileInputStream(new File(fileName)); 72 | byte data[] = readFully(fis); 73 | ByteBuffer bb = ByteBuffer.wrap(data); 74 | bb.order(ByteOrder.nativeOrder()); 75 | FloatBuffer fb = bb.asFloatBuffer(); 76 | float result[] = new float[fb.capacity()]; 77 | fb.get(result); 78 | return result; 79 | } 80 | 81 | private static double[] toDouble(float array[]) 82 | { 83 | double result[] = new double[array.length]; 84 | for (int i = 0; i < array.length; i++) 85 | { 86 | result[i] = array[i]; 87 | } 88 | return result; 89 | } 90 | 91 | private static byte[] readFully(InputStream inputStream) throws IOException 92 | { 93 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 94 | byte buffer[] = new byte[1024]; 95 | while (true) 96 | { 97 | int n = inputStream.read(buffer); 98 | if (n < 0) 99 | { 100 | break; 101 | } 102 | baos.write(buffer, 0, n); 103 | } 104 | byte data[] = baos.toByteArray(); 105 | return data; 106 | } 107 | 108 | static Pointer readImageDataUnchecked(String fileName, int dataType) 109 | { 110 | if (dataType == cudnnDataType.CUDNN_DATA_FLOAT) 111 | { 112 | float data[] = readImageDataAsFloatsUnchecked(fileName); 113 | return Pointer.to(data); 114 | } 115 | if (dataType == cudnnDataType.CUDNN_DATA_DOUBLE) 116 | { 117 | double data[] = readImageDataAsDoublesUnchecked(fileName); 118 | return Pointer.to(data); 119 | } 120 | throw new IllegalArgumentException( 121 | "Invalid data type: " + cudnnDataType.stringFor(dataType)); 122 | } 123 | 124 | private static double[] readImageDataAsDoublesUnchecked(String fileName) 125 | { 126 | try 127 | { 128 | return readImageDataAsDoubles(fileName); 129 | } 130 | catch (IOException e) 131 | { 132 | cudaDeviceReset(); 133 | throw new CudaException("Could not read input file", e); 134 | } 135 | } 136 | 137 | private static double[] readImageDataAsDoubles(String fileName) throws IOException 138 | { 139 | InputStream is = new FileInputStream(new File(fileName)); 140 | byte data[] = readBinaryPortableGraymap8bitData(is); 141 | double imageData[] = new double[data.length]; 142 | for (int i = 0; i < data.length; i++) 143 | { 144 | imageData[i] = (((int) data[i]) & 0xff) / 255.0; 145 | } 146 | return imageData; 147 | } 148 | 149 | private static float[] readImageDataAsFloatsUnchecked(String fileName) 150 | { 151 | try 152 | { 153 | return readImageDataAsFloats(fileName); 154 | } 155 | catch (IOException e) 156 | { 157 | cudaDeviceReset(); 158 | throw new CudaException("Could not read input file", e); 
159 | } 160 | } 161 | 162 | private static float[] readImageDataAsFloats(String fileName) throws IOException 163 | { 164 | InputStream is = new FileInputStream(new File(fileName)); 165 | byte data[] = readBinaryPortableGraymap8bitData(is); 166 | float imageData[] = new float[data.length]; 167 | for (int i = 0; i < data.length; i++) 168 | { 169 | imageData[i] = (((int) data[i]) & 0xff) / 255.0f; 170 | } 171 | return imageData; 172 | } 173 | 174 | @SuppressWarnings("deprecation") 175 | private static byte[] readBinaryPortableGraymap8bitData( 176 | InputStream inputStream) throws IOException 177 | { 178 | DataInputStream dis = new DataInputStream(inputStream); 179 | String line = null; 180 | boolean firstLine = true; 181 | Integer width = null; 182 | Integer maxBrightness = null; 183 | while (true) 184 | { 185 | // The DataInputStream#readLine is deprecated, 186 | // but for ASCII input, it is safe to use it 187 | line = dis.readLine(); 188 | if (line == null) 189 | { 190 | break; 191 | } 192 | line = line.trim(); 193 | if (line.startsWith("#")) 194 | { 195 | continue; 196 | } 197 | if (firstLine) 198 | { 199 | firstLine = false; 200 | if (!line.equals("P5")) 201 | { 202 | throw new IOException( 203 | "Data is not a binary portable " + 204 | "graymap (P5), but " + line); 205 | } 206 | else 207 | { 208 | continue; 209 | } 210 | } 211 | if (width == null) 212 | { 213 | String tokens[] = line.split(" "); 214 | if (tokens.length < 2) 215 | { 216 | throw new IOException( 217 | "Expected dimensions, found " + line); 218 | } 219 | width = parseInt(tokens[0]); 220 | } 221 | else if (maxBrightness == null) 222 | { 223 | maxBrightness = parseInt(line); 224 | if (maxBrightness > 255) 225 | { 226 | throw new IOException( 227 | "Only 8 bit values supported. " + 228 | "Maximum value is " + maxBrightness); 229 | } 230 | break; 231 | } 232 | } 233 | byte data[] = readFully(inputStream); 234 | return data; 235 | } 236 | 237 | private static Integer parseInt(String s) throws IOException 238 | { 239 | try 240 | { 241 | return Integer.parseInt(s); 242 | } 243 | catch (NumberFormatException e) 244 | { 245 | throw new IOException(e); 246 | } 247 | } 248 | 249 | static void printDeviceVector(int size, Pointer d, int dataType) 250 | { 251 | if (dataType == cudnnDataType.CUDNN_DATA_FLOAT) 252 | { 253 | printFloatDeviceVector(size, d); 254 | } 255 | else if (dataType == cudnnDataType.CUDNN_DATA_DOUBLE) 256 | { 257 | printDoubleDeviceVector(size, d); 258 | } 259 | else 260 | { 261 | throw new IllegalArgumentException( 262 | "Invalid data type: " + cudnnDataType.stringFor(dataType)); 263 | } 264 | } 265 | 266 | private static void printFloatDeviceVector(int size, Pointer d) 267 | { 268 | float h[] = new float[size]; 269 | cudaDeviceSynchronize(); 270 | cudaMemcpy(Pointer.to(h), d, size * Sizeof.FLOAT, 271 | cudaMemcpyDeviceToHost); 272 | for (int i = 0; i < size; i++) 273 | { 274 | System.out.print(h[i] + " "); 275 | } 276 | System.out.println(); 277 | } 278 | private static void printDoubleDeviceVector(int size, Pointer d) 279 | { 280 | double h[] = new double[size]; 281 | cudaDeviceSynchronize(); 282 | cudaMemcpy(Pointer.to(h), d, size * Sizeof.DOUBLE, 283 | cudaMemcpyDeviceToHost); 284 | for (int i = 0; i < size; i++) 285 | { 286 | System.out.print(h[i] + " "); 287 | } 288 | System.out.println(); 289 | } 290 | 291 | static int computeIndexOfMax(Pointer d, int length, int dataType) 292 | { 293 | if (dataType == cudnnDataType.CUDNN_DATA_FLOAT) 294 | { 295 | return computeIndexOfMaxFloat(d, length); 296 | } 297 | if 
(dataType == cudnnDataType.CUDNN_DATA_DOUBLE) 298 | { 299 | return computeIndexOfMaxDouble(d, length); 300 | } 301 | throw new IllegalArgumentException( 302 | "Invalid data type: " + cudnnDataType.stringFor(dataType)); 303 | } 304 | 305 | private static int computeIndexOfMaxFloat(Pointer d, int length) 306 | { 307 | float result[] = new float[length]; 308 | cudaMemcpy(Pointer.to(result), d, 309 | length * Sizeof.FLOAT, 310 | cudaMemcpyDeviceToHost); 311 | int id = 0; 312 | for (int i = 1; i < length; i++) 313 | { 314 | if (result[id] < result[i]) 315 | id = i; 316 | } 317 | return id; 318 | } 319 | 320 | private static int computeIndexOfMaxDouble(Pointer d, int length) 321 | { 322 | double result[] = new double[length]; 323 | cudaMemcpy(Pointer.to(result), d, 324 | length * Sizeof.DOUBLE, 325 | cudaMemcpyDeviceToHost); 326 | int id = 0; 327 | for (int i = 1; i < length; i++) 328 | { 329 | if (result[id] < result[i]) 330 | id = i; 331 | } 332 | return id; 333 | } 334 | 335 | private static Pointer createDevicePointer(float data[]) 336 | { 337 | int size = data.length * Sizeof.FLOAT; 338 | Pointer deviceData = new Pointer(); 339 | cudaMalloc(deviceData, size); 340 | cudaMemcpy(deviceData, Pointer.to(data), size, cudaMemcpyHostToDevice); 341 | return deviceData; 342 | } 343 | 344 | private static Pointer createDevicePointer(double data[]) 345 | { 346 | int size = data.length * Sizeof.DOUBLE; 347 | Pointer deviceData = new Pointer(); 348 | cudaMalloc(deviceData, size); 349 | cudaMemcpy(deviceData, Pointer.to(data), size, cudaMemcpyHostToDevice); 350 | return deviceData; 351 | } 352 | 353 | } 354 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/driver/samples/JCudaDriverStreamCallbacks.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2017 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.driver.samples; 7 | 8 | import static jcuda.driver.JCudaDriver.cuCtxCreate; 9 | import static jcuda.driver.JCudaDriver.cuCtxSetCurrent; 10 | import static jcuda.driver.JCudaDriver.cuDeviceGet; 11 | import static jcuda.driver.JCudaDriver.cuInit; 12 | import static jcuda.driver.JCudaDriver.cuLaunchKernel; 13 | import static jcuda.driver.JCudaDriver.cuMemAlloc; 14 | import static jcuda.driver.JCudaDriver.cuMemHostAlloc; 15 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoHAsync; 16 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoDAsync; 17 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction; 18 | import static jcuda.driver.JCudaDriver.cuModuleLoadData; 19 | import static jcuda.driver.JCudaDriver.cuStreamAddCallback; 20 | import static jcuda.driver.JCudaDriver.cuStreamCreate; 21 | import static jcuda.nvrtc.JNvrtc.nvrtcCompileProgram; 22 | import static jcuda.nvrtc.JNvrtc.nvrtcCreateProgram; 23 | import static jcuda.nvrtc.JNvrtc.nvrtcDestroyProgram; 24 | import static jcuda.nvrtc.JNvrtc.nvrtcGetPTX; 25 | 26 | import java.nio.ByteBuffer; 27 | import java.nio.ByteOrder; 28 | import java.nio.IntBuffer; 29 | import java.util.concurrent.CancellationException; 30 | import java.util.concurrent.ExecutionException; 31 | import java.util.concurrent.ExecutorService; 32 | import java.util.concurrent.Future; 33 | import java.util.concurrent.LinkedBlockingQueue; 34 | import java.util.concurrent.ThreadPoolExecutor; 35 | import java.util.concurrent.TimeUnit; 36 | 37 | import jcuda.Pointer; 38 | import jcuda.Sizeof; 39 | 
import jcuda.driver.CUcontext; 40 | import jcuda.driver.CUdevice; 41 | import jcuda.driver.CUdeviceptr; 42 | import jcuda.driver.CUfunction; 43 | import jcuda.driver.CUmodule; 44 | import jcuda.driver.CUstream; 45 | import jcuda.driver.CUstreamCallback; 46 | import jcuda.driver.JCudaDriver; 47 | import jcuda.nvrtc.JNvrtc; 48 | import jcuda.nvrtc.nvrtcProgram; 49 | 50 | /** 51 | * An example showing stream callbacks involving multiple streams 52 | * and threads 53 | */ 54 | public class JCudaDriverStreamCallbacks 55 | { 56 | /** 57 | * A kernel that increments all elements of an int array by 1 58 | */ 59 | private static String programSourceCode = 60 | "extern \"C\"" + "\n" + 61 | "__global__ void example(int n, int *data)" + "\n" + 62 | "{" + "\n" + 63 | " int i = blockIdx.x * blockDim.x + threadIdx.x;" + "\n" + 64 | " if (i()) 308 | { 309 | @Override 310 | protected void afterExecute(Runnable r, Throwable t) 311 | { 312 | super.afterExecute(r, t); 313 | if (t == null && r instanceof Future) 314 | { 315 | try 316 | { 317 | Future future = (Future) r; 318 | if (future.isDone()) 319 | { 320 | future.get(); 321 | } 322 | } 323 | catch (CancellationException ce) 324 | { 325 | t = ce; 326 | } 327 | catch (ExecutionException ee) 328 | { 329 | t = ee.getCause(); 330 | } 331 | catch (InterruptedException ie) 332 | { 333 | Thread.currentThread().interrupt(); 334 | } 335 | } 336 | if (t != null) 337 | { 338 | throw new RuntimeException(t); 339 | } 340 | } 341 | }; 342 | e.allowCoreThreadTimeOut(true); 343 | return e; 344 | } 345 | 346 | } 347 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/samples/utils/JCudaSamplesUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA 3 | * 4 | * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org 5 | */ 6 | package jcuda.samples.utils; 7 | 8 | import static jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR; 9 | import static jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR; 10 | import static jcuda.driver.JCudaDriver.cuCtxGetDevice; 11 | import static jcuda.driver.JCudaDriver.cuDeviceGetAttribute; 12 | 13 | import java.io.ByteArrayOutputStream; 14 | import java.io.File; 15 | import java.io.IOException; 16 | import java.io.InputStream; 17 | import java.util.Locale; 18 | import java.util.Random; 19 | import java.util.logging.Logger; 20 | 21 | import jcuda.CudaException; 22 | import jcuda.driver.CUdevice; 23 | import jcuda.driver.CUresult; 24 | 25 | /** 26 | * Utility methods that are used in the JCuda samples.
27 | *
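* A typical call, as used by the samples to compile their kernels at
* startup:
* <pre>
* String ptxFileName = JCudaSamplesUtils.preparePtxFile(
*     "src/main/resources/kernels/JCudaReductionKernel.cu");
* </pre>
*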
28 | * NOTE: This class is not part of a public API. It is only intended for 29 | * use in the samples. Parts of its functionality could be replaced 30 | * with the runtime compilation features that have been added in CUDA 7.5. 31 | */ 32 | public class JCudaSamplesUtils 33 | { 34 | /** 35 | * The logger used in this class 36 | */ 37 | private static final Logger logger = 38 | Logger.getLogger(JCudaSamplesUtils.class.getName()); 39 | 40 | /** 41 | * Compiles the given CUDA file into a PTX file using NVCC, and returns 42 | * the name of the resulting PTX file 43 | * 44 | * @param cuFileName The CUDA file name 45 | * @return The PTX file name 46 | * @throws CudaException If an error occurs - i.e. when the input file 47 | * does not exist, or the NVCC call caused an error. 48 | */ 49 | public static String preparePtxFile(String cuFileName) 50 | { 51 | return invokeNvcc(cuFileName, "ptx", true); 52 | } 53 | 54 | /** 55 | * Compiles the given CUDA file into a CUBIN file using NVCC, and returns 56 | * the name of the resulting CUBIN file. By default, NVCC will be 57 | * invoked with the -dlink parameter, and an 58 | * -arch parameter for the compute capability of the 59 | * device of the current context.
60 | *
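* For example, on a device with compute capability 5.2, NVCC will be
* invoked with -arch=sm_52.
*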
61 | * Note that there must be a current context when this function 62 | * is called! 63 | * 64 | * @param cuFileName The CUDA file name 65 | * @return The CUBIN file name 66 | * @throws CudaException If an error occurs - i.e. when the input file 67 | * does not exist, or the NVCC call caused an error. 68 | * @throws CudaException If there is no current context 69 | */ 70 | public static String prepareDefaultCubinFile(String cuFileName) 71 | { 72 | int computeCapability = computeComputeCapability(); 73 | String nvccArguments[] = new String[] { 74 | "-dlink", 75 | "-arch=sm_"+computeCapability 76 | }; 77 | return invokeNvcc(cuFileName, "cubin", true, nvccArguments); 78 | } 79 | 80 | /** 81 | * Tries to create a PTX or CUBIN file for the given CUDA file.
82 | *
83 | * The extension of the given file name is replaced with 84 | * "cubin" or "ptx", depending on the 85 | * targetFileType.
86 | *
87 | * If the file with the resulting name does not exist yet, or if 88 | * forceRebuild is true, then it is compiled 89 | * from the given file using NVCC, using the given parameters.
90 | *
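* For a 64 bit JVM, a "ptx" target and no extra arguments, the resulting
* command has the form (file names here are only illustrative):
* <pre>
* nvcc -m64 -ptx kernels/SomeKernel.cu -o kernels/SomeKernel.ptx
* </pre>
*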
91 | * The name of the resulting output file is returned. 92 | * 93 | * @param cuFileName The name of the .CU file 94 | * @param targetFileType The target file type. Must be "cubin" 95 | * or "ptx" (case-insensitively) 96 | * @param forceRebuild Whether the PTX file should be created even if 97 | * it already exists 98 | * @return The name of the PTX file 99 | * @throws CudaException If an error occurs - i.e. when the input file 100 | * does not exist, or the NVCC call caused an error. 101 | * @throws IllegalArgumentException If the target file type is not valid 102 | */ 103 | private static String invokeNvcc( 104 | String cuFileName, String targetFileType, 105 | boolean forceRebuild, String ... nvccArguments) 106 | { 107 | if (!"cubin".equalsIgnoreCase(targetFileType) && 108 | !"ptx".equalsIgnoreCase(targetFileType)) 109 | { 110 | throw new IllegalArgumentException( 111 | "Target file type must be \"ptx\" or \"cubin\", but is " + 112 | targetFileType); 113 | } 114 | logger.info("Creating " + targetFileType + " file for " + cuFileName); 115 | 116 | int dotIndex = cuFileName.lastIndexOf('.'); 117 | if (dotIndex == -1) 118 | { 119 | dotIndex = cuFileName.length(); 120 | } 121 | String otuputFileName = cuFileName.substring(0, dotIndex) + 122 | "." + targetFileType.toLowerCase(); 123 | File ptxFile = new File(otuputFileName); 124 | if (ptxFile.exists() && !forceRebuild) 125 | { 126 | return otuputFileName; 127 | } 128 | 129 | File cuFile = new File(cuFileName); 130 | if (!cuFile.exists()) 131 | { 132 | throw new CudaException("Input file not found: " + cuFileName + 133 | " (" + cuFile.getAbsolutePath() + ")"); 134 | } 135 | String modelString = "-m" + System.getProperty("sun.arch.data.model"); 136 | String command = "nvcc "; 137 | command += modelString + " "; 138 | command += "-" + targetFileType + " "; 139 | for (String a : nvccArguments) 140 | { 141 | command += a + " "; 142 | } 143 | command += cuFileName + " -o " + otuputFileName; 144 | 145 | logger.info("Executing\n" + command); 146 | try 147 | { 148 | Process process = Runtime.getRuntime().exec(command); 149 | 150 | String errorMessage = 151 | new String(toByteArray(process.getErrorStream())); 152 | String outputMessage = 153 | new String(toByteArray(process.getInputStream())); 154 | int exitValue = 0; 155 | try 156 | { 157 | exitValue = process.waitFor(); 158 | } 159 | catch (InterruptedException e) 160 | { 161 | Thread.currentThread().interrupt(); 162 | throw new CudaException( 163 | "Interrupted while waiting for nvcc output", e); 164 | } 165 | if (exitValue != 0) 166 | { 167 | logger.severe("nvcc process exitValue " + exitValue); 168 | logger.severe("errorMessage:\n" + errorMessage); 169 | logger.severe("outputMessage:\n" + outputMessage); 170 | throw new CudaException("Could not create " + targetFileType + 171 | " file: " + errorMessage); 172 | } 173 | } 174 | catch (IOException e) 175 | { 176 | throw new CudaException("Could not create " + targetFileType + 177 | " file", e); 178 | } 179 | 180 | logger.info("Finished creating " + targetFileType + " file"); 181 | return otuputFileName; 182 | } 183 | 184 | /** 185 | * Fully reads the given InputStream and returns it as a byte array 186 | * 187 | * @param inputStream The input stream to read 188 | * @return The byte array containing the data from the input stream 189 | * @throws IOException If an I/O error occurs 190 | */ 191 | private static byte[] toByteArray(InputStream inputStream) 192 | throws IOException 193 | { 194 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 195 
| byte buffer[] = new byte[8192]; 196 | while (true) 197 | { 198 | int read = inputStream.read(buffer); 199 | if (read == -1) 200 | { 201 | break; 202 | } 203 | baos.write(buffer, 0, read); 204 | } 205 | return baos.toByteArray(); 206 | } 207 | 208 | /** 209 | * Compute the compute capability of the device of the current 210 | * context. The compute capability will be returned as an int value 211 | * major * 10 + minor. For example, the return value 212 | * will be 52 for a device with compute capability 5.2. 213 | * 214 | * @return The compute capability of the current device 215 | * @throws CudaException If there is no current context 216 | */ 217 | private static int computeComputeCapability() 218 | { 219 | CUdevice device = new CUdevice(); 220 | int status = cuCtxGetDevice(device); 221 | if (status != CUresult.CUDA_SUCCESS) 222 | { 223 | throw new CudaException(CUresult.stringFor(status)); 224 | } 225 | return computeComputeCapability(device); 226 | } 227 | 228 | 229 | /** 230 | * Compute the compute capability of the given device. The compute 231 | * capability will be returned as an int value 232 | * major * 10 + minor. For example, the return value 233 | * will be 52 for a device with compute capability 5.2. 234 | * 235 | * @param device The device 236 | * @return The compute capability 237 | */ 238 | private static int computeComputeCapability(CUdevice device) 239 | { 240 | int majorArray[] = { 0 }; 241 | int minorArray[] = { 0 }; 242 | cuDeviceGetAttribute(majorArray, 243 | CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); 244 | cuDeviceGetAttribute(minorArray, 245 | CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device); 246 | int major = majorArray[0]; 247 | int minor = minorArray[0]; 248 | return major * 10 + minor; 249 | } 250 | 251 | 252 | /** 253 | * Creates an array of the specified size, containing float values from 254 | * the range [0.0f, 1.0f) 255 | * 256 | * @param n The size of the array 257 | * @return The array of random values 258 | */ 259 | public static float[] createRandomFloatData(int n) 260 | { 261 | Random random = new Random(0); 262 | float a[] = new float[n]; 263 | for (int i = 0; i < n; i++) 264 | { 265 | a[i] = random.nextFloat(); 266 | } 267 | return a; 268 | } 269 | 270 | /** 271 | * Compares the given result against a reference, and returns whether the 272 | * error norm is below a small epsilon threshold 273 | * 274 | * @param result The result 275 | * @param reference The reference 276 | * @return Whether the arrays are equal based on the error norm 277 | * @throws NullPointerException If any argument is null 278 | * @throws IllegalArgumentException If the arrays have different lengths 279 | */ 280 | public static boolean equalByNorm(float result[], float reference[]) 281 | { 282 | if (result == null) 283 | { 284 | throw new NullPointerException("The result is null"); 285 | } 286 | if (reference == null) 287 | { 288 | throw new NullPointerException("The reference is null"); 289 | } 290 | if (result.length != reference.length) 291 | { 292 | throw new IllegalArgumentException( 293 | "The result and reference array have different lengths: " + 294 | result.length + " and " + reference.length); 295 | } 296 | final float epsilon = 1e-6f; 297 | float errorNorm = 0; 298 | float refNorm = 0; 299 | for (int i = 0; i < result.length; ++i) 300 | { 301 | float diff = reference[i] - result[i]; 302 | errorNorm += diff * diff; 303 | refNorm += reference[i] * reference[i]; 304 | } 305 | errorNorm = (float) Math.sqrt(errorNorm); 306 | refNorm = (float) 
Math.sqrt(refNorm); 307 | if (Math.abs(refNorm) < epsilon) 308 | { 309 | return false; 310 | } 311 | return (errorNorm / refNorm < epsilon); 312 | } 313 | 314 | 315 | /** 316 | * Creates a string representation of the given array as a matrix 317 | * with the given number of columns. 318 | * 319 | * @param a The array 320 | * @param columns The number of columns 321 | * @return The string representation 322 | */ 323 | public static String toString2D(float[] a, int columns) 324 | { 325 | StringBuilder sb = new StringBuilder(); 326 | for (int i = 0; i < a.length; i++) 327 | { 328 | if ((i > 0) && (i % columns == 0)) 329 | { 330 | sb.append("\n"); 331 | } 332 | sb.append(String.format(Locale.ENGLISH, "%7.4f ", a[i])); 333 | } 334 | return sb.toString(); 335 | } 336 | 337 | 338 | 339 | } 340 | -------------------------------------------------------------------------------- /JCudaSamples/src/main/java/jcuda/driver/samples/JCudaReduction.java: -------------------------------------------------------------------------------- 1 | /* 2 | * JCuda - Java bindings for NVIDIA CUDA driver and runtime API 3 | * http://www.jcuda.org 4 | * 5 | * Copyright 2011-2018 Marco Hutter - http://www.jcuda.org 6 | */ 7 | package jcuda.driver.samples; 8 | 9 | import static jcuda.driver.JCudaDriver.cuCtxCreate; 10 | import static jcuda.driver.JCudaDriver.cuCtxDestroy; 11 | import static jcuda.driver.JCudaDriver.cuCtxSynchronize; 12 | import static jcuda.driver.JCudaDriver.cuDeviceGet; 13 | import static jcuda.driver.JCudaDriver.cuInit; 14 | import static jcuda.driver.JCudaDriver.cuLaunchKernel; 15 | import static jcuda.driver.JCudaDriver.cuMemAlloc; 16 | import static jcuda.driver.JCudaDriver.cuMemFree; 17 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH; 18 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoD; 19 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction; 20 | import static jcuda.driver.JCudaDriver.cuModuleLoad; 21 | import static jcuda.driver.JCudaDriver.cuModuleUnload; 22 | 23 | import java.util.Locale; 24 | import java.util.Random; 25 | 26 | import jcuda.Pointer; 27 | import jcuda.Sizeof; 28 | import jcuda.driver.CUcontext; 29 | import jcuda.driver.CUdevice; 30 | import jcuda.driver.CUdeviceptr; 31 | import jcuda.driver.CUfunction; 32 | import jcuda.driver.CUmodule; 33 | import jcuda.driver.JCudaDriver; 34 | import jcuda.samples.utils.JCudaSamplesUtils; 35 | 36 | /** 37 | * Example of a reduction. 
It is based on the NVIDIA 'reduction' sample, 38 | * and uses an adapted version of one of the kernels presented in 39 | * this sample (see src/main/resources/kernels/JCudaReductionKernel.cu) 40 | */ 41 | public class JCudaReduction 42 | { 43 | /** 44 | * The CUDA context created by this sample 45 | */ 46 | private static CUcontext context; 47 | 48 | /** 49 | * The module which is loaded in form of a PTX file 50 | */ 51 | private static CUmodule module; 52 | 53 | /** 54 | * The actual kernel function from the module 55 | */ 56 | private static CUfunction function; 57 | 58 | /** 59 | * Temporary memory for the device output 60 | */ 61 | private static CUdeviceptr deviceBuffer; 62 | 63 | /** 64 | * Entry point of this sample 65 | * 66 | * @param args Not used 67 | */ 68 | public static void main(String args[]) 69 | { 70 | // Enable exceptions and omit all subsequent error checks 71 | JCudaDriver.setExceptionsEnabled(true); 72 | 73 | init(); 74 | boolean passed = true; 75 | for (int n = 100000; n <= 26500000; n *= 2) 76 | { 77 | float hostInput[] = createRandomArray(n); 78 | 79 | long timeNs0 = 0; 80 | long timeNs1 = 0; 81 | 82 | // Copy the input data to the device 83 | timeNs0 = System.nanoTime(); 84 | CUdeviceptr deviceInput = new CUdeviceptr(); 85 | cuMemAlloc(deviceInput, hostInput.length * Sizeof.FLOAT); 86 | cuMemcpyHtoD(deviceInput, Pointer.to(hostInput), 87 | hostInput.length * Sizeof.FLOAT); 88 | timeNs1 = System.nanoTime(); 89 | long durationCopyNs = timeNs1 - timeNs0; 90 | 91 | // Execute the reduction with CUDA 92 | timeNs0 = System.nanoTime(); 93 | float resultJCuda = reduce(deviceInput, hostInput.length); 94 | timeNs1 = System.nanoTime(); 95 | long durationCompNs = timeNs1 - timeNs0; 96 | 97 | cuMemFree(deviceInput); 98 | 99 | // Execute the reduction with Java 100 | timeNs0 = System.nanoTime(); 101 | float resultJava = reduceHost(hostInput); 102 | timeNs1 = System.nanoTime(); 103 | long durationJavaNs = timeNs1 - timeNs0; 104 | 105 | System.out.println("Reduction of " + n + " elements"); 106 | System.out.printf(Locale.ENGLISH, 107 | " JCuda: %7.3f ms, result: %f " + 108 | "(copy: %7.3f ms, comp: %7.3f ms)\n", 109 | (durationCopyNs + durationCompNs) / 1e6, resultJCuda, 110 | durationCopyNs / 1e6, durationCompNs / 1e6); 111 | System.out.printf(Locale.ENGLISH, 112 | " Java : %7.3f ms, result: %f\n", 113 | durationJavaNs / 1e6, resultJava); 114 | 115 | passed &= 116 | Math.abs(resultJCuda - resultJava) < resultJava * 1e-5; 117 | 118 | } 119 | System.out.println("Test " + (passed ? 
"PASSED" : "FAILED")); 120 | 121 | shutdown(); 122 | } 123 | 124 | 125 | /** 126 | * Implementation of a Kahan summation reduction in plain Java 127 | * 128 | * @param input The input 129 | * @return The reduction result 130 | */ 131 | private static float reduceHost(float data[]) 132 | { 133 | float sum = data[0]; 134 | float c = 0.0f; 135 | for (int i = 1; i < data.length; i++) 136 | { 137 | float y = data[i] - c; 138 | float t = sum + y; 139 | c = (t - sum) - y; 140 | sum = t; 141 | } 142 | return sum; 143 | } 144 | 145 | 146 | /** 147 | * Initialize the context, module, function and other elements used 148 | * in this sample 149 | */ 150 | private static void init() 151 | { 152 | // Initialize the driver API and create a context for the first device 153 | cuInit(0); 154 | CUdevice device = new CUdevice(); 155 | cuDeviceGet(device, 0); 156 | context = new CUcontext(); 157 | cuCtxCreate(context, 0, device); 158 | 159 | // Create the PTX file by calling the NVCC 160 | String ptxFileName = JCudaSamplesUtils.preparePtxFile( 161 | "src/main/resources/kernels/JCudaReductionKernel.cu"); 162 | 163 | // Load the module from the PTX file 164 | module = new CUmodule(); 165 | cuModuleLoad(module, ptxFileName); 166 | 167 | // Obtain a function pointer to the "reduce" function. 168 | function = new CUfunction(); 169 | cuModuleGetFunction(function, module, "reduce"); 170 | 171 | // Allocate a chunk of temporary memory (must be at least 172 | // numberOfBlocks * Sizeof.FLOAT) 173 | deviceBuffer = new CUdeviceptr(); 174 | cuMemAlloc(deviceBuffer, 1024 * Sizeof.FLOAT); 175 | 176 | } 177 | 178 | /** 179 | * Release all resources allocated by this class 180 | */ 181 | private static void shutdown() 182 | { 183 | cuModuleUnload(module); 184 | cuMemFree(deviceBuffer); 185 | cuCtxDestroy(context); 186 | } 187 | 188 | /** 189 | * Performs a reduction on the given device memory with the given 190 | * number of elements. 191 | * 192 | * @param deviceInput The device input memory 193 | * @param numElements The number of elements to reduce 194 | * @return The reduction result 195 | */ 196 | private static float reduce( 197 | Pointer deviceInput, int numElements) 198 | { 199 | return reduce(deviceInput, numElements, 128, 64); 200 | } 201 | 202 | 203 | /** 204 | * Performs a reduction on the given device memory with the given 205 | * number of elements and the specified limits for threads and 206 | * blocks. 207 | * 208 | * @param deviceInput The device input memory 209 | * @param numElements The number of elements to reduce 210 | * @param maxThreads The maximum number of threads 211 | * @param maxBlocks The maximum number of blocks 212 | * @return The reduction result 213 | */ 214 | private static float reduce( 215 | Pointer deviceInput, int numElements, 216 | int maxThreads, int maxBlocks) 217 | { 218 | // Determine the number of threads and blocks for the input 219 | int numBlocks = getNumBlocks(numElements, maxBlocks, maxThreads); 220 | int numThreads = getNumThreads(numElements, maxBlocks, maxThreads); 221 | 222 | // Call the main reduction method 223 | float result = reduce(numElements, numThreads, numBlocks, 224 | maxThreads, maxBlocks, deviceInput); 225 | return result; 226 | } 227 | 228 | 229 | 230 | /** 231 | * Performs a reduction on the given device memory. 

    /**
     * Performs a reduction on the given device memory.
     *
     * @param n The number of elements for the reduction
     * @param numThreads The number of threads
     * @param numBlocks The number of blocks
     * @param maxThreads The maximum number of threads
     * @param maxBlocks The maximum number of blocks
     * @param deviceInput The input memory
     * @return The reduction result
     */
    private static float reduce(
        int n, int numThreads, int numBlocks,
        int maxThreads, int maxBlocks, Pointer deviceInput)
    {
        // Perform a "tree like" reduction as in the NVIDIA sample:
        // each pass reduces the data to one partial sum per block,
        // until only a single value remains
        reduce(n, numThreads, numBlocks, deviceInput, deviceBuffer);
        int s = numBlocks;
        while (s > 1)
        {
            int threads = getNumThreads(s, maxBlocks, maxThreads);
            int blocks = getNumBlocks(s, maxBlocks, maxThreads);

            reduce(s, threads, blocks, deviceBuffer, deviceBuffer);
            s = (s + (threads * 2 - 1)) / (threads * 2);
        }

        float result[] = { 0.0f };
        cuMemcpyDtoH(Pointer.to(result), deviceBuffer, Sizeof.FLOAT);
        return result[0];
    }

    /**
     * Perform a reduction of the specified number of elements in the given
     * device input memory, using the given number of threads and blocks,
     * and write the results into the given output memory.
     *
     * @param size The size (number of elements)
     * @param threads The number of threads
     * @param blocks The number of blocks
     * @param deviceInput The device input memory
     * @param deviceOutput The device output memory. Its size must be
     * at least blocks * Sizeof.FLOAT
     */
    private static void reduce(int size, int threads, int blocks,
        Pointer deviceInput, Pointer deviceOutput)
    {
        // Compute the shared memory size (as done in the NVIDIA sample):
        // when there is only a single warp per block, two warps' worth
        // of shared memory is allocated so that the unrolled warp-level
        // steps of the kernel do not read out of bounds
        int sharedMemSize = threads * Sizeof.FLOAT;
        if (threads <= 32)
        {
            sharedMemSize *= 2;
        }

        // Set up the kernel parameters: a pointer to an array
        // of pointers which point to the actual values
        Pointer kernelParameters = Pointer.to(
            Pointer.to(deviceInput),
            Pointer.to(deviceOutput),
            Pointer.to(new int[] { size })
        );

        // Call the kernel function and wait for it to complete
        cuLaunchKernel(function,
            blocks, 1, 1,          // Grid dimension
            threads, 1, 1,         // Block dimension
            sharedMemSize, null,   // Shared memory size and stream
            kernelParameters, null // Kernel- and extra parameters
        );
        cuCtxSynchronize();
    }
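
    // NOTE (added illustration): tracing the multi-pass reduction above
    // for n = 100000 with the default limits (maxThreads = 128,
    // maxBlocks = 64):
    //
    //   Pass 1: 64 blocks of 128 threads reduce the input to s = 64
    //           partial sums in deviceBuffer
    //   Pass 2: s = 64 < 2 * 128, so threads = nextPow2((64 + 1) / 2) = 32
    //           and blocks = (64 + 63) / 64 = 1; a single block reduces
    //           the 64 partial sums, and s becomes (64 + 63) / 64 = 1
    //
    // The loop then terminates, and the single remaining value is
    // copied back to the host.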

    /**
     * Compute the number of blocks that should be used for the
     * given input size and limits
     *
     * @param n The input size
     * @param maxBlocks The maximum number of blocks
     * @param maxThreads The maximum number of threads
     * @return The number of blocks
     */
    private static int getNumBlocks(int n, int maxBlocks, int maxThreads)
    {
        // Each thread processes two elements per pass
        int threads = getNumThreads(n, maxBlocks, maxThreads);
        int blocks = (n + (threads * 2 - 1)) / (threads * 2);
        return Math.min(maxBlocks, blocks);
    }

    /**
     * Compute the number of threads that should be used for the
     * given input size and limits
     *
     * @param n The input size
     * @param maxBlocks The maximum number of blocks
     * @param maxThreads The maximum number of threads
     * @return The number of threads
     */
    private static int getNumThreads(int n, int maxBlocks, int maxThreads)
    {
        return (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
    }

    /**
     * Returns the power of 2 that is equal to or greater than x.
     * For example, nextPow2(33) is 64, and nextPow2(32) is 32.
     *
     * @param x The input
     * @return The next power of 2
     */
    private static int nextPow2(int x)
    {
        // Smear the highest set bit of x - 1 into all lower bits,
        // then add 1 to obtain the next power of 2
        --x;
        x |= x >> 1;
        x |= x >> 2;
        x |= x >> 4;
        x |= x >> 8;
        x |= x >> 16;
        return ++x;
    }

    /**
     * Create an array of the given size, with random data
     *
     * @param size The array size
     * @return The array
     */
    private static float[] createRandomArray(int size)
    {
        Random random = new Random(0);
        float array[] = new float[size];
        for (int i = 0; i < size; i++)
        {
            array[i] = random.nextFloat() * 0.01f;
        }
        return array;
    }
}

--------------------------------------------------------------------------------