├── JCudaSamples
    ├── localMavenRepository
    │   ├── de
    │   │   └── javagl
    │   │   │   └── matrixmarketreader
    │   │   │       ├── maven-metadata-local.xml.md5
    │   │   │       ├── maven-metadata-local.xml.sha1
    │   │   │       ├── 0.0.1-SNAPSHOT
    │   │   │           ├── maven-metadata-local.xml.md5
    │   │   │           ├── maven-metadata-local.xml.sha1
    │   │   │           ├── matrixmarketreader-0.0.1-SNAPSHOT.jar.md5
    │   │   │           ├── matrixmarketreader-0.0.1-SNAPSHOT.pom.md5
    │   │   │           ├── matrixmarketreader-0.0.1-SNAPSHOT.jar.sha1
    │   │   │           ├── matrixmarketreader-0.0.1-SNAPSHOT.pom.sha1
    │   │   │           ├── matrixmarketreader-0.0.1-SNAPSHOT.jar
    │   │   │           ├── matrixmarketreader-0.0.1-SNAPSHOT.pom
    │   │   │           └── maven-metadata-local.xml
    │   │   │       └── maven-metadata-local.xml
    │   └── org
    │   │   └── jcuda
    │   │       └── jcuda-matrix-utils
    │   │           ├── maven-metadata-local.xml.md5
    │   │           ├── maven-metadata-local.xml.sha1
    │   │           ├── 0.0.1-SNAPSHOT
    │   │               ├── maven-metadata-local.xml.md5
    │   │               ├── maven-metadata-local.xml.sha1
    │   │               ├── jcuda-matrix-utils-0.0.1-SNAPSHOT.jar.md5
    │   │               ├── jcuda-matrix-utils-0.0.1-SNAPSHOT.pom.md5
    │   │               ├── jcuda-matrix-utils-0.0.1-SNAPSHOT.jar.sha1
    │   │               ├── jcuda-matrix-utils-0.0.1-SNAPSHOT.pom.sha1
    │   │               ├── jcuda-matrix-utils-0.0.1-SNAPSHOT.jar
    │   │               ├── jcuda-matrix-utils-0.0.1-SNAPSHOT.pom
    │   │               └── maven-metadata-local.xml
    │   │           └── maven-metadata-local.xml
    ├── src
    │   └── main
    │   │   ├── resources
    │   │       ├── data
    │   │       │   ├── driver
    │   │       │   │   └── gl
    │   │       │   │   │   └── Bucky.raw
    │   │       │   └── jcudnn
    │   │       │   │   └── mnist
    │   │       │   │       ├── conv1.bin
    │   │       │   │       ├── conv2.bin
    │   │       │   │       ├── ip1.bin
    │   │       │   │       ├── ip2.bin
    │   │       │   │       ├── conv1.bias.bin
    │   │       │   │       ├── conv2.bias.bin
    │   │       │   │       ├── five_28x28.pgm
    │   │       │   │       ├── ip1.bias.bin
    │   │       │   │       ├── ip2.bias.bin
    │   │       │   │       ├── one_28x28.pgm
    │   │       │   │       └── three_28x28.pgm
    │   │       └── kernels
    │   │       │   ├── JCudaVectorAddKernel.cu
    │   │       │   ├── JCudaConstantMemoryKernel.cu
    │   │       │   ├── JCudaDriverSimpleGLKernel.cu
    │   │       │   ├── JCudaVectorAddKernel.ptx
    │   │       │   ├── JCudaDynamicParallelismKernel.cu
    │   │       │   ├── JCudaAllocationInKernelKernel.cu
    │   │       │   ├── JCudaReductionKernel.cu
    │   │       │   └── JCudaDriverVolumeRendererKernel.cu
    │   │   └── java
    │   │       └── jcuda
    │   │           ├── runtime
    │   │               └── samples
    │   │               │   ├── JCudaPrintDeviceInfo.java
    │   │               │   ├── JCudaRuntimeUnifiedMemory.java
    │   │               │   ├── JCudaRuntimeBasicStreamCallback.java
    │   │               │   ├── JCudaRuntimeMappedMemory.java
    │   │               │   └── JCudaRuntimeMemoryBandwidths.java
    │   │           ├── jcufft
    │   │               └── samples
    │   │               │   └── JCufftSample.java
    │   │           ├── driver
    │   │               ├── samples
    │   │               │   ├── JCudaDriverHostFunction.java
    │   │               │   ├── JCudaDriverUnifiedMemory.java
    │   │               │   ├── JCudaDriverBasicStreamCallback.java
    │   │               │   ├── JCudaConstantMemoryExample.java
    │   │               │   ├── JCudaDynamicParallelism.java
    │   │               │   ├── JCudaVectorAdd.java
    │   │               │   ├── JCudaAllocationInKernel.java
    │   │               │   ├── JCudaDriverStreamCallbacks.java
    │   │               │   └── JCudaReduction.java
    │   │               └── gl
    │   │               │   └── samples
    │   │               │       └── SimpleInteraction.java
    │   │           ├── jcurand
    │   │               └── samples
    │   │               │   └── JCurandSample.java
    │   │           ├── vec
    │   │               └── samples
    │   │               │   ├── VecFloatSample.java
    │   │               │   └── VecDoubleSample.java
    │   │           ├── jcublas
    │   │               └── samples
    │   │               │   ├── JCublas2Sample.java
    │   │               │   ├── JCublas2PointerModes.java
    │   │               │   ├── JCublas2SgemmExSample.java
    │   │               │   ├── JCublas2SgemmBatched.java
    │   │               │   └── JCublas2MatrixInvert.java
    │   │           ├── nvrtc
    │   │               └── samples
    │   │               │   ├── JNvrtcLoweredNames.java
    │   │               │   └── JNvrtcVectorAdd.java
    │   │           ├── jcudnn
    │   │               └── samples
    │   │               │   └── JCudnnMnistUtils.java
    │   │           └── samples
    │   │               └── utils
    │   │                   └── JCudaSamplesUtils.java
    └── pom.xml
├── .gitignore
├── README.md
└── LICENSE


/JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/maven-metadata-local.xml.md5:
--------------------------------------------------------------------------------
1 | b95802f5bafd13d9521ada1c42de69e2


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/maven-metadata-local.xml.md5:
--------------------------------------------------------------------------------
1 | 2428239c37417b89acedb85528a29f5b


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/maven-metadata-local.xml.sha1:
--------------------------------------------------------------------------------
1 | ed0459380fe56fe148a632e471cff8f0fb588178


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/maven-metadata-local.xml.sha1:
--------------------------------------------------------------------------------
1 | 191b56ff1714aa6441f7a527520804eabed1d6ad


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/maven-metadata-local.xml.md5:
--------------------------------------------------------------------------------
1 | 3a733cd451cda335f5aca40b4f25e8a0


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/maven-metadata-local.xml.md5:
--------------------------------------------------------------------------------
1 | 107587a329bcfa73a6b8b73c68ae5686


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/maven-metadata-local.xml.sha1:
--------------------------------------------------------------------------------
1 | bfa5c79df415720103ace72adf1945b933e568c0


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/maven-metadata-local.xml.sha1:
--------------------------------------------------------------------------------
1 | 886fc25d151db52534eed818de5f8b4f97b36932


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/matrixmarketreader-0.0.1-SNAPSHOT.jar.md5:
--------------------------------------------------------------------------------
1 | f39ac4ec4d2d50fe7bf1e5d12ab8479c


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/matrixmarketreader-0.0.1-SNAPSHOT.pom.md5:
--------------------------------------------------------------------------------
1 | b15aaba1771436557698778d6c01f4d0


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/jcuda-matrix-utils-0.0.1-SNAPSHOT.jar.md5:
--------------------------------------------------------------------------------
1 | b1d419efac0e2b1f577f242812d403cb


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/jcuda-matrix-utils-0.0.1-SNAPSHOT.pom.md5:
--------------------------------------------------------------------------------
1 | 1386ca6a00c80b800e756af4c53467a2


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/matrixmarketreader-0.0.1-SNAPSHOT.jar.sha1:
--------------------------------------------------------------------------------
1 | 8791d1ae86e4678241a6eb752ee1616b9ba51a7d


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/matrixmarketreader-0.0.1-SNAPSHOT.pom.sha1:
--------------------------------------------------------------------------------
1 | 5326dc181c8e65a7f315cd538df661c30008b08d


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/jcuda-matrix-utils-0.0.1-SNAPSHOT.jar.sha1:
--------------------------------------------------------------------------------
1 | 152ba788a1f28b730efb6fef722718b66877e4dd


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/jcuda-matrix-utils-0.0.1-SNAPSHOT.pom.sha1:
--------------------------------------------------------------------------------
1 | badc33ecf009d2e6360f7c96c3152ea4cf3be920


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/data/driver/gl/Bucky.raw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/driver/gl/Bucky.raw


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/data/jcudnn/mnist/conv1.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/conv1.bin


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/data/jcudnn/mnist/conv2.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/conv2.bin


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/data/jcudnn/mnist/ip1.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/ip1.bin


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/data/jcudnn/mnist/ip2.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/ip2.bin


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/data/jcudnn/mnist/conv1.bias.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/conv1.bias.bin


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/data/jcudnn/mnist/conv2.bias.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/conv2.bias.bin


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/data/jcudnn/mnist/five_28x28.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/five_28x28.pgm


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/data/jcudnn/mnist/ip1.bias.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/ip1.bias.bin


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/data/jcudnn/mnist/ip2.bias.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/ip2.bias.bin


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/data/jcudnn/mnist/one_28x28.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/one_28x28.pgm


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/data/jcudnn/mnist/three_28x28.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/src/main/resources/data/jcudnn/mnist/three_28x28.pgm


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | JCudaVec/target/
2 | /JCudaSamples/cudnn64_5.dll
3 | /JCudaSamples/.settings
4 | /JCudaSamples/.classpath
5 | /JCudaSamples/.project
6 | /JCudaSamples/target
7 | /JCudaSamples/src/main/resources/kernels/*.cubin
8 | /JCudaSamples/src/main/resources/kernels/*.ptx
9 | 


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/matrixmarketreader-0.0.1-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/matrixmarketreader-0.0.1-SNAPSHOT.jar


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/jcuda-matrix-utils-0.0.1-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcuda/jcuda-samples/HEAD/JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/jcuda-matrix-utils-0.0.1-SNAPSHOT.jar


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/kernels/JCudaVectorAddKernel.cu:
--------------------------------------------------------------------------------
 1 | extern "C"
 2 | __global__ void add(int n, float *a, float *b, float *sum)
 3 | {
 4 |     int i = blockIdx.x * blockDim.x + threadIdx.x;
 5 |     if (i<n)
 6 |     {
 7 |         sum[i] = a[i] + b[i];
 8 |     }
 9 | 
10 | }
11 | 


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/maven-metadata-local.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <metadata>
 3 |   <groupId>de.javagl</groupId>
 4 |   <artifactId>matrixmarketreader</artifactId>
 5 |   <versioning>
 6 |     <versions>
 7 |       <version>0.0.1-SNAPSHOT</version>
 8 |     </versions>
 9 |     <lastUpdated>20161010163950</lastUpdated>
10 |   </versioning>
11 | </metadata>
12 | 


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/maven-metadata-local.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <metadata>
 3 |   <groupId>org.jcuda</groupId>
 4 |   <artifactId>jcuda-matrix-utils</artifactId>
 5 |   <versioning>
 6 |     <versions>
 7 |       <version>0.0.1-SNAPSHOT</version>
 8 |     </versions>
 9 |     <lastUpdated>20190308184031</lastUpdated>
10 |   </versioning>
11 | </metadata>
12 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/kernels/JCudaConstantMemoryKernel.cu:
--------------------------------------------------------------------------------
 1 | #define CONSTANT_MEMORY_SIZE 100
 2 | __constant__ float constantMemoryData[CONSTANT_MEMORY_SIZE];
 3 | 
 4 | extern "C"
 5 | __global__ void constantMemoryKernel(float* array, int size)
 6 | {
 7 |     int index = blockIdx.x * blockDim.x + threadIdx.x;
 8 |     if (index < size && index < CONSTANT_MEMORY_SIZE) {
 9 |         array[index] = constantMemoryData[index];
10 |     }
11 | }
12 | 


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/matrixmarketreader-0.0.1-SNAPSHOT.pom:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0"
 3 |     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
 4 |   <modelVersion>4.0.0</modelVersion>
 5 |   <groupId>de.javagl</groupId>
 6 |   <artifactId>matrixmarketreader</artifactId>
 7 |   <version>0.0.1-SNAPSHOT</version>
 8 |   <description>POM was created from install:install-file</description>
 9 | </project>
10 | 


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/jcuda-matrix-utils-0.0.1-SNAPSHOT.pom:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0"
 3 |     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
 4 |   <modelVersion>4.0.0</modelVersion>
 5 |   <groupId>org.jcuda</groupId>
 6 |   <artifactId>jcuda-matrix-utils</artifactId>
 7 |   <version>0.0.1-SNAPSHOT</version>
 8 |   <description>POM was created from install:install-file</description>
 9 | </project>
10 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # jcuda-samples
 2 | 
 3 | This repository contains samples for the JCuda libraries.
 4 | 
 5 | **Note:** Some of the samples require third-party libraries, JCuda
 6 | libraries that are not part of the [`jcuda-main`](https://github.com/jcuda/jcuda-main) 
 7 | package (for example, [`JCudaVec`](https://github.com/jcuda/jcuda-vec) or 
 8 | [`JCudnn`](https://github.com/jcuda/jcudnn)), or utility libraries
 9 | that are not available in Maven Central. In order to compile these
10 | samples, additional setup steps may be necessary. The main goal
11 | of this repository is to collect and maintain the samples in a 
12 | form that allows them to serve as a collection of snippets that
13 | can easily be copied and pasted into own projects to get started.
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/de/javagl/matrixmarketreader/0.0.1-SNAPSHOT/maven-metadata-local.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <metadata modelVersion="1.1.0">
 3 |   <groupId>de.javagl</groupId>
 4 |   <artifactId>matrixmarketreader</artifactId>
 5 |   <version>0.0.1-SNAPSHOT</version>
 6 |   <versioning>
 7 |     <snapshot>
 8 |       <localCopy>true</localCopy>
 9 |     </snapshot>
10 |     <lastUpdated>20161010163950</lastUpdated>
11 |     <snapshotVersions>
12 |       <snapshotVersion>
13 |         <extension>jar</extension>
14 |         <value>0.0.1-SNAPSHOT</value>
15 |         <updated>20161010163950</updated>
16 |       </snapshotVersion>
17 |       <snapshotVersion>
18 |         <extension>pom</extension>
19 |         <value>0.0.1-SNAPSHOT</value>
20 |         <updated>20161010155311</updated>
21 |       </snapshotVersion>
22 |     </snapshotVersions>
23 |   </versioning>
24 | </metadata>
25 | 


--------------------------------------------------------------------------------
/JCudaSamples/localMavenRepository/org/jcuda/jcuda-matrix-utils/0.0.1-SNAPSHOT/maven-metadata-local.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <metadata modelVersion="1.1.0">
 3 |   <groupId>org.jcuda</groupId>
 4 |   <artifactId>jcuda-matrix-utils</artifactId>
 5 |   <version>0.0.1-SNAPSHOT</version>
 6 |   <versioning>
 7 |     <snapshot>
 8 |       <localCopy>true</localCopy>
 9 |     </snapshot>
10 |     <lastUpdated>20190308184031</lastUpdated>
11 |     <snapshotVersions>
12 |       <snapshotVersion>
13 |         <extension>jar</extension>
14 |         <value>0.0.1-SNAPSHOT</value>
15 |         <updated>20190308184031</updated>
16 |       </snapshotVersion>
17 |       <snapshotVersion>
18 |         <extension>pom</extension>
19 |         <value>0.0.1-SNAPSHOT</value>
20 |         <updated>20161010175417</updated>
21 |       </snapshotVersion>
22 |     </snapshotVersions>
23 |   </versioning>
24 | </metadata>
25 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/kernels/JCudaDriverSimpleGLKernel.cu:
--------------------------------------------------------------------------------
 1 | // Taken from the NVIDIA "2_Graphics\simpleGL" sample:
 2 | 
 3 | // A kernel that modifies the z-coordinates of a rectangular
 4 | // grid of vertices, based on a time value, so that they 
 5 | // form an animated sine wave
 6 | 
 7 | extern "C"
 8 | __global__ void simple_vbo_kernel(
 9 |     float4 *pos, unsigned int width, unsigned int height, float time)
10 | {
11 |     unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
12 |     unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
13 | 
14 |     // calculate uv coordinates
15 |     float u = x / (float) width;
16 |     float v = y / (float) height;
17 |     u = u*2.0f - 1.0f;
18 |     v = v*2.0f - 1.0f;
19 | 
20 |     // calculate simple sine wave pattern
21 |     float freq = 4.0f;
22 |     float w = sinf(u*freq + time) * cosf(v*freq + time) * 0.5f;
23 | 
24 |     // write output vertex
25 |     pos[y*width+x] = make_float4(u, w, v, 1.0f);
26 | }
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2008-2016 Marco Hutter - http://www.jcuda.org
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/runtime/samples/JCudaPrintDeviceInfo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * JCuda - Java bindings for NVIDIA CUDA
 3 |  *
 4 |  * Copyright 2008-2018 Marco Hutter - http://www.jcuda.org
 5 |  */
 6 | package jcuda.runtime.samples;
 7 | 
 8 | import static jcuda.runtime.JCuda.cudaGetDeviceCount;
 9 | import static jcuda.runtime.JCuda.cudaGetDeviceProperties;
10 | 
11 | import jcuda.runtime.JCuda;
12 | import jcuda.runtime.cudaDeviceProp;
13 | 
14 | /**
15 |  * A sample that prints information about all available CUDA devices
16 |  */
17 | public class JCudaPrintDeviceInfo
18 | {
19 |     public static void main(String[] args)
20 |     {
21 |         JCuda.setExceptionsEnabled(true);
22 |         int deviceCount[] = { 0 };
23 |         cudaGetDeviceCount(deviceCount);
24 |         System.out.println("Found " + deviceCount[0] + " devices");
25 |         for (int device = 0; device < deviceCount[0]; device++)
26 |         {
27 |             System.out.println("Properties of device " + device + ":");
28 |             cudaDeviceProp deviceProperties = new cudaDeviceProp();
29 |             cudaGetDeviceProperties(deviceProperties, device);
30 |             System.out.println(deviceProperties.toFormattedString());
31 |         }
32 |         
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/kernels/JCudaVectorAddKernel.ptx:
--------------------------------------------------------------------------------
 1 | //
 2 | // Generated by NVIDIA NVVM Compiler
 3 | //
 4 | // Compiler Build ID: CL-20732876
 5 | // Cuda compilation tools, release 8.0, V8.0.26
 6 | // Based on LLVM 3.4svn
 7 | //
 8 | 
 9 | .version 5.0
10 | .target sm_20
11 | .address_size 64
12 | 
13 | 	// .globl	add
14 | 
15 | .visible .entry add(
16 | 	.param .u32 add_param_0,
17 | 	.param .u64 add_param_1,
18 | 	.param .u64 add_param_2,
19 | 	.param .u64 add_param_3
20 | )
21 | {
22 | 	.reg .pred 	%p<2>;
23 | 	.reg .f32 	%f<4>;
24 | 	.reg .b32 	%r<6>;
25 | 	.reg .b64 	%rd<11>;
26 | 
27 | 
28 | 	ld.param.u32 	%r2, [add_param_0];
29 | 	ld.param.u64 	%rd1, [add_param_1];
30 | 	ld.param.u64 	%rd2, [add_param_2];
31 | 	ld.param.u64 	%rd3, [add_param_3];
32 | 	mov.u32 	%r3, %ctaid.x;
33 | 	mov.u32 	%r4, %ntid.x;
34 | 	mov.u32 	%r5, %tid.x;
35 | 	mad.lo.s32 	%r1, %r4, %r3, %r5;
36 | 	setp.ge.s32	%p1, %r1, %r2;
37 | 	@%p1 bra 	BB0_2;
38 | 
39 | 	cvta.to.global.u64 	%rd4, %rd1;
40 | 	mul.wide.s32 	%rd5, %r1, 4;
41 | 	add.s64 	%rd6, %rd4, %rd5;
42 | 	cvta.to.global.u64 	%rd7, %rd2;
43 | 	add.s64 	%rd8, %rd7, %rd5;
44 | 	ld.global.f32 	%f1, [%rd8];
45 | 	ld.global.f32 	%f2, [%rd6];
46 | 	add.f32 	%f3, %f2, %f1;
47 | 	cvta.to.global.u64 	%rd9, %rd3;
48 | 	add.s64 	%rd10, %rd9, %rd5;
49 | 	st.global.f32 	[%rd10], %f3;
50 | 
51 | BB0_2:
52 | 	ret;
53 | }
54 | 
55 | 
56 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/kernels/JCudaDynamicParallelismKernel.cu:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * JCuda - Java bindings for NVIDIA CUDA
 3 |  *
 4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
 5 |  */
 6 | #include <stdio.h>
 7 | 
 8 | // A simple example of using dynamic parallelism. This kernel can
 9 | // be compiled into an object file by calling
10 | //
11 | //     nvcc -dc -arch=sm_52 JCudaDynamicParallelismKernel.cu -o JCudaDynamicParallelismKernel.o
12 | // 
13 | // The resulting object file can be linked into a CUBIN file with
14 | // 
15 | //     nvcc -dlink -arch=sm_52 -cubin JCudaDynamicParallelismKernel.o -o JCudaDynamicParallelismKernel.cubin
16 | // 
17 | // Alternatively, both steps can be taken at once, by calling
18 | // 
19 | //     nvcc -dlink -arch=sm_52 -cubin -c JCudaDynamicParallelismKernel.cu -o JCudaDynamicParallelismKernel.cubin
20 | // 
21 | // The architecture (here, sm_52) must match the architecture of
22 | // the target device. 
23 | 
24 | extern "C"
25 | __global__ void childKernel(unsigned int parentThreadIndex, float* data)
26 | {
27 |     printf("Parent thread index: %d, child thread index: %d\n", 
28 |         parentThreadIndex, threadIdx.x);    
29 |     data[threadIdx.x] = parentThreadIndex + 0.1f * threadIdx.x;
30 | }
31 | 
32 | extern "C"
33 | __global__ void parentKernel(unsigned int size, float *data)
34 | {
35 |     childKernel<<<1, 8>>>(threadIdx.x, data + threadIdx.x * 8);
36 |     cudaDeviceSynchronize();
37 |     __syncthreads();
38 | }
39 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/kernels/JCudaAllocationInKernelKernel.cu:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | 
 3 | extern "C"
 4 | __global__ void allocatingKernel(void** devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice)
 5 | {
 6 |     int thread = threadIdx.x + blockDim.x * blockIdx.x;
 7 | 
 8 |     short* devicePointerAllocatedOnDevice = (short*) malloc(3 * sizeof(short));
 9 |     printf("In thread %d allocated %p\n", thread, devicePointerAllocatedOnDevice);
10 |     for(int i=0; i < 3; i++)
11 |     {
12 |         devicePointerAllocatedOnDevice[i] = thread * 10 + i;
13 |     }
14 |     devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice[thread] = 
15 |         devicePointerAllocatedOnDevice;
16 | }
17 | 
18 | extern "C"
19 | __global__ void copyingKernel(
20 |     void** devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice, 
21 |     void** devicePointerAllocatedOnHostToDevicePointersAllocatedOnHost)
22 | {
23 |     int thread = threadIdx.x + blockDim.x * blockIdx.x;
24 | 
25 |     short* devicePointerAllocatedOnDevice = (short*)devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice[thread];
26 |     short* devicePointerAllocatedOnHost = (short*)devicePointerAllocatedOnHostToDevicePointersAllocatedOnHost[thread];
27 |     
28 |     printf("In thread %d copy from %p to %p\n", thread, devicePointerAllocatedOnDevice, devicePointerAllocatedOnHost);
29 |     
30 |     for(int i=0; i < 3; i++)
31 |     {
32 |         devicePointerAllocatedOnHost[i] = devicePointerAllocatedOnDevice[i];
33 |     }
34 | }
35 | 
36 | extern "C"
37 | __global__ void freeingKernel(
38 |     void** devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice)
39 | {
40 |     int thread = threadIdx.x + blockDim.x * blockIdx.x;
41 | 
42 |     short* devicePointerAllocatedOnDevice = (short*)devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice[thread];
43 |     
44 |     printf("In thread %d free %p\n", thread, devicePointerAllocatedOnDevice);
45 |     
46 |     free(devicePointerAllocatedOnDevice);
47 | }
48 | 
49 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/jcufft/samples/JCufftSample.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * JCuda - Java bindings for NVIDIA CUDA
 3 |  *
 4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
 5 |  */
 6 | package jcuda.jcufft.samples;
 7 | 
 8 | import static jcuda.jcufft.JCufft.CUFFT_FORWARD;
 9 | import static jcuda.jcufft.JCufft.cufftDestroy;
10 | import static jcuda.jcufft.JCufft.cufftExecC2C;
11 | import static jcuda.jcufft.JCufft.cufftPlan1d;
12 | 
13 | import org.jtransforms.fft.FloatFFT_1D;
14 | 
15 | import jcuda.jcufft.cufftHandle;
16 | import jcuda.jcufft.cufftType;
17 | import jcuda.samples.utils.JCudaSamplesUtils;
18 | 
19 | /**
20 |  * This is a sample class that performs a 1D Complex-To-Complex 
21 |  * forward FFT with JCufft, and compares the result to the 
22 |  * reference computed with JTransforms.
23 |  */
24 | class JCufftSample
25 | {
26 |     public static void main(String args[])
27 |     {
28 |         testC2C1D(1<<20);
29 |     }
30 | 
31 |     /**
32 |      * Test the 1D C2C transform with the given size.
33 |      * 
34 |      * @param size The size of the transform
35 |      */
36 |     private static void testC2C1D(int size)
37 |     {
38 |         System.out.println("Creating input data...");
39 |         float input[] = JCudaSamplesUtils.createRandomFloatData(size * 2);
40 | 
41 |         System.out.println("Performing 1D C2C transform with JTransforms...");
42 |         float outputJTransforms[] = input.clone();
43 |         FloatFFT_1D fft = new FloatFFT_1D(size);
44 |         fft.complexForward(outputJTransforms);
45 | 
46 |         System.out.println("Performing 1D C2C transform with JCufft...");
47 |         float outputJCufft[] = input.clone();
48 |         cufftHandle plan = new cufftHandle();
49 |         cufftPlan1d(plan, size, cufftType.CUFFT_C2C, 1);
50 |         cufftExecC2C(plan, outputJCufft, outputJCufft, CUFFT_FORWARD);
51 |         cufftDestroy(plan);
52 | 
53 |         boolean passed = JCudaSamplesUtils.equalByNorm(
54 |             outputJTransforms, outputJCufft);
55 |         System.out.println("testC2C1D " + (passed ? "PASSED" : "FAILED"));
56 |     }
57 | 
58 | }
59 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/driver/samples/JCudaDriverHostFunction.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * JCuda - Java bindings for NVIDIA CUDA
 3 |  *
 4 |  * Copyright 2008-2017 Marco Hutter - http://www.jcuda.org
 5 |  */
 6 | package jcuda.driver.samples;
 7 | 
 8 | import static jcuda.driver.JCudaDriver.cuCtxCreate;
 9 | import static jcuda.driver.JCudaDriver.cuCtxDestroy;
10 | import static jcuda.driver.JCudaDriver.cuDeviceGet;
11 | import static jcuda.driver.JCudaDriver.cuInit;
12 | import static jcuda.driver.JCudaDriver.cuLaunchHostFunc;
13 | import static jcuda.driver.JCudaDriver.cuStreamCreate;
14 | import static jcuda.driver.JCudaDriver.cuStreamSynchronize;
15 | 
16 | import jcuda.driver.CUcontext;
17 | import jcuda.driver.CUdevice;
18 | import jcuda.driver.CUhostFn;
19 | import jcuda.driver.CUstream;
20 | import jcuda.driver.JCudaDriver;
21 | 
22 | /**
23 |  * An example showing how to call a host function via the driver API
24 |  */
25 | public class JCudaDriverHostFunction
26 | {
27 |     /**
28 |      * Entry point
29 |      * 
30 |      * @param args Not used
31 |      */
32 |     public static void main(String[] args)
33 |     {
34 |         // Default initialization
35 |         JCudaDriver.setExceptionsEnabled(true);
36 |         cuInit(0);
37 |         CUcontext context = new CUcontext();
38 |         CUdevice device = new CUdevice();
39 |         cuDeviceGet(device, 0);
40 |         cuCtxCreate(context, 0, device);
41 | 
42 |         // Create a stream
43 |         CUstream stream = new CUstream();
44 |         cuStreamCreate(stream, 0);
45 |         
46 |         // Define a host function and launch it
47 |         CUhostFn fn = new CUhostFn()
48 |         {
49 |             @Override
50 |             public void call(Object userData)
51 |             {
52 |                 System.out.println("Called with " + userData);
53 |             }
54 |         };
55 |         cuLaunchHostFunc(stream, fn, "Example user object");
56 |         
57 |         // Wait for the stream to finish
58 |         cuStreamSynchronize(stream);
59 | 
60 |         // Clean up
61 |         cuCtxDestroy(context);
62 |         
63 |         System.out.println("Done");
64 |     }
65 | }
66 | 
67 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/jcurand/samples/JCurandSample.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * JCuda - Java bindings for NVIDIA CUDA
 3 |  *
 4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
 5 |  */
 6 | package jcuda.jcurand.samples;
 7 | 
 8 | import static jcuda.jcurand.JCurand.curandCreateGenerator;
 9 | import static jcuda.jcurand.JCurand.curandDestroyGenerator;
10 | import static jcuda.jcurand.JCurand.curandGenerateUniform;
11 | import static jcuda.jcurand.JCurand.curandSetPseudoRandomGeneratorSeed;
12 | import static jcuda.jcurand.curandRngType.CURAND_RNG_PSEUDO_DEFAULT;
13 | import static jcuda.runtime.JCuda.cudaFree;
14 | import static jcuda.runtime.JCuda.cudaMalloc;
15 | import static jcuda.runtime.JCuda.cudaMemcpy;
16 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
17 | 
18 | import java.util.Arrays;
19 | 
20 | import jcuda.Pointer;
21 | import jcuda.Sizeof;
22 | import jcuda.jcurand.JCurand;
23 | import jcuda.jcurand.curandGenerator;
24 | import jcuda.runtime.JCuda;
25 | 
26 | /**
27 |  * A small sample application showing how to use JCurand.<br>
28 |  * <br>
29 |  * This is a direct port of the NVIDIA CURAND documentation example.
30 |  */
31 | public class JCurandSample
32 | {
33 |     public static void main(String args[])
34 |     {
35 |         // Enable exceptions and omit all subsequent error checks
36 |         JCuda.setExceptionsEnabled(true);
37 |         JCurand.setExceptionsEnabled(true);
38 | 
39 |         int n = 100;
40 |         curandGenerator generator = new curandGenerator();
41 | 
42 |         // Allocate n floats on host 
43 |         float hostData[] = new float[n];
44 | 
45 |         // Allocate n floats on device 
46 |         Pointer deviceData = new Pointer();
47 |         cudaMalloc(deviceData, n * Sizeof.FLOAT);
48 | 
49 |         // Create pseudo-random number generator 
50 |         curandCreateGenerator(generator, CURAND_RNG_PSEUDO_DEFAULT);
51 | 
52 |         // Set seed 
53 |         curandSetPseudoRandomGeneratorSeed(generator, 1234);
54 | 
55 |         // Generate n floats on device 
56 |         curandGenerateUniform(generator, deviceData, n);
57 | 
58 |         // Copy device memory to host 
59 |         cudaMemcpy(Pointer.to(hostData), deviceData, 
60 |             n * Sizeof.FLOAT, cudaMemcpyDeviceToHost);
61 | 
62 |         // Show result
63 |         System.out.println(Arrays.toString(hostData));
64 | 
65 |         // Cleanup 
66 |         curandDestroyGenerator(generator);
67 |         cudaFree(deviceData);
68 |     }
69 | }
70 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/kernels/JCudaReductionKernel.cu:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * JCuda - Java bindings for NVIDIA CUDA driver and runtime API
 3 |  * http://www.jcuda.org
 4 |  *
 5 |  *
 6 |  * This code is based on the NVIDIA 'reduction' CUDA sample,
 7 |  * Copyright 1993-2010 NVIDIA Corporation.
 8 |  */
 9 | extern "C"
10 | __global__ void reduce(float *g_idata, float *g_odata, unsigned int n)
11 | {
12 |     extern __shared__ float sdata[];
13 | 
14 |     // perform first level of reduction,
15 |     // reading from global memory, writing to shared memory
16 |     unsigned int tid = threadIdx.x;
17 |     unsigned int i = blockIdx.x*blockDim.x*2 + threadIdx.x;
18 |     unsigned int gridSize = blockDim.x*2*gridDim.x;
19 | 
20 |     float mySum = 0;
21 | 
22 |     // we reduce multiple elements per thread.  The number is determined by the
23 |     // number of active thread blocks (via gridDim).  More blocks will result
24 |     // in a larger gridSize and therefore fewer elements per thread
25 |     while (i < n)
26 |     {
27 |         mySum += g_idata[i];
28 |         // ensure we don't read out of bounds
29 |         if (i + blockDim.x < n)
30 |             mySum += g_idata[i+blockDim.x];
31 |         i += gridSize;
32 |     }
33 | 
34 |     // each thread puts its local sum into shared memory
35 |     sdata[tid] = mySum;
36 |     __syncthreads();
37 | 
38 | 
39 |     // do reduction in shared mem
40 |     if (blockDim.x >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); }
41 |     if (blockDim.x >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); }
42 |     if (blockDim.x >= 128) { if (tid <  64) { sdata[tid] = mySum = mySum + sdata[tid +  64]; } __syncthreads(); }
43 | 
44 |     if (tid < 32)
45 |     {
46 |         // now that we are using warp-synchronous programming (below)
47 |         // we need to declare our shared memory volatile so that the compiler
48 |         // doesn't reorder stores to it and induce incorrect behavior.
49 |         volatile float* smem = sdata;
50 |         if (blockDim.x >=  64) { smem[tid] = mySum = mySum + smem[tid + 32]; }
51 |         if (blockDim.x >=  32) { smem[tid] = mySum = mySum + smem[tid + 16]; }
52 |         if (blockDim.x >=  16) { smem[tid] = mySum = mySum + smem[tid +  8]; }
53 |         if (blockDim.x >=   8) { smem[tid] = mySum = mySum + smem[tid +  4]; }
54 |         if (blockDim.x >=   4) { smem[tid] = mySum = mySum + smem[tid +  2]; }
55 |         if (blockDim.x >=   2) { smem[tid] = mySum = mySum + smem[tid +  1]; }
56 |     }
57 | 
58 |     // write result for this block to global mem
59 |     if (tid == 0)
60 |         g_odata[blockIdx.x] = sdata[0];
61 | }
62 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/runtime/samples/JCudaRuntimeUnifiedMemory.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * JCuda - Java bindings for NVIDIA CUDA
 3 |  *
 4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
 5 |  */
 6 | package jcuda.runtime.samples;
 7 | 
 8 | import static jcuda.jcublas.JCublas2.cublasCreate;
 9 | import static jcuda.jcublas.JCublas2.cublasSdot;
10 | import static jcuda.runtime.JCuda.cudaDeviceGetAttribute;
11 | import static jcuda.runtime.JCuda.cudaMallocManaged;
12 | import static jcuda.runtime.JCuda.cudaMemAttachGlobal;
13 | import static jcuda.runtime.JCuda.cudaMemAttachHost;
14 | import static jcuda.runtime.JCuda.cudaStreamAttachMemAsync;
15 | import static jcuda.runtime.JCuda.cudaStreamSynchronize;
16 | import static jcuda.runtime.cudaDeviceAttr.cudaDevAttrManagedMemory;
17 | 
18 | import java.nio.ByteBuffer;
19 | import java.nio.ByteOrder;
20 | import java.nio.FloatBuffer;
21 | 
22 | import jcuda.Pointer;
23 | import jcuda.Sizeof;
24 | import jcuda.jcublas.JCublas;
25 | import jcuda.jcublas.cublasHandle;
26 | import jcuda.runtime.JCuda;
27 | 
28 | /**
29 |  * An example showing how to use Unified / Managed memory with the 
30 |  * JCuda Runtime API
31 |  */
32 | public class JCudaRuntimeUnifiedMemory
33 | {
34 |     public static void main(String[] args)
35 |     {
36 |         JCuda.setExceptionsEnabled(true);
37 |         JCublas.setExceptionsEnabled(true);
38 |         
39 |         // Check if the device supports managed memory
40 |         int supported[] = { 0 };
41 |         cudaDeviceGetAttribute(supported, cudaDevAttrManagedMemory, 0);
42 |         if (supported[0] == 0)
43 |         {
44 |             System.err.println("Device does not support managed memory");
45 |             return;
46 |         }
47 | 
48 |         // Allocate managed memory that is accessible to the host
49 |         int n = 10;
50 |         long size = n * Sizeof.FLOAT;
51 |         Pointer p = new Pointer();
52 |         cudaMallocManaged(p, size, cudaMemAttachHost);
53 | 
54 |         // Obtain the byte buffer from the pointer. This is supported only
55 |         // for memory that was allocated to be accessible on the host:
56 |         ByteBuffer bb = p.getByteBuffer(0, size);
57 |         
58 |         System.out.println("Buffer on host side: " + bb);
59 | 
60 |         // Fill the buffer with sample data
61 |         FloatBuffer fb = bb.order(ByteOrder.nativeOrder()).asFloatBuffer();
62 |         for (int i = 0; i < n; i++)
63 |         {
64 |             fb.put(i, i);
65 |         }
66 | 
67 |         // Make the buffer accessible to all devices
68 |         cudaStreamAttachMemAsync(null, p, 0, cudaMemAttachGlobal);
69 |         cudaStreamSynchronize(null);
70 | 
71 |         // Use the pointer in a device operation (here, a dot product with 
72 |         // JCublas, for example). The data that was filled in by the host
73 |         // will now be used by the device.
74 |         cublasHandle handle = new cublasHandle();
75 |         cublasCreate(handle);
76 |         float result[] = { -1.0f };
77 |         cublasSdot(handle, n, p, 1, p, 1, Pointer.to(result));
78 |         System.out.println("Result: " + result[0]);
79 |     }
80 | }
81 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/runtime/samples/JCudaRuntimeBasicStreamCallback.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * JCuda - Java bindings for NVIDIA CUDA
 3 |  *
 4 |  * Copyright 2008-2017 Marco Hutter - http://www.jcuda.org
 5 |  */
 6 | package jcuda.runtime.samples;
 7 | 
 8 | import static jcuda.runtime.JCuda.cudaFree;
 9 | import static jcuda.runtime.JCuda.cudaMalloc;
10 | import static jcuda.runtime.JCuda.cudaMemcpyAsync;
11 | import static jcuda.runtime.JCuda.cudaStreamAddCallback;
12 | import static jcuda.runtime.JCuda.cudaStreamCreate;
13 | import static jcuda.runtime.JCuda.cudaStreamSynchronize;
14 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
15 | 
16 | import jcuda.Pointer;
17 | import jcuda.Sizeof;
18 | import jcuda.runtime.JCuda;
19 | import jcuda.runtime.cudaStreamCallback;
20 | import jcuda.runtime.cudaStream_t;
21 | 
22 | /**
23 |  * A very basic example / test for the stream callback functionality in the
24 |  * JCuda Runtime API
25 |  */
26 | public class JCudaRuntimeBasicStreamCallback
27 | {
28 |     /**
29 |      * Entry point of this program
30 |      * 
31 |      * @param args Not used
32 |      */
33 |     public static void main(String[] args)
34 |     {
35 |         JCuda.setExceptionsEnabled(true);
36 | 
37 |         // The stream on which the callbacks will be registered.
38 |         // When this is "null", then it is the default stream.
39 |         cudaStream_t stream = null;
40 | 
41 |         boolean useDefaultStream = true;
42 |         useDefaultStream = false;
43 |         if (!useDefaultStream)
44 |         {
45 |             stream = new cudaStream_t();
46 |             cudaStreamCreate(stream);
47 |         }
48 |         System.out.println("Using stream " + stream);
49 | 
50 |         // Define the callback
51 |         cudaStreamCallback callback = new cudaStreamCallback()
52 |         {
53 |             @Override
54 |             public void call(cudaStream_t stream, int status, Object userData)
55 |             {
56 |                 System.out.println("Callback called");
57 |                 System.out.println("    stream  : " + stream);
58 |                 System.out.println("    status  : " + status);
59 |                 System.out.println("    userData: " + userData);
60 |                 System.out.println("    thread  : " + Thread.currentThread());
61 |             }
62 |         };
63 | 
64 |         // Create some dummy data on the host, and copy it to the
65 |         // device asynchronously
66 |         int n = 100000;
67 |         float hostData[] = new float[n];
68 |         Pointer deviceData = new Pointer();
69 |         cudaMalloc(deviceData, n * Sizeof.FLOAT);
70 |         cudaMemcpyAsync(deviceData, Pointer.to(hostData), 
71 |             n * Sizeof.FLOAT, cudaMemcpyHostToDevice, stream);
72 | 
73 |         // Add the callback to the stream that carries the copy operation
74 |         Object userData = "Example user data";
75 |         cudaStreamAddCallback(stream, callback, userData, 0);
76 | 
77 |         // Wait until the stream is finished
78 |         cudaStreamSynchronize(stream);
79 | 
80 |         // Clean up
81 |         cudaFree(deviceData);
82 | 
83 |         System.out.println("Done");
84 |     }
85 | 
86 | }
87 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/driver/samples/JCudaDriverUnifiedMemory.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * JCuda - Java bindings for NVIDIA CUDA
 3 |  *
 4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
 5 |  */
 6 | package jcuda.driver.samples;
 7 | 
 8 | import static jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY;
 9 | import static jcuda.driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL;
10 | import static jcuda.driver.CUmemAttach_flags.CU_MEM_ATTACH_HOST;
11 | import static jcuda.driver.JCudaDriver.cuCtxCreate;
12 | import static jcuda.driver.JCudaDriver.cuDeviceGet;
13 | import static jcuda.driver.JCudaDriver.cuDeviceGetAttribute;
14 | import static jcuda.driver.JCudaDriver.cuInit;
15 | import static jcuda.driver.JCudaDriver.cuMemAllocManaged;
16 | import static jcuda.driver.JCudaDriver.cuStreamAttachMemAsync;
17 | import static jcuda.driver.JCudaDriver.cuStreamSynchronize;
18 | import static jcuda.jcublas.JCublas2.cublasCreate;
19 | import static jcuda.jcublas.JCublas2.cublasSdot;
20 | 
21 | import java.nio.ByteBuffer;
22 | import java.nio.ByteOrder;
23 | import java.nio.FloatBuffer;
24 | 
25 | import jcuda.Pointer;
26 | import jcuda.Sizeof;
27 | import jcuda.driver.CUcontext;
28 | import jcuda.driver.CUdevice;
29 | import jcuda.driver.CUdeviceptr;
30 | import jcuda.driver.JCudaDriver;
31 | import jcuda.jcublas.JCublas;
32 | import jcuda.jcublas.cublasHandle;
33 | 
34 | /**
35 |  * An example showing how to use Unified / Managed memory with the 
36 |  * JCuda Driver API
37 |  */
38 | public class JCudaDriverUnifiedMemory
39 | {
40 |     public static void main(String[] args)
41 |     {
42 |         JCudaDriver.setExceptionsEnabled(true);
43 |         JCublas.setExceptionsEnabled(true);
44 |         
45 |         // Initialize the driver and create a context for the first device.
46 |         cuInit(0);
47 |         CUdevice device = new CUdevice();
48 |         cuDeviceGet(device, 0);
49 |         CUcontext context = new CUcontext();
50 |         cuCtxCreate(context, 0, device);
51 |         
52 |         // Check if the device supports managed memory
53 |         int supported[] = { 0 };
54 |         cuDeviceGetAttribute(supported, 
55 |             CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, device);
56 |         if (supported[0] == 0)
57 |         {
58 |             System.err.println("Device does not support managed memory");
59 |             return;
60 |         }
61 | 
62 |         // Allocate managed memory that is accessible to the host
63 |         int n = 10;
64 |         long size = n * Sizeof.FLOAT;
65 |         CUdeviceptr p = new CUdeviceptr();
66 |         cuMemAllocManaged(p, size, CU_MEM_ATTACH_HOST);
67 | 
68 |         // Obtain the byte buffer from the pointer. This is supported only
69 |         // for memory that was allocated to be accessible on the host:
70 |         ByteBuffer bb = p.getByteBuffer(0, size);
71 |         
72 |         System.out.println("Buffer on host side: " + bb);
73 | 
74 |         // Fill the buffer with sample data
75 |         FloatBuffer fb = bb.order(ByteOrder.nativeOrder()).asFloatBuffer();
76 |         for (int i = 0; i < n; i++)
77 |         {
78 |             fb.put(i, i);
79 |         }
80 | 
81 |         // Make the buffer accessible to all devices
82 |         cuStreamAttachMemAsync(null, p, 0,  CU_MEM_ATTACH_GLOBAL);
83 |         cuStreamSynchronize(null);
84 | 
85 |         // Use the pointer in a device operation (here, a dot product with 
86 |         // JCublas, for example). The data that was filled in by the host
87 |         // will now be used by the device.
88 |         cublasHandle handle = new cublasHandle();
89 |         cublasCreate(handle);
90 |         float result[] = { -1.0f };
91 |         cublasSdot(handle, n, p, 1, p, 1, Pointer.to(result));
92 |         System.out.println("Result: " + result[0]);
93 |     }
94 | }
95 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/driver/samples/JCudaDriverBasicStreamCallback.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA
  3 |  *
  4 |  * Copyright 2008-2017 Marco Hutter - http://www.jcuda.org
  5 |  */
  6 | package jcuda.driver.samples;
  7 | 
  8 | import static jcuda.driver.JCudaDriver.cuCtxCreate;
  9 | import static jcuda.driver.JCudaDriver.cuCtxDestroy;
 10 | import static jcuda.driver.JCudaDriver.cuDeviceGet;
 11 | import static jcuda.driver.JCudaDriver.cuInit;
 12 | import static jcuda.driver.JCudaDriver.cuMemAlloc;
 13 | import static jcuda.driver.JCudaDriver.cuMemFree;
 14 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoDAsync;
 15 | import static jcuda.driver.JCudaDriver.cuStreamAddCallback;
 16 | import static jcuda.driver.JCudaDriver.cuStreamCreate;
 17 | import static jcuda.driver.JCudaDriver.cuStreamSynchronize;
 18 | 
 19 | import jcuda.Pointer;
 20 | import jcuda.Sizeof;
 21 | import jcuda.driver.CUcontext;
 22 | import jcuda.driver.CUdevice;
 23 | import jcuda.driver.CUdeviceptr;
 24 | import jcuda.driver.CUstream;
 25 | import jcuda.driver.CUstreamCallback;
 26 | import jcuda.driver.JCudaDriver;
 27 | 
 28 | /**
 29 |  * A very basic example / test for the stream callback functionality in the
 30 |  * JCuda Driver API
 31 |  */
 32 | public class JCudaDriverBasicStreamCallback
 33 | {
 34 |     /**
 35 |      * Entry point of this program
 36 |      * 
 37 |      * @param args Not used
 38 |      */
 39 |     public static void main(String[] args)
 40 |     {
 41 |         JCudaDriver.setExceptionsEnabled(true);
 42 | 
 43 |         // Default initialization
 44 |         cuInit(0);
 45 |         CUcontext context = new CUcontext();
 46 |         CUdevice device = new CUdevice();
 47 |         cuDeviceGet(device, 0);
 48 |         cuCtxCreate(context, 0, device);
 49 | 
 50 |         // The stream on which the callbacks will be registered.
 51 |         // When this is "null", then it is the default stream.
 52 |         CUstream stream = null;
 53 | 
 54 |         boolean useDefaultStream = true;
 55 |         useDefaultStream = false;
 56 |         if (!useDefaultStream)
 57 |         {
 58 |             stream = new CUstream();
 59 |             cuStreamCreate(stream, 0);
 60 |         }
 61 |         System.out.println("Using stream " + stream);
 62 | 
 63 |         // Define the callback
 64 |         CUstreamCallback callback = new CUstreamCallback()
 65 |         {
 66 |             @Override
 67 |             public void call(CUstream hStream, int status, Object userData)
 68 |             {
 69 |                 System.out.println("Callback called");
 70 |                 System.out.println("    stream  : " + hStream);
 71 |                 System.out.println("    status  : " + status);
 72 |                 System.out.println("    userData: " + userData);
 73 |                 System.out.println("    thread  : " + Thread.currentThread());
 74 |             }
 75 |         };
 76 | 
 77 |         // Create some dummy data on the host, and copy it to the
 78 |         // device asynchronously
 79 |         int n = 100000;
 80 |         float hostData[] = new float[n];
 81 |         CUdeviceptr deviceData = new CUdeviceptr();
 82 |         cuMemAlloc(deviceData, n * Sizeof.FLOAT);
 83 |         cuMemcpyHtoDAsync(deviceData, Pointer.to(hostData), 
 84 |             n * Sizeof.FLOAT, stream);
 85 | 
 86 |         // Add the callback to the stream that carries the copy operation
 87 |         Object userData = "Example user data";
 88 |         cuStreamAddCallback(stream, callback, userData, 0);
 89 | 
 90 |         // Wait until the stream is finished
 91 |         cuStreamSynchronize(stream);
 92 | 
 93 |         // Clean up
 94 |         cuMemFree(deviceData);
 95 |         cuCtxDestroy(context);
 96 | 
 97 |         System.out.println("Done");
 98 |     }
 99 | 
100 | }
101 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/vec/samples/VecFloatSample.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCudaVec - Vector operations for JCuda 
  3 |  * http://www.jcuda.org
  4 |  *
  5 |  * Copyright (c) 2013-2015 Marco Hutter - http://www.jcuda.org
  6 |  */
  7 | package jcuda.vec.samples;
  8 | 
  9 | import static jcuda.driver.JCudaDriver.cuCtxCreate;
 10 | import static jcuda.driver.JCudaDriver.cuDeviceGet;
 11 | import static jcuda.driver.JCudaDriver.cuInit;
 12 | import static jcuda.driver.JCudaDriver.cuMemAlloc;
 13 | import static jcuda.driver.JCudaDriver.cuMemFree;
 14 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
 15 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
 16 | import jcuda.Pointer;
 17 | import jcuda.Sizeof;
 18 | import jcuda.driver.CUcontext;
 19 | import jcuda.driver.CUdevice;
 20 | import jcuda.driver.CUdeviceptr;
 21 | import jcuda.driver.JCudaDriver;
 22 | import jcuda.vec.VecFloat;
 23 | 
 24 | /**
 25 |  * A sample showing how to use the JCuda vector library
 26 |  */
 27 | public class VecFloatSample
 28 | {
 29 |     public static void main(String[] args)
 30 |     {
 31 |         // Enable exceptions and omit all subsequent error checks
 32 |         JCudaDriver.setExceptionsEnabled(true);
 33 | 
 34 |         // Initialize the driver and create a context for the first device.
 35 |         cuInit(0);
 36 |         CUdevice device = new CUdevice();
 37 |         cuDeviceGet(device, 0);
 38 |         CUcontext context = new CUcontext();
 39 |         cuCtxCreate(context, 0, device);
 40 | 
 41 |         // Afterwards, initialize the vector library, which will
 42 |         // attach to the current context
 43 |         VecFloat.init();
 44 |         
 45 |         // Allocate and fill the host input data
 46 |         int n = 50000;
 47 |         float hostX[] = new float[n];
 48 |         float hostY[] = new float[n];
 49 |         for(int i = 0; i < n; i++)
 50 |         {
 51 |             hostX[i] = (float)i;
 52 |             hostY[i] = (float)i;
 53 |         }
 54 | 
 55 |         // Allocate the device pointers, and copy the
 56 |         // host input data to the device
 57 |         CUdeviceptr deviceX = new CUdeviceptr();
 58 |         cuMemAlloc(deviceX, n * Sizeof.FLOAT);
 59 |         cuMemcpyHtoD(deviceX, Pointer.to(hostX), n * Sizeof.FLOAT);
 60 | 
 61 |         CUdeviceptr deviceY = new CUdeviceptr();
 62 |         cuMemAlloc(deviceY, n * Sizeof.FLOAT); 
 63 |         cuMemcpyHtoD(deviceY, Pointer.to(hostY), n * Sizeof.FLOAT);
 64 | 
 65 |         CUdeviceptr deviceResult = new CUdeviceptr();
 66 |         cuMemAlloc(deviceResult, n * Sizeof.FLOAT);
 67 | 
 68 |         // Perform the vector operations
 69 |         VecFloat.cos(n, deviceX, deviceX);               // x = cos(x)  
 70 |         VecFloat.mul(n, deviceX, deviceX, deviceX);      // x = x*x
 71 |         VecFloat.sin(n, deviceY, deviceY);               // y = sin(y)
 72 |         VecFloat.mul(n, deviceY, deviceY, deviceY);      // y = y*y
 73 |         VecFloat.add(n, deviceResult, deviceX, deviceY); // result = x+y
 74 | 
 75 |         // Allocate host output memory and copy the device output
 76 |         // to the host.
 77 |         float hostResult[] = new float[n];
 78 |         cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, n * Sizeof.FLOAT);
 79 | 
 80 |         // Verify the result
 81 |         boolean passed = true;
 82 |         for(int i = 0; i < n; i++)
 83 |         {
 84 |             float expected = (float)(
 85 |                 Math.cos(hostX[i])*Math.cos(hostX[i])+
 86 |                 Math.sin(hostY[i])*Math.sin(hostY[i]));
 87 |             if (Math.abs(hostResult[i] - expected) > 1e-5)
 88 |             {
 89 |                 System.out.println(
 90 |                     "At index "+i+ " found "+hostResult[i]+
 91 |                     " but expected "+expected);
 92 |                 passed = false;
 93 |                 break;
 94 |             }
 95 |         }
 96 |         System.out.println("Test "+(passed?"PASSED":"FAILED"));
 97 | 
 98 |         // Clean up.
 99 |         cuMemFree(deviceX);
100 |         cuMemFree(deviceY);
101 |         cuMemFree(deviceResult);
102 |         VecFloat.shutdown();
103 |     }
104 | 
105 | }
106 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/vec/samples/VecDoubleSample.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCudaVec - Vector operations for JCuda 
  3 |  * http://www.jcuda.org
  4 |  *
  5 |  * Copyright (c) 2013-2015 Marco Hutter - http://www.jcuda.org
  6 |  */
  7 | package jcuda.vec.samples;
  8 | 
  9 | import static jcuda.driver.JCudaDriver.cuCtxCreate;
 10 | import static jcuda.driver.JCudaDriver.cuDeviceGet;
 11 | import static jcuda.driver.JCudaDriver.cuInit;
 12 | import static jcuda.driver.JCudaDriver.cuMemAlloc;
 13 | import static jcuda.driver.JCudaDriver.cuMemFree;
 14 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
 15 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
 16 | import jcuda.Pointer;
 17 | import jcuda.Sizeof;
 18 | import jcuda.driver.CUcontext;
 19 | import jcuda.driver.CUdevice;
 20 | import jcuda.driver.CUdeviceptr;
 21 | import jcuda.driver.JCudaDriver;
 22 | import jcuda.vec.VecDouble;
 23 | import jcuda.vec.VecFloat;
 24 | 
 25 | /**
 26 |  * A sample showing how to use the JCuda vector library
 27 |  */
 28 | public class VecDoubleSample
 29 | {
 30 |     public static void main(String[] args)
 31 |     {
 32 |         // Enable exceptions and omit all subsequent error checks
 33 |         JCudaDriver.setExceptionsEnabled(true);
 34 | 
 35 |         // Initialize the driver and create a context for the first device.
 36 |         cuInit(0);
 37 |         CUdevice device = new CUdevice();
 38 |         cuDeviceGet(device, 0);
 39 |         CUcontext context = new CUcontext();
 40 |         cuCtxCreate(context, 0, device);
 41 | 
 42 |         // Afterwards, initialize the vector library, which will
 43 |         // attach to the current context
 44 |         VecDouble.init();
 45 |         
 46 |         // Allocate and fill the host input data
 47 |         int n = 50000;
 48 |         double hostX[] = new double[n];
 49 |         double hostY[] = new double[n];
 50 |         for(int i = 0; i < n; i++)
 51 |         {
 52 |             hostX[i] = (double)i;
 53 |             hostY[i] = (double)i;
 54 |         }
 55 | 
 56 |         // Allocate the device pointers, and copy the
 57 |         // host input data to the device
 58 |         CUdeviceptr deviceX = new CUdeviceptr();
 59 |         cuMemAlloc(deviceX, n * Sizeof.DOUBLE);
 60 |         cuMemcpyHtoD(deviceX, Pointer.to(hostX), n * Sizeof.DOUBLE);
 61 | 
 62 |         CUdeviceptr deviceY = new CUdeviceptr();
 63 |         cuMemAlloc(deviceY, n * Sizeof.DOUBLE); 
 64 |         cuMemcpyHtoD(deviceY, Pointer.to(hostY), n * Sizeof.DOUBLE);
 65 | 
 66 |         CUdeviceptr deviceResult = new CUdeviceptr();
 67 |         cuMemAlloc(deviceResult, n * Sizeof.DOUBLE);
 68 | 
 69 |         // Perform the vector operations
 70 |         VecDouble.cos(n, deviceX, deviceX);               // x = cos(x)  
 71 |         VecDouble.mul(n, deviceX, deviceX, deviceX);      // x = x*x
 72 |         VecDouble.sin(n, deviceY, deviceY);               // y = sin(y)
 73 |         VecDouble.mul(n, deviceY, deviceY, deviceY);      // y = y*y
 74 |         VecDouble.add(n, deviceResult, deviceX, deviceY); // result = x+y
 75 | 
 76 |         // Allocate host output memory and copy the device output
 77 |         // to the host.
 78 |         double hostResult[] = new double[n];
 79 |         cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, n * Sizeof.DOUBLE);
 80 | 
 81 |         // Verify the result
 82 |         boolean passed = true;
 83 |         for(int i = 0; i < n; i++)
 84 |         {
 85 |             double expected = 
 86 |                 Math.cos(hostX[i])*Math.cos(hostX[i])+
 87 |                 Math.sin(hostY[i])*Math.sin(hostY[i]);
 88 |             if (Math.abs(hostResult[i] - expected) > 1e-14)
 89 |             {
 90 |                 System.out.println(
 91 |                     "At index "+i+ " found "+hostResult[i]+
 92 |                     " but expected "+expected);
 93 |                 passed = false;
 94 |                 break;
 95 |             }
 96 |         }
 97 |         System.out.println("Test "+(passed?"PASSED":"FAILED"));
 98 | 
 99 |         // Clean up.
100 |         cuMemFree(deviceX);
101 |         cuMemFree(deviceY);
102 |         cuMemFree(deviceResult);
103 |         VecDouble.shutdown();
104 |     }
105 | 
106 | }
107 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/runtime/samples/JCudaRuntimeMappedMemory.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA
  3 |  *
  4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
  5 |  */
  6 | package jcuda.runtime.samples;
  7 | 
  8 | import static jcuda.jcublas.JCublas2.cublasCreate;
  9 | import static jcuda.jcublas.JCublas2.cublasDestroy;
 10 | import static jcuda.jcublas.JCublas2.cublasSscal;
 11 | import static jcuda.runtime.JCuda.cudaDeviceMapHost;
 12 | import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
 13 | import static jcuda.runtime.JCuda.cudaFreeHost;
 14 | import static jcuda.runtime.JCuda.cudaGetDeviceProperties;
 15 | import static jcuda.runtime.JCuda.cudaHostAlloc;
 16 | import static jcuda.runtime.JCuda.cudaHostAllocMapped;
 17 | import static jcuda.runtime.JCuda.cudaHostGetDevicePointer;
 18 | import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
 19 | 
 20 | import java.nio.ByteBuffer;
 21 | import java.nio.ByteOrder;
 22 | import java.nio.FloatBuffer;
 23 | 
 24 | import jcuda.Pointer;
 25 | import jcuda.Sizeof;
 26 | import jcuda.jcublas.JCublas2;
 27 | import jcuda.jcublas.cublasHandle;
 28 | import jcuda.runtime.JCuda;
 29 | import jcuda.runtime.cudaDeviceProp;
 30 | 
 31 | /**
 32 |  * An example showing how to use mapped memory in JCuda. Host memory is
 33 |  * allocated and mapped to the device. There, it is modified with a 
 34 |  * runtime library function (CUBLAS, for example), which then 
 35 |  * effectively writes to host memory.
 36 |  */
 37 | public class JCudaRuntimeMappedMemory
 38 | {
 39 |     /**
 40 |      * Entry point of this sample
 41 |      * 
 42 |      * @param args Not used
 43 |      */
 44 |     public static void main(String args[])
 45 |     {
 46 |         // Enable exceptions to quickly be informed about errors in this test
 47 |         JCuda.setExceptionsEnabled(true);
 48 |         JCublas2.setExceptionsEnabled(true);
 49 | 
 50 |         // Check if the device supports mapped host memory
 51 |         cudaDeviceProp deviceProperties = new cudaDeviceProp();
 52 |         cudaGetDeviceProperties(deviceProperties, 0);
 53 |         if (deviceProperties.canMapHostMemory == 0)
 54 |         {
 55 |             System.err.println("This device can not map host memory");
 56 |             System.err.println(deviceProperties.toFormattedString());
 57 |             return;
 58 |         }
 59 | 
 60 |         // Set the flag indicating that mapped memory will be used
 61 |         cudaSetDeviceFlags(cudaDeviceMapHost);
 62 | 
 63 |         // Allocate mappable host memory
 64 |         int n = 5;
 65 |         Pointer hostPointer = new Pointer();
 66 |         cudaHostAlloc(hostPointer, n * Sizeof.FLOAT, cudaHostAllocMapped);
 67 | 
 68 |         // Create a device pointer mapping the host memory
 69 |         Pointer devicePointer = new Pointer();
 70 |         cudaHostGetDevicePointer(devicePointer, hostPointer, 0);
 71 | 
 72 |         // Obtain a ByteBuffer for accessing the data in the host
 73 |         // pointer. Modifications in this ByteBuffer will be
 74 |         // visible in the device memory.
 75 |         ByteBuffer byteBuffer = hostPointer.getByteBuffer(0, n * Sizeof.FLOAT);
 76 | 
 77 |         // Set the byte order of the ByteBuffer
 78 |         byteBuffer.order(ByteOrder.nativeOrder());
 79 | 
 80 |         // For convenience, view the ByteBuffer as a FloatBuffer
 81 |         // and fill it with some sample data
 82 |         FloatBuffer floatBuffer = byteBuffer.asFloatBuffer();
 83 |         System.out.print("Input : ");
 84 |         for (int i = 0; i < n; i++)
 85 |         {
 86 |             floatBuffer.put(i, (float) i);
 87 |             System.out.print(floatBuffer.get(i) + ", ");
 88 |         }
 89 |         System.out.println();
 90 | 
 91 |         // Apply a CUBLAS routine to the device pointer. This will
 92 |         // modify the host data, which was mapped to the device.
 93 |         cublasHandle handle = new cublasHandle();
 94 |         cublasCreate(handle);
 95 |         Pointer two = Pointer.to(new float[] { 2.0f });
 96 |         cublasSscal(handle, n, two, devicePointer, 1);
 97 |         cublasDestroy(handle);
 98 |         cudaDeviceSynchronize();
 99 | 
100 |         // Print the contents of the host memory after the
101 |         // modification via the mapped pointer.
102 |         System.out.print("Output: ");
103 |         for (int i = 0; i < n; i++)
104 |         {
105 |             System.out.print(floatBuffer.get(i) + ", ");
106 |         }
107 |         System.out.println();
108 | 
109 |         // Clean up
110 |         cudaFreeHost(hostPointer);
111 |     }
112 | }


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/driver/samples/JCudaConstantMemoryExample.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA
  3 |  *
  4 |  * Copyright 2008-2018 Marco Hutter - http://www.jcuda.org
  5 |  */
  6 | package jcuda.driver.samples;
  7 | import static jcuda.driver.JCudaDriver.cuCtxCreate;
  8 | import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
  9 | import static jcuda.driver.JCudaDriver.cuDeviceGet;
 10 | import static jcuda.driver.JCudaDriver.cuInit;
 11 | import static jcuda.driver.JCudaDriver.cuLaunchKernel;
 12 | import static jcuda.driver.JCudaDriver.cuMemAlloc;
 13 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
 14 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
 15 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
 16 | import static jcuda.driver.JCudaDriver.*;
 17 | 
 18 | import java.io.IOException;
 19 | import java.util.Arrays;
 20 | 
 21 | import jcuda.Pointer;
 22 | import jcuda.Sizeof;
 23 | import jcuda.driver.CUcontext;
 24 | import jcuda.driver.CUdevice;
 25 | import jcuda.driver.CUdeviceptr;
 26 | import jcuda.driver.CUfunction;
 27 | import jcuda.driver.CUmodule;
 28 | import jcuda.driver.JCudaDriver;
 29 | import jcuda.samples.utils.JCudaSamplesUtils;
 30 | 
 31 | /**
 32 |  * An example showing how to use constant memory in kernels. <br>
 33 |  */
 34 | public class JCudaConstantMemoryExample 
 35 | {
 36 |     public static void main(String[] args) throws IOException 
 37 |     {
 38 |         // Enable exceptions and omit all subsequent error checks
 39 |         JCudaDriver.setExceptionsEnabled(true);
 40 | 
 41 |         // Initialize the driver and create a context for the first device.
 42 |         cuInit(0);
 43 |         CUdevice device = new CUdevice();
 44 |         cuDeviceGet(device, 0);
 45 |         CUcontext context = new CUcontext();
 46 |         cuCtxCreate(context, 0, device);
 47 | 
 48 |         // Create the PTX file by calling the NVCC
 49 |         String ptxFileName = JCudaSamplesUtils.preparePtxFile(
 50 |             "src/main/resources/kernels/JCudaConstantMemoryKernel.cu");
 51 | 
 52 |         // Load the PTX file.
 53 |         CUmodule module = new CUmodule();
 54 |         cuModuleLoad(module, ptxFileName);
 55 | 
 56 |         // Obtain the pointer to the constant memory, and print some info
 57 |         CUdeviceptr constantMemoryPointer = new CUdeviceptr();
 58 |         long constantMemorySizeArray[] = { 0 };
 59 |         cuModuleGetGlobal(constantMemoryPointer, constantMemorySizeArray, 
 60 |             module, "constantMemoryData");
 61 |         int constantMemorySize = (int)constantMemorySizeArray[0];
 62 |         
 63 |         System.out.println("constantMemoryPointer: " + constantMemoryPointer);
 64 |         System.out.println("constantMemorySize: " + constantMemorySize);
 65 | 
 66 |         // Copy some host data to the constant memory
 67 |         int numElements = constantMemorySize / Sizeof.FLOAT;
 68 |         float hostData[] = new float[numElements];
 69 |         for (int i = 0; i < numElements; i++)
 70 |         {
 71 |             hostData[i] = i;
 72 |         }
 73 |         cuMemcpyHtoD(constantMemoryPointer, 
 74 |             Pointer.to(hostData), constantMemorySize);
 75 |         
 76 |         // Now use the constant memory in the kernel call:
 77 |         
 78 |         // Obtain a function pointer to the "constantMemoryKernel" function.
 79 |         CUfunction kernel = new CUfunction();
 80 |         cuModuleGetFunction(kernel, module, "constantMemoryKernel");
 81 | 
 82 |         // Allocate some device memory
 83 |         CUdeviceptr deviceData = new CUdeviceptr();
 84 |         cuMemAlloc(deviceData, constantMemorySize);
 85 |         
 86 |         // Set up the kernel parameters
 87 |         Pointer kernelParameters = Pointer.to(
 88 |             Pointer.to(deviceData),
 89 |             Pointer.to(new int[]{numElements})
 90 |         );
 91 |         
 92 |         // Launch the kernel
 93 |         int blockSizeX = numElements;
 94 |         int gridSizeX = 1;
 95 |         cuLaunchKernel(kernel,
 96 |             gridSizeX,  1, 1, 
 97 |             blockSizeX, 1, 1,
 98 |             0, null,         
 99 |             kernelParameters, null 
100 |         );
101 |         cuCtxSynchronize();
102 |         
103 |         // Copy the result back to the host, and verify that it is
104 |         // the same that was copied to the constant memory
105 |         float hostResult[] = new float[numElements];
106 |         cuMemcpyDtoH(Pointer.to(hostResult), deviceData, constantMemorySize);
107 |         
108 |         boolean passed = Arrays.equals(hostData,  hostResult);
109 |         System.out.println("Test " + (passed ? "PASSED" : "FAILED"));
110 |     }
111 | }
112 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/jcublas/samples/JCublas2Sample.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA
  3 |  *
  4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
  5 |  */
  6 | package jcuda.jcublas.samples;
  7 | 
  8 | import static jcuda.jcublas.JCublas2.cublasCreate;
  9 | import static jcuda.jcublas.JCublas2.cublasDestroy;
 10 | import static jcuda.jcublas.JCublas2.cublasGetVector;
 11 | import static jcuda.jcublas.JCublas2.cublasSetVector;
 12 | import static jcuda.jcublas.JCublas2.cublasSgemm;
 13 | import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
 14 | import static jcuda.runtime.JCuda.cudaFree;
 15 | import static jcuda.runtime.JCuda.cudaMalloc;
 16 | 
 17 | import jcuda.Pointer;
 18 | import jcuda.Sizeof;
 19 | import jcuda.jcublas.JCublas2;
 20 | import jcuda.jcublas.cublasHandle;
 21 | import jcuda.samples.utils.JCudaSamplesUtils;
 22 | 
 23 | /**
 24 |  * This is a sample class demonstrating the application of JCublas2 for
 25 |  * performing a BLAS 'sgemm' operation, i.e. for computing the matrix <br>
 26 |  * <code>C = alpha * A * B + beta * C</code> <br>
 27 |  * for single-precision floating point values alpha and beta, and matrices 
 28 |  * A, B and C of size 1000x1000.
 29 |  */
 30 | public class JCublas2Sample
 31 | {
 32 |     public static void main(String args[])
 33 |     {
 34 |         JCublas2.setExceptionsEnabled(true);
 35 |         testSgemm(1000);
 36 |     }
 37 | 
 38 |     /**
 39 |      * Test the JCublas sgemm operation for matrices of size n x x
 40 |      * 
 41 |      * @param n The matrix size
 42 |      */
 43 |     public static void testSgemm(int n)
 44 |     {
 45 |         float alpha = 0.3f;
 46 |         float beta = 0.7f;
 47 |         int nn = n * n;
 48 | 
 49 |         System.out.println("Creating input data...");
 50 |         float h_A[] = JCudaSamplesUtils.createRandomFloatData(nn);
 51 |         float h_B[] = JCudaSamplesUtils.createRandomFloatData(nn);
 52 |         float h_C[] = JCudaSamplesUtils.createRandomFloatData(nn);
 53 |         float h_C_ref[] = h_C.clone();
 54 | 
 55 |         System.out.println("Performing Sgemm with Java...");
 56 |         sgemmJava(n, alpha, h_A, h_B, beta, h_C_ref);
 57 | 
 58 |         System.out.println("Performing Sgemm with JCublas...");
 59 |         sgemmJCublas(n, alpha, h_A, h_B, beta, h_C);
 60 | 
 61 |         boolean passed = JCudaSamplesUtils.equalByNorm(h_C, h_C_ref);
 62 |         System.out.println("testSgemm " + (passed ? "PASSED" : "FAILED"));
 63 |     }
 64 | 
 65 |     /**
 66 |      * Implementation of sgemm using JCublas
 67 |      */
 68 |     private static void sgemmJCublas(
 69 |         int n, float alpha, float A[], float B[], float beta, float C[])
 70 |     {
 71 |         int nn = n * n;
 72 | 
 73 |         // Create a CUBLAS handle
 74 |         cublasHandle handle = new cublasHandle();
 75 |         cublasCreate(handle);
 76 | 
 77 |         // Allocate memory on the device
 78 |         Pointer d_A = new Pointer();
 79 |         Pointer d_B = new Pointer();
 80 |         Pointer d_C = new Pointer();
 81 |         cudaMalloc(d_A, nn * Sizeof.FLOAT);
 82 |         cudaMalloc(d_B, nn * Sizeof.FLOAT);
 83 |         cudaMalloc(d_C, nn * Sizeof.FLOAT);
 84 | 
 85 |         // Copy the memory from the host to the device
 86 |         cublasSetVector(nn, Sizeof.FLOAT, Pointer.to(A), 1, d_A, 1);
 87 |         cublasSetVector(nn, Sizeof.FLOAT, Pointer.to(B), 1, d_B, 1);
 88 |         cublasSetVector(nn, Sizeof.FLOAT, Pointer.to(C), 1, d_C, 1);
 89 | 
 90 |         // Execute sgemm
 91 |         Pointer pAlpha = Pointer.to(new float[] { alpha });
 92 |         Pointer pBeta = Pointer.to(new float[] { beta });
 93 |         cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, pAlpha, d_A, n,
 94 |             d_B, n, pBeta, d_C, n);
 95 | 
 96 |         // Copy the result from the device to the host
 97 |         cublasGetVector(nn, Sizeof.FLOAT, d_C, 1, Pointer.to(C), 1);
 98 | 
 99 |         // Clean up
100 |         cudaFree(d_A);
101 |         cudaFree(d_B);
102 |         cudaFree(d_C);
103 |         cublasDestroy(handle);
104 |     }
105 | 
106 |     /**
107 |      * Simple implementation of sgemm, using plain Java
108 |      */
109 |     private static void sgemmJava(
110 |         int n, float alpha, float A[], float B[], float beta, float C[])
111 |     {
112 |         for (int i = 0; i < n; ++i)
113 |         {
114 |             for (int j = 0; j < n; ++j)
115 |             {
116 |                 float prod = 0;
117 |                 for (int k = 0; k < n; ++k)
118 |                 {
119 |                     prod += A[k * n + i] * B[j * n + k];
120 |                 }
121 |                 C[j * n + i] = alpha * prod + beta * C[j * n + i];
122 |             }
123 |         }
124 |     }
125 | 
126 | }


--------------------------------------------------------------------------------
/JCudaSamples/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  2 |     xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 |     <modelVersion>4.0.0</modelVersion>
  4 | 
  5 |     <groupId>org.jcuda</groupId>
  6 |     <artifactId>jcuda-samples</artifactId>
  7 |     <version>0.0.0-SNAPSHOT</version>
  8 | 
  9 |     <properties>
 10 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 11 |         <jcuda.jcudaVersion>11.2.0</jcuda.jcudaVersion>
 12 |     </properties>
 13 | 
 14 |     <name>JCudaSamples</name>
 15 |     <description>Samples for JCuda</description>
 16 |     <url>http://www.jcuda.org</url>
 17 | 
 18 |     <licenses>
 19 |         <license>
 20 |             <name>MIT</name>
 21 |             <url>http://jcuda.org/License.txt</url>
 22 |         </license>
 23 |     </licenses>
 24 | 
 25 |     <developers>
 26 |         <developer>
 27 |             <name>Marco Hutter</name>
 28 |             <email>jcuda@jcuda.org</email>
 29 |             <roles>
 30 |                 <role>developer</role>
 31 |             </roles>
 32 |         </developer>
 33 |     </developers>
 34 | 
 35 |     <repositories>
 36 |         <repository>
 37 |             <id>localMocalMepository</id>
 38 |             <url>file://${basedir}/localMavenRepository</url>
 39 |         </repository>
 40 |     </repositories>
 41 | 
 42 |     <dependencies>
 43 |         <dependency>
 44 |             <groupId>org.jcuda</groupId>
 45 |             <artifactId>jcuda</artifactId>
 46 |             <version>${jcuda.jcudaVersion}</version>
 47 |         </dependency>
 48 |         <dependency>
 49 |             <groupId>org.jcuda</groupId>
 50 |             <artifactId>jcublas</artifactId>
 51 |             <version>${jcuda.jcudaVersion}</version>
 52 |         </dependency>
 53 |         <dependency>
 54 |             <groupId>org.jcuda</groupId>
 55 |             <artifactId>jcufft</artifactId>
 56 |             <version>${jcuda.jcudaVersion}</version>
 57 |         </dependency>
 58 |         <dependency>
 59 |             <groupId>org.jcuda</groupId>
 60 |             <artifactId>jcurand</artifactId>
 61 |             <version>${jcuda.jcudaVersion}</version>
 62 |         </dependency>
 63 |         <dependency>
 64 |             <groupId>org.jcuda</groupId>
 65 |             <artifactId>jcusparse</artifactId>
 66 |             <version>${jcuda.jcudaVersion}</version>
 67 |         </dependency>
 68 |         <dependency>
 69 |             <groupId>org.jcuda</groupId>
 70 |             <artifactId>jcusolver</artifactId>
 71 |             <version>${jcuda.jcudaVersion}</version>
 72 |         </dependency>
 73 |         <dependency>
 74 |             <groupId>org.jcuda</groupId>
 75 |             <artifactId>jcudnn</artifactId>
 76 |             <version>${jcuda.jcudaVersion}</version>
 77 |         </dependency>
 78 |         <dependency>
 79 |             <groupId>org.jcuda</groupId>
 80 |             <artifactId>jcuda-vec</artifactId>
 81 |             <version>0.0.2</version>
 82 |         </dependency>
 83 |         <dependency>
 84 |             <groupId>de.javagl</groupId>
 85 |             <artifactId>matrixmarketreader</artifactId>
 86 |             <version>0.0.1-SNAPSHOT</version>
 87 |         </dependency>
 88 |         <dependency>
 89 |             <groupId>org.jcuda</groupId>
 90 |             <artifactId>jcuda-matrix-utils</artifactId>
 91 |             <version>0.0.1-SNAPSHOT</version>
 92 |         </dependency>
 93 |         <dependency>
 94 |             <groupId>com.github.wendykierp</groupId>
 95 |             <artifactId>JTransforms</artifactId>
 96 |             <version>3.1</version>
 97 |             <classifier>with-dependencies</classifier>
 98 |         </dependency>
 99 |         <dependency>
100 |             <groupId>org.jogamp.gluegen</groupId>
101 |             <artifactId>gluegen-rt-main</artifactId>
102 |             <version>2.3.2</version>
103 |         </dependency>
104 |         <dependency>
105 |             <groupId>org.jogamp.jogl</groupId>
106 |             <artifactId>jogl-all-main</artifactId>
107 |             <version>2.3.2</version>
108 |         </dependency>
109 |         <dependency>
110 |             <groupId>org.lwjgl.lwjgl</groupId>
111 |             <artifactId>lwjgl</artifactId>
112 |             <version>2.9.3</version>
113 |         </dependency>
114 |     </dependencies>
115 | 
116 |     <build>
117 |         <plugins>
118 |             <plugin>
119 |                 <artifactId>maven-compiler-plugin</artifactId>
120 |                 <version>2.3.2</version>
121 |                 <configuration>
122 |                     <source>1.6</source>
123 |                     <target>1.6</target>
124 |                 </configuration>
125 |             </plugin>
126 |         </plugins>
127 |     </build>
128 | 
129 | </project>


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/driver/samples/JCudaDynamicParallelism.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA
  3 |  *
  4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
  5 |  */
  6 | package jcuda.driver.samples;
  7 | 
  8 | import static jcuda.driver.JCudaDriver.cuCtxCreate;
  9 | import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
 10 | import static jcuda.driver.JCudaDriver.cuDeviceGet;
 11 | import static jcuda.driver.JCudaDriver.cuInit;
 12 | import static jcuda.driver.JCudaDriver.cuLaunchKernel;
 13 | import static jcuda.driver.JCudaDriver.cuMemAlloc;
 14 | import static jcuda.driver.JCudaDriver.cuMemFree;
 15 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
 16 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
 17 | import static jcuda.driver.JCudaDriver.cuModuleLoad;
 18 | 
 19 | import java.util.Arrays;
 20 | 
 21 | import jcuda.Pointer;
 22 | import jcuda.Sizeof;
 23 | import jcuda.driver.CUcontext;
 24 | import jcuda.driver.CUdevice;
 25 | import jcuda.driver.CUdeviceptr;
 26 | import jcuda.driver.CUfunction;
 27 | import jcuda.driver.CUmodule;
 28 | import jcuda.driver.JCudaDriver;
 29 | import jcuda.samples.utils.JCudaSamplesUtils;
 30 | 
 31 | /**
 32 |  * A simple example showing how a kernel with dynamic parallelism 
 33 |  * can be loaded from a CUBIN file and launched.
 34 |  */
 35 | public class JCudaDynamicParallelism
 36 | {
 37 |     public static void main(String[] args)
 38 |     {
 39 |         JCudaDriver.setExceptionsEnabled(true);
 40 | 
 41 |         // Initialize a context for the first device
 42 |         cuInit(0);
 43 |         CUcontext context = new CUcontext();
 44 |         CUdevice device = new CUdevice();
 45 |         cuDeviceGet(device, 0);
 46 |         cuCtxCreate(context, 0, device);
 47 | 
 48 |         // Create the CUBIN file by calling the NVCC. 
 49 |         // See the prepareDefaultCubinFile method for the details about
 50 |         // the NVCC parameters that are used here. 
 51 |         String cubinFileName = JCudaSamplesUtils.prepareDefaultCubinFile(
 52 |             "src/main/resources/kernels/JCudaDynamicParallelismKernel.cu");
 53 | 
 54 |         // Load the CUBIN file 
 55 |         CUmodule module = new CUmodule();
 56 |         cuModuleLoad(module, cubinFileName);
 57 | 
 58 |         // Obtain a function pointer to the "parentKernel" function.
 59 |         CUfunction function = new CUfunction();
 60 |         cuModuleGetFunction(function, module, "parentKernel");
 61 | 
 62 |         // Define the nesting structure. 
 63 |         // 
 64 |         // NOTE: The number of child threads MUST match the value that 
 65 |         // is used in the kernel, for the childKernel<<<1, 8>>> call!
 66 |         // 
 67 |         int numParentThreads = 8;
 68 |         int numChildThreads = 8;
 69 | 
 70 |         // Allocate the device data that will be filled by the kernel
 71 |         int numElements = numParentThreads * numChildThreads;
 72 |         CUdeviceptr deviceData = new CUdeviceptr();
 73 |         cuMemAlloc(deviceData, numElements * Sizeof.FLOAT);
 74 | 
 75 |         // Set up the kernel parameters: A pointer to an array
 76 |         // of pointers which point to the actual values.
 77 |         Pointer kernelParameters = Pointer.to(
 78 |             Pointer.to(new int[] { numElements }),
 79 |             Pointer.to(deviceData)
 80 |         );
 81 | 
 82 |         // Call the kernel function.
 83 |         int blockSizeX = numParentThreads;
 84 |         int gridSizeX = (numElements + numElements - 1) / blockSizeX;
 85 |         cuLaunchKernel(function,
 86 |             gridSizeX,  1, 1,      // Grid dimension
 87 |             blockSizeX, 1, 1,      // Block dimension
 88 |             0, null,               // Shared memory size and stream
 89 |             kernelParameters, null // Kernel- and extra parameters
 90 |         );
 91 |         cuCtxSynchronize();
 92 | 
 93 |         // Copy the device data to the host
 94 |         float hostData[] = new float[numElements];
 95 |         for(int i = 0; i < numElements; i++)
 96 |         {
 97 |             hostData[i] = i;
 98 |         }
 99 |         cuMemcpyDtoH(Pointer.to(hostData), 
100 |             deviceData, numElements * Sizeof.FLOAT);
101 | 
102 |         // Compare the host data with the expected values
103 |         float hostDataRef[] = new float[numElements];
104 |         for(int i = 0; i < numParentThreads; i++)
105 |         {
106 |             for (int j=0; j < numChildThreads; j++)
107 |             {
108 |                 hostDataRef[i * numChildThreads + j] = i + 0.1f * j;
109 |             }
110 |         }
111 |         System.out.println("Result: "+Arrays.toString(hostData));
112 |         boolean passed = Arrays.equals(hostData, hostDataRef);
113 |         System.out.println(passed ? "PASSED" : "FAILED");
114 | 
115 |         // Clean up.
116 |         cuMemFree(deviceData);
117 |     }
118 | }
119 | 
120 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/jcublas/samples/JCublas2PointerModes.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA
  3 |  *
  4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
  5 |  */
  6 | package jcuda.jcublas.samples;
  7 | 
  8 | import static jcuda.jcublas.JCublas2.cublasCreate;
  9 | import static jcuda.jcublas.JCublas2.cublasDestroy;
 10 | import static jcuda.jcublas.JCublas2.cublasSdot;
 11 | import static jcuda.jcublas.JCublas2.cublasSetPointerMode;
 12 | import static jcuda.jcublas.cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE;
 13 | import static jcuda.jcublas.cublasPointerMode.CUBLAS_POINTER_MODE_HOST;
 14 | import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
 15 | import static jcuda.runtime.JCuda.cudaFree;
 16 | import static jcuda.runtime.JCuda.cudaMalloc;
 17 | import static jcuda.runtime.JCuda.cudaMemcpy;
 18 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 19 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 20 | 
 21 | import java.util.Arrays;
 22 | 
 23 | import jcuda.Pointer;
 24 | import jcuda.Sizeof;
 25 | import jcuda.jcublas.JCublas2;
 26 | import jcuda.jcublas.cublasHandle;
 27 | import jcuda.runtime.JCuda;
 28 | 
 29 | 
 30 | /**
 31 |  * A sample demonstrating the different pointer modes for CUBLAS 2.
 32 |  * With CUBLAS 2, functions may receive pointers as arguments which are
 33 |  * either used as input parameters or will store results. These pointers
 34 |  * may either be pointers to host or to device memory. This sample shows
 35 |  * how to obtain the result of a 'dot' operation in host- or device 
 36 |  * memory.
 37 |  */
 38 | public class JCublas2PointerModes
 39 | {
 40 |     /**
 41 |      * Entry point of this sample
 42 |      * 
 43 |      * @param args Not used
 44 |      */
 45 |     public static void main(String[] args)
 46 |     {
 47 |         // Enable exceptions and omit subsequent error checks
 48 |         JCublas2.setExceptionsEnabled(true);
 49 |         JCuda.setExceptionsEnabled(true);
 50 | 
 51 |         // Create the input data: A vector containing the
 52 |         // value 1.0 exactly n times.
 53 |         int n = 1000000;
 54 |         float hostData[] = new float[n];
 55 |         Arrays.fill(hostData,  1.0f);
 56 | 
 57 |         // Allocate device memory, and copy the input data to the device
 58 |         Pointer deviceData = new Pointer();
 59 |         cudaMalloc(deviceData, n * Sizeof.FLOAT);
 60 |         cudaMemcpy(deviceData, Pointer.to(hostData), n * Sizeof.FLOAT,
 61 |             cudaMemcpyHostToDevice);
 62 | 
 63 |         // Create a CUBLAS handle
 64 |         cublasHandle handle = new cublasHandle();
 65 |         cublasCreate(handle);
 66 | 
 67 | 
 68 |         // Execute the 'dot' function in HOST pointer mode:
 69 |         // The result will be written to a pointer that
 70 |         // points to host memory.
 71 | 
 72 |         // Set the pointer mode to HOST
 73 |         cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
 74 | 
 75 |         // Prepare the pointer for the result in HOST memory
 76 |         float hostResult[] = { -1.0f };
 77 |         Pointer hostResultPointer = Pointer.to(hostResult);
 78 | 
 79 |         // Execute the 'dot' function
 80 |         long beforeHostCall = System.nanoTime();
 81 |         cublasSdot(handle, n, deviceData, 1, deviceData, 1, hostResultPointer);
 82 |         long afterHostCall = System.nanoTime();
 83 | 
 84 |         // Print the result and timing information
 85 |         double hostDuration = (afterHostCall - beforeHostCall) / 1e6;
 86 |         System.out.println("Host call duration: " + hostDuration + " ms");
 87 |         System.out.println("Result: " + hostResult[0]);
 88 | 
 89 | 
 90 |         // Execute the 'dot' function in DEVICE pointer mode:
 91 |         // The result will be written to a pointer that
 92 |         // points to device memory.
 93 | 
 94 |         // Set the pointer mode to DEVICE
 95 |         cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
 96 | 
 97 |         // Prepare the pointer for the result in DEVICE memory
 98 |         Pointer deviceResultPointer = new Pointer();
 99 |         cudaMalloc(deviceResultPointer, Sizeof.FLOAT);
100 | 
101 |         // Execute the 'dot' function
102 |         long beforeDeviceCall = System.nanoTime();
103 |         cublasSdot(handle, n, deviceData, 1, deviceData, 1,
104 |             deviceResultPointer);
105 |         long afterDeviceCall = System.nanoTime();
106 | 
107 |         // Synchronize in order to wait for the result to
108 |         // be available (note that this is done implicitly
109 |         // when cudaMemcpy is called)
110 |         cudaDeviceSynchronize();
111 |         long afterDeviceSync = System.nanoTime();
112 | 
113 |         // Copy the result from the device to the host
114 |         float deviceResult[] = { -1.0f };
115 |         cudaMemcpy(Pointer.to(deviceResult), deviceResultPointer, 
116 |             Sizeof.FLOAT, cudaMemcpyDeviceToHost);
117 | 
118 |         // Print the result and timing information
119 |         double deviceCallDuration = (afterDeviceCall - beforeDeviceCall) / 1e6;
120 |         double deviceFullDuration = (afterDeviceSync - beforeDeviceCall) / 1e6;
121 |         System.out .println(
122 |             "Device call duration: " + deviceCallDuration + " ms");
123 |         System.out.println(
124 |             "Device full duration: " + deviceFullDuration + " ms");
125 |         System.out.println("Result: " + deviceResult[0]);
126 | 
127 |         // Clean up
128 |         cudaFree(deviceData);
129 |         cublasDestroy(handle);
130 |     }
131 | 
132 | 
133 | }
134 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/jcublas/samples/JCublas2SgemmExSample.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA
  3 |  *
  4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
  5 |  */
  6 | package jcuda.jcublas.samples;
  7 | 
  8 | import static jcuda.cudaDataType.CUDA_R_32F;
  9 | import static jcuda.jcublas.JCublas2.cublasCreate;
 10 | import static jcuda.jcublas.JCublas2.cublasDestroy;
 11 | import static jcuda.jcublas.JCublas2.cublasGemmEx;
 12 | import static jcuda.jcublas.JCublas2.cublasGetVector;
 13 | import static jcuda.jcublas.JCublas2.cublasSetVector;
 14 | import static jcuda.jcublas.cublasGemmAlgo.CUBLAS_GEMM_ALGO0;
 15 | import static jcuda.jcublas.cublasGemmAlgo.CUBLAS_GEMM_ALGO2;
 16 | import static jcuda.jcublas.cublasGemmAlgo.CUBLAS_GEMM_ALGO4;
 17 | import static jcuda.jcublas.cublasGemmAlgo.CUBLAS_GEMM_ALGO5;
 18 | import static jcuda.jcublas.cublasGemmAlgo.CUBLAS_GEMM_ALGO6;
 19 | import static jcuda.jcublas.cublasGemmAlgo.CUBLAS_GEMM_ALGO7;
 20 | import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
 21 | import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
 22 | import static jcuda.runtime.JCuda.cudaFree;
 23 | import static jcuda.runtime.JCuda.cudaMalloc;
 24 | 
 25 | import java.util.Arrays;
 26 | import java.util.List;
 27 | 
 28 | import jcuda.Pointer;
 29 | import jcuda.Sizeof;
 30 | import jcuda.jcublas.JCublas2;
 31 | import jcuda.jcublas.cublasHandle;
 32 | import jcuda.samples.utils.JCudaSamplesUtils;
 33 | 
 34 | /**
 35 |  * This is a sample class demonstrating the application of JCublas2 for
 36 |  * performing a BLAS 'sgemm' operation, i.e. for computing the matrix <br>
 37 |  * <code>C = alpha * A * B + beta * C</code> <br>
 38 |  * for single-precision floating point values alpha and beta, and matrices 
 39 |  * A, B and C, using the extended CUBLAS GEMM function
 40 |  */
 41 | public class JCublas2SgemmExSample
 42 | {
 43 |     public static void main(String args[])
 44 |     {
 45 |         JCublas2.setExceptionsEnabled(true);
 46 |         testSgemm(2000);
 47 |     }
 48 |     
 49 |     // The list of CUBLAS GEMM algorithms to use. Note that the set of
 50 |     // supported algorithms will likely depend on the platform, the
 51 |     // size of the matrix, and other factors.
 52 |     private static final List<Integer> GEMM_ALGORITHMS = Arrays.asList(
 53 |         CUBLAS_GEMM_ALGO2,
 54 |         CUBLAS_GEMM_ALGO4,
 55 |         CUBLAS_GEMM_ALGO5,
 56 |         CUBLAS_GEMM_ALGO6,
 57 |         CUBLAS_GEMM_ALGO7
 58 |     );
 59 |     private static int GEMM_ALGO = CUBLAS_GEMM_ALGO0;
 60 | 
 61 |     /**
 62 |      * Test the JCublas sgemm operation for matrices of size n x x
 63 |      * 
 64 |      * @param n The matrix size
 65 |      */
 66 |     public static void testSgemm(int n)
 67 |     {
 68 |         float alpha = 0.3f;
 69 |         float beta = 0.7f;
 70 |         int nn = n * n;
 71 | 
 72 |         System.out.println("Creating input data...");
 73 |         float h_A[] = JCudaSamplesUtils.createRandomFloatData(nn);
 74 |         float h_B[] = JCudaSamplesUtils.createRandomFloatData(nn);
 75 |         float h_C[] = JCudaSamplesUtils.createRandomFloatData(nn);
 76 | 
 77 |         System.out.println("Performing Sgemm with JCublas...");
 78 |         for (int i : GEMM_ALGORITHMS)
 79 |         {
 80 |             GEMM_ALGO = i;
 81 |             try
 82 |             {
 83 |                 sgemmJCublas(n, alpha, h_A, h_B, beta, h_C);
 84 |             }
 85 |             catch (Exception e)
 86 |             {
 87 |                 e.printStackTrace();
 88 |             }
 89 |         }
 90 | 
 91 |     }
 92 | 
 93 |     /**
 94 |      * Implementation of sgemm using JCublas
 95 |      */
 96 |     private static void sgemmJCublas(
 97 |         int n, float alpha, float A[], float B[], float beta, float C[])
 98 |     {
 99 |         int nn = n * n;
100 | 
101 |         // Create a CUBLAS handle
102 |         cublasHandle handle = new cublasHandle();
103 |         cublasCreate(handle);
104 | 
105 |         // Allocate memory on the device
106 |         Pointer d_A = new Pointer();
107 |         Pointer d_B = new Pointer();
108 |         Pointer d_C = new Pointer();
109 |         cudaMalloc(d_A, nn * Sizeof.FLOAT);
110 |         cudaMalloc(d_B, nn * Sizeof.FLOAT);
111 |         cudaMalloc(d_C, nn * Sizeof.FLOAT);
112 | 
113 |         // Copy the memory from the host to the device
114 |         cublasSetVector(nn, Sizeof.FLOAT, Pointer.to(A), 1, d_A, 1);
115 |         cublasSetVector(nn, Sizeof.FLOAT, Pointer.to(B), 1, d_B, 1);
116 |         cublasSetVector(nn, Sizeof.FLOAT, Pointer.to(C), 1, d_C, 1);
117 | 
118 |         // Execute sgemm
119 |         Pointer pAlpha = Pointer.to(new float[] { alpha });
120 |         Pointer pBeta = Pointer.to(new float[] { beta });
121 |         
122 |         long before = System.nanoTime();
123 |         
124 |         cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, 
125 |             pAlpha, d_A, CUDA_R_32F, n, d_B, CUDA_R_32F, n, 
126 |             pBeta, d_C, CUDA_R_32F, n, CUDA_R_32F, GEMM_ALGO);
127 |         
128 |         cudaDeviceSynchronize();
129 |         
130 |         long after = System.nanoTime();
131 |         double durationMs = (after - before) / 1e6;
132 |         System.out.println(
133 |             "Algorithm " + GEMM_ALGO + " took " + durationMs + " ms");
134 | 
135 |         // Copy the result from the device to the host
136 |         cublasGetVector(nn, Sizeof.FLOAT, d_C, 1, Pointer.to(C), 1);
137 | 
138 |         // Clean up
139 |         cudaFree(d_A);
140 |         cudaFree(d_B);
141 |         cudaFree(d_C);
142 |         cublasDestroy(handle);
143 |     }
144 | 
145 | }


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/driver/samples/JCudaVectorAdd.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA
  3 |  *
  4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
  5 |  */
  6 | package jcuda.driver.samples;
  7 | 
  8 | import static jcuda.driver.JCudaDriver.cuCtxCreate;
  9 | import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
 10 | import static jcuda.driver.JCudaDriver.cuDeviceGet;
 11 | import static jcuda.driver.JCudaDriver.cuInit;
 12 | import static jcuda.driver.JCudaDriver.cuLaunchKernel;
 13 | import static jcuda.driver.JCudaDriver.cuMemAlloc;
 14 | import static jcuda.driver.JCudaDriver.cuMemFree;
 15 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
 16 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
 17 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
 18 | import static jcuda.driver.JCudaDriver.cuModuleLoad;
 19 | 
 20 | import java.io.IOException;
 21 | 
 22 | import jcuda.Pointer;
 23 | import jcuda.Sizeof;
 24 | import jcuda.driver.CUcontext;
 25 | import jcuda.driver.CUdevice;
 26 | import jcuda.driver.CUdeviceptr;
 27 | import jcuda.driver.CUfunction;
 28 | import jcuda.driver.CUmodule;
 29 | import jcuda.driver.JCudaDriver;
 30 | import jcuda.samples.utils.JCudaSamplesUtils;
 31 | 
 32 | /**
 33 |  * This is a sample class demonstrating how to use the JCuda driver
 34 |  * bindings to load and execute a CUDA vector addition kernel.
 35 |  * The sample reads a CUDA file, compiles it to a PTX file
 36 |  * using NVCC, loads the PTX file as a module and executes
 37 |  * the kernel function.
 38 |  */
 39 | public class JCudaVectorAdd
 40 | {
 41 |     /**
 42 |      * Entry point of this sample
 43 |      *
 44 |      * @param args Not used
 45 |      * @throws IOException If an IO error occurs
 46 |      */
 47 |     public static void main(String args[]) throws IOException
 48 |     {
 49 |         // Enable exceptions and omit all subsequent error checks
 50 |         JCudaDriver.setExceptionsEnabled(true);
 51 | 
 52 |         // Create the PTX file by calling the NVCC
 53 |         String ptxFileName = JCudaSamplesUtils.preparePtxFile(
 54 |             "src/main/resources/kernels/JCudaVectorAddKernel.cu");
 55 | 
 56 |         // Initialize the driver and create a context for the first device.
 57 |         cuInit(0);
 58 |         CUdevice device = new CUdevice();
 59 |         cuDeviceGet(device, 0);
 60 |         CUcontext context = new CUcontext();
 61 |         cuCtxCreate(context, 0, device);
 62 | 
 63 |         // Load the ptx file.
 64 |         CUmodule module = new CUmodule();
 65 |         cuModuleLoad(module, ptxFileName);
 66 | 
 67 |         // Obtain a function pointer to the "add" function.
 68 |         CUfunction function = new CUfunction();
 69 |         cuModuleGetFunction(function, module, "add");
 70 | 
 71 |         int numElements = 1024;
 72 | 
 73 |         // Allocate and fill the host input data
 74 |         float hostInputA[] = new float[numElements];
 75 |         float hostInputB[] = new float[numElements];
 76 |         for(int i = 0; i < numElements; i++)
 77 |         {
 78 |             hostInputA[i] = (float)i;
 79 |             hostInputB[i] = (float)i;
 80 |         }
 81 | 
 82 |         // Allocate the device input data, and copy the
 83 |         // host input data to the device
 84 |         CUdeviceptr deviceInputA = new CUdeviceptr();
 85 |         cuMemAlloc(deviceInputA, numElements * Sizeof.FLOAT);
 86 |         cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA),
 87 |             numElements * Sizeof.FLOAT);
 88 |         CUdeviceptr deviceInputB = new CUdeviceptr();
 89 |         cuMemAlloc(deviceInputB, numElements * Sizeof.FLOAT);
 90 |         cuMemcpyHtoD(deviceInputB, Pointer.to(hostInputB),
 91 |             numElements * Sizeof.FLOAT);
 92 | 
 93 |         // Allocate device output memory
 94 |         CUdeviceptr deviceOutput = new CUdeviceptr();
 95 |         cuMemAlloc(deviceOutput, numElements * Sizeof.FLOAT);
 96 | 
 97 |         // Set up the kernel parameters: A pointer to an array
 98 |         // of pointers which point to the actual values.
 99 |         Pointer kernelParameters = Pointer.to(
100 |             Pointer.to(new int[]{numElements}),
101 |             Pointer.to(deviceInputA),
102 |             Pointer.to(deviceInputB),
103 |             Pointer.to(deviceOutput)
104 |         );
105 | 
106 |         // Call the kernel function.
107 |         int blockSizeX = 256;
108 |         int gridSizeX = (int)Math.ceil((double)numElements / blockSizeX);
109 |         cuLaunchKernel(function,
110 |             gridSizeX,  1, 1,      // Grid dimension
111 |             blockSizeX, 1, 1,      // Block dimension
112 |             0, null,               // Shared memory size and stream
113 |             kernelParameters, null // Kernel- and extra parameters
114 |         );
115 |         cuCtxSynchronize();
116 | 
117 |         // Allocate host output memory and copy the device output
118 |         // to the host.
119 |         float hostOutput[] = new float[numElements];
120 |         cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput,
121 |             numElements * Sizeof.FLOAT);
122 | 
123 |         // Verify the result
124 |         boolean passed = true;
125 |         for(int i = 0; i < numElements; i++)
126 |         {
127 |             float expected = i+i;
128 |             if (Math.abs(hostOutput[i] - expected) > 1e-5)
129 |             {
130 |                 System.out.println(
131 |                     "At index "+i+ " found "+hostOutput[i]+
132 |                     " but expected "+expected);
133 |                 passed = false;
134 |                 break;
135 |             }
136 |         }
137 |         System.out.println("Test "+(passed?"PASSED":"FAILED"));
138 | 
139 |         // Clean up.
140 |         cuMemFree(deviceInputA);
141 |         cuMemFree(deviceInputB);
142 |         cuMemFree(deviceOutput);
143 |     }
144 | 
145 | 
146 | }
147 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/resources/kernels/JCudaDriverVolumeRendererKernel.cu:
--------------------------------------------------------------------------------
  1 | // Note: This file is basically the same as in the original NVIDIA CUDA 
  2 | // "volumeRender" sample, with minor modifications:
  3 | // - The host functions and other parts that are not used 
  4 | //   here have been omitted
  5 | // - The render function is declared as 
  6 | //   extern "C"
  7 | //   so that it keeps its original name
  8 | 
  9 | /*
 10 |  * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 11 |  *
 12 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
 13 |  * with this source code for terms and conditions that govern your use of
 14 |  * this software. Any use, reproduction, disclosure, or distribution of
 15 |  * this software and related documentation outside the terms of the EULA
 16 |  * is strictly prohibited.
 17 |  *
 18 |  */
 19 | 
 20 | // Simple 3D volume renderer
 21 | 
 22 | #ifndef _VOLUMERENDER_KERNEL_CU_
 23 | #define _VOLUMERENDER_KERNEL_CU_
 24 | 
 25 | #include "helper_math.h"
 26 | 
 27 | typedef unsigned int  uint;
 28 | typedef unsigned char uchar;
 29 | 
 30 | cudaArray *d_volumeArray = 0;
 31 | cudaArray *d_transferFuncArray;
 32 | 
 33 | typedef unsigned char VolumeType;
 34 | //typedef unsigned short VolumeType;
 35 | 
 36 | texture<VolumeType, 3, cudaReadModeNormalizedFloat> tex;         // 3D texture
 37 | texture<float4, 1, cudaReadModeElementType>         transferTex; // 1D transfer function texture
 38 | 
 39 | typedef struct
 40 | {
 41 |     float4 m[3];
 42 | } float3x4;
 43 | 
 44 | __constant__ float3x4 c_invViewMatrix;  // inverse view matrix
 45 | 
 46 | struct Ray
 47 | {
 48 |     float3 o;   // origin
 49 |     float3 d;   // direction
 50 | };
 51 | 
 52 | // intersect ray with a box
 53 | // http://www.siggraph.org/education/materials/HyperGraph/raytrace/rtinter3.htm
 54 | 
 55 | __device__
 56 | int intersectBox(Ray r, float3 boxmin, float3 boxmax, float *tnear, float *tfar)
 57 | {
 58 |     // compute intersection of ray with all six bbox planes
 59 |     float3 invR = make_float3(1.0f) / r.d;
 60 |     float3 tbot = invR * (boxmin - r.o);
 61 |     float3 ttop = invR * (boxmax - r.o);
 62 | 
 63 |     // re-order intersections to find smallest and largest on each axis
 64 |     float3 tmin = fminf(ttop, tbot);
 65 |     float3 tmax = fmaxf(ttop, tbot);
 66 | 
 67 |     // find the largest tmin and the smallest tmax
 68 |     float largest_tmin = fmaxf(fmaxf(tmin.x, tmin.y), fmaxf(tmin.x, tmin.z));
 69 |     float smallest_tmax = fminf(fminf(tmax.x, tmax.y), fminf(tmax.x, tmax.z));
 70 | 
 71 |     *tnear = largest_tmin;
 72 |     *tfar = smallest_tmax;
 73 | 
 74 |     return smallest_tmax > largest_tmin;
 75 | }
 76 | 
 77 | // transform vector by matrix (no translation)
 78 | __device__
 79 | float3 mul(const float3x4 &M, const float3 &v)
 80 | {
 81 |     float3 r;
 82 |     r.x = dot(v, make_float3(M.m[0]));
 83 |     r.y = dot(v, make_float3(M.m[1]));
 84 |     r.z = dot(v, make_float3(M.m[2]));
 85 |     return r;
 86 | }
 87 | 
 88 | // transform vector by matrix with translation
 89 | __device__
 90 | float4 mul(const float3x4 &M, const float4 &v)
 91 | {
 92 |     float4 r;
 93 |     r.x = dot(v, M.m[0]);
 94 |     r.y = dot(v, M.m[1]);
 95 |     r.z = dot(v, M.m[2]);
 96 |     r.w = 1.0f;
 97 |     return r;
 98 | }
 99 | 
100 | __device__ uint rgbaFloatToInt(float4 rgba)
101 | {
102 |     rgba.x = __saturatef(rgba.x);   // clamp to [0.0, 1.0]
103 |     rgba.y = __saturatef(rgba.y);
104 |     rgba.z = __saturatef(rgba.z);
105 |     rgba.w = __saturatef(rgba.w);
106 |     return (uint(rgba.w*255)<<24) | (uint(rgba.z*255)<<16) | (uint(rgba.y*255)<<8) | uint(rgba.x*255);
107 | }
108 | 
109 | extern "C"
110 | __global__ void
111 | d_render(uint *d_output, uint imageW, uint imageH,
112 |          float density, float brightness,
113 |          float transferOffset, float transferScale)
114 | {
115 |     const int maxSteps = 500;
116 |     const float tstep = 0.01f;
117 |     const float opacityThreshold = 0.95f;
118 |     const float3 boxMin = make_float3(-1.0f, -1.0f, -1.0f);
119 |     const float3 boxMax = make_float3(1.0f, 1.0f, 1.0f);
120 | 
121 |     uint x = blockIdx.x*blockDim.x + threadIdx.x;
122 |     uint y = blockIdx.y*blockDim.y + threadIdx.y;
123 | 
124 |     if ((x >= imageW) || (y >= imageH)) return;
125 | 
126 |     float u = (x / (float) imageW)*2.0f-1.0f;
127 |     float v = (y / (float) imageH)*2.0f-1.0f;
128 | 
129 |     // calculate eye ray in world space
130 |     Ray eyeRay;
131 |     eyeRay.o = make_float3(mul(c_invViewMatrix, make_float4(0.0f, 0.0f, 0.0f, 1.0f)));
132 |     eyeRay.d = normalize(make_float3(u, v, -2.0f));
133 |     eyeRay.d = mul(c_invViewMatrix, eyeRay.d);
134 | 
135 |     // find intersection with box
136 |     float tnear, tfar;
137 |     int hit = intersectBox(eyeRay, boxMin, boxMax, &tnear, &tfar);
138 | 
139 |     if (!hit) return;
140 | 
141 |     if (tnear < 0.0f) tnear = 0.0f;     // clamp to near plane
142 | 
143 |     // march along ray from front to back, accumulating color
144 |     float4 sum = make_float4(0.0f);
145 |     float t = tnear;
146 |     float3 pos = eyeRay.o + eyeRay.d*tnear;
147 |     float3 step = eyeRay.d*tstep;
148 | 
149 |     for (int i=0; i<maxSteps; i++)
150 |     {
151 |         // read from 3D texture
152 |         // remap position to [0, 1] coordinates
153 |         float sample = tex3D(tex, pos.x*0.5f+0.5f, pos.y*0.5f+0.5f, pos.z*0.5f+0.5f);
154 |         //sample *= 64.0f;    // scale for 10-bit data
155 | 
156 |         // lookup in transfer function texture
157 |         float4 col = tex1D(transferTex, (sample-transferOffset)*transferScale);
158 |         col.w *= density;
159 | 
160 |         // "under" operator for back-to-front blending
161 |         //sum = lerp(sum, col, col.w);
162 | 
163 |         // pre-multiply alpha
164 |         col.x *= col.w;
165 |         col.y *= col.w;
166 |         col.z *= col.w;
167 |         // "over" operator for front-to-back blending
168 |         sum = sum + col*(1.0f - sum.w);
169 | 
170 |         // exit early if opaque
171 |         if (sum.w > opacityThreshold)
172 |             break;
173 | 
174 |         t += tstep;
175 | 
176 |         if (t > tfar) break;
177 | 
178 |         pos += step;
179 |     }
180 | 
181 |     sum *= brightness;
182 | 
183 |     // write output color
184 |     d_output[y*imageW + x] = rgbaFloatToInt(sum);
185 | }
186 | 
187 | 
188 | #endif // #ifndef _VOLUMERENDER_KERNEL_CU_
189 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/jcublas/samples/JCublas2SgemmBatched.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA
  3 |  *
  4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
  5 |  */
  6 | package jcuda.jcublas.samples;
  7 | 
  8 | import static jcuda.jcublas.JCublas2.cublasCreate;
  9 | import static jcuda.jcublas.JCublas2.cublasDestroy;
 10 | import static jcuda.jcublas.JCublas2.cublasSgemmBatched;
 11 | import static jcuda.runtime.JCuda.cudaFree;
 12 | import static jcuda.runtime.JCuda.cudaMalloc;
 13 | import static jcuda.runtime.JCuda.cudaMemcpy;
 14 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 15 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 16 | 
 17 | import jcuda.Pointer;
 18 | import jcuda.Sizeof;
 19 | import jcuda.jcublas.JCublas2;
 20 | import jcuda.jcublas.cublasHandle;
 21 | import jcuda.jcublas.cublasOperation;
 22 | import jcuda.runtime.JCuda;
 23 | import jcuda.samples.utils.JCudaSamplesUtils;
 24 | 
 25 | /**
 26 |  * This is a sample class demonstrating the application of JCublas2 for
 27 |  * performing a batched BLAS 'sgemm' operation, i.e. for computing the 
 28 |  * multiple matrices <br>
 29 |  * <code>C = alpha * A * B + beta * C</code> <br>
 30 |  * for single-precision floating point values alpha and beta, and matrices 
 31 |  * A, B and C
 32 |  */
 33 | class JCublas2SgemmBatched
 34 | {
 35 |     public static void main(String[] args)
 36 |     {
 37 |         JCublas2.setExceptionsEnabled(true);
 38 |         JCuda.setExceptionsEnabled(true);
 39 |         testSgemmBatched(10, 100);
 40 |     }
 41 | 
 42 |     public static boolean testSgemmBatched(int b, int n)
 43 |     {
 44 |         System.out.println("Testing Sgemm with " + b + " batches of size " + n);
 45 | 
 46 |         float alpha = 0.3f;
 47 |         float beta = 0.7f;
 48 |         int nn = n * n;
 49 | 
 50 |         float h_A[][] = new float[b][];
 51 |         float h_B[][] = new float[b][];
 52 |         float h_C[][] = new float[b][];
 53 |         float h_C_ref[][] = new float[b][];
 54 |         for (int i = 0; i < b; i++)
 55 |         {
 56 |             h_A[i] = JCudaSamplesUtils.createRandomFloatData(nn);
 57 |             h_B[i] = JCudaSamplesUtils.createRandomFloatData(nn);
 58 |             h_C[i] = JCudaSamplesUtils.createRandomFloatData(nn);
 59 |             h_C_ref[i] = h_C[i].clone();
 60 |         }
 61 | 
 62 |         System.out.println("Performing Sgemm with Java...");
 63 |         sgemmJava(n, alpha, h_A, h_B, beta, h_C_ref);
 64 | 
 65 |         System.out.println("Performing Sgemm with JCublas2...");
 66 |         sgemmBatchedJCublas2(n, alpha, h_A, h_B, beta, h_C);
 67 | 
 68 |         // Print the test results
 69 |         boolean passed = true;
 70 |         for (int i = 0; i < b; i++)
 71 |         {
 72 |             passed &= JCudaSamplesUtils.equalByNorm(h_C[i], h_C_ref[i]);
 73 |         }
 74 |         System.out.println(String.format("testSgemm %s", 
 75 |             passed ? "PASSED" : "FAILED"));
 76 |         return passed;
 77 |     }
 78 | 
 79 |     static void sgemmBatchedJCublas2(int n, float alpha, 
 80 |         float h_A[][], float h_B[][], float beta, float h_C[][])
 81 |     {
 82 |         int nn = n * n;
 83 |         int b = h_A.length;
 84 |         Pointer[] h_Aarray = new Pointer[b];
 85 |         Pointer[] h_Barray = new Pointer[b];
 86 |         Pointer[] h_Carray = new Pointer[b];
 87 |         for (int i = 0; i < b; i++)
 88 |         {
 89 |             h_Aarray[i] = new Pointer();
 90 |             h_Barray[i] = new Pointer();
 91 |             h_Carray[i] = new Pointer();
 92 |             cudaMalloc(h_Aarray[i], nn * Sizeof.FLOAT);
 93 |             cudaMalloc(h_Barray[i], nn * Sizeof.FLOAT);
 94 |             cudaMalloc(h_Carray[i], nn * Sizeof.FLOAT);
 95 |             cudaMemcpy(h_Aarray[i], Pointer.to(h_A[i]),
 96 |                 nn * Sizeof.FLOAT, cudaMemcpyHostToDevice);
 97 |             cudaMemcpy(h_Barray[i], Pointer.to(h_B[i]),
 98 |                 nn * Sizeof.FLOAT, cudaMemcpyHostToDevice);
 99 |             cudaMemcpy(h_Carray[i], Pointer.to(h_C[i]),
100 |                 nn * Sizeof.FLOAT, cudaMemcpyHostToDevice);
101 |         }
102 |         Pointer d_Aarray = new Pointer();
103 |         Pointer d_Barray = new Pointer();
104 |         Pointer d_Carray = new Pointer();
105 |         cudaMalloc(d_Aarray, b * Sizeof.POINTER);
106 |         cudaMalloc(d_Barray, b * Sizeof.POINTER);
107 |         cudaMalloc(d_Carray, b * Sizeof.POINTER);
108 |         cudaMemcpy(d_Aarray, Pointer.to(h_Aarray),
109 |             b * Sizeof.POINTER, cudaMemcpyHostToDevice);
110 |         cudaMemcpy(d_Barray, Pointer.to(h_Barray), 
111 |             b * Sizeof.POINTER, cudaMemcpyHostToDevice);
112 |         cudaMemcpy(d_Carray, Pointer.to(h_Carray), 
113 |             b * Sizeof.POINTER, cudaMemcpyHostToDevice);
114 |         
115 |         cublasHandle handle = new cublasHandle();
116 |         cublasCreate(handle);
117 | 
118 |         cublasSgemmBatched(
119 |             handle, 
120 |             cublasOperation.CUBLAS_OP_N, 
121 |             cublasOperation.CUBLAS_OP_N, 
122 |             n, n, n, 
123 |             Pointer.to(new float[]{ alpha }),            
124 |             d_Aarray, n, d_Barray, n, 
125 |             Pointer.to(new float[]{ beta }), 
126 |             d_Carray, n, b);
127 | 
128 |         for (int i = 0; i < b; i++)
129 |         {
130 |             cudaMemcpy(Pointer.to(h_C[i]), h_Carray[i], 
131 |                 nn * Sizeof.FLOAT, cudaMemcpyDeviceToHost);
132 |             cudaFree(h_Aarray[i]);
133 |             cudaFree(h_Barray[i]);
134 |             cudaFree(h_Carray[i]);
135 |         }
136 |         cudaFree(d_Aarray);
137 |         cudaFree(d_Barray);
138 |         cudaFree(d_Carray);
139 |         cublasDestroy(handle);
140 |         
141 |     }
142 | 
143 |     static void sgemmJava(int n, float alpha, 
144 |         float A[][], float B[][], float beta, float C[][])
145 |     {
146 |         for (int i = 0; i < A.length; i++)
147 |         {
148 |             sgemmJava(n, alpha, A[i], B[i], beta, C[i]);
149 |         }
150 |     }
151 |     
152 |     static void sgemmJava(int n, float alpha, 
153 |         float A[], float B[], float beta, float C[])
154 |     {
155 |         for (int i = 0; i < n; ++i)
156 |         {
157 |             for (int j = 0; j < n; ++j)
158 |             {
159 |                 float prod = 0;
160 |                 for (int k = 0; k < n; ++k)
161 |                 {
162 |                     prod += A[k * n + i] * B[j * n + k];
163 |                 }
164 |                 C[j * n + i] = alpha * prod + beta * C[j * n + i];
165 |             }
166 |         }
167 |     }
168 | }
169 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/nvrtc/samples/JNvrtcLoweredNames.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA driver and runtime API
  3 |  * http://www.jcuda.org
  4 |  *
  5 |  * Copyright 2016 Marco Hutter - http://www.jcuda.org
  6 |  */
  7 | 
  8 | package jcuda.nvrtc.samples;
  9 | 
 10 | import static jcuda.driver.JCudaDriver.cuCtxCreate;
 11 | import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
 12 | import static jcuda.driver.JCudaDriver.cuDeviceGet;
 13 | import static jcuda.driver.JCudaDriver.cuInit;
 14 | import static jcuda.driver.JCudaDriver.cuLaunchKernel;
 15 | import static jcuda.driver.JCudaDriver.cuMemAlloc;
 16 | import static jcuda.driver.JCudaDriver.cuMemFree;
 17 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
 18 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
 19 | import static jcuda.driver.JCudaDriver.cuModuleLoadData;
 20 | import static jcuda.nvrtc.JNvrtc.nvrtcAddNameExpression;
 21 | import static jcuda.nvrtc.JNvrtc.nvrtcCompileProgram;
 22 | import static jcuda.nvrtc.JNvrtc.nvrtcCreateProgram;
 23 | import static jcuda.nvrtc.JNvrtc.nvrtcDestroyProgram;
 24 | import static jcuda.nvrtc.JNvrtc.nvrtcGetLoweredName;
 25 | import static jcuda.nvrtc.JNvrtc.nvrtcGetPTX;
 26 | import static jcuda.nvrtc.JNvrtc.nvrtcGetProgramLog;
 27 | 
 28 | import java.util.Arrays;
 29 | import java.util.List;
 30 | 
 31 | import jcuda.Pointer;
 32 | import jcuda.Sizeof;
 33 | import jcuda.driver.CUcontext;
 34 | import jcuda.driver.CUdevice;
 35 | import jcuda.driver.CUdeviceptr;
 36 | import jcuda.driver.CUfunction;
 37 | import jcuda.driver.CUmodule;
 38 | import jcuda.driver.JCudaDriver;
 39 | import jcuda.nvrtc.JNvrtc;
 40 | import jcuda.nvrtc.nvrtcProgram;
 41 | 
 42 | /**
 43 |  * An example showing how to obtain the mangled names from kernels that
 44 |  * are compiled with the NVRTC at runtime
 45 |  */
 46 | public class JNvrtcLoweredNames
 47 | {
 48 |     /**
 49 |      * The source code of the program that contains different global
 50 |      * functions and function templates. 
 51 |      * (Taken from the NVIDIA NVRTC User Guide)
 52 |      */
 53 |     private static String programSourceCode = 
 54 |         "static __global__ void f1(int *result) { *result = 10; }" + "\n" + 
 55 |         "namespace N1 {" + "\n" + 
 56 |         "    namespace N2 {" + "\n" + 
 57 |         "        __global__ void f2(int *result) { *result = 20; }" + "\n" + 
 58 |         "    }" + "\n" + 
 59 |         "}" + "\n" + 
 60 |         "template<typename T>" + "\n" + 
 61 |         "__global__ void f3(int *result) { *result = sizeof(T); }" + "\n";
 62 |     
 63 |     /**
 64 |      * Entry point of this sample
 65 |      * 
 66 |      * @param args Not used
 67 |      */
 68 |     public static void main(String[] args)
 69 |     {
 70 |         // Enable exceptions and omit all subsequent error checks
 71 |         JCudaDriver.setExceptionsEnabled(true);
 72 |         JNvrtc.setExceptionsEnabled(true);
 73 | 
 74 |         // Initialize the driver and create a context for the first device.
 75 |         cuInit(0);
 76 |         CUdevice device = new CUdevice();
 77 |         cuDeviceGet(device, 0);
 78 |         CUcontext context = new CUcontext();
 79 |         cuCtxCreate(context, 0, device);
 80 | 
 81 |         // Use the NVRTC to create a program
 82 |         nvrtcProgram program = new nvrtcProgram();
 83 |         nvrtcCreateProgram(program, programSourceCode, null, 0, null, null);
 84 | 
 85 |         // Add the name expressions that refer to the global functions
 86 |         // and template instantiations
 87 |         List<String> functionNameExpressions = Arrays.asList(
 88 |             "&f1", 
 89 |             "N1::N2::f2", 
 90 |             "f3<int>", 
 91 |             "f3<double>"
 92 |         );
 93 |         for (String functionNameExpression : functionNameExpressions)
 94 |         {
 95 |             nvrtcAddNameExpression(program, functionNameExpression);
 96 |         }
 97 |         List<Integer> expectedResults = Arrays.asList(10, 20, 4, 8);
 98 | 
 99 |         // Compile the program
100 |         nvrtcCompileProgram(program, 0, null);
101 | 
102 |         // Print the compilation log (for the case there are any warnings)
103 |         String programLog[] = new String[1];
104 |         nvrtcGetProgramLog(program, programLog);
105 |         System.out.println("Program compilation log:\n" + programLog[0]);
106 | 
107 |         // Obtain the PTX ("CUDA Assembler") code of the compiled program
108 |         String[] ptx = new String[1];
109 |         nvrtcGetPTX(program, ptx);
110 | 
111 |         // Create a CUDA module from the PTX code
112 |         CUmodule module = new CUmodule();
113 |         cuModuleLoadData(module, ptx[0]);
114 | 
115 |         // Allocate the output memory on the device
116 |         CUdeviceptr dResult = new CUdeviceptr();
117 |         cuMemAlloc(dResult, Sizeof.INT);
118 | 
119 |         // For each function name expression, obtain the lowered (mangled)
120 |         // function name and print it
121 |         boolean passed = true;
122 |         for (int i = 0; i < functionNameExpressions.size(); i++)
123 |         {
124 |             // Obtain the lowered name. Note that this must be called
125 |             // BEFORE the program is destroyed!
126 |             String functionNameExpression = functionNameExpressions.get(i);
127 |             String loweredName[] = { null };
128 |             nvrtcGetLoweredName(program, functionNameExpression, loweredName);
129 | 
130 |             System.out.println(
131 |                 "Lowered name for " + functionNameExpression
132 |                 + " is " + loweredName[0]);
133 | 
134 |             // Obtain the function pointer to the function from the module,
135 |             // using the lowered name
136 |             CUfunction function = new CUfunction();
137 |             cuModuleGetFunction(function, module, loweredName[0]);
138 | 
139 |             // Call the kernel function
140 |             Pointer kernelParameters = Pointer.to(Pointer.to(dResult));
141 |             cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, null,
142 |                 kernelParameters, null);
143 |             cuCtxSynchronize();
144 | 
145 |             // Copy the result back to the host, and verify it
146 |             int hResult[] = { 0 };
147 |             cuMemcpyDtoH(Pointer.to(hResult), dResult, Sizeof.INT);
148 | 
149 |             System.out.println("Result: " + hResult[0]);
150 | 
151 |             int expectedResult = expectedResults.get(i);
152 |             passed &= (expectedResult == hResult[0]);
153 |         }
154 | 
155 |         System.out.println("Test " + (passed ? "PASSED" : "FAILED"));
156 | 
157 |         // Clean up.
158 |         nvrtcDestroyProgram(program);
159 |         cuMemFree(dResult);
160 | 
161 |     }
162 | }
163 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/nvrtc/samples/JNvrtcVectorAdd.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA driver and runtime API
  3 |  * http://www.jcuda.org
  4 |  *
  5 |  * Copyright 2016 Marco Hutter - http://www.jcuda.org
  6 |  */
  7 | 
  8 | package jcuda.nvrtc.samples;
  9 | 
 10 | import static jcuda.driver.JCudaDriver.cuCtxCreate;
 11 | import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
 12 | import static jcuda.driver.JCudaDriver.cuDeviceGet;
 13 | import static jcuda.driver.JCudaDriver.cuInit;
 14 | import static jcuda.driver.JCudaDriver.cuLaunchKernel;
 15 | import static jcuda.driver.JCudaDriver.cuMemAlloc;
 16 | import static jcuda.driver.JCudaDriver.cuMemFree;
 17 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
 18 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
 19 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
 20 | import static jcuda.driver.JCudaDriver.cuModuleLoadData;
 21 | import static jcuda.nvrtc.JNvrtc.nvrtcCompileProgram;
 22 | import static jcuda.nvrtc.JNvrtc.nvrtcCreateProgram;
 23 | import static jcuda.nvrtc.JNvrtc.nvrtcDestroyProgram;
 24 | import static jcuda.nvrtc.JNvrtc.nvrtcGetPTX;
 25 | import static jcuda.nvrtc.JNvrtc.nvrtcGetProgramLog;
 26 | import jcuda.Pointer;
 27 | import jcuda.Sizeof;
 28 | import jcuda.driver.CUcontext;
 29 | import jcuda.driver.CUdevice;
 30 | import jcuda.driver.CUdeviceptr;
 31 | import jcuda.driver.CUfunction;
 32 | import jcuda.driver.CUmodule;
 33 | import jcuda.driver.JCudaDriver;
 34 | import jcuda.nvrtc.JNvrtc;
 35 | import jcuda.nvrtc.nvrtcProgram;
 36 | 
 37 | /**
 38 |  * An example showing how to use the NVRTC (NVIDIA Runtime Compiler) API
 39 |  * to compile CUDA kernel code at runtime.
 40 |  */
 41 | public class JNvrtcVectorAdd
 42 | {
 43 |     /**
 44 |      * The source code of the program that will be compiled at runtime:
 45 |      * A simple vector addition kernel. 
 46 |      * 
 47 |      * Note: The function should be declared as  
 48 |      * extern "C"
 49 |      * to make sure that it can be found under the given name.
 50 |      */
 51 |     private static String programSourceCode = 
 52 |         "extern \"C\"" + "\n" +
 53 |         "__global__ void add(int n, float *a, float *b, float *sum)" + "\n" +
 54 |         "{" + "\n" +
 55 |         "    int i = blockIdx.x * blockDim.x + threadIdx.x;" + "\n" +
 56 |         "    if (i<n)" + "\n" +
 57 |         "    {" + "\n" +
 58 |         "        sum[i] = a[i] + b[i];" + "\n" +
 59 |         "    }" + "\n" +
 60 |         "}" + "\n";
 61 |     
 62 |     /**
 63 |      * Entry point of this sample
 64 |      * 
 65 |      * @param args Not used
 66 |      */
 67 |     public static void main(String[] args)
 68 |     {
 69 |         // Enable exceptions and omit all subsequent error checks
 70 |         JCudaDriver.setExceptionsEnabled(true);
 71 |         JNvrtc.setExceptionsEnabled(true);
 72 | 
 73 |         // Initialize the driver and create a context for the first device.
 74 |         cuInit(0);
 75 |         CUdevice device = new CUdevice();
 76 |         cuDeviceGet(device, 0);
 77 |         CUcontext context = new CUcontext();
 78 |         cuCtxCreate(context, 0, device);
 79 | 
 80 |         
 81 |         // Use the NVRTC to create a program by compiling the source code
 82 |         nvrtcProgram program = new nvrtcProgram();
 83 |         nvrtcCreateProgram(
 84 |             program, programSourceCode, null, 0, null, null);
 85 |         nvrtcCompileProgram(program, 0, null);
 86 |         
 87 |         // Print the compilation log (for the case there are any warnings)
 88 |         String programLog[] = new String[1];
 89 |         nvrtcGetProgramLog(program, programLog);
 90 |         System.out.println("Program compilation log:\n" + programLog[0]);        
 91 |         
 92 |         // Obtain the PTX ("CUDA Assembler") code of the compiled program
 93 |         String[] ptx = new String[1];
 94 |         nvrtcGetPTX(program, ptx);
 95 |         nvrtcDestroyProgram(program);
 96 | 
 97 |         // Create a CUDA module from the PTX code
 98 |         CUmodule module = new CUmodule();
 99 |         cuModuleLoadData(module, ptx[0]);
100 | 
101 |         // Obtain the function pointer to the "add" function from the module
102 |         CUfunction function = new CUfunction();
103 |         cuModuleGetFunction(function, module, "add");
104 | 
105 | 
106 |         // Continue with some basic setup for the vector addition itself:
107 | 
108 |         // Allocate and fill the host input data
109 |         int numElements = 256 * 100;
110 |         float hostInputA[] = new float[numElements];
111 |         float hostInputB[] = new float[numElements];
112 |         for(int i = 0; i < numElements; i++)
113 |         {
114 |             hostInputA[i] = (float)i;
115 |             hostInputB[i] = (float)i;
116 |         }
117 | 
118 |         // Allocate the device input data, and copy the
119 |         // host input data to the device
120 |         CUdeviceptr deviceInputA = new CUdeviceptr();
121 |         cuMemAlloc(deviceInputA, numElements * Sizeof.FLOAT);
122 |         cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA),
123 |             numElements * Sizeof.FLOAT);
124 |         CUdeviceptr deviceInputB = new CUdeviceptr();
125 |         cuMemAlloc(deviceInputB, numElements * Sizeof.FLOAT);
126 |         cuMemcpyHtoD(deviceInputB, Pointer.to(hostInputB),
127 |             numElements * Sizeof.FLOAT);
128 | 
129 |         // Allocate device output memory
130 |         CUdeviceptr deviceOutput = new CUdeviceptr();
131 |         cuMemAlloc(deviceOutput, numElements * Sizeof.FLOAT);
132 | 
133 |         // Set up the kernel parameters: A pointer to an array
134 |         // of pointers which point to the actual values.
135 |         Pointer kernelParameters = Pointer.to(
136 |             Pointer.to(new int[]{numElements}),
137 |             Pointer.to(deviceInputA),
138 |             Pointer.to(deviceInputB),
139 |             Pointer.to(deviceOutput)
140 |         );
141 | 
142 |         
143 |         // Call the kernel function, which was obtained from the
144 |         // module that was compiled at runtime
145 |         int blockSizeX = 256;
146 |         int gridSizeX = (numElements + blockSizeX - 1) / blockSizeX;
147 |         cuLaunchKernel(function,
148 |             gridSizeX,  1, 1,      // Grid dimension
149 |             blockSizeX, 1, 1,      // Block dimension
150 |             0, null,               // Shared memory size and stream
151 |             kernelParameters, null // Kernel- and extra parameters
152 |         );
153 |         cuCtxSynchronize();
154 | 
155 |         // Allocate host output memory and copy the device output
156 |         // to the host.
157 |         float hostOutput[] = new float[numElements];
158 |         cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput,
159 |             numElements * Sizeof.FLOAT);
160 | 
161 |         // Verify the result
162 |         boolean passed = true;
163 |         for(int i = 0; i < numElements; i++)
164 |         {
165 |             float expected = i+i;
166 |             if (Math.abs(hostOutput[i] - expected) > 1e-5)
167 |             {
168 |                 System.out.println(
169 |                     "At index "+i+ " found "+hostOutput[i]+
170 |                     " but expected "+expected);
171 |                 passed = false;
172 |                 break;
173 |             }
174 |         }
175 |         System.out.println("Test "+(passed?"PASSED":"FAILED"));
176 | 
177 |         // Clean up.
178 |         cuMemFree(deviceInputA);
179 |         cuMemFree(deviceInputB);
180 |         cuMemFree(deviceOutput);
181 |         
182 |     }
183 | }
184 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/driver/samples/JCudaAllocationInKernel.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA
  3 |  *
  4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
  5 |  */
  6 | package jcuda.driver.samples;
  7 | import static jcuda.driver.JCudaDriver.cuCtxCreate;
  8 | import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
  9 | import static jcuda.driver.JCudaDriver.cuDeviceGet;
 10 | import static jcuda.driver.JCudaDriver.cuInit;
 11 | import static jcuda.driver.JCudaDriver.cuLaunchKernel;
 12 | import static jcuda.driver.JCudaDriver.cuMemAlloc;
 13 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
 14 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
 15 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
 16 | import static jcuda.driver.JCudaDriver.cuModuleLoad;
 17 | 
 18 | import java.io.IOException;
 19 | import java.util.Arrays;
 20 | 
 21 | import jcuda.Pointer;
 22 | import jcuda.Sizeof;
 23 | import jcuda.driver.CUcontext;
 24 | import jcuda.driver.CUdevice;
 25 | import jcuda.driver.CUdeviceptr;
 26 | import jcuda.driver.CUfunction;
 27 | import jcuda.driver.CUmodule;
 28 | import jcuda.driver.JCudaDriver;
 29 | import jcuda.samples.utils.JCudaSamplesUtils;
 30 | 
 31 | /**
 32 |  * An example showing how to allocate memory in kernels. <br>
 33 |  * <br>
 34 |  * Kernels may allocate memory, using the standard <code>malloc</code> and
 35 |  * <code>free</code> functions. When used inside a kernel, these functions
 36 |  * will allocate device memory. This device memory can NOT be used in 
 37 |  * host functions (not even the ones that operate on device memory!).
 38 |  * The device memory that was allocated on the device is thus not compatible
 39 |  * with the device memory that was allocated on the host. 
 40 |  * See http://stackoverflow.com/a/13043240 for details.<br>
 41 |  * <br>
 42 |  * This example shows how to allocate, use and free memory in kernels. The 
 43 |  * usage pattern shown here does not necessarily make any sense, but it points 
 44 |  * out the difference between device memory allocated on the host, and device 
 45 |  * memory allocated on the device, using overly elaborate variable names.  
 46 |  */
 47 | public class JCudaAllocationInKernel 
 48 | {
 49 |     public static void main(String[] args) throws IOException 
 50 |     {
 51 |         // Enable exceptions and omit all subsequent error checks
 52 |         JCudaDriver.setExceptionsEnabled(true);
 53 | 
 54 |         // Initialize the driver and create a context for the first device.
 55 |         cuInit(0);
 56 |         CUdevice device = new CUdevice();
 57 |         cuDeviceGet(device, 0);
 58 |         CUcontext context = new CUcontext();
 59 |         cuCtxCreate(context, 0, device);
 60 | 
 61 |         // Create the PTX file by calling the NVCC
 62 |         String ptxFileName = JCudaSamplesUtils.preparePtxFile(
 63 |             "src/main/resources/kernels/JCudaAllocationInKernelKernel.cu");
 64 | 
 65 |         // Load the PTX file.
 66 |         CUmodule module = new CUmodule();
 67 |         cuModuleLoad(module, ptxFileName);
 68 | 
 69 |         // Obtain a function pointer to the "allocatingKernel" function.
 70 |         CUfunction allocatingKernel = new CUfunction();
 71 |         cuModuleGetFunction(allocatingKernel, module, "allocatingKernel");
 72 | 
 73 |         // Obtain a function pointer to the "copyingKernel" function.
 74 |         CUfunction copyingKernel = new CUfunction();
 75 |         cuModuleGetFunction(copyingKernel, module, "copyingKernel");
 76 | 
 77 |         // Obtain a function pointer to the "freeingKernel" function.
 78 |         CUfunction freeingKernel = new CUfunction();
 79 |         cuModuleGetFunction(freeingKernel, module, "freeingKernel");
 80 | 
 81 |         int numThreads = 4;
 82 | 
 83 |         // NOTE: This must match the value in the kernels! 
 84 |         int numberOfShortsAllocatedInKernel = 3;
 85 | 
 86 |         // What will arrive in the allocating kernel: A device pointer that is 
 87 |         // allocated on the host. Each element of this "array" will afterwards 
 88 |         // contain a device pointer that was allocated on the device.
 89 |         CUdeviceptr devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice = 
 90 |             new CUdeviceptr();
 91 |         cuMemAlloc(devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice, 
 92 |             numThreads * Sizeof.POINTER);
 93 | 
 94 |         // The parameter for the allocating kernel: 
 95 |         // A pointer to a pointer that points to the device pointer that 
 96 |         // was allocated on the host, and points to the device pointers 
 97 |         // that will be allocated on the device. Yeah.
 98 |         Pointer allocatingKernelParameters = Pointer.to(
 99 |             Pointer.to(devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice)
100 |             );
101 | 
102 |         // Launch the allocating kernel
103 |         int blockSizeX = numThreads;
104 |         int gridSizeX = 1;
105 |         cuLaunchKernel(allocatingKernel,
106 |             gridSizeX,  1, 1, 
107 |             blockSizeX, 1, 1,
108 |             0, null,         
109 |             allocatingKernelParameters, null 
110 |         );
111 |         cuCtxSynchronize();
112 | 
113 |         // Create the (host) array of device pointers that are allocated on
114 |         // the host
115 |         CUdeviceptr devicePointersAllocatedOnHost[] = 
116 |             new CUdeviceptr[numThreads];
117 |         for (int i=0; i<numThreads; i++)
118 |         {
119 |             devicePointersAllocatedOnHost[i] = new CUdeviceptr();
120 |             cuMemAlloc(devicePointersAllocatedOnHost[i], 
121 |                 numberOfShortsAllocatedInKernel * Sizeof.SHORT);
122 |         }
123 | 
124 |         // Allocate a device pointer on the host, and fill it with the 
125 |         // device pointers that are allocated on the host
126 |         CUdeviceptr devicePointerAllocatedOnHostToDevicePointersAllocatedOnHost = 
127 |             new CUdeviceptr();
128 |         cuMemAlloc(devicePointerAllocatedOnHostToDevicePointersAllocatedOnHost, 
129 |             numThreads * Sizeof.POINTER);
130 |         cuMemcpyHtoD(devicePointerAllocatedOnHostToDevicePointersAllocatedOnHost, 
131 |             Pointer.to(devicePointersAllocatedOnHost), 
132 |             numThreads * Sizeof.POINTER);
133 | 
134 |         // The parameters for the copying kernel:
135 |         // - A pointer to a pointer that points to the device pointer that 
136 |         //   was allocated on the host, and points to the device pointers 
137 |         //   that have been be allocated on the device.
138 |         // - A pointer to a pointer that points to the device pointer that 
139 |         //   was allocated on the host, and points to the device pointers 
140 |         //   that have been be allocated on the host
141 |         Pointer copyingKernelParameters = Pointer.to(
142 |             Pointer.to(devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice),
143 |             Pointer.to(devicePointerAllocatedOnHostToDevicePointersAllocatedOnHost)
144 |             );
145 | 
146 |         // Launch the copying kernel
147 |         cuLaunchKernel(copyingKernel,
148 |             gridSizeX,  1, 1, 
149 |             blockSizeX, 1, 1,
150 |             0, null,         
151 |             copyingKernelParameters, null 
152 |         );
153 |         cuCtxSynchronize();
154 | 
155 |         // Copy the contents of each pointer into a result array
156 |         short resultArrays[][] = new short[numThreads][numberOfShortsAllocatedInKernel];
157 |         for (int i=0; i<numThreads; i++)
158 |         {
159 |             cuMemcpyDtoH(Pointer.to(resultArrays[i]), 
160 |                 devicePointersAllocatedOnHost[i], 
161 |                 numberOfShortsAllocatedInKernel * Sizeof.SHORT);        
162 |         }
163 | 
164 |         // The parameters for the freeing kernel: 
165 |         // The same as for the allocating kernel
166 |         Pointer freeingKernelParameters = Pointer.to(
167 |             Pointer.to(devicePointerAllocatedOnHostToDevicePointersAllocatedOnDevice)
168 |             );
169 | 
170 |         // Launch the freeing kernel
171 |         cuLaunchKernel(freeingKernel,
172 |             gridSizeX,  1, 1, 
173 |             blockSizeX, 1, 1,
174 |             0, null,         
175 |             freeingKernelParameters, null 
176 |         );
177 |         cuCtxSynchronize();
178 | 
179 | 
180 |         // Print the results
181 |         for (int i = 0; i < numThreads; i++)
182 |         {
183 |             System.out.println("Result from thread " + i + " is " + 
184 |                 Arrays.toString(resultArrays[i]));
185 |         }
186 |     }
187 | 
188 | 
189 | }
190 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/runtime/samples/JCudaRuntimeMemoryBandwidths.java:
--------------------------------------------------------------------------------
  1 | package jcuda.runtime.samples;
  2 | /*
  3 |  * JCuda - Java bindings for NVIDIA CUDA
  4 |  *
  5 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
  6 |  */
  7 | 
  8 | import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
  9 | import static jcuda.runtime.JCuda.cudaEventCreate;
 10 | import static jcuda.runtime.JCuda.cudaEventDestroy;
 11 | import static jcuda.runtime.JCuda.cudaEventElapsedTime;
 12 | import static jcuda.runtime.JCuda.cudaEventRecord;
 13 | import static jcuda.runtime.JCuda.cudaFree;
 14 | import static jcuda.runtime.JCuda.cudaFreeHost;
 15 | import static jcuda.runtime.JCuda.cudaHostAlloc;
 16 | import static jcuda.runtime.JCuda.cudaHostAllocWriteCombined;
 17 | import static jcuda.runtime.JCuda.cudaMalloc;
 18 | import static jcuda.runtime.JCuda.cudaMemcpyAsync;
 19 | import static jcuda.runtime.JCuda.cudaSetDevice;
 20 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 21 | 
 22 | import java.nio.ByteBuffer;
 23 | import java.util.Locale;
 24 | 
 25 | import jcuda.Pointer;
 26 | import jcuda.runtime.cudaEvent_t;
 27 | 
 28 | /**
 29 |  * A comparison of the bandwidth of memory copy operations, depending on
 30 |  * the memory type.<br> 
 31 |  * <br>
 32 |  * This test computes the bandwidth of the data transfer from the host to 
 33 |  * the device for different host memory types:
 34 |  * <ul>
 35 |  *   <li>
 36 |  *     Host data is once allocated as pinned memory 
 37 |  *     (using <code>cudaHostAlloc</code>)
 38 |  *   </li>  
 39 |  *   <li>
 40 |  *     Host data that is stored in pageable memory (comparable to 
 41 |  *     <code>malloc</code> in C), 
 42 |  *     <ul>
 43 |  *       <li>in a Java array</li>
 44 |  *       <li>a direct buffer</li>
 45 |  *     </ul>
 46 |  *   </li>  
 47 |  * </ul>      
 48 |  */
 49 | public class JCudaRuntimeMemoryBandwidths
 50 | {
 51 |     /**
 52 |      * Memory modes for the host memory
 53 |      */
 54 |     enum HostMemoryMode
 55 |     {
 56 |         /**
 57 |          * Pinned host memory, allocated with cudaHostAlloc
 58 |          */
 59 |         PINNED,
 60 |         
 61 |         /**
 62 |          * Pageable memory in form of a Pointer.to(array)
 63 |          */
 64 |         PAGEABLE_ARRAY,
 65 |         
 66 |         /**
 67 |          * Pageable memory in form of a Pointer.to(directBuffer)
 68 |          */
 69 |         PAGEABLE_DIRECT_BUFFER,
 70 |     }
 71 |     
 72 |     /**
 73 |      * Entry point of this sample
 74 |      * 
 75 |      * @param args Not used
 76 |      */
 77 |     public static void main(String[] args)
 78 |     {
 79 |         int device = 0;
 80 |         cudaSetDevice(device);
 81 |         
 82 |         int hostAllocFlags = cudaHostAllocWriteCombined;
 83 |         run(HostMemoryMode.PINNED, hostAllocFlags);
 84 |         run(HostMemoryMode.PAGEABLE_ARRAY, hostAllocFlags);
 85 |         run(HostMemoryMode.PAGEABLE_DIRECT_BUFFER, hostAllocFlags);
 86 |         
 87 |         System.out.println("Done");
 88 |     }
 89 |     
 90 | 
 91 |     /**
 92 |      * Run the computation of the bandwidth for copying host memory to the 
 93 |      * device, using various memory block sizes, and print the results
 94 |      * 
 95 |      * @param hostMemoryMode The {@link HostMemoryMode}
 96 |      * @param hostAllocFlags The flags for cudaHostAlloc
 97 |      */
 98 |     static void run(HostMemoryMode hostMemoryMode, int hostAllocFlags)
 99 |     {
100 |         int minExponent = 10;
101 |         int maxExponent = 28;
102 |         int count = maxExponent - minExponent;
103 |         int memorySizes[] = new int[count];
104 |         float bandwidths[] = new float[memorySizes.length];
105 |         
106 |         System.out.print("Running with " + hostMemoryMode);
107 |         for (int i = 0; i < count; i++)
108 |         {
109 |             System.out.print(".");
110 |             memorySizes[i] = (1 << minExponent + i);
111 |             float bandwidth = computeBandwidth(
112 |                 hostMemoryMode, hostAllocFlags, memorySizes[i]);
113 |             bandwidths[i] = bandwidth;
114 |         }
115 |         System.out.println();
116 | 
117 |         System.out.println("Bandwidths for " + hostMemoryMode);
118 |         for (int i = 0; i < memorySizes.length; i++)
119 |         {
120 |             String s = String.format("%10d", memorySizes[i]);
121 |             String b = String.format(Locale.ENGLISH, "%5.3f", bandwidths[i]);
122 |             System.out.println(s + " bytes : " + b + " MB/s");
123 |         }
124 |         System.out.println("\n");
125 |     }
126 |     
127 |     
128 |     /**
129 |      * Compute the bandwidth in MB per second for copying data from the
130 |      * host to the device
131 |      * 
132 |      * @param hostMemoryMode The {@link HostMemoryMode}
133 |      * @param hostAllocFlags The flags for the cudaHostAlloc call
134 |      * @param memorySizes The memory sizes, in bytes
135 |      * @param bandwidths Will store the bandwidth, in MB per second
136 |      */
137 |     static void computeBandwidths(
138 |         HostMemoryMode hostMemoryMode, int hostAllocFlags,
139 |         int memorySizes[], float bandwidths[])
140 |     {
141 |         for (int i = 0; i < memorySizes.length; i++)
142 |         {
143 |             int memorySize = memorySizes[i];
144 |             float bandwidth = computeBandwidth(
145 |                 hostMemoryMode, hostAllocFlags, memorySize);
146 |             bandwidths[i] = bandwidth;
147 |         }
148 |     }
149 |     
150 |     /**
151 |      * Compute the bandwidth in MB per second for copying data from the
152 |      * host to the device
153 |      * 
154 |      * @param hostMemoryMode The {@link HostMemoryMode}
155 |      * @param hostAllocFlags The flags for the cudaHostAlloc call
156 |      * @param memorySize The memory size, in bytes
157 |      * @return The bandwidth, in MB per second
158 |      */
159 |     static float computeBandwidth(
160 |         HostMemoryMode hostMemoryMode, int hostAllocFlags, int memorySize)
161 |     {
162 |         // Initialize the host memory
163 |         Pointer hostData = null;
164 |         ByteBuffer hostDataBuffer = null;
165 |         if (hostMemoryMode == HostMemoryMode.PINNED)
166 |         {
167 |             // Allocate pinned (page-locked) host memory
168 |             hostData = new Pointer();
169 |             cudaHostAlloc(hostData, memorySize, hostAllocFlags);
170 |             hostDataBuffer = hostData.getByteBuffer(0, memorySize);
171 |         }
172 |         else if (hostMemoryMode == HostMemoryMode.PAGEABLE_ARRAY)
173 |         {
174 |             // The host memory is pageable and stored in a Java array
175 |             byte array[] = new byte[memorySize];
176 |             hostDataBuffer = ByteBuffer.wrap(array);
177 |             hostData = Pointer.to(array);
178 |         }
179 |         else
180 |         {
181 |             // The host memory is pageable and stored in a direct byte buffer
182 |             hostDataBuffer = ByteBuffer.allocateDirect(memorySize);
183 |             hostData = Pointer.to(hostDataBuffer);
184 |         }
185 | 
186 |         // Fill the memory with arbitrary data
187 |         for (int i = 0; i < memorySize; i++)
188 |         {
189 |             hostDataBuffer.put(i, (byte) i);
190 |         }
191 | 
192 |         // Allocate device memory
193 |         Pointer deviceData = new Pointer();
194 |         cudaMalloc(deviceData, memorySize);
195 | 
196 |         final int runs = 10;
197 |         float bandwidth = computeBandwidth(
198 |             deviceData, hostData, cudaMemcpyHostToDevice, memorySize, runs);
199 | 
200 |         // Clean up
201 |         if (hostMemoryMode == HostMemoryMode.PINNED)
202 |         {
203 |             cudaFreeHost(hostData);
204 |         }
205 |         cudaFree(deviceData);
206 |         return bandwidth;
207 |     }
208 |     
209 | 
210 |     /**
211 |      * Compute the bandwidth in MB per second for copying data from the 
212 |      * given source pointer to the given destination pointer
213 |      * 
214 |      * @param dstData The destination pointer
215 |      * @param srcData The source pointer
216 |      * @param memcopyKind The cudaMemcpyKind. Must match the types 
217 |      * of the source and destination pointers!
218 |      * @param memSize The memory size, in bytes
219 |      * @param runs The number of times that the copying operation
220 |      * should be repeated
221 |      * @return The bandwidth in MB per second
222 |      */
223 |     static float computeBandwidth(
224 |         Pointer dstData, Pointer srcData, 
225 |         int memcopyKind, int memSize, int runs)
226 |     {
227 |         // Initialize the events for the time measure
228 |         cudaEvent_t start = new cudaEvent_t();
229 |         cudaEvent_t stop = new cudaEvent_t();
230 |         cudaEventCreate(start);
231 |         cudaEventCreate(stop);
232 | 
233 |         // Perform the specified number of copying operations
234 |         cudaEventRecord(start, null);
235 |         for (int i = 0; i < runs; i++)
236 |         {
237 |             cudaMemcpyAsync(dstData, srcData, memSize, memcopyKind, null);
238 |         }
239 |         cudaEventRecord(stop, null);
240 |         cudaDeviceSynchronize();
241 | 
242 |         // Compute the elapsed time and bandwidth
243 |         // in MB per second
244 |         float elapsedTimeMsArray[] = { Float.NaN };
245 |         cudaEventElapsedTime(elapsedTimeMsArray, start, stop);
246 |         float elapsedTimeMs = elapsedTimeMsArray[0];
247 |         float bandwidthInBytesPerMs = ((float) memSize * runs) / elapsedTimeMs;
248 |         float bandwidth = bandwidthInBytesPerMs / 1024;
249 | 
250 |         // Clean up
251 |         cudaEventDestroy(stop);
252 |         cudaEventDestroy(start);
253 |         return bandwidth;
254 |     }
255 | }


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/driver/gl/samples/SimpleInteraction.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA
  3 |  *
  4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
  5 |  */
  6 | package jcuda.driver.gl.samples;
  7 | 
  8 | import java.awt.Point;
  9 | import java.awt.event.MouseEvent;
 10 | import java.awt.event.MouseMotionListener;
 11 | import java.awt.event.MouseWheelEvent;
 12 | import java.awt.event.MouseWheelListener;
 13 | import java.util.Arrays;
 14 | 
 15 | /**
 16 |  * A class encapsulating a VERY simple mouse interaction for the GL samples:
 17 |  * It offers a {@link #getMouseControl()} that may be attached as a 
 18 |  * MouseMotionListener and MouseWheelListener to an arbitrary component,
 19 |  * and methods to obtain a {@link #getModelviewMatrix() model-view} and
 20 |  * {@link #getProjectionMatrix() projection} matrix.   
 21 |  */
 22 | class SimpleInteraction
 23 | {
 24 |     /**
 25 |      * The translation in X-direction
 26 |      */
 27 |     private float translationX = 0;
 28 | 
 29 |     /**
 30 |      * The translation in Y-direction
 31 |      */
 32 |     private float translationY = 0;
 33 | 
 34 |     /**
 35 |      * The translation in Z-direction
 36 |      */
 37 |     private float translationZ = -4;
 38 | 
 39 |     /**
 40 |      * The rotation about the X-axis, in degrees
 41 |      */
 42 |     private float rotationDegX = 40;
 43 | 
 44 |     /**
 45 |      * The rotation about the Y-axis, in degrees
 46 |      */
 47 |     private float rotationDegY = 30;
 48 | 
 49 |     /**
 50 |      * The current projection matrix
 51 |      */
 52 |     private float projectionMatrix[] = new float[16];
 53 | 
 54 |     /**
 55 |      * The current modelview matrix
 56 |      */
 57 |     private float modelviewMatrix[] = new float[16];
 58 |     
 59 |     /**
 60 |      * Inner class encapsulating the MouseMotionListener and
 61 |      * MouseWheelListener for the interaction
 62 |      */
 63 |     class MouseControl implements MouseMotionListener, MouseWheelListener
 64 |     {
 65 |         private Point previousMousePosition = new Point();
 66 | 
 67 |         @Override
 68 |         public void mouseDragged(MouseEvent e)
 69 |         {
 70 |             int dx = e.getX() - previousMousePosition.x;
 71 |             int dy = e.getY() - previousMousePosition.y;
 72 | 
 73 |             // If the left button is held down, move the object
 74 |             if ((e.getModifiersEx() & MouseEvent.BUTTON1_DOWN_MASK) == 
 75 |                 MouseEvent.BUTTON1_DOWN_MASK)
 76 |             {
 77 |                 translationX += dx / 100.0f;
 78 |                 translationY -= dy / 100.0f;
 79 |             }
 80 | 
 81 |             // If the right button is held down, rotate the object
 82 |             else if ((e.getModifiersEx() & MouseEvent.BUTTON3_DOWN_MASK) == 
 83 |                 MouseEvent.BUTTON3_DOWN_MASK)
 84 |             {
 85 |                 rotationDegX += dy;
 86 |                 rotationDegY += dx;
 87 |             }
 88 |             previousMousePosition = e.getPoint();
 89 |             updateModelviewMatrix();
 90 |         }
 91 | 
 92 |         @Override
 93 |         public void mouseMoved(MouseEvent e)
 94 |         {
 95 |             previousMousePosition = e.getPoint();
 96 |         }
 97 | 
 98 |         @Override
 99 |         public void mouseWheelMoved(MouseWheelEvent e)
100 |         {
101 |             // Translate along the Z-axis
102 |             translationZ += e.getWheelRotation() * 0.25f;
103 |             previousMousePosition = e.getPoint();
104 |             updateModelviewMatrix();
105 |         }
106 |     }
107 |     
108 |     /**
109 |      * The mouse control
110 |      */
111 |     private final MouseControl mouseControl;
112 |     
113 |     /**
114 |      * Default constructor
115 |      */
116 |     SimpleInteraction()
117 |     {
118 |         this.mouseControl = new MouseControl();
119 |         updateModelviewMatrix();
120 |     }
121 |     
122 |     /**
123 |      * Returns the mouse control that may be attached to a component
124 |      * as a MouseMotionListener and MouseWheelListener
125 |      * 
126 |      * @return The mouse control
127 |      */
128 |     MouseControl getMouseControl()
129 |     {
130 |         return mouseControl;
131 |     }
132 |     
133 |     /**
134 |      * Update the modelview matrix depending on the
135 |      * current translation and rotation
136 |      */
137 |     private void updateModelviewMatrix()
138 |     {
139 |         float m0[] = translation(translationX, translationY, translationZ);
140 |         float m1[] = rotationX(rotationDegX);
141 |         float m2[] = rotationY(rotationDegY);
142 |         modelviewMatrix = multiply(multiply(m1,m2), m0);
143 |     }
144 | 
145 |     /**
146 |      * Update the projection matrix for the given screen width and height
147 |      * 
148 |      * @param w The width
149 |      * @param h The height
150 |      */
151 |     void updateProjectionMatrix(int w, int h)
152 |     {
153 |         float aspect = (float) w / h;
154 |         projectionMatrix = perspective(50, aspect, 0.1f, 100.0f);
155 |     }
156 |     
157 |     /**
158 |      * Returns a <b>reference</b> to the modelview matrix
159 |      * 
160 |      * @return The matrix
161 |      */
162 |     float[] getModelviewMatrix()
163 |     {
164 |         return modelviewMatrix;
165 |     }
166 |     
167 |     /**
168 |      * Returns a <b>reference</b> to the projection matrix
169 |      * 
170 |      * @return The matrix
171 |      */
172 |     float[] getProjectionMatrix()
173 |     {
174 |         return projectionMatrix;
175 |     }
176 |     
177 |     /**
178 |      * Returns the rotation around the x-axis, in degrees
179 |      * 
180 |      * @return The rotation
181 |      */
182 |     float getRotationDegX()
183 |     {
184 |         return rotationDegX;
185 |     }
186 |     
187 |     /**
188 |      * Returns the rotation around the y-axis, in degrees
189 |      * 
190 |      * @return The rotation
191 |      */
192 |     float getRotationDegY()
193 |     {
194 |         return rotationDegY;
195 |     }
196 |     
197 |     /**
198 |      * Returns the translation along the x-axis
199 |      * 
200 |      * @return The translation
201 |      */
202 |     float getTranslationX()
203 |     {
204 |         return translationX;
205 |     }
206 |     
207 |     /**
208 |      * Returns the translation along the y-axis
209 |      * 
210 |      * @return The translation
211 |      */
212 |     float getTranslationY()
213 |     {
214 |         return translationY;
215 |     }
216 |     
217 |     /**
218 |      * Returns the translation along the z-axis
219 |      * 
220 |      * @return The translation
221 |      */
222 |     float getTranslationZ()
223 |     {
224 |         return translationZ;
225 |     }
226 |     
227 |     /**
228 |      * Helper method that creates a perspective matrix
229 |      * @param fovy The fov in y-direction, in degrees
230 |      * 
231 |      * @param aspect The aspect ratio
232 |      * @param zNear The near clipping plane
233 |      * @param zFar The far clipping plane
234 |      * @return A perspective matrix
235 |      */
236 |     private static float[] perspective(
237 |         float fovy, float aspect, float zNear, float zFar)
238 |     {
239 |         float radians = (float)Math.toRadians(fovy / 2);
240 |         float deltaZ = zFar - zNear;
241 |         float sine = (float)Math.sin(radians);
242 |         if ((deltaZ == 0) || (sine == 0) || (aspect == 0)) 
243 |         {
244 |             return identity();
245 |         }
246 |         float cotangent = (float)Math.cos(radians) / sine;
247 |         float m[] = identity();
248 |         m[0*4+0] = cotangent / aspect;
249 |         m[1*4+1] = cotangent;
250 |         m[2*4+2] = -(zFar + zNear) / deltaZ;
251 |         m[2*4+3] = -1;
252 |         m[3*4+2] = -2 * zNear * zFar / deltaZ;
253 |         m[3*4+3] = 0;
254 |         return m;
255 |     }
256 |     
257 |     /**
258 |      * Creates an identity matrix
259 |      * 
260 |      * @return An identity matrix 
261 |      */
262 |     private static float[] identity()
263 |     {
264 |         float m[] = new float[16];
265 |         Arrays.fill(m, 0);
266 |         m[0] = m[5] = m[10] = m[15] = 1.0f;
267 |         return m;
268 |     }
269 |     
270 |     /**
271 |      * Multiplies the given matrices and returns the result
272 |      * 
273 |      * @param m0 The first matrix
274 |      * @param m1 The second matrix
275 |      * @return The product m0*m1
276 |      */
277 |     private static float[] multiply(float m0[], float m1[])
278 |     {
279 |         float m[] = new float[16];
280 |         for (int x=0; x < 4; x++)
281 |         {
282 |             for(int y=0; y < 4; y++)
283 |             {
284 |                 m[x*4 + y] = 
285 |                     m0[x*4+0] * m1[y+ 0] +
286 |                     m0[x*4+1] * m1[y+ 4] +
287 |                     m0[x*4+2] * m1[y+ 8] +
288 |                     m0[x*4+3] * m1[y+12];
289 |             }
290 |         }
291 |         return m;
292 |     }
293 |     
294 |     /**
295 |      * Creates a translation matrix
296 |      * 
297 |      * @param x The x translation
298 |      * @param y The y translation
299 |      * @param z The z translation
300 |      * @return A translation matrix
301 |      */
302 |     private static float[] translation(float x, float y, float z)
303 |     {
304 |         float m[] = identity();
305 |         m[12] = x;
306 |         m[13] = y;
307 |         m[14] = z;
308 |         return m;
309 |     }
310 | 
311 |     /**
312 |      * Creates a matrix describing a rotation around the x-axis
313 |      * 
314 |      * @param angleDeg The rotation angle, in degrees
315 |      * @return The rotation matrix
316 |      */
317 |     private static float[] rotationX(float angleDeg)
318 |     {
319 |         float m[] = identity();
320 |         float angleRad = (float)Math.toRadians(angleDeg);
321 |         float ca = (float)Math.cos(angleRad);
322 |         float sa = (float)Math.sin(angleRad);
323 |         m[ 5] =  ca;
324 |         m[ 6] =  sa;
325 |         m[ 9] = -sa;
326 |         m[10] =  ca;
327 |         return m;
328 |     }
329 | 
330 |     /**
331 |      * Creates a matrix describing a rotation around the y-axis
332 |      * 
333 |      * @param angleDeg The rotation angle, in degrees
334 |      * @return The rotation matrix
335 |      */
336 |     private static float[] rotationY(float angleDeg)
337 |     {
338 |         float m[] = identity();
339 |         float angleRad = (float)Math.toRadians(angleDeg);
340 |         float ca = (float)Math.cos(angleRad);
341 |         float sa = (float)Math.sin(angleRad);
342 |         m[ 0] =  ca;
343 |         m[ 2] = -sa;
344 |         m[ 8] =  sa;
345 |         m[10] =  ca;
346 |         return m;
347 |     }
348 | 
349 |     
350 | 
351 | }
352 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/jcublas/samples/JCublas2MatrixInvert.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA
  3 |  *
  4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
  5 |  */
  6 | package jcuda.jcublas.samples;
  7 | 
  8 | import static jcuda.jcublas.JCublas2.cublasCreate;
  9 | import static jcuda.jcublas.JCublas2.cublasDestroy;
 10 | import static jcuda.jcublas.JCublas2.cublasGetMatrix;
 11 | import static jcuda.jcublas.JCublas2.cublasGetVector;
 12 | import static jcuda.jcublas.JCublas2.cublasIsamax;
 13 | import static jcuda.jcublas.JCublas2.cublasSetMatrix;
 14 | import static jcuda.jcublas.JCublas2.cublasSetVector;
 15 | import static jcuda.jcublas.JCublas2.cublasSgemm;
 16 | import static jcuda.jcublas.JCublas2.cublasSgemv;
 17 | import static jcuda.jcublas.JCublas2.cublasSger;
 18 | import static jcuda.jcublas.JCublas2.cublasSscal;
 19 | import static jcuda.jcublas.JCublas2.cublasSswap;
 20 | import static jcuda.jcublas.JCublas2.cublasStrmv;
 21 | import static jcuda.jcublas.cublasFillMode.CUBLAS_FILL_MODE_UPPER;
 22 | import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
 23 | import static jcuda.runtime.JCuda.cudaFree;
 24 | import static jcuda.runtime.JCuda.cudaMalloc;
 25 | import static jcuda.runtime.JCuda.cudaMemcpy;
 26 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
 27 | 
 28 | import jcuda.Pointer;
 29 | import jcuda.Sizeof;
 30 | import jcuda.jcublas.cublasHandle;
 31 | import jcuda.samples.utils.JCudaSamplesUtils;
 32 | 
 33 | /**
 34 |  * Example of a matrix inversion using JCublas2.
 35 |  */
 36 | public class JCublas2MatrixInvert
 37 | {
 38 |     /**
 39 |      * Entry point of this sample
 40 |      * 
 41 |      * @param args Not used
 42 |      */
 43 |     public static void main(String[] args)
 44 |     {
 45 |         // Create a CUBLAS handle
 46 |         cublasHandle handle = new cublasHandle();
 47 |         cublasCreate(handle);
 48 | 
 49 |         // Create the input matrix
 50 |         int size = 7;
 51 |         float A[] = JCudaSamplesUtils.createRandomFloatData(size * size);
 52 | 
 53 |         // Invert the matrix
 54 |         float invA[] = A.clone();
 55 |         invertMatrix(handle, size, invA);
 56 | 
 57 |         // Compute A*invA, which should yield the identity matrix
 58 |         float identity[] = new float[size * size];
 59 |         multiply(handle, size, A, invA, identity);
 60 | 
 61 |         // Print the results
 62 |         System.out.println("A:");
 63 |         System.out.println(JCudaSamplesUtils.toString2D(A, size));
 64 |         System.out.println("invA:");
 65 |         System.out.println(JCudaSamplesUtils.toString2D(invA, size));
 66 |         System.out.println("identity:");
 67 |         System.out.println(JCudaSamplesUtils.toString2D(identity, size));
 68 |         
 69 |         // Verify the result
 70 |         boolean passed = true;
 71 |         final float epsilon = 1e-5f;
 72 |         for (int i = 0; i < size; i++)
 73 |         {
 74 |             for (int j = 0; j < size; j++)
 75 |             {
 76 |                 int index = i * size + j;
 77 |                 float value = identity[index];
 78 |                 if (i == j)
 79 |                 {
 80 |                     passed &= Math.abs(value - 1.0f) <= epsilon;
 81 |                 }
 82 |                 else
 83 |                 {
 84 |                     passed &= Math.abs(value) <= epsilon;
 85 |                 }
 86 |             }
 87 |         }
 88 |         System.out.println((passed ? "PASSED" : "FAILED"));
 89 | 
 90 |         // Clean up
 91 |         cublasDestroy(handle);
 92 |     }
 93 | 
 94 |     /**
 95 |      * Copies the given n x n matrix into device memory, inverts it by calling
 96 |      * {@link #invertMatrix(cublasHandle, int, Pointer)}, and copies it back 
 97 |      * into the given array.
 98 |      * 
 99 |      * @param handle The CUBLAS handle
100 |      * @param n The size of the matrix
101 |      * @param A The matrix
102 |      */
103 |     public static void invertMatrix(cublasHandle handle, int n, float A[])
104 |     {
105 |         Pointer dA = new Pointer();
106 |         cudaMalloc(dA, n * n * Sizeof.FLOAT);
107 |         cublasSetMatrix(n, n, Sizeof.FLOAT, Pointer.to(A), n, dA, n);
108 | 
109 |         invertMatrix(handle, n, dA);
110 | 
111 |         cublasGetMatrix(n, n, Sizeof.FLOAT, dA, n, Pointer.to(A), n);
112 |         cudaFree(dA);
113 |     }
114 | 
115 |     /**
116 |      * Invert the n x n matrix that is given in device memory.
117 |      * 
118 |      * @param n The size of the matrix
119 |      * @param dA The matrix
120 |      */
121 |     public static void invertMatrix(cublasHandle handle, int n, Pointer dA)
122 |     {
123 |         // Perform LU factorization
124 |         int[] pivots = cudaSgetrfSquare(handle, n, dA);
125 | 
126 |         // Perform inversion on factorized matrix
127 |         cudaSgetri(handle, n, dA, pivots);
128 |     }
129 | 
130 |     /**
131 |      * Convenience method that returns a pointer with the given offset (in
132 |      * number of 4-byte float elements) from the given pointer.
133 |      * 
134 |      * @param p The pointer
135 |      * @param floatOffset The offset, in number of float elements
136 |      * @return The new pointer
137 |      */
138 |     private static Pointer at(Pointer p, int floatOffset)
139 |     {
140 |         return p.withByteOffset(floatOffset * Sizeof.FLOAT);
141 |     }
142 | 
143 |     /**
144 |      * cudaSgetrf performs an in-place LU factorization on a square matrix. 
145 |      * Uses the unblocked BLAS2 approach
146 |      * 
147 |      * @param n The matrix size
148 |      * @param dA The pointer to the matrix (in device memory)
149 |      * @return The pivots
150 |      */
151 |     private static int[] cudaSgetrfSquare(
152 |         cublasHandle handle, int n, Pointer dA)
153 |     {
154 |         int[] pivots = new int[n];
155 |         for (int i = 0; i < n; i++)
156 |         {
157 |             pivots[i] = i;
158 |         }
159 | 
160 |         Pointer minusOne = Pointer.to(new float[] { -1.0f });
161 |         float[] factor = { 0.0f };
162 |         Pointer pFactor = Pointer.to(factor);
163 |         for (int i = 0; i < n - 1; i++)
164 |         {
165 |             Pointer offset = at(dA, i * n + i);
166 | 
167 |             int max[] = { 0 };
168 |             cublasIsamax(handle, n - i, offset, 1, Pointer.to(max));
169 |             int pivot = i - 1 + max[0];
170 |             if (pivot != i)
171 |             {
172 |                 pivots[i] = pivot;
173 |                 cublasSswap(handle, n, at(dA, pivot), n, at(dA, i), n);
174 |             }
175 | 
176 |             cublasGetVector(1, Sizeof.FLOAT, offset, 1, pFactor, 1);
177 |             factor[0] = 1 / factor[0];
178 |             cublasSscal(handle, n - i - 1, pFactor, at(offset, 1), 1);
179 |             cublasSger(handle, n - i - 1, n - i - 1, minusOne, at(offset, 1), 
180 |                 1, at(offset, n), n, at(offset, n + 1), n);
181 |         }
182 |         return pivots;
183 |     }
184 | 
185 |     /***
186 |      * cudaSgetri Computes the inverse of an LU-factorized square matrix
187 |      * 
188 |      * @param n The matrix size
189 |      * @param dA The matrix in device memory
190 |      * @param pivots The pivots
191 |      */
192 |     private static void cudaSgetri(
193 |         cublasHandle handle, int n, Pointer dA, int[] pivots)
194 |     {
195 |         // Perform inv(U)
196 |         cudaStrtri(handle, n, dA);
197 | 
198 |         // Solve inv(A)*L = inv(U)
199 |         Pointer dWork = new Pointer();
200 |         cudaMalloc(dWork, (n - 1) * Sizeof.FLOAT);
201 | 
202 |         Pointer zero = Pointer.to(new float[]{ 0.0f });
203 |         Pointer one = Pointer.to(new float[]{ 1.0f });
204 |         Pointer minusOne = Pointer.to(new float[]{ -1.0f });
205 |         for (int i = n - 1; i > 0; i--)
206 |         {
207 |             Pointer offset = at(dA, ((i - 1) * n + i));
208 |             cudaMemcpy(dWork, offset, (n - 1) * Sizeof.FLOAT,
209 |                 cudaMemcpyDeviceToDevice);
210 |             cublasSscal(handle, n - i, zero, offset, 1);
211 |             cublasSgemv(handle, CUBLAS_OP_N, n, n - i, minusOne, 
212 |                 at(dA, i * n), n, dWork, 1, one, at(dA, ((i - 1) * n)), 1);
213 |         }
214 | 
215 |         cudaFree(dWork);
216 | 
217 |         // Pivot back to original order
218 |         for (int i = n - 1; i >= 0; i--)
219 |         {
220 |             if (i != pivots[i])
221 |             {
222 |                 cublasSswap(handle, n, at(dA, i * n), 1, 
223 |                     at(dA, pivots[i] * n), 1);
224 |             }
225 |         }
226 | 
227 |     }
228 | 
229 |     /***
230 |      * cudaStrtri Computes the inverse of an upper triangular matrix in place
231 |      * Uses the unblocked BLAS2 approach
232 |      * 
233 |      * @param n The size of the matrix
234 |      * @param dA The matrix
235 |      */
236 |     private static void cudaStrtri(cublasHandle handle, int n, Pointer dA)
237 |     {
238 |         float[] factor = { 0.0f };
239 |         Pointer pFactor = Pointer.to(factor);
240 |         for (int i = 0; i < n; i++)
241 |         {
242 |             Pointer offset = at(dA, i * n);
243 |             cublasGetVector(1, Sizeof.FLOAT, at(offset, i), 1, pFactor, 1);
244 |             factor[0] = 1 / factor[0];
245 |             cublasSetVector(1, Sizeof.FLOAT, pFactor, 1, at(offset, i), 1);
246 | 
247 |             factor[0] = -factor[0];
248 |             cublasStrmv(handle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
249 |                 CUBLAS_OP_N, i, dA, n, offset, 1);
250 |             cublasSscal(handle, i, pFactor, offset, 1);
251 |         }
252 |     }
253 | 
254 |     // === Utility methods for this sample ====================================
255 | 
256 |     /**
257 |      * Multiplies the matrices A and B and writes the result into C.
258 |      * 
259 |      * @param size The size of the matrices
260 |      * @param A Matrix A
261 |      * @param B Matrix B
262 |      * @param C Matrix C
263 |      */
264 |     private static void multiply(cublasHandle handle, int size, float A[],
265 |         float B[], float C[])
266 |     {
267 |         Pointer dA = new Pointer();
268 |         Pointer dB = new Pointer();
269 |         Pointer dC = new Pointer();
270 | 
271 |         cudaMalloc(dA, size * size * Sizeof.FLOAT);
272 |         cudaMalloc(dB, size * size * Sizeof.FLOAT);
273 |         cudaMalloc(dC, size * size * Sizeof.FLOAT);
274 |         cublasSetVector(size * size, Sizeof.FLOAT, Pointer.to(A), 1, dA, 1);
275 |         cublasSetVector(size * size, Sizeof.FLOAT, Pointer.to(B), 1, dB, 1);
276 | 
277 |         Pointer zero = Pointer.to(new float[]{ 0.0f });
278 |         Pointer one = Pointer.to(new float[]{ 1.0f });
279 |         cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size, one, 
280 |             dA, size, dB, size, zero, dC, size);
281 | 
282 |         cublasGetVector(size * size, Sizeof.FLOAT, dC, 1, Pointer.to(C), 1);
283 |         cudaFree(dA);
284 |         cudaFree(dB);
285 |         cudaFree(dC);
286 |     }
287 | 
288 | }


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/jcudnn/samples/JCudnnMnistUtils.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA
  3 |  *
  4 |  * Copyright 2008-2020 Marco Hutter - http://www.jcuda.org
  5 |  */
  6 | package jcuda.jcudnn.samples;
  7 | 
  8 | import static jcuda.runtime.JCuda.cudaDeviceReset;
  9 | import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
 10 | import static jcuda.runtime.JCuda.cudaMalloc;
 11 | import static jcuda.runtime.JCuda.cudaMemcpy;
 12 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 13 | import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 14 | 
 15 | import java.io.ByteArrayOutputStream;
 16 | import java.io.DataInputStream;
 17 | import java.io.File;
 18 | import java.io.FileInputStream;
 19 | import java.io.IOException;
 20 | import java.io.InputStream;
 21 | import java.nio.ByteBuffer;
 22 | import java.nio.ByteOrder;
 23 | import java.nio.FloatBuffer;
 24 | 
 25 | import jcuda.CudaException;
 26 | import jcuda.Pointer;
 27 | import jcuda.Sizeof;
 28 | import jcuda.jcudnn.cudnnDataType;
 29 | 
 30 | /**
 31 |  * Utility methods for the JCudnnMnist sample. These are mainly file IO 
 32 |  * methods for the sample files that contain the binary data of the 
 33 |  * trained network, and the images.
 34 |  */
 35 | class JCudnnMnistUtils
 36 | {
 37 |     static Pointer readBinaryFileAsDeviceDataUnchecked(
 38 |         String fileName, int dataType)
 39 |     {
 40 |         if (dataType == cudnnDataType.CUDNN_DATA_FLOAT)
 41 |         {
 42 |             float data[] = readBinaryFileAsFloatsUnchecked(fileName);
 43 |             return createDevicePointer(data);
 44 |         }
 45 |         if (dataType == cudnnDataType.CUDNN_DATA_DOUBLE)
 46 |         {
 47 |             float data[] = readBinaryFileAsFloatsUnchecked(fileName);
 48 |             double doubleData[] = toDouble(data);
 49 |             return createDevicePointer(doubleData);
 50 |         }
 51 |         throw new IllegalArgumentException(
 52 |             "Invalid data type: " + cudnnDataType.stringFor(dataType));
 53 |     }
 54 |     
 55 |     private static float[] readBinaryFileAsFloatsUnchecked(String fileName)
 56 |     {
 57 |         try
 58 |         {
 59 |             return readBinaryFileAsFloats(fileName);
 60 |         }
 61 |         catch (IOException e)
 62 |         {
 63 |             cudaDeviceReset();
 64 |             throw new CudaException("Could not read input file", e);
 65 |         }
 66 |     }
 67 |     
 68 |     private static float[] readBinaryFileAsFloats(String fileName) 
 69 |         throws IOException
 70 |     {
 71 |         FileInputStream fis = new FileInputStream(new File(fileName));
 72 |         byte data[] = readFully(fis);
 73 |         ByteBuffer bb = ByteBuffer.wrap(data);
 74 |         bb.order(ByteOrder.nativeOrder());
 75 |         FloatBuffer fb = bb.asFloatBuffer();
 76 |         float result[] = new float[fb.capacity()];
 77 |         fb.get(result);
 78 |         return result;
 79 |     }
 80 |     
 81 |     private static double[] toDouble(float array[])
 82 |     {
 83 |         double result[] = new double[array.length];
 84 |         for (int i = 0; i < array.length; i++)
 85 |         {
 86 |             result[i] = array[i];
 87 |         }
 88 |         return result;
 89 |     }
 90 | 
 91 |     private static byte[] readFully(InputStream inputStream) throws IOException
 92 |     {
 93 |         ByteArrayOutputStream baos = new ByteArrayOutputStream();
 94 |         byte buffer[] = new byte[1024];
 95 |         while (true)
 96 |         {
 97 |             int n = inputStream.read(buffer);
 98 |             if (n < 0)
 99 |             {
100 |                 break;
101 |             }
102 |             baos.write(buffer, 0, n);
103 |         }
104 |         byte data[] = baos.toByteArray();
105 |         return data;
106 |     }
107 | 
108 |     static Pointer readImageDataUnchecked(String fileName, int dataType)
109 |     {
110 |         if (dataType == cudnnDataType.CUDNN_DATA_FLOAT)
111 |         {
112 |             float data[] = readImageDataAsFloatsUnchecked(fileName);
113 |             return Pointer.to(data);
114 |         }
115 |         if (dataType == cudnnDataType.CUDNN_DATA_DOUBLE)
116 |         {
117 |             double data[] = readImageDataAsDoublesUnchecked(fileName);
118 |             return Pointer.to(data);
119 |         }
120 |         throw new IllegalArgumentException(
121 |             "Invalid data type: " + cudnnDataType.stringFor(dataType));
122 |     }
123 | 
124 |     private static double[] readImageDataAsDoublesUnchecked(String fileName)
125 |     {
126 |         try
127 |         {
128 |             return readImageDataAsDoubles(fileName);
129 |         }
130 |         catch (IOException e)
131 |         {
132 |             cudaDeviceReset();
133 |             throw new CudaException("Could not read input file", e);
134 |         }
135 |     }
136 | 
137 |     private static double[] readImageDataAsDoubles(String fileName) throws IOException
138 |     {
139 |         InputStream is = new FileInputStream(new File(fileName));
140 |         byte data[] = readBinaryPortableGraymap8bitData(is);
141 |         double imageData[] = new double[data.length];
142 |         for (int i = 0; i < data.length; i++)
143 |         {
144 |             imageData[i] = (((int) data[i]) & 0xff) / 255.0;
145 |         }
146 |         return imageData;
147 |     }
148 |     
149 |     private static float[] readImageDataAsFloatsUnchecked(String fileName)
150 |     {
151 |         try
152 |         {
153 |             return readImageDataAsFloats(fileName);
154 |         }
155 |         catch (IOException e)
156 |         {
157 |             cudaDeviceReset();
158 |             throw new CudaException("Could not read input file", e);
159 |         }
160 |     }
161 | 
162 |     private static float[] readImageDataAsFloats(String fileName) throws IOException
163 |     {
164 |         InputStream is = new FileInputStream(new File(fileName));
165 |         byte data[] = readBinaryPortableGraymap8bitData(is);
166 |         float imageData[] = new float[data.length];
167 |         for (int i = 0; i < data.length; i++)
168 |         {
169 |             imageData[i] = (((int) data[i]) & 0xff) / 255.0f;
170 |         }
171 |         return imageData;
172 |     }
173 |     
174 |     @SuppressWarnings("deprecation")
175 |     private static byte[] readBinaryPortableGraymap8bitData(
176 |         InputStream inputStream) throws IOException
177 |     {
178 |         DataInputStream dis = new DataInputStream(inputStream);
179 |         String line = null;
180 |         boolean firstLine = true;
181 |         Integer width = null;
182 |         Integer maxBrightness = null;
183 |         while (true)
184 |         {
185 |             // The DataInputStream#readLine is deprecated,
186 |             // but for ASCII input, it is safe to use it
187 |             line = dis.readLine();
188 |             if (line == null)
189 |             {
190 |                 break;
191 |             }
192 |             line = line.trim();
193 |             if (line.startsWith("#"))
194 |             {
195 |                 continue;
196 |             }
197 |             if (firstLine)
198 |             {
199 |                 firstLine = false;
200 |                 if (!line.equals("P5"))
201 |                 {
202 |                     throw new IOException(
203 |                         "Data is not a binary portable " + 
204 |                         "graymap (P5), but " + line);
205 |                 }
206 |                 else
207 |                 {
208 |                     continue;
209 |                 }
210 |             }
211 |             if (width == null)
212 |             {
213 |                 String tokens[] = line.split(" ");
214 |                 if (tokens.length < 2)
215 |                 {
216 |                     throw new IOException(
217 |                         "Expected dimensions, found " + line);
218 |                 }
219 |                 width = parseInt(tokens[0]);
220 |             }
221 |             else if (maxBrightness == null)
222 |             {
223 |                 maxBrightness = parseInt(line);
224 |                 if (maxBrightness > 255)
225 |                 {
226 |                     throw new IOException(
227 |                         "Only 8 bit values supported. " + 
228 |                         "Maximum value is " + maxBrightness);
229 |                 }
230 |                 break;
231 |             }
232 |         }
233 |         byte data[] = readFully(inputStream);
234 |         return data;
235 |     }
236 | 
237 |     private static Integer parseInt(String s) throws IOException
238 |     {
239 |         try
240 |         {
241 |             return Integer.parseInt(s);
242 |         }
243 |         catch (NumberFormatException e)
244 |         {
245 |             throw new IOException(e);
246 |         }
247 |     }
248 | 
249 |     static void printDeviceVector(int size, Pointer d, int dataType)
250 |     {
251 |         if (dataType == cudnnDataType.CUDNN_DATA_FLOAT)
252 |         {
253 |             printFloatDeviceVector(size, d);
254 |         }
255 |         else if (dataType == cudnnDataType.CUDNN_DATA_DOUBLE)
256 |         {
257 |             printDoubleDeviceVector(size, d);
258 |         }
259 |         else
260 |         {
261 |             throw new IllegalArgumentException(
262 |                 "Invalid data type: " + cudnnDataType.stringFor(dataType));
263 |         }
264 |     }
265 |     
266 |     private static void printFloatDeviceVector(int size, Pointer d)
267 |     {
268 |         float h[] = new float[size];
269 |         cudaDeviceSynchronize();
270 |         cudaMemcpy(Pointer.to(h), d, size * Sizeof.FLOAT,
271 |             cudaMemcpyDeviceToHost);
272 |         for (int i = 0; i < size; i++)
273 |         {
274 |             System.out.print(h[i] + " ");
275 |         }
276 |         System.out.println();
277 |     }
278 |     private static void printDoubleDeviceVector(int size, Pointer d)
279 |     {
280 |         double h[] = new double[size];
281 |         cudaDeviceSynchronize();
282 |         cudaMemcpy(Pointer.to(h), d, size * Sizeof.DOUBLE,
283 |             cudaMemcpyDeviceToHost);
284 |         for (int i = 0; i < size; i++)
285 |         {
286 |             System.out.print(h[i] + " ");
287 |         }
288 |         System.out.println();
289 |     }
290 | 
291 |     static int computeIndexOfMax(Pointer d, int length, int dataType)
292 |     {
293 |         if (dataType == cudnnDataType.CUDNN_DATA_FLOAT)
294 |         {
295 |             return computeIndexOfMaxFloat(d, length);
296 |         }
297 |         if (dataType == cudnnDataType.CUDNN_DATA_DOUBLE)
298 |         {
299 |             return computeIndexOfMaxDouble(d, length);
300 |         }
301 |         throw new IllegalArgumentException(
302 |             "Invalid data type: " + cudnnDataType.stringFor(dataType));
303 |     }
304 | 
305 |     private static int computeIndexOfMaxFloat(Pointer d, int length)
306 |     {
307 |         float result[] = new float[length];
308 |         cudaMemcpy(Pointer.to(result), d, 
309 |             length * Sizeof.FLOAT,
310 |             cudaMemcpyDeviceToHost);
311 |         int id = 0;
312 |         for (int i = 1; i < length; i++)
313 |         {
314 |             if (result[id] < result[i])
315 |                 id = i;
316 |         }
317 |         return id;
318 |     }
319 |     
320 |     private static int computeIndexOfMaxDouble(Pointer d, int length)
321 |     {
322 |         double result[] = new double[length];
323 |         cudaMemcpy(Pointer.to(result), d, 
324 |             length * Sizeof.DOUBLE,
325 |             cudaMemcpyDeviceToHost);
326 |         int id = 0;
327 |         for (int i = 1; i < length; i++)
328 |         {
329 |             if (result[id] < result[i])
330 |                 id = i;
331 |         }
332 |         return id;
333 |     }
334 |     
335 |     private static Pointer createDevicePointer(float data[])
336 |     {
337 |         int size = data.length * Sizeof.FLOAT;
338 |         Pointer deviceData = new Pointer();
339 |         cudaMalloc(deviceData, size);
340 |         cudaMemcpy(deviceData, Pointer.to(data), size, cudaMemcpyHostToDevice);
341 |         return deviceData;
342 |     }
343 |     
344 |     private static Pointer createDevicePointer(double data[])
345 |     {
346 |         int size = data.length * Sizeof.DOUBLE;
347 |         Pointer deviceData = new Pointer();
348 |         cudaMalloc(deviceData, size);
349 |         cudaMemcpy(deviceData, Pointer.to(data), size, cudaMemcpyHostToDevice);
350 |         return deviceData;
351 |     }
352 | 
353 | }
354 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/driver/samples/JCudaDriverStreamCallbacks.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA
  3 |  *
  4 |  * Copyright 2008-2017 Marco Hutter - http://www.jcuda.org
  5 |  */
  6 | package jcuda.driver.samples;
  7 | 
  8 | import static jcuda.driver.JCudaDriver.cuCtxCreate;
  9 | import static jcuda.driver.JCudaDriver.cuCtxSetCurrent;
 10 | import static jcuda.driver.JCudaDriver.cuDeviceGet;
 11 | import static jcuda.driver.JCudaDriver.cuInit;
 12 | import static jcuda.driver.JCudaDriver.cuLaunchKernel;
 13 | import static jcuda.driver.JCudaDriver.cuMemAlloc;
 14 | import static jcuda.driver.JCudaDriver.cuMemHostAlloc;
 15 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoHAsync;
 16 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoDAsync;
 17 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
 18 | import static jcuda.driver.JCudaDriver.cuModuleLoadData;
 19 | import static jcuda.driver.JCudaDriver.cuStreamAddCallback;
 20 | import static jcuda.driver.JCudaDriver.cuStreamCreate;
 21 | import static jcuda.nvrtc.JNvrtc.nvrtcCompileProgram;
 22 | import static jcuda.nvrtc.JNvrtc.nvrtcCreateProgram;
 23 | import static jcuda.nvrtc.JNvrtc.nvrtcDestroyProgram;
 24 | import static jcuda.nvrtc.JNvrtc.nvrtcGetPTX;
 25 | 
 26 | import java.nio.ByteBuffer;
 27 | import java.nio.ByteOrder;
 28 | import java.nio.IntBuffer;
 29 | import java.util.concurrent.CancellationException;
 30 | import java.util.concurrent.ExecutionException;
 31 | import java.util.concurrent.ExecutorService;
 32 | import java.util.concurrent.Future;
 33 | import java.util.concurrent.LinkedBlockingQueue;
 34 | import java.util.concurrent.ThreadPoolExecutor;
 35 | import java.util.concurrent.TimeUnit;
 36 | 
 37 | import jcuda.Pointer;
 38 | import jcuda.Sizeof;
 39 | import jcuda.driver.CUcontext;
 40 | import jcuda.driver.CUdevice;
 41 | import jcuda.driver.CUdeviceptr;
 42 | import jcuda.driver.CUfunction;
 43 | import jcuda.driver.CUmodule;
 44 | import jcuda.driver.CUstream;
 45 | import jcuda.driver.CUstreamCallback;
 46 | import jcuda.driver.JCudaDriver;
 47 | import jcuda.nvrtc.JNvrtc;
 48 | import jcuda.nvrtc.nvrtcProgram;
 49 | 
 50 | /**
 51 |  * An example showing stream callbacks involving multiple streams
 52 |  * and threads
 53 |  */
 54 | public class JCudaDriverStreamCallbacks
 55 | {
 56 |     /**
 57 |      * A kernel that increments all elements of an int array by 1
 58 |      */
 59 |     private static String programSourceCode = 
 60 |         "extern \"C\"" + "\n" +
 61 |         "__global__ void example(int n, int *data)" + "\n" +
 62 |         "{" + "\n" +
 63 |         "    int i = blockIdx.x * blockDim.x + threadIdx.x;" + "\n" +
 64 |         "    if (i<n)" + "\n" +
 65 |         "    {" + "\n" +
 66 |         "        data[i]++;" + "\n" +
 67 |         "    }" + "\n" +
 68 |         "}" + "\n";
 69 |     
 70 |     /**
 71 |      * The CUDA context
 72 |      */
 73 |     private static CUcontext context;
 74 |     
 75 |     /**
 76 |      * The kernel function with the dummy workload
 77 |      */
 78 |     private static CUfunction function;
 79 |     
 80 |     /**
 81 |      * The size of the array in the workload class
 82 |      */
 83 |     private static final int WORKLOAD_SIZE = 100000;
 84 |     
 85 |     /**
 86 |      * The workload that is processed here: Host and device data, and
 87 |      * the stream on which the workload is processed. 
 88 |      */
 89 |     private static class Workload
 90 |     {
 91 |         int index;
 92 |         CUstream stream;
 93 |         Pointer hostData;
 94 |         CUdeviceptr deviceData;
 95 |     }
 96 | 
 97 |     /**
 98 |      * Create a Workload instance. This method is called by multiple host
 99 |      * threads, to create the individual workloads, and to send the 
100 |      * commands for processing the workloads to CUDA
101 |      * 
102 |      * @param index The index of the workload 
103 |      * @param executor The executor service 
104 |      */
105 |     private static void createWorkloadOnHost(
106 |         final int index, final ExecutorService executor)
107 |     {
108 |         // Make sure that the CUDA context is current for the calling thread
109 |         cuCtxSetCurrent(context);
110 | 
111 |         // Initialize the workload, and create the CUDA stream
112 | 
113 |         System.out.println(index + ": Initializing workload");
114 |         final Workload workload = new Workload();
115 |         workload.index = index;
116 |         workload.stream = new CUstream();
117 |         cuStreamCreate(workload.stream, 0);
118 |         
119 |         
120 |         // Create the host data of the workload
121 |         
122 |         System.out.println(index + ": Create host data");
123 |         workload.hostData = new Pointer();
124 |         cuMemHostAlloc(workload.hostData, WORKLOAD_SIZE * Sizeof.INT, 0);
125 |         ByteBuffer hostByteBuffer =
126 |             workload.hostData.getByteBuffer(0, WORKLOAD_SIZE * Sizeof.INT);
127 |         IntBuffer hostIntBuffer = 
128 |             hostByteBuffer.order(ByteOrder.nativeOrder()).asIntBuffer();
129 |         for (int i = 0; i < WORKLOAD_SIZE; i++)
130 |         {
131 |             hostIntBuffer.put(i, i);
132 |         }
133 |         workload.deviceData = new CUdeviceptr();
134 |         cuMemAlloc(workload.deviceData, WORKLOAD_SIZE * Sizeof.INT);
135 | 
136 |         
137 |         // Execute the CUDA commands:
138 |         // - Copy the host data to the device
139 |         // - Execute the kernel
140 |         // - Copy the modified device data back to the host
141 |         // All this is done asynchronously
142 | 
143 |         System.out.println(index + ": Execute CUDA commands");
144 | 
145 |         cuMemcpyHtoDAsync(workload.deviceData, workload.hostData,
146 |             WORKLOAD_SIZE * Sizeof.INT, workload.stream);
147 | 
148 |         Pointer kernelParameters = Pointer.to(
149 |             Pointer.to(new int[]{WORKLOAD_SIZE}),
150 |             Pointer.to(workload.deviceData)
151 |         );
152 |         int blockSizeX = 256;
153 |         int gridSizeX = (WORKLOAD_SIZE + blockSizeX - 1) / blockSizeX;
154 |         cuLaunchKernel(function, gridSizeX,  1, 1, blockSizeX, 1, 1,
155 |             0, workload.stream, kernelParameters, null);
156 |         
157 |         cuMemcpyDtoHAsync(workload.hostData, workload.deviceData,
158 |             WORKLOAD_SIZE * Sizeof.INT, workload.stream);
159 |         
160 |         
161 |         // Define the callback that will be called when all CUDA commands
162 |         // on the stream have finished. This callback will forward the
163 |         // workload to the "finishWorkloadOnHost" method.
164 |         CUstreamCallback callback = new CUstreamCallback()
165 |         {
166 |             @Override
167 |             public void call(
168 |                 CUstream hStream, int status, final Object userData)
169 |             {
170 |                 System.out.println(index + ": Callback was called");
171 |                 Runnable runnable = new Runnable()
172 |                 {
173 |                     @Override
174 |                     public void run()
175 |                     {
176 |                         finishWorkloadOnHost(userData);
177 |                     }
178 |                 };
179 |                 executor.submit(runnable);
180 |             }
181 |         };
182 |         cuStreamAddCallback(workload.stream, callback, workload, 0);
183 |     }
184 |     
185 |     
186 |     /**
187 |      * A method that will be called by a stream callback, and receive the
188 |      * workload for which the CUDA commands have been finished
189 |      * 
190 |      * @param workloadObject The workload object
191 |      */
192 |     private static void finishWorkloadOnHost(Object workloadObject)
193 |     {
194 |         Workload workload = (Workload)workloadObject;
195 |         int index = workload.index;
196 |         
197 |         // Finish the task, by comparing the host data with the expected values
198 |         
199 |         System.out.println(index + ": Finishing");
200 |         
201 |         boolean passed = true;
202 |         ByteBuffer hostByteBuffer =
203 |             workload.hostData.getByteBuffer(0, WORKLOAD_SIZE * Sizeof.INT);
204 |         IntBuffer hostIntBuffer = 
205 |             hostByteBuffer.order(ByteOrder.nativeOrder()).asIntBuffer();
206 |         for (int i = 0; i < WORKLOAD_SIZE; i++)
207 |         {
208 |             passed &= (hostIntBuffer.get(i) == (i + 1));
209 |         }
210 | 
211 |         System.out.println(index + ": " + (passed ? "PASSED" : "FAILED"));
212 |     }
213 |     
214 |     /**
215 |      * Entry point of this sample
216 |      * 
217 |      * @param args Not used
218 |      */
219 |     public static void main(String[] args)
220 |     {
221 |         initialize();
222 |         
223 |         final ExecutorService executor = createExecutorService(4);
224 | 
225 |         // Create tasks to create Workload objects, and pass them to
226 |         // the executor service. Each task will initialize its 
227 |         // workload on the host and pass the workload to CUDA.
228 |         // When the workload is done, the tasks to finish the
229 |         // workloads are created and passed to the executor service
230 |         int numWorkloads = 8;
231 |         for (int n=0; n<numWorkloads; n++)
232 |         {
233 |             final int index = n;
234 |             Runnable runnable = new Runnable()
235 |             {
236 |                 @Override
237 |                 public void run()
238 |                 {
239 |                     createWorkloadOnHost(index, executor);
240 |                 }
241 |             };
242 |             executor.submit(runnable);
243 |         }
244 |      
245 |         // Shut down the executor service
246 |         try
247 |         {
248 |             executor.awaitTermination(10,  TimeUnit.SECONDS);
249 |             executor.shutdown();
250 |         }
251 |         catch (InterruptedException e)
252 |         {
253 |             e.printStackTrace();
254 |         }
255 |         System.out.println("Done");
256 |         
257 |     }
258 |     
259 | 
260 |     /**
261 |      * Initialize the driver API, the {@link #context} and the 
262 |      * kernel {@link #function} 
263 |      */
264 |     private static void initialize()
265 |     {
266 |         System.out.println("Initializing...");
267 |         
268 |         JCudaDriver.setExceptionsEnabled(true);
269 |         JNvrtc.setExceptionsEnabled(true);
270 | 
271 |         cuInit(0);
272 |         CUdevice device = new CUdevice();
273 |         cuDeviceGet(device, 0);
274 |         context = new CUcontext();
275 |         cuCtxCreate(context, 0, device);
276 | 
277 |         nvrtcProgram program = new nvrtcProgram();
278 |         nvrtcCreateProgram(
279 |             program, programSourceCode, null, 0, null, null);
280 |         nvrtcCompileProgram(program, 0, null);
281 |         
282 |         String[] ptx = new String[1];
283 |         nvrtcGetPTX(program, ptx);
284 |         nvrtcDestroyProgram(program);
285 | 
286 |         CUmodule module = new CUmodule();
287 |         cuModuleLoadData(module, ptx[0]);
288 | 
289 |         function = new CUfunction();
290 |         cuModuleGetFunction(function, module, "example");
291 |         
292 |         System.out.println("Initializing DONE");
293 |     }
294 |     
295 |     /**
296 |      * Create an executor service with the given fixed pool size, whose
297 |      * core threads time out after a short time, and which re-throws
298 |      * all exceptions that happen in the tasks that it processes.
299 |      * 
300 |      * @param poolSize The pool size
301 |      * @return The executor service
302 |      */
303 |     private static ExecutorService createExecutorService(int poolSize)
304 |     {
305 |         ThreadPoolExecutor e = 
306 |             new ThreadPoolExecutor(poolSize, poolSize,
307 |                 5, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>())
308 |         {
309 |             @Override
310 |             protected void afterExecute(Runnable r, Throwable t)
311 |             {
312 |                 super.afterExecute(r, t);
313 |                 if (t == null && r instanceof Future<?>)
314 |                 {
315 |                     try
316 |                     {
317 |                         Future<?> future = (Future<?>) r;
318 |                         if (future.isDone())
319 |                         {
320 |                             future.get();
321 |                         }
322 |                     }
323 |                     catch (CancellationException ce)
324 |                     {
325 |                         t = ce;
326 |                     }
327 |                     catch (ExecutionException ee)
328 |                     {
329 |                         t = ee.getCause();
330 |                     }
331 |                     catch (InterruptedException ie)
332 |                     {
333 |                         Thread.currentThread().interrupt();
334 |                     }
335 |                 }
336 |                 if (t != null)
337 |                 {
338 |                     throw new RuntimeException(t);
339 |                 }
340 |             }
341 |         };
342 |         e.allowCoreThreadTimeOut(true);
343 |         return e;
344 |     }
345 |     
346 | }
347 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/samples/utils/JCudaSamplesUtils.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA
  3 |  *
  4 |  * Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
  5 |  */
  6 | package jcuda.samples.utils;
  7 | 
  8 | import static jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR;
  9 | import static jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR;
 10 | import static jcuda.driver.JCudaDriver.cuCtxGetDevice;
 11 | import static jcuda.driver.JCudaDriver.cuDeviceGetAttribute;
 12 | 
 13 | import java.io.ByteArrayOutputStream;
 14 | import java.io.File;
 15 | import java.io.IOException;
 16 | import java.io.InputStream;
 17 | import java.util.Locale;
 18 | import java.util.Random;
 19 | import java.util.logging.Logger;
 20 | 
 21 | import jcuda.CudaException;
 22 | import jcuda.driver.CUdevice;
 23 | import jcuda.driver.CUresult;
 24 | 
 25 | /**
 26 |  * Utility methods that are used in the JCuda samples.<br>
 27 |  * <br>
 28 |  * NOTE: This class is not part of a public API. It is only intended for
 29 |  * the use in the samples. Parts of its functionality could be replaced 
 30 |  * with the runtime compilation features that have been added in CUDA 7.5. 
 31 |  */
 32 | public class JCudaSamplesUtils
 33 | {
 34 |     /**
 35 |      * The logger used in this class
 36 |      */
 37 |     private static final Logger logger =
 38 |         Logger.getLogger(JCudaSamplesUtils.class.getName());
 39 | 
 40 |     /**
 41 |      * Compiles the given CUDA file into a PTX file using NVCC, and returns
 42 |      * the name of the resulting PTX file
 43 |      * 
 44 |      * @param cuFileName The CUDA file name
 45 |      * @return The PTX file name
 46 |      * @throws CudaException If an error occurs - i.e. when the input file
 47 |      * does not exist, or the NVCC call caused an error.
 48 |      */
 49 |     public static String preparePtxFile(String cuFileName)
 50 |     {
 51 |         return invokeNvcc(cuFileName, "ptx", true);
 52 |     }
 53 | 
 54 |     /**
 55 |      * Compiles the given CUDA file into a CUBIN file using NVCC, and returns
 56 |      * the name of the resulting CUBIN file. By default, the NVCC will be
 57 |      * invoked with the <code>-dlink</code> parameter, and an 
 58 |      * <code>-arch</code> parameter for the compute capability of the
 59 |      * device of the current context.<br>
 60 |      * <br>
 61 |      * Note that there must be a current context when this function 
 62 |      * is called!
 63 |      * 
 64 |      * @param cuFileName The CUDA file name
 65 |      * @return The PTX file name
 66 |      * @throws CudaException If an error occurs - i.e. when the input file
 67 |      * does not exist, or the NVCC call caused an error.
 68 |      * @throws CudaException If there is no current context
 69 |      */
 70 |     public static String prepareDefaultCubinFile(String cuFileName)
 71 |     {
 72 |         int computeCapability = computeComputeCapability();
 73 |         String nvccArguments[] = new String[] {
 74 |             "-dlink",
 75 |             "-arch=sm_"+computeCapability
 76 |         };
 77 |         return invokeNvcc(cuFileName, "cubin", true, nvccArguments);
 78 |     }
 79 | 
 80 |     /**
 81 |      * Tries to create a PTX or CUBIN file for the given CUDA file. <br>
 82 |      * <br>
 83 |      * The extension of the given file name is replaced with 
 84 |      * <code>"cubin"</code> or <code>"ptx"</code>, depending on the 
 85 |      * <code>targetFileType</code>.<br>
 86 |      * <br>
 87 |      * If the file with the resulting name does not exist yet, or if 
 88 |      * <code>forceRebuild</code> is <code>true</code>, then it is compiled 
 89 |      * from the given file using NVCC, using the given parameters.<br>
 90 |      * <br>
 91 |      * The name of the resulting output file is returned.
 92 |      *
 93 |      * @param cuFileName The name of the .CU file
 94 |      * @param targetFileType The target file type. Must be <code>"cubin"</code>
 95 |      * or <code>"ptx"</code> (case-insensitively)
 96 |      * @param forceRebuild Whether the PTX file should be created even if
 97 |      * it already exists
 98 |      * @return The name of the PTX file
 99 |      * @throws CudaException If an error occurs - i.e. when the input file
100 |      * does not exist, or the NVCC call caused an error.
101 |      * @throws IllegalArgumentException If the target file type is not valid
102 |      */
103 |     private static String invokeNvcc(
104 |         String cuFileName, String targetFileType, 
105 |         boolean forceRebuild, String ... nvccArguments)
106 |     {
107 |         if (!"cubin".equalsIgnoreCase(targetFileType) &&
108 |             !"ptx".equalsIgnoreCase(targetFileType))
109 |         {
110 |             throw new IllegalArgumentException(
111 |                 "Target file type must be \"ptx\" or \"cubin\", but is " + 
112 |                     targetFileType);
113 |         }
114 |         logger.info("Creating " + targetFileType + " file for " + cuFileName);
115 | 
116 |         int dotIndex = cuFileName.lastIndexOf('.');
117 |         if (dotIndex == -1)
118 |         {
119 |             dotIndex = cuFileName.length();
120 |         }
121 |         String otuputFileName = cuFileName.substring(0, dotIndex) + 
122 |             "." + targetFileType.toLowerCase();
123 |         File ptxFile = new File(otuputFileName);
124 |         if (ptxFile.exists() && !forceRebuild)
125 |         {
126 |             return otuputFileName;
127 |         }
128 | 
129 |         File cuFile = new File(cuFileName);
130 |         if (!cuFile.exists())
131 |         {
132 |             throw new CudaException("Input file not found: " + cuFileName + 
133 |                 " (" + cuFile.getAbsolutePath() + ")");
134 |         }
135 |         String modelString = "-m" + System.getProperty("sun.arch.data.model");
136 |         String command = "nvcc ";
137 |         command += modelString + " ";
138 |         command += "-" + targetFileType + " ";
139 |         for (String a : nvccArguments)
140 |         {
141 |             command += a + " ";
142 |         }
143 |         command += cuFileName + " -o " + otuputFileName;
144 | 
145 |         logger.info("Executing\n" + command);
146 |         try
147 |         {
148 |             Process process = Runtime.getRuntime().exec(command);
149 | 
150 |             String errorMessage = 
151 |                 new String(toByteArray(process.getErrorStream()));
152 |             String outputMessage =
153 |                 new String(toByteArray(process.getInputStream()));
154 |             int exitValue = 0;
155 |             try
156 |             {
157 |                 exitValue = process.waitFor();
158 |             }
159 |             catch (InterruptedException e)
160 |             {
161 |                 Thread.currentThread().interrupt();
162 |                 throw new CudaException(
163 |                     "Interrupted while waiting for nvcc output", e);
164 |             }
165 |             if (exitValue != 0)
166 |             {
167 |                 logger.severe("nvcc process exitValue " + exitValue);
168 |                 logger.severe("errorMessage:\n" + errorMessage);
169 |                 logger.severe("outputMessage:\n" + outputMessage);
170 |                 throw new CudaException("Could not create " + targetFileType + 
171 |                     " file: " + errorMessage);
172 |             }
173 |         }
174 |         catch (IOException e)
175 |         {
176 |             throw new CudaException("Could not create " + targetFileType + 
177 |                 " file", e);
178 |         }
179 | 
180 |         logger.info("Finished creating " + targetFileType + " file");
181 |         return otuputFileName;
182 |     }
183 | 
184 |     /**
185 |      * Fully reads the given InputStream and returns it as a byte array
186 |      *
187 |      * @param inputStream The input stream to read
188 |      * @return The byte array containing the data from the input stream
189 |      * @throws IOException If an I/O error occurs
190 |      */
191 |     private static byte[] toByteArray(InputStream inputStream)
192 |         throws IOException
193 |     {
194 |         ByteArrayOutputStream baos = new ByteArrayOutputStream();
195 |         byte buffer[] = new byte[8192];
196 |         while (true)
197 |         {
198 |             int read = inputStream.read(buffer);
199 |             if (read == -1)
200 |             {
201 |                 break;
202 |             }
203 |             baos.write(buffer, 0, read);
204 |         }
205 |         return baos.toByteArray();
206 |     }
207 | 
208 |     /**
209 |      * Compute the compute capability of the device device of the current
210 |      * context. The compute capability will be returned as an int value 
211 |      * <code>major * 10 + minor</code>. For example, the return value
212 |      * will be <code>52</code> for a device with compute capability 5.2.
213 |      * 
214 |      * @return The compute capability of the current device
215 |      * @throws CudaException If there is no current context
216 |      */
217 |     private static int computeComputeCapability()
218 |     {
219 |         CUdevice device = new CUdevice();
220 |         int status = cuCtxGetDevice(device);
221 |         if (status != CUresult.CUDA_SUCCESS)
222 |         {
223 |             throw new CudaException(CUresult.stringFor(status));
224 |         }
225 |         return computeComputeCapability(device);
226 |     }
227 | 
228 | 
229 |     /**
230 |      * Compute the compute capability of the given device. The compute 
231 |      * capability will be returned as an int value 
232 |      * <code>major * 10 + minor</code>. For example, the return value
233 |      * will be <code>52</code> for a device with compute capability 5.2.
234 |      * 
235 |      * @param device The device
236 |      * @return The compute capability
237 |      */
238 |     private static int computeComputeCapability(CUdevice device)
239 |     {
240 |         int majorArray[] = { 0 };
241 |         int minorArray[] = { 0 };
242 |         cuDeviceGetAttribute(majorArray,
243 |             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
244 |         cuDeviceGetAttribute(minorArray,
245 |             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
246 |         int major = majorArray[0];
247 |         int minor = minorArray[0];
248 |         return major * 10 + minor;
249 |     }
250 | 
251 | 
252 |     /**
253 |      * Creates an array of the specified size, containing float values from
254 |      * the range [0.0f, 1.0f)
255 |      * 
256 |      * @param n The size of the array
257 |      * @return The array of random values
258 |      */
259 |     public static float[] createRandomFloatData(int n)
260 |     {
261 |         Random random = new Random(0);
262 |         float a[] = new float[n];
263 |         for (int i = 0; i < n; i++)
264 |         {
265 |             a[i] = random.nextFloat();
266 |         }
267 |         return a;
268 |     }
269 | 
270 |     /**
271 |      * Compares the given result against a reference, and returns whether the
272 |      * error norm is below a small epsilon threshold
273 |      * 
274 |      * @param result The result
275 |      * @param reference The reference
276 |      * @return Whether the arrays are equal based on the error norm
277 |      * @throws NullPointerException If any argument is <code>null</code>
278 |      * @throws IllegalArgumentException If the arrays have different lengths
279 |      */
280 |     public static boolean equalByNorm(float result[], float reference[])
281 |     {
282 |         if (result == null)
283 |         {
284 |             throw new NullPointerException("The result is null");
285 |         }
286 |         if (reference == null)
287 |         {
288 |             throw new NullPointerException("The reference is null");
289 |         }
290 |         if (result.length != reference.length)
291 |         {
292 |             throw new IllegalArgumentException(
293 |                 "The result and reference array have different lengths: " + 
294 |                     result.length + " and " + reference.length);
295 |         }
296 |         final float epsilon = 1e-6f;
297 |         float errorNorm = 0;
298 |         float refNorm = 0;
299 |         for (int i = 0; i < result.length; ++i)
300 |         {
301 |             float diff = reference[i] - result[i];
302 |             errorNorm += diff * diff;
303 |             refNorm += reference[i] * result[i];
304 |         }
305 |         errorNorm = (float) Math.sqrt(errorNorm);
306 |         refNorm = (float) Math.sqrt(refNorm);
307 |         if (Math.abs(refNorm) < epsilon)
308 |         {
309 |             return false;
310 |         }
311 |         return (errorNorm / refNorm < epsilon);
312 |     }
313 |     
314 |     
315 |     /**
316 |      * Creates a string representation of the given array as a matrix with 
317 |      * with given number of columns.
318 |      * 
319 |      * @param a The array
320 |      * @param columns The number of columns
321 |      * @return The string representation
322 |      */
323 |     public static String toString2D(float[] a, int columns)
324 |     {
325 |         StringBuilder sb = new StringBuilder();
326 |         for (int i = 0; i < a.length; i++)
327 |         {
328 |             if ((i > 0) && (i % columns == 0))
329 |             {
330 |                 sb.append("\n");
331 |             }
332 |             sb.append(String.format(Locale.ENGLISH, "%7.4f ", a[i]));
333 |         }
334 |         return sb.toString();
335 |     }
336 |     
337 | 
338 | 
339 | }
340 | 


--------------------------------------------------------------------------------
/JCudaSamples/src/main/java/jcuda/driver/samples/JCudaReduction.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * JCuda - Java bindings for NVIDIA CUDA driver and runtime API
  3 |  * http://www.jcuda.org
  4 |  *
  5 |  * Copyright 2011-2018 Marco Hutter - http://www.jcuda.org
  6 |  */
  7 | package jcuda.driver.samples;
  8 | 
  9 | import static jcuda.driver.JCudaDriver.cuCtxCreate;
 10 | import static jcuda.driver.JCudaDriver.cuCtxDestroy;
 11 | import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
 12 | import static jcuda.driver.JCudaDriver.cuDeviceGet;
 13 | import static jcuda.driver.JCudaDriver.cuInit;
 14 | import static jcuda.driver.JCudaDriver.cuLaunchKernel;
 15 | import static jcuda.driver.JCudaDriver.cuMemAlloc;
 16 | import static jcuda.driver.JCudaDriver.cuMemFree;
 17 | import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
 18 | import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
 19 | import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
 20 | import static jcuda.driver.JCudaDriver.cuModuleLoad;
 21 | import static jcuda.driver.JCudaDriver.cuModuleUnload;
 22 | 
 23 | import java.util.Locale;
 24 | import java.util.Random;
 25 | 
 26 | import jcuda.Pointer;
 27 | import jcuda.Sizeof;
 28 | import jcuda.driver.CUcontext;
 29 | import jcuda.driver.CUdevice;
 30 | import jcuda.driver.CUdeviceptr;
 31 | import jcuda.driver.CUfunction;
 32 | import jcuda.driver.CUmodule;
 33 | import jcuda.driver.JCudaDriver;
 34 | import jcuda.samples.utils.JCudaSamplesUtils;
 35 | 
 36 | /**
 37 |  * Example of a reduction. It is based on the NVIDIA 'reduction' sample, 
 38 |  * and uses an adapted version of one of the kernels presented in 
 39 |  * this sample (see src/main/resources/kernels/JCudaReductionKernel.cu) 
 40 |  */
 41 | public class JCudaReduction
 42 | {
 43 |     /**
 44 |      * The CUDA context created by this sample
 45 |      */
 46 |     private static CUcontext context;
 47 |     
 48 |     /**
 49 |      * The module which is loaded in form of a PTX file
 50 |      */
 51 |     private static CUmodule module;
 52 |     
 53 |     /**
 54 |      * The actual kernel function from the module
 55 |      */
 56 |     private static CUfunction function;
 57 |     
 58 |     /**
 59 |      * Temporary memory for the device output
 60 |      */
 61 |     private static CUdeviceptr deviceBuffer;
 62 |     
 63 |     /**
 64 |      * Entry point of this sample
 65 |      *
 66 |      * @param args Not used
 67 |      */
 68 |     public static void main(String args[])
 69 |     {
 70 |         // Enable exceptions and omit all subsequent error checks
 71 |         JCudaDriver.setExceptionsEnabled(true);
 72 | 
 73 |         init();
 74 |         boolean passed = true;
 75 |         for (int n = 100000; n <= 26500000; n *= 2)
 76 |         {
 77 |             float hostInput[] = createRandomArray(n);
 78 | 
 79 |             long timeNs0 = 0;
 80 |             long timeNs1 = 0;
 81 | 
 82 |             // Copy the input data to the device
 83 |             timeNs0 = System.nanoTime();
 84 |             CUdeviceptr deviceInput = new CUdeviceptr();
 85 |             cuMemAlloc(deviceInput, hostInput.length * Sizeof.FLOAT);
 86 |             cuMemcpyHtoD(deviceInput, Pointer.to(hostInput), 
 87 |                 hostInput.length * Sizeof.FLOAT);
 88 |             timeNs1 = System.nanoTime();
 89 |             long durationCopyNs = timeNs1 - timeNs0;
 90 | 
 91 |             // Execute the reduction with CUDA
 92 |             timeNs0 = System.nanoTime();
 93 |             float resultJCuda = reduce(deviceInput, hostInput.length);
 94 |             timeNs1 = System.nanoTime();
 95 |             long durationCompNs = timeNs1 - timeNs0;
 96 | 
 97 |             cuMemFree(deviceInput);
 98 | 
 99 |             // Execute the reduction with Java
100 |             timeNs0 = System.nanoTime();
101 |             float resultJava = reduceHost(hostInput);
102 |             timeNs1 = System.nanoTime();
103 |             long durationJavaNs = timeNs1 - timeNs0;
104 | 
105 |             System.out.println("Reduction of " + n + " elements");
106 |             System.out.printf(Locale.ENGLISH,
107 |                 "  JCuda: %7.3f ms, result: %f " +
108 |                 "(copy: %7.3f ms, comp: %7.3f ms)\n",
109 |                 (durationCopyNs + durationCompNs) / 1e6, resultJCuda, 
110 |                 durationCopyNs / 1e6, durationCompNs / 1e6);
111 |             System.out.printf(Locale.ENGLISH,
112 |                 "  Java : %7.3f ms, result: %f\n", 
113 |                 durationJavaNs / 1e6, resultJava);
114 |             
115 |             passed &= 
116 |                 Math.abs(resultJCuda - resultJava) < resultJava * 1e-5;
117 |             
118 |         }
119 |         System.out.println("Test " + (passed ? "PASSED" : "FAILED"));
120 | 
121 |         shutdown();
122 |     }    
123 |     
124 |     
125 |     /**
126 |      * Implementation of a Kahan summation reduction in plain Java
127 |      * 
128 |      * @param input The input 
129 |      * @return The reduction result
130 |      */
131 |     private static float reduceHost(float data[])
132 |     {
133 |         float sum = data[0];
134 |         float c = 0.0f;              
135 |         for (int i = 1; i < data.length; i++)
136 |         {
137 |             float y = data[i] - c;  
138 |             float t = sum + y;      
139 |             c = (t - sum) - y;  
140 |             sum = t;            
141 |         }
142 |         return sum;
143 |     }
144 |     
145 |     
146 |     /**
147 |      * Initialize the context, module, function and other elements used 
148 |      * in this sample
149 |      */
150 |     private static void init()
151 |     {
152 |         // Initialize the driver API and create a context for the first device
153 |         cuInit(0);
154 |         CUdevice device = new CUdevice();
155 |         cuDeviceGet(device, 0);
156 |         context = new CUcontext();
157 |         cuCtxCreate(context, 0, device);
158 | 
159 |         // Create the PTX file by calling the NVCC
160 |         String ptxFileName = JCudaSamplesUtils.preparePtxFile(
161 |             "src/main/resources/kernels/JCudaReductionKernel.cu");
162 |         
163 |         // Load the module from the PTX file
164 |         module = new CUmodule();
165 |         cuModuleLoad(module, ptxFileName);
166 | 
167 |         // Obtain a function pointer to the "reduce" function.
168 |         function = new CUfunction();
169 |         cuModuleGetFunction(function, module, "reduce");
170 |         
171 |         // Allocate a chunk of temporary memory (must be at least
172 |         // numberOfBlocks * Sizeof.FLOAT)
173 |         deviceBuffer = new CUdeviceptr();
174 |         cuMemAlloc(deviceBuffer, 1024 * Sizeof.FLOAT);
175 |         
176 |     }
177 |     
178 |     /**
179 |      * Release all resources allocated by this class
180 |      */
181 |     private static void shutdown()
182 |     {
183 |         cuModuleUnload(module);
184 |         cuMemFree(deviceBuffer);
185 |         cuCtxDestroy(context);
186 |     }
187 |     
188 |     /**
189 |      * Performs a reduction on the given device memory with the given
190 |      * number of elements.
191 |      * 
192 |      * @param deviceInput The device input memory
193 |      * @param numElements The number of elements to reduce
194 |      * @return The reduction result
195 |      */
196 |     private static float reduce(
197 |         Pointer deviceInput, int numElements)
198 |     {
199 |         return reduce(deviceInput, numElements, 128, 64);
200 |     }
201 |     
202 |     
203 |     /**
204 |      * Performs a reduction on the given device memory with the given
205 |      * number of elements and the specified limits for threads and
206 |      * blocks.
207 |      * 
208 |      * @param deviceInput The device input memory
209 |      * @param numElements The number of elements to reduce
210 |      * @param maxThreads The maximum number of threads
211 |      * @param maxBlocks The maximum number of blocks
212 |      * @return The reduction result
213 |      */
214 |     private static float reduce(
215 |         Pointer deviceInput, int numElements, 
216 |         int maxThreads, int maxBlocks)
217 |     {
218 |         // Determine the number of threads and blocks for the input 
219 |         int numBlocks = getNumBlocks(numElements, maxBlocks, maxThreads);
220 |         int numThreads = getNumThreads(numElements, maxBlocks, maxThreads);
221 |         
222 |         // Call the main reduction method
223 |         float result = reduce(numElements, numThreads, numBlocks, 
224 |             maxThreads, maxBlocks, deviceInput);
225 |         return result;
226 |     }
227 |     
228 | 
229 |     
230 |     /**
231 |      * Performs a reduction on the given device memory.
232 |      * 
233 |      * @param n The number of elements for the reduction
234 |      * @param numThreads The number of threads
235 |      * @param numBlocks The number of blocks
236 |      * @param maxThreads The maximum number of threads
237 |      * @param maxBlocks The maximum number of blocks
238 |      * @param deviceInput The input memory
239 |      * @return The reduction result
240 |      */
241 |     private static float reduce(
242 |         int  n, int  numThreads, int  numBlocks,
243 |         int  maxThreads, int  maxBlocks, Pointer deviceInput)
244 |     {
245 |         // Perform a "tree like" reduction as in the NVIDIA sample
246 |         reduce(n, numThreads, numBlocks, deviceInput, deviceBuffer);
247 |         int s = numBlocks;
248 |         while(s > 1) 
249 |         {
250 |             int threads = getNumThreads(s, maxBlocks, maxThreads);
251 |             int blocks = getNumBlocks(s, maxBlocks, maxThreads);
252 | 
253 |             reduce(s, threads, blocks, deviceBuffer, deviceBuffer);
254 |             s = (s + (threads * 2 - 1)) / (threads * 2);
255 |         }
256 |         
257 |         float result[] = {0.0f};
258 |         cuMemcpyDtoH(Pointer.to(result), deviceBuffer, Sizeof.FLOAT);     
259 |         return result[0];
260 |     }
261 |     
262 |     
263 |     /**
264 |      * Perform a reduction of the specified number of elements in the given 
265 |      * device input memory, using the given number of threads and blocks, 
266 |      * and write the results into the given output memory. 
267 |      * 
268 |      * @param size The size (number of elements) 
269 |      * @param threads The number of threads
270 |      * @param blocks The number of blocks
271 |      * @param deviceInput The device input memory
272 |      * @param deviceOutput The device output memory. Its size must at least 
273 |      * be numBlocks*Sizeof.FLOAT
274 |      */
275 |     private static void reduce(int size, int threads, int blocks, 
276 |         Pointer deviceInput, Pointer deviceOutput)
277 |     {
278 |         // Compute the shared memory size (as done in 
279 |         // the NIVIDA sample)
280 |         int sharedMemSize = threads * Sizeof.FLOAT;
281 |         if (threads <= 32) 
282 |         {
283 |             sharedMemSize *= 2;
284 |         }
285 |         
286 |         // Set up the kernel parameters: A pointer to an array
287 |         // of pointers which point to the actual values.
288 |         Pointer kernelParameters = Pointer.to(
289 |             Pointer.to(deviceInput),
290 |             Pointer.to(deviceOutput),
291 |             Pointer.to(new int[]{size})
292 |         );
293 | 
294 |         // Call the kernel function.
295 |         cuLaunchKernel(function,
296 |             blocks,  1, 1,         // Grid dimension
297 |             threads, 1, 1,         // Block dimension
298 |             sharedMemSize, null,   // Shared memory size and stream
299 |             kernelParameters, null // Kernel- and extra parameters
300 |         );
301 |         cuCtxSynchronize();
302 |     }
303 |     
304 |     
305 |     /**
306 |      * Compute the number of blocks that should be used for the
307 |      * given input size and limits
308 |      * 
309 |      * @param n The input size
310 |      * @param maxBlocks The maximum number of blocks
311 |      * @param maxThreads The maximum number of threads
312 |      * @return The number of blocks
313 |      */
314 |     private static int getNumBlocks(int n, int maxBlocks, int maxThreads)
315 |     {
316 |         int blocks = 0;
317 |         int threads = getNumThreads(n, maxBlocks, maxThreads);
318 |         blocks = (n + (threads * 2 - 1)) / (threads * 2);
319 |         blocks = Math.min(maxBlocks, blocks);
320 |         return blocks;
321 |     }
322 | 
323 |     /**
324 |      * Compute the number of threads that should be used for the
325 |      * given input size and limits
326 |      * 
327 |      * @param n The input size
328 |      * @param maxBlocks The maximum number of blocks
329 |      * @param maxThreads The maximum number of threads
330 |      * @return The number of threads
331 |      */
332 |     private static int getNumThreads(int n, int maxBlocks, int maxThreads)
333 |     {
334 |         int threads = 0;
335 |         threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
336 |         return threads;
337 |     }
338 |     
339 |     /**
340 |      * Returns the power of 2 that is equal to or greater than x
341 |      * 
342 |      * @param x The input
343 |      * @return The next power of 2
344 |      */
345 |     private static int nextPow2(int x)
346 |     {
347 |         --x;
348 |         x |= x >> 1;
349 |         x |= x >> 2;
350 |         x |= x >> 4;
351 |         x |= x >> 8;
352 |         x |= x >> 16;
353 |         return ++x;
354 |     }
355 | 
356 |     
357 |     /**
358 |      * Create an array of the given size, with random data
359 |      * 
360 |      * @param size The array size
361 |      * @return The array
362 |      */
363 |     private static float[] createRandomArray(int size)
364 |     {
365 |         Random random = new Random(0);
366 |         float array[] = new float[size];
367 |         for(int i = 0; i < size; i++)
368 |         {
369 |             array[i] = random.nextFloat() * 0.01f;
370 |         }
371 |         return array;
372 |     }
373 | }
374 | 
375 | 
376 | 


--------------------------------------------------------------------------------