├── samplespath ├── runit.nvidia-smi ├── exercises ├── cuda │ ├── nn │ │ ├── runit │ │ ├── cpu │ │ │ ├── runit │ │ │ ├── libatlas.a │ │ │ ├── libcblas.a │ │ │ ├── setupData.sh │ │ │ └── Makefile │ │ ├── orig │ │ │ ├── runit │ │ │ └── Makefile │ │ ├── libatlas.a │ │ ├── libcblas.a │ │ ├── setupData.sh │ │ ├── README.md │ │ └── Makefile │ ├── make.common │ ├── hello_world │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── matmul_CPU │ │ ├── runit │ │ └── Makefile │ ├── reduction3 │ │ ├── runit │ │ └── Makefile │ ├── reduction4 │ │ ├── runit │ │ └── Makefile │ ├── simple_add │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── matmul_streams │ │ ├── runit │ │ └── Makefile │ ├── naive_transpose │ │ ├── runit │ │ ├── profile.sh │ │ └── Makefile │ ├── reduction_atomic │ │ ├── runit │ │ └── Makefile │ ├── reduction_naive │ │ ├── runit │ │ └── Makefile │ ├── reduction_thrust │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── simple_stencil │ │ ├── runit │ │ └── Makefile │ ├── smem_transpose │ │ ├── runit │ │ ├── profile.sh │ │ └── Makefile │ ├── simple_add_blocks │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── simple_add_threads │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── reduction_cub_block │ │ ├── runit │ │ └── Makefile │ ├── reduction_cub_device │ │ ├── runit │ │ └── Makefile │ ├── simple_stencil_smem │ │ ├── runit │ │ └── Makefile │ ├── svm_challenge │ │ ├── original │ │ │ ├── README.md │ │ │ ├── Makefile │ │ │ ├── processEmail.sh │ │ │ ├── processEmail.py │ │ │ └── headers.h │ │ ├── libatlas.a │ │ ├── libcblas.a │ │ ├── spamSample3.txt │ │ ├── spamSample2.txt │ │ ├── emailSample1.txt │ │ ├── spamSample1.txt │ │ ├── Makefile │ │ ├── processEmail.sh │ │ ├── processEmail.py │ │ ├── headers.h │ │ └── README.md │ ├── simple_add_blocks_threads │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── matmul_GPU_naive │ │ ├── profile.sh │ │ └── Makefile │ ├── matmul_GPU_shmem1 │ │ ├── profile.sh │ │ └── Makefile │ ├── matmul_GPU_shmem │ │ ├── 
profile.sh │ │ └── Makefile │ ├── thrust_sort │ │ ├── Makefile │ │ └── kernel.cu │ ├── matmul_CUBLAS │ │ └── Makefile │ └── debug.h └── openacc │ ├── 002-laplace2D-data │ ├── runit.acc │ ├── laplace_acc.job │ ├── runit.omp │ ├── laplace_omp.job │ ├── Makefile │ ├── Makefile_f90 │ ├── timer.h │ ├── laplace2d.f90 │ └── laplace2d.c │ └── 001-laplace2D-kernels │ ├── runit.acc │ ├── laplace_acc.job │ ├── runit.omp │ ├── laplace_omp.job │ ├── Makefile │ ├── Makefile_f90 │ ├── timer.h │ ├── laplace2d.f90 │ └── laplace2d.c ├── openaccscript ├── cudascript ├── exercise_solutions ├── cuda │ ├── nn │ │ ├── runit │ │ ├── libatlas.a │ │ ├── libcblas.a │ │ ├── setupData.sh │ │ └── Makefile │ ├── make.common │ ├── hello_world │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── reduction3 │ │ ├── runit │ │ └── Makefile │ ├── reduction4 │ │ ├── runit │ │ └── Makefile │ ├── simple_add │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── matmul_CUBLAS │ │ ├── runit │ │ ├── matmul_CUBLAS.timeline.k20X │ │ ├── profile.sh │ │ └── Makefile │ ├── matmul_streams │ │ ├── runit │ │ ├── matmul_streams.timeline.k20X │ │ ├── profile.sh │ │ └── Makefile │ ├── simple_stencil │ │ ├── runit │ │ └── Makefile │ ├── smem_transpose │ │ ├── runit │ │ ├── profile.sh │ │ ├── smem_transpose.no_conflict.analysis │ │ ├── smem_transpose.no_conflict.timeline │ │ ├── smem_transpose.bank_conflict.analysis │ │ ├── smem_transpose.bank_conflict.timeline │ │ ├── smem_transpose.no_conflict.analysis.k40 │ │ ├── smem_transpose.no_conflict.timeline.k40 │ │ ├── smem_transpose.bank_conflict.analysis.k40 │ │ ├── smem_transpose.bank_conflict.timeline.k40 │ │ ├── smem_transpose.no_conflict.analysis.c2050 │ │ ├── smem_transpose.no_conflict.timeline.c2050 │ │ ├── smem_transpose.bank_conflict.analysis.c2050 │ │ ├── smem_transpose.bank_conflict.timeline.c2050 │ │ └── Makefile │ ├── matmul_GPU_naive │ │ ├── runit │ │ ├── matmul_GPU_naive.analysis │ │ ├── matmul_GPU_naive.timeline │ │ ├── matmul_GPU_naive.analysis.k40 │ │ ├── 
matmul_GPU_naive.timeline.k40 │ │ ├── matmul_GPU_naive.analysis.c2050 │ │ ├── matmul_GPU_naive.timeline.c2050 │ │ ├── profile.sh │ │ └── Makefile │ ├── matmul_GPU_shmem │ │ ├── runit │ │ ├── matmul_GPU_shmem.analysis │ │ ├── matmul_GPU_shmem.timeline │ │ ├── matmul_GPU_shmem.analysis.k40 │ │ ├── matmul_GPU_shmem.timeline.k40 │ │ ├── matmul_GPU_shmem.analysis.c2050 │ │ ├── matmul_GPU_shmem.timeline.c2050 │ │ ├── profile.sh │ │ └── Makefile │ ├── naive_transpose │ │ ├── runit │ │ ├── naive_transpose.analysis │ │ ├── naive_transpose.timeline │ │ ├── naive_transpose.analysis.k40 │ │ ├── naive_transpose.timeline.k40 │ │ ├── naive_transpose.analysis.c2050 │ │ ├── naive_transpose.timeline.c2050 │ │ ├── profile.sh │ │ └── Makefile │ ├── reduction_atomic │ │ ├── runit │ │ └── Makefile │ ├── reduction_naive │ │ ├── runit │ │ └── Makefile │ ├── reduction_thrust │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── smem_transpose_opt │ │ ├── runit │ │ ├── profile.sh │ │ └── Makefile │ ├── matmul_GPU_shmem1 │ │ ├── runit │ │ ├── matmul_GPU_shmem1.analysis │ │ ├── matmul_GPU_shmem1.timeline │ │ ├── matmul_GPU_shmem1.analysis.c2050 │ │ ├── matmul_GPU_shmem1.analysis.k40 │ │ ├── matmul_GPU_shmem1.timeline.c2050 │ │ ├── matmul_GPU_shmem1.timeline.k40 │ │ ├── profile.sh │ │ └── Makefile │ ├── naive_transpose_cutlass │ │ ├── runit │ │ ├── main.cu │ │ ├── naive_transpose.analysis │ │ ├── naive_transpose.timeline │ │ ├── naive_transpose.analysis.k40 │ │ ├── naive_transpose.timeline.k40 │ │ ├── naive_transpose.analysis.c2050 │ │ ├── naive_transpose.timeline.c2050 │ │ ├── profile.sh │ │ └── Makefile │ ├── reduction_cub_block │ │ ├── runit │ │ └── Makefile │ ├── simple_add_blocks │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── simple_add_threads │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── simple_stencil_smem │ │ ├── runit │ │ └── Makefile │ ├── reduction_cub_device │ │ ├── runit │ │ └── Makefile │ ├── svm_challenge │ │ ├── original │ │ │ ├── README.md │ │ │ ├── Makefile 
│ │ │ ├── processEmail.sh │ │ │ ├── processEmail.py │ │ │ └── headers.h │ │ ├── libatlas.a │ │ ├── libcblas.a │ │ ├── spamSample3.txt │ │ ├── spamSample2.txt │ │ ├── emailSample1.txt │ │ ├── spamSample1.txt │ │ ├── Makefile │ │ ├── processEmail.sh │ │ ├── processEmail.py │ │ ├── headers.h │ │ └── README.md │ ├── simple_add_blocks_threads │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── thrust_sort │ │ ├── Makefile │ │ └── kernel.cu │ ├── matmul_CPU │ │ └── Makefile │ └── debug.h └── openacc │ ├── 002-laplace2D-data │ ├── runit.acc │ ├── laplace_acc.job │ ├── runit.omp │ ├── laplace_omp.job │ ├── Makefile │ ├── Makefile_f90 │ ├── timer.h │ ├── laplace2d.f90 │ └── laplace2d.c │ └── 001-laplace2D-kernels │ ├── laplace_acc.job │ ├── runit.acc │ ├── runit.omp │ ├── laplace_omp.job │ ├── Makefile │ ├── Makefile_f90 │ ├── timer.h │ ├── laplace2d.f90 │ └── laplace2d.c ├── runit.query ├── runit.bandwidth ├── runit.matmul ├── README.md ├── README.cluster └── batch_setup.sh /samplespath: -------------------------------------------------------------------------------- 1 | SAMPLESPATH=. 
2 | -------------------------------------------------------------------------------- /runit.nvidia-smi: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | nvidia-smi 6 | -------------------------------------------------------------------------------- /exercises/cuda/nn/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.nn 6 | -------------------------------------------------------------------------------- /openaccscript: -------------------------------------------------------------------------------- 1 | #PBS -j oe 2 | 3 | module load pgi 4 | cd $PBS_O_WORKDIR 5 | -------------------------------------------------------------------------------- /cudascript: -------------------------------------------------------------------------------- 1 | #PBS -j oe 2 | 3 | module load cuda/5.5.22 4 | cd $PBS_O_WORKDIR 5 | -------------------------------------------------------------------------------- /exercises/cuda/nn/cpu/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.nn 6 | -------------------------------------------------------------------------------- /exercises/cuda/nn/orig/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.nn 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/nn/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.nn 6 | -------------------------------------------------------------------------------- /exercises/cuda/make.common: -------------------------------------------------------------------------------- 1 | ARCH=-arch sm_30 2 | CUB_INCLUDE=../../../../cub-1.4.1 3 | 
-------------------------------------------------------------------------------- /exercises/cuda/hello_world/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.hello_world 6 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_CPU/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.matmul_CPU 6 | -------------------------------------------------------------------------------- /exercises/cuda/reduction3/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction3 6 | -------------------------------------------------------------------------------- /exercises/cuda/reduction4/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction4 6 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/make.common: -------------------------------------------------------------------------------- 1 | ARCH=-arch sm_30 2 | CUB_INCLUDE=../../../../cub-1.4.1 3 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/hello_world/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.hello_world 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction3/runit: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction3 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction4/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction4 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add 6 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_streams/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.matmul_streams 6 | -------------------------------------------------------------------------------- /exercises/cuda/naive_transpose/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.naive_transpose 6 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_atomic/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_atomic 6 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_naive/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_naive 6 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_thrust/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | 
./x.reduction_thrust 6 | -------------------------------------------------------------------------------- /exercises/cuda/simple_stencil/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_stencil 6 | -------------------------------------------------------------------------------- /exercises/cuda/smem_transpose/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.smem_transpose 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_CUBLAS/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.matmul_CUBLAS 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_streams/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.matmul_streams 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_stencil/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_stencil 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.smem_transpose 6 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_blocks/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add_blocks 6 | 
-------------------------------------------------------------------------------- /exercises/cuda/simple_add_threads/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add_threads 6 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/runit.acc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./laplace2d_acc 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.matmul_GPU_naive 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.matmul_GPU_shmem 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.naive_transpose 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_atomic/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_atomic 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_naive/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_naive 6 | 
-------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_thrust/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_thrust 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose_opt/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.smem_transpose 6 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_cub_block/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_cub_block 6 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_cub_device/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_cub_device 6 | -------------------------------------------------------------------------------- /exercises/cuda/simple_stencil_smem/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_stencil_smem 6 | -------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/runit.acc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./laplace2d_acc 6 | -------------------------------------------------------------------------------- /runit.query: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | source ./samplespath 6 | 7 | $SAMPLESPATH/deviceQuery 8 | 
-------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.matmul_GPU_shmem1 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.naive_transpose 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_cub_block/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_cub_block 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_blocks/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add_blocks 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_threads/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add_threads 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_stencil_smem/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_stencil_smem 6 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/runit.acc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./laplace2d_acc 6 | 
-------------------------------------------------------------------------------- /exercises/cuda/nn/libatlas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercises/cuda/nn/libatlas.a -------------------------------------------------------------------------------- /exercises/cuda/nn/libcblas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercises/cuda/nn/libcblas.a -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/original/README.md: -------------------------------------------------------------------------------- 1 | Original Files 2 | ============== 3 | 4 | Original source files 5 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/laplace_acc.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | ./laplace2d_acc 4 | 5 | -------------------------------------------------------------------------------- /runit.bandwidth: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | source ./samplespath 6 | 7 | $SAMPLESPATH/bandwidthTest 8 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_cub_device/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_cub_device 6 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_blocks_threads/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add_blocks_threads 6 | 
-------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/laplace_acc.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | ./laplace2d_acc 4 | 5 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/original/README.md: -------------------------------------------------------------------------------- 1 | Original Files 2 | ============== 3 | 4 | Original source files 5 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/laplace_acc.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | ./laplace2d_acc 4 | 5 | -------------------------------------------------------------------------------- /exercises/cuda/nn/cpu/libatlas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercises/cuda/nn/cpu/libatlas.a -------------------------------------------------------------------------------- /exercises/cuda/nn/cpu/libcblas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercises/cuda/nn/cpu/libcblas.a -------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/runit.omp: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | OMP_NUM_THREADS=1 ./laplace2d_omp 6 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/runit.omp: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 
#BATCHARGS 4 | 5 | OMP_NUM_THREADS=1 ./laplace2d_omp 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_blocks_threads/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add_blocks_threads 6 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/laplace_acc.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | ./laplace2d_acc 4 | 5 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/nn/libatlas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/nn/libatlas.a -------------------------------------------------------------------------------- /exercise_solutions/cuda/nn/libcblas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/nn/libcblas.a -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/runit.acc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | PGI_ACC_TIME=1 ./laplace2d_acc 6 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/runit.omp: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | OMP_NUM_THREADS=1 ./laplace2d_omp 6 | -------------------------------------------------------------------------------- 
/exercise_solutions/openacc/002-laplace2D-data/runit.omp: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | OMP_NUM_THREADS=1 ./laplace2d_omp 6 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/libatlas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercises/cuda/svm_challenge/libatlas.a -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/libcblas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercises/cuda/svm_challenge/libcblas.a -------------------------------------------------------------------------------- /runit.matmul: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | source ./samplespath 6 | 7 | $SAMPLESPATH/matrixMul 8 | $SAMPLESPATH/matrixMulCUBLAS 9 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/laplace_omp.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | setenv OMP_NUM_THREADS 6 4 | ./laplace2d_omp 5 | 6 | -------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/laplace_omp.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | setenv OMP_NUM_THREADS 6 4 | ./laplace2d_omp 5 | 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/libatlas.a: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/svm_challenge/libatlas.a -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/libcblas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/svm_challenge/libcblas.a -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/laplace_omp.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | setenv OMP_NUM_THREADS 6 4 | ./laplace2d_omp 5 | 6 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/laplace_omp.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | setenv OMP_NUM_THREADS 6 4 | ./laplace2d_omp 5 | 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/naive_transpose.analysis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose/naive_transpose.analysis -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/naive_transpose.timeline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose/naive_transpose.timeline -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_CUBLAS/matmul_CUBLAS.timeline.k20X: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_CUBLAS/matmul_CUBLAS.timeline.k20X -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.analysis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.analysis -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.timeline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.timeline -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.analysis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.analysis -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.timeline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.timeline -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/spamSample3.txt: -------------------------------------------------------------------------------- 1 | Hello.. 2 | 3 | My name is Wilson, from ICICI BANK HK. 
4 | I have a profitable/confidential deal worth over 48M Dollar to discuss with you. 5 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.analysis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.analysis -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.timeline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.timeline -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_streams/matmul_streams.timeline.k20X: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_streams/matmul_streams.timeline.k20X -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/naive_transpose.analysis.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose/naive_transpose.analysis.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/naive_transpose.timeline.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose/naive_transpose.timeline.k40 -------------------------------------------------------------------------------- 
/exercise_solutions/cuda/naive_transpose_cutlass/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace std; 3 | 4 | int main() { 5 | 6 | cout << "hello world" << endl; 7 | 8 | } /* end main */ 9 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/spamSample3.txt: -------------------------------------------------------------------------------- 1 | Hello.. 2 | 3 | My name is Wilson, from ICICI BANK HK. 4 | I have a profitable/confidential deal worth over 48M Dollar to discuss with you. 5 | -------------------------------------------------------------------------------- /exercises/cuda/smem_transpose/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=smem_transpose 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.analysis.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.analysis.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.timeline.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.timeline.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.analysis.k40: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.analysis.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.timeline.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.timeline.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/naive_transpose.analysis.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose/naive_transpose.analysis.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/naive_transpose.timeline.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose/naive_transpose.timeline.c2050 -------------------------------------------------------------------------------- /exercises/cuda/matmul_GPU_naive/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_GPU_naive 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_GPU_shmem1/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_GPU_shmem1 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | 
-------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.analysis.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.analysis.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.timeline.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.timeline.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.analysis.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.analysis.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.timeline.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.timeline.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.analysis.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.analysis.c2050 -------------------------------------------------------------------------------- 
/exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.analysis.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.analysis.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.timeline.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.timeline.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.timeline.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.timeline.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.analysis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.analysis -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.timeline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.timeline -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/profile.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=smem_transpose 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.analysis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.analysis -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.timeline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.timeline -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_GPU_naive 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_GPU_shmem1 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.analysis.k40: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.analysis.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.timeline.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.timeline.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.analysis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.analysis -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.timeline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.timeline -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose_opt/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=smem_transpose_opt 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.analysis.c2050: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.analysis.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.timeline.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.timeline.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.analysis.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.analysis.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.timeline.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.timeline.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.analysis.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.analysis.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.timeline.k40: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.timeline.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.analysis.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.analysis.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.timeline.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.timeline.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.analysis.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.analysis.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.timeline.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.timeline.c2050 -------------------------------------------------------------------------------- /exercises/cuda/matmul_GPU_shmem/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_GPU_shmem 4 | 5 | nvprof --output-profile 
$bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | #nvprof --metrics gld_efficiency,gst_efficiency,shared_efficiency,shared_replay_overhead -o $bin.metrics ./x.$bin 8 | -------------------------------------------------------------------------------- /exercises/cuda/naive_transpose/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=naive_transpose 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | #nvprof --metrics gld_efficiency,gst_efficiency,shared_efficiency,shared_replay_overhead -o $bin.metrics ./x.$bin 8 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_CUBLAS/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_CUBLAS 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | #nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | #nvprof --metrics gld_efficiency,gst_efficiency,shared_efficiency,shared_replay_overhead -o $bin.metrics ./x.$bin 8 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/spamSample2.txt: -------------------------------------------------------------------------------- 1 | Best Buy Viagra Generic Online 2 | 3 | Viagra 100mg x 60 Pills $125, Free Pills & Reorder Discount, Top Selling 100% Quality & Satisfaction guaranteed! 4 | 5 | We accept VISA, Master & E-Check Payments, 90000+ Satisfied Customers! 
6 | http://medphysitcstech.ru 7 | 8 | 9 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_GPU_shmem 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | #nvprof --metrics gld_efficiency,gst_efficiency,shared_efficiency,shared_replay_overhead -o $bin.metrics ./x.$bin 8 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_streams/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_streams 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | #nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | #nvprof --metrics gld_efficiency,gst_efficiency,shared_efficiency,shared_replay_overhead -o $bin.metrics ./x.$bin 8 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=naive_transpose 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | #nvprof --metrics gld_efficiency,gst_efficiency,shared_efficiency,shared_replay_overhead -o $bin.metrics ./x.$bin 8 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=naive_transpose 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | #nvprof --metrics gld_efficiency,gst_efficiency,shared_efficiency,shared_replay_overhead -o 
$bin.metrics ./x.$bin 8 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/spamSample2.txt: -------------------------------------------------------------------------------- 1 | Best Buy Viagra Generic Online 2 | 3 | Viagra 100mg x 60 Pills $125, Free Pills & Reorder Discount, Top Selling 100% Quality & Satisfaction guaranteed! 4 | 5 | We accept VISA, Master & E-Check Payments, 90000+ Satisfied Customers! 6 | http://medphysitcstech.ru 7 | 8 | 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | gpu-edu-workshops 2 | ================= 3 | 4 | Public repository for code that I use to teach hands-on NVIDIA GPU computing workshops. 5 | 6 | 7 | License 8 | ------- 9 | 10 | These examples are released under the Apache 2.0 open source license. Refer to LICENSE in this directory for full details. 11 | 12 | -------------------------------------------------------------------------------- /README.cluster: -------------------------------------------------------------------------------- 1 | There are batch scripts located in each subdirectory. 2 | 3 | Correct "cudascript" and "openaccscript" to add the proper batch arguments as 4 | well as any other "module load" requirements and anything else required before 5 | executing the executable. Something like the following: 6 | 7 | sed -i '/\#BATCHARGS/ r cudascript' runit.nvidia-smi 8 | 9 | or 10 | 11 | find /path/to/dir/ -type f -exec sed -i '/\#BATCHARGS/ r cudascript' {} \; 12 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/emailSample1.txt: -------------------------------------------------------------------------------- 1 | > Anyone knows how much it costs to host a web portal ? 2 | > 3 | Well, it depends on how many visitors you're expecting. 
4 | This can be anywhere from less than 10 bucks a month to a couple of $100. 5 | You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 6 | if youre running something big.. 7 | 8 | To unsubscribe yourself from this mailing list, send an email to: 9 | groupname-unsubscribe@egroups.com 10 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/emailSample1.txt: -------------------------------------------------------------------------------- 1 | > Anyone knows how much it costs to host a web portal ? 2 | > 3 | Well, it depends on how many visitors you're expecting. 4 | This can be anywhere from less than 10 bucks a month to a couple of $100. 5 | You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 6 | if youre running something big.. 7 | 8 | To unsubscribe yourself from this mailing list, send an email to: 9 | groupname-unsubscribe@egroups.com 10 | -------------------------------------------------------------------------------- /batch_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | sed -i '/#BATCHARGS/ r cudascript' runit.nvidia-smi 4 | sed -i '/#BATCHARGS/ r cudascript' runit.query 5 | sed -i '/#BATCHARGS/ r cudascript' runit.matmul 6 | sed -i '/#BATCHARGS/ r cudascript' runit.bandwidth 7 | 8 | find ./exercises/cuda/ -type f -exec sed -i '/\#BATCHARGS/ r cudascript' {} \; 9 | find ./exercise_solutions/cuda/ -type f -exec sed -i '/\#BATCHARGS/ r cudascript' {} \; 10 | 11 | find ./exercises/openacc/ -type f -exec sed -i '/\#BATCHARGS/ r openaccscript' {} \; 12 | find ./exercise_solutions/openacc/ -type f -exec sed -i '/\#BATCHARGS/ r openaccscript' {} \; 13 | -------------------------------------------------------------------------------- /exercises/cuda/nn/setupData.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | wget 
http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz 6 | wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz 7 | wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz 8 | wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz 9 | 10 | gunzip -f train-images-idx3-ubyte.gz 11 | gunzip -f train-labels-idx1-ubyte.gz 12 | gunzip -f t10k-images-idx3-ubyte.gz 13 | gunzip -f t10k-labels-idx1-ubyte.gz 14 | 15 | cc -o mnist mnist.c 16 | 17 | ./mnist -9 -l t10k-labels-idx1-ubyte -i t10k-images-idx3-ubyte > t10k-images.txt 2> t10k-labels.txt 18 | ./mnist -9 -l train-labels-idx1-ubyte -i train-images-idx3-ubyte > train-images.txt 2> train-labels.txt 19 | -------------------------------------------------------------------------------- /exercises/cuda/nn/cpu/setupData.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz 6 | wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz 7 | wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz 8 | wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz 9 | 10 | gunzip -f train-images-idx3-ubyte.gz 11 | gunzip -f train-labels-idx1-ubyte.gz 12 | gunzip -f t10k-images-idx3-ubyte.gz 13 | gunzip -f t10k-labels-idx1-ubyte.gz 14 | 15 | cc -o mnist mnist.c 16 | 17 | ./mnist -9 -l t10k-labels-idx1-ubyte -i t10k-images-idx3-ubyte > t10k-images.txt 2> t10k-labels.txt 18 | ./mnist -9 -l train-labels-idx1-ubyte -i train-images-idx3-ubyte > train-images.txt 2> train-labels.txt 19 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/nn/setupData.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz 6 | wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz 7 
| wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz 8 | wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz 9 | 10 | gunzip -f train-images-idx3-ubyte.gz 11 | gunzip -f train-labels-idx1-ubyte.gz 12 | gunzip -f t10k-images-idx3-ubyte.gz 13 | gunzip -f t10k-labels-idx1-ubyte.gz 14 | 15 | cc -o mnist mnist.c 16 | 17 | ./mnist -9 -l t10k-labels-idx1-ubyte -i t10k-images-idx3-ubyte > t10k-images.txt 2> t10k-labels.txt 18 | ./mnist -9 -l train-labels-idx1-ubyte -i train-images-idx3-ubyte > train-images.txt 2> train-labels.txt 19 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/spamSample1.txt: -------------------------------------------------------------------------------- 1 | Do You Want To Make $1000 Or More Per Week? 2 | 3 | 4 | 5 | If you are a motivated and qualified individual - I 6 | will personally demonstrate to you a system that will 7 | make you $1,000 per week or more! This is NOT mlm. 8 | 9 | 10 | 11 | Call our 24 hour pre-recorded number to get the 12 | details. 13 | 14 | 15 | 16 | 000-456-789 17 | 18 | 19 | 20 | I need people who want to make serious money. Make 21 | the call and get the facts. 22 | 23 | Invest 2 minutes in yourself now! 24 | 25 | 26 | 27 | 000-456-789 28 | 29 | 30 | 31 | Looking forward to your call and I will introduce you 32 | to people like yourself who 33 | are currently making $10,000 plus per week! 34 | 35 | 36 | 37 | 000-456-789 38 | 39 | 40 | 41 | 3484lJGv6-241lEaN9080lRmS6-271WxHo7524qiyT5-438rjUv5615hQcf0-662eiDB9057dMtVl72 42 | 43 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/spamSample1.txt: -------------------------------------------------------------------------------- 1 | Do You Want To Make $1000 Or More Per Week? 
2 | 3 | 4 | 5 | If you are a motivated and qualified individual - I 6 | will personally demonstrate to you a system that will 7 | make you $1,000 per week or more! This is NOT mlm. 8 | 9 | 10 | 11 | Call our 24 hour pre-recorded number to get the 12 | details. 13 | 14 | 15 | 16 | 000-456-789 17 | 18 | 19 | 20 | I need people who want to make serious money. Make 21 | the call and get the facts. 22 | 23 | Invest 2 minutes in yourself now! 24 | 25 | 26 | 27 | 000-456-789 28 | 29 | 30 | 31 | Looking forward to your call and I will introduce you 32 | to people like yourself who 33 | are currently making $10,000 plus per week! 34 | 35 | 36 | 37 | 000-456-789 38 | 39 | 40 | 41 | 3484lJGv6-241lEaN9080lRmS6-271WxHo7524qiyT5-438rjUv5615hQcf0-662eiDB9057dMtVl72 42 | 43 | -------------------------------------------------------------------------------- /exercises/cuda/thrust_sort/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.sort 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/thrust_sort/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.sort 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_CPU/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_CPU 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/hello_world/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | NVCC=nvcc 19 | NVOPTS=$(ARCH) -DDEBUG 20 | 21 | hello_world: kernel.o 22 | $(NVCC) $(NVOPTS) -o x.hello_world kernel.o 23 | 24 | kernel.o: kernel.cu 25 | $(NVCC) $(NVOPTS) -c kernel.cu 26 | 27 | clean: 28 | rm -rf kernel.o x.hello_world 29 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_CPU/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_CPU 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/reduction3/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction3 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/reduction4/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction4 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/hello_world/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | NVCC=nvcc 19 | NVOPTS=$(ARCH) -DDEBUG 20 | 21 | hello_world: kernel.o 22 | $(NVCC) $(NVOPTS) -o x.hello_world kernel.o 23 | 24 | kernel.o: kernel.cu 25 | $(NVCC) $(NVOPTS) -c kernel.cu 26 | 27 | clean: 28 | rm -rf kernel.o x.hello_world 29 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction3/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction3 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction4/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction4 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_atomic/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_atomic 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_naive/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_naive 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_thrust/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_thrust 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_blocks/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add_blocks 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_threads/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add_threads 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/simple_stencil/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_stencil 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_stencil/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_stencil 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_CUBLAS/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_CUBLAS 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_streams/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_streams 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_atomic/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_atomic 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_naive/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_naive 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_thrust/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_thrust 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_threads/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add_threads 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/naive_transpose/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.naive_transpose 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/simple_stencil_smem/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_stencil_smem 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/smem_transpose/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.smem_transpose 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_CUBLAS/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_CUBLAS 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_streams/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_streams 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_stencil_smem/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_stencil_smem 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.smem_transpose 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_blocks_threads/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add_blocks_threads 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.naive_transpose 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_blocks/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add_blocks 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_GPU_naive/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_GPU_naive 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_GPU_shmem/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_GPU_shmem 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2024 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.naive_transpose 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_blocks_threads/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add_blocks_threads 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose_opt/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.smem_transpose_opt 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_GPU_shmem1/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_GPU_shmem1 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_cub_block/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_cub_block 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu -I$(CUB_INCLUDE) 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_cub_device/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_cub_device 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu -I$(CUB_INCLUDE) 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_GPU_naive 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_GPU_shmem 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_GPU_shmem1 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_cub_block/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_cub_block 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu -I$(CUB_INCLUDE) 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_cub_device/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_cub_device 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu -I$(CUB_INCLUDE) 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/hello_world/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include <stdio.h> 18 | #include "../debug.h" 19 | 20 | __global__ void mykernel(){ 21 | printf("Hello world from device!\n"); 22 | } /* end kernel */ 23 | 24 | int main(void) 25 | { 26 | mykernel<<<1,1>>>(); 27 | checkKERNEL() 28 | printf("Hello World from Host\n"); 29 | return 0; 30 | } /* end main */ 31 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/hello_world/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <stdio.h> 18 | #include "../debug.h" 19 | 20 | __global__ void mykernel(){ 21 | printf("Hello world from device!\n"); 22 | } /* end kernel */ 23 | 24 | int main(void) 25 | { 26 | mykernel<<<1,1>>>(); 27 | checkKERNEL() 28 | printf("Hello World from Host\n"); 29 | return 0; 30 | } /* end main */ 31 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC = pgcc 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = -acc -ta=nvidia -Minfo=accel 20 | OMPFLAGS = -fast -mp -Minfo 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.c 27 | $(CC) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.c 30 | $(CC) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercises/cuda/nn/README.md: -------------------------------------------------------------------------------- 1 | Instructions 2 | ------------ 3 | 4 | To run the code which trains and then classifies a handwritten digit do the 5 | following steps. 6 | 7 | 1.) Grab the MNIST files from the Yann Lecun's website. 8 | 9 | > sh setupData.sh 10 | 11 | 2.) Build the code. Ensure that nvcc is in your path. 12 | 13 | > make 14 | 15 | 3.) Run the code. In this step the network will be trained on the 60,000 16 | images from MNIST then compared against 10,000 test images. 
17 | 18 | ./x.nn 19 | 20 | Learning rate lambda is 3.000e-01 21 | Batchsize is 50 22 | Number of iterations is 1 23 | Hidden Layer Size is 25 24 | Number of training examples 60000 25 | Number of features/pixels per example 784 26 | Number of test examples 10000 27 | | 28 | Total time for training is 1.277e+00 sec 29 | Total correct on training set is 48960 30 | Prediction rate of training set is 81.600 31 | Total correct on test set is 8214 32 | Prediction rate of test set is 82.140 33 | 34 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | CC = pgcc 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = -acc -ta=nvidia -Minfo=accel 20 | OMPFLAGS = -fast -mp -Minfo 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.c 27 | $(CC) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.c 30 | $(CC) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | CC = pgcc 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = -acc -ta=nvidia -Minfo=accel 20 | OMPFLAGS = -fast -mp -Minfo 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.c 27 | $(CC) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.c 30 | $(CC) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | CC = pgcc 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = FIXME# add OpenACC compiler options here 20 | OMPFLAGS = -fast -mp -Minfo 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.c 27 | $(CC) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.c 30 | $(CC) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/Makefile_f90: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | F90 = pgf90 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = -acc -ta=nvidia -Minfo=accel 20 | OMPFLAGS = -fast -mp -Minfo 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.f90 27 | $(F90) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.f90 30 | $(F90) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/Makefile_f90: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | F90 = pgf90 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = FIXME# add OpenACC compiler options here 20 | OMPFLAGS = -fast -mp -Minfo 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.f90 27 | $(F90) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.f90 30 | $(F90) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercises/cuda/nn/cpu/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc -Xcompiler=-O3 19 | LIBS=-L. -lcblas -latlas 20 | INC=-I. 
21 | 22 | include ../../make.common 23 | 24 | all: x.nn 25 | 26 | x.nn: main.o auxiliary.o 27 | $(NVCC) $(ARCH) -o x.nn main.o auxiliary.o $(LIBS) 28 | 29 | main.o: main.cu headers.h 30 | $(NVCC) $(ARCH) -c main.cu $(INC) 31 | 32 | auxiliary.o: auxiliary.cu headers.h 33 | $(NVCC) $(ARCH) -c auxiliary.cu $(INC) 34 | 35 | clean: 36 | rm -rf *.o 37 | rm -rf x.* 38 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/Makefile_f90: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | F90 = pgf90 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = -acc -ta=nvidia -Minfo=accel -Mpreprocess 20 | OMPFLAGS = -fast -mp -Minfo -Mpreprocess 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.f90 27 | $(F90) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.f90 30 | $(F90) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/Makefile_f90: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | F90 = pgf90 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = -acc -ta=nvidia -Minfo=accel -Mpreprocess 20 | OMPFLAGS = -fast -mp -Minfo -Mpreprocess 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.f90 27 | $(F90) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.f90 30 | $(F90) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercises/cuda/nn/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc 19 | NVOPTS=-Xcompiler=-O3 -lineinfo $(ARCH) -DDEBUG 20 | LIBS=-L. -lcblas -latlas -lcublas 21 | INC=-I. 
22 | 23 | include ../make.common 24 | 25 | all: x.nn 26 | 27 | x.nn: main.o auxiliary.o 28 | $(NVCC) $(NVOPTS) -o x.nn main.o auxiliary.o $(LIBS) 29 | 30 | main.o: main.cu headers.h 31 | $(NVCC) $(NVOPTS) -c main.cu $(INC) 32 | 33 | auxiliary.o: auxiliary.cu headers.h 34 | $(NVCC) $(NVOPTS) -c auxiliary.cu $(INC) 35 | 36 | clean: 37 | rm -rf *.o 38 | rm -rf x.* 39 | -------------------------------------------------------------------------------- /exercises/cuda/nn/orig/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc 19 | NVOPTS=-Xcompiler=-O3 -lineinfo $(ARCH) -DDEBUG 20 | LIBS=-L. -lcblas -latlas -lcublas 21 | INC=-I. 
22 | 23 | include ../make.common 24 | 25 | all: x.nn 26 | 27 | x.nn: main.o auxiliary.o 28 | $(NVCC) $(NVOPTS) -o x.nn main.o auxiliary.o $(LIBS) 29 | 30 | main.o: main.cu headers.h 31 | $(NVCC) $(NVOPTS) -c main.cu $(INC) 32 | 33 | auxiliary.o: auxiliary.cu headers.h 34 | $(NVCC) $(NVOPTS) -c auxiliary.cu $(INC) 35 | 36 | clean: 37 | rm -rf *.o 38 | rm -rf x.* 39 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/nn/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc 19 | NVOPTS=-Xcompiler=-O3 -lineinfo $(ARCH) -DDEBUG 20 | LIBS=-L. -lcblas -latlas -lcublas 21 | INC=-I. 
22 | 23 | include ../make.common 24 | 25 | all: x.nn 26 | 27 | x.nn: main.o auxiliary.o 28 | $(NVCC) $(NVOPTS) -o x.nn main.o auxiliary.o $(LIBS) 29 | 30 | main.o: main.cu headers.h 31 | $(NVCC) $(NVOPTS) -c main.cu $(INC) 32 | 33 | auxiliary.o: auxiliary.cu headers.h 34 | $(NVCC) $(NVOPTS) -c auxiliary.cu $(INC) 35 | 36 | clean: 37 | rm -rf *.o 38 | rm -rf x.* 39 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc 19 | LIBS=-L. -lcblas -latlas 20 | INC=-I. 
21 | 22 | include ../make.common 23 | 24 | all: x.train x.porterStemmer 25 | 26 | x.train: main.o auxiliary.o 27 | $(NVCC) $(ARCH) -o x.train main.o auxiliary.o $(LIBS) 28 | 29 | main.o: main.cu headers.h 30 | $(NVCC) $(ARCH) -c main.cu $(INC) 31 | 32 | auxiliary.o: auxiliary.cu headers.h 33 | $(NVCC) $(ARCH) -c auxiliary.cu $(INC) 34 | 35 | x.porterStemmer: porterStemmer.o 36 | $(CC) -o x.porterStemmer porterStemmer.o 37 | 38 | porterStemmer.o: porterStemmer.c 39 | $(CC) -c porterStemmer.c -o porterStemmer.o 40 | 41 | clean: 42 | rm -rf *.o 43 | rm -rf x.* 44 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/original/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc 19 | LIBS=-L. -lcblas -latlas 20 | INC=-I. 
21 | 22 | include ../make.common 23 | 24 | all: x.train x.porterStemmer 25 | 26 | x.train: main.o auxiliary.o 27 | $(NVCC) $(ARCH) -o x.train main.o auxiliary.o $(LIBS) 28 | 29 | main.o: main.cu headers.h 30 | $(NVCC) $(ARCH) -c main.cu $(INC) 31 | 32 | auxiliary.o: auxiliary.cu headers.h 33 | $(NVCC) $(ARCH) -c auxiliary.cu $(INC) 34 | 35 | x.porterStemmer: porterStemmer.o 36 | $(CC) -o x.porterStemmer porterStemmer.o 37 | 38 | porterStemmer.o: porterStemmer.c 39 | $(CC) -c porterStemmer.c -o porterStemmer.o 40 | 41 | clean: 42 | rm -rf *.o 43 | rm -rf x.* 44 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc -lineinfo 19 | LIBS=-L. -lcblas -latlas -lcublas 20 | INC=-I. 
21 | 22 | include ../make.common 23 | 24 | all: x.train x.porterStemmer 25 | 26 | x.train: main.o auxiliary.o 27 | $(NVCC) $(ARCH) -o x.train main.o auxiliary.o $(LIBS) 28 | 29 | main.o: main.cu headers.h kernels.h 30 | $(NVCC) $(ARCH) -c main.cu $(INC) 31 | 32 | auxiliary.o: auxiliary.cu headers.h kernels.h 33 | $(NVCC) $(ARCH) -c auxiliary.cu $(INC) 34 | 35 | x.porterStemmer: porterStemmer.o 36 | $(CC) -o x.porterStemmer porterStemmer.o 37 | 38 | porterStemmer.o: porterStemmer.c 39 | $(CC) -c porterStemmer.c -o porterStemmer.o 40 | 41 | clean: 42 | rm -rf *.o 43 | rm -rf x.* 44 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/original/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc 19 | LIBS=-L. -lcblas -latlas -lcublas 20 | INC=-I. 
21 | 22 | include ../make.common 23 | 24 | all: x.train x.porterStemmer 25 | 26 | x.train: main.o auxiliary.o 27 | $(NVCC) $(ARCH) -o x.train main.o auxiliary.o $(LIBS) 28 | 29 | main.o: main.cu headers.h kernels.h 30 | $(NVCC) $(ARCH) -c main.cu $(INC) 31 | 32 | auxiliary.o: auxiliary.cu headers.h kernels.h 33 | $(NVCC) $(ARCH) -c auxiliary.cu $(INC) 34 | 35 | x.porterStemmer: porterStemmer.o 36 | $(CC) -o x.porterStemmer porterStemmer.o 37 | 38 | porterStemmer.o: porterStemmer.c 39 | $(CC) -c porterStemmer.c -o porterStemmer.o 40 | 41 | clean: 42 | rm -rf *.o 43 | rm -rf x.* 44 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/processEmail.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2017 NVIDIA Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | if [ "$#" -ne 1 ]; then 20 | echo "illegal number of arguments" 21 | echo "Usage: $0 " 22 | exit 23 | fi 24 | 25 | if [ ! 
-f "$1" ]; then 26 | echo "$1 is not a file" 27 | exit 28 | fi 29 | 30 | inputEmail=$1 31 | 32 | #cat $inputEmail | awk '{print tolower($0)}' 33 | 34 | awk '{print tolower($0)}' $inputEmail | \ 35 | awk '{print gensub(/[[:digit:]]+/,"number","g")}' | \ 36 | awk '{print gensub(/(http|https)\:\/\/[[:graph:]]*/,"httpaddr","g")}' | \ 37 | awk '{print gensub(/[[:graph:]]+@[[:graph:]]+/,"emailaddr","g")}' | \ 38 | awk '{print gensub(/[$]+/,"dollar","g")}' | \ 39 | awk '{print gensub(/([^[:alnum:]|^[:blank:]])/,"","g")}' | \ 40 | awk 'NF > 0' > qwerty.txt 41 | 42 | ./x.porterStemmer qwerty.txt 43 | 44 | python processEmail.py > emailVector.txt 45 | 46 | rm -f qwerty.txt 47 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/processEmail.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2017 NVIDIA Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | if [ "$#" -ne 1 ]; then 20 | echo "illegal number of arguments" 21 | echo "Usage: $0 " 22 | exit 23 | fi 24 | 25 | if [ ! 
-f "$1" ]; then 26 | echo "$1 is not a file" 27 | exit 28 | fi 29 | 30 | inputEmail=$1 31 | 32 | #cat $inputEmail | awk '{print tolower($0)}' 33 | 34 | awk '{print tolower($0)}' $inputEmail | \ 35 | awk '{print gensub(/[[:digit:]]+/,"number","g")}' | \ 36 | awk '{print gensub(/(http|https)\:\/\/[[:graph:]]*/,"httpaddr","g")}' | \ 37 | awk '{print gensub(/[[:graph:]]+@[[:graph:]]+/,"emailaddr","g")}' | \ 38 | awk '{print gensub(/[$]+/,"dollar","g")}' | \ 39 | awk '{print gensub(/([^[:alnum:]|^[:blank:]])/,"","g")}' | \ 40 | awk 'NF > 0' > qwerty.txt 41 | 42 | ./x.porterStemmer qwerty.txt 43 | 44 | python processEmail.py > emailVector.txt 45 | 46 | rm -f qwerty.txt 47 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/original/processEmail.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2017 NVIDIA Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | if [ "$#" -ne 1 ]; then 20 | echo "illegal number of arguments" 21 | echo "Usage: $0 " 22 | exit 23 | fi 24 | 25 | if [ ! 
-f "$1" ]; then 26 | echo "$1 is not a file" 27 | exit 28 | fi 29 | 30 | inputEmail=$1 31 | 32 | #cat $inputEmail | awk '{print tolower($0)}' 33 | 34 | awk '{print tolower($0)}' $inputEmail | \ 35 | awk '{print gensub(/[[:digit:]]+/,"number","g")}' | \ 36 | awk '{print gensub(/(http|https)\:\/\/[[:graph:]]*/,"httpaddr","g")}' | \ 37 | awk '{print gensub(/[[:graph:]]+@[[:graph:]]+/,"emailaddr","g")}' | \ 38 | awk '{print gensub(/[$]+/,"dollar","g")}' | \ 39 | awk '{print gensub(/([^[:alnum:]|^[:blank:]])/,"","g")}' | \ 40 | awk 'NF > 0' > qwerty.txt 41 | 42 | ./x.porterStemmer qwerty.txt 43 | 44 | python processEmail.py > emailVector.txt 45 | 46 | rm -f qwerty.txt 47 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/original/processEmail.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2017 NVIDIA Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | if [ "$#" -ne 1 ]; then 20 | echo "illegal number of arguments" 21 | echo "Usage: $0 " 22 | exit 23 | fi 24 | 25 | if [ ! 
-f "$1" ]; then 26 | echo "$1 is not a file" 27 | exit 28 | fi 29 | 30 | inputEmail=$1 31 | 32 | #cat $inputEmail | awk '{print tolower($0)}' 33 | 34 | awk '{print tolower($0)}' $inputEmail | \ 35 | awk '{print gensub(/[[:digit:]]+/,"number","g")}' | \ 36 | awk '{print gensub(/(http|https)\:\/\/[[:graph:]]*/,"httpaddr","g")}' | \ 37 | awk '{print gensub(/[[:graph:]]+@[[:graph:]]+/,"emailaddr","g")}' | \ 38 | awk '{print gensub(/[$]+/,"dollar","g")}' | \ 39 | awk '{print gensub(/([^[:alnum:]|^[:blank:]])/,"","g")}' | \ 40 | awk 'NF > 0' > qwerty.txt 41 | 42 | ./x.porterStemmer qwerty.txt 43 | 44 | python processEmail.py > emailVector.txt 45 | 46 | rm -f qwerty.txt 47 | -------------------------------------------------------------------------------- /exercises/cuda/debug.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | /* CUDA debugging */ 18 | 19 | #ifdef DEBUG 20 | #define checkCUDA(F) if( (F) != cudaSuccess ) \ 21 | {printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \ 22 | __FILE__,__LINE__); exit(-1);} 23 | 24 | #define checkKERNEL() if( (cudaPeekAtLastError()) != cudaSuccess ) \ 25 | {printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \ 26 | __FILE__,__LINE__-1); exit(-1);} \ 27 | if( (cudaDeviceSynchronize()) != cudaSuccess ) \ 28 | {printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \ 29 | __FILE__,__LINE__); exit(-1);} 30 | 31 | #define checkCUBLAS(F) if( (F) != CUBLAS_STATUS_SUCCESS ) \ 32 | {printf("Error %d at %s:%d\n", F, \ 33 | __FILE__,__LINE__); exit(-1);} 34 | 35 | #else 36 | 37 | #define checkCUDA(F) (F) 38 | #define checkKERNEL() 39 | #define checkCUBLAS(F) (F) 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/debug.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | /* CUDA debugging */ 18 | 19 | #ifdef DEBUG 20 | #define checkCUDA(F) if( (F) != cudaSuccess ) \ 21 | {printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \ 22 | __FILE__,__LINE__); exit(-1);} 23 | 24 | #define checkKERNEL() if( (cudaPeekAtLastError()) != cudaSuccess ) \ 25 | {printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \ 26 | __FILE__,__LINE__-1); exit(-1);} \ 27 | if( (cudaDeviceSynchronize()) != cudaSuccess ) \ 28 | {printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \ 29 | __FILE__,__LINE__); exit(-1);} 30 | 31 | #define checkCUBLAS(F) if( (F) != CUBLAS_STATUS_SUCCESS ) \ 32 | {printf("Error %d at %s:%d\n", F, \ 33 | __FILE__,__LINE__); exit(-1);} 34 | 35 | #else 36 | 37 | #define checkCUDA(F) (F) 38 | #define checkKERNEL() 39 | #define checkCUBLAS(F) (F) 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/processEmail.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | # read in the vocab list 18 | 19 | f = open('vocab_formatted.txt') 20 | vocab = [line.strip() for line in f] 21 | f.close() 22 | 23 | # read in the email line by line 24 | 25 | email = [] 26 | 27 | f = open('qwerty.txt') 28 | [email.extend(line.strip().split()) for line in f] 29 | f.close 30 | 31 | #print email 32 | 33 | # check each word of the email against the vocab list and build 34 | # up an array of word indices. The index is the location of the word 35 | # in the vocab list 36 | 37 | wordIndices = [] 38 | 39 | for i in email: 40 | if i in vocab: 41 | wordIndices.append(vocab.index(i)) 42 | 43 | #print wordIndices 44 | 45 | # feature vector length is equal to length of vocabulary list 46 | 47 | vecLength = len(vocab) 48 | featureVector = [0] * vecLength 49 | 50 | for i in wordIndices: 51 | featureVector[i] = 1 52 | 53 | #print len(featureVector) 54 | #print sum(featureVector) 55 | #print featureVector 56 | for val in featureVector: 57 | print val 58 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/processEmail.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | # read in the vocab list 18 | 19 | f = open('vocab_formatted.txt') 20 | vocab = [line.strip() for line in f] 21 | f.close() 22 | 23 | # read in the email line by line 24 | 25 | email = [] 26 | 27 | f = open('qwerty.txt') 28 | [email.extend(line.strip().split()) for line in f] 29 | f.close 30 | 31 | #print email 32 | 33 | # check each word of the email against the vocab list and build 34 | # up an array of word indices. The index is the location of the word 35 | # in the vocab list 36 | 37 | wordIndices = [] 38 | 39 | for i in email: 40 | if i in vocab: 41 | wordIndices.append(vocab.index(i)) 42 | 43 | #print wordIndices 44 | 45 | # feature vector length is equal to length of vocabulary list 46 | 47 | vecLength = len(vocab) 48 | featureVector = [0] * vecLength 49 | 50 | for i in wordIndices: 51 | featureVector[i] = 1 52 | 53 | #print len(featureVector) 54 | #print sum(featureVector) 55 | #print featureVector 56 | for val in featureVector: 57 | print val 58 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/original/processEmail.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | # read in the vocab list 18 | 19 | f = open('vocab_formatted.txt') 20 | vocab = [line.strip() for line in f] 21 | f.close() 22 | 23 | # read in the email line by line 24 | 25 | email = [] 26 | 27 | f = open('qwerty.txt') 28 | [email.extend(line.strip().split()) for line in f] 29 | f.close 30 | 31 | #print email 32 | 33 | # check each word of the email against the vocab list and build 34 | # up an array of word indices. The index is the location of the word 35 | # in the vocab list 36 | 37 | wordIndices = [] 38 | 39 | for i in email: 40 | if i in vocab: 41 | wordIndices.append(vocab.index(i)) 42 | 43 | #print wordIndices 44 | 45 | # feature vector length is equal to length of vocabulary list 46 | 47 | vecLength = len(vocab) 48 | featureVector = [0] * vecLength 49 | 50 | for i in wordIndices: 51 | featureVector[i] = 1 52 | 53 | #print len(featureVector) 54 | #print sum(featureVector) 55 | #print featureVector 56 | for val in featureVector: 57 | print val 58 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/original/processEmail.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | # read in the vocab list 18 | 19 | f = open('vocab_formatted.txt') 20 | vocab = [line.strip() for line in f] 21 | f.close() 22 | 23 | # read in the email line by line 24 | 25 | email = [] 26 | 27 | f = open('qwerty.txt') 28 | [email.extend(line.strip().split()) for line in f] 29 | f.close 30 | 31 | #print email 32 | 33 | # check each word of the email against the vocab list and build 34 | # up an array of word indices. The index is the location of the word 35 | # in the vocab list 36 | 37 | wordIndices = [] 38 | 39 | for i in email: 40 | if i in vocab: 41 | wordIndices.append(vocab.index(i)) 42 | 43 | #print wordIndices 44 | 45 | # feature vector length is equal to length of vocabulary list 46 | 47 | vecLength = len(vocab) 48 | featureVector = [0] * vecLength 49 | 50 | for i in wordIndices: 51 | featureVector[i] = 1 52 | 53 | #print len(featureVector) 54 | #print sum(featureVector) 55 | #print featureVector 56 | for val in featureVector: 57 | print val 58 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef TIMER_H 18 | #define TIMER_H 19 | 20 | #include 21 | 22 | #ifdef WIN32 23 | #define WIN32_LEAN_AND_MEAN 24 | #include 25 | #else 26 | #include 27 | #endif 28 | 29 | #ifdef WIN32 30 | double PCFreq = 0.0; 31 | __int64 timerStart = 0; 32 | #else 33 | struct timeval timerStart; 34 | #endif 35 | 36 | void StartTimer() 37 | { 38 | #ifdef WIN32 39 | LARGE_INTEGER li; 40 | if(!QueryPerformanceFrequency(&li)) 41 | printf("QueryPerformanceFrequency failed!\n"); 42 | 43 | PCFreq = (double)li.QuadPart/1000.0; 44 | 45 | QueryPerformanceCounter(&li); 46 | timerStart = li.QuadPart; 47 | #else 48 | gettimeofday(&timerStart, NULL); 49 | #endif 50 | } 51 | 52 | // time elapsed in ms 53 | double GetTimer() 54 | { 55 | #ifdef WIN32 56 | LARGE_INTEGER li; 57 | QueryPerformanceCounter(&li); 58 | return (double)(li.QuadPart-timerStart)/PCFreq; 59 | #else 60 | struct timeval timerStop, timerElapsed; 61 | gettimeofday(&timerStop, NULL); 62 | timersub(&timerStop, &timerStart, &timerElapsed); 63 | return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0; 64 | #endif 65 | } 66 | 67 | #endif // TIMER_H 68 | -------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef TIMER_H 18 | #define TIMER_H 19 | 20 | #include 21 | 22 | #ifdef WIN32 23 | #define WIN32_LEAN_AND_MEAN 24 | #include 25 | #else 26 | #include 27 | #endif 28 | 29 | #ifdef WIN32 30 | double PCFreq = 0.0; 31 | __int64 timerStart = 0; 32 | #else 33 | struct timeval timerStart; 34 | #endif 35 | 36 | void StartTimer() 37 | { 38 | #ifdef WIN32 39 | LARGE_INTEGER li; 40 | if(!QueryPerformanceFrequency(&li)) 41 | printf("QueryPerformanceFrequency failed!\n"); 42 | 43 | PCFreq = (double)li.QuadPart/1000.0; 44 | 45 | QueryPerformanceCounter(&li); 46 | timerStart = li.QuadPart; 47 | #else 48 | gettimeofday(&timerStart, NULL); 49 | #endif 50 | } 51 | 52 | // time elapsed in ms 53 | double GetTimer() 54 | { 55 | #ifdef WIN32 56 | LARGE_INTEGER li; 57 | QueryPerformanceCounter(&li); 58 | return (double)(li.QuadPart-timerStart)/PCFreq; 59 | #else 60 | struct timeval timerStop, timerElapsed; 61 | gettimeofday(&timerStop, NULL); 62 | timersub(&timerStop, &timerStart, &timerElapsed); 63 | return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0; 64 | #endif 65 | } 66 | 67 | #endif // TIMER_H 68 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef TIMER_H 18 | #define TIMER_H 19 | 20 | #include 21 | 22 | #ifdef WIN32 23 | #define WIN32_LEAN_AND_MEAN 24 | #include 25 | #else 26 | #include 27 | #endif 28 | 29 | #ifdef WIN32 30 | double PCFreq = 0.0; 31 | __int64 timerStart = 0; 32 | #else 33 | struct timeval timerStart; 34 | #endif 35 | 36 | void StartTimer() 37 | { 38 | #ifdef WIN32 39 | LARGE_INTEGER li; 40 | if(!QueryPerformanceFrequency(&li)) 41 | printf("QueryPerformanceFrequency failed!\n"); 42 | 43 | PCFreq = (double)li.QuadPart/1000.0; 44 | 45 | QueryPerformanceCounter(&li); 46 | timerStart = li.QuadPart; 47 | #else 48 | gettimeofday(&timerStart, NULL); 49 | #endif 50 | } 51 | 52 | // time elapsed in ms 53 | double GetTimer() 54 | { 55 | #ifdef WIN32 56 | LARGE_INTEGER li; 57 | QueryPerformanceCounter(&li); 58 | return (double)(li.QuadPart-timerStart)/PCFreq; 59 | #else 60 | struct timeval timerStop, timerElapsed; 61 | gettimeofday(&timerStop, NULL); 62 | timersub(&timerStop, &timerStart, &timerElapsed); 63 | return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0; 64 | #endif 65 | } 66 | 67 | #endif // TIMER_H 68 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef TIMER_H 18 | #define TIMER_H 19 | 20 | #include 21 | 22 | #ifdef WIN32 23 | #define WIN32_LEAN_AND_MEAN 24 | #include 25 | #else 26 | #include 27 | #endif 28 | 29 | #ifdef WIN32 30 | double PCFreq = 0.0; 31 | __int64 timerStart = 0; 32 | #else 33 | struct timeval timerStart; 34 | #endif 35 | 36 | void StartTimer() 37 | { 38 | #ifdef WIN32 39 | LARGE_INTEGER li; 40 | if(!QueryPerformanceFrequency(&li)) 41 | printf("QueryPerformanceFrequency failed!\n"); 42 | 43 | PCFreq = (double)li.QuadPart/1000.0; 44 | 45 | QueryPerformanceCounter(&li); 46 | timerStart = li.QuadPart; 47 | #else 48 | gettimeofday(&timerStart, NULL); 49 | #endif 50 | } 51 | 52 | // time elapsed in ms 53 | double GetTimer() 54 | { 55 | #ifdef WIN32 56 | LARGE_INTEGER li; 57 | QueryPerformanceCounter(&li); 58 | return (double)(li.QuadPart-timerStart)/PCFreq; 59 | #else 60 | struct timeval timerStop, timerElapsed; 61 | gettimeofday(&timerStop, NULL); 62 | timersub(&timerStop, &timerStart, &timerElapsed); 63 | return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0; 64 | #endif 65 | } 66 | 67 | #endif // TIMER_H 68 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/headers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | /* include the ATLAS headers */ 18 | 19 | extern "C" 20 | { 21 | #include 22 | } 23 | 24 | #include "../debug.h" 25 | 26 | /* choose precision to train and classify. Only float and double are 27 | * currently suppored 28 | */ 29 | 30 | typedef float floatType_t; 31 | 32 | /* macro to convert 2d coords to 1d offset */ 33 | 34 | #define INDX(row,col,ld) (((col) * (ld)) + (row)) 35 | 36 | /* macros for max/min to combine with argmin */ 37 | 38 | #define MYMAX(val,array,i,index) \ 39 | if( array[i] > val ) \ 40 | { \ 41 | val = array[i]; \ 42 | index = i; \ 43 | } \ 44 | 45 | #define MYMIN(val,array,i,index) \ 46 | if( array[i] < val ) \ 47 | { \ 48 | val = array[i]; \ 49 | index = i; \ 50 | } \ 51 | 52 | /* macro to clip values from min to max */ 53 | 54 | #define CLIP(val,min,max) \ 55 | if( (val) < (min) ) val = (min); \ 56 | else if( (val) > (max) ) val = (max); 57 | 58 | /* hardcoded constants for training and test set size and feature 59 | * vector size 60 | */ 61 | 62 | #define FEATURE_VECTOR_SIZE (1899) 63 | #define TRAINING_SET_SIZE (4000) 64 | #define TEST_SET_SIZE (1000) 65 | 66 | /* function defs */ 67 | 68 | void readMatrixFromFile( char *, int *, const int, const int ); 69 | 70 | void svmTrain( floatType_t const *, floatType_t const *, floatType_t const, 71 | const int, const int, 72 | const floatType_t , const int, 73 | floatType_t * ); 74 | 75 | void svmPredict( floatType_t const *, floatType_t const *, 76 | int const, int const, int * ); 77 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/original/headers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* include the ATLAS headers */ 18 | 19 | extern "C" 20 | { 21 | #include 22 | } 23 | 24 | #include "../debug.h" 25 | 26 | /* choose precision to train and classify. Only float and double are 27 | * currently suppored 28 | */ 29 | 30 | typedef float floatType_t; 31 | 32 | /* macro to convert 2d coords to 1d offset */ 33 | 34 | #define INDX(row,col,ld) (((col) * (ld)) + (row)) 35 | 36 | /* macros for max/min to combine with argmin */ 37 | 38 | #define MYMAX(val,array,i,index) \ 39 | if( array[i] > val ) \ 40 | { \ 41 | val = array[i]; \ 42 | index = i; \ 43 | } \ 44 | 45 | #define MYMIN(val,array,i,index) \ 46 | if( array[i] < val ) \ 47 | { \ 48 | val = array[i]; \ 49 | index = i; \ 50 | } \ 51 | 52 | /* macro to clip values from min to max */ 53 | 54 | #define CLIP(val,min,max) \ 55 | if( (val) < (min) ) val = (min); \ 56 | else if( (val) > (max) ) val = (max); 57 | 58 | /* hardcoded constants for training and test set size and feature 59 | * vector size 60 | */ 61 | 62 | #define FEATURE_VECTOR_SIZE (1899) 63 | #define TRAINING_SET_SIZE (4000) 64 | #define TEST_SET_SIZE (1000) 65 | 66 | /* function defs */ 67 | 68 | void readMatrixFromFile( char *, int *, const int, const int ); 69 | 70 | void svmTrain( floatType_t const *, floatType_t const *, floatType_t const, 71 | const int, const int, 72 | const floatType_t , const int, 73 | floatType_t * ); 74 | 75 | void svmPredict( floatType_t const *, floatType_t const *, 76 | int const, int const, int * ); 77 | 
-------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/laplace2d.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright 2012 NVIDIA Corporation 3 | ! 4 | ! Licensed under the Apache License, Version 2.0 (the "License"); 5 | ! you may not use this file except in compliance with the License. 6 | ! You may obtain a copy of the License at 7 | ! 8 | ! http://www.apache.org/licenses/LICENSE-2.0 9 | ! 10 | ! Unless required by applicable law or agreed to in writing, software 11 | ! distributed under the License is distributed on an "AS IS" BASIS, 12 | ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ! See the License for the specific language governing permissions and 14 | ! limitations under the License. 15 | ! 16 | 17 | program laplace 18 | implicit none 19 | integer, parameter :: fp_kind=kind(1.0d0) 20 | integer, parameter :: n=4096, m=4096, iter_max=1000 21 | integer :: i, j, iter 22 | real(fp_kind), dimension (:,:), allocatable :: A, Anew 23 | real(fp_kind) :: tol=1.0e-6_fp_kind, error=1.0_fp_kind 24 | real(fp_kind) :: start_time, stop_time 25 | 26 | allocate ( A(0:n-1,0:m-1), Anew(0:n-1,0:m-1) ) 27 | 28 | A = 0.0_fp_kind 29 | Anew = 0.0_fp_kind 30 | 31 | ! Set B.C. 32 | A(0,:) = 1.0_fp_kind 33 | Anew(0,:) = 1.0_fp_kind 34 | 35 | write(*,'(a,i5,a,i5,a)') 'Jacobi relaxation Calculation:', n, ' x', m, ' mesh' 36 | 37 | call cpu_time(start_time) 38 | 39 | iter=0 40 | 41 | do while ( error .gt. tol .and. iter .lt. 
iter_max ) 42 | error=0.0_fp_kind 43 | 44 | !$omp parallel do shared(m, n, Anew, A) reduction( max:error ) 45 | do j=1,m-2 46 | do i=1,n-2 47 | Anew(i,j) = 0.25_fp_kind * ( A(i+1,j ) + A(i-1,j ) + & 48 | A(i ,j-1) + A(i ,j+1) ) 49 | error = max( error, abs(Anew(i,j)-A(i,j)) ) 50 | end do 51 | end do 52 | !$omp end parallel do 53 | 54 | if(mod(iter,100).eq.0 ) write(*,'(i5,f10.6)'), iter, error 55 | iter = iter + 1 56 | 57 | !$omp parallel do shared(m, n, Anew, A) 58 | do j=1,m-2 59 | do i=1,n-2 60 | A(i,j) = Anew(i,j) 61 | end do 62 | end do 63 | !$omp end parallel do 64 | 65 | end do 66 | 67 | call cpu_time(stop_time) 68 | write(*,'(a,f10.3,a)') ' completed in ', stop_time-start_time, ' seconds' 69 | 70 | deallocate (A,Anew) 71 | end program laplace 72 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/laplace2d.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright 2012 NVIDIA Corporation 3 | ! 4 | ! Licensed under the Apache License, Version 2.0 (the "License"); 5 | ! you may not use this file except in compliance with the License. 6 | ! You may obtain a copy of the License at 7 | ! 8 | ! http://www.apache.org/licenses/LICENSE-2.0 9 | ! 10 | ! Unless required by applicable law or agreed to in writing, software 11 | ! distributed under the License is distributed on an "AS IS" BASIS, 12 | ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ! See the License for the specific language governing permissions and 14 | ! limitations under the License. 15 | ! 
16 | 17 | program laplace 18 | implicit none 19 | integer, parameter :: fp_kind=kind(1.0d0) 20 | integer, parameter :: n=4096, m=4096, iter_max=1000 21 | integer :: i, j, iter 22 | real(fp_kind), dimension (:,:), allocatable :: A, Anew 23 | real(fp_kind) :: tol=1.0e-6_fp_kind, error=1.0_fp_kind 24 | real(fp_kind) :: start_time, stop_time 25 | 26 | allocate ( A(0:n-1,0:m-1), Anew(0:n-1,0:m-1) ) 27 | 28 | A = 0.0_fp_kind 29 | Anew = 0.0_fp_kind 30 | 31 | ! Set B.C. 32 | A(0,:) = 1.0_fp_kind 33 | Anew(0,:) = 1.0_fp_kind 34 | 35 | write(*,'(a,i5,a,i5,a)') 'Jacobi relaxation Calculation:', n, ' x', m, ' mesh' 36 | 37 | call cpu_time(start_time) 38 | 39 | iter=0 40 | 41 | do while ( error .gt. tol .and. iter .lt. iter_max ) 42 | error=0.0_fp_kind 43 | 44 | !$omp parallel do shared(m, n, Anew, A) reduction( max:error ) 45 | !$acc kernels 46 | do j=1,m-2 47 | do i=1,n-2 48 | Anew(i,j) = 0.25_fp_kind * ( A(i+1,j ) + A(i-1,j ) + & 49 | A(i ,j-1) + A(i ,j+1) ) 50 | error = max( error, abs(Anew(i,j)-A(i,j)) ) 51 | end do 52 | end do 53 | !$acc end kernels 54 | !$omp end parallel do 55 | 56 | if(mod(iter,100).eq.0 ) write(*,'(i5,f10.6)'), iter, error 57 | iter = iter + 1 58 | 59 | !$omp parallel do shared(m, n, Anew, A) 60 | !$acc kernels 61 | do j=1,m-2 62 | do i=1,n-2 63 | A(i,j) = Anew(i,j) 64 | end do 65 | end do 66 | !$acc end kernels 67 | !$omp end parallel do 68 | 69 | end do 70 | 71 | call cpu_time(stop_time) 72 | write(*,'(a,f10.3,a)') ' completed in ', stop_time-start_time, ' seconds' 73 | 74 | deallocate (A,Anew) 75 | end program laplace 76 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/laplace2d.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright 2012 NVIDIA Corporation 3 | ! 4 | ! Licensed under the Apache License, Version 2.0 (the "License"); 5 | ! you may not use this file except in compliance with the License. 6 | ! 
You may obtain a copy of the License at 7 | ! 8 | ! http://www.apache.org/licenses/LICENSE-2.0 9 | ! 10 | ! Unless required by applicable law or agreed to in writing, software 11 | ! distributed under the License is distributed on an "AS IS" BASIS, 12 | ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ! See the License for the specific language governing permissions and 14 | ! limitations under the License. 15 | ! 16 | 17 | program laplace 18 | implicit none 19 | integer, parameter :: fp_kind=kind(1.0d0) 20 | integer, parameter :: n=4096, m=4096, iter_max=1000 21 | integer :: i, j, iter 22 | real(fp_kind), dimension (:,:), allocatable :: A, Anew 23 | real(fp_kind) :: tol=1.0e-6_fp_kind, error=1.0_fp_kind 24 | real(fp_kind) :: start_time, stop_time 25 | 26 | allocate ( A(0:n-1,0:m-1), Anew(0:n-1,0:m-1) ) 27 | 28 | A = 0.0_fp_kind 29 | Anew = 0.0_fp_kind 30 | 31 | ! Set B.C. 32 | A(0,:) = 1.0_fp_kind 33 | Anew(0,:) = 1.0_fp_kind 34 | 35 | write(*,'(a,i5,a,i5,a)') 'Jacobi relaxation Calculation:', n, ' x', m, ' mesh' 36 | 37 | call cpu_time(start_time) 38 | 39 | iter=0 40 | 41 | do while ( error .gt. tol .and. iter .lt. 
iter_max ) 42 | error=0.0_fp_kind 43 | 44 | !$omp parallel do shared(m, n, Anew, A) reduction( max:error ) 45 | !$acc kernels 46 | do j=1,m-2 47 | do i=1,n-2 48 | Anew(i,j) = 0.25_fp_kind * ( A(i+1,j ) + A(i-1,j ) + & 49 | A(i ,j-1) + A(i ,j+1) ) 50 | error = max( error, abs(Anew(i,j)-A(i,j)) ) 51 | end do 52 | end do 53 | !$acc end kernels 54 | !$omp end parallel do 55 | 56 | if(mod(iter,100).eq.0 ) write(*,'(i5,f10.6)'), iter, error 57 | iter = iter + 1 58 | 59 | !$omp parallel do shared(m, n, Anew, A) 60 | !$acc kernels 61 | do j=1,m-2 62 | do i=1,n-2 63 | A(i,j) = Anew(i,j) 64 | end do 65 | end do 66 | !$acc end kernels 67 | !$omp end parallel do 68 | 69 | end do 70 | 71 | call cpu_time(stop_time) 72 | write(*,'(a,f10.3,a)') ' completed in ', stop_time-start_time, ' seconds' 73 | 74 | deallocate (A,Anew) 75 | end program laplace 76 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | *c = *a + *b; 23 | } 24 | 25 | int main() 26 | { 27 | int a, b, c; 28 | int *d_a, *d_b, *d_c; 29 | int size = sizeof( int ); 30 | 31 | /* get GPU device number and name */ 32 | 33 | int dev; 34 | cudaDeviceProp deviceProp; 35 | checkCUDA( cudaGetDevice( &dev ) ); 36 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 37 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 38 | 39 | /* allocate space for device copies of a, b, c */ 40 | 41 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 42 | /* enter code here to malloc d_b and d_c */ 43 | FIXME 44 | 45 | /* zero out the device memory for C */ 46 | 47 | checkCUDA( cudaMemset( d_c, 0, size ) ); 48 | 49 | /* setup initial values */ 50 | 51 | a = 2; 52 | b = 7; 53 | c = -99; 54 | 55 | /* copy inputs to device */ 56 | 57 | checkCUDA( cudaMemcpy( d_a, &a, size, cudaMemcpyHostToDevice ) ); 58 | /* enter code here to copy d_b to device */ 59 | FIXME 60 | 61 | /* enter code here to launch the kernel on the GPU */ 62 | FIXME 63 | 64 | checkKERNEL() 65 | 66 | /* copy result back to host */ 67 | 68 | checkCUDA( cudaMemcpy( &c, d_c, size, cudaMemcpyDeviceToHost ) ); 69 | 70 | printf("value of c after kernel is %d\n",c); 71 | if( c == ( a + b ) ) printf("PASS\n"); 72 | else printf("FAIL\n"); 73 | 74 | /* clean up */ 75 | 76 | checkCUDA( cudaFree( d_a ) ); 77 | FIXME 78 | /* enter code here to cudaFree the d_b and d_c pointers */ 79 | 80 | /* calling reset to check errors */ 81 | checkCUDA( cudaDeviceReset() ); 82 | 83 | return 0; 84 | } /* end main */ 85 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/laplace2d.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright 2012 NVIDIA Corporation 3 | ! 4 | ! 
Licensed under the Apache License, Version 2.0 (the "License"); 5 | ! you may not use this file except in compliance with the License. 6 | ! You may obtain a copy of the License at 7 | ! 8 | ! http://www.apache.org/licenses/LICENSE-2.0 9 | ! 10 | ! Unless required by applicable law or agreed to in writing, software 11 | ! distributed under the License is distributed on an "AS IS" BASIS, 12 | ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ! See the License for the specific language governing permissions and 14 | ! limitations under the License. 15 | ! 16 | 17 | program laplace 18 | implicit none 19 | integer, parameter :: fp_kind=kind(1.0d0) 20 | integer, parameter :: n=4096, m=4096, iter_max=1000 21 | integer :: i, j, iter 22 | real(fp_kind), dimension (:,:), allocatable :: A, Anew 23 | real(fp_kind) :: tol=1.0e-6_fp_kind, error=1.0_fp_kind 24 | real(fp_kind) :: start_time, stop_time 25 | 26 | allocate ( A(0:n-1,0:m-1), Anew(0:n-1,0:m-1) ) 27 | 28 | A = 0.0_fp_kind 29 | Anew = 0.0_fp_kind 30 | 31 | ! Set B.C. 32 | A(0,:) = 1.0_fp_kind 33 | Anew(0,:) = 1.0_fp_kind 34 | 35 | write(*,'(a,i5,a,i5,a)') 'Jacobi relaxation Calculation:', n, ' x', m, ' mesh' 36 | 37 | call cpu_time(start_time) 38 | 39 | iter=0 40 | 41 | !$acc data copy(A) create(Anew) 42 | do while ( error .gt. tol .and. iter .lt. 
iter_max ) 43 | error=0.0_fp_kind 44 | 45 | !$omp parallel do shared(m, n, Anew, A) reduction( max:error ) 46 | !$acc kernels 47 | do j=1,m-2 48 | do i=1,n-2 49 | Anew(i,j) = 0.25_fp_kind * ( A(i+1,j ) + A(i-1,j ) + & 50 | A(i ,j-1) + A(i ,j+1) ) 51 | error = max( error, abs(Anew(i,j)-A(i,j)) ) 52 | end do 53 | end do 54 | !$acc end kernels 55 | !$omp end parallel do 56 | 57 | if(mod(iter,100).eq.0 ) write(*,'(i5,f10.6)'), iter, error 58 | iter = iter + 1 59 | 60 | !$omp parallel do shared(m, n, Anew, A) 61 | !$acc kernels 62 | do j=1,m-2 63 | do i=1,n-2 64 | A(i,j) = Anew(i,j) 65 | end do 66 | end do 67 | !$acc end kernels 68 | !$omp end parallel do 69 | 70 | end do 71 | !$acc end data 72 | 73 | call cpu_time(stop_time) 74 | write(*,'(a,f10.3,a)') ' completed in ', stop_time-start_time, ' seconds' 75 | 76 | deallocate (A,Anew) 77 | end program laplace 78 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | *c = *a + *b; 23 | } 24 | 25 | int main() 26 | { 27 | 28 | int a, b, c; 29 | int *d_a, *d_b, *d_c; 30 | int size = sizeof( int ); 31 | 32 | /* get GPU device number and name */ 33 | 34 | int dev; 35 | cudaDeviceProp deviceProp; 36 | checkCUDA( cudaGetDevice( &dev ) ); 37 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 38 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 39 | 40 | /* allocate space for device copies of a, b, c */ 41 | 42 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 43 | checkCUDA( cudaMalloc( (void **) &d_b, size ) ); 44 | checkCUDA( cudaMalloc( (void **) &d_c, size ) ); 45 | 46 | /* setup initial values */ 47 | 48 | a = 2; 49 | b = 7; 50 | c = -99; 51 | 52 | 53 | /* copy inputs to device */ 54 | 55 | checkCUDA( cudaMemcpy( d_a, &a, size, cudaMemcpyHostToDevice ) ); 56 | checkCUDA( cudaMemcpy( d_b, &b, size, cudaMemcpyHostToDevice ) ); 57 | 58 | /* zero out the device memory for C */ 59 | 60 | checkCUDA( cudaMemset( d_c, 0, size ) ); 61 | 62 | /* launch the kernel on the GPU */ 63 | 64 | add<<< 1, 1 >>>( d_a, d_b, d_c ); 65 | checkKERNEL() 66 | 67 | /* copy result back to host */ 68 | 69 | checkCUDA( cudaMemcpy( &c, d_c, size, cudaMemcpyDeviceToHost ) ); 70 | 71 | printf("value of c after kernel is %d\n",c); 72 | if( c == ( a + b ) ) printf("PASS\n"); 73 | else printf("FAIL\n"); 74 | 75 | /* clean up */ 76 | 77 | checkCUDA( cudaFree( d_a ) ); 78 | checkCUDA( cudaFree( d_b ) ); 79 | checkCUDA( cudaFree( d_c ) ); 80 | 81 | checkCUDA( cudaDeviceReset() ); 82 | 83 | return 0; 84 | } /* end main */ 85 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/headers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* include the ATLAS headers */ 18 | 19 | extern "C" 20 | { 21 | #include 22 | } 23 | 24 | #include "../debug.h" 25 | 26 | /* choose precision to train and classify. Only float and double are 27 | * currently suppored 28 | */ 29 | 30 | typedef float floatType_t; 31 | 32 | /* macro to convert 2d coords to 1d offset */ 33 | 34 | #define INDX(row,col,ld) (((col) * (ld)) + (row)) 35 | 36 | /* macros for max/min to combine with argmin */ 37 | 38 | #define MYMAX(val,array,i,index) \ 39 | if( array[i] > val ) \ 40 | { \ 41 | val = array[i]; \ 42 | index = i; \ 43 | } \ 44 | 45 | #define MYMIN(val,array,i,index) \ 46 | if( array[i] < val ) \ 47 | { \ 48 | val = array[i]; \ 49 | index = i; \ 50 | } \ 51 | 52 | /* macro to clip values from min to max */ 53 | 54 | #define CLIP(val,min,max) \ 55 | if( (val) < (min) ) val = (min); \ 56 | else if( (val) > (max) ) val = (max); 57 | 58 | /* hardcoded constants for training and test set size and feature 59 | * vector size 60 | */ 61 | 62 | #define FEATURE_VECTOR_SIZE (1899) 63 | #define TRAINING_SET_SIZE (4000) 64 | #define TEST_SET_SIZE (1000) 65 | 66 | /* function defs */ 67 | 68 | void readMatrixFromFile( char *, int *, const int, const int ); 69 | 70 | void calculateBI( floatType_t const *, 71 | floatType_t const *, 72 | floatType_t const *, 73 | int , 74 | floatType_t *, floatType_t *, 75 | int *, int *, 76 | floatType_t const ); 77 | 78 | void svmTrain( floatType_t const *, floatType_t 
const *, floatType_t const, 79 | const int, const int, 80 | const floatType_t, floatType_t * ); 81 | 82 | void svmPredict( floatType_t const *, floatType_t const *, 83 | int const, int const, int * ); 84 | -------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/laplace2d.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include "timer.h" 21 | 22 | #define NN 4096 23 | #define NM 4096 24 | 25 | double A[NN][NM]; 26 | double Anew[NN][NM]; 27 | 28 | int main(int argc, char** argv) 29 | { 30 | const int n = NN; 31 | const int m = NM; 32 | const int iter_max = 1000; 33 | 34 | const double tol = 1.0e-6; 35 | double error = 1.0; 36 | 37 | memset(A, 0, n * m * sizeof(double)); 38 | memset(Anew, 0, n * m * sizeof(double)); 39 | 40 | for (int j = 0; j < n; j++) 41 | { 42 | A[j][0] = 1.0; 43 | Anew[j][0] = 1.0; 44 | } 45 | 46 | printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m); 47 | 48 | StartTimer(); 49 | int iter = 0; 50 | 51 | while ( error > tol && iter < iter_max ) 52 | { 53 | error = 0.0; 54 | 55 | #pragma omp parallel for shared(m, n, Anew, A) 56 | for( int j = 1; j < n-1; j++) 57 | { 58 | for( int i = 1; i < m-1; i++ ) 59 | { 60 | Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] 61 | + A[j-1][i] + A[j+1][i]); 62 | error = fmax( error, fabs(Anew[j][i] - A[j][i])); 63 | } 64 | } 65 | 66 | #pragma omp parallel for shared(m, n, Anew, A) 67 | for( int j = 1; j < n-1; j++) 68 | { 69 | for( int i = 1; i < m-1; i++ ) 70 | { 71 | A[j][i] = Anew[j][i]; 72 | } 73 | } 74 | 75 | if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); 76 | 77 | iter++; 78 | } 79 | 80 | double runtime = GetTimer(); 81 | 82 | printf(" total: %f s\n", runtime / 1000); 83 | return 0; 84 | } 85 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/original/headers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* include the ATLAS headers */ 18 | 19 | extern "C" 20 | { 21 | #include 22 | } 23 | 24 | #include "../debug.h" 25 | 26 | /* choose precision to train and classify. Only float and double are 27 | * currently suppored 28 | */ 29 | 30 | typedef float floatType_t; 31 | 32 | /* macro to convert 2d coords to 1d offset */ 33 | 34 | #define INDX(row,col,ld) (((col) * (ld)) + (row)) 35 | 36 | /* macros for max/min to combine with argmin */ 37 | 38 | #define MYMAX(val,array,i,index) \ 39 | if( array[i] > val ) \ 40 | { \ 41 | val = array[i]; \ 42 | index = i; \ 43 | } \ 44 | 45 | #define MYMIN(val,array,i,index) \ 46 | if( array[i] < val ) \ 47 | { \ 48 | val = array[i]; \ 49 | index = i; \ 50 | } \ 51 | 52 | /* macro to clip values from min to max */ 53 | 54 | #define CLIP(val,min,max) \ 55 | if( (val) < (min) ) val = (min); \ 56 | else if( (val) > (max) ) val = (max); 57 | 58 | /* hardcoded constants for training and test set size and feature 59 | * vector size 60 | */ 61 | 62 | #define FEATURE_VECTOR_SIZE (1899) 63 | #define TRAINING_SET_SIZE (4000) 64 | #define TEST_SET_SIZE (1000) 65 | 66 | /* function defs */ 67 | 68 | void readMatrixFromFile( char *, int *, const int, const int ); 69 | 70 | void calculateBI( floatType_t const *, 71 | floatType_t const *, 72 | floatType_t const *, 73 | int , 74 | floatType_t *, floatType_t *, 75 | int *, int *, 76 | floatType_t const ); 77 | 78 | void svmTrain( floatType_t const *, floatType_t const *, floatType_t const, 79 | const int, const int, 80 | const floatType_t, 
floatType_t * ); 81 | 82 | void svmPredict( floatType_t const *, floatType_t const *, 83 | int const, int const, int * ); 84 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/laplace2d.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include "timer.h" 20 | 21 | #define NN 4096 22 | #define NM 4096 23 | 24 | double A[NN][NM]; 25 | double Anew[NN][NM]; 26 | 27 | int main(int argc, char** argv) 28 | { 29 | const int n = NN; 30 | const int m = NM; 31 | const int iter_max = 1000; 32 | 33 | const double tol = 1.0e-6; 34 | double error = 1.0; 35 | 36 | memset(A, 0, n * m * sizeof(double)); 37 | memset(Anew, 0, n * m * sizeof(double)); 38 | 39 | for (int j = 0; j < n; j++) 40 | { 41 | A[j][0] = 1.0; 42 | Anew[j][0] = 1.0; 43 | } 44 | 45 | printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m); 46 | 47 | StartTimer(); 48 | int iter = 0; 49 | 50 | while ( error > tol && iter < iter_max ) 51 | { 52 | error = 0.0; 53 | 54 | #pragma omp parallel for shared(m, n, Anew, A) 55 | #pragma acc kernels 56 | for( int j = 1; j < n-1; j++) 57 | { 58 | for( int i = 1; i < m-1; i++ ) 59 | { 60 | Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] 61 | + A[j-1][i] + A[j+1][i]); 62 | error = fmax( error, 
fabs(Anew[j][i] - A[j][i])); 63 | } 64 | } 65 | 66 | #pragma omp parallel for shared(m, n, Anew, A) 67 | #pragma acc kernels 68 | for( int j = 1; j < n-1; j++) 69 | { 70 | for( int i = 1; i < m-1; i++ ) 71 | { 72 | A[j][i] = Anew[j][i]; 73 | } 74 | } 75 | 76 | if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); 77 | 78 | iter++; 79 | } 80 | 81 | double runtime = GetTimer(); 82 | 83 | printf(" total: %f s\n", runtime / 1000); 84 | return 0; 85 | } 86 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/laplace2d.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include "timer.h" 21 | 22 | #define NN 4096 23 | #define NM 4096 24 | 25 | double A[NN][NM]; 26 | double Anew[NN][NM]; 27 | 28 | int main(int argc, char** argv) 29 | { 30 | const int n = NN; 31 | const int m = NM; 32 | const int iter_max = 1000; 33 | 34 | const double tol = 1.0e-6; 35 | double error = 1.0; 36 | 37 | memset(A, 0, n * m * sizeof(double)); 38 | memset(Anew, 0, n * m * sizeof(double)); 39 | 40 | for (int j = 0; j < n; j++) 41 | { 42 | A[j][0] = 1.0; 43 | Anew[j][0] = 1.0; 44 | } 45 | 46 | printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m); 47 | 48 | StartTimer(); 49 | int iter = 0; 50 | 51 | while ( error > tol && iter < iter_max ) 52 | { 53 | error = 0.0; 54 | 55 | #pragma omp parallel for shared(m, n, Anew, A) 56 | #pragma acc kernels 57 | for( int j = 1; j < n-1; j++) 58 | { 59 | for( int i = 1; i < m-1; i++ ) 60 | { 61 | Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] 62 | + A[j-1][i] + A[j+1][i]); 63 | error = fmax( error, fabs(Anew[j][i] - A[j][i])); 64 | } 65 | } 66 | 67 | #pragma omp parallel for shared(m, n, Anew, A) 68 | #pragma acc kernels 69 | for( int j = 1; j < n-1; j++) 70 | { 71 | for( int i = 1; i < m-1; i++ ) 72 | { 73 | A[j][i] = Anew[j][i]; 74 | } 75 | } 76 | 77 | if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); 78 | 79 | iter++; 80 | } 81 | 82 | double runtime = GetTimer(); 83 | 84 | printf(" total: %f s\n", runtime / 1000); 85 | return 0; 86 | } 87 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/laplace2d.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include "timer.h" 20 | 21 | #define NN 4096 22 | #define NM 4096 23 | 24 | double A[NN][NM]; 25 | double Anew[NN][NM]; 26 | 27 | int main(int argc, char** argv) 28 | { 29 | const int n = NN; 30 | const int m = NM; 31 | const int iter_max = 1000; 32 | 33 | const double tol = 1.0e-6; 34 | double error = 1.0; 35 | 36 | memset(A, 0, n * m * sizeof(double)); 37 | memset(Anew, 0, n * m * sizeof(double)); 38 | 39 | for (int j = 0; j < n; j++) 40 | { 41 | A[j][0] = 1.0; 42 | Anew[j][0] = 1.0; 43 | } 44 | 45 | printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m); 46 | 47 | StartTimer(); 48 | int iter = 0; 49 | 50 | #pragma acc data copy(A), create(Anew) 51 | while ( error > tol && iter < iter_max ) 52 | { 53 | error = 0.0; 54 | 55 | #pragma omp parallel for shared(m, n, Anew, A) 56 | #pragma acc kernels 57 | for( int j = 1; j < n-1; j++) 58 | { 59 | for( int i = 1; i < m-1; i++ ) 60 | { 61 | Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] 62 | + A[j-1][i] + A[j+1][i]); 63 | error = fmax( error, fabs(Anew[j][i] - A[j][i])); 64 | } 65 | } 66 | 67 | #pragma omp parallel for shared(m, n, Anew, A) 68 | #pragma acc kernels 69 | for( int j = 1; j < n-1; j++) 70 | { 71 | for( int i = 1; i < m-1; i++ ) 72 | { 73 | A[j][i] = Anew[j][i]; 74 | } 75 | } 76 | 77 | if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); 78 | 79 | iter++; 80 | } 81 | 82 | double runtime = GetTimer(); 83 | 84 | printf(" total: %f s\n", runtime / 1000); 85 | return 0; 86 | } 87 | 
--------------------------------------------------------------------------------
/exercises/cuda/thrust_sort/kernel.cu:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 NVIDIA Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* NOTE(review): the original include targets were stripped by text
 * extraction; restored to the headers this translation unit actually uses. */
#include <stdio.h>                  /* printf */
#include <stdlib.h>                 /* rand */
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/equal.h>           /* thrust::equal used for verification */

/*
 * Benchmark: sort 32M random ints with thrust on the GPU and again on the
 * host, report both throughputs, and verify the two sorted sequences match.
 * Requires a CUDA-capable device; CUDA events time both phases.
 */
int main(void)
{
  const int numElements = 32 << 20;   /* 32M keys */

  // generate 32M random numbers on the host
  thrust::host_vector<int> h_vec( numElements );
  thrust::generate( h_vec.begin(), h_vec.end(), rand );

  // replicate input on another host vector
  thrust::host_vector<int> h_vec1 = h_vec;

  //transfer data to the device
  thrust::device_vector<int> d_vec = h_vec;

  //create timers
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord( start, 0 );

  //sort data on the device
  thrust::sort( d_vec.begin(), d_vec.end() );

  cudaEventRecord( stop, 0 );
  cudaEventSynchronize( stop );

  float GPUelapsedTime;
  cudaEventElapsedTime( &GPUelapsedTime, start, stop );

  GPUelapsedTime /= 1000.0f;   /* ms -> s */

  /* bug fix: the original passed the int expression (32<<20) to "%ld",
   * which is undefined behavior; "%d" matches the argument type */
  printf("sort of %d in %f seconds\n", numElements, GPUelapsedTime );
  printf("Sort of %f M / sec\n", (double)numElements / (double)GPUelapsedTime *
    1e-6);

  //transfer data back to host
  thrust::copy( d_vec.begin(), d_vec.end(), h_vec.begin() );


  cudaEventRecord( start, 0 );

  //sort data on host
  thrust::sort(h_vec1.begin(), h_vec1.end() );

  cudaEventRecord( stop, 0 );
  cudaEventSynchronize( stop );

  float CPUelapsedTime;
  cudaEventElapsedTime( &CPUelapsedTime, start, stop );
  CPUelapsedTime /= 1000.0f;

  printf("sort of %d in %f seconds\n", numElements, CPUelapsedTime );
  printf("Sort of %f M / sec\n", (double)numElements / (double)CPUelapsedTime *
    1e-6);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  printf("GPU is %5.2fX faster than CPU\n", CPUelapsedTime/GPUelapsedTime );

  if ( thrust::equal( h_vec1.begin(), h_vec1.end(), h_vec.begin() ) )
    printf("The arrays are equal\n");
  else
    printf("The arrays are different!\n");


  return 0;
} /* end main */

--------------------------------------------------------------------------------
/exercise_solutions/cuda/thrust_sort/kernel.cu:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 NVIDIA Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | int main(void) 25 | { 26 | // generate 32M random numbers on the host 27 | thrust::host_vector h_vec( 32 << 20 ); 28 | thrust::generate( h_vec.begin(), h_vec.end(), rand ); 29 | 30 | // replicate input on another host vector 31 | thrust::host_vector h_vec1 = h_vec; 32 | 33 | //transfer data to the device 34 | thrust::device_vector d_vec = h_vec; 35 | 36 | //create timers 37 | cudaEvent_t start, stop; 38 | cudaEventCreate(&start); 39 | cudaEventCreate(&stop); 40 | 41 | cudaEventRecord( start, 0 ); 42 | 43 | //sort data on the device 44 | thrust::sort( d_vec.begin(), d_vec.end() ); 45 | 46 | cudaEventRecord( stop, 0 ); 47 | cudaEventSynchronize( stop ); 48 | 49 | float GPUelapsedTime; 50 | cudaEventElapsedTime( &GPUelapsedTime, start, stop ); 51 | 52 | GPUelapsedTime /= 1000.0; 53 | 54 | printf("sort of %ld in %f seconds\n", 32<<20, GPUelapsedTime ); 55 | printf("Sort of %f M / sec\n", (double)(32<<20) / (double)GPUelapsedTime * 56 | 1e-6); 57 | 58 | //transfer data back to host 59 | thrust::copy( d_vec.begin(), d_vec.end(), h_vec.begin() ); 60 | 61 | 62 | cudaEventRecord( start, 0 ); 63 | 64 | //sort data on host 65 | thrust::sort(h_vec1.begin(), h_vec1.end() ); 66 | 67 | cudaEventRecord( stop, 0 ); 68 | cudaEventSynchronize( stop ); 69 | 70 | float CPUelapsedTime; 71 | cudaEventElapsedTime( &CPUelapsedTime, start, stop ); 72 | CPUelapsedTime /= 1000.0; 73 | 74 | printf("sort of %ld in %f seconds\n", 32<<20,CPUelapsedTime ); 75 | printf("Sort of %f M / sec\n", (double)(32<<20) / (double)CPUelapsedTime * 76 | 1e-6); 77 | 78 | cudaEventDestroy(start); 79 | cudaEventDestroy(stop); 80 | 81 | printf("GPU is %5.2fX faster than CPU\n", CPUelapsedTime/GPUelapsedTime ); 82 | 83 | if ( thrust::equal( h_vec1.begin(), h_vec1.end(), h_vec.begin() ) ) 84 | printf("The arrays are equal\n"); 85 | else 86 | printf("The arrays are different!\n"); 87 | 88 | 89 | return 0; 90 | } 
/* end main */ 91 | 92 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_threads/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x]; 23 | } 24 | 25 | #define N 32 26 | 27 | int main() 28 | { 29 | int *a, *b, *c; 30 | int *d_a, *d_b, *d_c; 31 | int size = N * sizeof( int ); 32 | 33 | /* get GPU device number and name */ 34 | 35 | int dev; 36 | cudaDeviceProp deviceProp; 37 | checkCUDA( cudaGetDevice( &dev ) ); 38 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 39 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 40 | 41 | /* allocate space for device copies of a, b, c */ 42 | 43 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 44 | checkCUDA( cudaMalloc( (void **) &d_b, size ) ); 45 | checkCUDA( cudaMalloc( (void **) &d_c, size ) ); 46 | 47 | /* allocate space for host copies of a, b, c and setup input values */ 48 | 49 | a = (int *)malloc( size ); 50 | b = (int *)malloc( size ); 51 | c = (int *)malloc( size ); 52 | 53 | for( int i = 0; i < N; i++ ) 54 | { 55 | a[i] = b[i] = i; 56 | c[i] = 0; 57 | } 58 | 59 | /* copy inputs to device */ 60 | 61 
| checkCUDA( cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice ) ); 62 | checkCUDA( cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice ) ); 63 | 64 | /* zero out C array */ 65 | 66 | checkCUDA( cudaMemset( d_c, 0, size ) ); 67 | 68 | /* launch the kernel on the GPU */ 69 | 70 | add<<< 1, N >>>( d_a, d_b, d_c ); 71 | checkKERNEL() 72 | 73 | /* copy result back to host */ 74 | 75 | checkCUDA( cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost ) ); 76 | 77 | int success = 1; 78 | 79 | for( int i = 0; i < N; i++ ) 80 | { 81 | printf("c[%d] = %d\n",i,c[i]); 82 | if( c[i] != a[i] + b[i] ) 83 | { 84 | success = 0; 85 | break; 86 | } /* end if */ 87 | } /* end for */ 88 | 89 | if( success == 1 ) printf("PASS\n"); 90 | else printf("FAIL\n"); 91 | 92 | /* clean up */ 93 | 94 | free(a); 95 | free(b); 96 | free(c); 97 | checkCUDA( cudaFree( d_a ) ); 98 | checkCUDA( cudaFree( d_b ) ); 99 | checkCUDA( cudaFree( d_c ) ); 100 | 101 | checkCUDA( cudaDeviceReset() ); 102 | 103 | return 0; 104 | } /* end main */ 105 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_blocks/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x]; 23 | } 24 | 25 | #define N 32 26 | 27 | int main() 28 | { 29 | int *a, *b, *c; 30 | int *d_a, *d_b, *d_c; 31 | int size = N * sizeof( int ); 32 | 33 | /* get GPU device number and name */ 34 | 35 | int dev; 36 | cudaDeviceProp deviceProp; 37 | checkCUDA( cudaGetDevice( &dev ) ); 38 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 39 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 40 | 41 | /* allocate space for device copies of a, b, c */ 42 | 43 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 44 | checkCUDA( cudaMalloc( (void **) &d_b, size ) ); 45 | checkCUDA( cudaMalloc( (void **) &d_c, size ) ); 46 | 47 | /* allocate space for host copies of a, b, c and setup input values */ 48 | 49 | a = (int *)malloc( size ); 50 | b = (int *)malloc( size ); 51 | c = (int *)malloc( size ); 52 | 53 | for( int i = 0; i < N; i++ ) 54 | { 55 | a[i] = b[i] = i; 56 | c[i] = 0; 57 | } /* end for */ 58 | 59 | /* copy inputs to device */ 60 | 61 | checkCUDA( cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice ) ); 62 | checkCUDA( cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice ) ); 63 | 64 | /* zero out C array */ 65 | 66 | checkCUDA( cudaMemset( d_c, 0, size ) ); 67 | 68 | /* launch the kernel on the GPU */ 69 | /* finish the kernel launch with N blocks and 1 thread per block */ 70 | add<<< N, 1 >>>( d_a, d_b, d_c ); 71 | checkKERNEL() 72 | 73 | /* copy result back to host */ 74 | 75 | checkCUDA( cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost ) ); 76 | 77 | int success = 1; 78 | 79 | for( int i = 0; i < N; i++ ) 80 | { 81 | printf("c[%d] = %d\n",i,c[i]); 82 | if( c[i] != a[i] + b[i] ) 83 | { 84 | success = 0; 85 | break; 86 | } /* end if */ 87 | } /* end for */ 88 | 89 | if( success == 1 ) printf("PASS\n"); 90 | else printf("FAIL\n"); 91 | 92 | /* clean up */ 93 | 94 | free(a); 95 | free(b); 96 | 
free(c); 97 | checkCUDA( cudaFree( d_a ) ); 98 | checkCUDA( cudaFree( d_b ) ); 99 | checkCUDA( cudaFree( d_c ) ); 100 | 101 | return 0; 102 | } /* end main */ 103 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_blocks/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | /* add the proper index so each block calculates a different value in the 23 | array */ 24 | c[FIXME] = a[FIXME] + b[FIXME]; 25 | } 26 | 27 | #define N 32 28 | 29 | int main() 30 | { 31 | int *a, *b, *c; 32 | int *d_a, *d_b, *d_c; 33 | int size = N * sizeof( int ); 34 | 35 | /* get GPU device number and name */ 36 | 37 | int dev; 38 | cudaDeviceProp deviceProp; 39 | checkCUDA( cudaGetDevice( &dev ) ); 40 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 41 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 42 | 43 | /* allocate space for device copies of a, b, c */ 44 | 45 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 46 | /* insert code here for d_b and d_c */ 47 | FIXME 48 | 49 | /* allocate space for host copies of a, b, c and setup input values */ 50 | 51 | a = (int *)malloc( size ); 52 | b = (int *)malloc( size ); 53 | c = (int *)malloc( size ); 54 | 55 | for( int i = 0; i < N; i++ ) 56 | { 57 | a[i] = b[i] = i; 58 | c[i] = 0; 59 | } /* end for */ 60 | 61 | /* copy inputs to device */ 62 | 63 | checkCUDA( cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice ) ); 64 | /* insert code to copy b to the device */ 65 | FIXME 66 | 67 | /* zero out C array */ 68 | 69 | checkCUDA( cudaMemset( d_c, 0, size ) ); 70 | 71 | /* launch the kernel on the GPU */ 72 | /* finish the kernel launch with N blocks and 1 thread per block */ 73 | add<<< FIXME, FIXME >>>( d_a, d_b, d_c ); 74 | checkKERNEL() 75 | 76 | /* copy result back to host */ 77 | 78 | checkCUDA( cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost ) ); 79 | 80 | int success = 1; 81 | 82 | for( int i = 0; i < N; i++ ) 83 | { 84 | printf("c[%d] = %d\n",i,c[i]); 85 | if( c[i] != a[i] + b[i] ) 86 | { 87 | success = 0; 88 | break; 89 | } /* end if */ 90 | } /* end for */ 91 | 92 | if( success == 1 ) printf("PASS\n"); 93 | else printf("FAIL\n"); 94 | 95 | /* clean up */ 96 | 97 | free(a); 
98 | free(b); 99 | free(c); 100 | checkCUDA( cudaFree( d_a ) ); 101 | checkCUDA( cudaFree( d_b ) ); 102 | checkCUDA( cudaFree( d_c ) ); 103 | 104 | return 0; 105 | } /* end main */ 106 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_blocks_threads/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | int index = threadIdx.x + blockIdx.x * blockDim.x; 23 | c[index] = a[index] + b[index]; 24 | } 25 | 26 | #define N (2048*2048) 27 | #define THREADS_PER_BLOCK 512 28 | 29 | int main() 30 | { 31 | int *a, *b, *c; 32 | int *d_a, *d_b, *d_c; 33 | int size = N * sizeof( int ); 34 | 35 | /* get GPU device number and name */ 36 | 37 | int dev; 38 | cudaDeviceProp deviceProp; 39 | checkCUDA( cudaGetDevice( &dev ) ); 40 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 41 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 42 | 43 | /* allocate space for device copies of a, b, c */ 44 | 45 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 46 | checkCUDA( cudaMalloc( (void **) &d_b, size ) ); 47 | checkCUDA( cudaMalloc( (void **) &d_c, size ) ); 48 | 49 | /* allocate space for host copies of a, b, c and setup input values */ 50 | 51 | a = (int *)malloc( size ); 52 | b = (int *)malloc( size ); 53 | c = (int *)malloc( size ); 54 | 55 | for( int i = 0; i < N; i++ ) 56 | { 57 | a[i] = b[i] = i; 58 | c[i] = 0; 59 | } 60 | 61 | /* copy inputs to device */ 62 | 63 | checkCUDA( cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice ) ); 64 | checkCUDA( cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice ) ); 65 | 66 | /* zero out the C array */ 67 | 68 | checkCUDA( cudaMemset( d_c, 0, size ) ); 69 | 70 | /* launch the kernel on the GPU */ 71 | 72 | add<<< N / THREADS_PER_BLOCK, THREADS_PER_BLOCK >>>( d_a, d_b, d_c ); 73 | checkKERNEL() 74 | 75 | /* copy result back to host */ 76 | 77 | checkCUDA( cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost ) ); 78 | 79 | int success = 1; 80 | 81 | for( int i = 0; i < N; i++ ) 82 | { 83 | if( c[i] != a[i] + b[i] ) 84 | { 85 | printf("c[%d] = %d\n",i,c[i] ); 86 | success = 0; 87 | break; 88 | } /* end if */ 89 | } 90 | 91 | if( success == 1 ) printf("PASS\n"); 92 | else printf("FAIL\n"); 93 | 94 | /* clean up */ 95 | 96 | 
free(a); 97 | free(b); 98 | free(c); 99 | checkCUDA( cudaFree( d_a ) ); 100 | checkCUDA( cudaFree( d_b ) ); 101 | checkCUDA( cudaFree( d_c ) ); 102 | 103 | checkCUDA( cudaDeviceReset() ); 104 | 105 | return 0; 106 | } /* end main */ 107 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_blocks_threads/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | int index = threadIdx.x + blockIdx.x * blockDim.x; 23 | c[index] = a[index] + b[index]; 24 | } 25 | 26 | #define N (2048*2048) 27 | #define THREADS_PER_BLOCK 512 28 | 29 | int main() 30 | { 31 | int *a, *b, *c; 32 | int *d_a, *d_b, *d_c; 33 | int size = N * sizeof( int ); 34 | 35 | /* get GPU device number and name */ 36 | 37 | int dev; 38 | cudaDeviceProp deviceProp; 39 | checkCUDA( cudaGetDevice( &dev ) ); 40 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 41 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 42 | 43 | /* allocate space for device copies of a, b, c */ 44 | 45 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 46 | checkCUDA( cudaMalloc( (void **) &d_b, size ) ); 47 | checkCUDA( cudaMalloc( (void **) &d_c, size ) ); 48 | 49 | /* allocate space for host copies of a, b, c and setup input values */ 50 | 51 | a = (int *)malloc( size ); 52 | b = (int *)malloc( size ); 53 | c = (int *)malloc( size ); 54 | 55 | for( int i = 0; i < N; i++ ) 56 | { 57 | a[i] = b[i] = i; 58 | c[i] = 0; 59 | } 60 | 61 | /* copy inputs to device */ 62 | 63 | checkCUDA( cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice ) ); 64 | checkCUDA( cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice ) ); 65 | 66 | /* zero out the C array */ 67 | 68 | checkCUDA( cudaMemset( d_c, 0, size ) ); 69 | 70 | /* launch the kernel on the GPU */ 71 | /* insert the launch parameters to launch properly using blocks and threads */ 72 | add<<< FIXME, FIXME >>>( d_a, d_b, d_c ); 73 | checkKERNEL() 74 | 75 | /* copy result back to host */ 76 | 77 | checkCUDA( cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost ) ); 78 | 79 | int success = 1; 80 | 81 | for( int i = 0; i < N; i++ ) 82 | { 83 | if( c[i] != a[i] + b[i] ) 84 | { 85 | printf("c[%d] = %d\n",i,c[i] ); 86 | success = 0; 87 | break; 88 | } /* end if */ 89 | } 90 | 91 | if( success == 1 ) printf("PASS\n"); 92 | else 
printf("FAIL\n"); 93 | 94 | /* clean up */ 95 | 96 | free(a); 97 | free(b); 98 | free(c); 99 | checkCUDA( cudaFree( d_a ) ); 100 | checkCUDA( cudaFree( d_b ) ); 101 | checkCUDA( cudaFree( d_c ) ); 102 | 103 | checkCUDA( cudaDeviceReset() ); 104 | 105 | return 0; 106 | } /* end main */ 107 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/README.md: -------------------------------------------------------------------------------- 1 | SVM Email Spam Filter 2 | ===================== 3 | 4 | This is the top-level folder for a challenge problem dealing with the use of 5 | support vector machine (SVM) algorithm used to implement a spam classifier. 6 | 7 | The original idea for this code comes from a Machine Learning Coursera course 8 | taught by Andrew Ng, accessed in 2014 at https://www.coursera.org/course/ml. 9 | The training and test data are taken from his homework example in this course. 10 | He wrote the code in Octave and I changed it to C as well as altered the CPU 11 | and GPU algorithms to more closely align with the algorithm described in [1], 12 | labeled as "Algorithm 1" on page 105, where the working set choice is using the 13 | first order heuristic, also described in [1] and a linear kernel is used. The 14 | general algorithm is the SMO algorithm from Platt [2]. 15 | 16 | The training set is 4000 emails and there are 1899 features (keywords). This 17 | is admittedly a reduced training set and reduced feature size for illustration 18 | purposes only. 19 | 20 | [1] B. C. Catanzaro, N. Sundaram, K. Keutzer, "Fast Support Vector Machine 21 | Training and Classification on Graphics Processors", Proceedings of the 25th 22 | International Conference on Machine Learning, Helsinki, Finland, 2008. 23 | 24 | [2] J. C. Platt, "Fast training of support vector machines using sequential 25 | minimal optimization", Advances in kernel methods: support vector learning, 26 | Cambridge, MA, USA: MIT Press. 
27 | 28 | Instructions 29 | ------------ 30 | 31 | To run the code which trains and then classifies email as spam please do the 32 | following steps. 33 | 34 | 1.) Build the code. Ensure that NVCC is in your path. 35 | 36 | > make 37 | 38 | 2.) Choose an email to be tested. There is one genuine email and three spam 39 | emails to choose from. If you wish to test your own email (either genuine or 40 | spam) put your email as a text file in this directory. Copy/paste only the 41 | text of the email. Please omit the header information as this spam 42 | classifier only cares about the text of the email. 43 | 44 | 3.) Process the email. The email text needs to be processed by stripping out 45 | all non-text elements and then running a stemming algorithm on each resultant 46 | word. This leaves you with just a tokenized email of stemmed words which is 47 | easier to process. When you run this command the stemmed email will be 48 | printed to the screen and a file called "emailVector.txt" will be created 49 | which will be a vector of 0's and 1's depending on whether that specific 50 | feature (word) exists in the email or not. 51 | 52 | > sh processEmail.sh 53 | 54 | 4.) Train the SVM and classify your email. In this step the SVM will be first 55 | be trained against a training set of size 4000. Then it will be tested for 56 | accuracy against this set. Then it will be tested against a test set of size 57 | 1000. Both of these accuracies should be over 98%. Finally the SVM will 58 | classify your input email and either classify it as spam (1) or NOT spam (0). 
59 | 60 | > ./x.train 61 | Prediction success rate on training set is 99.750000 62 | Prediction success rate on test set is 98.200000 63 | Email test results 1 is SPAM 0 is NOT SPAM 64 | File Name emailVector.txt, classification 0 NOT SPAM 65 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/README.md: -------------------------------------------------------------------------------- 1 | SVM Email Spam Filter 2 | ===================== 3 | 4 | This is the top-level folder for a challenge problem dealing with the use of 5 | support vector machine (SVM) algorithm used to implement a spam classifier. 6 | 7 | The original idea for this code comes from a Machine Learning Coursera course 8 | taught by Andrew Ng, accessed in 2014 at https://www.coursera.org/course/ml. 9 | The training and test data are taken from his homework example in this course. 10 | He wrote the code in Octave and I changed it to C as well as altered the CPU 11 | and GPU algorithms to more closely align with the algorithm described in [1], 12 | labeled as "Algorithm 1" on page 105, where the working set choice is using the 13 | first order heuristic, also described in [1] and a linear kernel is used. The 14 | general algorithm is the SMO algorithm from Platt [2]. 15 | 16 | The training set is 4000 emails and there are 1899 features (keywords). This 17 | is admittedly a reduced training set and reduced feature size for illustration 18 | purposes only. 19 | 20 | [1] B. C. Catanzaro, N. Sundaram, K. Keutzer, "Fast Support Vector Machine 21 | Training and Classification on Graphics Processors", Proceedings of the 25th 22 | International Conference on Machine Learning, Helsinki, Finland, 2008. 23 | 24 | [2] J. C. Platt, "Fast training of support vector machines using sequential 25 | minimal optimization", Advances in kernel methods: support vector learning, 26 | Cambridge, MA, USA: MIT Press. 
27 | 28 | Instructions 29 | ------------ 30 | 31 | To run the code which trains and then classifies email as spam please do the 32 | following steps. 33 | 34 | 1.) Build the code. Ensure that NVCC is in your path. 35 | 36 | > make 37 | 38 | 2.) Choose an email to be tested. There is one genuine email and three spam 39 | emails to choose from. If you wish to test your own email (either genuine or 40 | spam) put your email as a text file in this directory. Copy/paste only the 41 | text of the email. Please omit the header information as this spam 42 | classifier only cares about the text of the email. 43 | 44 | 3.) Process the email. The email text needs to be processed by stripping out 45 | all non-text elements and then running a stemming algorithm on each resultant 46 | word. This leaves you with just a tokenized email of stemmed words which is 47 | easier to process. When you run this command the stemmed email will be 48 | printed to the screen and a file called "emailVector.txt" will be created 49 | which will be a vector of 0's and 1's depending on whether that specific 50 | feature (word) exists in the email or not. 51 | 52 | > sh processEmail.sh 53 | 54 | 4.) Train the SVM and classify your email. In this step the SVM will be first 55 | be trained against a training set of size 4000. Then it will be tested for 56 | accuracy against this set. Then it will be tested against a test set of size 57 | 1000. Both of these accuracies should be over 98%. Finally the SVM will 58 | classify your input email and either classify it as spam (1) or NOT spam (0). 
59 | 60 | > ./x.train 61 | Prediction success rate on training set is 99.750000 62 | Prediction success rate on test set is 98.200000 63 | Email test results 1 is SPAM 0 is NOT SPAM 64 | File Name emailVector.txt, classification 0 NOT SPAM 65 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_threads/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | /* insert correct index so each element is calculated by a different thread */ 23 | c[FIXME] = a[FIXME] + b[FIXME]; 24 | } 25 | 26 | /* experiment with different values for N */ 27 | /* how large can you make it */ 28 | #define N 32 29 | 30 | int main() 31 | { 32 | int *a, *b, *c; 33 | int *d_a, *d_b, *d_c; 34 | int size = N * sizeof( int ); 35 | 36 | /* get GPU device number and name */ 37 | 38 | int dev; 39 | cudaDeviceProp deviceProp; 40 | checkCUDA( cudaGetDevice( &dev ) ); 41 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 42 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 43 | 44 | /* allocate space for device copies of a, b, c */ 45 | 46 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 47 | checkCUDA( cudaMalloc( (void **) &d_b, size ) ); 48 | checkCUDA( cudaMalloc( (void **) &d_c, size ) ); 49 | 50 | /* allocate space for host copies of a, b, c and setup input values */ 51 | 52 | a = (int *)malloc( size ); 53 | b = (int *)malloc( size ); 54 | c = (int *)malloc( size ); 55 | 56 | for( int i = 0; i < N; i++ ) 57 | { 58 | a[i] = b[i] = i; 59 | c[i] = 0; 60 | } 61 | 62 | /* copy inputs to device */ 63 | 64 | checkCUDA( cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice ) ); 65 | checkCUDA( cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice ) ); 66 | 67 | /* zero out C array */ 68 | 69 | checkCUDA( cudaMemset( d_c, 0, size ) ); 70 | 71 | /* launch the kernel on the GPU */ 72 | /* insert correct launch parameters to use 1 block and N threads */ 73 | add<<< FIXME, FIXME >>>( d_a, d_b, d_c ); 74 | checkKERNEL() 75 | 76 | /* copy result back to host */ 77 | 78 | checkCUDA( cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost ) ); 79 | 80 | int success = 1; 81 | 82 | for( int i = 0; i < N; i++ ) 83 | { 84 | printf("c[%d] = %d\n",i,c[i]); 85 | if( c[i] != a[i] + b[i] ) 86 | { 87 | success = 0; 88 | break; 89 | } /* end if */ 90 | } /* 
end for */ 91 | 92 | if( success == 1 ) printf("PASS\n"); 93 | else printf("FAIL\n"); 94 | 95 | /* clean up */ 96 | 97 | free(a); 98 | free(b); 99 | free(c); 100 | checkCUDA( cudaFree( d_a ) ); 101 | checkCUDA( cudaFree( d_b ) ); 102 | checkCUDA( cudaFree( d_c ) ); 103 | 104 | checkCUDA( cudaDeviceReset() ); 105 | 106 | return 0; 107 | } /* end main */ 108 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_thrust/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "../debug.h" 24 | 25 | #define N ( 1 << 27 ) 26 | #define FLOATTYPE_T float 27 | 28 | int main(void) 29 | { 30 | int size = N; 31 | 32 | /* get GPU device number and name */ 33 | 34 | int dev; 35 | cudaDeviceProp deviceProp; 36 | checkCUDA( cudaGetDevice( &dev ) ); 37 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 38 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 39 | 40 | /* create the host array */ 41 | thrust::host_vector h_vec( FIXME ); 42 | 43 | /* generate random numbers on the host */ 44 | for( int i = 0; i < size; i++ ) 45 | { 46 | h_vec[i] = FLOATTYPE_T( rand() ) / ( FLOATTYPE_T (RAND_MAX) + 1.0 ); 47 | if( i % 2 == 0 ) h_vec[i] = -h_vec[i]; 48 | } 49 | 50 | /* transfer data to the device */ 51 | thrust::device_vector d_vec = FIXME; 52 | 53 | /* create timers */ 54 | cudaEvent_t start, stop; 55 | cudaEventCreate(&start); 56 | cudaEventCreate(&stop); 57 | 58 | cudaEventRecord( start, 0 ); 59 | 60 | /* reduce data on the device */ 61 | FLOATTYPE_T devResult = thrust::reduce( FIXME, FIXME ); 62 | 63 | /* stop timers */ 64 | cudaEventRecord( stop, 0 ); 65 | cudaEventSynchronize( stop ); 66 | 67 | float GPUelapsedTime; 68 | cudaEventElapsedTime( &GPUelapsedTime, start, stop ); 69 | 70 | GPUelapsedTime /= 1000.0; 71 | 72 | /* print GPU timing data */ 73 | 74 | printf("Total elements is %d, %f GB\n", size, sizeof(FLOATTYPE_T) * 75 | (double)size * 1.e-9); 76 | printf("GPU total time is %f ms, bandwidth %f GB/s\n", GPUelapsedTime, 77 | sizeof(FLOATTYPE_T)*(double)size / 78 | ( (double)GPUelapsedTime ) * 1.e-9 ); 79 | 80 | /* start CPU timer */ 81 | cudaEventRecord( start, 0 ); 82 | 83 | /* reduce data on host */ 84 | FLOATTYPE_T hostResult = thrust::reduce(h_vec.begin(), h_vec.end() ); 85 | 86 | /* stop timers */ 87 | cudaEventRecord( stop, 0 ); 88 | cudaEventSynchronize( stop ); 89 | 90 | float CPUelapsedTime; 91 | 
cudaEventElapsedTime( &CPUelapsedTime, start, stop ); 92 | CPUelapsedTime /= 1000.0; 93 | 94 | /* print CPU timer */ 95 | 96 | printf("Total elements is %d, %f GB\n", size, sizeof(FLOATTYPE_T) * 97 | (double)size * 1.e-9); 98 | printf("CPU total time is %f ms, bandwidth %f GB/s\n", CPUelapsedTime, 99 | sizeof(FLOATTYPE_T)*(double)size / 100 | ( (double)CPUelapsedTime ) * 1.e-9 ); 101 | 102 | 103 | cudaEventDestroy(start); 104 | cudaEventDestroy(stop); 105 | 106 | /* verify the results */ 107 | 108 | double diff = abs( devResult - hostResult ); 109 | 110 | if( diff / abs(hostResult) < 0.001 ) printf("PASS\n"); 111 | else 112 | { 113 | printf("FAIL\n"); 114 | printf("Error is %f\n", diff / hostResult ); 115 | printf("GPU result is %f, CPU result is %f\n",devResult, hostResult ); 116 | } /* end else */ 117 | 118 | return 0; 119 | } /* end main */ 120 | 121 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_thrust/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "../debug.h" 24 | 25 | #define N ( 1 << 27 ) 26 | #define FLOATTYPE_T float 27 | 28 | int main(void) 29 | { 30 | int size = N; 31 | 32 | /* get GPU device number and name */ 33 | 34 | int dev; 35 | cudaDeviceProp deviceProp; 36 | checkCUDA( cudaGetDevice( &dev ) ); 37 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 38 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 39 | 40 | /* create the host array */ 41 | thrust::host_vector h_vec( size ); 42 | 43 | /* generate random numbers on the host */ 44 | for( int i = 0; i < size; i++ ) 45 | { 46 | h_vec[i] = FLOATTYPE_T( rand() ) / ( FLOATTYPE_T (RAND_MAX) + 1.0 ); 47 | if( i % 2 == 0 ) h_vec[i] = -h_vec[i]; 48 | } 49 | 50 | /* transfer data to the device */ 51 | thrust::device_vector d_vec = h_vec; 52 | 53 | /* create timers */ 54 | cudaEvent_t start, stop; 55 | cudaEventCreate(&start); 56 | cudaEventCreate(&stop); 57 | 58 | cudaEventRecord( start, 0 ); 59 | 60 | /* reduce data on the device */ 61 | FLOATTYPE_T devResult = thrust::reduce( d_vec.begin(), d_vec.end() ); 62 | 63 | /* stop timers */ 64 | cudaEventRecord( stop, 0 ); 65 | cudaEventSynchronize( stop ); 66 | 67 | float GPUelapsedTime; 68 | cudaEventElapsedTime( &GPUelapsedTime, start, stop ); 69 | 70 | GPUelapsedTime /= 1000.0; 71 | 72 | /* print GPU timing data */ 73 | 74 | printf("Total elements is %d, %f GB\n", size, sizeof(FLOATTYPE_T) * 75 | (double)size * 1.e-9); 76 | printf("GPU total time is %f ms, bandwidth %f GB/s\n", GPUelapsedTime, 77 | sizeof(FLOATTYPE_T)*(double)size / 78 | ( (double)GPUelapsedTime ) * 1.e-9 ); 79 | 80 | /* start CPU timer */ 81 | cudaEventRecord( start, 0 ); 82 | 83 | /* reduce data on host */ 84 | FLOATTYPE_T hostResult = thrust::reduce(h_vec.begin(), h_vec.end() ); 85 | 86 | /* stop timers */ 87 | cudaEventRecord( stop, 0 ); 88 | cudaEventSynchronize( stop ); 89 | 90 | float CPUelapsedTime; 91 | 
cudaEventElapsedTime( &CPUelapsedTime, start, stop ); 92 | CPUelapsedTime /= 1000.0; 93 | 94 | /* print CPU timer */ 95 | 96 | printf("Total elements is %d, %f GB\n", size, sizeof(FLOATTYPE_T) * 97 | (double)size * 1.e-9); 98 | printf("CPU total time is %f ms, bandwidth %f GB/s\n", CPUelapsedTime, 99 | sizeof(FLOATTYPE_T)*(double)size / 100 | ( (double)CPUelapsedTime ) * 1.e-9 ); 101 | 102 | 103 | cudaEventDestroy(start); 104 | cudaEventDestroy(stop); 105 | 106 | /* verify the results */ 107 | 108 | double diff = abs( devResult - hostResult ); 109 | 110 | if( diff / abs(hostResult) < 0.001 ) printf("PASS\n"); 111 | else 112 | { 113 | printf("FAIL\n"); 114 | printf("Error is %f\n", diff / hostResult ); 115 | printf("GPU result is %f, CPU result is %f\n",devResult, hostResult ); 116 | } /* end else */ 117 | 118 | return 0; 119 | } /* end main */ 120 | 121 | --------------------------------------------------------------------------------