├── samplespath ├── runit.nvidia-smi ├── exercises ├── cuda │ ├── nn │ │ ├── runit │ │ ├── cpu │ │ │ ├── runit │ │ │ ├── libatlas.a │ │ │ ├── libcblas.a │ │ │ ├── setupData.sh │ │ │ └── Makefile │ │ ├── orig │ │ │ ├── runit │ │ │ └── Makefile │ │ ├── libatlas.a │ │ ├── libcblas.a │ │ ├── setupData.sh │ │ ├── README.md │ │ └── Makefile │ ├── make.common │ ├── hello_world │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── matmul_CPU │ │ ├── runit │ │ └── Makefile │ ├── reduction3 │ │ ├── runit │ │ └── Makefile │ ├── reduction4 │ │ ├── runit │ │ └── Makefile │ ├── simple_add │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── matmul_streams │ │ ├── runit │ │ └── Makefile │ ├── naive_transpose │ │ ├── runit │ │ ├── profile.sh │ │ └── Makefile │ ├── reduction_atomic │ │ ├── runit │ │ └── Makefile │ ├── reduction_naive │ │ ├── runit │ │ └── Makefile │ ├── reduction_thrust │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── simple_stencil │ │ ├── runit │ │ └── Makefile │ ├── smem_transpose │ │ ├── runit │ │ ├── profile.sh │ │ └── Makefile │ ├── simple_add_blocks │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── simple_add_threads │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── reduction_cub_block │ │ ├── runit │ │ └── Makefile │ ├── reduction_cub_device │ │ ├── runit │ │ └── Makefile │ ├── simple_stencil_smem │ │ ├── runit │ │ └── Makefile │ ├── svm_challenge │ │ ├── original │ │ │ ├── README.md │ │ │ ├── Makefile │ │ │ ├── processEmail.sh │ │ │ ├── processEmail.py │ │ │ └── headers.h │ │ ├── libatlas.a │ │ ├── libcblas.a │ │ ├── spamSample3.txt │ │ ├── spamSample2.txt │ │ ├── emailSample1.txt │ │ ├── spamSample1.txt │ │ ├── Makefile │ │ ├── processEmail.sh │ │ ├── processEmail.py │ │ ├── headers.h │ │ └── README.md │ ├── simple_add_blocks_threads │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── matmul_GPU_naive │ │ ├── profile.sh │ │ └── Makefile │ ├── matmul_GPU_shmem1 │ │ ├── profile.sh │ │ └── Makefile │ ├── matmul_GPU_shmem │ │ ├── 
profile.sh │ │ └── Makefile │ ├── thrust_sort │ │ ├── Makefile │ │ └── kernel.cu │ ├── matmul_CUBLAS │ │ └── Makefile │ └── debug.h └── openacc │ ├── 002-laplace2D-data │ ├── runit.acc │ ├── laplace_acc.job │ ├── runit.omp │ ├── laplace_omp.job │ ├── Makefile │ ├── Makefile_f90 │ ├── timer.h │ ├── laplace2d.f90 │ └── laplace2d.c │ └── 001-laplace2D-kernels │ ├── runit.acc │ ├── laplace_acc.job │ ├── runit.omp │ ├── laplace_omp.job │ ├── Makefile │ ├── Makefile_f90 │ ├── timer.h │ ├── laplace2d.f90 │ └── laplace2d.c ├── openaccscript ├── cudascript ├── exercise_solutions ├── cuda │ ├── nn │ │ ├── runit │ │ ├── libatlas.a │ │ ├── libcblas.a │ │ ├── setupData.sh │ │ └── Makefile │ ├── make.common │ ├── hello_world │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── reduction3 │ │ ├── runit │ │ └── Makefile │ ├── reduction4 │ │ ├── runit │ │ └── Makefile │ ├── simple_add │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── matmul_CUBLAS │ │ ├── runit │ │ ├── matmul_CUBLAS.timeline.k20X │ │ ├── profile.sh │ │ └── Makefile │ ├── matmul_streams │ │ ├── runit │ │ ├── matmul_streams.timeline.k20X │ │ ├── profile.sh │ │ └── Makefile │ ├── simple_stencil │ │ ├── runit │ │ └── Makefile │ ├── smem_transpose │ │ ├── runit │ │ ├── profile.sh │ │ ├── smem_transpose.no_conflict.analysis │ │ ├── smem_transpose.no_conflict.timeline │ │ ├── smem_transpose.bank_conflict.analysis │ │ ├── smem_transpose.bank_conflict.timeline │ │ ├── smem_transpose.no_conflict.analysis.k40 │ │ ├── smem_transpose.no_conflict.timeline.k40 │ │ ├── smem_transpose.bank_conflict.analysis.k40 │ │ ├── smem_transpose.bank_conflict.timeline.k40 │ │ ├── smem_transpose.no_conflict.analysis.c2050 │ │ ├── smem_transpose.no_conflict.timeline.c2050 │ │ ├── smem_transpose.bank_conflict.analysis.c2050 │ │ ├── smem_transpose.bank_conflict.timeline.c2050 │ │ └── Makefile │ ├── matmul_GPU_naive │ │ ├── runit │ │ ├── matmul_GPU_naive.analysis │ │ ├── matmul_GPU_naive.timeline │ │ ├── matmul_GPU_naive.analysis.k40 │ │ ├── 
matmul_GPU_naive.timeline.k40 │ │ ├── matmul_GPU_naive.analysis.c2050 │ │ ├── matmul_GPU_naive.timeline.c2050 │ │ ├── profile.sh │ │ └── Makefile │ ├── matmul_GPU_shmem │ │ ├── runit │ │ ├── matmul_GPU_shmem.analysis │ │ ├── matmul_GPU_shmem.timeline │ │ ├── matmul_GPU_shmem.analysis.k40 │ │ ├── matmul_GPU_shmem.timeline.k40 │ │ ├── matmul_GPU_shmem.analysis.c2050 │ │ ├── matmul_GPU_shmem.timeline.c2050 │ │ ├── profile.sh │ │ └── Makefile │ ├── naive_transpose │ │ ├── runit │ │ ├── naive_transpose.analysis │ │ ├── naive_transpose.timeline │ │ ├── naive_transpose.analysis.k40 │ │ ├── naive_transpose.timeline.k40 │ │ ├── naive_transpose.analysis.c2050 │ │ ├── naive_transpose.timeline.c2050 │ │ ├── profile.sh │ │ └── Makefile │ ├── reduction_atomic │ │ ├── runit │ │ └── Makefile │ ├── reduction_naive │ │ ├── runit │ │ └── Makefile │ ├── reduction_thrust │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── smem_transpose_opt │ │ ├── runit │ │ ├── profile.sh │ │ └── Makefile │ ├── matmul_GPU_shmem1 │ │ ├── runit │ │ ├── matmul_GPU_shmem1.analysis │ │ ├── matmul_GPU_shmem1.timeline │ │ ├── matmul_GPU_shmem1.analysis.c2050 │ │ ├── matmul_GPU_shmem1.analysis.k40 │ │ ├── matmul_GPU_shmem1.timeline.c2050 │ │ ├── matmul_GPU_shmem1.timeline.k40 │ │ ├── profile.sh │ │ └── Makefile │ ├── naive_transpose_cutlass │ │ ├── runit │ │ ├── main.cu │ │ ├── naive_transpose.analysis │ │ ├── naive_transpose.timeline │ │ ├── naive_transpose.analysis.k40 │ │ ├── naive_transpose.timeline.k40 │ │ ├── naive_transpose.analysis.c2050 │ │ ├── naive_transpose.timeline.c2050 │ │ ├── profile.sh │ │ └── Makefile │ ├── reduction_cub_block │ │ ├── runit │ │ └── Makefile │ ├── simple_add_blocks │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── simple_add_threads │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── simple_stencil_smem │ │ ├── runit │ │ └── Makefile │ ├── reduction_cub_device │ │ ├── runit │ │ └── Makefile │ ├── svm_challenge │ │ ├── original │ │ │ ├── README.md │ │ │ ├── Makefile 
│ │ │ ├── processEmail.sh │ │ │ ├── processEmail.py │ │ │ └── headers.h │ │ ├── libatlas.a │ │ ├── libcblas.a │ │ ├── spamSample3.txt │ │ ├── spamSample2.txt │ │ ├── emailSample1.txt │ │ ├── spamSample1.txt │ │ ├── Makefile │ │ ├── processEmail.sh │ │ ├── processEmail.py │ │ ├── headers.h │ │ └── README.md │ ├── simple_add_blocks_threads │ │ ├── runit │ │ ├── Makefile │ │ └── kernel.cu │ ├── thrust_sort │ │ ├── Makefile │ │ └── kernel.cu │ ├── matmul_CPU │ │ └── Makefile │ └── debug.h └── openacc │ ├── 002-laplace2D-data │ ├── runit.acc │ ├── laplace_acc.job │ ├── runit.omp │ ├── laplace_omp.job │ ├── Makefile │ ├── Makefile_f90 │ ├── timer.h │ ├── laplace2d.f90 │ └── laplace2d.c │ └── 001-laplace2D-kernels │ ├── laplace_acc.job │ ├── runit.acc │ ├── runit.omp │ ├── laplace_omp.job │ ├── Makefile │ ├── Makefile_f90 │ ├── timer.h │ ├── laplace2d.f90 │ └── laplace2d.c ├── runit.query ├── runit.bandwidth ├── runit.matmul ├── README.md ├── README.cluster └── batch_setup.sh /samplespath: -------------------------------------------------------------------------------- 1 | SAMPLESPATH=. 
2 | -------------------------------------------------------------------------------- /runit.nvidia-smi: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | nvidia-smi 6 | -------------------------------------------------------------------------------- /exercises/cuda/nn/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.nn 6 | -------------------------------------------------------------------------------- /openaccscript: -------------------------------------------------------------------------------- 1 | #PBS -j oe 2 | 3 | module load pgi 4 | cd $PBS_O_WORKDIR 5 | -------------------------------------------------------------------------------- /cudascript: -------------------------------------------------------------------------------- 1 | #PBS -j oe 2 | 3 | module load cuda/5.5.22 4 | cd $PBS_O_WORKDIR 5 | -------------------------------------------------------------------------------- /exercises/cuda/nn/cpu/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.nn 6 | -------------------------------------------------------------------------------- /exercises/cuda/nn/orig/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.nn 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/nn/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.nn 6 | -------------------------------------------------------------------------------- /exercises/cuda/make.common: -------------------------------------------------------------------------------- 1 | ARCH=-arch sm_30 2 | CUB_INCLUDE=../../../../cub-1.4.1 3 | 
-------------------------------------------------------------------------------- /exercises/cuda/hello_world/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.hello_world 6 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_CPU/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.matmul_CPU 6 | -------------------------------------------------------------------------------- /exercises/cuda/reduction3/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction3 6 | -------------------------------------------------------------------------------- /exercises/cuda/reduction4/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction4 6 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/make.common: -------------------------------------------------------------------------------- 1 | ARCH=-arch sm_30 2 | CUB_INCLUDE=../../../../cub-1.4.1 3 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/hello_world/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.hello_world 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction3/runit: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction3 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction4/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction4 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add 6 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_streams/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.matmul_streams 6 | -------------------------------------------------------------------------------- /exercises/cuda/naive_transpose/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.naive_transpose 6 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_atomic/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_atomic 6 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_naive/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_naive 6 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_thrust/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | 
./x.reduction_thrust 6 | -------------------------------------------------------------------------------- /exercises/cuda/simple_stencil/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_stencil 6 | -------------------------------------------------------------------------------- /exercises/cuda/smem_transpose/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.smem_transpose 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_CUBLAS/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.matmul_CUBLAS 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_streams/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.matmul_streams 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_stencil/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_stencil 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.smem_transpose 6 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_blocks/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add_blocks 6 | 
-------------------------------------------------------------------------------- /exercises/cuda/simple_add_threads/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add_threads 6 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/runit.acc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./laplace2d_acc 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.matmul_GPU_naive 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.matmul_GPU_shmem 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.naive_transpose 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_atomic/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_atomic 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_naive/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_naive 6 | 
-------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_thrust/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_thrust 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose_opt/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.smem_transpose 6 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_cub_block/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_cub_block 6 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_cub_device/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_cub_device 6 | -------------------------------------------------------------------------------- /exercises/cuda/simple_stencil_smem/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_stencil_smem 6 | -------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/runit.acc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./laplace2d_acc 6 | -------------------------------------------------------------------------------- /runit.query: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | source ./samplespath 6 | 7 | $SAMPLESPATH/deviceQuery 8 | 
-------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.matmul_GPU_shmem1 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.naive_transpose 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_cub_block/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_cub_block 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_blocks/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add_blocks 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_threads/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add_threads 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_stencil_smem/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_stencil_smem 6 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/runit.acc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./laplace2d_acc 6 | 
-------------------------------------------------------------------------------- /exercises/cuda/nn/libatlas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercises/cuda/nn/libatlas.a -------------------------------------------------------------------------------- /exercises/cuda/nn/libcblas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercises/cuda/nn/libcblas.a -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/original/README.md: -------------------------------------------------------------------------------- 1 | Original Files 2 | ============== 3 | 4 | Original source files 5 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/laplace_acc.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | ./laplace2d_acc 4 | 5 | -------------------------------------------------------------------------------- /runit.bandwidth: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | source ./samplespath 6 | 7 | $SAMPLESPATH/bandwidthTest 8 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_cub_device/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.reduction_cub_device 6 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_blocks_threads/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add_blocks_threads 6 | 
-------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/laplace_acc.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | ./laplace2d_acc 4 | 5 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/original/README.md: -------------------------------------------------------------------------------- 1 | Original Files 2 | ============== 3 | 4 | Original source files 5 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/laplace_acc.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | ./laplace2d_acc 4 | 5 | -------------------------------------------------------------------------------- /exercises/cuda/nn/cpu/libatlas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercises/cuda/nn/cpu/libatlas.a -------------------------------------------------------------------------------- /exercises/cuda/nn/cpu/libcblas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercises/cuda/nn/cpu/libcblas.a -------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/runit.omp: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | OMP_NUM_THREADS=1 ./laplace2d_omp 6 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/runit.omp: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 
#BATCHARGS 4 | 5 | OMP_NUM_THREADS=1 ./laplace2d_omp 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_blocks_threads/runit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | ./x.simple_add_blocks_threads 6 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/laplace_acc.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | ./laplace2d_acc 4 | 5 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/nn/libatlas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/nn/libatlas.a -------------------------------------------------------------------------------- /exercise_solutions/cuda/nn/libcblas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/nn/libcblas.a -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/runit.acc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | PGI_ACC_TIME=1 ./laplace2d_acc 6 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/runit.omp: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | OMP_NUM_THREADS=1 ./laplace2d_omp 6 | -------------------------------------------------------------------------------- 
/exercise_solutions/openacc/002-laplace2D-data/runit.omp: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | OMP_NUM_THREADS=1 ./laplace2d_omp 6 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/libatlas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercises/cuda/svm_challenge/libatlas.a -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/libcblas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercises/cuda/svm_challenge/libcblas.a -------------------------------------------------------------------------------- /runit.matmul: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #BATCHARGS 4 | 5 | source ./samplespath 6 | 7 | $SAMPLESPATH/matrixMul 8 | $SAMPLESPATH/matrixMulCUBLAS 9 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/laplace_omp.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | setenv OMP_NUM_THREADS 6 4 | ./laplace2d_omp 5 | 6 | -------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/laplace_omp.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | setenv OMP_NUM_THREADS 6 4 | ./laplace2d_omp 5 | 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/libatlas.a: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/svm_challenge/libatlas.a -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/libcblas.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/svm_challenge/libcblas.a -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/laplace_omp.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | setenv OMP_NUM_THREADS 6 4 | ./laplace2d_omp 5 | 6 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/laplace_omp.job: -------------------------------------------------------------------------------- 1 | #!/bin/csh 2 | #PBS -l walltime=3:00 3 | setenv OMP_NUM_THREADS 6 4 | ./laplace2d_omp 5 | 6 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/naive_transpose.analysis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose/naive_transpose.analysis -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/naive_transpose.timeline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose/naive_transpose.timeline -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_CUBLAS/matmul_CUBLAS.timeline.k20X: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_CUBLAS/matmul_CUBLAS.timeline.k20X -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.analysis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.analysis -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.timeline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.timeline -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.analysis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.analysis -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.timeline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.timeline -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/spamSample3.txt: -------------------------------------------------------------------------------- 1 | Hello.. 2 | 3 | My name is Wilson, from ICICI BANK HK. 
4 | I have a profitable/confidential deal worth over 48M Dollar to discuss with you. 5 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.analysis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.analysis -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.timeline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.timeline -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_streams/matmul_streams.timeline.k20X: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_streams/matmul_streams.timeline.k20X -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/naive_transpose.analysis.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose/naive_transpose.analysis.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/naive_transpose.timeline.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose/naive_transpose.timeline.k40 -------------------------------------------------------------------------------- 
/exercise_solutions/cuda/naive_transpose_cutlass/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace std; 3 | 4 | int main() { 5 | 6 | cout << "hello world" << endl; 7 | 8 | } /* end main */ 9 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/spamSample3.txt: -------------------------------------------------------------------------------- 1 | Hello.. 2 | 3 | My name is Wilson, from ICICI BANK HK. 4 | I have a profitable/confidential deal worth over 48M Dollar to discuss with you. 5 | -------------------------------------------------------------------------------- /exercises/cuda/smem_transpose/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=smem_transpose 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.analysis.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.analysis.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.timeline.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.timeline.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.analysis.k40: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.analysis.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.timeline.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.timeline.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/naive_transpose.analysis.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose/naive_transpose.analysis.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/naive_transpose.timeline.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose/naive_transpose.timeline.c2050 -------------------------------------------------------------------------------- /exercises/cuda/matmul_GPU_naive/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_GPU_naive 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_GPU_shmem1/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_GPU_shmem1 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | 
-------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.analysis.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.analysis.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.timeline.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_naive/matmul_GPU_naive.timeline.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.analysis.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.analysis.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.timeline.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem/matmul_GPU_shmem.timeline.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.analysis.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.analysis.c2050 -------------------------------------------------------------------------------- 
/exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.analysis.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.analysis.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.timeline.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.timeline.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.timeline.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/matmul_GPU_shmem1/matmul_GPU_shmem1.timeline.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.analysis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.analysis -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.timeline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.timeline -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/profile.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=smem_transpose 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.analysis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.analysis -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.timeline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.timeline -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_GPU_naive 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_GPU_shmem1 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.analysis.k40: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.analysis.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.timeline.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.timeline.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.analysis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.analysis -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.timeline: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.timeline -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose_opt/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=smem_transpose_opt 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.analysis.c2050: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.analysis.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.timeline.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/naive_transpose_cutlass/naive_transpose.timeline.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.analysis.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.analysis.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.timeline.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.timeline.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.analysis.k40: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.analysis.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.timeline.k40: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.timeline.k40 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.analysis.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.analysis.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.timeline.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.no_conflict.timeline.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.analysis.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.analysis.c2050 -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.timeline.c2050: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jnbntz/gpu-edu-workshops/HEAD/exercise_solutions/cuda/smem_transpose/smem_transpose.bank_conflict.timeline.c2050 -------------------------------------------------------------------------------- /exercises/cuda/matmul_GPU_shmem/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_GPU_shmem 4 | 5 | nvprof --output-profile 
$bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | #nvprof --metrics gld_efficiency,gst_efficiency,shared_efficiency,shared_replay_overhead -o $bin.metrics ./x.$bin 8 | -------------------------------------------------------------------------------- /exercises/cuda/naive_transpose/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=naive_transpose 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | #nvprof --metrics gld_efficiency,gst_efficiency,shared_efficiency,shared_replay_overhead -o $bin.metrics ./x.$bin 8 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_CUBLAS/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_CUBLAS 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | #nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | #nvprof --metrics gld_efficiency,gst_efficiency,shared_efficiency,shared_replay_overhead -o $bin.metrics ./x.$bin 8 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/spamSample2.txt: -------------------------------------------------------------------------------- 1 | Best Buy Viagra Generic Online 2 | 3 | Viagra 100mg x 60 Pills $125, Free Pills & Reorder Discount, Top Selling 100% Quality & Satisfaction guaranteed! 4 | 5 | We accept VISA, Master & E-Check Payments, 90000+ Satisfied Customers! 
6 | http://medphysitcstech.ru 7 | 8 | 9 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_GPU_shmem 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | #nvprof --metrics gld_efficiency,gst_efficiency,shared_efficiency,shared_replay_overhead -o $bin.metrics ./x.$bin 8 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_streams/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=matmul_streams 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | #nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | #nvprof --metrics gld_efficiency,gst_efficiency,shared_efficiency,shared_replay_overhead -o $bin.metrics ./x.$bin 8 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=naive_transpose 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | #nvprof --metrics gld_efficiency,gst_efficiency,shared_efficiency,shared_replay_overhead -o $bin.metrics ./x.$bin 8 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bin=naive_transpose 4 | 5 | nvprof --output-profile $bin.timeline ./x.$bin 6 | nvprof --analysis-metrics -o $bin.analysis ./x.$bin 7 | #nvprof --metrics gld_efficiency,gst_efficiency,shared_efficiency,shared_replay_overhead -o 
$bin.metrics ./x.$bin 8 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/spamSample2.txt: -------------------------------------------------------------------------------- 1 | Best Buy Viagra Generic Online 2 | 3 | Viagra 100mg x 60 Pills $125, Free Pills & Reorder Discount, Top Selling 100% Quality & Satisfaction guaranteed! 4 | 5 | We accept VISA, Master & E-Check Payments, 90000+ Satisfied Customers! 6 | http://medphysitcstech.ru 7 | 8 | 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | gpu-edu-workshops 2 | ================= 3 | 4 | Public repository for code that I use to teach hands-on NVIDIA GPU computing workshops. 5 | 6 | 7 | License 8 | ------- 9 | 10 | These examples are released under the Apache 2.0 open source license. Refer to LICENSE in this directory for full details. 11 | 12 | -------------------------------------------------------------------------------- /README.cluster: -------------------------------------------------------------------------------- 1 | There are batch scripts located in each subdirectory. 2 | 3 | Correct "cudascript" and "openaccscript" to add the proper batch arguments as 4 | well as any other "module load" requirements and anything else required before 5 | executing the executable. Something like the following: 6 | 7 | sed -i '/\#BATCHARGS/ r cudascript' runit.nvidia-smi 8 | 9 | or 10 | 11 | find /path/to/dir/ -type f -exec sed -i '/\#BATCHARGS/ r cudascript' {} \; 12 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/emailSample1.txt: -------------------------------------------------------------------------------- 1 | > Anyone knows how much it costs to host a web portal ? 2 | > 3 | Well, it depends on how many visitors you're expecting. 
4 | This can be anywhere from less than 10 bucks a month to a couple of $100. 5 | You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 6 | if youre running something big.. 7 | 8 | To unsubscribe yourself from this mailing list, send an email to: 9 | groupname-unsubscribe@egroups.com 10 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/emailSample1.txt: -------------------------------------------------------------------------------- 1 | > Anyone knows how much it costs to host a web portal ? 2 | > 3 | Well, it depends on how many visitors you're expecting. 4 | This can be anywhere from less than 10 bucks a month to a couple of $100. 5 | You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 6 | if youre running something big.. 7 | 8 | To unsubscribe yourself from this mailing list, send an email to: 9 | groupname-unsubscribe@egroups.com 10 | -------------------------------------------------------------------------------- /batch_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | sed -i '/#BATCHARGS/ r cudascript' runit.nvidia-smi 4 | sed -i '/#BATCHARGS/ r cudascript' runit.query 5 | sed -i '/#BATCHARGS/ r cudascript' runit.matmul 6 | sed -i '/#BATCHARGS/ r cudascript' runit.bandwidth 7 | 8 | find ./exercises/cuda/ -type f -exec sed -i '/\#BATCHARGS/ r cudascript' {} \; 9 | find ./exercise_solutions/cuda/ -type f -exec sed -i '/\#BATCHARGS/ r cudascript' {} \; 10 | 11 | find ./exercises/openacc/ -type f -exec sed -i '/\#BATCHARGS/ r openaccscript' {} \; 12 | find ./exercise_solutions/openacc/ -type f -exec sed -i '/\#BATCHARGS/ r openaccscript' {} \; 13 | -------------------------------------------------------------------------------- /exercises/cuda/nn/setupData.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | wget 
http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz 6 | wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz 7 | wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz 8 | wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz 9 | 10 | gunzip -f train-images-idx3-ubyte.gz 11 | gunzip -f train-labels-idx1-ubyte.gz 12 | gunzip -f t10k-images-idx3-ubyte.gz 13 | gunzip -f t10k-labels-idx1-ubyte.gz 14 | 15 | cc -o mnist mnist.c 16 | 17 | ./mnist -9 -l t10k-labels-idx1-ubyte -i t10k-images-idx3-ubyte > t10k-images.txt 2> t10k-labels.txt 18 | ./mnist -9 -l train-labels-idx1-ubyte -i train-images-idx3-ubyte > train-images.txt 2> train-labels.txt 19 | -------------------------------------------------------------------------------- /exercises/cuda/nn/cpu/setupData.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz 6 | wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz 7 | wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz 8 | wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz 9 | 10 | gunzip -f train-images-idx3-ubyte.gz 11 | gunzip -f train-labels-idx1-ubyte.gz 12 | gunzip -f t10k-images-idx3-ubyte.gz 13 | gunzip -f t10k-labels-idx1-ubyte.gz 14 | 15 | cc -o mnist mnist.c 16 | 17 | ./mnist -9 -l t10k-labels-idx1-ubyte -i t10k-images-idx3-ubyte > t10k-images.txt 2> t10k-labels.txt 18 | ./mnist -9 -l train-labels-idx1-ubyte -i train-images-idx3-ubyte > train-images.txt 2> train-labels.txt 19 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/nn/setupData.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz 6 | wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz 7 
| wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz 8 | wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz 9 | 10 | gunzip -f train-images-idx3-ubyte.gz 11 | gunzip -f train-labels-idx1-ubyte.gz 12 | gunzip -f t10k-images-idx3-ubyte.gz 13 | gunzip -f t10k-labels-idx1-ubyte.gz 14 | 15 | cc -o mnist mnist.c 16 | 17 | ./mnist -9 -l t10k-labels-idx1-ubyte -i t10k-images-idx3-ubyte > t10k-images.txt 2> t10k-labels.txt 18 | ./mnist -9 -l train-labels-idx1-ubyte -i train-images-idx3-ubyte > train-images.txt 2> train-labels.txt 19 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/spamSample1.txt: -------------------------------------------------------------------------------- 1 | Do You Want To Make $1000 Or More Per Week? 2 | 3 | 4 | 5 | If you are a motivated and qualified individual - I 6 | will personally demonstrate to you a system that will 7 | make you $1,000 per week or more! This is NOT mlm. 8 | 9 | 10 | 11 | Call our 24 hour pre-recorded number to get the 12 | details. 13 | 14 | 15 | 16 | 000-456-789 17 | 18 | 19 | 20 | I need people who want to make serious money. Make 21 | the call and get the facts. 22 | 23 | Invest 2 minutes in yourself now! 24 | 25 | 26 | 27 | 000-456-789 28 | 29 | 30 | 31 | Looking forward to your call and I will introduce you 32 | to people like yourself who 33 | are currently making $10,000 plus per week! 34 | 35 | 36 | 37 | 000-456-789 38 | 39 | 40 | 41 | 3484lJGv6-241lEaN9080lRmS6-271WxHo7524qiyT5-438rjUv5615hQcf0-662eiDB9057dMtVl72 42 | 43 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/spamSample1.txt: -------------------------------------------------------------------------------- 1 | Do You Want To Make $1000 Or More Per Week? 
2 | 3 | 4 | 5 | If you are a motivated and qualified individual - I 6 | will personally demonstrate to you a system that will 7 | make you $1,000 per week or more! This is NOT mlm. 8 | 9 | 10 | 11 | Call our 24 hour pre-recorded number to get the 12 | details. 13 | 14 | 15 | 16 | 000-456-789 17 | 18 | 19 | 20 | I need people who want to make serious money. Make 21 | the call and get the facts. 22 | 23 | Invest 2 minutes in yourself now! 24 | 25 | 26 | 27 | 000-456-789 28 | 29 | 30 | 31 | Looking forward to your call and I will introduce you 32 | to people like yourself who 33 | are currently making $10,000 plus per week! 34 | 35 | 36 | 37 | 000-456-789 38 | 39 | 40 | 41 | 3484lJGv6-241lEaN9080lRmS6-271WxHo7524qiyT5-438rjUv5615hQcf0-662eiDB9057dMtVl72 42 | 43 | -------------------------------------------------------------------------------- /exercises/cuda/thrust_sort/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.sort 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/thrust_sort/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.sort 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_CPU/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_CPU 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/hello_world/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | NVCC=nvcc 19 | NVOPTS=$(ARCH) -DDEBUG 20 | 21 | hello_world: kernel.o 22 | $(NVCC) $(NVOPTS) -o x.hello_world kernel.o 23 | 24 | kernel.o: kernel.cu 25 | $(NVCC) $(NVOPTS) -c kernel.cu 26 | 27 | clean: 28 | rm -rf kernel.o x.hello_world 29 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_CPU/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_CPU 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/reduction3/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction3 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/reduction4/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction4 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/hello_world/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | NVCC=nvcc 19 | NVOPTS=$(ARCH) -DDEBUG 20 | 21 | hello_world: kernel.o 22 | $(NVCC) $(NVOPTS) -o x.hello_world kernel.o 23 | 24 | kernel.o: kernel.cu 25 | $(NVCC) $(NVOPTS) -c kernel.cu 26 | 27 | clean: 28 | rm -rf kernel.o x.hello_world 29 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction3/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction3 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction4/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction4 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_atomic/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_atomic 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_naive/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_naive 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_thrust/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_thrust 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_blocks/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add_blocks 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_threads/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add_threads 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/simple_stencil/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_stencil 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_stencil/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_stencil 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_CUBLAS/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_CUBLAS 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_streams/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_streams 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_atomic/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_atomic 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_naive/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_naive 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_thrust/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_thrust 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_threads/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add_threads 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/naive_transpose/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.naive_transpose 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/simple_stencil_smem/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_stencil_smem 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/smem_transpose/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.smem_transpose 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_CUBLAS/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_CUBLAS 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_streams/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_streams 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_stencil_smem/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_stencil_smem 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.smem_transpose 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_blocks_threads/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add_blocks_threads 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.naive_transpose 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_blocks/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add_blocks 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_GPU_naive/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_GPU_naive 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_GPU_shmem/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_GPU_shmem 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/naive_transpose_cutlass/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2024 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.naive_transpose 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_blocks_threads/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.simple_add_blocks_threads 19 | NVCC=nvcc 20 | NVOPTS=$(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/smem_transpose_opt/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.smem_transpose_opt 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/matmul_GPU_shmem1/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_GPU_shmem1 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_cub_block/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_cub_block 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu -I$(CUB_INCLUDE) 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_cub_device/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_cub_device 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu -I$(CUB_INCLUDE) 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_naive/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_GPU_naive 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_GPU_shmem 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/matmul_GPU_shmem1/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include ../make.common 18 | BIN=x.matmul_GPU_shmem1 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -lineinfo -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o -lcublas 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_cub_block/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_cub_block 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu -I$(CUB_INCLUDE) 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_cub_device/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include ../make.common 18 | BIN=x.reduction_cub_device 19 | NVCC=nvcc 20 | NVOPTS=-O3 $(ARCH) -DDEBUG 21 | 22 | $(BIN): kernel.o 23 | $(NVCC) $(NVOPTS) -o $(BIN) kernel.o 24 | 25 | kernel.o: kernel.cu 26 | $(NVCC) $(NVOPTS) -c kernel.cu -I$(CUB_INCLUDE) 27 | 28 | clean: 29 | rm -rf kernel.o $(BIN) 30 | -------------------------------------------------------------------------------- /exercises/cuda/hello_world/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include <stdio.h> 18 | #include "../debug.h" 19 | 20 | __global__ void mykernel(){ 21 | printf("Hello world from device!\n"); 22 | } /* end kernel */ 23 | 24 | int main(void) 25 | { 26 | mykernel<<<1,1>>>(); 27 | checkKERNEL() 28 | printf("Hello World from Host\n"); 29 | return 0; 30 | } /* end main */ 31 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/hello_world/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <stdio.h> 18 | #include "../debug.h" 19 | 20 | __global__ void mykernel(){ 21 | printf("Hello world from device!\n"); 22 | } /* end kernel */ 23 | 24 | int main(void) 25 | { 26 | mykernel<<<1,1>>>(); 27 | checkKERNEL() 28 | printf("Hello World from Host\n"); 29 | return 0; 30 | } /* end main */ 31 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC = pgcc 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = -acc -ta=nvidia -Minfo=accel 20 | OMPFLAGS = -fast -mp -Minfo 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.c 27 | $(CC) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.c 30 | $(CC) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercises/cuda/nn/README.md: -------------------------------------------------------------------------------- 1 | Instructions 2 | ------------ 3 | 4 | To run the code which trains and then classifies a handwritten digit do the 5 | following steps. 6 | 7 | 1.) Grab the MNIST files from the Yann Lecun's website. 8 | 9 | > sh setupData.sh 10 | 11 | 2.) Build the code. Ensure that nvcc is in your path. 12 | 13 | > make 14 | 15 | 3.) Run the code. In this step the network will be trained on the 60,000 16 | images from MNIST then compared against 10,000 test images. 
17 | 18 | ./x.nn 19 | 20 | Learning rate lambda is 3.000e-01 21 | Batchsize is 50 22 | Number of iterations is 1 23 | Hidden Layer Size is 25 24 | Number of training examples 60000 25 | Number of features/pixels per example 784 26 | Number of test examples 10000 27 | | 28 | Total time for training is 1.277e+00 sec 29 | Total correct on training set is 48960 30 | Prediction rate of training set is 81.600 31 | Total correct on test set is 8214 32 | Prediction rate of test set is 82.140 33 | 34 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | CC = pgcc 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = -acc -ta=nvidia -Minfo=accel 20 | OMPFLAGS = -fast -mp -Minfo 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.c 27 | $(CC) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.c 30 | $(CC) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | CC = pgcc 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = -acc -ta=nvidia -Minfo=accel 20 | OMPFLAGS = -fast -mp -Minfo 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.c 27 | $(CC) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.c 30 | $(CC) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | CC = pgcc 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = FIXME# add OpenACC compiler options here 20 | OMPFLAGS = -fast -mp -Minfo 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.c 27 | $(CC) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.c 30 | $(CC) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/Makefile_f90: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | F90 = pgf90 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = -acc -ta=nvidia -Minfo=accel 20 | OMPFLAGS = -fast -mp -Minfo 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.f90 27 | $(F90) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.f90 30 | $(F90) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/Makefile_f90: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | F90 = pgf90 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = FIXME# add OpenACC compiler options here 20 | OMPFLAGS = -fast -mp -Minfo 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.f90 27 | $(F90) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.f90 30 | $(F90) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercises/cuda/nn/cpu/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc -Xcompiler=-O3 19 | LIBS=-L. -lcblas -latlas 20 | INC=-I. 
21 | 22 | include ../../make.common 23 | 24 | all: x.nn 25 | 26 | x.nn: main.o auxiliary.o 27 | $(NVCC) $(ARCH) -o x.nn main.o auxiliary.o $(LIBS) 28 | 29 | main.o: main.cu headers.h 30 | $(NVCC) $(ARCH) -c main.cu $(INC) 31 | 32 | auxiliary.o: auxiliary.cu headers.h 33 | $(NVCC) $(ARCH) -c auxiliary.cu $(INC) 34 | 35 | clean: 36 | rm -rf *.o 37 | rm -rf x.* 38 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/Makefile_f90: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | F90 = pgf90 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = -acc -ta=nvidia -Minfo=accel -Mpreprocess 20 | OMPFLAGS = -fast -mp -Minfo -Mpreprocess 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.f90 27 | $(F90) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.f90 30 | $(F90) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/Makefile_f90: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | F90 = pgf90 18 | CCFLAGS = -tp sandybridge-64 19 | ACCFLAGS = -acc -ta=nvidia -Minfo=accel -Mpreprocess 20 | OMPFLAGS = -fast -mp -Minfo -Mpreprocess 21 | 22 | BIN = laplace2d_omp laplace2d_acc 23 | 24 | all: $(BIN) 25 | 26 | laplace2d_acc: laplace2d.f90 27 | $(F90) $(CCFLAGS) $(ACCFLAGS) -o $@ $< 28 | 29 | laplace2d_omp: laplace2d.f90 30 | $(F90) $(CCFLAGS) $(OMPFLAGS) -o $@ $< 31 | 32 | clean: 33 | $(RM) $(BIN) 34 | -------------------------------------------------------------------------------- /exercises/cuda/nn/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc 19 | NVOPTS=-Xcompiler=-O3 -lineinfo $(ARCH) -DDEBUG 20 | LIBS=-L. -lcblas -latlas -lcublas 21 | INC=-I. 
22 | 23 | include ../make.common 24 | 25 | all: x.nn 26 | 27 | x.nn: main.o auxiliary.o 28 | $(NVCC) $(NVOPTS) -o x.nn main.o auxiliary.o $(LIBS) 29 | 30 | main.o: main.cu headers.h 31 | $(NVCC) $(NVOPTS) -c main.cu $(INC) 32 | 33 | auxiliary.o: auxiliary.cu headers.h 34 | $(NVCC) $(NVOPTS) -c auxiliary.cu $(INC) 35 | 36 | clean: 37 | rm -rf *.o 38 | rm -rf x.* 39 | -------------------------------------------------------------------------------- /exercises/cuda/nn/orig/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc 19 | NVOPTS=-Xcompiler=-O3 -lineinfo $(ARCH) -DDEBUG 20 | LIBS=-L. -lcblas -latlas -lcublas 21 | INC=-I. 
22 | 23 | include ../make.common 24 | 25 | all: x.nn 26 | 27 | x.nn: main.o auxiliary.o 28 | $(NVCC) $(NVOPTS) -o x.nn main.o auxiliary.o $(LIBS) 29 | 30 | main.o: main.cu headers.h 31 | $(NVCC) $(NVOPTS) -c main.cu $(INC) 32 | 33 | auxiliary.o: auxiliary.cu headers.h 34 | $(NVCC) $(NVOPTS) -c auxiliary.cu $(INC) 35 | 36 | clean: 37 | rm -rf *.o 38 | rm -rf x.* 39 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/nn/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc 19 | NVOPTS=-Xcompiler=-O3 -lineinfo $(ARCH) -DDEBUG 20 | LIBS=-L. -lcblas -latlas -lcublas 21 | INC=-I. 
22 | 23 | include ../make.common 24 | 25 | all: x.nn 26 | 27 | x.nn: main.o auxiliary.o 28 | $(NVCC) $(NVOPTS) -o x.nn main.o auxiliary.o $(LIBS) 29 | 30 | main.o: main.cu headers.h 31 | $(NVCC) $(NVOPTS) -c main.cu $(INC) 32 | 33 | auxiliary.o: auxiliary.cu headers.h 34 | $(NVCC) $(NVOPTS) -c auxiliary.cu $(INC) 35 | 36 | clean: 37 | rm -rf *.o 38 | rm -rf x.* 39 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc 19 | LIBS=-L. -lcblas -latlas 20 | INC=-I. 
21 | 22 | include ../make.common 23 | 24 | all: x.train x.porterStemmer 25 | 26 | x.train: main.o auxiliary.o 27 | $(NVCC) $(ARCH) -o x.train main.o auxiliary.o $(LIBS) 28 | 29 | main.o: main.cu headers.h 30 | $(NVCC) $(ARCH) -c main.cu $(INC) 31 | 32 | auxiliary.o: auxiliary.cu headers.h 33 | $(NVCC) $(ARCH) -c auxiliary.cu $(INC) 34 | 35 | x.porterStemmer: porterStemmer.o 36 | $(CC) -o x.porterStemmer porterStemmer.o 37 | 38 | porterStemmer.o: porterStemmer.c 39 | $(CC) -c porterStemmer.c -o porterStemmer.o 40 | 41 | clean: 42 | rm -rf *.o 43 | rm -rf x.* 44 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/original/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc 19 | LIBS=-L. -lcblas -latlas 20 | INC=-I. 
21 | 22 | include ../make.common 23 | 24 | all: x.train x.porterStemmer 25 | 26 | x.train: main.o auxiliary.o 27 | $(NVCC) $(ARCH) -o x.train main.o auxiliary.o $(LIBS) 28 | 29 | main.o: main.cu headers.h 30 | $(NVCC) $(ARCH) -c main.cu $(INC) 31 | 32 | auxiliary.o: auxiliary.cu headers.h 33 | $(NVCC) $(ARCH) -c auxiliary.cu $(INC) 34 | 35 | x.porterStemmer: porterStemmer.o 36 | $(CC) -o x.porterStemmer porterStemmer.o 37 | 38 | porterStemmer.o: porterStemmer.c 39 | $(CC) -c porterStemmer.c -o porterStemmer.o 40 | 41 | clean: 42 | rm -rf *.o 43 | rm -rf x.* 44 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc -lineinfo 19 | LIBS=-L. -lcblas -latlas -lcublas 20 | INC=-I. 
21 | 22 | include ../make.common 23 | 24 | all: x.train x.porterStemmer 25 | 26 | x.train: main.o auxiliary.o 27 | $(NVCC) $(ARCH) -o x.train main.o auxiliary.o $(LIBS) 28 | 29 | main.o: main.cu headers.h kernels.h 30 | $(NVCC) $(ARCH) -c main.cu $(INC) 31 | 32 | auxiliary.o: auxiliary.cu headers.h kernels.h 33 | $(NVCC) $(ARCH) -c auxiliary.cu $(INC) 34 | 35 | x.porterStemmer: porterStemmer.o 36 | $(CC) -o x.porterStemmer porterStemmer.o 37 | 38 | porterStemmer.o: porterStemmer.c 39 | $(CC) -c porterStemmer.c -o porterStemmer.o 40 | 41 | clean: 42 | rm -rf *.o 43 | rm -rf x.* 44 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/original/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CC=gcc 18 | NVCC=nvcc 19 | LIBS=-L. -lcblas -latlas -lcublas 20 | INC=-I. 
21 | 22 | include ../make.common 23 | 24 | all: x.train x.porterStemmer 25 | 26 | x.train: main.o auxiliary.o 27 | $(NVCC) $(ARCH) -o x.train main.o auxiliary.o $(LIBS) 28 | 29 | main.o: main.cu headers.h kernels.h 30 | $(NVCC) $(ARCH) -c main.cu $(INC) 31 | 32 | auxiliary.o: auxiliary.cu headers.h kernels.h 33 | $(NVCC) $(ARCH) -c auxiliary.cu $(INC) 34 | 35 | x.porterStemmer: porterStemmer.o 36 | $(CC) -o x.porterStemmer porterStemmer.o 37 | 38 | porterStemmer.o: porterStemmer.c 39 | $(CC) -c porterStemmer.c -o porterStemmer.o 40 | 41 | clean: 42 | rm -rf *.o 43 | rm -rf x.* 44 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/processEmail.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2017 NVIDIA Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | if [ "$#" -ne 1 ]; then 20 | echo "illegal number of arguments" 21 | echo "Usage: $0 " 22 | exit 23 | fi 24 | 25 | if [ ! 
-f "$1" ]; then 26 | echo "$1 is not a file" 27 | exit 28 | fi 29 | 30 | inputEmail=$1 31 | 32 | #cat $inputEmail | awk '{print tolower($0)}' 33 | 34 | awk '{print tolower($0)}' $inputEmail | \ 35 | awk '{print gensub(/[[:digit:]]+/,"number","g")}' | \ 36 | awk '{print gensub(/(http|https)\:\/\/[[:graph:]]*/,"httpaddr","g")}' | \ 37 | awk '{print gensub(/[[:graph:]]+@[[:graph:]]+/,"emailaddr","g")}' | \ 38 | awk '{print gensub(/[$]+/,"dollar","g")}' | \ 39 | awk '{print gensub(/([^[:alnum:]|^[:blank:]])/,"","g")}' | \ 40 | awk 'NF > 0' > qwerty.txt 41 | 42 | ./x.porterStemmer qwerty.txt 43 | 44 | python processEmail.py > emailVector.txt 45 | 46 | rm -f qwerty.txt 47 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/processEmail.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2017 NVIDIA Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | if [ "$#" -ne 1 ]; then 20 | echo "illegal number of arguments" 21 | echo "Usage: $0 " 22 | exit 23 | fi 24 | 25 | if [ ! 
-f "$1" ]; then 26 | echo "$1 is not a file" 27 | exit 28 | fi 29 | 30 | inputEmail=$1 31 | 32 | #cat $inputEmail | awk '{print tolower($0)}' 33 | 34 | awk '{print tolower($0)}' $inputEmail | \ 35 | awk '{print gensub(/[[:digit:]]+/,"number","g")}' | \ 36 | awk '{print gensub(/(http|https)\:\/\/[[:graph:]]*/,"httpaddr","g")}' | \ 37 | awk '{print gensub(/[[:graph:]]+@[[:graph:]]+/,"emailaddr","g")}' | \ 38 | awk '{print gensub(/[$]+/,"dollar","g")}' | \ 39 | awk '{print gensub(/([^[:alnum:]|^[:blank:]])/,"","g")}' | \ 40 | awk 'NF > 0' > qwerty.txt 41 | 42 | ./x.porterStemmer qwerty.txt 43 | 44 | python processEmail.py > emailVector.txt 45 | 46 | rm -f qwerty.txt 47 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/original/processEmail.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2017 NVIDIA Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | if [ "$#" -ne 1 ]; then 20 | echo "illegal number of arguments" 21 | echo "Usage: $0 " 22 | exit 23 | fi 24 | 25 | if [ ! 
-f "$1" ]; then 26 | echo "$1 is not a file" 27 | exit 28 | fi 29 | 30 | inputEmail=$1 31 | 32 | #cat $inputEmail | awk '{print tolower($0)}' 33 | 34 | awk '{print tolower($0)}' $inputEmail | \ 35 | awk '{print gensub(/[[:digit:]]+/,"number","g")}' | \ 36 | awk '{print gensub(/(http|https)\:\/\/[[:graph:]]*/,"httpaddr","g")}' | \ 37 | awk '{print gensub(/[[:graph:]]+@[[:graph:]]+/,"emailaddr","g")}' | \ 38 | awk '{print gensub(/[$]+/,"dollar","g")}' | \ 39 | awk '{print gensub(/([^[:alnum:]|^[:blank:]])/,"","g")}' | \ 40 | awk 'NF > 0' > qwerty.txt 41 | 42 | ./x.porterStemmer qwerty.txt 43 | 44 | python processEmail.py > emailVector.txt 45 | 46 | rm -f qwerty.txt 47 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/original/processEmail.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2017 NVIDIA Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | if [ "$#" -ne 1 ]; then 20 | echo "illegal number of arguments" 21 | echo "Usage: $0 " 22 | exit 23 | fi 24 | 25 | if [ ! 
-f "$1" ]; then 26 | echo "$1 is not a file" 27 | exit 28 | fi 29 | 30 | inputEmail=$1 31 | 32 | #cat $inputEmail | awk '{print tolower($0)}' 33 | 34 | awk '{print tolower($0)}' $inputEmail | \ 35 | awk '{print gensub(/[[:digit:]]+/,"number","g")}' | \ 36 | awk '{print gensub(/(http|https)\:\/\/[[:graph:]]*/,"httpaddr","g")}' | \ 37 | awk '{print gensub(/[[:graph:]]+@[[:graph:]]+/,"emailaddr","g")}' | \ 38 | awk '{print gensub(/[$]+/,"dollar","g")}' | \ 39 | awk '{print gensub(/([^[:alnum:]|^[:blank:]])/,"","g")}' | \ 40 | awk 'NF > 0' > qwerty.txt 41 | 42 | ./x.porterStemmer qwerty.txt 43 | 44 | python processEmail.py > emailVector.txt 45 | 46 | rm -f qwerty.txt 47 | -------------------------------------------------------------------------------- /exercises/cuda/debug.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | /* CUDA debugging */ 18 | 19 | #ifdef DEBUG 20 | #define checkCUDA(F) if( (F) != cudaSuccess ) \ 21 | {printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \ 22 | __FILE__,__LINE__); exit(-1);} 23 | 24 | #define checkKERNEL() if( (cudaPeekAtLastError()) != cudaSuccess ) \ 25 | {printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \ 26 | __FILE__,__LINE__-1); exit(-1);} \ 27 | if( (cudaDeviceSynchronize()) != cudaSuccess ) \ 28 | {printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \ 29 | __FILE__,__LINE__); exit(-1);} 30 | 31 | #define checkCUBLAS(F) if( (F) != CUBLAS_STATUS_SUCCESS ) \ 32 | {printf("Error %d at %s:%d\n", F, \ 33 | __FILE__,__LINE__); exit(-1);} 34 | 35 | #else 36 | 37 | #define checkCUDA(F) (F) 38 | #define checkKERNEL() 39 | #define checkCUBLAS(F) (F) 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/debug.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | /* CUDA debugging */ 18 | 19 | #ifdef DEBUG 20 | #define checkCUDA(F) if( (F) != cudaSuccess ) \ 21 | {printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \ 22 | __FILE__,__LINE__); exit(-1);} 23 | 24 | #define checkKERNEL() if( (cudaPeekAtLastError()) != cudaSuccess ) \ 25 | {printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \ 26 | __FILE__,__LINE__-1); exit(-1);} \ 27 | if( (cudaDeviceSynchronize()) != cudaSuccess ) \ 28 | {printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \ 29 | __FILE__,__LINE__); exit(-1);} 30 | 31 | #define checkCUBLAS(F) if( (F) != CUBLAS_STATUS_SUCCESS ) \ 32 | {printf("Error %d at %s:%d\n", F, \ 33 | __FILE__,__LINE__); exit(-1);} 34 | 35 | #else 36 | 37 | #define checkCUDA(F) (F) 38 | #define checkKERNEL() 39 | #define checkCUBLAS(F) (F) 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/processEmail.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | # read in the vocab list 18 | 19 | f = open('vocab_formatted.txt') 20 | vocab = [line.strip() for line in f] 21 | f.close() 22 | 23 | # read in the email line by line 24 | 25 | email = [] 26 | 27 | f = open('qwerty.txt') 28 | [email.extend(line.strip().split()) for line in f] 29 | f.close 30 | 31 | #print email 32 | 33 | # check each word of the email against the vocab list and build 34 | # up an array of word indices. The index is the location of the word 35 | # in the vocab list 36 | 37 | wordIndices = [] 38 | 39 | for i in email: 40 | if i in vocab: 41 | wordIndices.append(vocab.index(i)) 42 | 43 | #print wordIndices 44 | 45 | # feature vector length is equal to length of vocabulary list 46 | 47 | vecLength = len(vocab) 48 | featureVector = [0] * vecLength 49 | 50 | for i in wordIndices: 51 | featureVector[i] = 1 52 | 53 | #print len(featureVector) 54 | #print sum(featureVector) 55 | #print featureVector 56 | for val in featureVector: 57 | print val 58 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/processEmail.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | # read in the vocab list 18 | 19 | f = open('vocab_formatted.txt') 20 | vocab = [line.strip() for line in f] 21 | f.close() 22 | 23 | # read in the email line by line 24 | 25 | email = [] 26 | 27 | f = open('qwerty.txt') 28 | [email.extend(line.strip().split()) for line in f] 29 | f.close 30 | 31 | #print email 32 | 33 | # check each word of the email against the vocab list and build 34 | # up an array of word indices. The index is the location of the word 35 | # in the vocab list 36 | 37 | wordIndices = [] 38 | 39 | for i in email: 40 | if i in vocab: 41 | wordIndices.append(vocab.index(i)) 42 | 43 | #print wordIndices 44 | 45 | # feature vector length is equal to length of vocabulary list 46 | 47 | vecLength = len(vocab) 48 | featureVector = [0] * vecLength 49 | 50 | for i in wordIndices: 51 | featureVector[i] = 1 52 | 53 | #print len(featureVector) 54 | #print sum(featureVector) 55 | #print featureVector 56 | for val in featureVector: 57 | print val 58 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/original/processEmail.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | # read in the vocab list 18 | 19 | f = open('vocab_formatted.txt') 20 | vocab = [line.strip() for line in f] 21 | f.close() 22 | 23 | # read in the email line by line 24 | 25 | email = [] 26 | 27 | f = open('qwerty.txt') 28 | [email.extend(line.strip().split()) for line in f] 29 | f.close 30 | 31 | #print email 32 | 33 | # check each word of the email against the vocab list and build 34 | # up an array of word indices. The index is the location of the word 35 | # in the vocab list 36 | 37 | wordIndices = [] 38 | 39 | for i in email: 40 | if i in vocab: 41 | wordIndices.append(vocab.index(i)) 42 | 43 | #print wordIndices 44 | 45 | # feature vector length is equal to length of vocabulary list 46 | 47 | vecLength = len(vocab) 48 | featureVector = [0] * vecLength 49 | 50 | for i in wordIndices: 51 | featureVector[i] = 1 52 | 53 | #print len(featureVector) 54 | #print sum(featureVector) 55 | #print featureVector 56 | for val in featureVector: 57 | print val 58 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/original/processEmail.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 NVIDIA Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | # read in the vocab list 18 | 19 | f = open('vocab_formatted.txt') 20 | vocab = [line.strip() for line in f] 21 | f.close() 22 | 23 | # read in the email line by line 24 | 25 | email = [] 26 | 27 | f = open('qwerty.txt') 28 | [email.extend(line.strip().split()) for line in f] 29 | f.close 30 | 31 | #print email 32 | 33 | # check each word of the email against the vocab list and build 34 | # up an array of word indices. The index is the location of the word 35 | # in the vocab list 36 | 37 | wordIndices = [] 38 | 39 | for i in email: 40 | if i in vocab: 41 | wordIndices.append(vocab.index(i)) 42 | 43 | #print wordIndices 44 | 45 | # feature vector length is equal to length of vocabulary list 46 | 47 | vecLength = len(vocab) 48 | featureVector = [0] * vecLength 49 | 50 | for i in wordIndices: 51 | featureVector[i] = 1 52 | 53 | #print len(featureVector) 54 | #print sum(featureVector) 55 | #print featureVector 56 | for val in featureVector: 57 | print val 58 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef TIMER_H 18 | #define TIMER_H 19 | 20 | #include 21 | 22 | #ifdef WIN32 23 | #define WIN32_LEAN_AND_MEAN 24 | #include 25 | #else 26 | #include 27 | #endif 28 | 29 | #ifdef WIN32 30 | double PCFreq = 0.0; 31 | __int64 timerStart = 0; 32 | #else 33 | struct timeval timerStart; 34 | #endif 35 | 36 | void StartTimer() 37 | { 38 | #ifdef WIN32 39 | LARGE_INTEGER li; 40 | if(!QueryPerformanceFrequency(&li)) 41 | printf("QueryPerformanceFrequency failed!\n"); 42 | 43 | PCFreq = (double)li.QuadPart/1000.0; 44 | 45 | QueryPerformanceCounter(&li); 46 | timerStart = li.QuadPart; 47 | #else 48 | gettimeofday(&timerStart, NULL); 49 | #endif 50 | } 51 | 52 | // time elapsed in ms 53 | double GetTimer() 54 | { 55 | #ifdef WIN32 56 | LARGE_INTEGER li; 57 | QueryPerformanceCounter(&li); 58 | return (double)(li.QuadPart-timerStart)/PCFreq; 59 | #else 60 | struct timeval timerStop, timerElapsed; 61 | gettimeofday(&timerStop, NULL); 62 | timersub(&timerStop, &timerStart, &timerElapsed); 63 | return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0; 64 | #endif 65 | } 66 | 67 | #endif // TIMER_H 68 | -------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef TIMER_H 18 | #define TIMER_H 19 | 20 | #include 21 | 22 | #ifdef WIN32 23 | #define WIN32_LEAN_AND_MEAN 24 | #include 25 | #else 26 | #include 27 | #endif 28 | 29 | #ifdef WIN32 30 | double PCFreq = 0.0; 31 | __int64 timerStart = 0; 32 | #else 33 | struct timeval timerStart; 34 | #endif 35 | 36 | void StartTimer() 37 | { 38 | #ifdef WIN32 39 | LARGE_INTEGER li; 40 | if(!QueryPerformanceFrequency(&li)) 41 | printf("QueryPerformanceFrequency failed!\n"); 42 | 43 | PCFreq = (double)li.QuadPart/1000.0; 44 | 45 | QueryPerformanceCounter(&li); 46 | timerStart = li.QuadPart; 47 | #else 48 | gettimeofday(&timerStart, NULL); 49 | #endif 50 | } 51 | 52 | // time elapsed in ms 53 | double GetTimer() 54 | { 55 | #ifdef WIN32 56 | LARGE_INTEGER li; 57 | QueryPerformanceCounter(&li); 58 | return (double)(li.QuadPart-timerStart)/PCFreq; 59 | #else 60 | struct timeval timerStop, timerElapsed; 61 | gettimeofday(&timerStop, NULL); 62 | timersub(&timerStop, &timerStart, &timerElapsed); 63 | return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0; 64 | #endif 65 | } 66 | 67 | #endif // TIMER_H 68 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef TIMER_H 18 | #define TIMER_H 19 | 20 | #include 21 | 22 | #ifdef WIN32 23 | #define WIN32_LEAN_AND_MEAN 24 | #include 25 | #else 26 | #include 27 | #endif 28 | 29 | #ifdef WIN32 30 | double PCFreq = 0.0; 31 | __int64 timerStart = 0; 32 | #else 33 | struct timeval timerStart; 34 | #endif 35 | 36 | void StartTimer() 37 | { 38 | #ifdef WIN32 39 | LARGE_INTEGER li; 40 | if(!QueryPerformanceFrequency(&li)) 41 | printf("QueryPerformanceFrequency failed!\n"); 42 | 43 | PCFreq = (double)li.QuadPart/1000.0; 44 | 45 | QueryPerformanceCounter(&li); 46 | timerStart = li.QuadPart; 47 | #else 48 | gettimeofday(&timerStart, NULL); 49 | #endif 50 | } 51 | 52 | // time elapsed in ms 53 | double GetTimer() 54 | { 55 | #ifdef WIN32 56 | LARGE_INTEGER li; 57 | QueryPerformanceCounter(&li); 58 | return (double)(li.QuadPart-timerStart)/PCFreq; 59 | #else 60 | struct timeval timerStop, timerElapsed; 61 | gettimeofday(&timerStop, NULL); 62 | timersub(&timerStop, &timerStart, &timerElapsed); 63 | return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0; 64 | #endif 65 | } 66 | 67 | #endif // TIMER_H 68 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef TIMER_H 18 | #define TIMER_H 19 | 20 | #include 21 | 22 | #ifdef WIN32 23 | #define WIN32_LEAN_AND_MEAN 24 | #include 25 | #else 26 | #include 27 | #endif 28 | 29 | #ifdef WIN32 30 | double PCFreq = 0.0; 31 | __int64 timerStart = 0; 32 | #else 33 | struct timeval timerStart; 34 | #endif 35 | 36 | void StartTimer() 37 | { 38 | #ifdef WIN32 39 | LARGE_INTEGER li; 40 | if(!QueryPerformanceFrequency(&li)) 41 | printf("QueryPerformanceFrequency failed!\n"); 42 | 43 | PCFreq = (double)li.QuadPart/1000.0; 44 | 45 | QueryPerformanceCounter(&li); 46 | timerStart = li.QuadPart; 47 | #else 48 | gettimeofday(&timerStart, NULL); 49 | #endif 50 | } 51 | 52 | // time elapsed in ms 53 | double GetTimer() 54 | { 55 | #ifdef WIN32 56 | LARGE_INTEGER li; 57 | QueryPerformanceCounter(&li); 58 | return (double)(li.QuadPart-timerStart)/PCFreq; 59 | #else 60 | struct timeval timerStop, timerElapsed; 61 | gettimeofday(&timerStop, NULL); 62 | timersub(&timerStop, &timerStart, &timerElapsed); 63 | return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0; 64 | #endif 65 | } 66 | 67 | #endif // TIMER_H 68 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/headers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | /* include the ATLAS headers */ 18 | 19 | extern "C" 20 | { 21 | #include 22 | } 23 | 24 | #include "../debug.h" 25 | 26 | /* choose precision to train and classify. Only float and double are 27 | * currently suppored 28 | */ 29 | 30 | typedef float floatType_t; 31 | 32 | /* macro to convert 2d coords to 1d offset */ 33 | 34 | #define INDX(row,col,ld) (((col) * (ld)) + (row)) 35 | 36 | /* macros for max/min to combine with argmin */ 37 | 38 | #define MYMAX(val,array,i,index) \ 39 | if( array[i] > val ) \ 40 | { \ 41 | val = array[i]; \ 42 | index = i; \ 43 | } \ 44 | 45 | #define MYMIN(val,array,i,index) \ 46 | if( array[i] < val ) \ 47 | { \ 48 | val = array[i]; \ 49 | index = i; \ 50 | } \ 51 | 52 | /* macro to clip values from min to max */ 53 | 54 | #define CLIP(val,min,max) \ 55 | if( (val) < (min) ) val = (min); \ 56 | else if( (val) > (max) ) val = (max); 57 | 58 | /* hardcoded constants for training and test set size and feature 59 | * vector size 60 | */ 61 | 62 | #define FEATURE_VECTOR_SIZE (1899) 63 | #define TRAINING_SET_SIZE (4000) 64 | #define TEST_SET_SIZE (1000) 65 | 66 | /* function defs */ 67 | 68 | void readMatrixFromFile( char *, int *, const int, const int ); 69 | 70 | void svmTrain( floatType_t const *, floatType_t const *, floatType_t const, 71 | const int, const int, 72 | const floatType_t , const int, 73 | floatType_t * ); 74 | 75 | void svmPredict( floatType_t const *, floatType_t const *, 76 | int const, int const, int * ); 77 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/original/headers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* include the ATLAS headers */ 18 | 19 | extern "C" 20 | { 21 | #include 22 | } 23 | 24 | #include "../debug.h" 25 | 26 | /* choose precision to train and classify. Only float and double are 27 | * currently suppored 28 | */ 29 | 30 | typedef float floatType_t; 31 | 32 | /* macro to convert 2d coords to 1d offset */ 33 | 34 | #define INDX(row,col,ld) (((col) * (ld)) + (row)) 35 | 36 | /* macros for max/min to combine with argmin */ 37 | 38 | #define MYMAX(val,array,i,index) \ 39 | if( array[i] > val ) \ 40 | { \ 41 | val = array[i]; \ 42 | index = i; \ 43 | } \ 44 | 45 | #define MYMIN(val,array,i,index) \ 46 | if( array[i] < val ) \ 47 | { \ 48 | val = array[i]; \ 49 | index = i; \ 50 | } \ 51 | 52 | /* macro to clip values from min to max */ 53 | 54 | #define CLIP(val,min,max) \ 55 | if( (val) < (min) ) val = (min); \ 56 | else if( (val) > (max) ) val = (max); 57 | 58 | /* hardcoded constants for training and test set size and feature 59 | * vector size 60 | */ 61 | 62 | #define FEATURE_VECTOR_SIZE (1899) 63 | #define TRAINING_SET_SIZE (4000) 64 | #define TEST_SET_SIZE (1000) 65 | 66 | /* function defs */ 67 | 68 | void readMatrixFromFile( char *, int *, const int, const int ); 69 | 70 | void svmTrain( floatType_t const *, floatType_t const *, floatType_t const, 71 | const int, const int, 72 | const floatType_t , const int, 73 | floatType_t * ); 74 | 75 | void svmPredict( floatType_t const *, floatType_t const *, 76 | int const, int const, int * ); 77 | 
-------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/laplace2d.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright 2012 NVIDIA Corporation 3 | ! 4 | ! Licensed under the Apache License, Version 2.0 (the "License"); 5 | ! you may not use this file except in compliance with the License. 6 | ! You may obtain a copy of the License at 7 | ! 8 | ! http://www.apache.org/licenses/LICENSE-2.0 9 | ! 10 | ! Unless required by applicable law or agreed to in writing, software 11 | ! distributed under the License is distributed on an "AS IS" BASIS, 12 | ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ! See the License for the specific language governing permissions and 14 | ! limitations under the License. 15 | ! 16 | 17 | program laplace 18 | implicit none 19 | integer, parameter :: fp_kind=kind(1.0d0) 20 | integer, parameter :: n=4096, m=4096, iter_max=1000 21 | integer :: i, j, iter 22 | real(fp_kind), dimension (:,:), allocatable :: A, Anew 23 | real(fp_kind) :: tol=1.0e-6_fp_kind, error=1.0_fp_kind 24 | real(fp_kind) :: start_time, stop_time 25 | 26 | allocate ( A(0:n-1,0:m-1), Anew(0:n-1,0:m-1) ) 27 | 28 | A = 0.0_fp_kind 29 | Anew = 0.0_fp_kind 30 | 31 | ! Set B.C. 32 | A(0,:) = 1.0_fp_kind 33 | Anew(0,:) = 1.0_fp_kind 34 | 35 | write(*,'(a,i5,a,i5,a)') 'Jacobi relaxation Calculation:', n, ' x', m, ' mesh' 36 | 37 | call cpu_time(start_time) 38 | 39 | iter=0 40 | 41 | do while ( error .gt. tol .and. iter .lt. 
iter_max ) 42 | error=0.0_fp_kind 43 | 44 | !$omp parallel do shared(m, n, Anew, A) reduction( max:error ) 45 | do j=1,m-2 46 | do i=1,n-2 47 | Anew(i,j) = 0.25_fp_kind * ( A(i+1,j ) + A(i-1,j ) + & 48 | A(i ,j-1) + A(i ,j+1) ) 49 | error = max( error, abs(Anew(i,j)-A(i,j)) ) 50 | end do 51 | end do 52 | !$omp end parallel do 53 | 54 | if(mod(iter,100).eq.0 ) write(*,'(i5,f10.6)'), iter, error 55 | iter = iter + 1 56 | 57 | !$omp parallel do shared(m, n, Anew, A) 58 | do j=1,m-2 59 | do i=1,n-2 60 | A(i,j) = Anew(i,j) 61 | end do 62 | end do 63 | !$omp end parallel do 64 | 65 | end do 66 | 67 | call cpu_time(stop_time) 68 | write(*,'(a,f10.3,a)') ' completed in ', stop_time-start_time, ' seconds' 69 | 70 | deallocate (A,Anew) 71 | end program laplace 72 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/laplace2d.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright 2012 NVIDIA Corporation 3 | ! 4 | ! Licensed under the Apache License, Version 2.0 (the "License"); 5 | ! you may not use this file except in compliance with the License. 6 | ! You may obtain a copy of the License at 7 | ! 8 | ! http://www.apache.org/licenses/LICENSE-2.0 9 | ! 10 | ! Unless required by applicable law or agreed to in writing, software 11 | ! distributed under the License is distributed on an "AS IS" BASIS, 12 | ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ! See the License for the specific language governing permissions and 14 | ! limitations under the License. 15 | ! 
16 | 17 | program laplace 18 | implicit none 19 | integer, parameter :: fp_kind=kind(1.0d0) 20 | integer, parameter :: n=4096, m=4096, iter_max=1000 21 | integer :: i, j, iter 22 | real(fp_kind), dimension (:,:), allocatable :: A, Anew 23 | real(fp_kind) :: tol=1.0e-6_fp_kind, error=1.0_fp_kind 24 | real(fp_kind) :: start_time, stop_time 25 | 26 | allocate ( A(0:n-1,0:m-1), Anew(0:n-1,0:m-1) ) 27 | 28 | A = 0.0_fp_kind 29 | Anew = 0.0_fp_kind 30 | 31 | ! Set B.C. 32 | A(0,:) = 1.0_fp_kind 33 | Anew(0,:) = 1.0_fp_kind 34 | 35 | write(*,'(a,i5,a,i5,a)') 'Jacobi relaxation Calculation:', n, ' x', m, ' mesh' 36 | 37 | call cpu_time(start_time) 38 | 39 | iter=0 40 | 41 | do while ( error .gt. tol .and. iter .lt. iter_max ) 42 | error=0.0_fp_kind 43 | 44 | !$omp parallel do shared(m, n, Anew, A) reduction( max:error ) 45 | !$acc kernels 46 | do j=1,m-2 47 | do i=1,n-2 48 | Anew(i,j) = 0.25_fp_kind * ( A(i+1,j ) + A(i-1,j ) + & 49 | A(i ,j-1) + A(i ,j+1) ) 50 | error = max( error, abs(Anew(i,j)-A(i,j)) ) 51 | end do 52 | end do 53 | !$acc end kernels 54 | !$omp end parallel do 55 | 56 | if(mod(iter,100).eq.0 ) write(*,'(i5,f10.6)'), iter, error 57 | iter = iter + 1 58 | 59 | !$omp parallel do shared(m, n, Anew, A) 60 | !$acc kernels 61 | do j=1,m-2 62 | do i=1,n-2 63 | A(i,j) = Anew(i,j) 64 | end do 65 | end do 66 | !$acc end kernels 67 | !$omp end parallel do 68 | 69 | end do 70 | 71 | call cpu_time(stop_time) 72 | write(*,'(a,f10.3,a)') ' completed in ', stop_time-start_time, ' seconds' 73 | 74 | deallocate (A,Anew) 75 | end program laplace 76 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/laplace2d.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright 2012 NVIDIA Corporation 3 | ! 4 | ! Licensed under the Apache License, Version 2.0 (the "License"); 5 | ! you may not use this file except in compliance with the License. 6 | ! 
You may obtain a copy of the License at 7 | ! 8 | ! http://www.apache.org/licenses/LICENSE-2.0 9 | ! 10 | ! Unless required by applicable law or agreed to in writing, software 11 | ! distributed under the License is distributed on an "AS IS" BASIS, 12 | ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ! See the License for the specific language governing permissions and 14 | ! limitations under the License. 15 | ! 16 | 17 | program laplace 18 | implicit none 19 | integer, parameter :: fp_kind=kind(1.0d0) 20 | integer, parameter :: n=4096, m=4096, iter_max=1000 21 | integer :: i, j, iter 22 | real(fp_kind), dimension (:,:), allocatable :: A, Anew 23 | real(fp_kind) :: tol=1.0e-6_fp_kind, error=1.0_fp_kind 24 | real(fp_kind) :: start_time, stop_time 25 | 26 | allocate ( A(0:n-1,0:m-1), Anew(0:n-1,0:m-1) ) 27 | 28 | A = 0.0_fp_kind 29 | Anew = 0.0_fp_kind 30 | 31 | ! Set B.C. 32 | A(0,:) = 1.0_fp_kind 33 | Anew(0,:) = 1.0_fp_kind 34 | 35 | write(*,'(a,i5,a,i5,a)') 'Jacobi relaxation Calculation:', n, ' x', m, ' mesh' 36 | 37 | call cpu_time(start_time) 38 | 39 | iter=0 40 | 41 | do while ( error .gt. tol .and. iter .lt. 
iter_max ) 42 | error=0.0_fp_kind 43 | 44 | !$omp parallel do shared(m, n, Anew, A) reduction( max:error ) 45 | !$acc kernels 46 | do j=1,m-2 47 | do i=1,n-2 48 | Anew(i,j) = 0.25_fp_kind * ( A(i+1,j ) + A(i-1,j ) + & 49 | A(i ,j-1) + A(i ,j+1) ) 50 | error = max( error, abs(Anew(i,j)-A(i,j)) ) 51 | end do 52 | end do 53 | !$acc end kernels 54 | !$omp end parallel do 55 | 56 | if(mod(iter,100).eq.0 ) write(*,'(i5,f10.6)'), iter, error 57 | iter = iter + 1 58 | 59 | !$omp parallel do shared(m, n, Anew, A) 60 | !$acc kernels 61 | do j=1,m-2 62 | do i=1,n-2 63 | A(i,j) = Anew(i,j) 64 | end do 65 | end do 66 | !$acc end kernels 67 | !$omp end parallel do 68 | 69 | end do 70 | 71 | call cpu_time(stop_time) 72 | write(*,'(a,f10.3,a)') ' completed in ', stop_time-start_time, ' seconds' 73 | 74 | deallocate (A,Anew) 75 | end program laplace 76 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | *c = *a + *b; 23 | } 24 | 25 | int main() 26 | { 27 | int a, b, c; 28 | int *d_a, *d_b, *d_c; 29 | int size = sizeof( int ); 30 | 31 | /* get GPU device number and name */ 32 | 33 | int dev; 34 | cudaDeviceProp deviceProp; 35 | checkCUDA( cudaGetDevice( &dev ) ); 36 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 37 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 38 | 39 | /* allocate space for device copies of a, b, c */ 40 | 41 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 42 | /* enter code here to malloc d_b and d_c */ 43 | FIXME 44 | 45 | /* zero out the device memory for C */ 46 | 47 | checkCUDA( cudaMemset( d_c, 0, size ) ); 48 | 49 | /* setup initial values */ 50 | 51 | a = 2; 52 | b = 7; 53 | c = -99; 54 | 55 | /* copy inputs to device */ 56 | 57 | checkCUDA( cudaMemcpy( d_a, &a, size, cudaMemcpyHostToDevice ) ); 58 | /* enter code here to copy d_b to device */ 59 | FIXME 60 | 61 | /* enter code here to launch the kernel on the GPU */ 62 | FIXME 63 | 64 | checkKERNEL() 65 | 66 | /* copy result back to host */ 67 | 68 | checkCUDA( cudaMemcpy( &c, d_c, size, cudaMemcpyDeviceToHost ) ); 69 | 70 | printf("value of c after kernel is %d\n",c); 71 | if( c == ( a + b ) ) printf("PASS\n"); 72 | else printf("FAIL\n"); 73 | 74 | /* clean up */ 75 | 76 | checkCUDA( cudaFree( d_a ) ); 77 | FIXME 78 | /* enter code here to cudaFree the d_b and d_c pointers */ 79 | 80 | /* calling reset to check errors */ 81 | checkCUDA( cudaDeviceReset() ); 82 | 83 | return 0; 84 | } /* end main */ 85 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/laplace2d.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright 2012 NVIDIA Corporation 3 | ! 4 | ! 
Licensed under the Apache License, Version 2.0 (the "License"); 5 | ! you may not use this file except in compliance with the License. 6 | ! You may obtain a copy of the License at 7 | ! 8 | ! http://www.apache.org/licenses/LICENSE-2.0 9 | ! 10 | ! Unless required by applicable law or agreed to in writing, software 11 | ! distributed under the License is distributed on an "AS IS" BASIS, 12 | ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ! See the License for the specific language governing permissions and 14 | ! limitations under the License. 15 | ! 16 | 17 | program laplace 18 | implicit none 19 | integer, parameter :: fp_kind=kind(1.0d0) 20 | integer, parameter :: n=4096, m=4096, iter_max=1000 21 | integer :: i, j, iter 22 | real(fp_kind), dimension (:,:), allocatable :: A, Anew 23 | real(fp_kind) :: tol=1.0e-6_fp_kind, error=1.0_fp_kind 24 | real(fp_kind) :: start_time, stop_time 25 | 26 | allocate ( A(0:n-1,0:m-1), Anew(0:n-1,0:m-1) ) 27 | 28 | A = 0.0_fp_kind 29 | Anew = 0.0_fp_kind 30 | 31 | ! Set B.C. 32 | A(0,:) = 1.0_fp_kind 33 | Anew(0,:) = 1.0_fp_kind 34 | 35 | write(*,'(a,i5,a,i5,a)') 'Jacobi relaxation Calculation:', n, ' x', m, ' mesh' 36 | 37 | call cpu_time(start_time) 38 | 39 | iter=0 40 | 41 | !$acc data copy(A) create(Anew) 42 | do while ( error .gt. tol .and. iter .lt. 
iter_max ) 43 | error=0.0_fp_kind 44 | 45 | !$omp parallel do shared(m, n, Anew, A) reduction( max:error ) 46 | !$acc kernels 47 | do j=1,m-2 48 | do i=1,n-2 49 | Anew(i,j) = 0.25_fp_kind * ( A(i+1,j ) + A(i-1,j ) + & 50 | A(i ,j-1) + A(i ,j+1) ) 51 | error = max( error, abs(Anew(i,j)-A(i,j)) ) 52 | end do 53 | end do 54 | !$acc end kernels 55 | !$omp end parallel do 56 | 57 | if(mod(iter,100).eq.0 ) write(*,'(i5,f10.6)'), iter, error 58 | iter = iter + 1 59 | 60 | !$omp parallel do shared(m, n, Anew, A) 61 | !$acc kernels 62 | do j=1,m-2 63 | do i=1,n-2 64 | A(i,j) = Anew(i,j) 65 | end do 66 | end do 67 | !$acc end kernels 68 | !$omp end parallel do 69 | 70 | end do 71 | !$acc end data 72 | 73 | call cpu_time(stop_time) 74 | write(*,'(a,f10.3,a)') ' completed in ', stop_time-start_time, ' seconds' 75 | 76 | deallocate (A,Anew) 77 | end program laplace 78 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | *c = *a + *b; 23 | } 24 | 25 | int main() 26 | { 27 | 28 | int a, b, c; 29 | int *d_a, *d_b, *d_c; 30 | int size = sizeof( int ); 31 | 32 | /* get GPU device number and name */ 33 | 34 | int dev; 35 | cudaDeviceProp deviceProp; 36 | checkCUDA( cudaGetDevice( &dev ) ); 37 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 38 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 39 | 40 | /* allocate space for device copies of a, b, c */ 41 | 42 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 43 | checkCUDA( cudaMalloc( (void **) &d_b, size ) ); 44 | checkCUDA( cudaMalloc( (void **) &d_c, size ) ); 45 | 46 | /* setup initial values */ 47 | 48 | a = 2; 49 | b = 7; 50 | c = -99; 51 | 52 | 53 | /* copy inputs to device */ 54 | 55 | checkCUDA( cudaMemcpy( d_a, &a, size, cudaMemcpyHostToDevice ) ); 56 | checkCUDA( cudaMemcpy( d_b, &b, size, cudaMemcpyHostToDevice ) ); 57 | 58 | /* zero out the device memory for C */ 59 | 60 | checkCUDA( cudaMemset( d_c, 0, size ) ); 61 | 62 | /* launch the kernel on the GPU */ 63 | 64 | add<<< 1, 1 >>>( d_a, d_b, d_c ); 65 | checkKERNEL() 66 | 67 | /* copy result back to host */ 68 | 69 | checkCUDA( cudaMemcpy( &c, d_c, size, cudaMemcpyDeviceToHost ) ); 70 | 71 | printf("value of c after kernel is %d\n",c); 72 | if( c == ( a + b ) ) printf("PASS\n"); 73 | else printf("FAIL\n"); 74 | 75 | /* clean up */ 76 | 77 | checkCUDA( cudaFree( d_a ) ); 78 | checkCUDA( cudaFree( d_b ) ); 79 | checkCUDA( cudaFree( d_c ) ); 80 | 81 | checkCUDA( cudaDeviceReset() ); 82 | 83 | return 0; 84 | } /* end main */ 85 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/headers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* include the ATLAS headers */ 18 | 19 | extern "C" 20 | { 21 | #include 22 | } 23 | 24 | #include "../debug.h" 25 | 26 | /* choose precision to train and classify. Only float and double are 27 | * currently suppored 28 | */ 29 | 30 | typedef float floatType_t; 31 | 32 | /* macro to convert 2d coords to 1d offset */ 33 | 34 | #define INDX(row,col,ld) (((col) * (ld)) + (row)) 35 | 36 | /* macros for max/min to combine with argmin */ 37 | 38 | #define MYMAX(val,array,i,index) \ 39 | if( array[i] > val ) \ 40 | { \ 41 | val = array[i]; \ 42 | index = i; \ 43 | } \ 44 | 45 | #define MYMIN(val,array,i,index) \ 46 | if( array[i] < val ) \ 47 | { \ 48 | val = array[i]; \ 49 | index = i; \ 50 | } \ 51 | 52 | /* macro to clip values from min to max */ 53 | 54 | #define CLIP(val,min,max) \ 55 | if( (val) < (min) ) val = (min); \ 56 | else if( (val) > (max) ) val = (max); 57 | 58 | /* hardcoded constants for training and test set size and feature 59 | * vector size 60 | */ 61 | 62 | #define FEATURE_VECTOR_SIZE (1899) 63 | #define TRAINING_SET_SIZE (4000) 64 | #define TEST_SET_SIZE (1000) 65 | 66 | /* function defs */ 67 | 68 | void readMatrixFromFile( char *, int *, const int, const int ); 69 | 70 | void calculateBI( floatType_t const *, 71 | floatType_t const *, 72 | floatType_t const *, 73 | int , 74 | floatType_t *, floatType_t *, 75 | int *, int *, 76 | floatType_t const ); 77 | 78 | void svmTrain( floatType_t const *, floatType_t 
const *, floatType_t const, 79 | const int, const int, 80 | const floatType_t, floatType_t * ); 81 | 82 | void svmPredict( floatType_t const *, floatType_t const *, 83 | int const, int const, int * ); 84 | -------------------------------------------------------------------------------- /exercises/openacc/001-laplace2D-kernels/laplace2d.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include "timer.h" 21 | 22 | #define NN 4096 23 | #define NM 4096 24 | 25 | double A[NN][NM]; 26 | double Anew[NN][NM]; 27 | 28 | int main(int argc, char** argv) 29 | { 30 | const int n = NN; 31 | const int m = NM; 32 | const int iter_max = 1000; 33 | 34 | const double tol = 1.0e-6; 35 | double error = 1.0; 36 | 37 | memset(A, 0, n * m * sizeof(double)); 38 | memset(Anew, 0, n * m * sizeof(double)); 39 | 40 | for (int j = 0; j < n; j++) 41 | { 42 | A[j][0] = 1.0; 43 | Anew[j][0] = 1.0; 44 | } 45 | 46 | printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m); 47 | 48 | StartTimer(); 49 | int iter = 0; 50 | 51 | while ( error > tol && iter < iter_max ) 52 | { 53 | error = 0.0; 54 | 55 | #pragma omp parallel for shared(m, n, Anew, A) 56 | for( int j = 1; j < n-1; j++) 57 | { 58 | for( int i = 1; i < m-1; i++ ) 59 | { 60 | Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] 61 | + A[j-1][i] + A[j+1][i]); 62 | error = fmax( error, fabs(Anew[j][i] - A[j][i])); 63 | } 64 | } 65 | 66 | #pragma omp parallel for shared(m, n, Anew, A) 67 | for( int j = 1; j < n-1; j++) 68 | { 69 | for( int i = 1; i < m-1; i++ ) 70 | { 71 | A[j][i] = Anew[j][i]; 72 | } 73 | } 74 | 75 | if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); 76 | 77 | iter++; 78 | } 79 | 80 | double runtime = GetTimer(); 81 | 82 | printf(" total: %f s\n", runtime / 1000); 83 | return 0; 84 | } 85 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/original/headers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* include the ATLAS headers */ 18 | 19 | extern "C" 20 | { 21 | #include 22 | } 23 | 24 | #include "../debug.h" 25 | 26 | /* choose precision to train and classify. Only float and double are 27 | * currently suppored 28 | */ 29 | 30 | typedef float floatType_t; 31 | 32 | /* macro to convert 2d coords to 1d offset */ 33 | 34 | #define INDX(row,col,ld) (((col) * (ld)) + (row)) 35 | 36 | /* macros for max/min to combine with argmin */ 37 | 38 | #define MYMAX(val,array,i,index) \ 39 | if( array[i] > val ) \ 40 | { \ 41 | val = array[i]; \ 42 | index = i; \ 43 | } \ 44 | 45 | #define MYMIN(val,array,i,index) \ 46 | if( array[i] < val ) \ 47 | { \ 48 | val = array[i]; \ 49 | index = i; \ 50 | } \ 51 | 52 | /* macro to clip values from min to max */ 53 | 54 | #define CLIP(val,min,max) \ 55 | if( (val) < (min) ) val = (min); \ 56 | else if( (val) > (max) ) val = (max); 57 | 58 | /* hardcoded constants for training and test set size and feature 59 | * vector size 60 | */ 61 | 62 | #define FEATURE_VECTOR_SIZE (1899) 63 | #define TRAINING_SET_SIZE (4000) 64 | #define TEST_SET_SIZE (1000) 65 | 66 | /* function defs */ 67 | 68 | void readMatrixFromFile( char *, int *, const int, const int ); 69 | 70 | void calculateBI( floatType_t const *, 71 | floatType_t const *, 72 | floatType_t const *, 73 | int , 74 | floatType_t *, floatType_t *, 75 | int *, int *, 76 | floatType_t const ); 77 | 78 | void svmTrain( floatType_t const *, floatType_t const *, floatType_t const, 79 | const int, const int, 80 | const floatType_t, 
floatType_t * ); 81 | 82 | void svmPredict( floatType_t const *, floatType_t const *, 83 | int const, int const, int * ); 84 | -------------------------------------------------------------------------------- /exercises/openacc/002-laplace2D-data/laplace2d.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include "timer.h" 20 | 21 | #define NN 4096 22 | #define NM 4096 23 | 24 | double A[NN][NM]; 25 | double Anew[NN][NM]; 26 | 27 | int main(int argc, char** argv) 28 | { 29 | const int n = NN; 30 | const int m = NM; 31 | const int iter_max = 1000; 32 | 33 | const double tol = 1.0e-6; 34 | double error = 1.0; 35 | 36 | memset(A, 0, n * m * sizeof(double)); 37 | memset(Anew, 0, n * m * sizeof(double)); 38 | 39 | for (int j = 0; j < n; j++) 40 | { 41 | A[j][0] = 1.0; 42 | Anew[j][0] = 1.0; 43 | } 44 | 45 | printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m); 46 | 47 | StartTimer(); 48 | int iter = 0; 49 | 50 | while ( error > tol && iter < iter_max ) 51 | { 52 | error = 0.0; 53 | 54 | #pragma omp parallel for shared(m, n, Anew, A) 55 | #pragma acc kernels 56 | for( int j = 1; j < n-1; j++) 57 | { 58 | for( int i = 1; i < m-1; i++ ) 59 | { 60 | Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] 61 | + A[j-1][i] + A[j+1][i]); 62 | error = fmax( error, 
fabs(Anew[j][i] - A[j][i])); 63 | } 64 | } 65 | 66 | #pragma omp parallel for shared(m, n, Anew, A) 67 | #pragma acc kernels 68 | for( int j = 1; j < n-1; j++) 69 | { 70 | for( int i = 1; i < m-1; i++ ) 71 | { 72 | A[j][i] = Anew[j][i]; 73 | } 74 | } 75 | 76 | if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); 77 | 78 | iter++; 79 | } 80 | 81 | double runtime = GetTimer(); 82 | 83 | printf(" total: %f s\n", runtime / 1000); 84 | return 0; 85 | } 86 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/001-laplace2D-kernels/laplace2d.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include "timer.h" 21 | 22 | #define NN 4096 23 | #define NM 4096 24 | 25 | double A[NN][NM]; 26 | double Anew[NN][NM]; 27 | 28 | int main(int argc, char** argv) 29 | { 30 | const int n = NN; 31 | const int m = NM; 32 | const int iter_max = 1000; 33 | 34 | const double tol = 1.0e-6; 35 | double error = 1.0; 36 | 37 | memset(A, 0, n * m * sizeof(double)); 38 | memset(Anew, 0, n * m * sizeof(double)); 39 | 40 | for (int j = 0; j < n; j++) 41 | { 42 | A[j][0] = 1.0; 43 | Anew[j][0] = 1.0; 44 | } 45 | 46 | printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m); 47 | 48 | StartTimer(); 49 | int iter = 0; 50 | 51 | while ( error > tol && iter < iter_max ) 52 | { 53 | error = 0.0; 54 | 55 | #pragma omp parallel for shared(m, n, Anew, A) 56 | #pragma acc kernels 57 | for( int j = 1; j < n-1; j++) 58 | { 59 | for( int i = 1; i < m-1; i++ ) 60 | { 61 | Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] 62 | + A[j-1][i] + A[j+1][i]); 63 | error = fmax( error, fabs(Anew[j][i] - A[j][i])); 64 | } 65 | } 66 | 67 | #pragma omp parallel for shared(m, n, Anew, A) 68 | #pragma acc kernels 69 | for( int j = 1; j < n-1; j++) 70 | { 71 | for( int i = 1; i < m-1; i++ ) 72 | { 73 | A[j][i] = Anew[j][i]; 74 | } 75 | } 76 | 77 | if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); 78 | 79 | iter++; 80 | } 81 | 82 | double runtime = GetTimer(); 83 | 84 | printf(" total: %f s\n", runtime / 1000); 85 | return 0; 86 | } 87 | -------------------------------------------------------------------------------- /exercise_solutions/openacc/002-laplace2D-data/laplace2d.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include "timer.h" 20 | 21 | #define NN 4096 22 | #define NM 4096 23 | 24 | double A[NN][NM]; 25 | double Anew[NN][NM]; 26 | 27 | int main(int argc, char** argv) 28 | { 29 | const int n = NN; 30 | const int m = NM; 31 | const int iter_max = 1000; 32 | 33 | const double tol = 1.0e-6; 34 | double error = 1.0; 35 | 36 | memset(A, 0, n * m * sizeof(double)); 37 | memset(Anew, 0, n * m * sizeof(double)); 38 | 39 | for (int j = 0; j < n; j++) 40 | { 41 | A[j][0] = 1.0; 42 | Anew[j][0] = 1.0; 43 | } 44 | 45 | printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m); 46 | 47 | StartTimer(); 48 | int iter = 0; 49 | 50 | #pragma acc data copy(A), create(Anew) 51 | while ( error > tol && iter < iter_max ) 52 | { 53 | error = 0.0; 54 | 55 | #pragma omp parallel for shared(m, n, Anew, A) 56 | #pragma acc kernels 57 | for( int j = 1; j < n-1; j++) 58 | { 59 | for( int i = 1; i < m-1; i++ ) 60 | { 61 | Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] 62 | + A[j-1][i] + A[j+1][i]); 63 | error = fmax( error, fabs(Anew[j][i] - A[j][i])); 64 | } 65 | } 66 | 67 | #pragma omp parallel for shared(m, n, Anew, A) 68 | #pragma acc kernels 69 | for( int j = 1; j < n-1; j++) 70 | { 71 | for( int i = 1; i < m-1; i++ ) 72 | { 73 | A[j][i] = Anew[j][i]; 74 | } 75 | } 76 | 77 | if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); 78 | 79 | iter++; 80 | } 81 | 82 | double runtime = GetTimer(); 83 | 84 | printf(" total: %f s\n", runtime / 1000); 85 | return 0; 86 | } 87 | 
--------------------------------------------------------------------------------
/exercises/cuda/thrust_sort/kernel.cu:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 NVIDIA Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* NOTE(review): the original include targets were stripped by text
 * extraction; restored to the headers this translation unit actually uses. */
#include <stdio.h>                  /* printf */
#include <stdlib.h>                 /* rand */
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/equal.h>           /* thrust::equal used for verification */

/*
 * Benchmark: sort 32M random ints with thrust on the GPU and again on the
 * host, report both throughputs, and verify the two sorted sequences match.
 * Requires a CUDA-capable device; CUDA events time both phases.
 */
int main(void)
{
  const int numElements = 32 << 20;   /* 32M keys */

  // generate 32M random numbers on the host
  thrust::host_vector<int> h_vec( numElements );
  thrust::generate( h_vec.begin(), h_vec.end(), rand );

  // replicate input on another host vector
  thrust::host_vector<int> h_vec1 = h_vec;

  //transfer data to the device
  thrust::device_vector<int> d_vec = h_vec;

  //create timers
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord( start, 0 );

  //sort data on the device
  thrust::sort( d_vec.begin(), d_vec.end() );

  cudaEventRecord( stop, 0 );
  cudaEventSynchronize( stop );

  float GPUelapsedTime;
  cudaEventElapsedTime( &GPUelapsedTime, start, stop );

  GPUelapsedTime /= 1000.0f;   /* ms -> s */

  /* bug fix: the original passed the int expression (32<<20) to "%ld",
   * which is undefined behavior; "%d" matches the argument type */
  printf("sort of %d in %f seconds\n", numElements, GPUelapsedTime );
  printf("Sort of %f M / sec\n", (double)numElements / (double)GPUelapsedTime *
    1e-6);

  //transfer data back to host
  thrust::copy( d_vec.begin(), d_vec.end(), h_vec.begin() );


  cudaEventRecord( start, 0 );

  //sort data on host
  thrust::sort(h_vec1.begin(), h_vec1.end() );

  cudaEventRecord( stop, 0 );
  cudaEventSynchronize( stop );

  float CPUelapsedTime;
  cudaEventElapsedTime( &CPUelapsedTime, start, stop );
  CPUelapsedTime /= 1000.0f;

  printf("sort of %d in %f seconds\n", numElements, CPUelapsedTime );
  printf("Sort of %f M / sec\n", (double)numElements / (double)CPUelapsedTime *
    1e-6);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  printf("GPU is %5.2fX faster than CPU\n", CPUelapsedTime/GPUelapsedTime );

  if ( thrust::equal( h_vec1.begin(), h_vec1.end(), h_vec.begin() ) )
    printf("The arrays are equal\n");
  else
    printf("The arrays are different!\n");


  return 0;
} /* end main */

--------------------------------------------------------------------------------
/exercise_solutions/cuda/thrust_sort/kernel.cu:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 NVIDIA Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | int main(void) 25 | { 26 | // generate 32M random numbers on the host 27 | thrust::host_vector h_vec( 32 << 20 ); 28 | thrust::generate( h_vec.begin(), h_vec.end(), rand ); 29 | 30 | // replicate input on another host vector 31 | thrust::host_vector h_vec1 = h_vec; 32 | 33 | //transfer data to the device 34 | thrust::device_vector d_vec = h_vec; 35 | 36 | //create timers 37 | cudaEvent_t start, stop; 38 | cudaEventCreate(&start); 39 | cudaEventCreate(&stop); 40 | 41 | cudaEventRecord( start, 0 ); 42 | 43 | //sort data on the device 44 | thrust::sort( d_vec.begin(), d_vec.end() ); 45 | 46 | cudaEventRecord( stop, 0 ); 47 | cudaEventSynchronize( stop ); 48 | 49 | float GPUelapsedTime; 50 | cudaEventElapsedTime( &GPUelapsedTime, start, stop ); 51 | 52 | GPUelapsedTime /= 1000.0; 53 | 54 | printf("sort of %ld in %f seconds\n", 32<<20, GPUelapsedTime ); 55 | printf("Sort of %f M / sec\n", (double)(32<<20) / (double)GPUelapsedTime * 56 | 1e-6); 57 | 58 | //transfer data back to host 59 | thrust::copy( d_vec.begin(), d_vec.end(), h_vec.begin() ); 60 | 61 | 62 | cudaEventRecord( start, 0 ); 63 | 64 | //sort data on host 65 | thrust::sort(h_vec1.begin(), h_vec1.end() ); 66 | 67 | cudaEventRecord( stop, 0 ); 68 | cudaEventSynchronize( stop ); 69 | 70 | float CPUelapsedTime; 71 | cudaEventElapsedTime( &CPUelapsedTime, start, stop ); 72 | CPUelapsedTime /= 1000.0; 73 | 74 | printf("sort of %ld in %f seconds\n", 32<<20,CPUelapsedTime ); 75 | printf("Sort of %f M / sec\n", (double)(32<<20) / (double)CPUelapsedTime * 76 | 1e-6); 77 | 78 | cudaEventDestroy(start); 79 | cudaEventDestroy(stop); 80 | 81 | printf("GPU is %5.2fX faster than CPU\n", CPUelapsedTime/GPUelapsedTime ); 82 | 83 | if ( thrust::equal( h_vec1.begin(), h_vec1.end(), h_vec.begin() ) ) 84 | printf("The arrays are equal\n"); 85 | else 86 | printf("The arrays are different!\n"); 87 | 88 | 89 | return 0; 90 | } 
/* end main */ 91 | 92 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_threads/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x]; 23 | } 24 | 25 | #define N 32 26 | 27 | int main() 28 | { 29 | int *a, *b, *c; 30 | int *d_a, *d_b, *d_c; 31 | int size = N * sizeof( int ); 32 | 33 | /* get GPU device number and name */ 34 | 35 | int dev; 36 | cudaDeviceProp deviceProp; 37 | checkCUDA( cudaGetDevice( &dev ) ); 38 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 39 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 40 | 41 | /* allocate space for device copies of a, b, c */ 42 | 43 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 44 | checkCUDA( cudaMalloc( (void **) &d_b, size ) ); 45 | checkCUDA( cudaMalloc( (void **) &d_c, size ) ); 46 | 47 | /* allocate space for host copies of a, b, c and setup input values */ 48 | 49 | a = (int *)malloc( size ); 50 | b = (int *)malloc( size ); 51 | c = (int *)malloc( size ); 52 | 53 | for( int i = 0; i < N; i++ ) 54 | { 55 | a[i] = b[i] = i; 56 | c[i] = 0; 57 | } 58 | 59 | /* copy inputs to device */ 60 | 61 
| checkCUDA( cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice ) ); 62 | checkCUDA( cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice ) ); 63 | 64 | /* zero out C array */ 65 | 66 | checkCUDA( cudaMemset( d_c, 0, size ) ); 67 | 68 | /* launch the kernel on the GPU */ 69 | 70 | add<<< 1, N >>>( d_a, d_b, d_c ); 71 | checkKERNEL() 72 | 73 | /* copy result back to host */ 74 | 75 | checkCUDA( cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost ) ); 76 | 77 | int success = 1; 78 | 79 | for( int i = 0; i < N; i++ ) 80 | { 81 | printf("c[%d] = %d\n",i,c[i]); 82 | if( c[i] != a[i] + b[i] ) 83 | { 84 | success = 0; 85 | break; 86 | } /* end if */ 87 | } /* end for */ 88 | 89 | if( success == 1 ) printf("PASS\n"); 90 | else printf("FAIL\n"); 91 | 92 | /* clean up */ 93 | 94 | free(a); 95 | free(b); 96 | free(c); 97 | checkCUDA( cudaFree( d_a ) ); 98 | checkCUDA( cudaFree( d_b ) ); 99 | checkCUDA( cudaFree( d_c ) ); 100 | 101 | checkCUDA( cudaDeviceReset() ); 102 | 103 | return 0; 104 | } /* end main */ 105 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_blocks/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x]; 23 | } 24 | 25 | #define N 32 26 | 27 | int main() 28 | { 29 | int *a, *b, *c; 30 | int *d_a, *d_b, *d_c; 31 | int size = N * sizeof( int ); 32 | 33 | /* get GPU device number and name */ 34 | 35 | int dev; 36 | cudaDeviceProp deviceProp; 37 | checkCUDA( cudaGetDevice( &dev ) ); 38 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 39 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 40 | 41 | /* allocate space for device copies of a, b, c */ 42 | 43 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 44 | checkCUDA( cudaMalloc( (void **) &d_b, size ) ); 45 | checkCUDA( cudaMalloc( (void **) &d_c, size ) ); 46 | 47 | /* allocate space for host copies of a, b, c and setup input values */ 48 | 49 | a = (int *)malloc( size ); 50 | b = (int *)malloc( size ); 51 | c = (int *)malloc( size ); 52 | 53 | for( int i = 0; i < N; i++ ) 54 | { 55 | a[i] = b[i] = i; 56 | c[i] = 0; 57 | } /* end for */ 58 | 59 | /* copy inputs to device */ 60 | 61 | checkCUDA( cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice ) ); 62 | checkCUDA( cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice ) ); 63 | 64 | /* zero out C array */ 65 | 66 | checkCUDA( cudaMemset( d_c, 0, size ) ); 67 | 68 | /* launch the kernel on the GPU */ 69 | /* finish the kernel launch with N blocks and 1 thread per block */ 70 | add<<< N, 1 >>>( d_a, d_b, d_c ); 71 | checkKERNEL() 72 | 73 | /* copy result back to host */ 74 | 75 | checkCUDA( cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost ) ); 76 | 77 | int success = 1; 78 | 79 | for( int i = 0; i < N; i++ ) 80 | { 81 | printf("c[%d] = %d\n",i,c[i]); 82 | if( c[i] != a[i] + b[i] ) 83 | { 84 | success = 0; 85 | break; 86 | } /* end if */ 87 | } /* end for */ 88 | 89 | if( success == 1 ) printf("PASS\n"); 90 | else printf("FAIL\n"); 91 | 92 | /* clean up */ 93 | 94 | free(a); 95 | free(b); 96 | 
free(c); 97 | checkCUDA( cudaFree( d_a ) ); 98 | checkCUDA( cudaFree( d_b ) ); 99 | checkCUDA( cudaFree( d_c ) ); 100 | 101 | return 0; 102 | } /* end main */ 103 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_blocks/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | /* add the proper index so each block calculates a different value in the 23 | array */ 24 | c[FIXME] = a[FIXME] + b[FIXME]; 25 | } 26 | 27 | #define N 32 28 | 29 | int main() 30 | { 31 | int *a, *b, *c; 32 | int *d_a, *d_b, *d_c; 33 | int size = N * sizeof( int ); 34 | 35 | /* get GPU device number and name */ 36 | 37 | int dev; 38 | cudaDeviceProp deviceProp; 39 | checkCUDA( cudaGetDevice( &dev ) ); 40 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 41 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 42 | 43 | /* allocate space for device copies of a, b, c */ 44 | 45 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 46 | /* insert code here for d_b and d_c */ 47 | FIXME 48 | 49 | /* allocate space for host copies of a, b, c and setup input values */ 50 | 51 | a = (int *)malloc( size ); 52 | b = (int *)malloc( size ); 53 | c = (int *)malloc( size ); 54 | 55 | for( int i = 0; i < N; i++ ) 56 | { 57 | a[i] = b[i] = i; 58 | c[i] = 0; 59 | } /* end for */ 60 | 61 | /* copy inputs to device */ 62 | 63 | checkCUDA( cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice ) ); 64 | /* insert code to copy b to the device */ 65 | FIXME 66 | 67 | /* zero out C array */ 68 | 69 | checkCUDA( cudaMemset( d_c, 0, size ) ); 70 | 71 | /* launch the kernel on the GPU */ 72 | /* finish the kernel launch with N blocks and 1 thread per block */ 73 | add<<< FIXME, FIXME >>>( d_a, d_b, d_c ); 74 | checkKERNEL() 75 | 76 | /* copy result back to host */ 77 | 78 | checkCUDA( cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost ) ); 79 | 80 | int success = 1; 81 | 82 | for( int i = 0; i < N; i++ ) 83 | { 84 | printf("c[%d] = %d\n",i,c[i]); 85 | if( c[i] != a[i] + b[i] ) 86 | { 87 | success = 0; 88 | break; 89 | } /* end if */ 90 | } /* end for */ 91 | 92 | if( success == 1 ) printf("PASS\n"); 93 | else printf("FAIL\n"); 94 | 95 | /* clean up */ 96 | 97 | free(a); 
98 | free(b); 99 | free(c); 100 | checkCUDA( cudaFree( d_a ) ); 101 | checkCUDA( cudaFree( d_b ) ); 102 | checkCUDA( cudaFree( d_c ) ); 103 | 104 | return 0; 105 | } /* end main */ 106 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/simple_add_blocks_threads/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | int index = threadIdx.x + blockIdx.x * blockDim.x; 23 | c[index] = a[index] + b[index]; 24 | } 25 | 26 | #define N (2048*2048) 27 | #define THREADS_PER_BLOCK 512 28 | 29 | int main() 30 | { 31 | int *a, *b, *c; 32 | int *d_a, *d_b, *d_c; 33 | int size = N * sizeof( int ); 34 | 35 | /* get GPU device number and name */ 36 | 37 | int dev; 38 | cudaDeviceProp deviceProp; 39 | checkCUDA( cudaGetDevice( &dev ) ); 40 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 41 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 42 | 43 | /* allocate space for device copies of a, b, c */ 44 | 45 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 46 | checkCUDA( cudaMalloc( (void **) &d_b, size ) ); 47 | checkCUDA( cudaMalloc( (void **) &d_c, size ) ); 48 | 49 | /* allocate space for host copies of a, b, c and setup input values */ 50 | 51 | a = (int *)malloc( size ); 52 | b = (int *)malloc( size ); 53 | c = (int *)malloc( size ); 54 | 55 | for( int i = 0; i < N; i++ ) 56 | { 57 | a[i] = b[i] = i; 58 | c[i] = 0; 59 | } 60 | 61 | /* copy inputs to device */ 62 | 63 | checkCUDA( cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice ) ); 64 | checkCUDA( cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice ) ); 65 | 66 | /* zero out the C array */ 67 | 68 | checkCUDA( cudaMemset( d_c, 0, size ) ); 69 | 70 | /* launch the kernel on the GPU */ 71 | 72 | add<<< N / THREADS_PER_BLOCK, THREADS_PER_BLOCK >>>( d_a, d_b, d_c ); 73 | checkKERNEL() 74 | 75 | /* copy result back to host */ 76 | 77 | checkCUDA( cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost ) ); 78 | 79 | int success = 1; 80 | 81 | for( int i = 0; i < N; i++ ) 82 | { 83 | if( c[i] != a[i] + b[i] ) 84 | { 85 | printf("c[%d] = %d\n",i,c[i] ); 86 | success = 0; 87 | break; 88 | } /* end if */ 89 | } 90 | 91 | if( success == 1 ) printf("PASS\n"); 92 | else printf("FAIL\n"); 93 | 94 | /* clean up */ 95 | 96 | 
free(a); 97 | free(b); 98 | free(c); 99 | checkCUDA( cudaFree( d_a ) ); 100 | checkCUDA( cudaFree( d_b ) ); 101 | checkCUDA( cudaFree( d_c ) ); 102 | 103 | checkCUDA( cudaDeviceReset() ); 104 | 105 | return 0; 106 | } /* end main */ 107 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_blocks_threads/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | int index = threadIdx.x + blockIdx.x * blockDim.x; 23 | c[index] = a[index] + b[index]; 24 | } 25 | 26 | #define N (2048*2048) 27 | #define THREADS_PER_BLOCK 512 28 | 29 | int main() 30 | { 31 | int *a, *b, *c; 32 | int *d_a, *d_b, *d_c; 33 | int size = N * sizeof( int ); 34 | 35 | /* get GPU device number and name */ 36 | 37 | int dev; 38 | cudaDeviceProp deviceProp; 39 | checkCUDA( cudaGetDevice( &dev ) ); 40 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 41 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 42 | 43 | /* allocate space for device copies of a, b, c */ 44 | 45 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 46 | checkCUDA( cudaMalloc( (void **) &d_b, size ) ); 47 | checkCUDA( cudaMalloc( (void **) &d_c, size ) ); 48 | 49 | /* allocate space for host copies of a, b, c and setup input values */ 50 | 51 | a = (int *)malloc( size ); 52 | b = (int *)malloc( size ); 53 | c = (int *)malloc( size ); 54 | 55 | for( int i = 0; i < N; i++ ) 56 | { 57 | a[i] = b[i] = i; 58 | c[i] = 0; 59 | } 60 | 61 | /* copy inputs to device */ 62 | 63 | checkCUDA( cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice ) ); 64 | checkCUDA( cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice ) ); 65 | 66 | /* zero out the C array */ 67 | 68 | checkCUDA( cudaMemset( d_c, 0, size ) ); 69 | 70 | /* launch the kernel on the GPU */ 71 | /* insert the launch parameters to launch properly using blocks and threads */ 72 | add<<< FIXME, FIXME >>>( d_a, d_b, d_c ); 73 | checkKERNEL() 74 | 75 | /* copy result back to host */ 76 | 77 | checkCUDA( cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost ) ); 78 | 79 | int success = 1; 80 | 81 | for( int i = 0; i < N; i++ ) 82 | { 83 | if( c[i] != a[i] + b[i] ) 84 | { 85 | printf("c[%d] = %d\n",i,c[i] ); 86 | success = 0; 87 | break; 88 | } /* end if */ 89 | } 90 | 91 | if( success == 1 ) printf("PASS\n"); 92 | else 
printf("FAIL\n"); 93 | 94 | /* clean up */ 95 | 96 | free(a); 97 | free(b); 98 | free(c); 99 | checkCUDA( cudaFree( d_a ) ); 100 | checkCUDA( cudaFree( d_b ) ); 101 | checkCUDA( cudaFree( d_c ) ); 102 | 103 | checkCUDA( cudaDeviceReset() ); 104 | 105 | return 0; 106 | } /* end main */ 107 | -------------------------------------------------------------------------------- /exercises/cuda/svm_challenge/README.md: -------------------------------------------------------------------------------- 1 | SVM Email Spam Filter 2 | ===================== 3 | 4 | This is the top-level folder for a challenge problem dealing with the use of 5 | support vector machine (SVM) algorithm used to implement a spam classifier. 6 | 7 | The original idea for this code comes from a Machine Learning Coursera course 8 | taught by Andrew Ng, accessed in 2014 at https://www.coursera.org/course/ml. 9 | The training and test data are taken from his homework example in this course. 10 | He wrote the code in Octave and I changed it to C as well as altered the CPU 11 | and GPU algorithms to more closely align with the algorithm described in [1], 12 | labeled as "Algorithm 1" on page 105, where the working set choice is using the 13 | first order heuristic, also described in [1] and a linear kernel is used. The 14 | general algorithm is the SMO algorithm from Platt [2]. 15 | 16 | The training set is 4000 emails and there are 1899 features (keywords). This 17 | is admittedly a reduced training set and reduced feature size for illustration 18 | purposes only. 19 | 20 | [1] B. C. Catanzaro, N. Sundaram, K. Keutzer, "Fast Support Vector Machine 21 | Training and Classification on Graphics Processors", Proceedings of the 25th 22 | International Conference on Machine Learning, Helsinki, Finland, 2008. 23 | 24 | [2] J. C. Platt, "Fast training of support vector machines using sequential 25 | minimal optimization", Advances in kernel methods: support vector learning, 26 | Cambridge, MA, USA: MIT Press. 
27 | 28 | Instructions 29 | ------------ 30 | 31 | To run the code which trains and then classifies email as spam please do the 32 | following steps. 33 | 34 | 1.) Build the code. Ensure that NVCC is in your path. 35 | 36 | > make 37 | 38 | 2.) Choose an email to be tested. There is one genuine email and three spam 39 | emails to choose from. If you wish to test your own email (either genuine or 40 | spam) put your email as a text file in this directory. Copy/paste only the 41 | text of the email. Please omit the header information as this spam 42 | classifier only cares about the text of the email. 43 | 44 | 3.) Process the email. The email text needs to be processed by stripping out 45 | all non-text elements and then running a stemming algorithm on each resultant 46 | word. This leaves you with just a tokenized email of stemmed words which is 47 | easier to process. When you run this command the stemmed email will be 48 | printed to the screen and a file called "emailVector.txt" will be created 49 | which will be a vector of 0's and 1's depending on whether that specific 50 | feature (word) exists in the email or not. 51 | 52 | > sh processEmail.sh 53 | 54 | 4.) Train the SVM and classify your email. In this step the SVM will be first 55 | be trained against a training set of size 4000. Then it will be tested for 56 | accuracy against this set. Then it will be tested against a test set of size 57 | 1000. Both of these accuracies should be over 98%. Finally the SVM will 58 | classify your input email and either classify it as spam (1) or NOT spam (0). 
59 | 60 | > ./x.train 61 | Prediction success rate on training set is 99.750000 62 | Prediction success rate on test set is 98.200000 63 | Email test results 1 is SPAM 0 is NOT SPAM 64 | File Name emailVector.txt, classification 0 NOT SPAM 65 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/svm_challenge/README.md: -------------------------------------------------------------------------------- 1 | SVM Email Spam Filter 2 | ===================== 3 | 4 | This is the top-level folder for a challenge problem dealing with the use of 5 | support vector machine (SVM) algorithm used to implement a spam classifier. 6 | 7 | The original idea for this code comes from a Machine Learning Coursera course 8 | taught by Andrew Ng, accessed in 2014 at https://www.coursera.org/course/ml. 9 | The training and test data are taken from his homework example in this course. 10 | He wrote the code in Octave and I changed it to C as well as altered the CPU 11 | and GPU algorithms to more closely align with the algorithm described in [1], 12 | labeled as "Algorithm 1" on page 105, where the working set choice is using the 13 | first order heuristic, also described in [1] and a linear kernel is used. The 14 | general algorithm is the SMO algorithm from Platt [2]. 15 | 16 | The training set is 4000 emails and there are 1899 features (keywords). This 17 | is admittedly a reduced training set and reduced feature size for illustration 18 | purposes only. 19 | 20 | [1] B. C. Catanzaro, N. Sundaram, K. Keutzer, "Fast Support Vector Machine 21 | Training and Classification on Graphics Processors", Proceedings of the 25th 22 | International Conference on Machine Learning, Helsinki, Finland, 2008. 23 | 24 | [2] J. C. Platt, "Fast training of support vector machines using sequential 25 | minimal optimization", Advances in kernel methods: support vector learning, 26 | Cambridge, MA, USA: MIT Press. 
27 | 28 | Instructions 29 | ------------ 30 | 31 | To run the code which trains and then classifies email as spam please do the 32 | following steps. 33 | 34 | 1.) Build the code. Ensure that NVCC is in your path. 35 | 36 | > make 37 | 38 | 2.) Choose an email to be tested. There is one genuine email and three spam 39 | emails to choose from. If you wish to test your own email (either genuine or 40 | spam) put your email as a text file in this directory. Copy/paste only the 41 | text of the email. Please omit the header information as this spam 42 | classifier only cares about the text of the email. 43 | 44 | 3.) Process the email. The email text needs to be processed by stripping out 45 | all non-text elements and then running a stemming algorithm on each resultant 46 | word. This leaves you with just a tokenized email of stemmed words which is 47 | easier to process. When you run this command the stemmed email will be 48 | printed to the screen and a file called "emailVector.txt" will be created 49 | which will be a vector of 0's and 1's depending on whether that specific 50 | feature (word) exists in the email or not. 51 | 52 | > sh processEmail.sh 53 | 54 | 4.) Train the SVM and classify your email. In this step the SVM will be first 55 | be trained against a training set of size 4000. Then it will be tested for 56 | accuracy against this set. Then it will be tested against a test set of size 57 | 1000. Both of these accuracies should be over 98%. Finally the SVM will 58 | classify your input email and either classify it as spam (1) or NOT spam (0). 
59 | 60 | > ./x.train 61 | Prediction success rate on training set is 99.750000 62 | Prediction success rate on test set is 98.200000 63 | Email test results 1 is SPAM 0 is NOT SPAM 64 | File Name emailVector.txt, classification 0 NOT SPAM 65 | -------------------------------------------------------------------------------- /exercises/cuda/simple_add_threads/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "../debug.h" 19 | 20 | __global__ void add(int *a, int *b, int *c) 21 | { 22 | /* insert correct index so each element is calculated by a different thread */ 23 | c[FIXME] = a[FIXME] + b[FIXME]; 24 | } 25 | 26 | /* experiment with different values for N */ 27 | /* how large can you make it */ 28 | #define N 32 29 | 30 | int main() 31 | { 32 | int *a, *b, *c; 33 | int *d_a, *d_b, *d_c; 34 | int size = N * sizeof( int ); 35 | 36 | /* get GPU device number and name */ 37 | 38 | int dev; 39 | cudaDeviceProp deviceProp; 40 | checkCUDA( cudaGetDevice( &dev ) ); 41 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 42 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 43 | 44 | /* allocate space for device copies of a, b, c */ 45 | 46 | checkCUDA( cudaMalloc( (void **) &d_a, size ) ); 47 | checkCUDA( cudaMalloc( (void **) &d_b, size ) ); 48 | checkCUDA( cudaMalloc( (void **) &d_c, size ) ); 49 | 50 | /* allocate space for host copies of a, b, c and setup input values */ 51 | 52 | a = (int *)malloc( size ); 53 | b = (int *)malloc( size ); 54 | c = (int *)malloc( size ); 55 | 56 | for( int i = 0; i < N; i++ ) 57 | { 58 | a[i] = b[i] = i; 59 | c[i] = 0; 60 | } 61 | 62 | /* copy inputs to device */ 63 | 64 | checkCUDA( cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice ) ); 65 | checkCUDA( cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice ) ); 66 | 67 | /* zero out C array */ 68 | 69 | checkCUDA( cudaMemset( d_c, 0, size ) ); 70 | 71 | /* launch the kernel on the GPU */ 72 | /* insert correct launch parameters to use 1 block and N threads */ 73 | add<<< FIXME, FIXME >>>( d_a, d_b, d_c ); 74 | checkKERNEL() 75 | 76 | /* copy result back to host */ 77 | 78 | checkCUDA( cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost ) ); 79 | 80 | int success = 1; 81 | 82 | for( int i = 0; i < N; i++ ) 83 | { 84 | printf("c[%d] = %d\n",i,c[i]); 85 | if( c[i] != a[i] + b[i] ) 86 | { 87 | success = 0; 88 | break; 89 | } /* end if */ 90 | } /* 
end for */ 91 | 92 | if( success == 1 ) printf("PASS\n"); 93 | else printf("FAIL\n"); 94 | 95 | /* clean up */ 96 | 97 | free(a); 98 | free(b); 99 | free(c); 100 | checkCUDA( cudaFree( d_a ) ); 101 | checkCUDA( cudaFree( d_b ) ); 102 | checkCUDA( cudaFree( d_c ) ); 103 | 104 | checkCUDA( cudaDeviceReset() ); 105 | 106 | return 0; 107 | } /* end main */ 108 | -------------------------------------------------------------------------------- /exercises/cuda/reduction_thrust/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "../debug.h" 24 | 25 | #define N ( 1 << 27 ) 26 | #define FLOATTYPE_T float 27 | 28 | int main(void) 29 | { 30 | int size = N; 31 | 32 | /* get GPU device number and name */ 33 | 34 | int dev; 35 | cudaDeviceProp deviceProp; 36 | checkCUDA( cudaGetDevice( &dev ) ); 37 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 38 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 39 | 40 | /* create the host array */ 41 | thrust::host_vector h_vec( FIXME ); 42 | 43 | /* generate random numbers on the host */ 44 | for( int i = 0; i < size; i++ ) 45 | { 46 | h_vec[i] = FLOATTYPE_T( rand() ) / ( FLOATTYPE_T (RAND_MAX) + 1.0 ); 47 | if( i % 2 == 0 ) h_vec[i] = -h_vec[i]; 48 | } 49 | 50 | /* transfer data to the device */ 51 | thrust::device_vector d_vec = FIXME; 52 | 53 | /* create timers */ 54 | cudaEvent_t start, stop; 55 | cudaEventCreate(&start); 56 | cudaEventCreate(&stop); 57 | 58 | cudaEventRecord( start, 0 ); 59 | 60 | /* reduce data on the device */ 61 | FLOATTYPE_T devResult = thrust::reduce( FIXME, FIXME ); 62 | 63 | /* stop timers */ 64 | cudaEventRecord( stop, 0 ); 65 | cudaEventSynchronize( stop ); 66 | 67 | float GPUelapsedTime; 68 | cudaEventElapsedTime( &GPUelapsedTime, start, stop ); 69 | 70 | GPUelapsedTime /= 1000.0; 71 | 72 | /* print GPU timing data */ 73 | 74 | printf("Total elements is %d, %f GB\n", size, sizeof(FLOATTYPE_T) * 75 | (double)size * 1.e-9); 76 | printf("GPU total time is %f ms, bandwidth %f GB/s\n", GPUelapsedTime, 77 | sizeof(FLOATTYPE_T)*(double)size / 78 | ( (double)GPUelapsedTime ) * 1.e-9 ); 79 | 80 | /* start CPU timer */ 81 | cudaEventRecord( start, 0 ); 82 | 83 | /* reduce data on host */ 84 | FLOATTYPE_T hostResult = thrust::reduce(h_vec.begin(), h_vec.end() ); 85 | 86 | /* stop timers */ 87 | cudaEventRecord( stop, 0 ); 88 | cudaEventSynchronize( stop ); 89 | 90 | float CPUelapsedTime; 91 | 
cudaEventElapsedTime( &CPUelapsedTime, start, stop ); 92 | CPUelapsedTime /= 1000.0; 93 | 94 | /* print CPU timer */ 95 | 96 | printf("Total elements is %d, %f GB\n", size, sizeof(FLOATTYPE_T) * 97 | (double)size * 1.e-9); 98 | printf("CPU total time is %f ms, bandwidth %f GB/s\n", CPUelapsedTime, 99 | sizeof(FLOATTYPE_T)*(double)size / 100 | ( (double)CPUelapsedTime ) * 1.e-9 ); 101 | 102 | 103 | cudaEventDestroy(start); 104 | cudaEventDestroy(stop); 105 | 106 | /* verify the results */ 107 | 108 | double diff = abs( devResult - hostResult ); 109 | 110 | if( diff / abs(hostResult) < 0.001 ) printf("PASS\n"); 111 | else 112 | { 113 | printf("FAIL\n"); 114 | printf("Error is %f\n", diff / hostResult ); 115 | printf("GPU result is %f, CPU result is %f\n",devResult, hostResult ); 116 | } /* end else */ 117 | 118 | return 0; 119 | } /* end main */ 120 | 121 | -------------------------------------------------------------------------------- /exercise_solutions/cuda/reduction_thrust/kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "../debug.h" 24 | 25 | #define N ( 1 << 27 ) 26 | #define FLOATTYPE_T float 27 | 28 | int main(void) 29 | { 30 | int size = N; 31 | 32 | /* get GPU device number and name */ 33 | 34 | int dev; 35 | cudaDeviceProp deviceProp; 36 | checkCUDA( cudaGetDevice( &dev ) ); 37 | checkCUDA( cudaGetDeviceProperties( &deviceProp, dev ) ); 38 | printf("Using GPU %d: %s\n", dev, deviceProp.name ); 39 | 40 | /* create the host array */ 41 | thrust::host_vector h_vec( size ); 42 | 43 | /* generate random numbers on the host */ 44 | for( int i = 0; i < size; i++ ) 45 | { 46 | h_vec[i] = FLOATTYPE_T( rand() ) / ( FLOATTYPE_T (RAND_MAX) + 1.0 ); 47 | if( i % 2 == 0 ) h_vec[i] = -h_vec[i]; 48 | } 49 | 50 | /* transfer data to the device */ 51 | thrust::device_vector d_vec = h_vec; 52 | 53 | /* create timers */ 54 | cudaEvent_t start, stop; 55 | cudaEventCreate(&start); 56 | cudaEventCreate(&stop); 57 | 58 | cudaEventRecord( start, 0 ); 59 | 60 | /* reduce data on the device */ 61 | FLOATTYPE_T devResult = thrust::reduce( d_vec.begin(), d_vec.end() ); 62 | 63 | /* stop timers */ 64 | cudaEventRecord( stop, 0 ); 65 | cudaEventSynchronize( stop ); 66 | 67 | float GPUelapsedTime; 68 | cudaEventElapsedTime( &GPUelapsedTime, start, stop ); 69 | 70 | GPUelapsedTime /= 1000.0; 71 | 72 | /* print GPU timing data */ 73 | 74 | printf("Total elements is %d, %f GB\n", size, sizeof(FLOATTYPE_T) * 75 | (double)size * 1.e-9); 76 | printf("GPU total time is %f ms, bandwidth %f GB/s\n", GPUelapsedTime, 77 | sizeof(FLOATTYPE_T)*(double)size / 78 | ( (double)GPUelapsedTime ) * 1.e-9 ); 79 | 80 | /* start CPU timer */ 81 | cudaEventRecord( start, 0 ); 82 | 83 | /* reduce data on host */ 84 | FLOATTYPE_T hostResult = thrust::reduce(h_vec.begin(), h_vec.end() ); 85 | 86 | /* stop timers */ 87 | cudaEventRecord( stop, 0 ); 88 | cudaEventSynchronize( stop ); 89 | 90 | float CPUelapsedTime; 91 | 
cudaEventElapsedTime( &CPUelapsedTime, start, stop ); 92 | CPUelapsedTime /= 1000.0; 93 | 94 | /* print CPU timer */ 95 | 96 | printf("Total elements is %d, %f GB\n", size, sizeof(FLOATTYPE_T) * 97 | (double)size * 1.e-9); 98 | printf("CPU total time is %f ms, bandwidth %f GB/s\n", CPUelapsedTime, 99 | sizeof(FLOATTYPE_T)*(double)size / 100 | ( (double)CPUelapsedTime ) * 1.e-9 ); 101 | 102 | 103 | cudaEventDestroy(start); 104 | cudaEventDestroy(stop); 105 | 106 | /* verify the results */ 107 | 108 | double diff = abs( devResult - hostResult ); 109 | 110 | if( diff / abs(hostResult) < 0.001 ) printf("PASS\n"); 111 | else 112 | { 113 | printf("FAIL\n"); 114 | printf("Error is %f\n", diff / hostResult ); 115 | printf("GPU result is %f, CPU result is %f\n",devResult, hostResult ); 116 | } /* end else */ 117 | 118 | return 0; 119 | } /* end main */ 120 | 121 | --------------------------------------------------------------------------------