├── Lecture1 ├── HIP │ ├── jacobi │ │ ├── Norm.o │ │ ├── Input.o │ │ ├── JacobiRun.o │ │ ├── Laplacian.o │ │ ├── HaloExchange.o │ │ ├── JacobiMain.o │ │ ├── JacobiSetup.o │ │ ├── JacobiIteration.o │ │ ├── input.txt │ │ ├── run.sh │ │ ├── run_all.sh │ │ ├── env.sh │ │ ├── wrapper.sh │ │ ├── JacobiIteration.hip │ │ └── JacobiMain.hip │ ├── hip-stream │ │ ├── Makefile.titan │ │ ├── Makefile │ │ ├── README.md │ │ └── CMakeLists.txt │ ├── vectorAdd │ │ ├── hip_batch_frontier.sh │ │ ├── Makefile │ │ ├── hip_batch_lumi.sh │ │ ├── README │ │ └── README.frontier │ ├── vectorAdd_w_cpp_ext │ │ ├── Makefile │ │ ├── hip_batch_frontier.sh │ │ ├── hip_batch_lumi.sh │ │ ├── README │ │ ├── README.frontier │ │ └── CMakeLists.txt │ ├── vectorAdd_w_cu_ext │ │ ├── hip_batch_frontier.sh │ │ ├── Makefile │ │ ├── hip_batch_lumi.sh │ │ ├── README │ │ └── README.frontier │ ├── saxpy │ │ ├── README │ │ ├── Makefile │ │ └── CMakeLists.txt │ └── dgemm │ │ └── src │ │ ├── timer.h │ │ ├── args.h │ │ ├── dgemm.h │ │ ├── CMakeLists.txt │ │ ├── matrix.h │ │ ├── darray.h │ │ ├── serialize.h │ │ └── utils.cpp └── 01 Exercises for HIP Introduction.pdf ├── Lecture3 ├── 03 MemoryHierarchy.pdf ├── 03 Exercises for Occupancy.pdf └── Occupancy │ ├── Makefile │ ├── Occupancy_naive │ └── Makefile │ ├── Occupancy_shmem │ └── Makefile │ ├── Occupancy_shmem_A │ └── Makefile │ ├── Occupancy_shmem_batched │ └── Makefile │ ├── Occupancy_shmem_batched_unroll │ └── Makefile │ ├── run_maketests.sh │ └── run_cmaketests.sh ├── Lecture5 └── OmniperfExamples │ ├── README.md │ ├── 4-StridedAccess │ ├── striding.PNG │ ├── no_stride.PNG │ ├── exercise4_problem_roofline_fp32.png │ ├── exercise4_solution_roofline_fp32.png │ ├── exercise1_problem_kernelName_legend.png │ ├── exercise4_problem_roofline_int8_fp16.png │ ├── exercise4_solution_roofline_int8_fp16.png │ ├── Makefile │ ├── solution │ │ ├── Makefile │ │ ├── hipCheck.h │ │ └── solution.cpp │ ├── hipCheck.h │ └── problem.cpp │ ├── 5-AlgorithmicOptimizations │ ├── 
threadrows.PNG │ ├── wavefrontrow.PNG │ ├── exercise5_problem_roofline_fp32.png │ ├── exercise5_solution_roofline_fp32.png │ ├── exercise1_problem_kernelName_legend.png │ ├── exercise5_problem_roofline_int8_fp16.png │ ├── exercise5_solution_roofline_int8_fp16.png │ ├── Makefile │ ├── solution │ │ ├── Makefile │ │ ├── hipCheck.h │ │ └── solution.cpp │ ├── hipCheck.h │ └── problem.cpp │ ├── 1-LaunchParameters │ ├── exercise1_problem_roofline_fp32.png │ ├── exercise1_solution_roofline_fp32.png │ ├── exercise1_problem_kernelName_legend.png │ ├── exercise1_problem_roofline_int8_fp16.png │ ├── exercise1_solution_roofline_int8_fp16.png │ ├── Makefile │ ├── solution │ │ ├── Makefile │ │ ├── hipCheck.h │ │ └── solution.cpp │ ├── hipCheck.h │ └── problem.cpp │ ├── 2-LDSOccupancyLimit │ ├── exercise2_problem_roofline_fp32.png │ ├── exercise2_solution_roofline_fp32.png │ ├── exercise1_problem_kernelName_legend.png │ ├── exercise2_problem_roofline_int8_fp16.png │ ├── exercise2_solution_roofline_int8_fp16.png │ ├── Makefile │ ├── solution │ │ ├── Makefile │ │ ├── hipCheck.h │ │ └── solution.cpp │ ├── solution-no-lds │ │ ├── Makefile │ │ ├── hipCheck.h │ │ └── solution.cpp │ ├── hipCheck.h │ └── problem.cpp │ ├── 3-RegisterOccupancyLimit │ ├── exercise3_problem_roofline_fp32.png │ ├── exercise3_solution_roofline_fp32.png │ ├── exercise1_problem_kernelName_legend.png │ ├── exercise3_problem_roofline_int8_fp16.png │ ├── exercise3_solution_roofline_int8_fp16.png │ ├── Makefile │ ├── solution │ │ ├── Makefile │ │ ├── hipCheck.h │ │ └── solution.cpp │ ├── hipCheck.h │ └── problem.cpp │ ├── README-Perlmutter-users.md │ ├── LICENSE │ └── README-Frontier-users.md ├── Lecture4 ├── jacobi │ ├── input.txt │ ├── run.sh │ ├── rocprof_wrapper.sh │ ├── run_all.sh │ ├── env.sh │ ├── wrapper.sh │ ├── rocprof_wrapper.sh.old │ ├── JacobiIteration.hip │ └── JacobiMain.hip ├── 04 Performance_Timelines_Exercises.pdf └── saxpy │ ├── README │ ├── Makefile │ └── CMakeLists.txt ├── Lecture2 ├── HIPIFY │ 
├── Pennant-hip │ │ ├── doc │ │ │ ├── noh-result.png │ │ │ ├── pennantdoc.pdf │ │ │ ├── side-maps.png │ │ │ ├── mesh-entities.png │ │ │ └── sedov-result.png │ │ ├── test │ │ │ └── sedovbig │ │ │ │ └── sedovbig.pnt │ │ ├── src │ │ │ ├── PolyGas.cc │ │ │ ├── TTS.cc │ │ │ ├── HydroBC.cc │ │ │ ├── QCS.cc │ │ │ ├── WriteXY.hh │ │ │ ├── TTS.hh │ │ │ ├── PolyGas.hh │ │ │ ├── Memory.hh │ │ │ ├── ImportGMV.hh │ │ │ ├── main.cc │ │ │ ├── QCS.hh │ │ │ ├── HydroBC.hh │ │ │ ├── InputFile.hh │ │ │ ├── ExportGold.hh │ │ │ ├── Driver.hh │ │ │ ├── HydroGPU.hh │ │ │ ├── Parallel.hh │ │ │ └── WriteXY.cc │ │ ├── README │ │ ├── Makefile │ │ ├── LICENSE │ │ └── tools │ │ │ └── gmvrect.py │ ├── Pennant-orig │ │ ├── doc │ │ │ ├── side-maps.png │ │ │ ├── noh-result.png │ │ │ ├── pennantdoc.pdf │ │ │ ├── sedov-result.png │ │ │ └── mesh-entities.png │ │ ├── test │ │ │ └── sedovbig │ │ │ │ └── sedovbig.pnt │ │ ├── src │ │ │ ├── PolyGas.cc │ │ │ ├── TTS.cc │ │ │ ├── HydroBC.cc │ │ │ ├── QCS.cc │ │ │ ├── WriteXY.hh │ │ │ ├── TTS.hh │ │ │ ├── PolyGas.hh │ │ │ ├── Memory.hh │ │ │ ├── ImportGMV.hh │ │ │ ├── main.cc │ │ │ ├── QCS.hh │ │ │ ├── HydroBC.hh │ │ │ ├── InputFile.hh │ │ │ ├── ExportGold.hh │ │ │ ├── Driver.hh │ │ │ ├── HydroGPU.hh │ │ │ ├── Parallel.hh │ │ │ └── WriteXY.cc │ │ ├── README │ │ ├── LICENSE │ │ ├── tools │ │ │ └── gmvrect.py │ │ └── Makefile │ ├── mini-nbody │ │ ├── shmoo-cpu-nbody.sh │ │ ├── cuda │ │ │ ├── shmoo-cuda-nbody-soa.sh │ │ │ ├── shmoo-cuda-nbody-orig.sh │ │ │ ├── shmoo-cuda-nbody-ftz.sh │ │ │ ├── shmoo-cuda-nbody-block.sh │ │ │ ├── shmoo-cuda-nbody-unroll.sh │ │ │ └── nbody-batch-lumi.sh │ │ ├── mic │ │ │ ├── shmoo-mic-nbody-orig.sh │ │ │ ├── shmoo-mic-nbody-soa.sh │ │ │ ├── shmoo-mic-nbody-ftz.sh │ │ │ ├── shmoo-mic-nbody-align.sh │ │ │ └── shmoo-mic-nbody-block.sh │ │ ├── Exercise2_frontier.sbatch │ │ ├── Exercise2_perlmutter.sbatch │ │ ├── hip │ │ │ ├── HIP-nbody-orig.sh │ │ │ ├── HIP-nbody-soa.sh │ │ │ ├── HIP-nbody-block.sh │ │ │ └── Makefile │ │ ├── timer.h 
│ │ └── nbody.c │ ├── hipexamine-perl.sh │ ├── hipconvertinplace-perl.sh │ ├── Makefile.new │ ├── frontier_pennant_setup.sh │ ├── Makefile.allhipcc │ └── Makefile.twopath ├── 02 Exercises for Application Porting.pdf └── libexec │ └── hipify │ └── findcode.sh ├── README.md └── markdown2pdf.sh /Lecture1/HIP/jacobi/Norm.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture1/HIP/jacobi/Norm.o -------------------------------------------------------------------------------- /Lecture1/HIP/jacobi/Input.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture1/HIP/jacobi/Input.o -------------------------------------------------------------------------------- /Lecture1/HIP/jacobi/JacobiRun.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture1/HIP/jacobi/JacobiRun.o -------------------------------------------------------------------------------- /Lecture1/HIP/jacobi/Laplacian.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture1/HIP/jacobi/Laplacian.o -------------------------------------------------------------------------------- /Lecture3/03 MemoryHierarchy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture3/03 MemoryHierarchy.pdf -------------------------------------------------------------------------------- /Lecture1/HIP/jacobi/HaloExchange.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture1/HIP/jacobi/HaloExchange.o 
-------------------------------------------------------------------------------- /Lecture1/HIP/jacobi/JacobiMain.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture1/HIP/jacobi/JacobiMain.o -------------------------------------------------------------------------------- /Lecture1/HIP/jacobi/JacobiSetup.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture1/HIP/jacobi/JacobiSetup.o -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/README.md: -------------------------------------------------------------------------------- 1 | # OmniperfExamples 2 | Repo for developing examples for the HIP Lecture Series at ORNL 3 | -------------------------------------------------------------------------------- /Lecture1/HIP/jacobi/JacobiIteration.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture1/HIP/jacobi/JacobiIteration.o -------------------------------------------------------------------------------- /Lecture3/03 Exercises for Occupancy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture3/03 Exercises for Occupancy.pdf -------------------------------------------------------------------------------- /Lecture4/jacobi/input.txt: -------------------------------------------------------------------------------- 1 | pmc : WriteSize FetchSize VALUUtilization VALUBusy Wavefronts 2 | # pmc : L2CacheHit LDSBankConflict ALUStalledByLDS 3 | -------------------------------------------------------------------------------- /Lecture1/01 Exercises for HIP Introduction.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture1/01 Exercises for HIP Introduction.pdf -------------------------------------------------------------------------------- /Lecture1/HIP/jacobi/input.txt: -------------------------------------------------------------------------------- 1 | pmc : WriteSize FetchSize VALUUtilization VALUBusy Wavefronts 2 | # pmc : L2CacheHit LDSBankConflict ALUStalledByLDS 3 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/doc/noh-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture2/HIPIFY/Pennant-hip/doc/noh-result.png -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/doc/pennantdoc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture2/HIPIFY/Pennant-hip/doc/pennantdoc.pdf -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/doc/side-maps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture2/HIPIFY/Pennant-hip/doc/side-maps.png -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/doc/side-maps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture2/HIPIFY/Pennant-orig/doc/side-maps.png -------------------------------------------------------------------------------- /Lecture2/02 Exercises for Application Porting.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture2/02 Exercises for Application Porting.pdf -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/doc/mesh-entities.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture2/HIPIFY/Pennant-hip/doc/mesh-entities.png -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/doc/sedov-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture2/HIPIFY/Pennant-hip/doc/sedov-result.png -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/doc/noh-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture2/HIPIFY/Pennant-orig/doc/noh-result.png -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/doc/pennantdoc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture2/HIPIFY/Pennant-orig/doc/pennantdoc.pdf -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/doc/sedov-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture2/HIPIFY/Pennant-orig/doc/sedov-result.png -------------------------------------------------------------------------------- /Lecture4/04 Performance_Timelines_Exercises.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture4/04 Performance_Timelines_Exercises.pdf -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/doc/mesh-entities.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture2/HIPIFY/Pennant-orig/doc/mesh-entities.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/4-StridedAccess/striding.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/4-StridedAccess/striding.PNG -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/4-StridedAccess/no_stride.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/4-StridedAccess/no_stride.PNG -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/threadrows.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/threadrows.PNG -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/wavefrontrow.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/wavefrontrow.PNG 
-------------------------------------------------------------------------------- /Lecture1/HIP/hip-stream/Makefile.titan: -------------------------------------------------------------------------------- 1 | CC=gcc 2 | ARCH=sm_35 3 | 4 | stream : stream.cpp 5 | hipcc -std=c++11 -ccbin=$(CC) stream.cpp -arch=$(ARCH) -o stream 6 | 7 | 8 | clean : 9 | rm -f stream 10 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/4-StridedAccess/exercise4_problem_roofline_fp32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/4-StridedAccess/exercise4_problem_roofline_fp32.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/4-StridedAccess/exercise4_solution_roofline_fp32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/4-StridedAccess/exercise4_solution_roofline_fp32.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/1-LaunchParameters/exercise1_problem_roofline_fp32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/1-LaunchParameters/exercise1_problem_roofline_fp32.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/1-LaunchParameters/exercise1_solution_roofline_fp32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/1-LaunchParameters/exercise1_solution_roofline_fp32.png 
-------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/2-LDSOccupancyLimit/exercise2_problem_roofline_fp32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/2-LDSOccupancyLimit/exercise2_problem_roofline_fp32.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/4-StridedAccess/exercise1_problem_kernelName_legend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/4-StridedAccess/exercise1_problem_kernelName_legend.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/1-LaunchParameters/exercise1_problem_kernelName_legend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/1-LaunchParameters/exercise1_problem_kernelName_legend.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/2-LDSOccupancyLimit/exercise2_solution_roofline_fp32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/2-LDSOccupancyLimit/exercise2_solution_roofline_fp32.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/4-StridedAccess/exercise4_problem_roofline_int8_fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/4-StridedAccess/exercise4_problem_roofline_int8_fp16.png 
-------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/4-StridedAccess/exercise4_solution_roofline_int8_fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/4-StridedAccess/exercise4_solution_roofline_int8_fp16.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/1-LaunchParameters/exercise1_problem_roofline_int8_fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/1-LaunchParameters/exercise1_problem_roofline_int8_fp16.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/1-LaunchParameters/exercise1_solution_roofline_int8_fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/1-LaunchParameters/exercise1_solution_roofline_int8_fp16.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/2-LDSOccupancyLimit/exercise1_problem_kernelName_legend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/2-LDSOccupancyLimit/exercise1_problem_kernelName_legend.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/2-LDSOccupancyLimit/exercise2_problem_roofline_int8_fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/2-LDSOccupancyLimit/exercise2_problem_roofline_int8_fp16.png 
-------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/exercise3_problem_roofline_fp32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/exercise3_problem_roofline_fp32.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/2-LDSOccupancyLimit/exercise2_solution_roofline_int8_fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/2-LDSOccupancyLimit/exercise2_solution_roofline_int8_fp16.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/exercise3_solution_roofline_fp32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/exercise3_solution_roofline_fp32.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/exercise5_problem_roofline_fp32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/exercise5_problem_roofline_fp32.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/exercise5_solution_roofline_fp32.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/exercise5_solution_roofline_fp32.png -------------------------------------------------------------------------------- /Lecture4/jacobi/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | set -euo pipefail 4 | 5 | source env.sh 6 | 7 | if [[ $DEPRICATED != 0 ]]; then 8 | ./wrapper.sh 9 | else 10 | mpirun -np $((${NGPUS} * ${NPROC_PER_GPU})) ./wrapper.sh 11 | fi 12 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/exercise1_problem_kernelName_legend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/exercise1_problem_kernelName_legend.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/exercise3_problem_roofline_int8_fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/exercise3_problem_roofline_int8_fp16.png -------------------------------------------------------------------------------- /Lecture1/HIP/jacobi/run.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | set -euo pipefail 4 | 5 | source env.sh 6 | 7 | if [[ $DEPRICATED != 0 ]]; then 8 | ./wrapper.sh 9 | else 10 | mpirun -np $((${NGPUS} * ${NPROC_PER_GPU})) ./wrapper.sh 11 | fi 12 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/exercise3_solution_roofline_int8_fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/exercise3_solution_roofline_int8_fp16.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/exercise1_problem_kernelName_legend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/exercise1_problem_kernelName_legend.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/exercise5_problem_roofline_int8_fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/exercise5_problem_roofline_int8_fp16.png -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/exercise5_solution_roofline_int8_fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/olcf/hip-training-series/HEAD/Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/exercise5_solution_roofline_int8_fp16.png -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/shmoo-cpu-nbody.sh: 
-------------------------------------------------------------------------------- 1 | SRC=nbody.c 2 | EXE=nbody 3 | gcc -std=c99 -O3 -fopenmp -DSHMOO -o $EXE $SRC -lm 4 | 5 | echo $EXE 6 | 7 | K=1024 8 | for i in {1..10} 9 | do 10 | ./$EXE $K 11 | K=$(($K*2)) 12 | done 13 | 14 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/cuda/shmoo-cuda-nbody-soa.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-soa.cu 2 | EXE=nbody-soa 3 | 4 | nvcc -arch=sm_35 -I../ -DSHMOO -o $EXE $SRC 5 | 6 | echo $EXE 7 | 8 | K=1024 9 | for i in {1..10} 10 | do 11 | ./$EXE $K 12 | K=$(($K*2)) 13 | done 14 | 15 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/cuda/shmoo-cuda-nbody-orig.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-orig.cu 2 | EXE=nbody-orig 3 | 4 | nvcc -arch=sm_35 -I../ -DSHMOO -o $EXE $SRC 5 | 6 | echo $EXE 7 | 8 | K=1024 9 | for i in {1..10} 10 | do 11 | ./$EXE $K 12 | K=$(($K*2)) 13 | done 14 | 15 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/cuda/shmoo-cuda-nbody-ftz.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-soa.cu 2 | EXE=nbody-ftz 3 | 4 | nvcc -arch=sm_35 -ftz=true -I../ -o $EXE $SRC -DSHMOO 5 | 6 | echo $EXE 7 | 8 | K=1024 9 | for i in {1..10} 10 | do 11 | ./$EXE $K 12 | K=$(($K*2)) 13 | done 14 | 15 | -------------------------------------------------------------------------------- /Lecture4/jacobi/rocprof_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | outdir=$1 4 | name=$2 5 | outdir="${outdir}_${OMPI_COMM_WORLD_RANK}" 6 | outfile="${name}_${OMPI_COMM_WORLD_RANK}.csv" 7 | rocprof -d ${outdir} --hsa-trace --hip-trace -o ${outdir}/${outfile} 
./Jacobi_hip $3 $4 8 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/cuda/shmoo-cuda-nbody-block.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-block.cu 2 | EXE=nbody-block 3 | 4 | nvcc -arch=sm_35 -ftz=true -I../ -o $EXE $SRC -DSHMOO 5 | 6 | echo $EXE 7 | 8 | K=1024 9 | for i in {1..10} 10 | do 11 | ./$EXE $K 12 | K=$(($K*2)) 13 | done 14 | 15 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/cuda/shmoo-cuda-nbody-unroll.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-unroll.cu 2 | EXE=nbody-unroll 3 | 4 | nvcc -arch=sm_35 -ftz=true -I../ -o $EXE $SRC -DSHMOO 5 | 6 | echo $EXE 7 | 8 | K=1024 9 | for i in {1..10} 10 | do 11 | ./$EXE $K 12 | K=$(($K*2)) 13 | done 14 | 15 | -------------------------------------------------------------------------------- /Lecture4/jacobi/run_all.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | set -euo pipefail 4 | 5 | source env.sh 6 | 7 | FLAGS=("--timestamp on" "-i input.txt" "--roctx-trace" "--hsa-trace" "--hip-trace" "--stats") 8 | for FLAG in "${FLAGS[@]}"; do 9 | export ROCPROF_FLAGS="$FLAG" 10 | eval ./run.sh 11 | done 12 | -------------------------------------------------------------------------------- /Lecture1/HIP/jacobi/run_all.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | set -euo pipefail 4 | 5 | source env.sh 6 | 7 | FLAGS=("--timestamp on" "-i input.txt" "--roctx-trace" "--hsa-trace" "--hip-trace" "--stats") 8 | for FLAG in "${FLAGS[@]}"; do 9 | export ROCPROF_FLAGS="$FLAG" 10 | eval ./run.sh 11 | done 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Exercises for each Lecture are in the directory Lecture<#>. A markdown document 2 | is in the top-level directory for each lecture with the instructions for the 3 | exercises. To generate a more readable PDF of the document, run the 4 | markdown conversion script: 5 | 6 | markdown2pdf.sh '01 Exercises for HIP Introduction.md' 7 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/test/sedovbig/sedovbig.pnt: -------------------------------------------------------------------------------- 1 | tstop 1.0 2 | meshtype rect 3 | meshparams 540 540 1.125 1.125 4 | subregion 0.0 0.025 0.0 0.025 5 | rinitsub 1.0 6 | einitsub 5027.7 7 | bcx 0.0 1.125 8 | bcy 0.0 1.125 9 | ssmin 0.1 10 | q1 0.1 11 | q2 1.0 12 | dtinit 2.5e-6 13 | dtreport 100 14 | chunksize 512 15 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/test/sedovbig/sedovbig.pnt: -------------------------------------------------------------------------------- 1 | tstop 1.0 2 | meshtype rect 3 | meshparams 540 540 1.125 1.125 4 | subregion 0.0 0.025 0.0 0.025 5 | rinitsub 1.0 6 | einitsub 5027.7 7 | bcx 0.0 1.125 8 | bcy 0.0 1.125 9 | ssmin 0.1 10 | q1 0.1 11 | q2 1.0 12 | dtinit 2.5e-6 13 | dtreport 100 14 | chunksize 512 15 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/hipexamine-perl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #usage : 
hipexamine-perl.sh DIRNAME [hipify-perl options] 4 | 5 | # Generate HIP stats (LOC, CUDA->API conversions, missing functionality) for all the code files 6 | # in the specified directory. 7 | 8 | 9 | SCRIPT_DIR=`dirname $0` 10 | PRIV_SCRIPT_DIR="$SCRIPT_DIR/../libexec/hipify" 11 | SEARCH_DIR=$1 12 | shift 13 | $SCRIPT_DIR/hipify-perl -no-output -print-stats "$@" `$PRIV_SCRIPT_DIR/findcode.sh $SEARCH_DIR` 14 | -------------------------------------------------------------------------------- /Lecture4/jacobi/env.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | # Environment defaults for the Jacobi profiling scripts; meant to be sourced (run_all.sh and wrapper.sh do so). 3 | 4 | set -euo pipefail 5 | 6 | # Each export below keeps a non-empty value already present in the environment. 7 | export NGPUS=${NGPUS:-1} 8 | export NPROC_PER_GPU=${NPROC_PER_GPU:-1} 9 | export DEPRICATED=${DEPRICATED:-0} 10 | 11 | export ROCPROF_FLAGS=${ROCPROF_FLAGS:-} 12 | export ROCPROF_HOME=${ROCPROF_HOME:-/opt/rocm/bin} 13 | 14 | # Derive MPI_RANK from whichever launcher variable is set. 15 | if [[ -n ${OMPI_COMM_WORLD_RANK+z} ]]; then 16 | # Open MPI 17 | export MPI_RANK=${OMPI_COMM_WORLD_RANK} 18 | elif [[ -n ${MV2_COMM_WORLD_RANK+z} ]]; then 19 | # MVAPICH2 20 | export MPI_RANK=${MV2_COMM_WORLD_RANK} 21 | else 22 | # No recognised launcher variable: keep an inherited MPI_RANK, otherwise use 0, so scripts running under `set -u` can still expand ${MPI_RANK}. 23 | export MPI_RANK=${MPI_RANK:-0} 24 | fi 25 | -------------------------------------------------------------------------------- /Lecture1/HIP/jacobi/env.sh: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env bash 2 | # Environment defaults for the Jacobi profiling scripts; meant to be sourced (run_all.sh and wrapper.sh do so). 3 | 4 | set -euo pipefail 5 | 6 | # Each export below keeps a non-empty value already present in the environment. 7 | export NGPUS=${NGPUS:-1} 8 | export NPROC_PER_GPU=${NPROC_PER_GPU:-1} 9 | export DEPRICATED=${DEPRICATED:-0} 10 | 11 | export ROCPROF_FLAGS=${ROCPROF_FLAGS:-} 12 | export ROCPROF_HOME=${ROCPROF_HOME:-/opt/rocm/bin} 13 | 14 | # Derive MPI_RANK from whichever launcher variable is set. 15 | if [[ -n ${OMPI_COMM_WORLD_RANK+z} ]]; then 16 | # Open MPI 17 | export MPI_RANK=${OMPI_COMM_WORLD_RANK} 18 | elif [[ -n ${MV2_COMM_WORLD_RANK+z} ]]; then 19 | # MVAPICH2 20 | export MPI_RANK=${MV2_COMM_WORLD_RANK} 21 | else 22 | # No recognised launcher variable: keep an inherited MPI_RANK, otherwise use 0, so scripts running under `set -u` can still expand ${MPI_RANK}. 23 | export MPI_RANK=${MPI_RANK:-0} 24 | fi 25 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/mic/shmoo-mic-nbody-orig.sh: -------------------------------------------------------------------------------- 1 | SRC=../nbody-orig.c 2 | EXE=nbody-orig-mic 3 | MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic 4 | MIC=mic0 5 | if [ $# -gt 0 ] 6 | then 7 | MIC=$1 8 | fi 9 | 10 | icc -std=c99 -openmp -mmic -DSHMOO -o $EXE $SRC 11 | 12 | scp $EXE $MIC:~/ 13 | scp $MICROOT/libiomp5.so $MIC:~/ 14 | 15 | echo $EXE 16 | 17 | K=1024 18 | for i in {1..10} 19 | do 20 | ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K" 21 | K=$(($K*2)) 22 | done 23 | 24 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/mic/shmoo-mic-nbody-soa.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-soa.c 2 | EXE=nbody-soa-mic 3 | MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic 4 | MIC=mic0 5 | if [ $# -gt 0 ] 6 | then 7 | MIC=$1 8 | fi 9 | 10 | icc -std=c99 -openmp -mmic -DSHMOO -I../ -o $EXE $SRC 11 | 12 | scp $EXE $MIC:~/ 13 | scp $MICROOT/libiomp5.so $MIC:~/ 14 | 15 | echo $EXE 16 | 17 | K=1024 18 | for i in {1..10} 19 | do 20 | ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K" 21 | K=$(($K*2)) 22 | done 23 | 24 | -------------------------------------------------------------------------------- /Lecture1/HIP/jacobi/wrapper.sh: 
-------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | set -euo pipefail 4 | pid="$$" 5 | 6 | source env.sh 7 | 8 | pid="$$" 9 | outdir="rank_${pid}_${MPI_RANK}" 10 | outfile="results_${pid}_${MPI_RANK}.csv" 11 | 12 | source env.sh 13 | 14 | if [[ $DEPRICATED != 0 ]]; then 15 | ${ROCPROF_HOME}/rocprof ${ROCPROF_FLAGS} mpirun -np $((${NGPUS} * ${NPROC_PER_GPU})) ./Jacobi_hip -g ${NPROC_PER_GPU} ${NGPUS} 16 | else 17 | ${ROCPROF_HOME}/rocprof ${ROCPROF_FLAGS} -d ${outdir} -o ${outdir}/${outfile} ./Jacobi_hip -g ${NPROC_PER_GPU} ${NGPUS} 18 | fi 19 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/Exercise2_frontier.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -N 1 3 | #SBATCH --ntasks=1 4 | #SBATCH --gpus=1 5 | #SBATCH -p batch 6 | #SBATCH -t 00:10:00 7 | #SBATCH -A 8 | #####SBATCH --reservation 9 | 10 | module load PrgEnv-amd 11 | module load amd 12 | module load cmake 13 | 14 | cd $HOME/hip-training-series/Lecture2/HIPIFY/mini-nbody/cuda 15 | hipify-perl -print-stats nbody-orig.cu > nbody-orig.cpp 16 | hipcc -DSHMOO -I ../ nbody-orig.cpp -o nbody-orig 17 | srun ./nbody-orig 18 | cd ../../.. 19 | -------------------------------------------------------------------------------- /Lecture4/jacobi/wrapper.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | set -euo pipefail 4 | pid="$$" 5 | 6 | source env.sh 7 | 8 | pid="$$" 9 | outdir="rank_${pid}_${MPI_RANK}" 10 | outfile="results_${pid}_${MPI_RANK}.csv" 11 | 12 | source env.sh 13 | 14 | if [[ $DEPRICATED != 0 ]]; then 15 | ${ROCPROF_HOME}/rocprof ${ROCPROF_FLAGS} mpirun -np $((${NGPUS} * ${NPROC_PER_GPU})) ./Jacobi_hip -g ${NPROC_PER_GPU} ${NGPUS} 16 | else 17 | ${ROCPROF_HOME}/rocprof ${ROCPROF_FLAGS} -d ${outdir} -o ${outdir}/${outfile} ./Jacobi_hip -g ${NPROC_PER_GPU} ${NGPUS} 18 | fi 19 | -------------------------------------------------------------------------------- /markdown2pdf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Convert one markdown file to a PDF with pandoc; the PDF is written alongside the input with the extension replaced by .pdf. 3 | 4 | # Every failed check prints the usage text and exits non-zero, so pandoc never runs on bad input. 5 | if [ $# -ne 1 ] 6 | then 7 | echo "Requires one argument with filename of markdown file" 8 | echo "Usage: ./markdown2pdf.sh test.md" 9 | exit 1 10 | fi 11 | 12 | if [ -z "$1" ] 13 | then 14 | echo "Requires one argument with filename of markdown file" 15 | echo "Usage: ./markdown2pdf.sh test.md" 16 | exit 1 17 | fi 18 | 19 | if [ ! 
-f "$1" ] 20 | then 21 | echo "file $1 does not exist" 22 | echo "Usage: ./markdown2pdf.sh test.md" 23 | exit 1 24 | fi 25 | 26 | # Drop the last extension to form the output base name. 27 | filename="${1%.*}" 28 | pandoc "$1" -V geometry:margin=1in --toc -s -o "$filename.pdf" 29 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/mic/shmoo-mic-nbody-ftz.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-soa.c 2 | EXE=nbody-ftz-mic 3 | MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic 4 | MIC=mic0 5 | if [ $# -gt 0 ] 6 | then 7 | MIC=$1 8 | fi 9 | 10 | icc -std=c99 -openmp -mmic -fimf-domain-exclusion=8 -DSHMOO -I../ -o $EXE $SRC 11 | 12 | scp $EXE $MIC:~/ 13 | scp $MICROOT/libiomp5.so $MIC:~/ 14 | 15 | echo $EXE 16 | 17 | K=1024 18 | for i in {1..10} 19 | do 20 | ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K" 21 | K=$(($K*2)) 22 | done 23 | 24 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/mic/shmoo-mic-nbody-align.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-align.c 2 | EXE=nbody-align-mic 3 | MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic 4 | MIC=mic0 5 | if [ $# -gt 0 ] 6 | then 7 | MIC=$1 8 | fi 9 | 10 | icc -std=c99 -openmp -mmic -fimf-domain-exclusion=8 -DSHMOO -I../ -o $EXE $SRC 11 | 12 | scp $EXE $MIC:~/ 13 | scp $MICROOT/libiomp5.so $MIC:~/ 14 | 15 | echo $EXE 16 | 17 | K=1024 18 | for i in {1..10} 19 | do 20 | ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K" 21 | K=$(($K*2)) 22 | done 23 | 24 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/mic/shmoo-mic-nbody-block.sh: -------------------------------------------------------------------------------- 1 | SRC=nbody-block.c 2 | EXE=nbody-block-mic 3 | MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic 4 | MIC=mic0 5 | if 
[ $# -gt 0 ] 6 | then 7 | MIC=$1 8 | fi 9 | 10 | icc -std=c99 -openmp -mmic -fimf-domain-exclusion=8 -DSHMOO -I../ -o $EXE $SRC 11 | 12 | scp $EXE $MIC:~/ 13 | scp $MICROOT/libiomp5.so $MIC:~/ 14 | 15 | echo $EXE 16 | 17 | K=1024 18 | for i in {1..10} 19 | do 20 | ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K" 21 | K=$(($K*2)) 22 | done 23 | 24 | -------------------------------------------------------------------------------- /Lecture2/libexec/hipify/findcode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEARCH_DIRS=$@ 4 | 5 | find $SEARCH_DIRS -name '*.cu' -o -name '*.CU' 6 | find $SEARCH_DIRS -name '*.cpp' -o -name '*.cxx' -o -name '*.c' -o -name '*.cc' 7 | find $SEARCH_DIRS -name '*.CPP' -o -name '*.CXX' -o -name '*.C' -o -name '*.CC' 8 | find $SEARCH_DIRS -name '*.cuh' -o -name '*.CUH' 9 | find $SEARCH_DIRS -name '*.h' -o -name '*.hpp' -o -name '*.inc' -o -name '*.inl' -o -name '*.hxx' -o -name '*.hdl' 10 | find $SEARCH_DIRS -name '*.H' -o -name '*.HPP' -o -name '*.INC' -o -name '*.INL' -o -name '*.HXX' -o -name '*.HDL' 11 | -------------------------------------------------------------------------------- /Lecture1/HIP/vectorAdd/hip_batch_frontier.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p batch 3 | #SBATCH -N 1 4 | #SBATCH --gpus=1 5 | #SBATCH -t 10:00 6 | #SBATCH -A 7 | 8 | module load PrgEnv-amd 9 | module load amd 10 | # needed for cmake builds 11 | module load cmake 12 | cd $HOME/HPCTrainingExamples/HIP/vectorAdd 13 | 14 | # portable Makefile system 15 | 16 | make vectoradd 17 | # Run the vectoradd application 18 | srun ./vectoradd 19 | # cleanup 20 | make clean 21 | 22 | # portable cmake system 23 | 24 | mkdir build && cd build 25 | cmake .. 
26 | make 27 | 28 | # Run the vectoradd application 29 | srun ./vectoradd 30 | -------------------------------------------------------------------------------- /Lecture1/HIP/vectorAdd_w_cpp_ext/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./vectoradd 2 | all: $(EXECUTABLE) test 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = vectoradd.o 7 | 8 | CXX=hipcc 9 | CXXFLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | CXXFLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | CXXFLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | $(EXECUTABLE): $(OBJECTS) 22 | hipcc $< $(LDFLAGS) -o $@ 23 | 24 | test: $(EXECUTABLE) 25 | $(EXECUTABLE) 26 | 27 | clean: 28 | rm -rf $(EXECUTABLE) $(OBJECTS) build 29 | -------------------------------------------------------------------------------- /Lecture1/HIP/vectorAdd_w_cpp_ext/hip_batch_frontier.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p batch 3 | #SBATCH -N 1 4 | #SBATCH --gpus=1 5 | #SBATCH -t 10:00 6 | #SBATCH -A 7 | 8 | module load PrgEnv-amd 9 | module load amd 10 | # needed for cmake builds 11 | module load cmake 12 | cd $HOME/HPCTrainingExamples/HIP/vectorAdd 13 | 14 | # portable Makefile system 15 | 16 | make vectoradd 17 | # Run the vectoradd application 18 | srun ./vectoradd 19 | # cleanup 20 | make clean 21 | 22 | # portable cmake system 23 | 24 | mkdir build && cd build 25 | cmake .. 
26 | make 27 | 28 | # Run the vectoradd application 29 | srun ./vectoradd 30 | -------------------------------------------------------------------------------- /Lecture1/HIP/vectorAdd_w_cu_ext/hip_batch_frontier.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p batch 3 | #SBATCH -N 1 4 | #SBATCH --gpus=1 5 | #SBATCH -t 10:00 6 | #SBATCH -A 7 | 8 | module load PrgEnv-amd 9 | module load amd 10 | # needed for cmake builds 11 | module load cmake 12 | cd $HOME/HPCTrainingExamples/HIP/vectorAdd 13 | 14 | # portable Makefile system 15 | 16 | make vectoradd 17 | # Run the vectoradd application 18 | srun ./vectoradd 19 | # cleanup 20 | make clean 21 | 22 | # portable cmake system 23 | 24 | mkdir build && cd build 25 | cmake .. 26 | make 27 | 28 | # Run the vectoradd application 29 | srun ./vectoradd 30 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/src/PolyGas.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * PolyGas.cc 3 | * 4 | * Created on: Mar 26, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #include "PolyGas.hh" 14 | 15 | #include "InputFile.hh" 16 | 17 | using namespace std; 18 | 19 | 20 | PolyGas::PolyGas(const InputFile* inp, Hydro* h) : hydro(h) { 21 | gamma = inp->getDouble("gamma", 5. 
/ 3.); 22 | ssmin = inp->getDouble("ssmin", 0.); 23 | 24 | } 25 | 26 | 27 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/src/PolyGas.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * PolyGas.cc 3 | * 4 | * Created on: Mar 26, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #include "PolyGas.hh" 14 | 15 | #include "InputFile.hh" 16 | 17 | using namespace std; 18 | 19 | 20 | PolyGas::PolyGas(const InputFile* inp, Hydro* h) : hydro(h) { 21 | gamma = inp->getDouble("gamma", 5. / 3.); 22 | ssmin = inp->getDouble("ssmin", 0.); 23 | 24 | } 25 | 26 | 27 | -------------------------------------------------------------------------------- /Lecture3/Occupancy/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./occupancy_mxv 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = occupancy_mxv.o 7 | 8 | CXX=hipcc 9 | CXXFLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | CXXFLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | CXXFLAGS += -x hip -munsafe-fp-atomics -Rpass-analysis=kernel-resource-usage 19 | endif 20 | 21 | $(EXECUTABLE): $(OBJECTS) 22 | hipcc $< $(LDFLAGS) -o $@ 23 | 24 | test: $(EXECUTABLE) 25 | $(EXECUTABLE) 26 | 27 | clean: 28 | rm -rf $(EXECUTABLE) $(OBJECTS) build 29 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/src/TTS.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * TTS.cc 3 | * 4 | * Created on: Feb 2, 2012 5 | * Author: 
cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #include "TTS.hh" 14 | 15 | #include "InputFile.hh" 16 | 17 | using namespace std; 18 | 19 | 20 | TTS::TTS(const InputFile* inp, Hydro* h) : hydro(h) { 21 | alfa = inp->getDouble("alfa", 0.5); 22 | ssmin = inp->getDouble("ssmin", 0.); 23 | 24 | } 25 | 26 | 27 | TTS::~TTS() {} 28 | 29 | 30 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/src/TTS.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * TTS.cc 3 | * 4 | * Created on: Feb 2, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 
11 | */ 12 | 13 | #include "TTS.hh" 14 | 15 | #include "InputFile.hh" 16 | 17 | using namespace std; 18 | 19 | 20 | TTS::TTS(const InputFile* inp, Hydro* h) : hydro(h) { 21 | alfa = inp->getDouble("alfa", 0.5); 22 | ssmin = inp->getDouble("ssmin", 0.); 23 | 24 | } 25 | 26 | 27 | TTS::~TTS() {} 28 | 29 | 30 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/Exercise2_perlmutter.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -N 1 3 | #SBATCH -q shared 4 | #SBATCH -C gpu 5 | #SBATCH -c 32 6 | #SBATCH -G 1 7 | #SBATCH -t 00:30:00 8 | #SBATCH -A 9 | ####SBATCH ---reservation 10 | 11 | module load PrgEnv-gnu/8.3.3 12 | module load hip/5.4.3 13 | module load PrgEnv-nvidia/8.3.3 14 | module load cmake 15 | 16 | export PATH=${PATH}:${HIP_PATH} 17 | 18 | cd ~/hip-training-series/Lecture2/HIPIFY/mini-nbody/cuda 19 | 20 | ../../hipify-perl -print-stats nbody-orig.cu > nbody-orig.cpp 21 | hipcc -DSHMOO -I ../ nbody-orig.cpp -o nbody-orig 22 | srun ./nbody-orig 23 | 24 | rm nbody-orig nbody-orig.cpp 25 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/src/HydroBC.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * HydroBC.cc 3 | * 4 | * Created on: Jan 13, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 
11 | */ 12 | 13 | #include "HydroBC.hh" 14 | 15 | #include "Vec2.hh" 16 | 17 | using namespace std; 18 | 19 | 20 | HydroBC::HydroBC( 21 | Mesh* msh, 22 | const double2 v, 23 | const vector& mbp) 24 | : mesh(msh), numb(mbp.size()), vfix(v) {} 25 | 26 | 27 | HydroBC::~HydroBC() {} 28 | 29 | 30 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/hip/HIP-nbody-orig.sh: -------------------------------------------------------------------------------- 1 | #Hipify the original cuda source code to hip compatible code 2 | #hipify ../cuda/nbody-orig.cu > nbody-orig.hip 3 | 4 | #compile the hipified source code into executable 5 | if [ -f nbody-orig ] 6 | then 7 | rm nbody-orig 8 | fi 9 | 10 | echo hipcc -I../ -DSHMOO nbody-orig.hip -o nbody-orig 11 | $ROCM_PATH/bin/hipcc -I../ -DSHMOO nbody-orig.hip -o nbody-orig 12 | 13 | #To print our more details, remove flag 14 | #hipcc -I../ nbody-orig.hip -o nbody-orig 15 | 16 | #execute the program 17 | 18 | EXE=nbody-orig 19 | K=1024 20 | for i in {1..10} 21 | do 22 | echo ./$EXE $K 23 | ./$EXE $K 24 | K=$(($K*2)) 25 | done 26 | 27 | -------------------------------------------------------------------------------- /Lecture4/saxpy/README: -------------------------------------------------------------------------------- 1 | Adding portable makefiles and cmake builds 2 | 3 | For ROCm environment 4 | module load rocm 5 | module load cmake 6 | export CXX=${ROCM_PATH}/llvm/bin/clang++ 7 | 8 | For ROCm with make 9 | make 10 | 11 | For ROCm with cmake 12 | mkdir build && cd build 13 | cmake .. 14 | make VERBOSE=1 15 | ./saxpy 16 | ctest 17 | 18 | For CUDA environment 19 | module load rocm 20 | module load CUDA/11.8 21 | module load cmake 22 | 23 | For CUDA with make 24 | HIP_PLATFORM=nvidia make 25 | 26 | For CUDA with cmake 27 | mkdir build && cd build 28 | cmake -DCMAKE_GPU_RUNTIME=CUDA .. 
29 | make VERBOSE=1 30 | ./saxpy 31 | ctest 32 | -------------------------------------------------------------------------------- /Lecture1/HIP/saxpy/README: -------------------------------------------------------------------------------- 1 | Adding portable makefiles and cmake builds 2 | 3 | For ROCm environment 4 | module load rocm 5 | module load cmake 6 | export CXX=${ROCM_PATH}/llvm/bin/clang++ 7 | 8 | For ROCm with make 9 | make 10 | 11 | For ROCm with cmake 12 | mkdir build && cd build 13 | cmake .. 14 | make VERBOSE=1 15 | ./saxpy 16 | ctest 17 | 18 | For CUDA environment 19 | module load rocm 20 | module load CUDA/11.8 21 | module load cmake 22 | 23 | For CUDA with make 24 | HIP_PLATFORM=nvidia make 25 | 26 | For CUDA with cmake 27 | mkdir build && cd build 28 | cmake -DCMAKE_GPU_RUNTIME=CUDA .. 29 | make VERBOSE=1 30 | ./saxpy 31 | ctest 32 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/src/HydroBC.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * HydroBC.cc 3 | * 4 | * Created on: Jan 13, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 
11 | */ 12 | 13 | #include "HydroBC.hh" 14 | 15 | #include "Vec2.hh" 16 | 17 | using namespace std; 18 | 19 | 20 | HydroBC::HydroBC( 21 | Mesh* msh, 22 | const double2 v, 23 | const vector& mbp) 24 | : mesh(msh), numb(mbp.size()), vfix(v) {} 25 | 26 | 27 | HydroBC::~HydroBC() {} 28 | 29 | 30 | -------------------------------------------------------------------------------- /Lecture3/Occupancy/Occupancy_naive/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./occupancy_mxv_naive 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = occupancy_mxv_naive.o 7 | 8 | CXX=hipcc 9 | CXXFLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | CXXFLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | CXXFLAGS += -x hip -munsafe-fp-atomics -Rpass-analysis=kernel-resource-usage 19 | endif 20 | 21 | $(EXECUTABLE): $(OBJECTS) 22 | hipcc $< $(LDFLAGS) -o $@ 23 | 24 | test: $(EXECUTABLE) 25 | $(EXECUTABLE) 26 | 27 | clean: 28 | @rm -rf $(EXECUTABLE) $(OBJECTS) build 29 | -------------------------------------------------------------------------------- /Lecture3/Occupancy/Occupancy_shmem/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./occupancy_mxv_shmem 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = occupancy_mxv_shmem.o 7 | 8 | CXX=hipcc 9 | CXXFLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | CXXFLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | CXXFLAGS += -x hip -munsafe-fp-atomics -Rpass-analysis=kernel-resource-usage 19 | endif 20 | 21 | $(EXECUTABLE): $(OBJECTS) 22 | hipcc $< $(LDFLAGS) -o $@ 23 | 24 | test: $(EXECUTABLE) 25 | $(EXECUTABLE) 26 | 27 | clean: 28 | @rm -rf $(EXECUTABLE) 
$(OBJECTS) build 29 | -------------------------------------------------------------------------------- /Lecture3/Occupancy/Occupancy_shmem_A/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./occupancy_mxv_shmem_A 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = occupancy_mxv_shmem_A.o 7 | 8 | CXX=hipcc 9 | CXXFLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | CXXFLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | CXXFLAGS += -x hip -munsafe-fp-atomics -Rpass-analysis=kernel-resource-usage 19 | endif 20 | 21 | $(EXECUTABLE): $(OBJECTS) 22 | hipcc $< $(LDFLAGS) -o $@ 23 | 24 | test: $(EXECUTABLE) 25 | $(EXECUTABLE) 26 | 27 | clean: 28 | @rm -rf $(EXECUTABLE) $(OBJECTS) build 29 | -------------------------------------------------------------------------------- /Lecture4/saxpy/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./saxpy 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = saxpy.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.hip 22 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 | $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/src/QCS.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * QCS.cc 3 | * 4 | * Created on: 
Feb 21, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #include "QCS.hh" 14 | 15 | #include "InputFile.hh" 16 | 17 | using namespace std; 18 | 19 | 20 | QCS::QCS(const InputFile* inp, Hydro* h) : hydro(h) { 21 | qgamma = inp->getDouble("qgamma", 5. / 3.); 22 | q1 = inp->getDouble("q1", 0.); 23 | q2 = inp->getDouble("q2", 2.); 24 | 25 | } 26 | 27 | QCS::~QCS() {} 28 | 29 | 30 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/src/QCS.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * QCS.cc 3 | * 4 | * Created on: Feb 21, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #include "QCS.hh" 14 | 15 | #include "InputFile.hh" 16 | 17 | using namespace std; 18 | 19 | 20 | QCS::QCS(const InputFile* inp, Hydro* h) : hydro(h) { 21 | qgamma = inp->getDouble("qgamma", 5. 
/ 3.); 22 | q1 = inp->getDouble("q1", 0.); 23 | q2 = inp->getDouble("q2", 2.); 24 | 25 | } 26 | 27 | QCS::~QCS() {} 28 | 29 | 30 | -------------------------------------------------------------------------------- /Lecture1/HIP/saxpy/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./saxpy 2 | all: $(EXECUTABLE) test 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = saxpy.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.hip 22 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 | $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | -------------------------------------------------------------------------------- /Lecture1/HIP/hip-stream/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./stream 2 | all: $(EXECUTABLE) test 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = stream.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.hip 22 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 | $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | -------------------------------------------------------------------------------- 
/Lecture3/Occupancy/Occupancy_shmem_batched/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./occupancy_mxv_shmem_batched 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = occupancy_mxv_shmem_batched.o 7 | 8 | CXX=hipcc 9 | CXXFLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | CXXFLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | CXXFLAGS += -x hip -munsafe-fp-atomics -Rpass-analysis=kernel-resource-usage 19 | endif 20 | 21 | $(EXECUTABLE): $(OBJECTS) 22 | hipcc $< $(LDFLAGS) -o $@ 23 | 24 | test: $(EXECUTABLE) 25 | $(EXECUTABLE) 26 | 27 | clean: 28 | @rm -rf $(EXECUTABLE) $(OBJECTS) build 29 | -------------------------------------------------------------------------------- /Lecture1/HIP/vectorAdd/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./vectoradd 2 | all: $(EXECUTABLE) test 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = vectoradd.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.hip 22 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 | $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/cuda/nbody-batch-lumi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p small-g 3 | #SBATCH -N 1 4 | #SBATCH --gpus=1 5 | #SBATCH -t 10:00 
6 | #SBATCH -A project_xxxxxxxxx 7 | 8 | module reset 9 | module load craype-accel-amd-gfx90a 10 | module load rocm 11 | 12 | # Setting explicit GPU target 13 | #export ROCM_GPU=gfx90a 14 | 15 | # Using rocminfo to determine which GPU to build code for 16 | export ROCM_GPU=`rocminfo |grep -m 1 -E gfx[^0]{1} | sed -e 's/ *Name: *\(gfx[0-9,a-f]*\) *$/\1/'` 17 | 18 | cd HPCTrainingExamples/HIPIFY/mini-nbody/cuda 19 | hipify-perl -inplace -print-stats nbody-orig.cu 20 | hipcc --offload-arch=${ROCM_GPU} -DSHMOO -I ../ nbody-orig.cu -o nbody-orig 21 | srun ./nbody-orig 22 | -------------------------------------------------------------------------------- /Lecture1/HIP/vectorAdd_w_cu_ext/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./vectoradd 2 | all: $(EXECUTABLE) test 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = vectoradd.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.cu 22 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 | $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./problem.exe 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = problem.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu 
-I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.cpp 22 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 | $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | 33 | -------------------------------------------------------------------------------- /Lecture3/Occupancy/Occupancy_shmem_batched_unroll/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./occupancy_mxv_shmem_batched_unroll 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = occupancy_mxv_shmem_batched_unroll.o 7 | 8 | CXX=hipcc 9 | CXXFLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | CXXFLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | CXXFLAGS += -x hip -munsafe-fp-atomics -Rpass-analysis=kernel-resource-usage 19 | endif 20 | 21 | $(EXECUTABLE): $(OBJECTS) 22 | hipcc $< $(LDFLAGS) -o $@ 23 | 24 | test: $(EXECUTABLE) 25 | $(EXECUTABLE) 26 | 27 | clean: 28 | @rm -rf $(EXECUTABLE) $(OBJECTS) build 29 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/4-StridedAccess/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./problem.exe 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = problem.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.cpp 22 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 
| $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | 33 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/1-LaunchParameters/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./problem.exe 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = problem.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.cpp 22 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 | $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | 33 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/2-LDSOccupancyLimit/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./problem.exe 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = problem.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.cpp 22 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 | $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | 33 | 
-------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/solution/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./solution.exe 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = solution.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.cpp 22 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 | $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | 33 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/README-Perlmutter-users.md: -------------------------------------------------------------------------------- 1 | 2 | # Instructions for running examples on Perlmutter: 3 | 4 | On Perlmutter, the exercise is to compare the performance change on Nvidia GPUs for each type of optimization 5 | 6 | To load the appropriate modules, 7 | 8 | ``` 9 | module load PrgEnv-gnu/8.3.3 10 | module load hip/5.4.3 11 | module load PrgEnv-nvidia/8.3.3 12 | module load cmake 13 | 14 | export PATH=${PATH}:${HIP_PATH} 15 | ``` 16 | 17 | To get an allocation on Perlmutter 18 | 19 | ``` 20 | salloc -N 1 -q shared -C gpu -c 32 -G 1 -t 30:00 -A ntrain8 --reservation=hip_oct16 21 | ``` 22 | 23 | Outside the reservation hours use, 24 | 25 | ``` 26 | salloc -N 1 -q shared -C gpu -c 32 -G 1 -t 30:00 -A ntrain8 27 | ``` 28 | 29 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/4-StridedAccess/solution/Makefile: 
-------------------------------------------------------------------------------- 1 | EXECUTABLE = ./solution.exe 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = solution.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.cpp 22 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 | $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | 33 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./problem.exe 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = problem.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.cpp 22 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 | $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | 33 | -------------------------------------------------------------------------------- /Lecture1/HIP/vectorAdd/hip_batch_lumi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p small-g 3 | #SBATCH -N 1 4 | #SBATCH --gpus=1 5 | #SBATCH -t 10:00 6 | 7 | 
module reset 8 | module load craype-accel-amd-gfx90a 9 | module load rocm 10 | cd $HOME/HPCTrainingExamples/HIP/vectorAdd 11 | 12 | # If only getting some of the GPUs on a node, the GPU detection will fail 13 | # in some cases in rocm_agent_enumerator utility. Set HCC_AMDGPU_TARGET to 14 | # bypass GPU detection 15 | # Setting explicit GPU target 16 | #export HCC_AMDGPU_TARGET=gfx90a 17 | # Using rocminfo to determine which GPU to build code for 18 | 19 | export HCC_AMDGPU_TARGET=`rocminfo |grep -m 1 -E gfx[^0]{1} | sed -e 's/ *Name: *\(gfx[0-9,a-f]*\) *$/\1/'` 20 | 21 | make vectoradd 22 | srun ./vectoradd 23 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/1-LaunchParameters/solution/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./solution.exe 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = solution.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.cpp 22 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 | $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | 33 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/2-LDSOccupancyLimit/solution/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./solution.exe 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = solution.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq 
($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.cpp 22 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 | $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | 33 | -------------------------------------------------------------------------------- /Lecture1/HIP/vectorAdd_w_cu_ext/hip_batch_lumi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p small-g 3 | #SBATCH -N 1 4 | #SBATCH --gpus=1 5 | #SBATCH -t 10:00 6 | 7 | module reset 8 | module load craype-accel-amd-gfx90a 9 | module load rocm 10 | cd $HOME/HPCTrainingExamples/HIP/vectorAdd_w_cu_ext 11 | 12 | # If only getting some of the GPUs on a node, the GPU detection will fail 13 | # in some cases in rocm_agent_enumerator utility. 
Set HCC_AMDGPU_TARGET to 14 | # bypass GPU detection 15 | # Setting explicit GPU target 16 | #export HCC_AMDGPU_TARGET=gfx90a 17 | # Using rocminfo to determine which GPU to build code for 18 | 19 | export HCC_AMDGPU_TARGET=`rocminfo |grep -m 1 -E gfx[^0]{1} | sed -e 's/ *Name: *\(gfx[0-9,a-f]*\) *$/\1/'` 20 | 21 | make vectoradd 22 | srun ./vectoradd 23 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/2-LDSOccupancyLimit/solution-no-lds/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./solution.exe 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = solution.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.cpp 22 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 | $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | 33 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/solution/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE = ./solution.exe 2 | all: $(EXECUTABLE) 3 | 4 | .PHONY: test 5 | 6 | OBJECTS = solution.o 7 | 8 | CXXFLAGS = -g -O2 -DNDEBUG -fPIC 9 | HIPCC_FLAGS = -g -O2 -DNDEBUG 10 | 11 | HIP_PLATFORM ?= amd 12 | 13 | ifeq ($(HIP_PLATFORM), nvidia) 14 | HIP_PATH ?= $(shell hipconfig --path) 15 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 16 | endif 17 | ifeq ($(HIP_PLATFORM), amd) 18 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 19 | endif 20 | 21 | %.o: %.cpp 22 | hipcc 
$(HIPCC_FLAGS) -c $^ -o $@ 23 | 24 | $(EXECUTABLE): $(OBJECTS) 25 | hipcc $< $(LDFLAGS) -o $@ 26 | 27 | test: $(EXECUTABLE) 28 | $(EXECUTABLE) 29 | 30 | clean: 31 | rm -rf $(EXECUTABLE) $(OBJECTS) build 32 | 33 | -------------------------------------------------------------------------------- /Lecture1/HIP/vectorAdd_w_cpp_ext/hip_batch_lumi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p small-g 3 | #SBATCH -N 1 4 | #SBATCH --gpus=1 5 | #SBATCH -t 10:00 6 | 7 | module reset 8 | module load craype-accel-amd-gfx90a 9 | module load rocm 10 | cd $HOME/HPCTrainingExamples/HIP/vectorAdd_w_cpp_ext 11 | 12 | # If only getting some of the GPUs on a node, the GPU detection will fail 13 | # in some cases in rocm_agent_enumerator utility. Set HCC_AMDGPU_TARGET to 14 | # bypass GPU detection 15 | # Setting explicit GPU target 16 | #export HCC_AMDGPU_TARGET=gfx90a 17 | # Using rocminfo to determine which GPU to build code for 18 | 19 | export HCC_AMDGPU_TARGET=`rocminfo |grep -m 1 -E gfx[^0]{1} | sed -e 's/ *Name: *\(gfx[0-9,a-f]*\) *$/\1/'` 20 | 21 | make vectoradd 22 | srun ./vectoradd 23 | -------------------------------------------------------------------------------- /Lecture3/Occupancy/run_maketests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd Occupancy_naive 4 | make clean 5 | make 6 | srun ./occupancy_mxv_naive |& tee occupancy.out 7 | make clean 8 | cd .. 9 | 10 | cd Occupancy_shmem 11 | make clean 12 | make 13 | srun ./occupancy_mxv_shmem |& tee occupancy.out 14 | make clean 15 | cd .. 16 | 17 | cd Occupancy_shmem_batched 18 | make clean 19 | make 20 | srun ./occupancy_mxv_shmem_batched |& tee occupancy.out 21 | make clean 22 | cd .. 23 | 24 | cd Occupancy_shmem_batched_unroll 25 | make clean 26 | make 27 | srun ./occupancy_mxv_shmem_batched_unroll |& tee occupancy.out 28 | make clean 29 | cd .. 
30 | 31 | cd Occupancy_shmem_A 32 | make clean 33 | make 34 | srun ./occupancy_mxv_shmem_A |& tee occupancy.out 35 | make clean 36 | cd .. 37 | 38 | grep GFLOPS Occupancy*/occupancy.out 39 | -------------------------------------------------------------------------------- /Lecture4/jacobi/rocprof_wrapper.sh.old: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | # depends on ROCM_PATH being set outside;  input arguments are the output directory & the name 4 | outdir="$1" 5 | name="$2" 6 | if [[ -n ${OMPI_COMM_WORLD_RANK+z} ]]; then 7 |     # Open MPI 8 |     export MPI_RANK=${OMPI_COMM_WORLD_RANK} 9 | elif [[ -n ${MV2_COMM_WORLD_RANK+z} ]]; then 10 |     # MVAPICH2 11 |     export MPI_RANK=${MV2_COMM_WORLD_RANK} 12 | elif [[ -n ${SLURM_PROCID+z} ]]; then 13 |     export MPI_RANK=${SLURM_PROCID} 14 | else 15 |     echo "Unknown MPI layer detected! Must use OpenMPI, MVAPICH, or SLURM" 16 |     exit 1 17 | fi 18 | rocprof="${ROCM_PATH}/bin/rocprof" 19 | 20 | pid="$$" 21 | outdir="${outdir}/rank_${pid}_${MPI_RANK}" 22 | outfile="${name}_${pid}_${MPI_RANK}.csv" 23 | ${rocprof} -d ${outdir} --hsa-trace -o ${outdir}/${outfile} "${@:3}" 24 | 25 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/src/WriteXY.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * WriteXY.hh 3 | * 4 | * Created on: Dec 16, 2013 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 
11 | */ 12 | 13 | #ifndef WRITEXY_HH_ 14 | #define WRITEXY_HH_ 15 | 16 | #include 17 | 18 | // forward declarations 19 | class Mesh; 20 | 21 | 22 | class WriteXY { 23 | public: 24 | 25 | Mesh* mesh; 26 | 27 | WriteXY(Mesh* m); 28 | ~WriteXY(); 29 | 30 | void write( 31 | const std::string& basename, 32 | const double* zr, 33 | const double* ze, 34 | const double* zp); 35 | 36 | }; 37 | 38 | 39 | #endif /* WRITEXY_HH_ */ 40 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/src/WriteXY.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * WriteXY.hh 3 | * 4 | * Created on: Dec 16, 2013 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #ifndef WRITEXY_HH_ 14 | #define WRITEXY_HH_ 15 | 16 | #include 17 | 18 | // forward declarations 19 | class Mesh; 20 | 21 | 22 | class WriteXY { 23 | public: 24 | 25 | Mesh* mesh; 26 | 27 | WriteXY(Mesh* m); 28 | ~WriteXY(); 29 | 30 | void write( 31 | const std::string& basename, 32 | const double* zr, 33 | const double* ze, 34 | const double* zp); 35 | 36 | }; 37 | 38 | 39 | #endif /* WRITEXY_HH_ */ 40 | -------------------------------------------------------------------------------- /Lecture1/HIP/dgemm/src/timer.h: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved. 
3 | ***************************************************************************/ 4 | 5 | #ifndef _TIMER_H_ 6 | #define _TIMER_H_ 7 | #include 8 | class timer{ 9 | public: 10 | timer() = default; 11 | 12 | void 13 | tick(){ 14 | m_t0 = std::chrono::high_resolution_clock::now(); 15 | } 16 | 17 | template 18 | double 19 | tock(){ 20 | return std::chrono::duration_cast

( 21 | std::chrono::high_resolution_clock::now() - m_t0 22 | ).count(); 23 | } 24 | 25 | private: 26 | using time_point = std::chrono::time_point; 27 | time_point m_t0; 28 | time_point m_t1; 29 | }; 30 | 31 | 32 | #endif /* _TIMER_H_ */ 33 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/hipconvertinplace-perl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #usage : hipconvertinplace-perl.sh DIRNAME [hipify-perl options] 4 | 5 | #hipify "inplace" all code files in specified directory. 6 | # This can be quite handy when dealing with an existing CUDA code base since the script 7 | # preserves the existing directory structure. 8 | 9 | # For each code file, this script will: 10 | # - If ".prehip file does not exist, copy the original code to a new file with extension ".prehip". Then hipify the code file. 11 | # - If ".prehip" file exists, this is used as input to hipify. 12 | # (this is useful for testing improvements to the hipify-perl toolset). 
13 | 14 | 15 | SCRIPT_DIR=`dirname $0` 16 | PRIV_SCRIPT_DIR="$SCRIPT_DIR/../libexec/hipify" 17 | SEARCH_DIR=$1 18 | shift 19 | $SCRIPT_DIR/hipify-perl -inplace -print-stats "$@" `$PRIV_SCRIPT_DIR/findcode.sh $SEARCH_DIR` 20 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/hip/HIP-nbody-soa.sh: -------------------------------------------------------------------------------- 1 | #Hipify the soa cuda source code to hip compatible code 2 | #hipify nbody-soa.cu > nbody-soa.hip 3 | #Manually add the first argument onto the kernel argument list 4 | #void bodyForce(Body *p, float dt, int n) //before modification 5 | #void bodyForce(hipLaunchParm lp, Body *p, float dt, int n) //after modification 6 | 7 | #compile the hipified source code into executable 8 | if [ -f nbody-soa ] 9 | then 10 | rm nbody-soa 11 | fi 12 | 13 | echo hipcc -I../ -DSHMOO nbody-soa.hip -o nbody-soa 14 | $ROCM_PATH/bin/hipcc -I../ -DSHMOO nbody-soa.hip -o nbody-soa 15 | 16 | #To print out more details, remove the -DSHMOO flag 17 | #hipcc -I../ nbody-soa.hip -o nbody-soa 18 | 19 | #execute the program 20 | EXE=nbody-soa 21 | K=1024 22 | for i in {1..8} 23 | do 24 | echo ./$EXE $K 25 | ./$EXE $K 26 | K=$(($K*2)) 27 | done 28 | 29 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/src/TTS.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * TTS.hh 3 | * 4 | * Created on: Feb 2, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 
11 | */ 12 | 13 | #ifndef TTS_HH_ 14 | #define TTS_HH_ 15 | 16 | #include "Vec2.hh" 17 | 18 | // forward declarations 19 | class InputFile; 20 | class Hydro; 21 | 22 | 23 | class TTS { 24 | public: 25 | 26 | // parent hydro object 27 | Hydro* hydro; 28 | 29 | double alfa; // alpha coefficient for TTS model 30 | double ssmin; // minimum sound speed 31 | 32 | TTS(const InputFile* inp, Hydro* h); 33 | ~TTS(); 34 | 35 | }; // class TTS 36 | 37 | 38 | #endif /* TTS_HH_ */ 39 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/src/TTS.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * TTS.hh 3 | * 4 | * Created on: Feb 2, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #ifndef TTS_HH_ 14 | #define TTS_HH_ 15 | 16 | #include "Vec2.hh" 17 | 18 | // forward declarations 19 | class InputFile; 20 | class Hydro; 21 | 22 | 23 | class TTS { 24 | public: 25 | 26 | // parent hydro object 27 | Hydro* hydro; 28 | 29 | double alfa; // alpha coefficient for TTS model 30 | double ssmin; // minimum sound speed 31 | 32 | TTS(const InputFile* inp, Hydro* h); 33 | ~TTS(); 34 | 35 | }; // class TTS 36 | 37 | 38 | #endif /* TTS_HH_ */ 39 | -------------------------------------------------------------------------------- /Lecture1/HIP/vectorAdd/README: -------------------------------------------------------------------------------- 1 | Adding portable makefiles and cmake builds 2 | 3 | For ROCm environment 4 | module load rocm 5 | module load cmake 6 | export CXX=${ROCM_PATH}/llvm/bin/clang++ 7 | 8 | For ROCm with make 9 | make 10 | 11 | For ROCm with cmake 12 | mkdir build && cd build 13 | cmake .. 
14 | make VERBOSE=1 15 | ./vectoradd 16 | ctest 17 | 18 | For CUDA environment 19 | module load rocm 20 | module load CUDA/11.8 21 | module load cmake 22 | 23 | For CUDA with make 24 | HIP_PLATFORM=nvidia make 25 | 26 | For CUDA with cmake 27 | mkdir build && cd build 28 | cmake -DCMAKE_GPU_RUNTIME=CUDA .. 29 | make VERBOSE=1 30 | ./vectoradd 31 | ctest 32 | 33 | Original vectorAdd example from HIP-Examples 34 | https://github.com/ROCm-Developer-Tools/HIP-Examples 35 | 36 | Simple vectorAdd example written directly to the HIP interface. 37 | -------------------------------------------------------------------------------- /Lecture1/HIP/vectorAdd_w_cpp_ext/README: -------------------------------------------------------------------------------- 1 | Adding portable makefiles and cmake builds 2 | 3 | For ROCm environment 4 | module load rocm 5 | module load cmake 6 | export CXX=${ROCM_PATH}/llvm/bin/clang++ 7 | 8 | For ROCm with make 9 | make 10 | 11 | For ROCm with cmake 12 | mkdir build && cd build 13 | cmake .. 14 | make VERBOSE=1 15 | ./vectoradd 16 | ctest 17 | 18 | For CUDA environment 19 | module load rocm 20 | module load CUDA/11.8 21 | module load cmake 22 | 23 | For CUDA with make 24 | HIPCC=nvcc make 25 | 26 | For CUDA with cmake 27 | mkdir build && cd build 28 | cmake -DCMAKE_GPU_RUNTIME=CUDA .. 29 | make VERBOSE=1 30 | ./vectoradd 31 | ctest 32 | 33 | Original vectorAdd example from HIP-Examples 34 | https://github.com/ROCm-Developer-Tools/HIP-Examples 35 | 36 | Simple vectorAdd example written directly to the HIP interface. 
37 | -------------------------------------------------------------------------------- /Lecture1/HIP/vectorAdd_w_cu_ext/README: -------------------------------------------------------------------------------- 1 | Adding portable makefiles and cmake builds 2 | 3 | For ROCm environment 4 | module load rocm 5 | module load cmake 6 | export CXX=${ROCM_PATH}/llvm/bin/clang++ 7 | 8 | For ROCm with make 9 | make 10 | 11 | For ROCm with cmake 12 | mkdir build && cd build 13 | cmake .. 14 | make VERBOSE=1 15 | ./vectoradd 16 | ctest 17 | 18 | For CUDA environment 19 | module load rocm 20 | module load CUDA/11.8 21 | module load cmake 22 | 23 | For CUDA with make 24 | HIP_PLATFORM=nvidia make 25 | 26 | For CUDA with cmake 27 | mkdir build && cd build 28 | cmake -DCMAKE_GPU_RUNTIME=CUDA .. 29 | make VERBOSE=1 30 | ./vectoradd 31 | ctest 32 | 33 | Original vectorAdd example from HIP-Examples 34 | https://github.com/ROCm-Developer-Tools/HIP-Examples 35 | 36 | Simple vectorAdd example written directly to the HIP interface. 
37 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/hip/HIP-nbody-block.sh: -------------------------------------------------------------------------------- 1 | #Hipify the blocked cuda source code to hip compatible code 2 | #hipify nbody-block.cu > nbody-block.hip 3 | #Manually add the first argument onto the kernel argument list 4 | #void bodyForce(Body *p, float dt, int n) //before modification 5 | #void bodyForce(hipLaunchParm lp, Body *p, float dt, int n) //after modification 6 | 7 | #compile the hipified source code into executable 8 | if [ -f nbody-block ] 9 | then 10 | rm nbody-block 11 | fi 12 | 13 | echo hipcc -I../ -DSHMOO nbody-block.hip -o nbody-block 14 | $ROCM_PATH/bin/hipcc -I../ -DSHMOO nbody-block.hip -o nbody-block 15 | 16 | #To print out more details, remove the -DSHMOO flag 17 | #hipcc -I../ nbody-block.hip -o nbody-block 18 | 19 | #execute the program 20 | EXE=nbody-block 21 | K=1024 22 | for i in {1..8} 23 | do 24 | echo ./$EXE $K 25 | ./$EXE $K 26 | K=$(($K*2)) 27 | done 28 | 29 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/src/PolyGas.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * PolyGas.hh 3 | * 4 | * Created on: Mar 23, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #ifndef POLYGAS_HH_ 14 | #define POLYGAS_HH_ 15 | 16 | #include "Vec2.hh" 17 | 18 | // forward declarations 19 | class InputFile; 20 | class Hydro; 21 | 22 | 23 | class PolyGas { 24 | public: 25 | 26 | // parent hydro object 27 | Hydro* hydro; 28 | 29 | double gamma; // coeff. 
for ideal gas equation 30 | double ssmin; // minimum sound speed for gas 31 | 32 | PolyGas(const InputFile* inp, Hydro* h); 33 | ~PolyGas(); 34 | 35 | }; // class PolyGas 36 | 37 | 38 | #endif /* POLYGAS_HH_ */ 39 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/src/PolyGas.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * PolyGas.hh 3 | * 4 | * Created on: Mar 23, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #ifndef POLYGAS_HH_ 14 | #define POLYGAS_HH_ 15 | 16 | #include "Vec2.hh" 17 | 18 | // forward declarations 19 | class InputFile; 20 | class Hydro; 21 | 22 | 23 | class PolyGas { 24 | public: 25 | 26 | // parent hydro object 27 | Hydro* hydro; 28 | 29 | double gamma; // coeff. for ideal gas equation 30 | double ssmin; // minimum sound speed for gas 31 | 32 | PolyGas(const InputFile* inp, Hydro* h); 33 | ~PolyGas(); 34 | 35 | }; // class PolyGas 36 | 37 | 38 | #endif /* POLYGAS_HH_ */ 39 | -------------------------------------------------------------------------------- /Lecture1/HIP/dgemm/src/args.h: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved. 
3 | ***************************************************************************/ 4 | 5 | #ifndef ARGS_H_ 6 | #define ARGS_H_ 7 | #include 8 | #include 9 | 10 | struct args{ 11 | // Matrix dimensions 12 | int m = 0; 13 | int n = 0; 14 | int k = 0; 15 | 16 | std::vector device_ids; 17 | 18 | // number of times to perform 19 | int iter_count = 1; 20 | 21 | // number of times to repeat dgemm while measuring time 22 | int rep_count = 10; 23 | 24 | std::string output_fn; 25 | 26 | static 27 | bool 28 | validate(args const& input){ 29 | return (input.m > 1) 30 | && (input.n > 0) 31 | && (input.k > 0) 32 | && (input.iter_count > 0) 33 | && (input.rep_count > 0); 34 | } 35 | 36 | }; 37 | 38 | #endif /* ARGS_H_ */ 39 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/src/Memory.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory.hh 3 | * 4 | * Created on: Jul 3, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #ifndef MEMORY_HH_ 14 | #define MEMORY_HH_ 15 | 16 | #include 17 | 18 | 19 | // Namespace Memory provides functions to allocate and free memory. 20 | // Currently these are just wrappers around std::malloc and free, 21 | // but they are abstracted here to make it easier to replace them 22 | // if needed. 
23 | 24 | namespace Memory { 25 | 26 | template 27 | inline T* alloc(const int count) { 28 | return (T*) std::malloc(count * sizeof(T)); 29 | } 30 | 31 | template 32 | inline void free(T* ptr) { 33 | std::free(ptr); 34 | } 35 | 36 | }; // namespace Memory 37 | 38 | #endif /* MEMORY_HH_ */ 39 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/src/Memory.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory.hh 3 | * 4 | * Created on: Jul 3, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #ifndef MEMORY_HH_ 14 | #define MEMORY_HH_ 15 | 16 | #include 17 | 18 | 19 | // Namespace Memory provides functions to allocate and free memory. 20 | // Currently these are just wrappers around std::malloc and free, 21 | // but they are abstracted here to make it easier to replace them 22 | // if needed. 23 | 24 | namespace Memory { 25 | 26 | template 27 | inline T* alloc(const int count) { 28 | return (T*) std::malloc(count * sizeof(T)); 29 | } 30 | 31 | template 32 | inline void free(T* ptr) { 33 | std::free(ptr); 34 | } 35 | 36 | }; // namespace Memory 37 | 38 | #endif /* MEMORY_HH_ */ 39 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/src/ImportGMV.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * ImportGMV.hh 3 | * 4 | * Created on: Feb 27, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 
11 | */ 12 | 13 | #ifndef IMPORTGMV_HH_ 14 | #define IMPORTGMV_HH_ 15 | 16 | #include 17 | #include 18 | #include "Vec2.hh" 19 | 20 | // forward declarations 21 | class Mesh; 22 | 23 | 24 | class ImportGMV { 25 | public: 26 | 27 | // parent object 28 | Mesh* mesh; 29 | 30 | ImportGMV(Mesh* m); 31 | ~ImportGMV(); 32 | 33 | void read( 34 | const std::string& filename, 35 | std::vector& nodepos, 36 | std::vector& cellstart, 37 | std::vector& cellsize, 38 | std::vector& cellnodes); 39 | }; // class ImportGMV 40 | 41 | 42 | #endif /* IMPORTGMV_HH_ */ 43 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/src/ImportGMV.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * ImportGMV.hh 3 | * 4 | * Created on: Feb 27, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #ifndef IMPORTGMV_HH_ 14 | #define IMPORTGMV_HH_ 15 | 16 | #include 17 | #include 18 | #include "Vec2.hh" 19 | 20 | // forward declarations 21 | class Mesh; 22 | 23 | 24 | class ImportGMV { 25 | public: 26 | 27 | // parent object 28 | Mesh* mesh; 29 | 30 | ImportGMV(Mesh* m); 31 | ~ImportGMV(); 32 | 33 | void read( 34 | const std::string& filename, 35 | std::vector& nodepos, 36 | std::vector& cellstart, 37 | std::vector& cellsize, 38 | std::vector& cellnodes); 39 | }; // class ImportGMV 40 | 41 | 42 | #endif /* IMPORTGMV_HH_ */ 43 | -------------------------------------------------------------------------------- /Lecture1/HIP/dgemm/src/dgemm.h: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved. 
3 | ***************************************************************************/ 4 | 5 | #ifndef _DGEMM_H_ 6 | #define _DGEMM_H_ 7 | 8 | 9 | #include 10 | #include "matrix.h" 11 | 12 | 13 | struct dgemm_results{ 14 | std::vector flops; 15 | std::vector time_points; 16 | }; 17 | 18 | 19 | /** Run A*B on gpus 20 | * 21 | * @param A input matrix A 22 | * @param B input matrix B 23 | * @param iter_count number of iterations to perform dgemm 24 | * @param rep_count number of times to perform dgemm to compute flops 25 | * @param dev_id GPU device id, indexed at 0, to run on 26 | * @return estimated teraflops and corresponding local time 27 | * 28 | */ 29 | dgemm_results 30 | run_dgemm( 31 | matrixd const& A, matrixd const& B, 32 | int iter_count, int rep_count, int dev_id); 33 | 34 | 35 | #endif /* _DGEMM_H_ */ 36 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/4-StridedAccess/hipCheck.h: -------------------------------------------------------------------------------- 1 | #ifndef __HIP_CHECK_H__ 2 | #define __HIP_CHECK_H__ 3 | #include 4 | #define hipCheck(stmt) \ 5 | do { \ 6 | hipError_t err = stmt; \ 7 | if (err != hipSuccess) { \ 8 | char msg[256]; \ 9 | sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, \ 10 | __FUNCTION__, __LINE__); \ 11 | std::string errstring = hipGetErrorString(err); \ 12 | std::cerr << msg << "\t" << errstring << std::endl; \ 13 | throw std::runtime_error(msg); \ 14 | } \ 15 | } while (0) 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/1-LaunchParameters/hipCheck.h: -------------------------------------------------------------------------------- 1 | #ifndef __HIP_CHECK_H__ 2 | #define __HIP_CHECK_H__ 3 | #include 4 | #define hipCheck(stmt) \ 5 | do { \ 6 | hipError_t err = stmt; \ 7 | if (err != hipSuccess) { \ 8 | char msg[256]; \ 9 | sprintf(msg, "%s in file %s, function %s, 
line %d\n", #stmt, __FILE__, \ 10 | __FUNCTION__, __LINE__); \ 11 | std::string errstring = hipGetErrorString(err); \ 12 | std::cerr << msg << "\t" << errstring << std::endl; \ 13 | throw std::runtime_error(msg); \ 14 | } \ 15 | } while (0) 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/2-LDSOccupancyLimit/hipCheck.h: -------------------------------------------------------------------------------- 1 | #ifndef __HIP_CHECK_H__ 2 | #define __HIP_CHECK_H__ 3 | #include 4 | #define hipCheck(stmt) \ 5 | do { \ 6 | hipError_t err = stmt; \ 7 | if (err != hipSuccess) { \ 8 | char msg[256]; \ 9 | sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, \ 10 | __FUNCTION__, __LINE__); \ 11 | std::string errstring = hipGetErrorString(err); \ 12 | std::cerr << msg << "\t" << errstring << std::endl; \ 13 | throw std::runtime_error(msg); \ 14 | } \ 15 | } while (0) 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/hipCheck.h: -------------------------------------------------------------------------------- 1 | #ifndef __HIP_CHECK_H__ 2 | #define __HIP_CHECK_H__ 3 | #include 4 | #define hipCheck(stmt) \ 5 | do { \ 6 | hipError_t err = stmt; \ 7 | if (err != hipSuccess) { \ 8 | char msg[256]; \ 9 | sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, \ 10 | __FUNCTION__, __LINE__); \ 11 | std::string errstring = hipGetErrorString(err); \ 12 | std::cerr << msg << "\t" << errstring << std::endl; \ 13 | throw std::runtime_error(msg); \ 14 | } \ 15 | } while (0) 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/4-StridedAccess/solution/hipCheck.h: -------------------------------------------------------------------------------- 1 | #ifndef __HIP_CHECK_H__ 2 | #define __HIP_CHECK_H__ 3 | 
#include 4 | #define hipCheck(stmt) \ 5 | do { \ 6 | hipError_t err = stmt; \ 7 | if (err != hipSuccess) { \ 8 | char msg[256]; \ 9 | sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, \ 10 | __FUNCTION__, __LINE__); \ 11 | std::string errstring = hipGetErrorString(err); \ 12 | std::cerr << msg << "\t" << errstring << std::endl; \ 13 | throw std::runtime_error(msg); \ 14 | } \ 15 | } while (0) 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/1-LaunchParameters/solution/hipCheck.h: -------------------------------------------------------------------------------- 1 | #ifndef __HIP_CHECK_H__ 2 | #define __HIP_CHECK_H__ 3 | #include 4 | #define hipCheck(stmt) \ 5 | do { \ 6 | hipError_t err = stmt; \ 7 | if (err != hipSuccess) { \ 8 | char msg[256]; \ 9 | sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, \ 10 | __FUNCTION__, __LINE__); \ 11 | std::string errstring = hipGetErrorString(err); \ 12 | std::cerr << msg << "\t" << errstring << std::endl; \ 13 | throw std::runtime_error(msg); \ 14 | } \ 15 | } while (0) 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/2-LDSOccupancyLimit/solution/hipCheck.h: -------------------------------------------------------------------------------- 1 | #ifndef __HIP_CHECK_H__ 2 | #define __HIP_CHECK_H__ 3 | #include 4 | #define hipCheck(stmt) \ 5 | do { \ 6 | hipError_t err = stmt; \ 7 | if (err != hipSuccess) { \ 8 | char msg[256]; \ 9 | sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, \ 10 | __FUNCTION__, __LINE__); \ 11 | std::string errstring = hipGetErrorString(err); \ 12 | std::cerr << msg << "\t" << errstring << std::endl; \ 13 | throw std::runtime_error(msg); \ 14 | } \ 15 | } while (0) 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- 
/Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/hipCheck.h: -------------------------------------------------------------------------------- 1 | #ifndef __HIP_CHECK_H__ 2 | #define __HIP_CHECK_H__ 3 | #include 4 | #define hipCheck(stmt) \ 5 | do { \ 6 | hipError_t err = stmt; \ 7 | if (err != hipSuccess) { \ 8 | char msg[256]; \ 9 | sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, \ 10 | __FUNCTION__, __LINE__); \ 11 | std::string errstring = hipGetErrorString(err); \ 12 | std::cerr << msg << "\t" << errstring << std::endl; \ 13 | throw std::runtime_error(msg); \ 14 | } \ 15 | } while (0) 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/2-LDSOccupancyLimit/solution-no-lds/hipCheck.h: -------------------------------------------------------------------------------- 1 | #ifndef __HIP_CHECK_H__ 2 | #define __HIP_CHECK_H__ 3 | #include 4 | #define hipCheck(stmt) \ 5 | do { \ 6 | hipError_t err = stmt; \ 7 | if (err != hipSuccess) { \ 8 | char msg[256]; \ 9 | sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, \ 10 | __FUNCTION__, __LINE__); \ 11 | std::string errstring = hipGetErrorString(err); \ 12 | std::cerr << msg << "\t" << errstring << std::endl; \ 13 | throw std::runtime_error(msg); \ 14 | } \ 15 | } while (0) 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/solution/hipCheck.h: -------------------------------------------------------------------------------- 1 | #ifndef __HIP_CHECK_H__ 2 | #define __HIP_CHECK_H__ 3 | #include 4 | #define hipCheck(stmt) \ 5 | do { \ 6 | hipError_t err = stmt; \ 7 | if (err != hipSuccess) { \ 8 | char msg[256]; \ 9 | sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, \ 10 | __FUNCTION__, __LINE__); \ 11 | std::string errstring = hipGetErrorString(err); \ 12 | std::cerr << msg << 
"\t" << errstring << std::endl; \ 13 | throw std::runtime_error(msg); \ 14 | } \ 15 | } while (0) 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/solution/hipCheck.h: -------------------------------------------------------------------------------- 1 | #ifndef __HIP_CHECK_H__ 2 | #define __HIP_CHECK_H__ 3 | #include 4 | #define hipCheck(stmt) \ 5 | do { \ 6 | hipError_t err = stmt; \ 7 | if (err != hipSuccess) { \ 8 | char msg[256]; \ 9 | sprintf(msg, "%s in file %s, function %s, line %d\n", #stmt, __FILE__, \ 10 | __FUNCTION__, __LINE__); \ 11 | std::string errstring = hipGetErrorString(err); \ 12 | std::cerr << msg << "\t" << errstring << std::endl; \ 13 | throw std::runtime_error(msg); \ 14 | } \ 15 | } while (0) 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /Lecture1/HIP/vectorAdd/README.frontier: -------------------------------------------------------------------------------- 1 | 2 | For AMD environment 3 | module load PrgEnv-amd 4 | module load amd 5 | module load cmake 6 | export CXX=${ROCM_PATH}/llvm/bin/clang++ 7 | 8 | For AMD with make 9 | make vectoradd 10 | 11 | For AMD with cmake 12 | mkdir build && cd build 13 | cmake .. 
14 | make VERBOSE=1 15 | ./vectoradd 16 | ctest 17 | 18 | For batch version of AMD environment, modify hip_batch_frontier.sh 19 | and submit with sbatch 14 | #include 15 | 16 | #include "InputFile.hh" 17 | #include "Driver.hh" 18 | 19 | using namespace std; 20 | 21 | 22 | int main(const int argc, const char** argv) 23 | { 24 | if (argc != 2) { 25 | cerr << "Usage: pennant " << endl; 26 | exit(1); 27 | } 28 | 29 | const char* filename = argv[1]; 30 | InputFile inp(filename); 31 | 32 | string probname(filename); 33 | // strip .pnt suffix from filename; names shorter than the suffix are left unchanged 34 | int len = probname.length(); 35 | if (len >= 4 && probname.substr(len - 4, 4) == ".pnt") 36 | probname = probname.substr(0, len - 4); 37 | 38 | Driver drv(&inp, probname); 39 | 40 | drv.run(); 41 | 42 | return 0; 43 | 44 | } 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/src/main.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * main.cc 3 | * 4 | * Created on: Jan 23, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 
11 | */ 12 | 13 | #include 14 | #include 15 | 16 | #include "InputFile.hh" 17 | #include "Driver.hh" 18 | 19 | using namespace std; 20 | 21 | 22 | int main(const int argc, const char** argv) 23 | { 24 | if (argc != 2) { 25 | cerr << "Usage: pennant " << endl; 26 | exit(1); 27 | } 28 | 29 | const char* filename = argv[1]; 30 | InputFile inp(filename); 31 | 32 | string probname(filename); 33 | // strip .pnt suffix from filename; names shorter than the suffix are left unchanged 34 | int len = probname.length(); 35 | if (len >= 4 && probname.substr(len - 4, 4) == ".pnt") 36 | probname = probname.substr(0, len - 4); 37 | 38 | Driver drv(&inp, probname); 39 | 40 | drv.run(); 41 | 42 | return 0; 43 | 44 | } 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /Lecture3/Occupancy/run_cmaketests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd Occupancy_naive 4 | make clean 5 | mkdir build && cd build 6 | cmake .. 7 | make 8 | srun ./occupancy_mxv_naive |& tee occupancy.out 9 | cd .. 10 | make clean 11 | cd .. 12 | 13 | cd Occupancy_shmem 14 | make clean 15 | mkdir build && cd build 16 | cmake .. 17 | make 18 | srun ./occupancy_mxv_shmem |& tee occupancy.out 19 | make clean 20 | cd .. 21 | make clean 22 | cd .. 23 | 24 | cd Occupancy_shmem_batched 25 | make clean 26 | mkdir build && cd build 27 | cmake .. 28 | make 29 | srun ./occupancy_mxv_shmem_batched |& tee occupancy.out 30 | make clean 31 | cd .. 32 | make clean 33 | cd .. 34 | 35 | cd Occupancy_shmem_batched_unroll 36 | make clean 37 | mkdir build && cd build 38 | cmake .. 39 | make 40 | srun ./occupancy_mxv_shmem_batched_unroll |& tee occupancy.out 41 | make clean 42 | cd .. 43 | make clean 44 | cd .. 45 | 46 | cd Occupancy_shmem_A 47 | make clean 48 | mkdir build && cd build 49 | cmake .. 50 | make 51 | srun ./occupancy_mxv_shmem_A |& tee occupancy.out 52 | make clean 53 | cd .. 54 | make clean 55 | cd .. 
56 | 57 | grep GFLOPS Occupancy*/occupancy.out 58 | 59 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/hip/Makefile: -------------------------------------------------------------------------------- 1 | all: nbody-orig nbody-soa nbody-block nbody-unroll 2 | 3 | CXXFLAGS =-g -O2 4 | HIPCC_FLAGS=-O2 -g -DNDEBUG 5 | HIPCC_FLAGS+=-I.. -DSHMOO 6 | 7 | HIP_PLATFORM ?= amd 8 | # HIPCC_FLAGS is a recursively expanded variable, so the += lines below must not mention $(HIPCC_FLAGS) themselves. 9 | ifeq ($(HIP_PLATFORM), nvidia) 10 | HIP_PATH ?= $(shell hipconfig --path) 11 | HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/ 12 | endif 13 | ifeq ($(HIP_PLATFORM), amd) 14 | HIPCC_FLAGS += -x hip -munsafe-fp-atomics 15 | ifeq (, $(findstring CXX, clang++)) 16 | LDFLAGS=--hip-link 17 | else 18 | LDFLAGS=-L${ROCM_PATH}/hip/lib -lamdhip64 19 | endif 20 | endif 21 | 22 | %.hip: ../cuda/%.cu 23 | hipify-perl $^ > $@ 24 | 25 | %.o: %.hip 26 | hipcc $(HIPCC_FLAGS) -c $^ -o $@ 27 | 28 | nbody-orig: nbody-orig.o 29 | $(CXX) $^ $(LDFLAGS) -o $@ 30 | 31 | nbody-soa: nbody-soa.o 32 | $(CXX) $^ $(LDFLAGS) -o $@ 33 | 34 | nbody-block: nbody-block.o 35 | $(CXX) $^ $(LDFLAGS) -o $@ 36 | 37 | nbody-unroll: nbody-unroll.o 38 | $(CXX) $^ $(LDFLAGS) -o $@ 39 | 40 | clean: 41 | rm -f nbody-orig nbody-soa nbody-block nbody-unroll 42 | rm -f nbody-orig.o nbody-soa.o nbody-block.o nbody-unroll.o 43 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/src/QCS.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * QCS.hh 3 | * 4 | * Created on: Feb 21, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 
11 | */ 12 | 13 | #ifndef QCS_HH_ 14 | #define QCS_HH_ 15 | 16 | #include "Vec2.hh" 17 | 18 | // forward declarations 19 | class InputFile; 20 | class Hydro; 21 | 22 | 23 | class QCS { 24 | public: 25 | 26 | // parent hydro object 27 | Hydro* hydro; 28 | 29 | double qgamma; // gamma coefficient for Q model 30 | double q1, q2; // linear and quadratic coefficients 31 | // for Q model 32 | 33 | double* c0evol; 34 | double* c0du; 35 | double* c0div; 36 | double* c0cos; 37 | double2* c0qe; 38 | double2* z0uc; 39 | double* c0rmu; 40 | double* c0w; 41 | 42 | 43 | QCS(const InputFile* inp, Hydro* h); 44 | ~QCS(); 45 | 46 | }; // class QCS 47 | 48 | 49 | 50 | #endif /* QCS_HH_ */ 51 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/src/QCS.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * QCS.hh 3 | * 4 | * Created on: Feb 21, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 
11 | */ 12 | 13 | #ifndef QCS_HH_ 14 | #define QCS_HH_ 15 | 16 | #include "Vec2.hh" 17 | 18 | // forward declarations 19 | class InputFile; 20 | class Hydro; 21 | 22 | 23 | class QCS { 24 | public: 25 | 26 | // parent hydro object 27 | Hydro* hydro; 28 | 29 | double qgamma; // gamma coefficient for Q model 30 | double q1, q2; // linear and quadratic coefficients 31 | // for Q model 32 | 33 | double* c0evol; 34 | double* c0du; 35 | double* c0div; 36 | double* c0cos; 37 | double2* c0qe; 38 | double2* z0uc; 39 | double* c0rmu; 40 | double* c0w; 41 | 42 | 43 | QCS(const InputFile* inp, Hydro* h); 44 | ~QCS(); 45 | 46 | }; // class QCS 47 | 48 | 49 | 50 | #endif /* QCS_HH_ */ 51 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/src/HydroBC.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * HydroBC.hh 3 | * 4 | * Created on: Jan 13, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #ifndef HYDROBC_HH_ 14 | #define HYDROBC_HH_ 15 | 16 | #include 17 | 18 | #include "Vec2.hh" 19 | 20 | // forward declarations 21 | class Mesh; 22 | 23 | 24 | class HydroBC { 25 | public: 26 | 27 | // associated mesh object 28 | Mesh* mesh; 29 | 30 | int numb; // number of bdy points 31 | double2 vfix; // vector perp. 
to fixed plane 32 | int* mapbp; // map: bdy point -> point 33 | std::vector pchbfirst; // start/stop index for bdy pt chunks 34 | std::vector pchblast; 35 | 36 | HydroBC( 37 | Mesh* msh, 38 | const double2 v, 39 | const std::vector& mbp); 40 | 41 | ~HydroBC(); 42 | 43 | }; // class HydroBC 44 | 45 | 46 | #endif /* HYDROBC_HH_ */ 47 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/src/HydroBC.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * HydroBC.hh 3 | * 4 | * Created on: Jan 13, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #ifndef HYDROBC_HH_ 14 | #define HYDROBC_HH_ 15 | 16 | #include 17 | 18 | #include "Vec2.hh" 19 | 20 | // forward declarations 21 | class Mesh; 22 | 23 | 24 | class HydroBC { 25 | public: 26 | 27 | // associated mesh object 28 | Mesh* mesh; 29 | 30 | int numb; // number of bdy points 31 | double2 vfix; // vector perp. 
to fixed plane 32 | int* mapbp; // map: bdy point -> point 33 | std::vector pchbfirst; // start/stop index for bdy pt chunks 34 | std::vector pchblast; 35 | 36 | HydroBC( 37 | Mesh* msh, 38 | const double2 v, 39 | const std::vector& mbp); 40 | 41 | ~HydroBC(); 42 | 43 | }; // class HydroBC 44 | 45 | 46 | #endif /* HYDROBC_HH_ */ 47 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 AMD HPC Application Performance Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Lecture1/HIP/dgemm/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # /*************************************************************************** 2 | # Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved. 3 | # ***************************************************************************/ 4 | 5 | include_directories("${CMAKE_SOURCE_DIR}") 6 | message("Running src/CMakeLists.txt...") 7 | # Archives and shared libraries go to ${CMAKE_INSTALL_LIBDIR}, executables to ${CMAKE_INSTALL_BINDIR}, both under the build tree. 8 | set( 9 | CMAKE_ARCHIVE_OUTPUT_DIRECTORY 10 | ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR} 11 | ) 12 | set( 13 | CMAKE_LIBRARY_OUTPUT_DIRECTORY 14 | ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR} 15 | ) 16 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY 17 | ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR} 18 | ) 19 | 20 | set(BINOUT dgemm) 21 | set(LIBOUT dgemm_lib) 22 | # Every .cpp in this directory is compiled as ${GPU_RUNTIME} source into both targets below. 23 | file(GLOB SOURCES "*.cpp") 24 | 25 | set_source_files_properties(${SOURCES} PROPERTIES LANGUAGE ${GPU_RUNTIME}) 26 | set_source_files_properties(${SOURCES} PROPERTIES COMPILE_FLAGS "${HIPCC_FLAGS}") 27 | 28 | add_library(${LIBOUT} ${SOURCES}) 29 | add_executable(${BINOUT} ${SOURCES}) 30 | 31 | target_include_directories( 32 | ${LIBOUT} 33 | PUBLIC 34 | # HIPCC_FLAGS holds compiler options, not directories; it is already applied through COMPILE_FLAGS above. 35 | ${CMAKE_CURRENT_LIST_DIR} 36 | ) 37 | 38 | target_link_libraries(${BINOUT} PUBLIC ${hipblas_LIBRARIES}) 39 | 40 | # EOF 41 | -------------------------------------------------------------------------------- /Lecture1/HIP/dgemm/src/matrix.h: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved. 
3 | ***************************************************************************/ 4 | 5 | #ifndef _MATRIX_H_ 6 | #define _MATRIX_H_ 7 | 8 | 9 | #include 10 | 11 | 12 | template 13 | class matrix{ 14 | public: 15 | matrix(int m, int n, T val=0) 16 | : m_rowcount(m), m_colcount(n) 17 | { 18 | m_data = std::vector(m*n, val); 19 | } 20 | 21 | double const& 22 | operator()(int i, int j) const{ 23 | return m_data[i*m_colcount + j]; 24 | } 25 | 26 | double& 27 | operator()(int i, int j){ 28 | return m_data[i*m_colcount + j]; 29 | } 30 | 31 | int 32 | row_count() const noexcept{ 33 | return m_rowcount; 34 | } 35 | 36 | int 37 | col_count() const noexcept{ 38 | return m_colcount; 39 | } 40 | 41 | std::vector const& 42 | data() const{ 43 | return m_data; 44 | } 45 | 46 | private: 47 | unsigned int m_rowcount = 0; 48 | unsigned int m_colcount = 0; 49 | std::vector m_data; 50 | }; 51 | 52 | 53 | using matrixd = matrix; 54 | 55 | 56 | #endif /* _MATRIX_H_ */ 57 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef TIMER_H 2 | #define TIMER_H 3 | 4 | #include 5 | 6 | #ifdef WIN32 7 | #define WIN32_LEAN_AND_MEAN 8 | #include 9 | #else 10 | #ifndef __USE_BSD 11 | #define __USE_BSD 12 | #endif 13 | #include 14 | #endif 15 | 16 | #ifdef WIN32 17 | double PCFreq = 0.0; 18 | __int64 timerStart = 0; 19 | #else 20 | struct timeval timerStart; 21 | #endif 22 | 23 | void StartTimer() 24 | { 25 | #ifdef WIN32 26 | LARGE_INTEGER li; 27 | if(!QueryPerformanceFrequency(&li)) 28 | printf("QueryPerformanceFrequency failed!\n"); 29 | 30 | PCFreq = (double)li.QuadPart/1000.0; 31 | 32 | QueryPerformanceCounter(&li); 33 | timerStart = li.QuadPart; 34 | #else 35 | gettimeofday(&timerStart, NULL); 36 | #endif 37 | } 38 | 39 | // time elapsed in ms 40 | double GetTimer() 41 | { 42 | #ifdef WIN32 43 | LARGE_INTEGER li; 44 | 
QueryPerformanceCounter(&li); 45 | return (double)(li.QuadPart-timerStart)/PCFreq; 46 | #else 47 | struct timeval timerStop, timerElapsed; 48 | gettimeofday(&timerStop, NULL); 49 | timersub(&timerStop, &timerStart, &timerElapsed); 50 | return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0; 51 | #endif 52 | } 53 | 54 | #endif // TIMER_H 55 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/src/InputFile.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * InputFile.hh 3 | * 4 | * Created on: Mar 20, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #ifndef INPUTFILE_HH_ 14 | #define INPUTFILE_HH_ 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | 21 | class InputFile { 22 | public: 23 | InputFile(const char* filename); 24 | ~InputFile(); 25 | int getInt(const std::string& key, const int dflt) const; 26 | double getDouble(const std::string& key, const double dflt) const; 27 | std::string getString(const std::string& key, 28 | const std::string& dflt) const; 29 | std::vector getDoubleList( 30 | const std::string& key, 31 | const std::vector& dflt) const; 32 | 33 | private: 34 | typedef std::map pairstype; 35 | 36 | pairstype pairs; // map of key-value string pairs 37 | 38 | template 39 | T get(const std::string& key, const T& dflt) const; 40 | 41 | 42 | }; // class InputFile 43 | 44 | 45 | #endif /* INPUTFILE_HH_ */ 46 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/src/InputFile.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * InputFile.hh 3 | * 4 | * Created on: Mar 20, 2012 5 | * Author: cferenba 6 | * 7 | * 
Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #ifndef INPUTFILE_HH_ 14 | #define INPUTFILE_HH_ 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | 21 | class InputFile { 22 | public: 23 | InputFile(const char* filename); 24 | ~InputFile(); 25 | int getInt(const std::string& key, const int dflt) const; 26 | double getDouble(const std::string& key, const double dflt) const; 27 | std::string getString(const std::string& key, 28 | const std::string& dflt) const; 29 | std::vector getDoubleList( 30 | const std::string& key, 31 | const std::vector& dflt) const; 32 | 33 | private: 34 | typedef std::map pairstype; 35 | 36 | pairstype pairs; // map of key-value string pairs 37 | 38 | template 39 | T get(const std::string& key, const T& dflt) const; 40 | 41 | 42 | }; // class InputFile 43 | 44 | 45 | #endif /* INPUTFILE_HH_ */ 46 | -------------------------------------------------------------------------------- /Lecture1/HIP/hip-stream/README.md: -------------------------------------------------------------------------------- 1 | Originally from HIP-Examples as cuda-stream 2 | https://github.com/ROCm-Developer-Tools/HIP-Examples 3 | Code based on the code developed by John D. 
McCalpin 4 | http://www.cs.virginia.edu/stream/FTP/Code/stream.c 5 | 6 | cuda version written by: Massimiliano Fatica, NVIDIA Corporation 7 | 8 | Further modifications by: Ben Cumming, CSCS 9 | 10 | Ported to HIP by: Peng Sun, AMD 11 | 12 | The benchmark is modified from STREAM benchmark implementation with the following kernels: 13 | ``` 14 | COPY: a(i) = b(i) 15 | SCALE: a(i) = q*b(i) 16 | SUM: a(i) = b(i) + c(i) 17 | TRIAD: a(i) = b(i) + q*c(i) 18 | ``` 19 | 20 | For ROCm environment 21 | ``` 22 | module load rocm 23 | module load cmake 24 | export CXX=${ROCM_PATH}/llvm/bin/clang++ 25 | ``` 26 | 27 | For ROCm with make 28 | ``` 29 | make 30 | ``` 31 | 32 | For ROCm with cmake 33 | ``` 34 | mkdir build && cd build 35 | cmake .. 36 | make VERBOSE=1 37 | ./stream 38 | ctest 39 | ``` 40 | 41 | For CUDA environment 42 | ``` 43 | module load rocm 44 | module load CUDA/11.8 45 | module load cmake 46 | ``` 47 | 48 | For CUDA with make 49 | ``` 50 | HIP_PLATFORM=nvidia make 51 | ``` 52 | 53 | For CUDA with cmake 54 | ``` 55 | mkdir build && cd build 56 | cmake -DCMAKE_GPU_RUNTIME=CUDA .. 57 | make VERBOSE=1 58 | ./stream 59 | ctest 60 | ``` 61 | -------------------------------------------------------------------------------- /Lecture1/HIP/dgemm/src/darray.h: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved. 
3 | ***************************************************************************/ 4 | 5 | #ifndef _DARRAY_H_ 6 | #define _DARRAY_H_ 7 | 8 | 9 | #include 10 | #include "utils.h" 11 | 12 | 13 | 14 | /** Device array 15 | * 16 | * Owns a buffer obtained from hipMalloc and releases it with hipFree on 17 | * destruction. Move-only: copying is deleted, and a moved-from darray holds 18 | * a null pointer so the buffer is freed exactly once. 19 | */ 20 | template < 21 | typename T, 22 | typename = typename std::enable_if::value, T>::type 23 | > 24 | class darray{ 25 | public: 26 | 27 | /** Allocate an array for n number of elements 28 | */ 29 | darray(size_t n){ 30 | check_stat( 31 | hipMalloc((void**)&m_data, sizeof(T)*n) 32 | ); 33 | }; 34 | 35 | ~darray(){ 36 | check_stat( 37 | hipFree(m_data) 38 | ); 39 | }; 40 | 41 | /** Take over that's buffer; that is left holding a null pointer. */ 42 | darray(darray && that) : m_data(that.m_data){ 43 | that.m_data = nullptr; 44 | }; 45 | 46 | /** Release the buffer currently held, then take over that's buffer. */ 47 | darray const& operator=(darray && that){ 48 | if(this != &that){ 49 | check_stat( 50 | hipFree(m_data) 51 | ); 52 | m_data = that.m_data; 53 | that.m_data = nullptr; 54 | } 55 | return *this; 56 | } 57 | 58 | darray(darray const&) = delete; 59 | darray const& operator=(darray const&) = delete; 60 | 61 | 62 | operator T*() const{ 63 | return m_data; 64 | } 65 | 66 | 67 | T* 68 | data(){ 69 | return m_data; 70 | } 71 | 72 | 73 | private: 74 | T *m_data = nullptr; 75 | }; 76 | 77 | 78 | #endif /* _DARRAY_H_ */ 79 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/README-Frontier-users.md: -------------------------------------------------------------------------------- 1 | # Instructions for loading Omniperf on Frontier: 2 | 3 | To load the appropriate environment, you should be able to simply run: 4 | ``` 5 | module use /autofs/nccs-svm1_sw/crusher/amdsw/modules 6 | module load PrgEnv-amd amd omniperf 7 | ``` 8 | This should pull in Omniperf, ROCm, and all the dependencies necessary to run our exercises. 9 | It is worthy of note that this version of Omniperf is a pre-release candidate for 1.1.0. 10 | 11 | The module use command will make the omniperf/1.1.0-PR1 pre-release version the default and 12 | then the module load omniperf will get this version. 
13 | 14 | >Note: By default this loads ROCm 5.3.0, which may not show the issue talked about in Exercise 3: Register Occupancy Limiter 15 | 16 | To allocate an interactive job on Frontier: 17 | ``` 18 | salloc -N 1 -p batch --reservation=hip_training_2023_10_16 --gpus=1 -t 10:00 -A <project_id> 19 | ``` 20 | 21 | Use your project ID in the project field. If you're unsure of what projects are available to you, run the above command without the `-A` option, and it will report a list of your valid projects. 22 | 23 | Outside our reservation window, you can do: 24 | ``` 25 | salloc -N 1 -p batch --gpus=1 -t 10:00 -A <project_id> 26 | ``` 27 | 28 | As is usual in this lecture series, the linked Google doc for comments, questions and answers 29 | will be at https://docs.google.com/document/d/1aUfzofSgxCn-gkejJHDlh54YX5mRMaj5xkAO7zEZUkQ/edit 30 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/README: -------------------------------------------------------------------------------- 1 | PENNANT Description: 2 | 3 | PENNANT is an unstructured mesh physics mini-app designed for advanced 4 | architecture research. It contains mesh data structures and a few 5 | physics algorithms adapted from the LANL rad-hydro code FLAG, and gives 6 | a sample of the typical memory access patterns of FLAG. 7 | 8 | Further documentation can be found in the 'doc' directory of the 9 | PENNANT distribution. 10 | 11 | 12 | Version Log: 13 | 14 | 0.6, February 2014: 15 | Replaced GMV mesh reader with internal mesh generators. 16 | Added QCS velocity difference routine to reflect a recent 17 | bugfix in FLAG. Increased size of big test problems. 18 | [ Master branch contained this change but CUDA branch does not: 19 | First MPI version. MPI capability is working and mostly 20 | optimized; MPI+OpenMP is working but needs optimization. ] 21 | 22 | 0.5, May 2013: 23 | Further optimizations. 24 | 25 | 0.4, January 2013: 26 | First open-source release.
Fixed a bug in QCS and added some 27 | optimizations. Added Sedov and Leblanc test problems, and some 28 | new input keywords to support them. 29 | 30 | 0.3, July 2012: 31 | Added OpenMP pragmas and point chunk processing. Modified physics 32 | state arrays to be flat arrays instead of STL vectors. 33 | 34 | 0.2, June 2012: 35 | Added side chunk processing. Miscellaneous minor cleanup. 36 | 37 | 0.1, March 2012: 38 | Initial release, internal LANL only. 39 | 40 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/README: -------------------------------------------------------------------------------- 1 | PENNANT Description: 2 | 3 | PENNANT is an unstructured mesh physics mini-app designed for advanced 4 | architecture research. It contains mesh data structures and a few 5 | physics algorithms adapted from the LANL rad-hydro code FLAG, and gives 6 | a sample of the typical memory access patterns of FLAG. 7 | 8 | Further documentation can be found in the 'doc' directory of the 9 | PENNANT distribution. 10 | 11 | 12 | Version Log: 13 | 14 | 0.6, February 2014: 15 | Replaced GMV mesh reader with internal mesh generators. 16 | Added QCS velocity difference routine to reflect a recent 17 | bugfix in FLAG. Increased size of big test problems. 18 | [ Master branch contained this change but CUDA branch does not: 19 | First MPI version. MPI capability is working and mostly 20 | optimized; MPI+OpenMP is working but needs optimization. ] 21 | 22 | 0.5, May 2013: 23 | Further optimizations. 24 | 25 | 0.4, January 2013: 26 | First open-source release. Fixed a bug in QCS and added some 27 | optimizations. Added Sedov and Leblanc test problems, and some 28 | new input keywords to support them. 29 | 30 | 0.3, July 2012: 31 | Added OpenMP pragmas and point chunk processing. Modified physics 32 | state arrays to be flat arrays instead of STL vectors. 33 | 34 | 0.2, June 2012: 35 | Added side chunk processing. 
Miscellaneous minor cleanup. 36 | 37 | 0.1, March 2012: 38 | Initial release, internal LANL only. 39 | 40 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/src/ExportGold.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * ExportGold.hh 3 | * 4 | * Created on: Mar 1, 2012 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #ifndef EXPORTGOLD_HH_ 14 | #define EXPORTGOLD_HH_ 15 | 16 | #include 17 | #include 18 | 19 | // forward declarations 20 | class Mesh; 21 | 22 | 23 | class ExportGold { 24 | public: 25 | 26 | Mesh* mesh; 27 | 28 | std::vector tris; // zone index list for 3-sided zones 29 | std::vector quads; // same, for 4-sided zones 30 | std::vector others; // same, for n-sided zones, n > 4 31 | std::vector mapzs; // map: zone -> first side 32 | 33 | ExportGold(Mesh* m); 34 | ~ExportGold(); 35 | 36 | void write( 37 | const std::string& basename, 38 | const int cycle, 39 | const double time, 40 | const double* zr, 41 | const double* ze, 42 | const double* zp); 43 | 44 | void writeCaseFile( 45 | const std::string& basename); 46 | 47 | void writeGeoFile( 48 | const std::string& basename, 49 | const int cycle, 50 | const double time); 51 | 52 | void writeVarFile( 53 | const std::string& basename, 54 | const std::string& varname, 55 | const double* var); 56 | 57 | void sortZones(); 58 | }; 59 | 60 | 61 | 62 | #endif /* EXPORTGOLD_HH_ */ 63 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/src/ExportGold.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * ExportGold.hh 3 | * 4 | * Created on: Mar 1, 2012 5 | * Author: cferenba 6 | * 7 | * 
Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #ifndef EXPORTGOLD_HH_ 14 | #define EXPORTGOLD_HH_ 15 | 16 | #include 17 | #include 18 | 19 | // forward declarations 20 | class Mesh; 21 | 22 | 23 | class ExportGold { 24 | public: 25 | 26 | Mesh* mesh; 27 | 28 | std::vector tris; // zone index list for 3-sided zones 29 | std::vector quads; // same, for 4-sided zones 30 | std::vector others; // same, for n-sided zones, n > 4 31 | std::vector mapzs; // map: zone -> first side 32 | 33 | ExportGold(Mesh* m); 34 | ~ExportGold(); 35 | 36 | void write( 37 | const std::string& basename, 38 | const int cycle, 39 | const double time, 40 | const double* zr, 41 | const double* ze, 42 | const double* zp); 43 | 44 | void writeCaseFile( 45 | const std::string& basename); 46 | 47 | void writeGeoFile( 48 | const std::string& basename, 49 | const int cycle, 50 | const double time); 51 | 52 | void writeVarFile( 53 | const std::string& basename, 54 | const std::string& varname, 55 | const double* var); 56 | 57 | void sortZones(); 58 | }; 59 | 60 | 61 | 62 | #endif /* EXPORTGOLD_HH_ */ 63 | -------------------------------------------------------------------------------- /Lecture4/jacobi/JacobiIteration.hip: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved. 
// Simulation driver: holds the Mesh and Hydro objects together with the
// time-stepping state (current time/cycle, stop criteria, timestep limits
// and the messages describing what limited the timestep).
// Constructed from an InputFile and a problem name; the member functions
// are defined elsewhere.
class Driver {
public:

    // children of this object
    Mesh *mesh;
    Hydro *hydro;

    std::string probname;          // problem name
    double time;                   // simulation time
    int cycle;                     // simulation cycle number
    double tstop;                  // simulation stop time
    int cstop;                     // simulation stop cycle
    double dtmax;                  // maximum timestep size
    double dtinit;                 // initial timestep size
    double dtfac;                  // factor limiting timestep growth
    int dtreport;                  // frequency for timestep reports
    double dt;                     // current timestep
    double dtlast;                 // previous timestep
    std::string msgdt;             // dt limiter message
    std::string msgdtlast;         // previous dt limiter message

    // inp: parsed input file; pname: problem name
    Driver(const InputFile* inp, const std::string& pname);
    ~Driver();

    // Run the simulation — presumably until tstop or cstop is reached;
    // confirm in the definition.
    void run();
    // Compute the next timestep — presumably updates dt/msgdt from the
    // limits above; confirm in the definition.
    void calcGlobalDt();

};  // class Driver
// Simulation driver: holds the Mesh and Hydro objects together with the
// time-stepping state (current time/cycle, stop criteria, timestep limits
// and the messages describing what limited the timestep).
// Constructed from an InputFile and a problem name; the member functions
// are defined elsewhere.
class Driver {
public:

    // children of this object
    Mesh *mesh;
    Hydro *hydro;

    std::string probname;          // problem name
    double time;                   // simulation time
    int cycle;                     // simulation cycle number
    double tstop;                  // simulation stop time
    int cstop;                     // simulation stop cycle
    double dtmax;                  // maximum timestep size
    double dtinit;                 // initial timestep size
    double dtfac;                  // factor limiting timestep growth
    int dtreport;                  // frequency for timestep reports
    double dt;                     // current timestep
    double dtlast;                 // previous timestep
    std::string msgdt;             // dt limiter message
    std::string msgdtlast;         // previous dt limiter message

    // inp: parsed input file; pname: problem name
    Driver(const InputFile* inp, const std::string& pname);
    ~Driver();

    // Run the simulation — presumably until tstop or cstop is reached;
    // confirm in the definition.
    void run();
    // Compute the next timestep — presumably updates dt/msgdt from the
    // limits above; confirm in the definition.
    void calcGlobalDt();

};  // class Driver
# Flags passed to hipcc for every .hip source (backend flags are added below).
HIPCC_FLAGS = -O3 -g -DNDEBUG

# GPU backend hipcc should target; override with `make HIP_PLATFORM=nvidia`.
HIP_PLATFORM ?= amd

ifeq ($(HIP_PLATFORM), nvidia)
   HIP_PATH ?= $(shell hipconfig --path)
   HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/
endif
ifeq ($(HIP_PLATFORM), amd)
   HIPCC_FLAGS += -x hip -munsafe-fp-atomics
endif

# Dependency files for host sources.
$(BUILDDIR)/%.d : $(SRCDIR)/%.cc
	@echo making depends for $<
	$(maketargetdir)
	@$(CXX) $(CXXFLAGS) $(CXXINCLUDES) -M $< | sed "1s![^ \t]\+\.o!$(@:.d=.o) $@!" >$@

# Dependency files for device sources; must see the same flags as the
# compile rule (HIPCC_FLAGS), otherwise -x hip / -x cu is missing.
$(BUILDDIR)/%.d : $(SRCDIR)/%.hip
	@echo making depends for $<
	$(maketargetdir)
	@hipcc $(HIPCC_FLAGS) $(HIPCCINCLUDES) -M $< | sed "1s![^ \t]\+\.o!$(@:.d=.o) $@!" >$@

$(BUILDDIR)/%.o : $(SRCDIR)/%.cc
	@echo compiling $<
	$(maketargetdir)
	$(CXX) $(CXXFLAGS) $(CXXINCLUDES) -c -o $@ $<

# $< (first prerequisite) rather than $^ so that adding header
# prerequisites later does not put them on the compile line.
$(BUILDDIR)/%.o : $(SRCDIR)/%.hip
	@echo compiling $<
	$(maketargetdir)
	hipcc $(HIPCC_FLAGS) -c $< -o $@

$(BUILDDIR)/$(EXECUTABLE) : $(OBJECTS)
	@echo linking $@
	$(maketargetdir)
	hipcc $(OBJECTS) $(LDFLAGS) -o $@

test : $(BUILDDIR)/$(EXECUTABLE)
	$(BUILDDIR)/$(EXECUTABLE) test/sedovbig/sedovbig.pnt

# Create the directory of the current target; failures are ignored.
define maketargetdir
	-@mkdir -p $(dir $@) > /dev/null 2>&1
endef

# all/clean never produce files of that name.
.PHONY: all clean

clean :
	rm -rf $(BUILDDIR)
OBJECTS += $(BUILDDIR)/WriteXY.o

# Host compiler flags and hipcc flags (backend flags are added below).
CXXFLAGS = -g -O3 -DNDEBUG -fPIC
HIPCC_FLAGS = -O3 -g -DNDEBUG

# GPU backend hipcc should target; override with `make HIP_PLATFORM=nvidia`.
HIP_PLATFORM ?= amd

ifeq ($(HIP_PLATFORM), nvidia)
   HIP_PATH ?= $(shell hipconfig --path)
   HIPCC_FLAGS += -x cu -I${HIP_PATH}/include/
endif
ifeq ($(HIP_PLATFORM), amd)
   HIPCC_FLAGS += -x hip -munsafe-fp-atomics
endif

# Dependency files for host sources.
$(BUILDDIR)/%.d : $(SRCDIR)/%.cc
	@echo making depends for $<
	$(maketargetdir)
	@$(CXX) $(CXXFLAGS) $(CXXINCLUDES) -M $< | sed "1s![^ \t]\+\.o!$(@:.d=.o) $@!" >$@

# Dependency files for device sources; must see the same flags as the
# compile rule (HIPCC_FLAGS), otherwise -x hip / -x cu is missing.
$(BUILDDIR)/%.d : $(SRCDIR)/%.hip
	@echo making depends for $<
	$(maketargetdir)
	@hipcc $(HIPCC_FLAGS) $(HIPCCINCLUDES) -M $< | sed "1s![^ \t]\+\.o!$(@:.d=.o) $@!" >$@

$(BUILDDIR)/%.o : $(SRCDIR)/%.cc
	@echo compiling $<
	$(maketargetdir)
	$(CXX) $(CXXFLAGS) $(CXXINCLUDES) -c -o $@ $<

# $< (first prerequisite) rather than $^ so that adding header
# prerequisites later does not put them on the compile line.
$(BUILDDIR)/%.o : $(SRCDIR)/%.hip
	@echo compiling $<
	$(maketargetdir)
	hipcc $(HIPCC_FLAGS) -c $< -o $@

$(BUILDDIR)/$(EXECUTABLE) : $(OBJECTS)
	@echo linking $@
	$(maketargetdir)
	hipcc $(OBJECTS) $(LDFLAGS) -o $@

test : $(BUILDDIR)/$(EXECUTABLE)
	$(BUILDDIR)/$(EXECUTABLE) test/sedovbig/sedovbig.pnt

# Create the directory of the current target; failures are ignored.
define maketargetdir
	-@mkdir -p $(dir $@) > /dev/null 2>&1
endef

# all/clean never produce files of that name.
.PHONY: all clean

clean :
	rm -rf $(BUILDDIR)
// Entry points of the GPU hydro implementation (defined elsewhere).
// NOTE(review): the trailing "H" on parameter names presumably marks
// host-side values/buffers — confirm against the definitions.

// Hands the problem description to the GPU module: element counts
// (num*H), scalar physics/timestep parameters, the x/y boundary-condition
// lists with their lengths, the initial point/zone/side arrays and the
// side connectivity maps (maps*H).
// pxH/puH are passed as untyped pointers in this version; the element
// layout is not visible here — TODO confirm.
void hydroInit(
        const int numpH,
        const int numzH,
        const int numsH,
        const int numcH,
        const int numeH,
        const double pgammaH,
        const double pssminH,
        const double talfaH,
        const double tssminH,
        const double qgammaH,
        const double q1H,
        const double q2H,
        const double hcflH,
        const double hcflvH,
        const int numbcxH,
        const double* bcxH,
        const int numbcyH,
        const double* bcyH,
        const void* pxH,
        const void* puH,
        const double* zmH,
        const double* zrH,
        const double* zvolH,
        const double* zeH,
        const double* zetotH,
        const double* zwrateH,
        const double* smfH,
        const int* mapsp1H,
        const int* mapsp2H,
        const int* mapszH,
        const int* mapss4H,
        const int* mapseH,
        const int* znumpH);

// Runs one cycle with timestep dtH; dtnextH and idtnextH are output
// references (presumably the next timestep and an identifier of what
// limited it — confirm).
void hydroDoCycle(
        const double dtH,
        double& dtnextH,
        int& idtnextH);

// Fills the caller's buffers pxH, zrH, zeH, zpH; numpH/numzH give the
// point and zone counts.
void hydroGetData(
        const int numpH,
        const int numzH,
        void* pxH,
        double* zrH,
        double* zeH,
        double* zpH);

// GPU set-up and tear-down hooks.
void hydroInitGPU();

void hydroFinalGPU();
// Entry points of the GPU hydro implementation (defined elsewhere).
// NOTE(review): the trailing "H" on parameter names presumably marks
// host-side values/buffers — confirm against the definitions.

// Hands the problem description to the GPU module: element counts
// (num*H), scalar physics/timestep parameters, the x/y boundary-condition
// lists with their lengths, the initial point/zone/side arrays and the
// side connectivity maps (maps*H).
// pxH/puH are arrays of double2 (one per point, presumably — confirm).
void hydroInit(
        const int numpH,
        const int numzH,
        const int numsH,
        const int numcH,
        const int numeH,
        const double pgammaH,
        const double pssminH,
        const double talfaH,
        const double tssminH,
        const double qgammaH,
        const double q1H,
        const double q2H,
        const double hcflH,
        const double hcflvH,
        const int numbcxH,
        const double* bcxH,
        const int numbcyH,
        const double* bcyH,
        const double2* pxH,
        const double2* puH,
        const double* zmH,
        const double* zrH,
        const double* zvolH,
        const double* zeH,
        const double* zetotH,
        const double* zwrateH,
        const double* smfH,
        const int* mapsp1H,
        const int* mapsp2H,
        const int* mapszH,
        const int* mapss4H,
        const int* mapseH,
        const int* znumpH);

// Runs one cycle with timestep dtH; dtnextH and idtnextH are output
// references (presumably the next timestep and an identifier of what
// limited it — confirm).
void hydroDoCycle(
        const double dtH,
        double& dtnextH,
        int& idtnextH);

// Fills the caller's buffers pxH, zrH, zeH, zpH; numpH/numzH give the
// point and zone counts.
void hydroGetData(
        const int numpH,
        const int numzH,
        double2* pxH,
        double* zrH,
        double* zeH,
        double* zpH);

// GPU set-up and tear-down hooks.
void hydroInitGPU();

void hydroFinalGPU();
// Namespace Parallel provides helper functions and variables for
// running in distributed parallel mode using MPI, or for stubbing
// these out if not using MPI.

namespace Parallel {
    extern int numpe;           // number of MPI PEs in use
                                // (1 if not using MPI)
    extern int mype;            // PE number for my rank
                                // (0 if not using MPI)

    void init();                // initialize MPI
    void final();               // finalize MPI

    void globalMinLoc(double& x, int& xpe);
                                // find minimum over all PEs, and
                                // report which PE had the minimum
    void globalSum(int& x);     // find sum over all PEs
    void gather(const int x, int* y);
                                // gather list of ints from all PEs
    void scatter(const int* x, int& y);
                                // scatter list of ints, one to each PE

    template <typename T>
    void gatherv(               // gather variable-length list
            const T *x, const int numx,
            T* y, const int* numy);
    template <typename T>
    void gathervImpl(           // helper function for gatherv
            const T *x, const int numx,
            T* y, const int* numy);

}  // namespace Parallel
// Namespace Parallel provides helper functions and variables for
// running in distributed parallel mode using MPI, or for stubbing
// these out if not using MPI.

namespace Parallel {
    extern int numpe;           // number of MPI PEs in use
                                // (1 if not using MPI)
    extern int mype;            // PE number for my rank
                                // (0 if not using MPI)

    void init();                // initialize MPI
    void final();               // finalize MPI

    void globalMinLoc(double& x, int& xpe);
                                // find minimum over all PEs, and
                                // report which PE had the minimum
    void globalSum(int& x);     // find sum over all PEs
    void gather(const int x, int* y);
                                // gather list of ints from all PEs
    void scatter(const int* x, int& y);
                                // scatter list of ints, one to each PE

    template <typename T>
    void gatherv(               // gather variable-length list
            const T *x, const int numx,
            T* y, const int* numy);
    template <typename T>
    void gathervImpl(           // helper function for gatherv
            const T *x, const int numx,
            T* y, const int* numy);

}  // namespace Parallel
argv){ 24 | dim3 grid = dim3(4,1,1); 25 | dim3 block = dim3(64,1,1); 26 | int n = 2<<14; 27 | int m = 2<<14; 28 | 29 | double* y; 30 | double* x; 31 | double* A; 32 | double* result; 33 | 34 | hipCheck(hipMalloc(&y, n*sizeof(double))); 35 | hipCheck(hipMalloc(&x, m*sizeof(double))); 36 | hipCheck(hipMalloc(&A, n*m*sizeof(double))); 37 | hipCheck(hipMalloc(&result, sizeof(double))); 38 | 39 | for(int i = 0; i < n; i++){ 40 | y[i] = 1; 41 | } 42 | for(int i = 0; i < m; i++){ 43 | x[i] = 1; 44 | } 45 | for(int i = 0; i < n*m; i++){ 46 | A[i] = 1; 47 | } 48 | result[0] = 0; 49 | 50 | yax<<>>(y,A,x,n,m,result); 51 | hipDeviceSynchronize(); 52 | result[0] = 0; 53 | 54 | auto start = std::chrono::high_resolution_clock::now(); 55 | yax<<>>(y,A,x,n,m,result); 56 | hipDeviceSynchronize(); 57 | auto stop = std::chrono::high_resolution_clock::now(); 58 | 59 | double expected = (double)n * (double)m; 60 | if(result[0] - (double)n*(double)m >= 0.0001) { 61 | printf("Answer is incorrect!\n"); 62 | printf("result = %f, expected = %f\n",result[0],expected); 63 | } else { 64 | auto time = std::chrono::duration_cast(stop - start).count() * 1e-6; 65 | printf("yAx time: %f milliseconds\n", time); 66 | } 67 | 68 | hipFree(y); 69 | hipFree(x); 70 | hipFree(A); 71 | return 0; 72 | } 73 | 74 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/1-LaunchParameters/solution/solution.cpp: -------------------------------------------------------------------------------- 1 | #include"hip/hip_runtime.h" 2 | #include 3 | #include"hipCheck.h" 4 | 5 | __global__ void yax(double* y, 6 | double* A, 7 | double* x, 8 | int n, int m, 9 | double* result){ 10 | double res = 0.0; 11 | for(int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; 12 | i += gridDim.x * blockDim.x){ 13 | double temp = 0.0; 14 | for(int j = 0; j < m; j++){ 15 | temp += A[i*m+j] * x[j]; 16 | } 17 | res += y[i] * temp; 18 | } 19 | unsafeAtomicAdd(&result[0],res); 20 | } 21 | 22 
| int main(int argc, char** argv){ 23 | dim3 grid = dim3(2048,1,1); 24 | dim3 block = dim3(64,1,1); 25 | int n = 2<<14; 26 | int m = 2<<14; 27 | double* y; 28 | double* x; 29 | double* A; 30 | double* result; 31 | 32 | hipCheck(hipMalloc(&y, n*sizeof(double))); 33 | hipCheck(hipMalloc(&x, m*sizeof(double))); 34 | hipCheck(hipMalloc(&A, n*m*sizeof(double))); 35 | hipCheck(hipMalloc(&result, sizeof(double))); 36 | 37 | for(int i = 0; i < n; i++){ 38 | y[i] = 1; 39 | } 40 | for(int i = 0; i < m; i++){ 41 | x[i] = 1; 42 | } 43 | for(int i = 0; i < n*m; i++){ 44 | A[i] = 1; 45 | } 46 | result[0] = 0; 47 | 48 | yax<<>>(y,A,x,n,m,result); 49 | hipDeviceSynchronize(); 50 | result[0] = 0; 51 | 52 | auto start = std::chrono::high_resolution_clock::now(); 53 | yax<<>>(y,A,x,n,m,result); 54 | hipDeviceSynchronize(); 55 | auto stop = std::chrono::high_resolution_clock::now(); 56 | 57 | double expected = (double)n * (double)m; 58 | if(result[0] - (double)n*(double)m >= 0.0001) { 59 | printf("Answer is incorrect!\n"); 60 | printf("result = %f, expected = %f\n",result[0],expected); 61 | } else { 62 | auto time = std::chrono::duration_cast(stop - start).count() * 1e-6; 63 | printf("yAx time: %f milliseconds\n", time); 64 | } 65 | 66 | hipFree(y); 67 | hipFree(x); 68 | hipFree(A); 69 | return 0; 70 | } 71 | 72 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/src/WriteXY.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * WriteXY.cc 3 | * 4 | * Created on: Dec 16, 2013 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 
11 | */ 12 | 13 | #include "WriteXY.hh" 14 | 15 | #include 16 | #include 17 | 18 | #include "Parallel.hh" 19 | #include "Mesh.hh" 20 | 21 | using namespace std; 22 | 23 | 24 | WriteXY::WriteXY(Mesh* m) : mesh(m) {} 25 | 26 | WriteXY::~WriteXY() {} 27 | 28 | 29 | void WriteXY::write( 30 | const string& basename, 31 | const double* zr, 32 | const double* ze, 33 | const double* zp) { 34 | 35 | using Parallel::numpe; 36 | using Parallel::mype; 37 | const int numz = mesh->numz; 38 | 39 | int gnumz = numz; 40 | Parallel::globalSum(gnumz); 41 | gnumz = (mype == 0 ? gnumz : 0); 42 | vector penumz(mype == 0 ? numpe : 0); 43 | Parallel::gather(numz, &penumz[0]); 44 | 45 | vector gzr(gnumz), gze(gnumz), gzp(gnumz); 46 | Parallel::gatherv(&zr[0], numz, &gzr[0], &penumz[0]); 47 | Parallel::gatherv(&ze[0], numz, &gze[0], &penumz[0]); 48 | Parallel::gatherv(&zp[0], numz, &gzp[0], &penumz[0]); 49 | 50 | if (mype == 0) { 51 | string xyname = basename + ".xy"; 52 | ofstream ofs(xyname.c_str()); 53 | ofs << scientific << setprecision(8); 54 | ofs << "# zr" << endl; 55 | for (int z = 0; z < gnumz; ++z) { 56 | ofs << setw(5) << (z + 1) << setw(18) << gzr[z] << endl; 57 | } 58 | ofs << "# ze" << endl; 59 | for (int z = 0; z < gnumz; ++z) { 60 | ofs << setw(5) << (z + 1) << setw(18) << gze[z] << endl; 61 | } 62 | ofs << "# zp" << endl; 63 | for (int z = 0; z < gnumz; ++z) { 64 | ofs << setw(5) << (z + 1) << setw(18) << gzp[z] << endl; 65 | } 66 | ofs.close(); 67 | 68 | } // if mype 69 | 70 | } 71 | 72 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/src/WriteXY.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * WriteXY.cc 3 | * 4 | * Created on: Dec 16, 2013 5 | * Author: cferenba 6 | * 7 | * Copyright (c) 2012, Los Alamos National Security, LLC. 8 | * All rights reserved. 
9 | * Use of this source code is governed by a BSD-style open-source 10 | * license; see top-level LICENSE file for full license text. 11 | */ 12 | 13 | #include "WriteXY.hh" 14 | 15 | #include 16 | #include 17 | 18 | #include "Parallel.hh" 19 | #include "Mesh.hh" 20 | 21 | using namespace std; 22 | 23 | 24 | WriteXY::WriteXY(Mesh* m) : mesh(m) {} 25 | 26 | WriteXY::~WriteXY() {} 27 | 28 | 29 | void WriteXY::write( 30 | const string& basename, 31 | const double* zr, 32 | const double* ze, 33 | const double* zp) { 34 | 35 | using Parallel::numpe; 36 | using Parallel::mype; 37 | const int numz = mesh->numz; 38 | 39 | int gnumz = numz; 40 | Parallel::globalSum(gnumz); 41 | gnumz = (mype == 0 ? gnumz : 0); 42 | vector penumz(mype == 0 ? numpe : 0); 43 | Parallel::gather(numz, &penumz[0]); 44 | 45 | vector gzr(gnumz), gze(gnumz), gzp(gnumz); 46 | Parallel::gatherv(&zr[0], numz, &gzr[0], &penumz[0]); 47 | Parallel::gatherv(&ze[0], numz, &gze[0], &penumz[0]); 48 | Parallel::gatherv(&zp[0], numz, &gzp[0], &penumz[0]); 49 | 50 | if (mype == 0) { 51 | string xyname = basename + ".xy"; 52 | ofstream ofs(xyname.c_str()); 53 | ofs << scientific << setprecision(8); 54 | ofs << "# zr" << endl; 55 | for (int z = 0; z < gnumz; ++z) { 56 | ofs << setw(5) << (z + 1) << setw(18) << gzr[z] << endl; 57 | } 58 | ofs << "# ze" << endl; 59 | for (int z = 0; z < gnumz; ++z) { 60 | ofs << setw(5) << (z + 1) << setw(18) << gze[z] << endl; 61 | } 62 | ofs << "# zp" << endl; 63 | for (int z = 0; z < gnumz; ++z) { 64 | ofs << setw(5) << (z + 1) << setw(18) << gzp[z] << endl; 65 | } 66 | ofs.close(); 67 | 68 | } // if mype 69 | 70 | } 71 | 72 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/2-LDSOccupancyLimit/solution-no-lds/solution.cpp: -------------------------------------------------------------------------------- 1 | #include"hip/hip_runtime.h" 2 | #include 3 | #include"hipCheck.h" 4 | 5 | constexpr size_t partially_allocate_lds = 
64ul * 1024ul / sizeof(double)/32; 6 | 7 | __global__ void yax(double* y, 8 | double* A, 9 | double* x, 10 | int n, int m, 11 | double* result){ 12 | double res = 0.0; 13 | for(int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; 14 | i += gridDim.x * blockDim.x){ 15 | double temp = 0.0; 16 | for(int j = 0; j < m; j++){ 17 | temp += A[i*n+j] * x[j]; 18 | } 19 | res += y[i] * temp; 20 | } 21 | unsafeAtomicAdd(&result[0],res); 22 | } 23 | 24 | int main(int argc, char** argv){ 25 | dim3 grid = dim3(2048,1,1); 26 | dim3 block = dim3(64,1,1); 27 | int n = 2<<14; 28 | int m = 2<<14; 29 | 30 | double* y; 31 | double* x; 32 | double* A; 33 | double* result; 34 | 35 | hipCheck(hipMalloc(&y, n*sizeof(double))); 36 | hipCheck(hipMalloc(&x, m*sizeof(double))); 37 | hipCheck(hipMalloc(&A, n*m*sizeof(double))); 38 | hipCheck(hipMalloc(&result, sizeof(double))); 39 | 40 | for(int i = 0; i < n; i++){ 41 | y[i] = 1; 42 | } 43 | for(int i = 0; i < m; i++){ 44 | x[i] = 1; 45 | } 46 | for(int i = 0; i < n*m; i++){ 47 | A[i] = 1; 48 | } 49 | result[0] = 0; 50 | 51 | yax<<>>(y,A,x,n,m,result); 52 | hipDeviceSynchronize(); 53 | result[0] = 0; 54 | auto start = std::chrono::high_resolution_clock::now(); 55 | yax<<>>(y,A,x,n,m,result); 56 | hipDeviceSynchronize(); 57 | auto stop = std::chrono::high_resolution_clock::now(); 58 | 59 | double expected = (double)n * (double)m; 60 | if(result[0] - (double)n*(double)m >= 0.0001) { 61 | printf("Answer is incorrect!\n"); 62 | printf("result = %f, expected = %f\n",result[0],expected); 63 | } else { 64 | auto time = std::chrono::duration_cast(stop - start).count() * 1e-6; 65 | printf("yAx time: %f milliseconds\n", time); 66 | } 67 | 68 | hipFree(y); 69 | hipFree(x); 70 | hipFree(A); 71 | 72 | return 0; 73 | } 74 | 75 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/4-StridedAccess/problem.cpp: -------------------------------------------------------------------------------- 1 | 
#include"hip/hip_runtime.h"
#include<chrono>
#include"hipCheck.h"

// Computes the scalar y^T * A * x and atomically adds it to result[0].
//   y      : read as y[i] for 0 <= i < n
//   A      : read as A[i*m + j], so one thread walks consecutive addresses
//            while neighbouring threads are m elements apart
//   x      : read as x[j] for 0 <= j < m
//   result : accumulator; the caller zeroes result[0] before each launch
__global__
__launch_bounds__(256) void yax(double* y,
                                double* A,
                                double* x,
                                int n, int m,
                                double* result){
  double res = 0.0;

  // Grid-stride loop over rows: each thread owns rows i, i + total threads, ...
  for(int i = blockDim.x * blockIdx.x + threadIdx.x; i < n;
      i += gridDim.x * blockDim.x){
    double temp = 0.0;

    for(int j = 0; j < m; j++){
      temp += A[i*m + j] * x[j];
    }
    res += y[i] * temp;
  }
  unsafeAtomicAdd(&result[0],res);
}

// Fills y, x and A with ones, runs yax once untimed and once timed, checks
// the result against n*m and prints the elapsed time of the timed launch.
int main(int argc, char** argv){
  dim3 grid = dim3(2048,1,1);
  dim3 block = dim3(64,1,1);
  int n = 2<<14;
  int m = 2<<14;
  // Element count of A, kept in size_t so n*m cannot overflow int.
  const size_t nm = static_cast<size_t>(n) * static_cast<size_t>(m);

  double* y;
  double* x;
  double* A;
  double* result;

  hipCheck(hipMalloc(&y, n*sizeof(double)));
  hipCheck(hipMalloc(&x, m*sizeof(double)));
  hipCheck(hipMalloc(&A, nm*sizeof(double)));
  hipCheck(hipMalloc(&result, sizeof(double)));

  // NOTE(review): these hipMalloc buffers are written and read directly from
  // the host below; this presumably relies on a platform where device
  // allocations are host-accessible -- confirm for the target system.
  for(int i = 0; i < n; i++){
    y[i] = 1;
  }
  for(int i = 0; i < m; i++){
    x[i] = 1;
  }
  for(size_t i = 0; i < nm; i++){
    A[i] = 1;
  }
  result[0] = 0.0;


  // First launch is untimed; its result is discarded.
  yax<<<grid,block>>>(y,A,x,n,m,result);
  hipCheck(hipDeviceSynchronize());
  result[0] = 0.0;

  auto start = std::chrono::high_resolution_clock::now();
  yax<<<grid,block>>>(y,A,x,n,m,result);
  hipCheck(hipDeviceSynchronize());
  auto stop = std::chrono::high_resolution_clock::now();

  // With all inputs equal to one, y^T A x is exactly n*m.
  double expected = (double)n * (double)m;
  double error = result[0] - expected;
  // Two-sided check written so that a NaN result is also reported.
  if(!(error > -0.0001 && error < 0.0001)) {
    printf("Answer is incorrect!\n");
    printf("result = %f, expected = %f\n",result[0],expected);
  } else {
    // count() is in nanoseconds; 1e-6 converts to milliseconds.
    auto time = std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count() * 1e-6;
    printf("yAx time: %f milliseconds\n", time);
  }

  hipCheck(hipFree(y));
  hipCheck(hipFree(x));
  hipCheck(hipFree(A));
  hipCheck(hipFree(result));

  return 0;
}

--------------------------------------------------------------------------------
/Lecture5/OmniperfExamples/4-StridedAccess/solution/solution.cpp:
-------------------------------------------------------------------------------- 1 | #include"hip/hip_runtime.h" 2 | #include 3 | #include"hipCheck.h" 4 | 5 | __global__ 6 | __launch_bounds__(256) void yax(double* y, 7 | double* A, 8 | double* x, 9 | int n, int m, 10 | double* result){ 11 | double res = 0.0; 12 | 13 | for(int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; 14 | i += gridDim.x * blockDim.x){ 15 | double temp = 0.0; 16 | 17 | for(int j = 0; j < m; j++){ 18 | temp += A[j*n+i] * x[j]; 19 | } 20 | res += y[i] * temp; 21 | } 22 | unsafeAtomicAdd(&result[0],res); 23 | } 24 | 25 | int main(int argc, char** argv){ 26 | dim3 grid = dim3(2048,1,1); 27 | dim3 block = dim3(64,1,1); 28 | int n = 2<<14; 29 | int m = 2<<14; 30 | 31 | double* y; 32 | double* x; 33 | double* A; 34 | double* result; 35 | 36 | hipCheck(hipMalloc(&y, n*sizeof(double))); 37 | hipCheck(hipMalloc(&x, m*sizeof(double))); 38 | hipCheck(hipMalloc(&A, n*m*sizeof(double))); 39 | hipCheck(hipMalloc(&result, sizeof(double))); 40 | 41 | for(int i = 0; i < n; i++){ 42 | y[i] = 1; 43 | } 44 | for(int i = 0; i < m; i++){ 45 | x[i] = 1; 46 | } 47 | for(int i = 0; i < n*m; i++){ 48 | A[i] = 1; 49 | } 50 | result[0] = 0.0; 51 | 52 | 53 | yax<<>>(y,A,x,n,m,result); 54 | hipDeviceSynchronize(); 55 | result[0] = 0.0; 56 | 57 | auto start = std::chrono::high_resolution_clock::now(); 58 | yax<<>>(y,A,x,n,m,result); 59 | hipDeviceSynchronize(); 60 | auto stop = std::chrono::high_resolution_clock::now(); 61 | 62 | double expected = (double)n * (double)m; 63 | if(result[0] - (double)n*(double)m >= 0.0001) { 64 | printf("Answer is incorrect!\n"); 65 | printf("result = %f, expected = %f\n",result[0],expected); 66 | } else { 67 | auto time = std::chrono::duration_cast(stop - start).count() * 1e-6; 68 | printf("yAx time: %f milliseconds\n", time); 69 | } 70 | 71 | hipFree(y); 72 | hipFree(x); 73 | hipFree(A); 74 | 75 | return 0; 76 | } 77 | 78 | 
-------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/problem.cpp: -------------------------------------------------------------------------------- 1 | #include"hip/hip_runtime.h" 2 | #include 3 | #include"hipCheck.h" 4 | 5 | __global__ 6 | __launch_bounds__(256) void yax(double* y, 7 | double* A, 8 | double* x, 9 | int n, int m, 10 | double* result){ 11 | double res = 0.0; 12 | 13 | for(int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; 14 | i += gridDim.x * blockDim.x){ 15 | double temp = 0.0; 16 | 17 | for(int j = 0; j < m; j++){ 18 | temp += A[j*n + i] * x[j]; 19 | } 20 | res += y[i] * temp; 21 | } 22 | unsafeAtomicAdd(&result[0],res); 23 | } 24 | 25 | int main(int argc, char** argv){ 26 | dim3 grid = dim3(2048,1,1); 27 | dim3 block = dim3(64,1,1); 28 | int n = 2<<14; 29 | int m = 2<<14; 30 | 31 | double* y; 32 | double* x; 33 | double* A; 34 | double* result; 35 | 36 | hipCheck(hipMalloc(&y, n*sizeof(double))); 37 | hipCheck(hipMalloc(&x, m*sizeof(double))); 38 | hipCheck(hipMalloc(&A, n*m*sizeof(double))); 39 | hipCheck(hipMalloc(&result, sizeof(double))); 40 | 41 | for(int i = 0; i < n; i++){ 42 | y[i] = 1; 43 | } 44 | for(int i = 0; i < m; i++){ 45 | x[i] = 1; 46 | } 47 | for(int i = 0; i < n*m; i++){ 48 | A[i] = 1; 49 | } 50 | result[0] = 0.0; 51 | 52 | 53 | yax<<>>(y,A,x,n,m,result); 54 | hipDeviceSynchronize(); 55 | result[0] = 0.0; 56 | 57 | auto start = std::chrono::high_resolution_clock::now(); 58 | yax<<>>(y,A,x,n,m,result); 59 | hipDeviceSynchronize(); 60 | auto stop = std::chrono::high_resolution_clock::now(); 61 | 62 | double expected = (double)n * (double)m; 63 | if(result[0] - (double)n*(double)m >= 0.0001) { 64 | printf("Answer is incorrect!\n"); 65 | printf("result = %f, expected = %f\n",result[0],expected); 66 | } else { 67 | auto time = std::chrono::duration_cast(stop - start).count() * 1e-6; 68 | printf("yAx time: %f milliseconds\n", time); 69 | } 70 | 71 | 
hipFree(y); 72 | hipFree(x); 73 | hipFree(A); 74 | 75 | return 0; 76 | } 77 | 78 | -------------------------------------------------------------------------------- /Lecture1/HIP/dgemm/src/serialize.h: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved. 3 | ***************************************************************************/ 4 | 5 | #ifndef SERIALIZE_H_ 6 | #define SERIALIZE_H_ 7 | 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "utils.h" 15 | #include "args.h" 16 | #include "dgemm.h" 17 | 18 | 19 | 20 | template < 21 | typename D, 22 | typename = std::enable_if_t::value, D> 23 | > 24 | std::ostream& 25 | elem_serialize(D const& d, std::ostream &stream){ 26 | stream << std::to_string(d); 27 | return stream; 28 | } 29 | 30 | 31 | std::ostream& 32 | elem_serialize(std::string const& d, std::ostream &stream); 33 | 34 | 35 | template 36 | std::ostream& 37 | jserialize(V const& data, std::ostream &stream){ 38 | if (data.size() == 0){ 39 | stream << "[]"; 40 | return stream; 41 | } 42 | 43 | stream << "["; 44 | for (size_t i=0; i const& data, std::ostream &stream); 58 | 59 | 60 | template < 61 | typename Vec, 62 | typename = std::enable_if_t::value, Vec> 63 | > 64 | std::string 65 | jserialize(Vec const& data){ 66 | std::ostringstream stream; 67 | jserialize(data, stream); 68 | return stream.str(); 69 | } 70 | 71 | 72 | std::string 73 | jserialize(vstring const& data); 74 | 75 | 76 | std::string 77 | jserialize(args const& inp); 78 | 79 | 80 | void 81 | serialize( 82 | std::unordered_map const& map, 83 | args const& inp, 84 | std::ostream &stream); 85 | 86 | 87 | void 88 | serialize_csv( 89 | std::unordered_map const& map, 90 | args const& inp, 91 | std::ostream &stream); 92 | 93 | 94 | 95 | #endif /* SERIALIZE_H_ */ 96 | 
-------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/5-AlgorithmicOptimizations/solution/solution.cpp: -------------------------------------------------------------------------------- 1 | #include"hip/hip_runtime.h" 2 | #include 3 | #include"hipCheck.h" 4 | 5 | __global__ 6 | __launch_bounds__(256) void yax(double* y, 7 | double* A, 8 | double* x, 9 | int n, int m, 10 | double* result){ 11 | __shared__ double res; 12 | 13 | for(int i = blockDim.x * blockIdx.x; i < n; i += gridDim.x * blockDim.x){ 14 | double temp = 0.0; 15 | 16 | for(int j = threadIdx.x; j < m; j+=blockDim.x){ 17 | temp += A[i*m + j] * x[j]; 18 | } 19 | unsafeAtomicAdd(&res,temp); 20 | if(threadIdx.x == 0) res *= y[i]; 21 | } 22 | if(threadIdx.x==0) unsafeAtomicAdd(&result[0],res); 23 | } 24 | 25 | int main(int argc, char** argv){ 26 | dim3 grid = dim3(2048,1,1); 27 | dim3 block = dim3(64,1,1); 28 | int n = 2<<14; 29 | int m = 2<<14; 30 | 31 | double* y; 32 | double* x; 33 | double* A; 34 | double* result; 35 | 36 | hipCheck(hipMalloc(&y, n*sizeof(double))); 37 | hipCheck(hipMalloc(&x, m*sizeof(double))); 38 | hipCheck(hipMalloc(&A, n*m*sizeof(double))); 39 | hipCheck(hipMalloc(&result, sizeof(double))); 40 | 41 | for(int i = 0; i < n; i++){ 42 | y[i] = 1; 43 | } 44 | for(int i = 0; i < m; i++){ 45 | x[i] = 1; 46 | } 47 | for(int i = 0; i < n*m; i++){ 48 | A[i] = 1; 49 | } 50 | result[0] = 0.0; 51 | 52 | 53 | yax<<>>(y,A,x,n,m,result); 54 | hipDeviceSynchronize(); 55 | result[0] = 0.0; 56 | 57 | auto start = std::chrono::high_resolution_clock::now(); 58 | yax<<>>(y,A,x,n,m,result); 59 | hipDeviceSynchronize(); 60 | auto stop = std::chrono::high_resolution_clock::now(); 61 | 62 | double expected = (double)n * (double)m; 63 | if(result[0] - (double)n*(double)m >= 0.0001) { 64 | printf("Answer is incorrect!\n"); 65 | printf("result = %f, expected = %f\n",result[0],expected); 66 | } else { 67 | auto time = std::chrono::duration_cast(stop - 
start).count() * 1e-6; 68 | printf("yAx time: %f milliseconds\n", time); 69 | } 70 | 71 | hipFree(y); 72 | hipFree(x); 73 | hipFree(A); 74 | 75 | return 0; 76 | } 77 | 78 | -------------------------------------------------------------------------------- /Lecture4/jacobi/JacobiMain.hip: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 
21 | ***************************************************************************/ 22 | 23 | #include "Jacobi.hpp" 24 | 25 | /** 26 | * @file JacobiMain.cpp 27 | * @brief This contains the application entry point 28 | */ 29 | 30 | /** 31 | * @brief The application entry point 32 | * 33 | * @param[in] argc The number of command-line arguments 34 | * @param[in] argv The list of command-line arguments 35 | */ 36 | int main(int argc, char ** argv) 37 | { 38 | MPI_Init(&argc, &argv); 39 | 40 | MPI_Comm comm = MPI_COMM_WORLD; 41 | 42 | grid_t grid; 43 | mesh_t mesh; 44 | 45 | // Extract topology and domain dimensions from the command-line arguments 46 | ParseCommandLineArguments(argc, argv, 47 | comm, 48 | grid, 49 | mesh); 50 | 51 | Jacobi_t Jacobi(grid, mesh); 52 | 53 | Jacobi.Run(); 54 | 55 | // Finalize the MPI process 56 | MPI_Finalize(); 57 | return STATUS_OK; 58 | } 59 | -------------------------------------------------------------------------------- /Lecture1/HIP/jacobi/JacobiMain.hip: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | ***************************************************************************/ 22 | 23 | #include "Jacobi.hpp" 24 | 25 | /** 26 | * @file JacobiMain.cpp 27 | * @brief This contains the application entry point 28 | */ 29 | 30 | /** 31 | * @brief The application entry point 32 | * 33 | * @param[in] argc The number of command-line arguments 34 | * @param[in] argv The list of command-line arguments 35 | */ 36 | int main(int argc, char ** argv) 37 | { 38 | MPI_Init(&argc, &argv); 39 | 40 | MPI_Comm comm = MPI_COMM_WORLD; 41 | 42 | grid_t grid; 43 | mesh_t mesh; 44 | 45 | // Extract topology and domain dimensions from the command-line arguments 46 | ParseCommandLineArguments(argc, argv, 47 | comm, 48 | grid, 49 | mesh); 50 | 51 | Jacobi_t Jacobi(grid, mesh); 52 | 53 | Jacobi.Run(); 54 | 55 | // Finalize the MPI process 56 | MPI_Finalize(); 57 | return STATUS_OK; 58 | } 59 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/2-LDSOccupancyLimit/problem.cpp: -------------------------------------------------------------------------------- 1 | #include"hip/hip_runtime.h" 2 | #include 3 | #include"hipCheck.h" 4 | 5 | constexpr size_t fully_allocate_lds = 64ul * 1024ul / sizeof(double); 6 | 7 | __global__ void yax(double* y, 8 | double* A, 9 | double* x, 10 | int n, int m, 11 | double* result){ 12 | __shared__ double tmp[fully_allocate_lds]; 13 | for(int i = threadIdx.x; i < fully_allocate_lds; i+= blockDim.x) tmp[i] = x[i]; 
14 | __syncthreads(); 15 | double res = 0.0; 16 | for(int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; 17 | i += gridDim.x * blockDim.x){ 18 | double temp = 0.0; 19 | for(int j = 0; j < m; j++){ 20 | temp += A[i*n+j] * (j < fully_allocate_lds ? tmp[j] : x[j]); 21 | } 22 | res += y[i] * temp; 23 | } 24 | unsafeAtomicAdd(&result[0],res); 25 | } 26 | 27 | int main(int argc, char** argv){ 28 | dim3 grid = dim3(2048,1,1); 29 | dim3 block = dim3(64,1,1); 30 | int n = 2<<14; 31 | int m = 2<<14; 32 | 33 | double* y; 34 | double* x; 35 | double* A; 36 | double* result; 37 | 38 | hipCheck(hipMalloc(&y, n*sizeof(double))); 39 | hipCheck(hipMalloc(&x, m*sizeof(double))); 40 | hipCheck(hipMalloc(&A, n*m*sizeof(double))); 41 | hipCheck(hipMalloc(&result, sizeof(double))); 42 | 43 | for(int i = 0; i < n; i++){ 44 | y[i] = 1; 45 | } 46 | for(int i = 0; i < m; i++){ 47 | x[i] = 1; 48 | } 49 | for(int i = 0; i < n*m; i++){ 50 | A[i] = 1; 51 | } 52 | result[0] = 0; 53 | 54 | yax<<>>(y,A,x,n,m,result); 55 | hipDeviceSynchronize(); 56 | result[0] = 0; 57 | 58 | auto start = std::chrono::high_resolution_clock::now(); 59 | yax<<>>(y,A,x,n,m,result); 60 | hipDeviceSynchronize(); 61 | auto stop = std::chrono::high_resolution_clock::now(); 62 | 63 | double expected = (double)n * (double)m; 64 | if(result[0] - (double)n*(double)m >= 0.0001) { 65 | printf("Answer is incorrect!\n"); 66 | printf("result = %f, expected = %f\n",result[0],expected); 67 | } else { 68 | auto time = std::chrono::duration_cast(stop - start).count() * 1e-6; 69 | printf("yAx time: %f milliseconds\n", time); 70 | } 71 | 72 | hipFree(y); 73 | hipFree(x); 74 | hipFree(A); 75 | return 0; 76 | } 77 | 78 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/2-LDSOccupancyLimit/solution/solution.cpp: -------------------------------------------------------------------------------- 1 | #include"hip/hip_runtime.h" 2 | #include 3 | #include"hipCheck.h" 4 | 5 | constexpr 
size_t partially_allocate_lds = 64ul * 1024ul / sizeof(double)/32; 6 | 7 | __global__ void yax(double* y, 8 | double* A, 9 | double* x, 10 | int n, int m, 11 | double* result){ 12 | __shared__ double tmp[partially_allocate_lds]; 13 | for(int i = threadIdx.x; i < partially_allocate_lds; i+= blockDim.x) tmp[i] = x[i]; 14 | __syncthreads(); 15 | double res = 0.0; 16 | for(int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; 17 | i += gridDim.x * blockDim.x){ 18 | double temp = 0.0; 19 | for(int j = 0; j < m; j++){ 20 | temp += A[i*n+j] * (j < partially_allocate_lds ? tmp[j] : x[j]); 21 | } 22 | res += y[i] * temp; 23 | } 24 | unsafeAtomicAdd(&result[0],res); 25 | } 26 | 27 | int main(int argc, char** argv){ 28 | dim3 grid = dim3(2048,1,1); 29 | dim3 block = dim3(64,1,1); 30 | int n = 2<<14; 31 | int m = 2<<14; 32 | 33 | double* y; 34 | double* x; 35 | double* A; 36 | double* result; 37 | 38 | hipCheck(hipMalloc(&y, n*sizeof(double))); 39 | hipCheck(hipMalloc(&x, m*sizeof(double))); 40 | hipCheck(hipMalloc(&A, n*m*sizeof(double))); 41 | hipCheck(hipMalloc(&result, sizeof(double))); 42 | 43 | for(int i = 0; i < n; i++){ 44 | y[i] = 1; 45 | } 46 | for(int i = 0; i < m; i++){ 47 | x[i] = 1; 48 | } 49 | for(int i = 0; i < n*m; i++){ 50 | A[i] = 1; 51 | } 52 | result[0] = 0; 53 | 54 | yax<<>>(y,A,x,n,m,result); 55 | hipDeviceSynchronize(); 56 | result[0] = 0; 57 | 58 | auto start = std::chrono::high_resolution_clock::now(); 59 | yax<<>>(y,A,x,n,m,result); 60 | hipDeviceSynchronize(); 61 | auto stop = std::chrono::high_resolution_clock::now(); 62 | 63 | double expected = (double)n * (double)m; 64 | if(result[0] - (double)n*(double)m >= 0.0001) { 65 | printf("Answer is incorrect!\n"); 66 | printf("result = %f, expected = %f\n",result[0],expected); 67 | } else { 68 | auto time = std::chrono::duration_cast(stop - start).count() * 1e-6; 69 | printf("yAx time: %f milliseconds\n", time); 70 | } 71 | 72 | hipFree(y); 73 | hipFree(x); 74 | hipFree(A); 75 | 76 | return 0; 77 | } 78 
| 79 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/frontier_pennant_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Run this script from /Lecture2/HIPIFY directory 4 | 5 | module load PrgEnv-amd 6 | module load amd/5.5.1 7 | module load cmake 8 | export CXX=${ROCM_PATH}/llvm/bin/clang++ 9 | 10 | rm -rf ./Pennant-new 11 | mkdir ./Pennant-new 12 | cp -r ./Pennant-orig/* ./Pennant-new 13 | cd ./Pennant-new 14 | ../hipconvertinplace-perl.sh . 15 | rm -f src/*.prehip 16 | mv src/HydroGPU.cu src/HydroGPU.hip 17 | 18 | sed -i -e 's/CUDACFLAGS/HIPCFLAGS/g' Makefile 19 | sed -i -e 's/CUDA/HIP/g' Makefile 20 | sed -i -e '/CXX/s/icpc/amdclang++/' Makefile 21 | sed -i -e 's/nvcc/hipcc/' Makefile 22 | sed -i -e 's/-fast -fno-alias//' Makefile 23 | sed -i -e 's/%.cu/%.hip/g' Makefile 24 | sed -i -e 's/-arch=sm_21 --ptxas-options=-v//' Makefile 25 | sed -i -e 's/^LDFLAGS/LDFLAGS_CUDA/' Makefile 26 | sed -i -e '/^LDFLAGS_CUDA/aLDFLAGS := -L${ROCM_PATH}/hip/lib -lamdhip64' Makefile 27 | 28 | sed -i -e 's/#ifndef __CUDACC__/#if !defined(__HIPCC__) \&\& !defined(__CUDACC__)/' src/Vec2.hh 29 | sed -i -e 's/#ifdef __CUDACC__/#if defined(__HIPCC__) \|\| defined(__CUDACC__)/' src/Vec2.hh 30 | sed -i -e '85,85s/#else/#elif defined\(__CUDACC__\)/' src/Vec2.hh 31 | 32 | sed -i -e '724,724a#ifdef __CUDACC__' -e '738,738a#endif' src/HydroGPU.hip 33 | 34 | sed -i -e '38,39s/const double2/const void/' src/HydroGPU.hh 35 | sed -i -e '62,62s/double2/void/' src/HydroGPU.hh 36 | 37 | sed -i -e '1031,1032s/const double2/const void/' src/HydroGPU.hip 38 | sed -i -e '1284,1284s/double2/void/' src/HydroGPU.hip 39 | 40 | sed -i -e '59,59s/mesh/(void *)mesh/' src/Hydro.cc 41 | sed -i -e '60,60s/pu/(void *)pu/' src/Hydro.cc 42 | sed -i -e '145,145s/mesh/(void *)mesh/' src/Hydro.cc 43 | 44 | make 45 | 46 | build/pennant test/sedovbig/sedovbig.pnt 47 | rm -rf build 48 | make clean 49 | 50 
| echo "" 51 | echo "" 52 | echo "" 53 | echo "" 54 | echo "" 55 | echo " Portable Makefile build system" 56 | echo "" 57 | 58 | cp ../Makefile.new . 59 | 60 | make 61 | build/pennant test/sedovbig/sedovbig.pnt 62 | rm -rf build 63 | make clean 64 | 65 | echo "" 66 | echo "" 67 | echo "" 68 | echo "" 69 | echo "" 70 | echo " CMake portable build system" 71 | echo "" 72 | 73 | cp ../CMakeLists.txt . 74 | rm -rf build 75 | mkdir build && cd build 76 | cmake .. 77 | make 78 | ./pennant ../test/sedovbig/sedovbig.pnt 79 | cd .. 80 | rm -rf build 81 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/solution/solution.cpp: -------------------------------------------------------------------------------- 1 | #include"hip/hip_runtime.h" 2 | #include 3 | #include"hipCheck.h" 4 | 5 | constexpr size_t register_array_size = 96; 6 | 7 | __global__ 8 | __launch_bounds__(256) void yax(double* y, 9 | double* A, 10 | double* x, 11 | int n, int m, 12 | double* result){ 13 | double tmp[register_array_size]; 14 | for(int i = 0; i < register_array_size; i+= 1) tmp[i] = x[i]; 15 | double res = 0.0; 16 | 17 | for(int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; 18 | i += gridDim.x * blockDim.x){ 19 | double temp = 0.0; 20 | 21 | for(int j = 0; j < register_array_size; j++){ 22 | temp += A[i*n+j] * tmp[j]; 23 | } 24 | for(int j = register_array_size; j < m; j++){ 25 | temp += A[i*n+j] * x[j]; 26 | } 27 | res += y[i] * temp; 28 | } 29 | unsafeAtomicAdd(&result[0],res); 30 | } 31 | 32 | int main(int argc, char** argv){ 33 | dim3 grid = dim3(2048,1,1); 34 | dim3 block = dim3(64,1,1); 35 | int n = 2<<14; 36 | int m = 2<<14; 37 | 38 | double* y; 39 | double* x; 40 | double* A; 41 | double* result; 42 | 43 | hipCheck(hipMalloc(&y, n*sizeof(double))); 44 | hipCheck(hipMalloc(&x, m*sizeof(double))); 45 | hipCheck(hipMalloc(&A, n*m*sizeof(double))); 46 | hipCheck(hipMalloc(&result, sizeof(double))); 47 | 48 | 
for(int i = 0; i < n; i++){ 49 | y[i] = 1; 50 | } 51 | for(int i = 0; i < m; i++){ 52 | x[i] = 1; 53 | } 54 | for(int i = 0; i < n*m; i++){ 55 | A[i] = 1; 56 | } 57 | result[0] = 0.0; 58 | 59 | 60 | yax<<>>(y,A,x,n,m,result); 61 | hipDeviceSynchronize(); 62 | result[0] = 0.0; 63 | 64 | auto start = std::chrono::high_resolution_clock::now(); 65 | yax<<>>(y,A,x,n,m,result); 66 | hipDeviceSynchronize(); 67 | auto stop = std::chrono::high_resolution_clock::now(); 68 | 69 | double expected = (double)n * (double)m; 70 | if(result[0] - (double)n*(double)m >= 0.0001) { 71 | printf("Answer is incorrect!\n"); 72 | printf("result = %f, expected = %f\n",result[0],expected); 73 | } else { 74 | auto time = std::chrono::duration_cast(stop - start).count() * 1e-6; 75 | printf("yAx time: %f milliseconds\n", time); 76 | } 77 | 78 | hipFree(y); 79 | hipFree(x); 80 | hipFree(A); 81 | 82 | return 0; 83 | } 84 | 85 | -------------------------------------------------------------------------------- /Lecture5/OmniperfExamples/3-RegisterOccupancyLimit/problem.cpp: -------------------------------------------------------------------------------- 1 | #include"hip/hip_runtime.h" 2 | #include 3 | #include"hipCheck.h" 4 | 5 | constexpr size_t register_array_size = 96; 6 | 7 | __global__ 8 | __launch_bounds__(256) void yax(double* y, 9 | double* A, 10 | double* x, 11 | int n, int m, 12 | double* result){ 13 | double tmp[register_array_size]; 14 | for(int i = 0; i < register_array_size; i+= 1) tmp[i] = x[i]; 15 | assert(result[0] == 0.0); 16 | double res = 0.0; 17 | 18 | for(int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; 19 | i += gridDim.x * blockDim.x){ 20 | double temp = 0.0; 21 | 22 | for(int j = 0; j < register_array_size; j++){ 23 | temp += A[i*n+j] * tmp[j]; 24 | } 25 | for(int j = register_array_size; j < m; j++){ 26 | temp += A[i*n+j] * x[j]; 27 | } 28 | res += y[i] * temp; 29 | } 30 | unsafeAtomicAdd(&result[0],res); 31 | } 32 | 33 | int main(int argc, char** argv){ 34 | dim3 grid 
= dim3(2048,1,1); 35 | dim3 block = dim3(64,1,1); 36 | int n = 2<<14; 37 | int m = 2<<14; 38 | 39 | double* y; 40 | double* x; 41 | double* A; 42 | double* result; 43 | 44 | hipCheck(hipMalloc(&y, n*sizeof(double))); 45 | hipCheck(hipMalloc(&x, m*sizeof(double))); 46 | hipCheck(hipMalloc(&A, n*m*sizeof(double))); 47 | hipCheck(hipMalloc(&result, sizeof(double))); 48 | 49 | for(int i = 0; i < n; i++){ 50 | y[i] = 1; 51 | } 52 | for(int i = 0; i < m; i++){ 53 | x[i] = 1; 54 | } 55 | for(int i = 0; i < n*m; i++){ 56 | A[i] = 1; 57 | } 58 | result[0] = 0.0; 59 | 60 | 61 | yax<<>>(y,A,x,n,m,result); 62 | hipDeviceSynchronize(); 63 | result[0] = 0.0; 64 | 65 | auto start = std::chrono::high_resolution_clock::now(); 66 | yax<<>>(y,A,x,n,m,result); 67 | hipDeviceSynchronize(); 68 | auto stop = std::chrono::high_resolution_clock::now(); 69 | 70 | double expected = (double)n * (double)m; 71 | if(result[0] - (double)n*(double)m >= 0.0001) { 72 | printf("Answer is incorrect!\n"); 73 | printf("result = %f, expected = %f\n",result[0],expected); 74 | } else { 75 | auto time = std::chrono::duration_cast(stop - start).count() * 1e-6; 76 | printf("yAx time: %f milliseconds\n", time); 77 | } 78 | 79 | hipFree(y); 80 | hipFree(x); 81 | hipFree(A); 82 | 83 | return 0; 84 | } 85 | 86 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, Los Alamos National Security, LLC. 2 | All rights reserved. 3 | 4 | Copyright 2012. Los Alamos National Security, LLC. 5 | This software was produced under U.S. Government contract 6 | DE-AC52-06NA25396 for Los Alamos National Laboratory (LANL), which is 7 | operated by Los Alamos National Security, LLC for the U.S. Department 8 | of Energy. The U.S. Government has rights to use, reproduce, and 9 | distribute this software. 
NEITHER THE GOVERNMENT NOR LOS ALAMOS 10 | NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is 12 | modified to produce derivative works, such modified software should be 13 | clearly marked, so as not to confuse it with the version available from 14 | LANL. 15 | 16 | Additionally, redistribution and use in source and binary forms, with 17 | or without modification, are permitted provided that the following 18 | conditions are met: 19 | 20 | 1. Redistributions of source code must retain the above copyright 21 | notice, this list of conditions and the following disclaimer. 22 | 23 | 2. Redistributions in binary form must reproduce the above 24 | copyright notice, this list of conditions and the following 25 | disclaimer in the documentation and/or other materials provided 26 | with the distribution. 27 | 28 | 3. Neither the name of Los Alamos National Security, LLC, Los Alamos 29 | National Laboratory, LANL, the U.S. Government, nor the names of its 30 | contributors may be used to endorse or promote products derived from 31 | this software without specific prior written permission. 32 | 33 | THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND 34 | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 35 | BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 36 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL LOS ALAMOS 37 | NATIONAL SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 38 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 39 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 40 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 41 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 42 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 43 | IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 44 | POSSIBILITY OF SUCH DAMAGE. 45 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, Los Alamos National Security, LLC. 2 | All rights reserved. 3 | 4 | Copyright 2012. Los Alamos National Security, LLC. 5 | This software was produced under U.S. Government contract 6 | DE-AC52-06NA25396 for Los Alamos National Laboratory (LANL), which is 7 | operated by Los Alamos National Security, LLC for the U.S. Department 8 | of Energy. The U.S. Government has rights to use, reproduce, and 9 | distribute this software. NEITHER THE GOVERNMENT NOR LOS ALAMOS 10 | NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is 12 | modified to produce derivative works, such modified software should be 13 | clearly marked, so as not to confuse it with the version available from 14 | LANL. 15 | 16 | Additionally, redistribution and use in source and binary forms, with 17 | or without modification, are permitted provided that the following 18 | conditions are met: 19 | 20 | 1. Redistributions of source code must retain the above copyright 21 | notice, this list of conditions and the following disclaimer. 22 | 23 | 2. 
Redistributions in binary form must reproduce the above 24 | copyright notice, this list of conditions and the following 25 | disclaimer in the documentation and/or other materials provided 26 | with the distribution. 27 | 28 | 3. Neither the name of Los Alamos National Security, LLC, Los Alamos 29 | National Laboratory, LANL, the U.S. Government, nor the names of its 30 | contributors may be used to endorse or promote products derived from 31 | this software without specific prior written permission. 32 | 33 | THIS SOFTWARE IS PROVIDED BY LOS ALAMOS NATIONAL SECURITY, LLC AND 34 | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 35 | BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 36 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS 37 | NATIONAL SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 38 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 39 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 40 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 41 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 42 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 43 | IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 44 | POSSIBILITY OF SUCH DAMAGE. 
45 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/mini-nbody/nbody.c: -------------------------------------------------------------------------------- 1 | #include <math.h> 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | #include "timer.h" 5 | 6 | #define SOFTENING 1e-9f 7 | 8 | typedef struct { float x, y, z, vx, vy, vz; } Body; 9 | 10 | void randomizeBodies(float *data, int n) { 11 | for (int i = 0; i < n; i++) { 12 | data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; 13 | } 14 | } 15 | 16 | void bodyForce(Body *p, float dt, int n) { 17 | #pragma omp parallel for schedule(dynamic) 18 | for (int i = 0; i < n; i++) { 19 | float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; 20 | 21 | for (int j = 0; j < n; j++) { 22 | float dx = p[j].x - p[i].x; 23 | float dy = p[j].y - p[i].y; 24 | float dz = p[j].z - p[i].z; 25 | float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; 26 | float invDist = 1.0f / sqrtf(distSqr); 27 | float invDist3 = invDist * invDist * invDist; 28 | 29 | Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; 30 | } 31 | 32 | p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz; 33 | } 34 | } 35 | 36 | int main(const int argc, const char** argv) { 37 | 38 | int nBodies = 30000; 39 | if (argc > 1) nBodies = atoi(argv[1]); 40 | 41 | const float dt = 0.01f; // time step 42 | const int nIters = 10; // simulation iterations 43 | 44 | int bytes = nBodies*sizeof(Body); 45 | float *buf = (float*)malloc(bytes); 46 | Body *p = (Body*)buf; 47 | 48 | randomizeBodies(buf, 6*nBodies); // Init pos / vel data 49 | 50 | double totalTime = 0.0; 51 | 52 | for (int iter = 1; iter <= nIters; iter++) { 53 | StartTimer(); 54 | 55 | bodyForce(p, dt, nBodies); // compute interbody forces 56 | 57 | for (int i = 0 ; i < nBodies; i++) { // integrate position 58 | p[i].x += p[i].vx*dt; 59 | p[i].y += p[i].vy*dt; 60 | p[i].z += p[i].vz*dt; 61 | } 62 | 63 | const double tElapsed = GetTimer() / 1000.0; 64 | if (iter > 1) { // First iter is warm up 65
| totalTime += tElapsed; 66 | } 67 | #ifndef SHMOO 68 | printf("Iteration %d: %.3f seconds\n", iter, tElapsed); 69 | #endif 70 | } 71 | double avgTime = totalTime / (double)(nIters-1); 72 | 73 | #ifdef SHMOO 74 | printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 75 | #else 76 | printf("Average rate for iterations 2 through %d: %.3f steps per second.\n", 77 | nIters, 1.0 / avgTime); // steps per second = reciprocal of the mean step time 78 | printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); 79 | #endif 80 | free(buf); 81 | } 82 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Makefile.allhipcc: -------------------------------------------------------------------------------- 1 | BUILDDIR := build 2 | PRODUCT := pennant 3 | 4 | SRCDIR := src 5 | 6 | HDRS := $(wildcard $(SRCDIR)/*.hh) 7 | SRCS := $(wildcard $(SRCDIR)/*.cc) 8 | OBJS := $(SRCS:$(SRCDIR)/%.cc=$(BUILDDIR)/%.o) 9 | DEPS := $(SRCS:$(SRCDIR)/%.cc=$(BUILDDIR)/%.d) 10 | 11 | HDRS += $(SRCDIR)/HydroGPU.hh 12 | SRCS += $(SRCDIR)/HydroGPU.hip 13 | OBJS += $(BUILDDIR)/HydroGPU.o 14 | DEPS += $(BUILDDIR)/HydroGPU.d 15 | 16 | BINARY := $(BUILDDIR)/$(PRODUCT) 17 | 18 | CPPFLAGS := -I.
19 | 20 | # begin compiler-dependent flags 21 | # 22 | # gcc flags: 23 | #CXX := g++ 24 | #CXXFLAGS_DEBUG := -g 25 | #CXXFLAGS_OPT := -O3 26 | #CXXFLAGS_OPENMP := -fopenmp 27 | 28 | # hipcc (clang-based) flags: 29 | CXX := hipcc 30 | CXXFLAGS_DEBUG := -g 31 | CXXFLAGS_OPT := -O3 32 | CXXFLAGS_OPENMP := -fopenmp 33 | 34 | # pgi flags: 35 | #CXX := pgCC 36 | #CXXFLAGS_DEBUG := -g 37 | #CXXFLAGS_OPT := -O3 -fastsse 38 | #CXXFLAGS_OPENMP := -mp 39 | 40 | # end compiler-dependent flags 41 | 42 | HIPC := hipcc 43 | HIPCFLAGS := 44 | HIPCFLAGS_DEBUG := -g 45 | HIPCFLAGS_OPT := -O3 46 | 47 | LD := $(CXX) 48 | LDFLAGS := -L$(HIP_INSTALL_PATH)/lib64 -L$(ROCM_PATH)/lib -lamdhip64 49 | 50 | # select optimized or debug 51 | CXXFLAGS := $(CXXFLAGS_OPT) $(CPPFLAGS) 52 | HIPCFLAGS += $(HIPCFLAGS_OPT) $(CPPFLAGS) 53 | #CXXFLAGS := $(CXXFLAGS_DEBUG) $(CPPFLAGS) 54 | #HIPCFLAGS += $(HIPCFLAGS_DEBUG) $(CPPFLAGS) 55 | 56 | # add openmp flags (comment out for serial build) 57 | #CXXFLAGS += $(CXXFLAGS_OPENMP) 58 | #LDFLAGS += $(CXXFLAGS_OPENMP) 59 | 60 | all : $(BINARY) 61 | 62 | -include $(DEPS) 63 | 64 | $(BINARY) : $(OBJS) 65 | @echo linking $@ 66 | $(maketargetdir) 67 | $(LD) -o $@ $^ $(LDFLAGS) 68 | 69 | $(BUILDDIR)/%.o : $(SRCDIR)/%.cc 70 | @echo compiling $< 71 | $(maketargetdir) 72 | $(CXX) $(CXXFLAGS) $(CXXINCLUDES) -c -o $@ $< 73 | 74 | $(BUILDDIR)/%.o : $(SRCDIR)/%.hip 75 | @echo compiling $< 76 | $(maketargetdir) 77 | @# unsetting of CPATH is needed to make hipcc and icpc 78 | @# play nicely together 79 | (CPATH=;$(HIPC) $(HIPCFLAGS) $(HIPCINCLUDES) -c -o $@ $<) 80 | 81 | $(BUILDDIR)/%.d : $(SRCDIR)/%.cc 82 | @echo making depends for $< 83 | $(maketargetdir) 84 | @$(CXX) $(CXXFLAGS) $(CXXINCLUDES) -M $< | sed "1s![^ \t]\+\.o!$(@:.d=.o) $@!" >$@ 85 | 86 | $(BUILDDIR)/%.d : $(SRCDIR)/%.hip 87 | @echo making depends for $< 88 | $(maketargetdir) 89 | @$(HIPC) $(HIPCFLAGS) $(HIPCINCLUDES) -M $< | sed "1s![^ \t]\+\.o!$(@:.d=.o) $@!"
>$@ 90 | 91 | define maketargetdir 92 | -@mkdir -p $(dir $@) > /dev/null 2>&1 93 | endef 94 | 95 | clean : 96 | rm -f $(BINARY) $(OBJS) $(DEPS) 97 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Makefile.twopath: -------------------------------------------------------------------------------- 1 | BUILDDIR := build 2 | PRODUCT := pennant 3 | 4 | SRCDIR := src 5 | 6 | HDRS := $(wildcard $(SRCDIR)/*.hh) 7 | SRCS := $(wildcard $(SRCDIR)/*.cc) 8 | OBJS := $(SRCS:$(SRCDIR)/%.cc=$(BUILDDIR)/%.o) 9 | DEPS := $(SRCS:$(SRCDIR)/%.cc=$(BUILDDIR)/%.d) 10 | 11 | HDRS += $(SRCDIR)/HydroGPU.hh 12 | SRCS += $(SRCDIR)/HydroGPU.hip 13 | OBJS += $(BUILDDIR)/HydroGPU.o 14 | DEPS += $(BUILDDIR)/HydroGPU.d 15 | 16 | BINARY := $(BUILDDIR)/$(PRODUCT) 17 | 18 | CPPFLAGS := -I. 19 | 20 | # begin compiler-dependent flags 21 | # 22 | # gcc flags: 23 | #CXX := g++ 24 | #CXXFLAGS_DEBUG := -g 25 | #CXXFLAGS_OPT := -O3 26 | #CXXFLAGS_OPENMP := -fopenmp 27 | 28 | # amdclang++ flags: 29 | CXX := amdclang++ 30 | CXXFLAGS_DEBUG := -g 31 | CXXFLAGS_OPT := -O3 32 | CXXFLAGS_OPENMP := -fopenmp 33 | 34 | # pgi flags: 35 | #CXX := pgCC 36 | #CXXFLAGS_DEBUG := -g 37 | #CXXFLAGS_OPT := -O3 -fastsse 38 | #CXXFLAGS_OPENMP := -mp 39 | 40 | # end compiler-dependent flags 41 | 42 | HIPC := hipcc 43 | HIPCFLAGS := 44 | HIPCFLAGS_DEBUG := -g 45 | HIPCFLAGS_OPT := -O3 46 | 47 | LD := $(CXX) 48 | LDFLAGS := -L$(HIP_INSTALL_PATH)/lib64 -L$(ROCM_PATH)/lib -lamdhip64 49 | 50 | # select optimized or debug 51 | CXXFLAGS := $(CXXFLAGS_OPT) $(CPPFLAGS) 52 | HIPCFLAGS += $(HIPCFLAGS_OPT) $(CPPFLAGS) 53 | #CXXFLAGS := $(CXXFLAGS_DEBUG) $(CPPFLAGS) 54 | #HIPCFLAGS += $(HIPCFLAGS_DEBUG) $(CPPFLAGS) 55 | 56 | # add openmp flags (comment out for serial build) 57 | #CXXFLAGS += $(CXXFLAGS_OPENMP) 58 | #LDFLAGS += $(CXXFLAGS_OPENMP) 59 | 60 | all : $(BINARY) 61 | 62 | -include $(DEPS) 63 | 64 | $(BINARY) : $(OBJS) 65 | @echo linking $@ 66 | $(maketargetdir) 67 | $(LD) -o $@ $^ $(LDFLAGS) 68 | 69
| $(BUILDDIR)/%.o : $(SRCDIR)/%.cc 70 | @echo compiling $< 71 | $(maketargetdir) 72 | $(CXX) $(CXXFLAGS) $(CXXINCLUDES) -c -o $@ $< 73 | 74 | $(BUILDDIR)/%.o : $(SRCDIR)/%.hip 75 | @echo compiling $< 76 | $(maketargetdir) 77 | @# unsetting of CPATH is needed to make hipcc and icpc 78 | @# play nicely together 79 | (CPATH=;$(HIPC) $(HIPCFLAGS) $(HIPCINCLUDES) -c -o $@ $<) 80 | 81 | $(BUILDDIR)/%.d : $(SRCDIR)/%.cc 82 | @echo making depends for $< 83 | $(maketargetdir) 84 | @$(CXX) $(CXXFLAGS) $(CXXINCLUDES) -M $< | sed "1s![^ \t]\+\.o!$(@:.d=.o) $@!" >$@ 85 | 86 | $(BUILDDIR)/%.d : $(SRCDIR)/%.hip 87 | @echo making depends for $< 88 | $(maketargetdir) 89 | @$(HIPC) $(HIPCFLAGS) $(HIPCINCLUDES) -M $< | sed "1s![^ \t]\+\.o!$(@:.d=.o) $@!" >$@ 90 | 91 | define maketargetdir 92 | -@mkdir -p $(dir $@) > /dev/null 2>&1 93 | endef 94 | 95 | clean : 96 | rm -f $(BINARY) $(OBJS) $(DEPS) 97 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-hip/tools/gmvrect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # gmvrect.py 5 | # Writes a rectangular gmv mesh with user-specified dimensions 6 | # 7 | 8 | import sys 9 | 10 | def usage(): 11 | print "Usage: gmvrect.py NZX [NZY [LENX [LENY]]]" 12 | print "where nzx, nzy = number of zones in x, y directions" 13 | print " (no default for nzx; default nzy = nzx)" 14 | print " lenx, leny = total length in x, y directions" 15 | print " (default for both = 1.0)" 16 | sys.exit(0) 17 | 18 | def writecoords(file, np, x): 19 | for i in range(np): 20 | if i % 10 == 0: s = " " 21 | s += "%16.8E" % x[i] 22 | if (i % 10 == 9) or (i == np - 1): file.write("%s\n" % s) 23 | 24 | nargs = len(sys.argv) 25 | nzx = 0 26 | nzy = 0 27 | lenx = 1.0 28 | leny = 1.0 29 | if nargs < 2 or nargs > 5: usage() 30 | try: 31 | nzx = int(sys.argv[1]) 32 | nzy = nzx 33 | if nargs > 2: nzy = int(sys.argv[2]) 34 | if nargs > 3: lenx = 
float(sys.argv[3]) 35 | if nargs > 4: leny = float(sys.argv[4]) 36 | except: 37 | usage() 38 | if nzx <= 0 or nzy <= 0 or lenx <= 0. or leny <= 0.: usage() 39 | 40 | nz = nzx * nzy 41 | npx = nzx + 1 42 | npy = nzy + 1 43 | np = npx * npy 44 | 45 | filename = "rect%dx%d.gmv" % (nzx, nzy) 46 | file = open(filename, "w") 47 | 48 | # write header 49 | file.write("gmvinput ascii\n") 50 | 51 | # write node header 52 | file.write("nodes %9d\n" % np) 53 | 54 | # write node coordinates 55 | x = np * [0] 56 | y = np * [0] 57 | ijtop = np * [0] 58 | for n in range(np): 59 | i = n % npx 60 | j = n / npx 61 | x[n] = (lenx * float(i) / nzx) 62 | y[n] = (leny * float(j) / nzy) 63 | ijtop[j * npx + i] = n 64 | 65 | writecoords(file, np, x) 66 | writecoords(file, np, y) 67 | writecoords(file, np, np * [0]) 68 | 69 | # write cell header 70 | file.write("cells %9d\n" % nz) 71 | 72 | # write cells 73 | ztop = nz * [[]] 74 | for n in range(nz): 75 | i = n % nzx 76 | j = n / nzx 77 | # +1 is to convert from 0-based to 1-based 78 | p0 = ijtop[j * npx + i] + 1 79 | p1 = ijtop[j * npx + (i+1)] + 1 80 | p2 = ijtop[(j+1) * npx + (i+1)] + 1 81 | p3 = ijtop[(j+1) * npx + i] + 1 82 | ztop[n] = [p0, p1, p2, p3] 83 | 84 | for n in range(nz): 85 | file.write(" general 1\n") 86 | file.write(" 4\n") 87 | pl = ztop[n] 88 | file.write(" %9d %9d %9d %9d\n" % (pl[0], pl[1], pl[2], pl[3])) 89 | 90 | # write end-of-file marker 91 | file.write("endgmv\n") 92 | 93 | print "Wrote %s" % filename 94 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/tools/gmvrect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # gmvrect.py 5 | # Writes a rectangular gmv mesh with user-specified dimensions 6 | # 7 | 8 | import sys 9 | 10 | def usage(): 11 | print "Usage: gmvrect.py NZX [NZY [LENX [LENY]]]" 12 | print "where nzx, nzy = number of zones in x, y directions" 13 | print " (no 
default for nzx; default nzy = nzx)" 14 | print " lenx, leny = total length in x, y directions" 15 | print " (default for both = 1.0)" 16 | sys.exit(0) 17 | 18 | def writecoords(file, np, x): 19 | for i in range(np): 20 | if i % 10 == 0: s = " " 21 | s += "%16.8E" % x[i] 22 | if (i % 10 == 9) or (i == np - 1): file.write("%s\n" % s) 23 | 24 | nargs = len(sys.argv) 25 | nzx = 0 26 | nzy = 0 27 | lenx = 1.0 28 | leny = 1.0 29 | if nargs < 2 or nargs > 5: usage() 30 | try: 31 | nzx = int(sys.argv[1]) 32 | nzy = nzx 33 | if nargs > 2: nzy = int(sys.argv[2]) 34 | if nargs > 3: lenx = float(sys.argv[3]) 35 | if nargs > 4: leny = float(sys.argv[4]) 36 | except: 37 | usage() 38 | if nzx <= 0 or nzy <= 0 or lenx <= 0. or leny <= 0.: usage() 39 | 40 | nz = nzx * nzy 41 | npx = nzx + 1 42 | npy = nzy + 1 43 | np = npx * npy 44 | 45 | filename = "rect%dx%d.gmv" % (nzx, nzy) 46 | file = open(filename, "w") 47 | 48 | # write header 49 | file.write("gmvinput ascii\n") 50 | 51 | # write node header 52 | file.write("nodes %9d\n" % np) 53 | 54 | # write node coordinates 55 | x = np * [0] 56 | y = np * [0] 57 | ijtop = np * [0] 58 | for n in range(np): 59 | i = n % npx 60 | j = n / npx 61 | x[n] = (lenx * float(i) / nzx) 62 | y[n] = (leny * float(j) / nzy) 63 | ijtop[j * npx + i] = n 64 | 65 | writecoords(file, np, x) 66 | writecoords(file, np, y) 67 | writecoords(file, np, np * [0]) 68 | 69 | # write cell header 70 | file.write("cells %9d\n" % nz) 71 | 72 | # write cells 73 | ztop = nz * [[]] 74 | for n in range(nz): 75 | i = n % nzx 76 | j = n / nzx 77 | # +1 is to convert from 0-based to 1-based 78 | p0 = ijtop[j * npx + i] + 1 79 | p1 = ijtop[j * npx + (i+1)] + 1 80 | p2 = ijtop[(j+1) * npx + (i+1)] + 1 81 | p3 = ijtop[(j+1) * npx + i] + 1 82 | ztop[n] = [p0, p1, p2, p3] 83 | 84 | for n in range(nz): 85 | file.write(" general 1\n") 86 | file.write(" 4\n") 87 | pl = ztop[n] 88 | file.write(" %9d %9d %9d %9d\n" % (pl[0], pl[1], pl[2], pl[3])) 89 | 90 | # write end-of-file marker 91 
| file.write("endgmv\n") 92 | 93 | print("Wrote %s" % filename) 94 | -------------------------------------------------------------------------------- /Lecture1/HIP/vectorAdd_w_cpp_ext/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.21 FATAL_ERROR) 2 | project(Vectoradd LANGUAGES CXX) 3 | include(CTest) 4 | 5 | set (CMAKE_CXX_STANDARD 14) 6 | 7 | if (NOT CMAKE_BUILD_TYPE) 8 | set(CMAKE_BUILD_TYPE RelWithDebInfo) 9 | endif(NOT CMAKE_BUILD_TYPE) 10 | 11 | string(REPLACE "-O2" "-O3" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") # quoted so an empty flags variable is still passed as the input argument 12 | 13 | if (NOT CMAKE_GPU_RUNTIME) 14 | set(GPU_RUNTIME "ROCM" CACHE STRING "Switches between ROCM and CUDA") 15 | else (NOT CMAKE_GPU_RUNTIME) 16 | set(GPU_RUNTIME "${CMAKE_GPU_RUNTIME}" CACHE STRING "Switches between ROCM and CUDA") 17 | endif (NOT CMAKE_GPU_RUNTIME) 18 | # Really should only be ROCM or CUDA, but allowing HIP because it is the currently built-in option 19 | set(GPU_RUNTIMES "ROCM" "CUDA" "HIP") 20 | if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) 21 | set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP, ROCM, or CUDA.") 22 | message(FATAL_ERROR ${ERROR_MESSAGE}) 23 | endif() 24 | # GPU_RUNTIME for AMD GPUs should really be ROCM, if selecting AMD GPUs 25 | # so manually resetting to HIP if ROCM is selected 26 | if ("${GPU_RUNTIME}" STREQUAL "ROCM") 27 | set(GPU_RUNTIME "HIP") 28 | endif () 29 | set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) 30 | 31 | enable_language(${GPU_RUNTIME}) 32 | set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) 33 | set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) 34 | 35 | set(CMAKE_${GPU_RUNTIME}_FLAGS_DEBUG "-ggdb") 36 | 37 | set(VECTORADD_CXX_SRCS vectoradd.cpp) 38 | 39 | if (DEFINED ENV{HIP_PATH}) 40 | set(HIP_PATH $ENV{HIP_PATH}) 41 | else (DEFINED ENV{HIP_PATH}) 42 | execute_process(COMMAND hipconfig --path OUTPUT_VARIABLE
HIP_PATH ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) # drop any trailing newline so the -I path built below stays intact 43 | endif (DEFINED ENV{HIP_PATH}) 44 | 45 | add_executable(vectoradd ${VECTORADD_CXX_SRCS} ) 46 | 47 | # Make example runnable using ctest 48 | add_test(NAME Vectoradd COMMAND vectoradd ) 49 | set_property(TEST Vectoradd PROPERTY PASS_REGULAR_EXPRESSION "PASSED!") 50 | 51 | set(ROCMCC_FLAGS "${ROCMCC_FLAGS} -munsafe-fp-atomics") 52 | set(CUDACC_FLAGS "${CUDACC_FLAGS} ") 53 | 54 | if ("${GPU_RUNTIME}" STREQUAL "HIP") 55 | set(HIPCC_FLAGS "${ROCMCC_FLAGS}") 56 | else () 57 | set(HIPCC_FLAGS "${CUDACC_FLAGS} -I${HIP_PATH}/include") 58 | endif () 59 | 60 | set_source_files_properties(${VECTORADD_CXX_SRCS} PROPERTIES LANGUAGE ${GPU_RUNTIME}) 61 | set_source_files_properties(vectoradd.cpp PROPERTIES COMPILE_FLAGS "${HIPCC_FLAGS}") 62 | 63 | install(TARGETS vectoradd) 64 | -------------------------------------------------------------------------------- /Lecture4/saxpy/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.21 FATAL_ERROR) 2 | project(Saxpy LANGUAGES CXX) 3 | include(CTest) 4 | 5 | set (CMAKE_CXX_STANDARD 14) 6 | 7 | if (NOT CMAKE_BUILD_TYPE) 8 | set(CMAKE_BUILD_TYPE RelWithDebInfo) 9 | endif(NOT CMAKE_BUILD_TYPE) 10 | 11 | string(REPLACE "-O2" "-O3" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") # quoted so an empty flags variable is still passed as the input argument 12 | 13 | if (NOT CMAKE_GPU_RUNTIME) 14 | set(GPU_RUNTIME "ROCM" CACHE STRING "Switches between ROCM and CUDA") 15 | else (NOT CMAKE_GPU_RUNTIME) 16 | set(GPU_RUNTIME "${CMAKE_GPU_RUNTIME}" CACHE STRING "Switches between ROCM and CUDA") 17 | endif (NOT CMAKE_GPU_RUNTIME) 18 | # Really should only be ROCM or CUDA, but allowing HIP because it is the currently built-in option 19 | set(GPU_RUNTIMES "ROCM" "CUDA" "HIP") 20 | if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) 21 | set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP, ROCM, or CUDA.") 22 |
message(FATAL_ERROR ${ERROR_MESSAGE}) 23 | endif() 24 | # GPU_RUNTIME for AMD GPUs should really be ROCM, if selecting AMD GPUs 25 | # so manually resetting to HIP if ROCM is selected 26 | if ("${GPU_RUNTIME}" STREQUAL "ROCM") 27 | set(GPU_RUNTIME "HIP") 28 | endif () 29 | set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) 30 | 31 | enable_language(${GPU_RUNTIME}) 32 | set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) 33 | set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) 34 | 35 | set(CMAKE_${GPU_RUNTIME}_FLAGS_DEBUG "-ggdb") 36 | 37 | set(SAXPY_CXX_SRCS "") 38 | 39 | set(SAXPY_HIP_SRCS saxpy.hip) 40 | 41 | if (DEFINED ENV{HIP_PATH}) 42 | set(HIP_PATH $ENV{HIP_PATH}) 43 | else (DEFINED ENV{HIP_PATH}) 44 | execute_process(COMMAND hipconfig --path OUTPUT_VARIABLE HIP_PATH ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) # drop any trailing newline so the -I path built below stays intact 45 | endif (DEFINED ENV{HIP_PATH}) 46 | 47 | add_executable(saxpy ${SAXPY_CXX_SRCS} ${SAXPY_HIP_SRCS} ) 48 | 49 | # Make example runnable using ctest 50 | add_test(NAME Saxpy COMMAND saxpy ) 51 | set_property(TEST Saxpy 52 | PROPERTY PASS_REGULAR_EXPRESSION "PASSED!") 53 | 54 | set(ROCMCC_FLAGS "${ROCMCC_FLAGS} -munsafe-fp-atomics") 55 | set(CUDACC_FLAGS "${CUDACC_FLAGS} ") 56 | 57 | if ("${GPU_RUNTIME}" STREQUAL "HIP") 58 | set(HIPCC_FLAGS "${ROCMCC_FLAGS}") 59 | else () 60 | set(HIPCC_FLAGS "${CUDACC_FLAGS} -I${HIP_PATH}/include") 61 | endif () 62 | 63 | set_source_files_properties(${SAXPY_HIP_SRCS} PROPERTIES LANGUAGE ${GPU_RUNTIME}) 64 | set_source_files_properties(saxpy.hip PROPERTIES COMPILE_FLAGS "${HIPCC_FLAGS}") 65 | 66 | install(TARGETS saxpy) 67 | -------------------------------------------------------------------------------- /Lecture1/HIP/dgemm/src/utils.cpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
3 | ***************************************************************************/ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "utils.h" 11 | 12 | 13 | 14 | vstring 15 | split(std::string const& str, char del){ 16 | vstring ret; 17 | std::stringstream itr(str); 18 | std::string val; 19 | while (std::getline(itr, val, del)){ 20 | ret.push_back(val); 21 | } 22 | return ret; 23 | } 24 | 25 | 26 | std::string 27 | now_str(){ 28 | auto const now = std::chrono::system_clock::now(); // to_time_t() below requires a system_clock time_point 29 | auto const now_t = std::chrono::system_clock::to_time_t(now); 30 | auto const ms = std::chrono::duration_cast<std::chrono::milliseconds>(now.time_since_epoch()) % 1000; 31 | std::stringstream ss; 32 | ss << std::put_time(std::localtime(&now_t), "%Y-%m-%d %H:%M:%S") 33 | << '.' << std::setfill('0') << std::setw(3) << ms.count(); 34 | 35 | return ss.str(); 36 | } 37 | 38 | 39 | std::string 40 | join(std::vector const& vec, std::string const& del){ 41 | std::string ret; 42 | for (size_t i=0; i= path.size()){ 56 | return ""; 57 | } 58 | 59 | return path.substr(last_index + 1); 60 | } 61 | 62 | 63 | std::string 64 | to_lower(std::string const& str){ 65 | auto ret = str; 66 | std::transform( 67 | ret.begin(), ret.end(), ret.begin(), 68 | [](unsigned char x){ return std::tolower(x);}); 69 | return ret; 70 | } 71 | 72 | 73 | Stats 74 | basic_stats(std::vector const& data){ 75 | if (data.size() == 0){ 76 | return Stats(); 77 | } 78 | 79 | Stats ret; 80 | auto const mean = std::accumulate(data.begin(), data.end(), 0.0) / (double)data.size(); 81 | auto const min_max = std::minmax_element(data.begin(), data.end()); 82 | 83 | ret.variance = std::accumulate( 84 | data.begin(), data.end(), 0.0, [mean](double x, double y){ 85 | auto const m1 = y - mean; 86 | return x + m1*m1; 87 | } 88 | ) / (double)data.size(); 89 | 90 | ret.mean = mean; 91 | ret.min = *min_max.first; 92 | ret.max = *min_max.second; 93 | return ret; 94 | } 95 |
-------------------------------------------------------------------------------- /Lecture1/HIP/saxpy/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.21 FATAL_ERROR) 2 | project(Saxpy LANGUAGES CXX) 3 | include(CTest) 4 | 5 | set (CMAKE_CXX_STANDARD 14) 6 | 7 | if (NOT CMAKE_BUILD_TYPE) 8 | set(CMAKE_BUILD_TYPE RelWithDebInfo) 9 | endif(NOT CMAKE_BUILD_TYPE) 10 | 11 | string(REPLACE "-O2" "-O3" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") # quoted so an empty flags variable is still passed as the input argument 12 | 13 | if (NOT CMAKE_GPU_RUNTIME) 14 | set(GPU_RUNTIME "ROCM" CACHE STRING "Switches between ROCM and CUDA") 15 | else (NOT CMAKE_GPU_RUNTIME) 16 | set(GPU_RUNTIME "${CMAKE_GPU_RUNTIME}" CACHE STRING "Switches between ROCM and CUDA") 17 | endif (NOT CMAKE_GPU_RUNTIME) 18 | # Really should only be ROCM or CUDA, but allowing HIP because it is the currently built-in option 19 | set(GPU_RUNTIMES "ROCM" "CUDA" "HIP") 20 | if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) 21 | set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP, ROCM, or CUDA.") 22 | message(FATAL_ERROR ${ERROR_MESSAGE}) 23 | endif() 24 | # GPU_RUNTIME for AMD GPUs should really be ROCM, if selecting AMD GPUs 25 | # so manually resetting to HIP if ROCM is selected 26 | if ("${GPU_RUNTIME}" STREQUAL "ROCM") 27 | set(GPU_RUNTIME "HIP") 28 | endif () 29 | set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) 30 | 31 | enable_language(${GPU_RUNTIME}) 32 | set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) 33 | set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) 34 | 35 | set(CMAKE_${GPU_RUNTIME}_FLAGS_DEBUG "-ggdb") 36 | 37 | set(SAXPY_CXX_SRCS "") 38 | 39 | set(SAXPY_HIP_SRCS saxpy.hip) 40 | 41 | if (DEFINED ENV{HIP_PATH}) 42 | set(HIP_PATH $ENV{HIP_PATH}) 43 | else (DEFINED ENV{HIP_PATH}) 44 | execute_process(COMMAND hipconfig --path OUTPUT_VARIABLE HIP_PATH ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) # drop any trailing newline so the -I path built below stays intact 45 | endif (DEFINED ENV{HIP_PATH}) 46 |
47 | add_executable(saxpy ${SAXPY_CXX_SRCS} ${SAXPY_HIP_SRCS} ) 48 | 49 | # Make example runnable using ctest 50 | add_test(NAME Saxpy COMMAND saxpy ) 51 | set_property(TEST Saxpy 52 | PROPERTY PASS_REGULAR_EXPRESSION "PASSED!") 53 | 54 | set(ROCMCC_FLAGS "${ROCMCC_FLAGS} -munsafe-fp-atomics") 55 | set(CUDACC_FLAGS "${CUDACC_FLAGS} ") 56 | 57 | if ("${GPU_RUNTIME}" STREQUAL "HIP") 58 | set(HIPCC_FLAGS "${ROCMCC_FLAGS}") 59 | else () 60 | set(HIPCC_FLAGS "${CUDACC_FLAGS} -I${HIP_PATH}/include") 61 | endif () 62 | 63 | set_source_files_properties(${SAXPY_HIP_SRCS} PROPERTIES LANGUAGE ${GPU_RUNTIME}) 64 | set_source_files_properties(saxpy.hip PROPERTIES COMPILE_FLAGS "${HIPCC_FLAGS}") 65 | 66 | install(TARGETS saxpy) 67 | -------------------------------------------------------------------------------- /Lecture2/HIPIFY/Pennant-orig/Makefile: -------------------------------------------------------------------------------- 1 | BUILDDIR := build 2 | PRODUCT := pennant 3 | 4 | SRCDIR := src 5 | 6 | HDRS := $(wildcard $(SRCDIR)/*.hh) 7 | SRCS := $(wildcard $(SRCDIR)/*.cc) 8 | OBJS := $(SRCS:$(SRCDIR)/%.cc=$(BUILDDIR)/%.o) 9 | DEPS := $(SRCS:$(SRCDIR)/%.cc=$(BUILDDIR)/%.d) 10 | 11 | HDRS += $(SRCDIR)/HydroGPU.hh 12 | SRCS += $(SRCDIR)/HydroGPU.cu 13 | OBJS += $(BUILDDIR)/HydroGPU.o 14 | DEPS += $(BUILDDIR)/HydroGPU.d 15 | 16 | BINARY := $(BUILDDIR)/$(PRODUCT) 17 | 18 | CPPFLAGS := -I.
19 | 20 | # begin compiler-dependent flags 21 | # 22 | # gcc flags: 23 | #CXX := g++ 24 | #CXXFLAGS_DEBUG := -g 25 | #CXXFLAGS_OPT := -O3 26 | #CXXFLAGS_OPENMP := -fopenmp 27 | 28 | # intel flags: 29 | CXX := icpc 30 | CXXFLAGS_DEBUG := -g 31 | CXXFLAGS_OPT := -O3 -fast -fno-alias 32 | CXXFLAGS_OPENMP := -openmp 33 | 34 | # pgi flags: 35 | #CXX := pgCC 36 | #CXXFLAGS_DEBUG := -g 37 | #CXXFLAGS_OPT := -O3 -fastsse 38 | #CXXFLAGS_OPENMP := -mp 39 | 40 | # end compiler-dependent flags 41 | 42 | CUDAC := nvcc 43 | CUDACFLAGS := -arch=sm_21 --ptxas-options=-v 44 | CUDACFLAGS_DEBUG := -G -lineinfo 45 | CUDACFLAGS_OPT := -O3 46 | 47 | LD := $(CXX) 48 | LDFLAGS := -L$(CUDA_INSTALL_PATH)/lib64 -lcudart 49 | 50 | # select optimized or debug 51 | CXXFLAGS := $(CXXFLAGS_OPT) $(CPPFLAGS) 52 | CUDACFLAGS += $(CUDACFLAGS_OPT) $(CPPFLAGS) 53 | #CXXFLAGS := $(CXXFLAGS_DEBUG) $(CPPFLAGS) 54 | #CUDACFLAGS += $(CUDACFLAGS_DEBUG) $(CPPFLAGS) 55 | 56 | # add openmp flags (comment out for serial build) 57 | #CXXFLAGS += $(CXXFLAGS_OPENMP) 58 | #LDFLAGS += $(CXXFLAGS_OPENMP) 59 | 60 | all : $(BINARY) 61 | 62 | -include $(DEPS) 63 | 64 | $(BINARY) : $(OBJS) 65 | @echo linking $@ 66 | $(maketargetdir) 67 | $(LD) -o $@ $^ $(LDFLAGS) 68 | 69 | $(BUILDDIR)/%.o : $(SRCDIR)/%.cc 70 | @echo compiling $< 71 | $(maketargetdir) 72 | $(CXX) $(CXXFLAGS) $(CXXINCLUDES) -c -o $@ $< 73 | 74 | $(BUILDDIR)/%.o : $(SRCDIR)/%.cu 75 | @echo compiling $< 76 | $(maketargetdir) 77 | @# unsetting of CPATH is needed to make nvcc and icpc 78 | @# play nicely together 79 | (CPATH=;$(CUDAC) $(CUDACFLAGS) $(CUDACINCLUDES) -c -o $@ $<) 80 | 81 | $(BUILDDIR)/%.d : $(SRCDIR)/%.cc 82 | @echo making depends for $< 83 | $(maketargetdir) 84 | @$(CXX) $(CXXFLAGS) $(CXXINCLUDES) -M $< | sed "1s![^ \t]\+\.o!$(@:.d=.o) $@!" >$@ 85 | 86 | $(BUILDDIR)/%.d : $(SRCDIR)/%.cu 87 | @echo making depends for $< 88 | $(maketargetdir) 89 | @$(CUDAC) $(CUDACFLAGS) $(CUDACINCLUDES) -M $< | sed "1s![^ \t]\+\.o!$(@:.d=.o) $@!"
>$@ 90 | 91 | define maketargetdir 92 | -@mkdir -p $(dir $@) > /dev/null 2>&1 93 | endef 94 | 95 | clean : 96 | rm -f $(BINARY) $(OBJS) $(DEPS) 97 | -------------------------------------------------------------------------------- /Lecture1/HIP/hip-stream/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.21 FATAL_ERROR) 2 | project(Stream LANGUAGES CXX) 3 | include(CTest) 4 | 5 | set (CMAKE_CXX_STANDARD 14) 6 | 7 | if (NOT CMAKE_BUILD_TYPE) 8 | set(CMAKE_BUILD_TYPE RelWithDebInfo) 9 | endif(NOT CMAKE_BUILD_TYPE) 10 | 11 | string(REPLACE "-O2" "-O3" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") # quoted so an empty flags variable is still passed as the input argument 12 | 13 | if (NOT CMAKE_GPU_RUNTIME) 14 | set(GPU_RUNTIME "ROCM" CACHE STRING "Switches between ROCM and CUDA") 15 | else (NOT CMAKE_GPU_RUNTIME) 16 | set(GPU_RUNTIME "${CMAKE_GPU_RUNTIME}" CACHE STRING "Switches between ROCM and CUDA") 17 | endif (NOT CMAKE_GPU_RUNTIME) 18 | # Really should only be ROCM or CUDA, but allowing HIP because it is the currently built-in option 19 | set(GPU_RUNTIMES "ROCM" "CUDA" "HIP") 20 | if(NOT "${GPU_RUNTIME}" IN_LIST GPU_RUNTIMES) 21 | set(ERROR_MESSAGE "GPU_RUNTIME is set to \"${GPU_RUNTIME}\".\nGPU_RUNTIME must be either HIP, ROCM, or CUDA.") 22 | message(FATAL_ERROR ${ERROR_MESSAGE}) 23 | endif() 24 | # GPU_RUNTIME for AMD GPUs should really be ROCM, if selecting AMD GPUs 25 | # so manually resetting to HIP if ROCM is selected 26 | if ("${GPU_RUNTIME}" STREQUAL "ROCM") 27 | set(GPU_RUNTIME "HIP") 28 | endif () 29 | set_property(CACHE GPU_RUNTIME PROPERTY STRINGS ${GPU_RUNTIMES}) 30 | 31 | enable_language(${GPU_RUNTIME}) 32 | set(CMAKE_${GPU_RUNTIME}_EXTENSIONS OFF) 33 | set(CMAKE_${GPU_RUNTIME}_STANDARD_REQUIRED ON) 34 | 35 | set(CMAKE_${GPU_RUNTIME}_FLAGS_DEBUG "-ggdb") 36 | 37 | set(STREAM_CXX_SRCS "") 38 | 39 | set(STREAM_HIP_SRCS stream.hip) 40 | 41 | if (DEFINED ENV{HIP_PATH}) 42 | set(HIP_PATH $ENV{HIP_PATH})
43 | else (DEFINED ENV{HIP_PATH}) 44 | execute_process(COMMAND hipconfig --path OUTPUT_VARIABLE HIP_PATH ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) # drop any trailing newline so the -I path built below stays intact 45 | endif (DEFINED ENV{HIP_PATH}) 46 | 47 | add_executable(stream ${STREAM_CXX_SRCS} ${STREAM_HIP_SRCS} ) 48 | 49 | # Make example runnable using ctest 50 | #add_test(NAME Stream COMMAND stream ) 51 | #set_property(TEST Stream 52 | # PROPERTY PASS_REGULAR_EXPRESSION "PASSED!") 53 | 54 | set(ROCMCC_FLAGS "${ROCMCC_FLAGS} -munsafe-fp-atomics") 55 | set(CUDACC_FLAGS "${CUDACC_FLAGS} ") 56 | 57 | if ("${GPU_RUNTIME}" STREQUAL "HIP") 58 | set(HIPCC_FLAGS "${ROCMCC_FLAGS}") 59 | else () 60 | set(HIPCC_FLAGS "${CUDACC_FLAGS} -I${HIP_PATH}/include") 61 | endif () 62 | 63 | set_source_files_properties(${STREAM_HIP_SRCS} PROPERTIES LANGUAGE ${GPU_RUNTIME}) 64 | set_source_files_properties(stream.hip PROPERTIES COMPILE_FLAGS "${HIPCC_FLAGS}") 65 | 66 | install(TARGETS stream) 67 | --------------------------------------------------------------------------------