├── vardecl.cpp ├── ppopp-18.pdf ├── docs ├── codegen.pdf └── Makefile ├── grammar.hpp ├── examples ├── j2d25pt │ ├── common │ │ ├── cuda_header.cu │ │ ├── timer.hpp │ │ └── time.awk │ ├── j2d25pt.driver.cpp │ ├── j2d25pt_gold.cpp │ ├── j2d25pt-orig.cu │ ├── reorder.sh │ ├── j2d25pt-reg.cu │ ├── j2d25pt-unroll.cu │ └── Makefile ├── j2d64pt │ ├── common │ │ ├── cuda_header.cu │ │ ├── timer.hpp │ │ └── time.awk │ ├── j2d64pt.driver.cpp │ ├── j2d64pt_gold.cpp │ ├── reorder.sh │ ├── j2d64pt-orig.cu │ ├── j2d64pt-reg.cu │ └── Makefile ├── j2d81pt │ ├── common │ │ ├── cuda_header.cu │ │ ├── timer.hpp │ │ └── time.awk │ ├── j2d81pt.driver.cpp │ ├── j2d81pt_gold.cpp │ ├── reorder.sh │ ├── j2d81pt-orig.cu │ ├── j2d81pt-reg.cu │ └── Makefile ├── j3d27pt │ ├── common │ │ ├── cuda_header.cu │ │ ├── timer.hpp │ │ └── time.awk │ ├── j3d27pt.driver.cpp │ ├── j3d27pt_gold.cpp │ ├── reorder.sh │ ├── j3d27pt-orig.cu │ ├── j3d27pt-reg.cu │ ├── j3d27pt-unroll.cu │ └── Makefile ├── derivative-2 │ ├── common │ │ ├── cuda_header.cu │ │ └── timer.hpp │ ├── derivative.driver.cpp │ └── reorder.sh ├── hypterm-3 │ ├── common │ │ ├── cuda_header.cu │ │ └── timer.hpp │ ├── hypterm.driver.cpp │ └── reorder-3.sh ├── j3d125pt-new │ ├── common │ │ ├── cuda_header.cu │ │ ├── timer.hpp │ │ └── time.awk │ ├── j3d125pt.driver.cpp │ ├── reorder.sh │ ├── j3d125pt_gold.cpp │ ├── Makefile │ └── j3d125pt-orig.cu ├── j3d125pt │ ├── common │ │ ├── cuda_header.cu │ │ ├── timer.hpp │ │ └── time.awk │ ├── j3d125pt.driver.cpp │ ├── reorder.sh │ ├── j3d125pt_gold.cpp │ ├── Makefile │ ├── j3d125pt-orig.cu │ ├── j3d125pt-reg.cu │ └── j3d125pt-unroll.cu ├── hypterm-maxfuse │ ├── common │ │ ├── cuda_header.cu │ │ └── timer.hpp │ ├── hypterm.driver.cpp │ └── reorder.sh ├── rhs4th3fort-3 │ ├── common │ │ ├── cuda_header.cu │ │ ├── timer.hpp │ │ └── time.awk │ ├── sw4.driver.cpp │ └── reorder.sh ├── derivative-maxfuse │ ├── common │ │ ├── cuda_header.cu │ │ ├── timer.hpp │ │ └── time.awk │ ├── derivative.driver.cpp │ ├── 
// Static-only holder for the parser front end: every member is static, so the
// class is used without being instantiated.
class grammar {
	public:
		// Pointer to a start_node (type declared outside this header).
		// NOTE(review): presumably the root of the parsed program that
		// parse() fills in -- confirm against the grammar implementation.
		static start_node *start;
		// Takes the FILE stream to read from.
		// NOTE(review): presumably must be called before parse() -- confirm.
		static void set_input (FILE *);
		// Runs the parser; takes no arguments and returns nothing.
		static void parse ();
};
// Terminates the program when the CUDA runtime reports a pending error.
//
// message: caller-supplied label printed in front of the CUDA error string.
//
// cudaGetLastError() both reads and clears the runtime's last-error state, so
// this is meant to be called right after the operation being checked.
// On error the text goes to stdout and the process exits with status -1.
void Check_CUDA_Error(const char* message){
	const cudaError_t status = cudaGetLastError();
	if( status == cudaSuccess )
		return;

	printf("CUDA-ERROR:%s, %s\n", message, cudaGetErrorString(status));
	exit(-1);
}
// Terminates the program when the CUDA runtime reports a pending error.
//
// message: caller-supplied label printed in front of the CUDA error string.
//
// cudaGetLastError() both reads and clears the runtime's last-error state, so
// this is meant to be called right after the operation being checked.
// On error the text goes to stdout and the process exits with status -1.
void Check_CUDA_Error(const char* message){
	const cudaError_t status = cudaGetLastError();
	if( status == cudaSuccess )
		return;

	printf("CUDA-ERROR:%s, %s\n", message, cudaGetErrorString(status));
	exit(-1);
}
// Terminates the program when the CUDA runtime reports a pending error.
//
// message: caller-supplied label printed in front of the CUDA error string.
//
// cudaGetLastError() both reads and clears the runtime's last-error state, so
// this is meant to be called right after the operation being checked.
// On error the text goes to stdout and the process exits with status -1.
void Check_CUDA_Error(const char* message){
	const cudaError_t status = cudaGetLastError();
	if( status == cudaSuccess )
		return;

	printf("CUDA-ERROR:%s, %s\n", message, cudaGetErrorString(status));
	exit(-1);
}
// Terminates the program when the CUDA runtime reports a pending error.
//
// message: caller-supplied label printed in front of the CUDA error string.
//
// cudaGetLastError() both reads and clears the runtime's last-error state, so
// this is meant to be called right after the operation being checked.
// On error the text goes to stdout and the process exits with status -1.
void Check_CUDA_Error(const char* message){
	const cudaError_t status = cudaGetLastError();
	if( status == cudaSuccess )
		return;

	printf("CUDA-ERROR:%s, %s\n", message, cudaGetErrorString(status));
	exit(-1);
}
// Terminates the program when the CUDA runtime reports a pending error.
//
// message: caller-supplied label printed in front of the CUDA error string.
//
// cudaGetLastError() both reads and clears the runtime's last-error state, so
// this is meant to be called right after the operation being checked.
// On error the text goes to stdout and the process exits with status -1.
void Check_CUDA_Error(const char* message){
	const cudaError_t status = cudaGetLastError();
	if( status == cudaSuccess )
		return;

	printf("CUDA-ERROR:%s, %s\n", message, cudaGetErrorString(status));
	exit(-1);
}
#!/usr/bin/env bash
# Builds and times every benchmark directory directly below the current
# directory, collecting the results in ./output.txt.
#
# Required environment:
#   CUDAHOME    - must be non-empty (exit 125 otherwise)
#   CAPABILITY  - must be non-empty (exit 125 otherwise)
#
# For each sub-directory the script writes a banner to output.txt, runs
# `make` inside it, and appends the output of ./common/time.awk.
# The script relies on bash features ([[ ]], $'...'), hence the bash shebang.

if [[ -z "${CUDAHOME}" ]]; then
	echo "CUDAHOME unset"
	exit 125
fi
if [[ -z "${CAPABILITY}" ]]; then
	echo "CAPABILITY unset"
	exit 125
fi

cur=$(pwd)
out="${cur}/output.txt"
rm -f "${out}"
touch "${out}"

# Iterate with a glob instead of parsing `ls` so directory names containing
# whitespace are handled as single entries.
for dir in */
do
	# When no sub-directory exists the glob stays literal; skip it.
	[[ -d "${dir}" ]] || continue

	echo $'\n\n\n==========================================================' >> "${out}"
	echo "${dir}" >> "${out}"
	echo ========================================================== >> "${out}"

	# Only build and time when the directory could actually be entered;
	# otherwise make/time.awk would run in the wrong place.
	if cd "${dir}"; then
		make
		./common/time.awk >> "${out}"
		# Losing the starting directory would break every later iteration.
		cd "${cur}" || exit 1
	else
		echo "cannot enter ${dir}, skipping" >&2
	fi
done
#ifndef __TIMER_HPP__
#define __TIMER_HPP__

#include <chrono>

// Number of timed repetitions; can be overridden on the compiler command line.
#ifndef NUMRUNS
#define NUMRUNS 2
#endif

typedef std::chrono::high_resolution_clock HighResolutionClock;
typedef std::chrono::milliseconds milliseconds;
typedef std::chrono::time_point<HighResolutionClock> TimePoint;


/// Timers
// File-local start/stop stamps shared by the three helpers below.
static TimePoint globalTimerStart;
static TimePoint globalTimerStop;

// Records the current time as the start of the measured interval.
static void startTimer()
{
	globalTimerStart = HighResolutionClock::now();
}

// Records the current time as the end of the measured interval.
static void stopTimer()
{
	globalTimerStop = HighResolutionClock::now();
}

// Returns the interval between the last startTimer() and stopTimer() calls
// in milliseconds.
//
// The value keeps its fractional part: converting through an integral
// millisecond duration would round every sub-millisecond interval down to 0.
static double getElapsedTime()
{
	const std::chrono::duration<double, std::milli> elapsed =
		globalTimerStop - globalTimerStart;
	return elapsed.count();
}

#endif
#ifndef __TIMER_HPP__
#define __TIMER_HPP__

#include <chrono>

// Number of timed repetitions; can be overridden on the compiler command line.
#ifndef NUMRUNS
#define NUMRUNS 2
#endif

typedef std::chrono::high_resolution_clock HighResolutionClock;
typedef std::chrono::milliseconds milliseconds;
typedef std::chrono::time_point<HighResolutionClock> TimePoint;


/// Timers
// File-local start/stop stamps shared by the three helpers below.
static TimePoint globalTimerStart;
static TimePoint globalTimerStop;

// Records the current time as the start of the measured interval.
static void startTimer()
{
	globalTimerStart = HighResolutionClock::now();
}

// Records the current time as the end of the measured interval.
static void stopTimer()
{
	globalTimerStop = HighResolutionClock::now();
}

// Returns the interval between the last startTimer() and stopTimer() calls
// in milliseconds.
//
// The value keeps its fractional part: converting through an integral
// millisecond duration would round every sub-millisecond interval down to 0.
static double getElapsedTime()
{
	const std::chrono::duration<double, std::milli> elapsed =
		globalTimerStop - globalTimerStart;
	return elapsed.count();
}

#endif
#ifndef __TIMER_HPP__
#define __TIMER_HPP__

#include <chrono>

// Number of timed repetitions; can be overridden on the compiler command line.
#ifndef NUMRUNS
#define NUMRUNS 2
#endif

typedef std::chrono::high_resolution_clock HighResolutionClock;
typedef std::chrono::milliseconds milliseconds;
typedef std::chrono::time_point<HighResolutionClock> TimePoint;


/// Timers
// File-local start/stop stamps shared by the three helpers below.
static TimePoint globalTimerStart;
static TimePoint globalTimerStop;

// Records the current time as the start of the measured interval.
static void startTimer()
{
	globalTimerStart = HighResolutionClock::now();
}

// Records the current time as the end of the measured interval.
static void stopTimer()
{
	globalTimerStop = HighResolutionClock::now();
}

// Returns the interval between the last startTimer() and stopTimer() calls
// in milliseconds.
//
// The value keeps its fractional part: converting through an integral
// millisecond duration would round every sub-millisecond interval down to 0.
static double getElapsedTime()
{
	const std::chrono::duration<double, std::milli> elapsed =
		globalTimerStop - globalTimerStart;
	return elapsed.count();
}

#endif
#ifndef __TIMER_HPP__
#define __TIMER_HPP__

#include <chrono>

// Number of timed repetitions; can be overridden on the compiler command line.
#ifndef NUMRUNS
#define NUMRUNS 2
#endif

typedef std::chrono::high_resolution_clock HighResolutionClock;
typedef std::chrono::milliseconds milliseconds;
typedef std::chrono::time_point<HighResolutionClock> TimePoint;


/// Timers
// File-local start/stop stamps shared by the three helpers below.
static TimePoint globalTimerStart;
static TimePoint globalTimerStop;

// Records the current time as the start of the measured interval.
static void startTimer()
{
	globalTimerStart = HighResolutionClock::now();
}

// Records the current time as the end of the measured interval.
static void stopTimer()
{
	globalTimerStop = HighResolutionClock::now();
}

// Returns the interval between the last startTimer() and stopTimer() calls
// in milliseconds.
//
// The value keeps its fractional part: converting through an integral
// millisecond duration would round every sub-millisecond interval down to 0.
static double getElapsedTime()
{
	const std::chrono::duration<double, std::milli> elapsed =
		globalTimerStop - globalTimerStart;
	return elapsed.count();
}

#endif
#ifndef __TIMER_HPP__
#define __TIMER_HPP__

#include <chrono>

// Number of timed repetitions; can be overridden on the compiler command line.
#ifndef NUMRUNS
#define NUMRUNS 2
#endif

typedef std::chrono::high_resolution_clock HighResolutionClock;
typedef std::chrono::milliseconds milliseconds;
typedef std::chrono::time_point<HighResolutionClock> TimePoint;


/// Timers
// File-local start/stop stamps shared by the three helpers below.
static TimePoint globalTimerStart;
static TimePoint globalTimerStop;

// Records the current time as the start of the measured interval.
static void startTimer()
{
	globalTimerStart = HighResolutionClock::now();
}

// Records the current time as the end of the measured interval.
static void stopTimer()
{
	globalTimerStop = HighResolutionClock::now();
}

// Returns the interval between the last startTimer() and stopTimer() calls
// in milliseconds.
//
// The value keeps its fractional part: converting through an integral
// millisecond duration would round every sub-millisecond interval down to 0.
static double getElapsedTime()
{
	const std::chrono::duration<double, std::milli> elapsed =
		globalTimerStop - globalTimerStart;
	return elapsed.count();
}

#endif
#ifndef __TIMER_HPP__
#define __TIMER_HPP__

#include <chrono>

// Number of timed repetitions; can be overridden on the compiler command line.
#ifndef NUMRUNS
#define NUMRUNS 2
#endif

typedef std::chrono::high_resolution_clock HighResolutionClock;
typedef std::chrono::milliseconds milliseconds;
typedef std::chrono::time_point<HighResolutionClock> TimePoint;


/// Timers
// File-local start/stop stamps shared by the three helpers below.
static TimePoint globalTimerStart;
static TimePoint globalTimerStop;

// Records the current time as the start of the measured interval.
static void startTimer()
{
	globalTimerStart = HighResolutionClock::now();
}

// Records the current time as the end of the measured interval.
static void stopTimer()
{
	globalTimerStop = HighResolutionClock::now();
}

// Returns the interval between the last startTimer() and stopTimer() calls
// in milliseconds.
//
// The value keeps its fractional part: converting through an integral
// millisecond duration would round every sub-millisecond interval down to 0.
static double getElapsedTime()
{
	const std::chrono::duration<double, std::milli> elapsed =
		globalTimerStop - globalTimerStart;
	return elapsed.count();
}

#endif
#ifndef __TIMER_HPP__
#define __TIMER_HPP__

#include <chrono>

// Number of timed repetitions; can be overridden on the compiler command line.
#ifndef NUMRUNS
#define NUMRUNS 2
#endif

typedef std::chrono::high_resolution_clock HighResolutionClock;
typedef std::chrono::milliseconds milliseconds;
typedef std::chrono::time_point<HighResolutionClock> TimePoint;


/// Timers
// File-local start/stop stamps shared by the three helpers below.
static TimePoint globalTimerStart;
static TimePoint globalTimerStop;

// Records the current time as the start of the measured interval.
static void startTimer()
{
	globalTimerStart = HighResolutionClock::now();
}

// Records the current time as the end of the measured interval.
static void stopTimer()
{
	globalTimerStop = HighResolutionClock::now();
}

// Returns the interval between the last startTimer() and stopTimer() calls
// in milliseconds.
//
// The value keeps its fractional part: converting through an integral
// millisecond duration would round every sub-millisecond interval down to 0.
static double getElapsedTime()
{
	const std::chrono::duration<double, std::milli> elapsed =
		globalTimerStop - globalTimerStart;
	return elapsed.count();
}

#endif
node; 32 | } 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /examples/j2d25pt/j2d25pt.driver.cpp: -------------------------------------------------------------------------------- 1 | #include "common/common.hpp" 2 | #include 3 | #include 4 | 5 | extern "C" void host_code (double*, double*, int); 6 | extern "C" void j2d25pt_gold (double*, double*, int); 7 | 8 | int main(int argc, char** argv) { 9 | int N = 8196; 10 | 11 | double (*input)[8196] = (double (*)[8196]) getRandom2DArray(8196, 8196); 12 | double (*output)[8196] = (double (*)[8196]) getZero2DArray(8196, 8196); 13 | double (*output_gold)[8196] = (double (*)[8196]) getZero2DArray(8196, 8196); 14 | 15 | host_code ((double*)input, (double*)output, N); 16 | j2d25pt_gold((double*)input, (double*)output_gold, N); 17 | 18 | double error = checkError2D (N, (double*)output, (double*) output_gold, 2, N-2, 2, N-2); 19 | printf("[Test] RMS Error : %e\n",error); 20 | if (error > TOLERANCE) 21 | return -1; 22 | 23 | delete[] input; 24 | delete[] output; 25 | delete[] output_gold; 26 | } 27 | -------------------------------------------------------------------------------- /examples/j2d64pt/j2d64pt.driver.cpp: -------------------------------------------------------------------------------- 1 | #include "common/common.hpp" 2 | #include 3 | #include 4 | 5 | extern "C" void host_code (double*, double*, int); 6 | extern "C" void j2d64pt_gold (double*, double*, int); 7 | 8 | int main(int argc, char** argv) { 9 | int N = 8200; 10 | 11 | double (*input)[8200] = (double (*)[8200]) getRandom2DArray(8200, 8200); 12 | double (*output)[8200] = (double (*)[8200]) getZero2DArray(8200, 8200); 13 | double (*output_gold)[8200] = (double (*)[8200]) getZero2DArray(8200, 8200); 14 | 15 | host_code ((double*)input, (double*)output, N); 16 | j2d64pt_gold((double*)input, (double*)output_gold, N); 17 | 18 | double error = checkError2D (N, (double*)output, (double*) output_gold, 4, N-4, 4, 
// CPU reference implementation of the 25-point 2D stencil.
//
// l_in / l_out are flat buffers interpreted as 2D arrays whose rows are
// exactly 8196 doubles long. For every (row, col) with 2 <= row,col < N-2 the
// output element is a weighted sum over the 5x5 neighbourhood of the input;
// elements outside that range are not written.
extern "C" void j2d25pt_gold (const double* l_in, double* l_out, int N) {
	typedef const double (*in_rows_t)[8196];
	typedef double (*out_rows_t)[8196];

	in_rows_t src = (in_rows_t)l_in;
	out_rows_t dst = (out_rows_t)l_out;

	const int last = N-2;
	for (int row = 2; row < last; ++row) {
		// The five input rows that contribute to output row `row`.
		const double* r_m2 = src[row-2];
		const double* r_m1 = src[row-1];
		const double* r_0  = src[row];
		const double* r_p1 = src[row+1];
		const double* r_p2 = src[row+2];

		for (int col = 2; col < last; ++col) {
			// Each partial sum adds its operands in the same left-to-right
			// order as the weighted groups of the stencil formula.
			const double w01 = r_m2[col-2] + r_m2[col+2] + r_p2[col-2] + r_p2[col+2];
			const double w02 = r_m2[col-1] + r_m2[col+1] + r_p2[col-1] + r_p2[col+1];
			const double w03 = r_m2[col] + r_p2[col];
			const double w11 = r_m1[col-2] + r_m1[col+2] + r_p1[col-2] + r_p1[col+2];
			const double w12 = r_m1[col-1] + r_m1[col+1] + r_p1[col-1] + r_p1[col+1];
			const double w13 = r_m1[col] + r_p1[col];
			const double w21 = r_0[col-2] + r_0[col+2];
			const double w22 = r_0[col-1] + r_0[col+1];

			dst[row][col] = 0.1*w01 +
				0.2*w02 +
				0.3*w03 +
				1.1*w11 +
				1.2*w12 +
				1.3*w13 +
				2.1*w21 +
				2.2*w22 +
				2.3*r_0[col];
		}
	}
}
(*output_gold)[516][516] = (double (*)[516][516]) getZero3DArray(516, 516, 516); 14 | 15 | host_code ((double*)input, (double*)output, N); 16 | j3d125pt_gold((double*)input, (double*)output_gold, N); 17 | 18 | double error = checkError3D (N, N, (double*)output, (double*) output_gold, 2, N-2, 2, N-2, 2, N-2); 19 | printf("[Test] RMS Error : %e\n",error); 20 | if (error > TOLERANCE) 21 | return -1; 22 | 23 | delete[] input; 24 | delete[] output; 25 | delete[] output_gold; 26 | } 27 | -------------------------------------------------------------------------------- /examples/j3d125pt/j3d125pt.driver.cpp: -------------------------------------------------------------------------------- 1 | #include "common/common.hpp" 2 | #include 3 | #include 4 | 5 | extern "C" void host_code (double*, double*, int); 6 | extern "C" void j3d125pt_gold (double*, double*, int); 7 | 8 | int main(int argc, char** argv) { 9 | int N = 516; 10 | 11 | double (*input)[516][516] = (double (*)[516][516]) getRandom3DArray(516, 516, 516); 12 | double (*output)[516][516] = (double (*)[516][516]) getZero3DArray(516, 516, 516); 13 | double (*output_gold)[516][516] = (double (*)[516][516]) getZero3DArray(516, 516, 516); 14 | 15 | host_code ((double*)input, (double*)output, N); 16 | j3d125pt_gold((double*)input, (double*)output_gold, N); 17 | 18 | double error = checkError3D (N, N, (double*)output, (double*) output_gold, 2, N-2, 2, N-2, 2, N-2); 19 | printf("[Test] RMS Error : %e\n",error); 20 | if (error > TOLERANCE) 21 | return -1; 22 | 23 | delete[] input; 24 | delete[] output; 25 | delete[] output_gold; 26 | } 27 | -------------------------------------------------------------------------------- /examples/j3d27pt/j3d27pt_gold.cpp: -------------------------------------------------------------------------------- 1 | extern "C" void j3d27pt_gold (const double* l_in, double* l_out, int N) { 2 | const double (*in)[514][514] = (const double (*)[514][514])l_in; 3 | double (*out)[514][514] = (double 
(*)[514][514])l_out; 4 | 5 | for (int k = 1; k < N-1; k++) { 6 | for (int j = 1; j < N-1; j++) { 7 | for (int i = 1; i < N-1; i++) { 8 | out[k][j][i] = 0.125 * in[k][j][i] + 9 | 1.14 * (in[k-1][j][i] + in[k+1][j][i] + in[k][j-1][i] + 10 | in[k][j+1][i] + in[k][j][i-1] + in[k][j][i+1]) + 11 | 0.75 * (in[k-1][j-1][i-1] + in[k-1][j-1][i+1] + in[k-1][j+1][i-1] + 12 | in[k-1][j+1][i+1] + in[k+1][j-1][i-1] + in[k+1][j-1][i+1] + 13 | in[k+1][j+1][i-1] + in[k+1][j+1][i+1]) + 14 | 1.031 * (in[k-1][j-1][i] + in[k-1][j][i-1] + in[k-1][j][i+1] + 15 | in[k-1][j+1][i] + in[k][j-1][i-1] + in[k][j-1][i+1] + 16 | in[k][j+1][i-1] + in[k][j+1][i+1] + in[k+1][j-1][i] + 17 | in[k+1][j][i-1] + in[k+1][j][i+1] + in[k+1][j+1][i]); 18 | } 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /examples/j2d81pt/j2d81pt_gold.cpp: -------------------------------------------------------------------------------- 1 | extern "C" void j2d81pt_gold (const double* l_in, double* l_out, int N) { 2 | const double (*in)[8200] = (const double (*)[8200])l_in; 3 | double (*out)[8200] = (double (*)[8200])l_out; 4 | 5 | for (int j = 0; j < N-8; j++) { 6 | for (int i = 0; i < N-8; i++) { 7 | out[j][i] = 8 | (in[j][i] + in[j][i+8] + in[j+8][i] + in[j+8][i+8]) * 3.1862206 + 9 | (in[j][i+1] + in[j][i+7] + in[j+1][i] + in[j+1][i+8] + in[j+7][i] + in[j+7][i+8] + in[j+8][i+1] + in[j+8][i+7]) * 4.5339005 + 10 | (in[j][i+2] + in[j][i+6] + in[j+2][i] + in[j+2][i+8] + in[j+6][i] + in[j+6][i+8] + in[j+8][i+2] + in[j+8][i+6]) * -0.000357000 + 11 | (in[j][i+3] + in[j][i+5] + in[j+3][i] + in[j+3][i+8] + in[j+5][i] + in[j+5][i+8] + in[j+8][i+3] + in[j+8][i+5]) * 0.00285600 + 12 | (in[j][i+4] + in[j+4][i+8] + in[j+4][i] + in[j+8][i+4]) * -0.00508225 + 13 | (in[j+1][i+1] + in[j+1][i+7] + in[j+7][i+1] + in[j+7][i+7]) * 0.000645160 + 14 | (in[j+1][i+2] + in[j+1][i+6] + in[j+2][i+1] + in[j+2][i+7] + in[j+6][i+1] + in[j+6][i+7] + in[j+7][i+2] + in[j+7][i+6]) * -0.00508000 + 15 | 
(in[j+1][i+3] + in[j+1][i+5] + in[j+3][i+1] + in[j+3][i+7] + in[j+5][i+1] + in[j+5][i+7] + in[j+7][i+3] + in[j+7][i+5]) * 0.0406400 + 16 | (in[j+1][i+4] + in[j+4][i+1] + in[j+4][i+7] + in[j+7][i+4]) * -0.0723189 + 17 | (in[j+2][i+2] + in[j+2][i+6] + in[j+6][i+2] + in[j+6][i+6]) * 0.0400000 + 18 | (in[j+2][i+3] + in[j+2][i+5] + in[j+3][i+2] + in[j+3][i+6] + in[j+5][i+2] + in[j+5][i+6] + in[j+6][i+3] + in[j+6][i+5]) * -0.320000 + 19 | (in[j+2][i+4] + in[j+4][i+2] + in[j+4][i+6] + in[j+6][i+4]) * 0.569440 + 20 | (in[j+3][i+3] + in[j+3][i+5] + in[j+5][i+3] + in[j+5][i+5]) * 2.56000 + 21 | (in[j+3][i+4] + in[j+4][i+3] + in[j+4][i+5] + in[j+5][i+4]) * -4.55552 + 22 | in[j+4][i+4] * 8.10655; 23 | } 24 | 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /examples/j2d64pt/j2d64pt_gold.cpp: -------------------------------------------------------------------------------- 1 | extern "C" void j2d64pt_gold (const double* l_in, double* l_out, int N) { 2 | const double (*in)[8200] = (const double (*)[8200])l_in; 3 | double (*out)[8200] = (double (*)[8200])l_out; 4 | 5 | for (int j = 4; j < N-4; j++) { 6 | for (int i = 4; i < N-4; i++) { 7 | out[j][i] = 8 | (in[j-4][i-4] - in[j-4][i+4] - in[j+4][i-4] + in[j+4][i+4]) * 1.274495 + 9 | (-in[j-4][i-3] + in[j-4][i+3] + in[j-3][i+4] - in[j-3][i-4] + in[j+3][i-4] - in[j+3][i+4] + in[j+4][i-3] - in[j+4][i+3]) * 0.000136017 + 10 | (in[j-4][i-2] - in[j-4][i+2] + in[j-2][i-4] - in[j-2][i+4] - in[j+2][i-4] + in[j+2][i+4] - in[j+4][i-2] + in[j+4][i+2]) * 0.000714000 + 11 | (-in[j-4][i-1] + in[j-4][i+1] - in[j-1][i-4] + in[j-1][i+4] + in[j+1][i-4] - in[j+1][i+4] + in[j+4][i-1] - in[j+4][i+1]) * 0.00285600 + 12 | (in[j-3][i-3] - in[j-3][i+3] - in[j+3][i-3] + in[j+3][i+3]) * 0.00145161 + 13 | (-in[j-3][i-2] + in[j-3][i+2] - in[j-2][i-3] + in[j-2][i+3] + in[j+2][i-3] - in[j+2][i+3] + in[j+3][i-2] - in[j+3][i+2]) * 0.00762000 + 14 | (in[j-3][i-1] - in[j-3][i+1] + in[j-1][i-3] - in[j-1][i+3] - 
in[j+1][i-3] + in[j+1][i+3] - in[j+3][i-1] + in[j+3][i+1]) * 0.0304800 +
15 | (in[j-2][i-2] - in[j-2][i+2] - in[j+2][i-2] + in[j+2][i+2]) * 0.0400000 +
16 | (-in[j-2][i-1] + in[j-2][i+1] - in[j-1][i-2] + in[j-1][i+2] + in[j+1][i-2] - in[j+1][i+2] + in[j+2][i-1] - in[j+2][i+1]) * 0.160000 +
17 | (in[j-1][i-1] - in[j-1][i+1] - in[j+1][i-1] + in[j+1][i+1]) * 0.640000;
18 | }
19 |
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/examples/j2d25pt/j2d25pt-orig.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include "cuda.h"
3 | #define max(x,y) ((x) > (y)? (x) : (y))
4 | #define min(x,y) ((x) < (y)? (x) : (y))
5 | #define ceil(a,b) ((a) % (b) == 0 ? (a) / (b) : ((a) / (b)) + 1)
6 |
7 | // Exits with a diagnostic if the CUDA runtime has a pending error.
8 | // message: caller-supplied context printed before the runtime's error string.
9 | void check_error (const char* message) {
10 |     cudaError_t error = cudaGetLastError ();
11 |     if (error != cudaSuccess) {
12 |         printf ("CUDA error : %s, %s\n", message, cudaGetErrorString (error));
13 |         exit(-1);
14 |     }
15 | }
16 |
17 | // 25-point 2D stencil; each thread computes one out[j][i].
18 | // l_in/l_out are viewed as row-major arrays with a fixed row length of 8196 doubles.
19 | // Only points with 2 <= i,j <= N-3 are written; the 2-cell border is skipped.
20 | __global__ void j2d25pt (double * __restrict__ l_in, double * __restrict__ l_out, int N) {
21 |     //Determining the block's indices
22 |     int i0 = (int)(blockIdx.x)*(int)(blockDim.x);
23 |     int i = max(i0,0) + (int)(threadIdx.x);
24 |     int j0 = (int)(blockIdx.y)*(int)(blockDim.y);
25 |     int j = max(j0,0) + (int)(threadIdx.y);
26 |
27 |     double (*in)[8196] = (double (*)[8196]) l_in;
28 |     double (*out)[8196] = (double (*)[8196]) l_out;
29 |
30 |     if (i>=2 & j>=2 & i<=N-3 & j<=N-3) {
31 |         out[j][i] = 0.1*(in[j-2][i-2] + in[j-2][i+2] + in[j+2][i-2] + in[j+2][i+2]) +
32 |             0.2*(in[j-2][i-1] + in[j-2][i+1] + in[j+2][i-1] + in[j+2][i+1]) +
33 |             0.3*(in[j-2][i] + in[j+2][i]) +
34 |             1.1*(in[j-1][i-2] + in[j-1][i+2] + in[j+1][i-2] + in[j+1][i+2]) +
35 |             1.2*(in[j-1][i-1] + in[j-1][i+1] + in[j+1][i-1] + in[j+1][i+1]) +
36 |             1.3*(in[j-1][i] + in[j+1][i]) +
37 |             2.1*(in[j][i-2] + in[j][i+2]) +
38 |             2.2*(in[j][i-1] + in[j][i+1]) +
39 |             2.3*in[j][i];
40 |     }
41 | }
42 |
43 | // Host entry point: copies h_in (N*N doubles) to the device, runs j2d25pt with
44 | // 16x8 thread blocks, and copies the result into h_out.
45 | // The kernel addresses rows with a fixed stride of 8196, so the N*N buffers
46 | // only match its indexing when N == 8196.
47 | extern "C" void host_code (double *h_in, double *h_out, int N) {
48 |     double *in;
49 |     cudaMalloc (&in, sizeof(double)*N*N);
50 |     check_error ("Failed to allocate device memory for in\n");
51 |     cudaMemcpy (in, h_in, sizeof(double)*N*N, cudaMemcpyHostToDevice);
52 |     check_error ("Failed to copy in to device");
53 |     double *out;
54 |     cudaMalloc (&out, sizeof(double)*N*N);
55 |     check_error ("Failed to allocate device memory for out\n");
56 |
57 |     dim3 blockconfig (16, 8);
58 |     dim3 gridconfig (ceil(N, blockconfig.x), ceil(N, blockconfig.y));
59 |
60 |     j2d25pt<<<gridconfig, blockconfig>>> (in, out, N);
61 |     check_error ("Failed to launch j2d25pt");
62 |     // Blocking copy: it waits for the kernel, so execution faults surface here.
63 |     cudaMemcpy (h_out, out, sizeof(double)*N*N, cudaMemcpyDeviceToHost);
64 |     check_error ("Failed to run j2d25pt or copy out to host");
65 |
66 |     cudaFree (in);
67 |     cudaFree (out);
68 | }
69 |
--------------------------------------------------------------------------------
/examples/j2d25pt/reorder.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | awk '/#pragma begin/,/#pragma end/' $1 > stencils
4 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-a.cu
5 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-b.cu
6 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-c.cu
7 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-d.cu
8 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-e.cu
9 |
10 | awk '/#pragma begin/{print $3}' stencils > stencilnames
11 | awk '/unroll/{print $5}' stencils > unrollfactors
12 |
13 | while read -r name
14 | do
15 | uf=`awk 'NR==1' unrollfactors`
16 | sed -i '1d' unrollfactors
17 | awk '/#pragma begin '"$name"'/{flag=1;next} /#pragma end '"$name"'/{flag=0} flag' stencils > $name.idsl
18 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs true --split false
19 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-a.cu
20 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs false --split false
21 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-b.cu
22 | ../../test $name.idsl --out-file $name.cu --unroll
$uf --heuristic 1 --distribute-rhs true --split false 23 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-c.cu 24 | sed -i '/#pragma begin '"$name"'/r 'orig_"$name"'.cu' reordered-d.cu 25 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs false --split false 26 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-e.cu 27 | done < stencilnames 28 | 29 | sed -i '/#pragma begin stencil/d' reordered-a.cu 30 | sed -i '/#pragma end stencil/d' reordered-a.cu 31 | #indent -kr -i8 reordered-a.cu 32 | sed -i '/#pragma begin stencil/d' reordered-b.cu 33 | sed -i '/#pragma end stencil/d' reordered-b.cu 34 | #indent -kr -i8 reordered-b.cu 35 | sed -i '/#pragma begin stencil/d' reordered-c.cu 36 | sed -i '/#pragma end stencil/d' reordered-c.cu 37 | #indent -kr -i8 reordered-c.cu 38 | sed -i '/#pragma begin stencil/d' reordered-d.cu 39 | sed -i '/#pragma end stencil/d' reordered-d.cu 40 | #indent -kr -i8 reordered-d.cu 41 | sed -i '/#pragma begin stencil/d' reordered-e.cu 42 | sed -i '/#pragma end stencil/d' reordered-e.cu 43 | #indent -kr -i8 reordered-e.cu 44 | 45 | rm *~ 46 | -------------------------------------------------------------------------------- /examples/j2d64pt/reorder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | awk '/#pragma begin/,/#pragma end/' $1 > stencils 4 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-a.cu 5 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-b.cu 6 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-c.cu 7 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-d.cu 8 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-e.cu 9 | 10 | awk '/#pragma begin/{print $3}' stencils > stencilnames 11 | awk '/unroll/{print $5}' stencils > unrollfactors 12 | 13 | while read -r name 14 | do 15 | uf=`awk 'NR==1' unrollfactors` 16 | sed -i '1d' unrollfactors 17 | awk '/#pragma begin 
'"$name"'/{flag=1;next} /#pragma end '"$name"'/{flag=0} flag' stencils > $name.idsl 18 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs true --split false 19 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-a.cu 20 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs false --split false 21 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-b.cu 22 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs true --split false 23 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-c.cu 24 | sed -i '/#pragma begin '"$name"'/r 'orig_"$name"'.cu' reordered-d.cu 25 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs false --split false 26 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-e.cu 27 | done < stencilnames 28 | 29 | sed -i '/#pragma begin stencil/d' reordered-a.cu 30 | sed -i '/#pragma end stencil/d' reordered-a.cu 31 | #indent -kr -i8 reordered-a.cu 32 | sed -i '/#pragma begin stencil/d' reordered-b.cu 33 | sed -i '/#pragma end stencil/d' reordered-b.cu 34 | #indent -kr -i8 reordered-b.cu 35 | sed -i '/#pragma begin stencil/d' reordered-c.cu 36 | sed -i '/#pragma end stencil/d' reordered-c.cu 37 | #indent -kr -i8 reordered-c.cu 38 | sed -i '/#pragma begin stencil/d' reordered-d.cu 39 | sed -i '/#pragma end stencil/d' reordered-d.cu 40 | #indent -kr -i8 reordered-d.cu 41 | sed -i '/#pragma begin stencil/d' reordered-e.cu 42 | sed -i '/#pragma end stencil/d' reordered-e.cu 43 | #indent -kr -i8 reordered-e.cu 44 | 45 | rm *~ 46 | -------------------------------------------------------------------------------- /examples/j2d81pt/reorder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | awk '/#pragma begin/,/#pragma end/' $1 > stencils 4 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-a.cu 5 | sed '/#pragma 
begin/,/#pragma end/{//!d}' $1 > reordered-b.cu 6 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-c.cu 7 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-d.cu 8 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-e.cu 9 | 10 | awk '/#pragma begin/{print $3}' stencils > stencilnames 11 | awk '/unroll/{print $5}' stencils > unrollfactors 12 | 13 | while read -r name 14 | do 15 | uf=`awk 'NR==1' unrollfactors` 16 | sed -i '1d' unrollfactors 17 | awk '/#pragma begin '"$name"'/{flag=1;next} /#pragma end '"$name"'/{flag=0} flag' stencils > $name.idsl 18 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs true --split false 19 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-a.cu 20 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs false --split false 21 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-b.cu 22 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs true --split false 23 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-c.cu 24 | sed -i '/#pragma begin '"$name"'/r 'orig_"$name"'.cu' reordered-d.cu 25 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs false --split false 26 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-e.cu 27 | done < stencilnames 28 | 29 | sed -i '/#pragma begin stencil/d' reordered-a.cu 30 | sed -i '/#pragma end stencil/d' reordered-a.cu 31 | #indent -kr -i8 reordered-a.cu 32 | sed -i '/#pragma begin stencil/d' reordered-b.cu 33 | sed -i '/#pragma end stencil/d' reordered-b.cu 34 | #indent -kr -i8 reordered-b.cu 35 | sed -i '/#pragma begin stencil/d' reordered-c.cu 36 | sed -i '/#pragma end stencil/d' reordered-c.cu 37 | #indent -kr -i8 reordered-c.cu 38 | sed -i '/#pragma begin stencil/d' reordered-d.cu 39 | sed -i '/#pragma end stencil/d' reordered-d.cu 40 | #indent -kr -i8 reordered-d.cu 41 | sed -i '/#pragma begin 
stencil/d' reordered-e.cu 42 | sed -i '/#pragma end stencil/d' reordered-e.cu 43 | #indent -kr -i8 reordered-e.cu 44 | 45 | rm *~ 46 | -------------------------------------------------------------------------------- /examples/j3d27pt/reorder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | awk '/#pragma begin/,/#pragma end/' $1 > stencils 4 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-a.cu 5 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-b.cu 6 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-c.cu 7 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-d.cu 8 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-e.cu 9 | 10 | awk '/#pragma begin/{print $3}' stencils > stencilnames 11 | awk '/unroll/{print $5}' stencils > unrollfactors 12 | 13 | while read -r name 14 | do 15 | uf=`awk 'NR==1' unrollfactors` 16 | sed -i '1d' unrollfactors 17 | awk '/#pragma begin '"$name"'/{flag=1;next} /#pragma end '"$name"'/{flag=0} flag' stencils > $name.idsl 18 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs true --split false 19 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-a.cu 20 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs false --split false 21 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-b.cu 22 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs true --split false 23 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-c.cu 24 | sed -i '/#pragma begin '"$name"'/r 'orig_"$name"'.cu' reordered-d.cu 25 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs false --split false 26 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-e.cu 27 | done < stencilnames 28 | 29 | sed -i '/#pragma begin stencil/d' reordered-a.cu 30 | sed -i '/#pragma end stencil/d' reordered-a.cu 31 | 
#indent -kr -i8 reordered-a.cu 32 | sed -i '/#pragma begin stencil/d' reordered-b.cu 33 | sed -i '/#pragma end stencil/d' reordered-b.cu 34 | #indent -kr -i8 reordered-b.cu 35 | sed -i '/#pragma begin stencil/d' reordered-c.cu 36 | sed -i '/#pragma end stencil/d' reordered-c.cu 37 | #indent -kr -i8 reordered-c.cu 38 | sed -i '/#pragma begin stencil/d' reordered-d.cu 39 | sed -i '/#pragma end stencil/d' reordered-d.cu 40 | #indent -kr -i8 reordered-d.cu 41 | sed -i '/#pragma begin stencil/d' reordered-e.cu 42 | sed -i '/#pragma end stencil/d' reordered-e.cu 43 | #indent -kr -i8 reordered-e.cu 44 | 45 | rm *~ 46 | -------------------------------------------------------------------------------- /examples/j3d125pt-new/reorder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | awk '/#pragma begin/,/#pragma end/' $1 > stencils 4 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-a.cu 5 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-b.cu 6 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-c.cu 7 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-d.cu 8 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-e.cu 9 | 10 | awk '/#pragma begin/{print $3}' stencils > stencilnames 11 | awk '/unroll/{print $5}' stencils > unrollfactors 12 | 13 | while read -r name 14 | do 15 | uf=`awk 'NR==1' unrollfactors` 16 | sed -i '1d' unrollfactors 17 | awk '/#pragma begin '"$name"'/{flag=1;next} /#pragma end '"$name"'/{flag=0} flag' stencils > $name.idsl 18 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs true --split false 19 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-a.cu 20 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs false --split false 21 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-b.cu 22 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 
--distribute-rhs true --split false 23 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-c.cu 24 | sed -i '/#pragma begin '"$name"'/r 'orig_"$name"'.cu' reordered-d.cu 25 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs false --split false 26 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-e.cu 27 | done < stencilnames 28 | 29 | sed -i '/#pragma begin stencil/d' reordered-a.cu 30 | sed -i '/#pragma end stencil/d' reordered-a.cu 31 | #indent -kr -i8 reordered-a.cu 32 | sed -i '/#pragma begin stencil/d' reordered-b.cu 33 | sed -i '/#pragma end stencil/d' reordered-b.cu 34 | #indent -kr -i8 reordered-b.cu 35 | sed -i '/#pragma begin stencil/d' reordered-c.cu 36 | sed -i '/#pragma end stencil/d' reordered-c.cu 37 | #indent -kr -i8 reordered-c.cu 38 | sed -i '/#pragma begin stencil/d' reordered-d.cu 39 | sed -i '/#pragma end stencil/d' reordered-d.cu 40 | #indent -kr -i8 reordered-d.cu 41 | sed -i '/#pragma begin stencil/d' reordered-e.cu 42 | sed -i '/#pragma end stencil/d' reordered-e.cu 43 | #indent -kr -i8 reordered-e.cu 44 | 45 | rm *~ 46 | -------------------------------------------------------------------------------- /examples/j3d125pt/reorder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | awk '/#pragma begin/,/#pragma end/' $1 > stencils 4 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-a.cu 5 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-b.cu 6 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-c.cu 7 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-d.cu 8 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-e.cu 9 | 10 | awk '/#pragma begin/{print $3}' stencils > stencilnames 11 | awk '/unroll/{print $5}' stencils > unrollfactors 12 | 13 | while read -r name 14 | do 15 | uf=`awk 'NR==1' unrollfactors` 16 | sed -i '1d' unrollfactors 17 | awk '/#pragma begin '"$name"'/{flag=1;next} /#pragma 
end '"$name"'/{flag=0} flag' stencils > $name.idsl 18 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs true --split false 19 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-a.cu 20 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs false --split false 21 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-b.cu 22 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs true --split false 23 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-c.cu 24 | sed -i '/#pragma begin '"$name"'/r 'orig_"$name"'.cu' reordered-d.cu 25 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs false --split false 26 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-e.cu 27 | done < stencilnames 28 | 29 | sed -i '/#pragma begin stencil/d' reordered-a.cu 30 | sed -i '/#pragma end stencil/d' reordered-a.cu 31 | #indent -kr -i8 reordered-a.cu 32 | sed -i '/#pragma begin stencil/d' reordered-b.cu 33 | sed -i '/#pragma end stencil/d' reordered-b.cu 34 | #indent -kr -i8 reordered-b.cu 35 | sed -i '/#pragma begin stencil/d' reordered-c.cu 36 | sed -i '/#pragma end stencil/d' reordered-c.cu 37 | #indent -kr -i8 reordered-c.cu 38 | sed -i '/#pragma begin stencil/d' reordered-d.cu 39 | sed -i '/#pragma end stencil/d' reordered-d.cu 40 | #indent -kr -i8 reordered-d.cu 41 | sed -i '/#pragma begin stencil/d' reordered-e.cu 42 | sed -i '/#pragma end stencil/d' reordered-e.cu 43 | #indent -kr -i8 reordered-e.cu 44 | 45 | rm *~ 46 | -------------------------------------------------------------------------------- /examples/j2d25pt/j2d25pt-reg.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cuda.h" 3 | #define max(x,y) ((x) > (y)? (x) : (y)) 4 | #define min(x,y) ((x) < (y)? (x) : (y)) 5 | #define ceil(a,b) ((a) % (b) == 0 ? 
(a) / (b) : ((a) / (b)) + 1)
6 |
7 | // Exits with a diagnostic if the CUDA runtime has a pending error.
8 | // message: caller-supplied context printed before the runtime's error string.
9 | void check_error (const char* message) {
10 |     cudaError_t error = cudaGetLastError ();
11 |     if (error != cudaSuccess) {
12 |         printf ("CUDA error : %s, %s\n", message, cudaGetErrorString (error));
13 |         exit(-1);
14 |     }
15 | }
16 |
17 | // 25-point 2D stencil template; the thread's row index j advances in steps of 4.
18 | // The statement between the stencil1 pragma markers looks like the input that
19 | // reorder.sh extracts and replaces with generated code - TODO confirm.
20 | // l_in/l_out are viewed as row-major arrays with a fixed row length of 8196 doubles.
21 | __global__ void j2d25pt (double * __restrict__ l_in, double * __restrict__ l_out, int N) {
22 |     //Determining the block's indices
23 |     int i0 = (int)(blockIdx.x)*(int)(blockDim.x);
24 |     int i = max(i0,2) + (int)(threadIdx.x);
25 |     int j0 = 4*(int)(blockIdx.y)*(int)(blockDim.y);
26 |     int j = max(j0,2) + 4*(int)(threadIdx.y);
27 |
28 |     double (*in)[8196] = (double (*)[8196]) l_in;
29 |     double (*out)[8196] = (double (*)[8196]) l_out;
30 |
31 |     // NOTE(review): this guard only checks row j, while the stencil1 pragma asks
32 |     // for j=4 rows per thread; confirm the generated code never touches rows past N-3.
33 |     if (i>=2 & j>=2 & i<=N-3 & j<=N-3) {
34 |         #pragma begin stencil1 unroll j=4,i=1
35 |         out[j][i] = 0.1*(in[j-2][i-2] + in[j-2][i+2] + in[j+2][i-2] + in[j+2][i+2]) +
36 |             0.2*(in[j-2][i-1] + in[j-2][i+1] + in[j+2][i-1] + in[j+2][i+1]) +
37 |             0.3*(in[j-2][i] + in[j+2][i]) +
38 |             1.1*(in[j-1][i-2] + in[j-1][i+2] + in[j+1][i-2] + in[j+1][i+2]) +
39 |             1.2*(in[j-1][i-1] + in[j-1][i+1] + in[j+1][i-1] + in[j+1][i+1]) +
40 |             1.3*(in[j-1][i] + in[j+1][i]) +
41 |             2.1*(in[j][i-2] + in[j][i+2]) +
42 |             2.2*(in[j][i-1] + in[j][i+1]) +
43 |             2.3*in[j][i];
44 |         #pragma end stencil1
45 |     }
46 | }
47 |
48 | // Host entry point: copies h_in (N*N doubles) to the device, runs j2d25pt with
49 | // 16x8 thread blocks (grid rows sized for 4 rows per thread), and copies the
50 | // result into h_out. The kernel's fixed row stride of 8196 only matches N == 8196.
51 | extern "C" void host_code (double *h_in, double *h_out, int N) {
52 |     double *in;
53 |     cudaMalloc (&in, sizeof(double)*N*N);
54 |     check_error ("Failed to allocate device memory for in\n");
55 |     cudaMemcpy (in, h_in, sizeof(double)*N*N, cudaMemcpyHostToDevice);
56 |     check_error ("Failed to copy in to device");
57 |     double *out;
58 |     cudaMalloc (&out, sizeof(double)*N*N);
59 |     check_error ("Failed to allocate device memory for out\n");
60 |
61 |     dim3 blockconfig (16, 8);
62 |     dim3 gridconfig (ceil(N, blockconfig.x), ceil(N, 4*blockconfig.y));
63 |
64 |     j2d25pt<<<gridconfig, blockconfig>>> (in, out, N);
65 |     check_error ("Failed to launch j2d25pt");
66 |
67 |     // Blocking copy: it waits for the kernel, so execution faults surface here.
68 |     cudaMemcpy (h_out, out, sizeof(double)*N*N, cudaMemcpyDeviceToHost);
69 |     check_error ("Failed to run j2d25pt or copy out to host");
70 |     cudaFree (in);
71 |     cudaFree (out);
72 | }
73 |
--------------------------------------------------------------------------------
/examples/j2d25pt/j2d25pt-unroll.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include "cuda.h"
3 | #define max(x,y) ((x) > (y)? (x) : (y))
4 | #define min(x,y) ((x) < (y)? (x) : (y))
5 | #define ceil(a,b) ((a) % (b) == 0 ? (a) / (b) : ((a) / (b)) + 1)
6 |
7 | // Exits with a diagnostic if the CUDA runtime has a pending error.
8 | // message: caller-supplied context printed before the runtime's error string.
9 | void check_error (const char* message) {
10 |     cudaError_t error = cudaGetLastError ();
11 |     if (error != cudaSuccess) {
12 |         printf ("CUDA error : %s, %s\n", message, cudaGetErrorString (error));
13 |         exit(-1);
14 |     }
15 | }
16 |
17 | // 25-point 2D stencil, unrolled by 4 along j: each thread handles column i
18 | // for rows j..j+3.
19 | // l_in/l_out are viewed as row-major arrays with a fixed row length of 8196 doubles.
20 | // Only points with 2 <= i <= N-3 and 2 <= row <= N-3 are written.
21 | __global__ void j2d25pt (double * __restrict__ l_in, double * __restrict__ l_out, int N) {
22 |     //Determining the block's indices
23 |     int i0 = (int)(blockIdx.x)*(int)(blockDim.x);
24 |     int i = max(i0,2) + (int)(threadIdx.x);
25 |     int j0 = 4*(int)(blockIdx.y)*(int)(blockDim.y);
26 |     int j = max(j0,2) + 4*(int)(threadIdx.y);
27 |
28 |     double (*in)[8196] = (double (*)[8196]) l_in;
29 |     double (*out)[8196] = (double (*)[8196]) l_out;
30 |
31 |     if (i>=2 & j>=2 & i<=N-3 & j<=N-3) {
32 |         #pragma unroll 4
33 |         for (int jj=0; jj<=3; jj++) {
34 |             // The outer guard only covers row j; rows j+1..j+3 can run past N-3
35 |             // (e.g. N=8196, j=8192), which used to read in[] beyond row N-1.
36 |             if (j+jj <= N-3) {
37 |                 out[j+jj][i] = 0.1*(in[j+jj-2][i-2] + in[j+jj-2][i+2] + in[j+jj+2][i-2] + in[j+jj+2][i+2]) +
38 |                     0.2*(in[j+jj-2][i-1] + in[j+jj-2][i+1] + in[j+jj+2][i-1] + in[j+jj+2][i+1]) +
39 |                     0.3*(in[j+jj-2][i] + in[j+jj+2][i]) +
40 |                     1.1*(in[j+jj-1][i-2] + in[j+jj-1][i+2] + in[j+jj+1][i-2] + in[j+jj+1][i+2]) +
41 |                     1.2*(in[j+jj-1][i-1] + in[j+jj-1][i+1] + in[j+jj+1][i-1] + in[j+jj+1][i+1]) +
42 |                     1.3*(in[j+jj-1][i] + in[j+jj+1][i]) +
43 |                     2.1*(in[j+jj][i-2] + in[j+jj][i+2]) +
44 |                     2.2*(in[j+jj][i-1] + in[j+jj][i+1]) +
45 |                     2.3*in[j+jj][i];
46 |             }
47 |         }
48 |     }
49 | }
50 |
51 | // Host entry point: copies h_in (N*N doubles) to the device, runs j2d25pt with
52 | // 16x8 thread blocks (grid rows sized for 4 rows per thread), and copies the
53 | // result into h_out. The kernel's fixed row stride of 8196 only matches N == 8196.
54 | extern "C" void host_code (double *h_in, double *h_out, int N) {
55 |     double *in;
56 |     cudaMalloc (&in, sizeof(double)*N*N);
57 |     check_error ("Failed to allocate device memory for in\n");
58 |     cudaMemcpy (in, h_in, sizeof(double)*N*N, cudaMemcpyHostToDevice);
59 |     check_error ("Failed to copy in to device");
60 |     double *out;
61 |     cudaMalloc (&out, sizeof(double)*N*N);
62 |     check_error ("Failed to allocate device memory for out\n");
63 |
64 |     dim3 blockconfig (16, 8);
65 |     dim3 gridconfig (ceil(N, blockconfig.x), ceil(N, 4*blockconfig.y));
66 |
67 |     j2d25pt<<<gridconfig, blockconfig>>> (in, out, N);
68 |     check_error ("Failed to launch j2d25pt");
69 |     // Blocking copy: it waits for the kernel, so execution faults surface here.
70 |     cudaMemcpy (h_out, out, sizeof(double)*N*N, cudaMemcpyDeviceToHost);
71 |     check_error ("Failed to run j2d25pt or copy out to host");
72 |
73 |     cudaFree (in);
74 |     cudaFree (out);
75 | }
76 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | LEX=lex
2 | YACC=yacc
3 | CC=g++
4 |
5 | CFLAGS=-g -O3 -std=c++14
6 | OPTFLAGS=-DDEBUG=false -DDROP_COEFS=true -DASSOC_MULT=false -DAVAIL_EXPR_OPT=false -DRETAIN_SIMPLE_OPS=false -DPERM_LIMIT=5040 -DSPLICE_EQUALITY=false -DPAR_LOADS=1 -DEXPLICIT_LOADS=false -DGEN_FMA=true -DPRINT_INTRINSICS=false
7 | INTEROPTFLAGS=-DRESTRICT_INTER_OPT=true -DSPLICE_TEMP_LABELS=false -DINTRA_TYPE_INTER_OPT=false
8 | REGALLOCFLAGS=-DFIRST_LEVEL=false -DSECOND_LEVEL=true -DOPERATION_VIEW=false
9 |
10 | default : test
11 |
12 | treenode.o : treenode.cpp treenode.hpp utils.hpp sort.hpp
13 | $(CC) $(CFLAGS) $(OPTFLAGS) $(INTEROPTFLAGS) $(REGALLOCFLAGS) -o treenode.o -c treenode.cpp
14 | exprnode.o : exprnode.cpp exprnode.hpp utils.hpp sort.hpp
15 | $(CC) $(CFLAGS) $(OPTFLAGS) $(INTEROPTFLAGS) $(REGALLOCFLAGS) -o exprnode.o -c exprnode.cpp
16 | funcdefn.o : funcdefn.cpp tree-reg-funcdefn.cpp funcdefn.hpp
17 | $(CC) $(CFLAGS) $(OPTFLAGS) $(INTEROPTFLAGS) $(REGALLOCFLAGS) -o funcdefn.o -c funcdefn.cpp
18 | vardecl.o : vardecl.cpp vardecl.hpp utils.hpp sort.hpp
19 | $(CC) $(CFLAGS) $(OPTFLAGS) $(INTEROPTFLAGS) $(REGALLOCFLAGS) -o vardecl.o -c vardecl.cpp
20 | codegen.o : codegen.cpp codegen.hpp utils.hpp sort.hpp
21 | $(CC) $(CFLAGS) $(OPTFLAGS) $(INTEROPTFLAGS) $(REGALLOCFLAGS) -o codegen.o -c codegen.cpp
22 | test : lex.yy.c y.tab.c main.cpp utils.hpp sort.hpp treenode.o exprnode.o funcdefn.o
vardecl.o codegen.o 23 | $(CC) $(CFLAGS) $(OPTFLAGS) $(REGALLOCFLAGS) -o test main.cpp treenode.o exprnode.o codegen.o funcdefn.o vardecl.o lex.yy.c y.tab.c 24 | 25 | all : lex.yy.c y.tab.c main.cpp 26 | $(CC) $(CFLAGS) $(OPTFLAGS) $(INTEROPTFLAGS) $(REGALLOCFLAGS) -o treenode.o -c treenode.cpp 27 | $(CC) $(CFLAGS) $(OPTFLAGS) $(INTEROPTFLAGS) $(REGALLOCFLAGS) -o exprnode.o -c exprnode.cpp 28 | $(CC) $(CFLAGS) $(OPTFLAGS) $(INTEROPTFLAGS) $(REGALLOCFLAGS) -o funcdefn.o -c funcdefn.cpp 29 | $(CC) $(CFLAGS) $(OPTFLAGS) $(INTEROPTFLAGS) $(REGALLOCFLAGS) -o vardecl.o -c vardecl.cpp 30 | $(CC) $(CFLAGS) $(OPTFLAGS) $(INTEROPTFLAGS) $(REGALLOCFLAGS) -o codegen.o -c codegen.cpp 31 | $(CC) $(CFLAGS) $(OPTFLAGS) $(INTEROPTFLAGS) $(REGALLOCFLAGS) -o test main.cpp treenode.o exprnode.o codegen.o funcdefn.o vardecl.o lex.yy.c y.tab.c 32 | 33 | lex.yy.c : scanner.l 34 | $(LEX) scanner.l 35 | 36 | y.tab.c : grammar.y 37 | $(YACC) -d grammar.y 38 | 39 | clean: 40 | -@rm *.o lex.yy.* y.tab.* out.cu orig_out.cu test 2>/dev/null || true 41 | -------------------------------------------------------------------------------- /examples/j3d125pt/common/time.awk: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo "-------------------- NVCC ---------------------" 5 | 6 | time=`grep -E 'float|double' nvcc-orig-results | awk 'BEGIN {time = 0.0} {time += $2} END {print time}'` 7 | awk -v otime=$time 'BEGIN {print "Original GFlops = " (512*512*512*130/10^6/otime)}' 8 | 9 | time=`grep -E 'float|double' nvcc-unroll-results | awk 'BEGIN {time = 0.0} {time += $2} END {print time}'` 10 | awk -v utime=$time 'BEGIN {print "Unrolled GFlops = " (512*512*512*130/10^6/utime)}' 11 | 12 | timea=`grep -E 'float|double' nvcc-reorder-results-a | awk 'BEGIN {timea = 0.0} {timea += $2} END {print timea}'` 13 | timec=`grep -E 'float|double' nvcc-reorder-results-c | awk 'BEGIN {timec = 0.0} {timec += $2} END {print timec}'` 14 | timed=`grep -E 
'float|double' nvcc-reorder-results-d | awk 'BEGIN {timed = 0.0} {timed += $2} END {print timed}'` 15 | timee=`grep -E 'float|double' nvcc-reorder-results-e | awk 'BEGIN {timee = 0.0} {timee += $2} END {print timee}'` 16 | min2=`awk -v atime=$timea -v ctime=$timec 'BEGIN {print (ctime 3 | #include 4 | 5 | extern "C" void derivative_gold (double *h_r1, double *h_u1, double *h_u2, double *h_u3, double *h_mu, double *h_la, double *h_met1, double *h_met2, double *h_met3, double *h_met4, double *, double *, double c1, double c2, int N); 6 | extern "C" void host_code (double *h_r1, double *h_u1, double *h_u2, double *h_u3, double *h_mu, double *h_la, double *h_met1, double *h_met2, double *h_met3, double *h_met4, double *, double *, double c1, double c2, int N); 7 | 8 | int main(int argc, char** argv) { 9 | int N = 304; 10 | 11 | double (*r_gold_0)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 12 | double (*mu)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 13 | double (*la)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 14 | double (*met1)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 15 | double (*met2)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 16 | double (*met3)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 17 | double (*met4)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 18 | double (*u1)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 19 | double (*u2)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 20 | double (*u3)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 21 | double (*r_0)[304][304] = (double (*)[304][304]) getZero3DArray(304, 304, 304); 22 | memcpy(r_0, r_gold_0, sizeof(double)*304*304*304); 23 | double *strx = (double *) getRandom1DArray(304); 24 | double *stry = (double *) getRandom1DArray(304); 25 | 26 | double c1 = 0.32; 27 | double c2 = 
0.43; 28 | derivative_gold ((double*)r_gold_0, (double *)u1, (double *)u2, (double *)u3, (double*)mu, (double*)la, (double*)met1, (double*)met2, (double *)met3, (double*)met4, strx, stry, c1, c2, N); 29 | host_code ((double*)r_0, (double *)u1, (double *)u2, (double *)u3, (double*)mu, (double*)la, (double*)met1, (double*)met2, (double *)met3, (double*)met4, strx, stry, c1, c2, N); 30 | double error_0 = checkError3D (N, N, (double*)r_0, (double*)r_gold_0, 2, N-2, 2, N-2, 2, N-2); 31 | printf("[Test] RMS Error : %e\n",error_0); 32 | if (error_0 > TOLERANCE) 33 | return -1; 34 | } 35 | -------------------------------------------------------------------------------- /examples/derivative-maxfuse/derivative.driver.cpp: -------------------------------------------------------------------------------- 1 | #include "common/common.hpp" 2 | #include 3 | #include 4 | 5 | extern "C" void derivative_gold (double *h_r1, double *h_u1, double *h_u2, double *h_u3, double *h_mu, double *h_la, double *h_met1, double *h_met2, double *h_met3, double *h_met4, double *, double *, double c1, double c2, int N); 6 | extern "C" void host_code (double *h_r1, double *h_u1, double *h_u2, double *h_u3, double *h_mu, double *h_la, double *h_met1, double *h_met2, double *h_met3, double *h_met4, double *, double *, double c1, double c2, int N); 7 | 8 | int main(int argc, char** argv) { 9 | int N = 304; 10 | 11 | double (*r_gold_0)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 12 | double (*mu)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 13 | double (*la)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 14 | double (*met1)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 15 | double (*met2)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 16 | double (*met3)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 17 | double (*met4)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 
304, 304); 18 | double (*u1)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 19 | double (*u2)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 20 | double (*u3)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 21 | double (*r_0)[304][304] = (double (*)[304][304]) getZero3DArray(304, 304, 304); 22 | memcpy(r_0, r_gold_0, sizeof(double)*304*304*304); 23 | double *strx = (double *) getRandom1DArray(304); 24 | double *stry = (double *) getRandom1DArray(304); 25 | 26 | double c1 = 0.32; 27 | double c2 = 0.43; 28 | derivative_gold ((double*)r_gold_0, (double *)u1, (double *)u2, (double *)u3, (double*)mu, (double*)la, (double*)met1, (double*)met2, (double *)met3, (double*)met4, strx, stry, c1, c2, N); 29 | host_code ((double*)r_0, (double *)u1, (double *)u2, (double *)u3, (double*)mu, (double*)la, (double*)met1, (double*)met2, (double *)met3, (double*)met4, strx, stry, c1, c2, N); 30 | double error_0 = checkError3D (N, N, (double*)r_0, (double*)r_gold_0, 2, N-2, 2, N-2, 2, N-2); 31 | printf("[Test] RMS Error : %e\n",error_0); 32 | if (error_0 > TOLERANCE) 33 | return -1; 34 | } 35 | -------------------------------------------------------------------------------- /examples/j3d27pt/j3d27pt-orig.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cuda.h" 3 | #define max(x,y) ((x) > (y)? (x) : (y)) 4 | #define min(x,y) ((x) < (y)? (x) : (y)) 5 | #define ceil(a,b) ((a) % (b) == 0 ? 
(a) / (b) : ((a) / (b)) + 1) 6 | 7 | void check_error (const char* message) { 8 | cudaError_t error = cudaGetLastError (); 9 | if (error != cudaSuccess) { 10 | printf ("CUDA error : %s, %s\n", message, cudaGetErrorString (error)); 11 | exit(-1); 12 | } 13 | } 14 | 15 | __global__ void j3d27pt (double * __restrict__ t_in, double * __restrict__ t_out, int N) { 16 | //Determing the block's indices 17 | int i0 = (int)(blockIdx.x)*(int)(blockDim.x) + 1; 18 | int i = max(i0,1) + (int)(threadIdx.x); 19 | int j0 = (int)(blockIdx.y)*(int)(blockDim.y) + 1; 20 | int j = max(j0,1) + (int)(threadIdx.y); 21 | int k0 = (int)(blockIdx.z)*(int)(blockDim.z) + 1; 22 | int k = max(k0,1) + (int)(threadIdx.z); 23 | 24 | double (*in)[514][514] = (double (*)[514][514])t_in; 25 | double (*out)[514][514] = (double (*)[514][514])t_out; 26 | 27 | if (i<=N-2 & j<=N-2 && k<=N-2) { 28 | out[k][j][i] = 0.125 * in[k][j][i] + 29 | 1.14 * (in[k-1][j][i] + in[k+1][j][i] + in[k][j-1][i] + 30 | in[k][j+1][i] + in[k][j][i-1] + in[k][j][i+1]) + 31 | 0.75 * (in[k-1][j-1][i-1] + in[k-1][j-1][i+1] + in[k-1][j+1][i-1] + 32 | in[k-1][j+1][i+1] + in[k+1][j-1][i-1] + in[k+1][j-1][i+1] + 33 | in[k+1][j+1][i-1] + in[k+1][j+1][i+1]) + 34 | 1.031 * (in[k-1][j-1][i] + in[k-1][j][i-1] + in[k-1][j][i+1] + 35 | in[k-1][j+1][i] + in[k][j-1][i-1] + in[k][j-1][i+1] + 36 | in[k][j+1][i-1] + in[k][j+1][i+1] + in[k+1][j-1][i] + 37 | in[k+1][j][i-1] + in[k+1][j][i+1] + in[k+1][j+1][i]); 38 | } 39 | } 40 | 41 | extern "C" void host_code (double *h_in, double *h_out, int N) { 42 | double *in; 43 | cudaMalloc (&in, sizeof(double)*N*N*N); 44 | check_error ("Failed to allocate device memory for in\n"); 45 | cudaMemcpy (in, h_in, sizeof(double)*N*N*N, cudaMemcpyHostToDevice); 46 | double *out; 47 | cudaMalloc (&out, sizeof(double)*N*N*N); 48 | check_error ("Failed to allocate device memory for out\n"); 49 | dim3 blockconfig (16, 4, 4); 50 | dim3 gridconfig (ceil(N-2, blockconfig.x), ceil(N-2, blockconfig.y), ceil(N-2, 
blockconfig.z)); 51 | 52 | j3d27pt<<>> (in, out, N); 53 | cudaMemcpy (h_out, out, sizeof(double)*N*N*N, cudaMemcpyDeviceToHost); 54 | 55 | cudaFree (in); 56 | cudaFree (out); 57 | } 58 | -------------------------------------------------------------------------------- /scanner.l: -------------------------------------------------------------------------------- 1 | %{ 2 | #include "stdio.h" 3 | #include "y.tab.h" 4 | #include "utils.hpp" 5 | %} 6 | %option noyywrap 7 | 8 | DIGIT [0-9]+ 9 | ID [a-zA-Z][a-zA-Z0-9_]* 10 | %% 11 | 12 | coefficient { 13 | return COEFFICIENT; 14 | } 15 | 16 | parameter { 17 | return PARAMETER; 18 | } 19 | 20 | function { 21 | return FUNCTION; 22 | } 23 | 24 | temporary { 25 | return TEMPORARY; 26 | } 27 | 28 | unroll { 29 | return UNROLL; 30 | } 31 | 32 | iterator { 33 | return ITERATOR; 34 | } 35 | 36 | reglimit { 37 | return REGLIMIT; 38 | } 39 | 40 | bool { 41 | yylval.ival = BOOL; 42 | return DATATYPE; 43 | } 44 | 45 | float { 46 | yylval.ival = FLOAT; 47 | return DATATYPE; 48 | } 49 | 50 | double { 51 | yylval.ival = DOUBLE; 52 | return DATATYPE; 53 | } 54 | 55 | int { 56 | yylval.ival = INT; 57 | return DATATYPE; 58 | } 59 | 60 | true { 61 | yylval.bval = true; 62 | return TRUE; 63 | } 64 | 65 | false { 66 | yylval.bval = false; 67 | return FALSE; 68 | } 69 | 70 | {ID} { 71 | yylval.str = strdup (yytext); 72 | return ID; 73 | } 74 | 75 | {DIGIT} { 76 | yylval.ival = atoi(yytext); 77 | return T_INT; 78 | } 79 | 80 | {DIGIT}"."{DIGIT}"f" { 81 | yylval.fval = atof(yytext); 82 | return T_FLOAT; 83 | } 84 | 85 | {DIGIT}"."{DIGIT}"F" { 86 | yylval.fval = atof(yytext); 87 | return T_FLOAT; 88 | } 89 | 90 | {DIGIT}"f" { 91 | yylval.fval = atof(yytext); 92 | return T_FLOAT; 93 | } 94 | 95 | {DIGIT}"F" { 96 | yylval.fval = atof(yytext); 97 | return T_FLOAT; 98 | } 99 | 100 | {DIGIT}"."{DIGIT} { 101 | yylval.dval = atof(yytext); 102 | return T_DOUBLE; 103 | } 104 | 105 | {DIGIT}"e"{DIGIT} { 106 | yylval.dval = atof(yytext); 107 | return 
T_DOUBLE; 108 | } 109 | 110 | {DIGIT}"E"{DIGIT} { 111 | yylval.dval = atof(yytext); 112 | return T_DOUBLE; 113 | } 114 | 115 | ";" | 116 | "+" | 117 | "(" | 118 | ")" | 119 | "{" | 120 | "}" | 121 | "|" | 122 | "[" | 123 | "]" | 124 | "&" | 125 | "%" | 126 | "^" | 127 | "," | 128 | ":" | 129 | "-" | 130 | ">" | 131 | "<" | 132 | "=" | 133 | "*" | 134 | "/" { 135 | return yytext[0]; 136 | } 137 | 138 | "<=" { 139 | return LEQ; 140 | } 141 | 142 | ">=" { 143 | return GEQ; 144 | } 145 | 146 | "==" { 147 | return EQ; 148 | } 149 | 150 | "!=" { 151 | return NEQ; 152 | } 153 | 154 | "+=" { 155 | return PLUSEQ; 156 | } 157 | 158 | "-=" { 159 | return MINUSEQ; 160 | } 161 | 162 | "*=" { 163 | return MULTEQ; 164 | } 165 | 166 | "/=" { 167 | return DIVEQ; 168 | } 169 | 170 | "|=" { 171 | return OREQ; 172 | } 173 | 174 | "&=" { 175 | return ANDEQ; 176 | } 177 | 178 | ".." { 179 | return DDOTS; 180 | } 181 | 182 | "//" { 183 | return COMMENT; 184 | } 185 | 186 | [ \t\n]+ {} 187 | 188 | <> { 189 | return 0; 190 | } 191 | %% 192 | -------------------------------------------------------------------------------- /examples/derivative-maxfuse/reorder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | awk '/#pragma begin/,/#pragma end/' $1 > stencils 4 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-a.cu 5 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-b.cu 6 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-c.cu 7 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-d.cu 8 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-g.cu 9 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-h.cu 10 | 11 | awk '/#pragma begin/{print $3}' stencils > stencilnames 12 | awk '/unroll/{print $5}' stencils > unrollfactors 13 | 14 | while read -r name 15 | do 16 | uf=`awk 'NR==1' unrollfactors` 17 | sed -i '1d' unrollfactors 18 | awk '/#pragma begin '"$name"'/{flag=1;next} /#pragma end 
'"$name"'/{flag=0} flag' stencils > $name.idsl 19 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs true --topo-sort false --split false 20 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-a.cu 21 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs true --topo-sort true --split false 22 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-b.cu 23 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs false --topo-sort false --split false 24 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-c.cu 25 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs false --topo-sort true --split false 26 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-d.cu 27 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs false --topo-sort false --split false 28 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-g.cu 29 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs false --topo-sort true --split false 30 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-h.cu 31 | 32 | done < stencilnames 33 | 34 | sed -i '/#pragma begin stencil/d' reordered-a.cu 35 | sed -i '/#pragma end stencil/d' reordered-a.cu 36 | #indent -kr -i8 reordered-a.cu 37 | sed -i '/#pragma begin stencil/d' reordered-b.cu 38 | sed -i '/#pragma end stencil/d' reordered-b.cu 39 | #indent -kr -i8 reordered-b.cu 40 | sed -i '/#pragma begin stencil/d' reordered-c.cu 41 | sed -i '/#pragma end stencil/d' reordered-c.cu 42 | #indent -kr -i8 reordered-c.cu 43 | sed -i '/#pragma begin stencil/d' reordered-d.cu # strip the stencil markers from variant d (generated in the loop above) 44 | sed -i '/#pragma end stencil/d' reordered-d.cu 45 | #indent -kr -i8 reordered-d.cu 46 | sed -i '/#pragma begin stencil/d' reordered-g.cu 47 | sed -i '/#pragma end stencil/d' reordered-g.cu 48 | #indent -kr -i8 reordered-g.cu 49 | sed -i '/#pragma
begin stencil/d' reordered-h.cu 50 | sed -i '/#pragma end stencil/d' reordered-h.cu 51 | #indent -kr -i8 reordered-h.cu 52 | #rm *~ 53 | -------------------------------------------------------------------------------- /examples/j2d25pt/common/time.awk: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo "-------------------- NVCC ---------------------" 5 | 6 | time=`grep -E 'float|double' nvcc-orig-results | awk 'BEGIN {time = 0.0} {time += $2} END {print time}'` 7 | awk -v otime=$time 'BEGIN {print "Original GFlops = " (8192*8192*33/10^6/otime)}' 8 | 9 | time=`grep -E 'float|double' nvcc-unroll-results | awk 'BEGIN {time = 0.0} {time += $2} END {print time}'` 10 | awk -v utime=$time 'BEGIN {print "Unrolled GFlops = " (8192*8192*33/10^6/utime)}' 11 | 12 | timea=`grep -E 'float|double' nvcc-reorder-results-a | awk 'BEGIN {timea = 0.0} {timea += $2} END {print timea}'` 13 | timeb=`grep -E 'float|double' nvcc-reorder-results-b | awk 'BEGIN {timeb = 0.0} {timeb += $2} END {print timeb}'` 14 | timec=`grep -E 'float|double' nvcc-reorder-results-c | awk 'BEGIN {timec = 0.0} {timec += $2} END {print timec}'` 15 | timed=`grep -E 'float|double' nvcc-reorder-results-d | awk 'BEGIN {timed = 0.0} {timed += $2} END {print timed}'` 16 | timee=`grep -E 'float|double' nvcc-reorder-results-e | awk 'BEGIN {timee = 0.0} {timee += $2} END {print timee}'` 17 | min1=`awk -v atime=$timea -v btime=$timeb 'BEGIN {print (atime (y)? (x) : (y)) 4 | #define min(x,y) ((x) < (y)? (x) : (y)) 5 | #define ceil(a,b) ((a) % (b) == 0 ? 
(a) / (b) : ((a) / (b)) + 1) 6 | 7 | void check_error (const char* message) { 8 | cudaError_t error = cudaGetLastError (); 9 | if (error != cudaSuccess) { 10 | printf ("CUDA error : %s, %s\n", message, cudaGetErrorString (error)); 11 | exit(-1); 12 | } 13 | } 14 | 15 | __global__ void j3d27pt (double * __restrict__ t_in, double * __restrict__ t_out, int N) { 16 | //Determing the block's indices 17 | int i0 = (int)(blockIdx.x)*(int)(blockDim.x) + 1; 18 | int i = max(i0,1) + (int)(threadIdx.x); 19 | int j0 = 4*(int)(blockIdx.y)*(int)(blockDim.y) + 1; 20 | int j = max(j0,1) + 4*(int)(threadIdx.y); 21 | int k0 = (int)(blockIdx.z)*(int)(blockDim.z) + 1; 22 | int k = max(k0,1) + (int)(threadIdx.z); 23 | 24 | double (*in)[514][514] = (double (*)[514][514])t_in; 25 | double (*out)[514][514] = (double (*)[514][514])t_out; 26 | 27 | if (i<=N-2 & j<=N-2 && k<=N-2) { 28 | #pragma begin stencil1 unroll k=1,j=4,i=1 29 | out[k][j][i] = 0.125 * in[k][j][i] + 30 | 1.14 * (in[k-1][j][i] + in[k+1][j][i] + in[k][j-1][i] + 31 | in[k][j+1][i] + in[k][j][i-1] + in[k][j][i+1]) + 32 | 0.75 * (in[k-1][j-1][i-1] + in[k-1][j-1][i+1] + in[k-1][j+1][i-1] + 33 | in[k-1][j+1][i+1] + in[k+1][j-1][i-1] + in[k+1][j-1][i+1] + 34 | in[k+1][j+1][i-1] + in[k+1][j+1][i+1]) + 35 | 1.031 * (in[k-1][j-1][i] + in[k-1][j][i-1] + in[k-1][j][i+1] + 36 | in[k-1][j+1][i] + in[k][j-1][i-1] + in[k][j-1][i+1] + 37 | in[k][j+1][i-1] + in[k][j+1][i+1] + in[k+1][j-1][i] + 38 | in[k+1][j][i-1] + in[k+1][j][i+1] + in[k+1][j+1][i]); 39 | #pragma end stencil1 40 | } 41 | } 42 | 43 | extern "C" void host_code (double *h_in, double *h_out, int N) { 44 | double *in; 45 | cudaMalloc (&in, sizeof(double)*N*N*N); 46 | check_error ("Failed to allocate device memory for in\n"); 47 | cudaMemcpy (in, h_in, sizeof(double)*N*N*N, cudaMemcpyHostToDevice); 48 | double *out; 49 | cudaMalloc (&out, sizeof(double)*N*N*N); 50 | check_error ("Failed to allocate device memory for out\n"); 51 | 52 | dim3 blockconfig (32,4,4); 53 | dim3 
gridconfig (ceil(N-2, blockconfig.x), ceil(N-2, 4*blockconfig.y), ceil(N-2, blockconfig.z)); 54 | 55 | j3d27pt<<>> (in, out, N); 56 | cudaMemcpy (h_out, out, sizeof(double)*N*N*N, cudaMemcpyDeviceToHost); 57 | 58 | cudaFree (in); 59 | cudaFree (out); 60 | } 61 | -------------------------------------------------------------------------------- /examples/j3d27pt/common/time.awk: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo "-------------------- NVCC ---------------------" 5 | 6 | time=`grep -E 'float|double' nvcc-orig-results | awk 'BEGIN {time = 0.0} {time += $2} END {print time}'` 7 | awk -v otime=$time 'BEGIN {print "Original GFlops = " (512*512*512*30/10^6/otime)}' 8 | 9 | time=`grep -E 'float|double' nvcc-unroll-results | awk 'BEGIN {time = 0.0} {time += $2} END {print time}'` 10 | awk -v utime=$time 'BEGIN {print "Unrolled GFlops = " (512*512*512*30/10^6/utime)}' 11 | 12 | timea=`grep -E 'float|double' nvcc-reorder-results-a | awk 'BEGIN {timea = 0.0} {timea += $2} END {print timea}'` 13 | timeb=`grep -E 'float|double' nvcc-reorder-results-b | awk 'BEGIN {timeb = 0.0} {timeb += $2} END {print timeb}'` 14 | timec=`grep -E 'float|double' nvcc-reorder-results-c | awk 'BEGIN {timec = 0.0} {timec += $2} END {print timec}'` 15 | timed=`grep -E 'float|double' nvcc-reorder-results-d | awk 'BEGIN {timed = 0.0} {timed += $2} END {print timed}'` 16 | timee=`grep -E 'float|double' nvcc-reorder-results-e | awk 'BEGIN {timee = 0.0} {timee += $2} END {print timee}'` 17 | min1=`awk -v atime=$timea -v btime=$timeb 'BEGIN {print (atime (y)? (x) : (y)) 4 | #define min(x,y) ((x) < (y)? (x) : (y)) 5 | #define ceil(a,b) ((a) % (b) == 0 ? 
(a) / (b) : ((a) / (b)) + 1) 6 | 7 | void check_error (const char* message) { 8 | cudaError_t error = cudaGetLastError (); 9 | if (error != cudaSuccess) { 10 | printf ("CUDA error : %s, %s\n", message, cudaGetErrorString (error)); 11 | exit(-1); 12 | } 13 | } 14 | 15 | __global__ void j3d27pt (double * __restrict__ t_in, double * __restrict__ t_out, int N) { 16 | //Determing the block's indices 17 | int i0 = (int)(blockIdx.x)*(int)(blockDim.x) + 1; 18 | int i = max(i0,1) + (int)(threadIdx.x); 19 | int j0 = 4*(int)(blockIdx.y)*(int)(blockDim.y) + 1; 20 | int j = max(j0,1) + 4*(int)(threadIdx.y); 21 | int k0 = (int)(blockIdx.z)*(int)(blockDim.z) + 1; 22 | int k = max(k0,1) + (int)(threadIdx.z); 23 | 24 | double (*in)[514][514] = (double (*)[514][514])t_in; 25 | double (*out)[514][514] = (double (*)[514][514])t_out; 26 | 27 | if (i<=N-2 & j<=N-2 && k<=N-2) { 28 | #pragma unroll 4 29 | for (int jj=0; jj<=3; jj++) { 30 | out[k][j+jj][i] = 0.125 * in[k][j+jj][i] + 31 | 1.14 * (in[k-1][j+jj][i] + in[k+1][j+jj][i] + in[k][j+jj-1][i] + 32 | in[k][j+jj+1][i] + in[k][j+jj][i-1] + in[k][j+jj][i+1]) + 33 | 0.75 * (in[k-1][j+jj-1][i-1] + in[k-1][j+jj-1][i+1] + in[k-1][j+jj+1][i-1] + 34 | in[k-1][j+jj+1][i+1] + in[k+1][j+jj-1][i-1] + in[k+1][j+jj-1][i+1] + 35 | in[k+1][j+jj+1][i-1] + in[k+1][j+jj+1][i+1]) + 36 | 1.031 * (in[k-1][j+jj-1][i] + in[k-1][j+jj][i-1] + in[k-1][j+jj][i+1] + 37 | in[k-1][j+jj+1][i] + in[k][j+jj-1][i-1] + in[k][j+jj-1][i+1] + 38 | in[k][j+jj+1][i-1] + in[k][j+jj+1][i+1] + in[k+1][j+jj-1][i] + 39 | in[k+1][j+jj][i-1] + in[k+1][j+jj][i+1] + in[k+1][j+jj+1][i]); 40 | } 41 | } 42 | } 43 | 44 | extern "C" void host_code (double *h_in, double *h_out, int N) { 45 | double *in; 46 | cudaMalloc (&in, sizeof(double)*N*N*N); 47 | check_error ("Failed to allocate device memory for in\n"); 48 | cudaMemcpy (in, h_in, sizeof(double)*N*N*N, cudaMemcpyHostToDevice); 49 | double *out; 50 | cudaMalloc (&out, sizeof(double)*N*N*N); 51 | check_error ("Failed to allocate device 
memory for out\n"); 52 | 53 | dim3 blockconfig (16, 4, 4); 54 | dim3 gridconfig (ceil(N-2, blockconfig.x), ceil(N-2, 4*blockconfig.y), ceil(N-2, blockconfig.z)); 55 | 56 | j3d27pt<<>> (in, out, N); 57 | cudaMemcpy (h_out, out, sizeof(double)*N*N*N, cudaMemcpyDeviceToHost); 58 | 59 | cudaFree (in); 60 | cudaFree (out); 61 | } 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | DESCRIPTION 2 | 3 | The artifact for the paper "Register Optimizations for Stencils on GPUs" can be downloaded from 4 | https://github.com/pssrawat/ppopp-artifact.git. The paper is in the main directory (ppopp-18.pdf). 5 | 6 | The package contains: 7 | a. the source code for the reordering framework 8 | b. the examples used in the paper in the examples/ directory 9 | c. scripts for code installation and benchmarking 10 | 11 | 12 | 13 | 14 | DEPENDENCIES 15 | 16 | We tested the framework on Ubuntu 16.04 and Red Hat Enterprise Linux Server release 6.7 using a 17 | Kepler K40c card, with GCC 5.3.0, LLVM 5.0.0, and NVCC 8.0. The following are software requirements 18 | for the framework: 19 | 1. flex >= 2.6.0 (2.6.0 tested) 20 | 2. bison >= 3.0.4 (3.0.4 tested) 21 | 3. cmake >= 3.8 (3.8 tested) 22 | 4. boost >= 1.58 (1.58 tested) 23 | 5. GCC version 4 (4.9.2 tested) or 5 (5.3.0 tested) 24 | 6. NVCC 8.0 25 | 7. LLVM 5.0 (with gpucc) 26 | 27 | 28 | 29 | 30 | STEPS TO INSTALL 31 | 32 | 1. Set the CUDAHOME variable with 'export CUDAHOME=path-to-cuda'. 33 | 2. Set the CAPABILITY variable to the GPU device's compute capability. For example, we executed 34 | 'export CAPABILITY=35' for the K40c card we tested the framework on. 35 | *Some scripts will not run if these two variables are not set* 36 | 3. Download and install LLVM.
If you cannot get the latest version of LLVM with GPUCC (LLVM 5.0 and above) from 37 | apt or any other repo, you can download it from http://releases.llvm.org/download.html. The installation 38 | steps are in https://llvm.org/docs/GettingStarted.html. 39 | I downloaded LLVM into a 'source' directory, and created separate 'build' and 'install' directories alongside it. Then, from 40 | the build directory, I used the following command to configure LLVM: 41 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=`pwd`/../install ../source/llvm/ -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" -DGCC_INSTALL_PREFIX=/opt/software/gcc/4.9.2/ -DCMAKE_C_COMPILER=/opt/software/gcc/4.9.2/bin/gcc -DCMAKE_CXX_COMPILER=/opt/software/gcc/4.9.2/bin/g++ -DCMAKE_CXX_LINK_FLAGS="-L/opt/software/gcc/4.9.2/lib64 -Wl,-rpath,/opt/software/gcc/4.9.2/lib64" 42 | You may need to adjust the paths according to your machine configuration. 43 | 4. Simply run 'make all' in the main directory. The makefile will create a 'test' executable. 44 | 5. Go to the examples directory, and run the benchmarking script as './run-benchmarks.sh'. 45 | This will create a file 'output.txt' with all the results. Alternatively, you can go to an individual benchmark 46 | directory, run 'make', and see the printed results on standard output. 47 | 48 | 49 | 50 | 51 | COPYRIGHT 52 | 53 | All files in this archive which do not include a prior copyright are by default included in this tool and copyrighted 2017 Ohio State University.
54 | 55 | 56 | 57 | 58 | MORE INFORMATION 59 | 60 | For more information on how to add a new benchmark, see the docs/ folder or contact me at 61 | -------------------------------------------------------------------------------- /examples/j3d125pt/j3d125pt_gold.cpp: -------------------------------------------------------------------------------- 1 | extern "C" void j3d125pt_gold (const double* l_in, double* l_out, int N) { 2 | const double (*in)[516][516] = (const double (*)[516][516])l_in; 3 | double (*out)[516][516] = (double (*)[516][516])l_out; 4 | 5 | for (int k = 2; k < N-2; k++) { 6 | for (int j = 2; j < N-2; j++) { 7 | for (int i = 2; i < N-2; i++) { 8 | out[k][j][i] = 9 | 0.75 * (in[k-2][j-2][i-2] + in[k-2][j-2][i+2] + in[k-2][j+2][i-2] + in[k-2][j+2][i+2] + in[k-1][j-1][i-1] + in[k-1][j-1][i+1] + in[k-1][j+1][i-1] + in[k-1][j+1][i+1] + 10 | in[k][j-1][i] + in[k][j][i-1] + in[k][j][i+1] + in[k][j+1][i] + 11 | in[k+1][j-1][i-1] + in[k+1][j-1][i+1] + in[k+1][j+1][i-1] + in[k+1][j+1][i+1]) + 12 | 0.76 * (in[k-2][j-2][i-2] + in[k-2][j-2][i+2] + in[k-2][j+2][i-2] + in[k-2][j+2][i+2]) + 13 | 14 | 1.132 * (in[k-2][j-2][i-1] + in[k-2][j-2][i+1] + in[k-2][j-1][i-2] + in[k-2][j-1][i+2] + in[k-2][j][i] + in[k-2][j+1][i-2] + in[k-2][j+1][i+2] + in[k-2][j+2][i-1] + in[k-2][j+2][i+1] + 15 | in[k-1][j-2][i-2] + in[k-1][j-2][i+2] + in[k-1][j+2][i-2] + in[k-1][j+2][i+2] + 16 | in[k][j-2][i] + in[k][j][i-2] + in[k][j][i+2] + in[k][j+2][i] + 17 | in[k+1][j-2][i-2] + in[k+1][j-2][i+2] + in[k+1][j+2][i-2] + in[k+1][j+2][i+2] + 18 | in[k-2][j-2][i-1] + in[k-2][j-2][i+1] + in[k-2][j-1][i-2] + in[k-2][j-1][i+2] + in[k-2][j][i] + in[k-2][j+1][i-2] + in[k-2][j+1][i+2] + in[k-2][j+2][i-1] + in[k-2][j+2][i+1]) + 19 | 20 | 0.217 * (in[k-2][j-2][i] + in[k-2][j][i-2] + in[k-2][j][i+2] + in[k-2][j+2][i] + 21 | in[k-1][j-1][i] + in[k-1][j][i-1] + in[k-1][j][i+1] + in[k-1][j+1][i] + 22 | in[k][j-2][i-2] + in[k][j-2][i+2] + in[k][j+2][i-2] + in[k][j+2][i+2] + 23 | in[k+1][j-1][i] + 
in[k+1][j][i-1] + in[k+1][j][i+1] + in[k+1][j+1][i] + 24 | in[k-2][j-2][i] + in[k-2][j][i-2] + in[k-2][j][i+2] + in[k-2][j+2][i]) + 25 | 26 | 2.13 * (in[k-2][j-1][i] + in[k-2][j][i-1] + in[k-2][j][i+1] + in[k-2][j+1][i] + 27 | in[k-1][j-2][i] + in[k-1][j][i-2] + in[k-1][j][i+2] + in[k-1][j+2][i] + 28 | in[k][j-2][i-1] + in[k][j-2][i+1] + in[k][j-1][i-2] + in[k][j-1][i+2] + in[k][j][i] + in[k][j+1][i-2] + in[k][j+1][i+2] + in[k][j+2][i-1] + in[k][j+2][i+1] + 29 | in[k+1][j-2][i] + in[k+1][j][i-2] + in[k+1][j][i+2] + in[k+1][j+2][i] + 30 | in[k-2][j-1][i] + in[k-2][j][i-1] + in[k-2][j][i+1] + in[k-2][j+1][i]) + 31 | 32 | 0.331 * (in[k-2][j-1][i-1] + in[k-2][j-1][i+1] + in[k-2][j+1][i-1] + in[k-2][j+1][i+1] + 33 | in[k-1][j-2][i-1] + in[k-1][j-2][i+1] + in[k-1][j-1][i-2] + in[k-1][j-1][i+2] + in[k-1][j][i] + in[k-1][j+1][i-2] + in[k-1][j+1][i+2] + in[k-1][j+2][i-1] + in[k-1][j+2][i+1] + 34 | in[k][j-1][i-1] + in[k][j-1][i+1] + in[k][j+1][i-1] + in[k][j+1][i+1] + 35 | in[k+1][j-2][i-1] + in[k+1][j-2][i+1] + in[k+1][j-1][i-2] + in[k+1][j-1][i+2] + in[k+1][j][i] + in[k+1][j+1][i-2] + in[k+1][j+1][i+2] + in[k+1][j+2][i-1] + in[k+1][j+2][i+1]) + 36 | 0.332 * (in[k-2][j-1][i-1] + in[k-2][j-1][i+1] + in[k-2][j+1][i-1] + in[k-2][j+1][i+1]); 37 | } 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /examples/j3d125pt-new/j3d125pt_gold.cpp: -------------------------------------------------------------------------------- 1 | extern "C" void j3d125pt_gold (const double* l_in, double* l_out, int N) { 2 | const double (*in)[516][516] = (const double (*)[516][516])l_in; 3 | double (*out)[516][516] = (double (*)[516][516])l_out; 4 | 5 | for (int k = 2; k < N-2; k++) { 6 | for (int j = 2; j < N-2; j++) { 7 | for (int i = 2; i < N-2; i++) { 8 | out[k][j][i] = 9 | 0.75 * (in[k-2][j-2][i-2] + in[k-2][j-2][i+2] + in[k-2][j+2][i-2] + in[k-2][j+2][i+2] + 10 | in[k-1][j-1][i-1] + in[k-1][j-1][i+1] + in[k-1][j+1][i-1] + in[k-1][j+1][i+1] 
+ 11 | in[k][j-1][i] + in[k][j][i-1] + in[k][j][i+1] + in[k][j+1][i] + 12 | in[k+1][j-1][i-1] + in[k+1][j-1][i+1] + in[k+1][j+1][i-1] + in[k+1][j+1][i+1] + 13 | in[k+2][j-2][i-2] + in[k+2][j-2][i+2] + in[k+2][j+2][i-2] + in[k+2][j+2][i+2]) + 14 | 15 | 1.132 * (in[k-2][j-2][i-1] + in[k-2][j-2][i+1] + in[k-2][j-1][i-2] + in[k-2][j-1][i+2] + 16 | in[k-2][j][i] + in[k-2][j+1][i-2] + in[k-2][j+1][i+2] + in[k-2][j+2][i-1] + in[k-2][j+2][i+1] + 17 | in[k-1][j-2][i-2] + in[k-1][j-2][i+2] + in[k-1][j+2][i-2] + in[k-1][j+2][i+2] + 18 | in[k][j-2][i] + in[k][j][i-2] + in[k][j][i+2] + in[k][j+2][i] + 19 | in[k+1][j-2][i-2] + in[k+1][j-2][i+2] + in[k+1][j+2][i-2] + in[k+1][j+2][i+2] + 20 | in[k+2][j-2][i-1] + in[k+2][j-2][i+1] + in[k+2][j-1][i-2] + in[k+2][j-1][i+2] + in[k+2][j][i] + 21 | in[k+2][j+1][i-2] + in[k+2][j+1][i+2] + in[k+2][j+2][i-1] + in[k+2][j+2][i+1]) + 22 | 23 | 0.217 * (in[k-2][j-2][i] + in[k-2][j][i-2] + in[k-2][j][i+2] + in[k-2][j+2][i] + 24 | in[k-1][j-1][i] + in[k-1][j][i-1] + in[k-1][j][i+1] + in[k-1][j+1][i] + 25 | in[k][j-2][i-2] + in[k][j-2][i+2] + in[k][j+2][i-2] + in[k][j+2][i+2] + 26 | in[k+1][j-1][i] + in[k+1][j][i-1] + in[k+1][j][i+1] + in[k+1][j+1][i] + 27 | in[k+2][j-2][i] + in[k+2][j][i-2] + in[k+2][j][i+2] + in[k+2][j+2][i]) + 28 | 29 | 2.13 * (in[k-2][j-1][i] + in[k-2][j][i-1] + in[k-2][j][i+1] + in[k-2][j+1][i] + 30 | in[k-1][j-2][i] + in[k-1][j][i-2] + in[k-1][j][i+2] + in[k-1][j+2][i] + 31 | in[k][j-2][i-1] + in[k][j-2][i+1] + in[k][j-1][i-2] + in[k][j-1][i+2] + 32 | in[k][j][i] + in[k][j+1][i-2] + in[k][j+1][i+2] + in[k][j+2][i-1] + in[k][j+2][i+1] + 33 | in[k+1][j-2][i] + in[k+1][j][i-2] + in[k+1][j][i+2] + in[k+1][j+2][i] + 34 | in[k+2][j-1][i] + in[k+2][j][i-1] + in[k+2][j][i+1] + in[k+2][j+1][i]) + 35 | 36 | 0.331 * (in[k-2][j-1][i-1] + in[k-2][j-1][i+1] + in[k-2][j+1][i-1] + in[k-2][j+1][i+1] + 37 | in[k-1][j-2][i-1] + in[k-1][j-2][i+1] + in[k-1][j-1][i-2] + in[k-1][j-1][i+2] + in[k-1][j][i] + 38 | in[k-1][j+1][i-2] + 
in[k-1][j+1][i+2] + in[k-1][j+2][i-1] + in[k-1][j+2][i+1] + 39 | in[k][j-1][i-1] + in[k][j-1][i+1] + in[k][j+1][i-1] + in[k][j+1][i+1] + 40 | in[k+1][j-2][i-1] + in[k+1][j-2][i+1] + in[k+1][j-1][i-2] + in[k+1][j-1][i+2] + in[k+1][j][i] + 41 | in[k+1][j+1][i-2] + in[k+1][j+1][i+2] + in[k+1][j+2][i-1] + in[k+1][j+2][i+1] + 42 | in[k+2][j-1][i-1] + in[k+2][j-1][i+1] + in[k+2][j+1][i-1] + in[k+2][j+1][i+1]); 43 | } 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /examples/j2d81pt/j2d81pt-orig.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cuda.h" 3 | #define max(x,y) ((x) > (y)? (x) : (y)) 4 | #define min(x,y) ((x) < (y)? (x) : (y)) 5 | #define ceil(a,b) ((a) % (b) == 0 ? (a) / (b) : ((a) / (b)) + 1) 6 | 7 | void check_error (const char* message) { 8 | cudaError_t error = cudaGetLastError (); 9 | if (error != cudaSuccess) { 10 | printf ("CUDA error : %s, %s\n", message, cudaGetErrorString (error)); 11 | exit(-1); 12 | } 13 | } 14 | 15 | __global__ void j2d81pt (double * __restrict__ l_in, double * __restrict__ l_out, int N) { 16 | //Determing the block's indices 17 | int i0 = (int)(blockIdx.x)*(int)(blockDim.x); 18 | int i = max(i0,0) + (int)(threadIdx.x); 19 | int j0 = (int)(blockIdx.y)*(int)(blockDim.y); 20 | int j = max(j0,0) + (int)(threadIdx.y); 21 | 22 | double (*in)[8200] = (double (*)[8200]) l_in; 23 | double (*out)[8200] = (double (*)[8200]) l_out; 24 | 25 | if (i>=0 & j>=0 & i<=N-9 & j<=N-9) { 26 | out[j][i] = 27 | (in[j][i] + in[j][i+8] + in[j+8][i] + in[j+8][i+8]) * 3.1862206 + 28 | (in[j][i+1] + in[j][i+7] + in[j+1][i] + in[j+1][i+8] + in[j+7][i] + in[j+7][i+8] + in[j+8][i+1] + in[j+8][i+7]) * 4.5339005 + 29 | (in[j][i+2] + in[j][i+6] + in[j+2][i] + in[j+2][i+8] + in[j+6][i] + in[j+6][i+8] + in[j+8][i+2] + in[j+8][i+6]) * -0.000357000 + 30 | (in[j][i+3] + in[j][i+5] + in[j+3][i] + in[j+3][i+8] + in[j+5][i] + in[j+5][i+8] + 
in[j+8][i+3] + in[j+8][i+5]) * 0.00285600 + 31 | (in[j][i+4] + in[j+4][i+8] + in[j+4][i] + in[j+8][i+4]) * -0.00508225 + 32 | (in[j+1][i+1] + in[j+1][i+7] + in[j+7][i+1] + in[j+7][i+7]) * 0.000645160 + 33 | (in[j+1][i+2] + in[j+1][i+6] + in[j+2][i+1] + in[j+2][i+7] + in[j+6][i+1] + in[j+6][i+7] + in[j+7][i+2] + in[j+7][i+6]) * -0.00508000 + 34 | (in[j+1][i+3] + in[j+1][i+5] + in[j+3][i+1] + in[j+3][i+7] + in[j+5][i+1] + in[j+5][i+7] + in[j+7][i+3] + in[j+7][i+5]) * 0.0406400 + 35 | (in[j+1][i+4] + in[j+4][i+1] + in[j+4][i+7] + in[j+7][i+4]) * -0.0723189 + 36 | (in[j+2][i+2] + in[j+2][i+6] + in[j+6][i+2] + in[j+6][i+6]) * 0.0400000 + 37 | (in[j+2][i+3] + in[j+2][i+5] + in[j+3][i+2] + in[j+3][i+6] + in[j+5][i+2] + in[j+5][i+6] + in[j+6][i+3] + in[j+6][i+5]) * -0.320000 + 38 | (in[j+2][i+4] + in[j+4][i+2] + in[j+4][i+6] + in[j+6][i+4]) * 0.569440 + 39 | (in[j+3][i+3] + in[j+3][i+5] + in[j+5][i+3] + in[j+5][i+5]) * 2.56000 + 40 | (in[j+3][i+4] + in[j+4][i+3] + in[j+4][i+5] + in[j+5][i+4]) * -4.55552 + 41 | in[j+4][i+4] * 8.10655; 42 | } 43 | } 44 | 45 | /* Host driver: allocates device buffers for the N*N input and output arrays, uploads h_in, launches j2d81pt with blockconfig-sized thread blocks on a gridconfig-sized grid, downloads the result into h_out and frees the device buffers. */ extern "C" void host_code (double *h_in, double *h_out, int N) { 46 | double *in; 47 | cudaMalloc (&in, sizeof(double)*N*N); 48 | check_error ("Failed to allocate device memory for in\n"); 49 | cudaMemcpy (in, h_in, sizeof(double)*N*N, cudaMemcpyHostToDevice); 50 | double *out; 51 | cudaMalloc (&out, sizeof(double)*N*N); 52 | check_error ("Failed to allocate device memory for out\n"); 53 | 54 | dim3 blockconfig (16, 8); 55 | dim3 gridconfig (ceil(N, blockconfig.x), ceil(N, blockconfig.y)); 56 | 57 | j2d81pt<<<gridconfig, blockconfig>>> (in, out, N); /* a bad launch configuration is only reported through cudaGetLastError */ check_error ("Failed to launch kernel j2d81pt\n"); 58 | cudaMemcpy (h_out, out, sizeof(double)*N*N, cudaMemcpyDeviceToHost); 59 | 60 | cudaFree (in); 61 | cudaFree (out); 62 | } 63 | -------------------------------------------------------------------------------- /examples/rhs4th3fort-3/sw4.driver.cpp: -------------------------------------------------------------------------------- 1 | #include "common/common.hpp" 2 | #include 3 | #include 4 | 5 | 
extern "C" void sw4_gold (double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, int); 6 | extern "C" void host_code (double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, int); 7 | 8 | int main(int argc, char** argv) { 9 | int N = 304; 10 | 11 | double (*u_0)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 12 | double (*u_1)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 13 | double (*u_2)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 14 | double (*mu)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 15 | double (*la)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 16 | double *strx = (double *) getRandom1DArray(304); 17 | double *stry = (double *) getRandom1DArray(304); 18 | double *strz = (double *) getRandom1DArray(304); 19 | double (*uacc_gold_0)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 20 | double (*uacc_gold_1)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 21 | double (*uacc_gold_2)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 22 | double (*uacc_0)[304][304] = (double (*)[304][304]) getZero3DArray(304, 304, 304); 23 | double (*uacc_1)[304][304] = (double (*)[304][304]) getZero3DArray(304, 304, 304); 24 | double (*uacc_2)[304][304] = (double (*)[304][304]) getZero3DArray(304, 304, 304); 25 | memcpy(uacc_0, uacc_gold_0, sizeof(double)*N*N*N); 26 | memcpy(uacc_1, uacc_gold_1, sizeof(double)*N*N*N); 27 | memcpy(uacc_2, uacc_gold_2, sizeof(double)*N*N*N); 28 | 29 | sw4_gold ((double*)uacc_gold_0, (double*)uacc_gold_1, (double*)uacc_gold_2, (double*)u_0, (double*)u_1, (double*)u_2, (double*)mu, (double*)la, (double*)strx, (double*)stry, (double*)strz, N); 30 | host_code ((double*)uacc_0, (double*)uacc_1, (double*)uacc_2, (double*)u_0, (double*)u_1, (double*)u_2, (double*)mu, (double*)la, (double*)strx, 
(double*)stry, (double*)strz, N); 31 | 32 | double error_0 = checkError3D (N, N, (double*)uacc_0, (double*)uacc_gold_0, 2, N-2, 2, N-2, 2, N-2); 33 | printf("[Test] RMS Error : %e\n",error_0); 34 | if (error_0 > TOLERANCE) 35 | return -1; 36 | double error_1 = checkError3D (N, N, (double*)uacc_1, (double*)uacc_gold_1, 2, N-2, 2, N-2, 2, N-2); 37 | printf("[Test] RMS Error : %e\n",error_1); 38 | if (error_1 > TOLERANCE) 39 | return -1; 40 | double error_2 = checkError3D (N, N, (double*)uacc_2, (double*)uacc_gold_2, 2, N-2, 2, N-2, 2, N-2); 41 | printf("[Test] RMS Error : %e\n",error_2); 42 | if (error_2 > TOLERANCE) 43 | return -1; 44 | 45 | delete[] strx; 46 | delete[] stry; 47 | delete[] strz; 48 | delete[] u_0; 49 | delete[] u_1; 50 | delete[] u_2; 51 | delete[] mu; 52 | delete[] la; 53 | delete[] uacc_0; 54 | delete[] uacc_1; 55 | delete[] uacc_2; 56 | delete[] uacc_gold_0; 57 | delete[] uacc_gold_1; 58 | delete[] uacc_gold_2; 59 | } 60 | -------------------------------------------------------------------------------- /examples/rhs4th3fort-maxfuse/sw4.driver.cpp: -------------------------------------------------------------------------------- 1 | #include "common/common.hpp" 2 | #include 3 | #include 4 | 5 | extern "C" void sw4_gold (double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, int); 6 | extern "C" void host_code (double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, int); 7 | 8 | int main(int argc, char** argv) { 9 | int N = 304; 10 | 11 | double (*u_0)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 12 | double (*u_1)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 13 | double (*u_2)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 14 | double (*mu)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 15 | double (*la)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 16 | 
double *strx = (double *) getRandom1DArray(304); 17 | double *stry = (double *) getRandom1DArray(304); 18 | double *strz = (double *) getRandom1DArray(304); 19 | double (*uacc_gold_0)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 20 | double (*uacc_gold_1)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 21 | double (*uacc_gold_2)[304][304] = (double (*)[304][304]) getRandom3DArray(304, 304, 304); 22 | double (*uacc_0)[304][304] = (double (*)[304][304]) getZero3DArray(304, 304, 304); 23 | double (*uacc_1)[304][304] = (double (*)[304][304]) getZero3DArray(304, 304, 304); 24 | double (*uacc_2)[304][304] = (double (*)[304][304]) getZero3DArray(304, 304, 304); 25 | memcpy(uacc_0, uacc_gold_0, sizeof(double)*N*N*N); 26 | memcpy(uacc_1, uacc_gold_1, sizeof(double)*N*N*N); 27 | memcpy(uacc_2, uacc_gold_2, sizeof(double)*N*N*N); 28 | 29 | sw4_gold ((double*)uacc_gold_0, (double*)uacc_gold_1, (double*)uacc_gold_2, (double*)u_0, (double*)u_1, (double*)u_2, (double*)mu, (double*)la, (double*)strx, (double*)stry, (double*)strz, N); 30 | host_code ((double*)uacc_0, (double*)uacc_1, (double*)uacc_2, (double*)u_0, (double*)u_1, (double*)u_2, (double*)mu, (double*)la, (double*)strx, (double*)stry, (double*)strz, N); 31 | 32 | double error_0 = checkError3D (N, N, (double*)uacc_0, (double*)uacc_gold_0, 2, N-2, 2, N-2, 2, N-2); 33 | printf("[Test] RMS Error : %e\n",error_0); 34 | if (error_0 > TOLERANCE) 35 | return -1; 36 | double error_1 = checkError3D (N, N, (double*)uacc_1, (double*)uacc_gold_1, 2, N-2, 2, N-2, 2, N-2); 37 | printf("[Test] RMS Error : %e\n",error_1); 38 | if (error_1 > TOLERANCE) 39 | return -1; 40 | double error_2 = checkError3D (N, N, (double*)uacc_2, (double*)uacc_gold_2, 2, N-2, 2, N-2, 2, N-2); 41 | printf("[Test] RMS Error : %e\n",error_2); 42 | if (error_2 > TOLERANCE) 43 | return -1; 44 | 45 | delete[] strx; 46 | delete[] stry; 47 | delete[] strz; 48 | delete[] u_0; 49 | delete[] u_1; 50 | delete[] u_2; 51 | 
delete[] mu; 52 | delete[] la; 53 | delete[] uacc_0; 54 | delete[] uacc_1; 55 | delete[] uacc_2; 56 | delete[] uacc_gold_0; 57 | delete[] uacc_gold_1; 58 | delete[] uacc_gold_2; 59 | } 60 | -------------------------------------------------------------------------------- /examples/j2d64pt/j2d64pt-orig.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cuda.h" 3 | #define max(x,y) ((x) > (y)? (x) : (y)) 4 | #define min(x,y) ((x) < (y)? (x) : (y)) 5 | #define ceil(a,b) ((a) % (b) == 0 ? (a) / (b) : ((a) / (b)) + 1) 6 | 7 | void check_error (const char* message) { 8 | cudaError_t error = cudaGetLastError (); 9 | if (error != cudaSuccess) { 10 | printf ("CUDA error : %s, %s\n", message, cudaGetErrorString (error)); 11 | exit(-1); 12 | } 13 | } 14 | 15 | __global__ void j2d64pt (double * __restrict__ l_in, double * __restrict__ l_out, int N) { 16 | //Determing the block's indices 17 | int i0 = (int)(blockIdx.x)*(int)(blockDim.x); 18 | int i = max(i0,4) + (int)(threadIdx.x); 19 | int j0 = (int)(blockIdx.y)*(int)(blockDim.y); 20 | int j = max(j0,4) + (int)(threadIdx.y); 21 | 22 | double (*in)[8200] = (double (*)[8200]) l_in; 23 | double (*out)[8200] = (double (*)[8200]) l_out; 24 | 25 | if (i>=4 & j>=4 & i<=N-5 & j<=N-5) { 26 | out[j][i] = 27 | (in[j-4][i-4] - in[j-4][i+4] - in[j+4][i-4] + in[j+4][i+4]) * 1.274495 + 28 | (-in[j-4][i-3] + in[j-4][i+3] + in[j-3][i+4] - in[j-3][i-4] + in[j+3][i-4] - in[j+3][i+4] + in[j+4][i-3] - in[j+4][i+3]) * 0.000136017 + 29 | (in[j-4][i-2] - in[j-4][i+2] + in[j-2][i-4] - in[j-2][i+4] - in[j+2][i-4] + in[j+2][i+4] - in[j+4][i-2] + in[j+4][i+2]) * 0.000714000 + 30 | (-in[j-4][i-1] + in[j-4][i+1] - in[j-1][i-4] + in[j-1][i+4] + in[j+1][i-4] - in[j+1][i+4] + in[j+4][i-1] - in[j+4][i+1]) * 0.00285600 + 31 | (in[j-3][i-3] - in[j-3][i+3] - in[j+3][i-3] + in[j+3][i+3]) * 0.00145161 + 32 | (-in[j-3][i-2] + in[j-3][i+2] - in[j-2][i-3] + in[j-2][i+3] + in[j+2][i-3] - in[j+2][i+3] + 
in[j+3][i-2] - in[j+3][i+2]) * 0.00762000 + 33 | (in[j-3][i-1] - in[j-3][i+1] + in[j-1][i-3] - in[j-1][i+3] - in[j+1][i-3] + in[j+1][i+3] - in[j+3][i-1] + in[j+3][i+1]) * 0.0304800 + 34 | (in[j-2][i-2] - in[j-2][i+2] - in[j+2][i-2] + in[j+2][i+2]) * 0.0400000 + 35 | (-in[j-2][i-1] + in[j-2][i+1] - in[j-1][i-2] + in[j-1][i+2] + in[j+1][i-2] - in[j+1][i+2] + in[j+2][i-1] - in[j+2][i+1]) * 0.160000 + 36 | (in[j-1][i-1] - in[j-1][i+1] - in[j+1][i-1] + in[j+1][i+1]) * 0.640000; 37 | } 38 | } 39 | 40 | /* Host driver: allocates device buffers for the N*N input and output arrays, uploads h_in, launches j2d64pt with blockconfig-sized thread blocks on a gridconfig-sized grid, downloads the result into h_out and frees the device buffers. */ extern "C" void host_code (double *h_in, double *h_out, int N) { 41 | double *in; 42 | cudaMalloc (&in, sizeof(double)*N*N); 43 | check_error ("Failed to allocate device memory for in\n"); 44 | cudaMemcpy (in, h_in, sizeof(double)*N*N, cudaMemcpyHostToDevice); 45 | double *out; 46 | cudaMalloc (&out, sizeof(double)*N*N); 47 | check_error ("Failed to allocate device memory for out\n"); 48 | 49 | dim3 blockconfig (16, 8); 50 | dim3 gridconfig (ceil(N, blockconfig.x), ceil(N, blockconfig.y)); 51 | 52 | j2d64pt<<<gridconfig, blockconfig>>> (in, out, N); /* a bad launch configuration is only reported through cudaGetLastError */ check_error ("Failed to launch kernel j2d64pt\n"); 53 | cudaMemcpy (h_out, out, sizeof(double)*N*N, cudaMemcpyDeviceToHost); 54 | 55 | cudaFree (in); 56 | cudaFree (out); 57 | } 58 | -------------------------------------------------------------------------------- /examples/derivative-maxfuse/common/time.awk: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo "-------------------- NVCC ---------------------" 5 | 6 | time=`grep -E 'float|double' nvcc-orig-results | awk 'BEGIN {time = 0.0} {time += $2} END {print time}'` 7 | awk -v otime=$time 'BEGIN {print "Original GFlops = " (300*300*300*486/10^6/otime)}' 8 | time0=`grep -E 'float|double' nvcc-reorder-results | awk 'BEGIN {time0 = 0.0} {time0 += $2} END {print time0}'` 9 | timea=`grep -E 'float|double' nvcc-reorder-results-a | awk 'BEGIN {timea = 0.0} {timea += $2} END {print timea}'` 10 | timeb=`grep -E 'float|double' nvcc-reorder-results-b | awk 'BEGIN {timeb = 0.0} {timeb += $2} 
END {print timeb}'` 11 | timec=`grep -E 'float|double' nvcc-reorder-results-c | awk 'BEGIN {timec = 0.0} {timec += $2} END {print timec}'` 12 | timed=`grep -E 'float|double' nvcc-reorder-results-d | awk 'BEGIN {timed = 0.0} {timed += $2} END {print timed}'` 13 | timeg=`grep -E 'float|double' nvcc-reorder-results-g | awk 'BEGIN {timeg = 0.0} {timeg += $2} END {print timeg}'` 14 | timeh=`grep -E 'float|double' nvcc-reorder-results-h | awk 'BEGIN {timeh = 0.0} {timeh += $2} END {print timeh}'` 15 | min0=`awk -v atime=$time0 -v btime=$timea 'BEGIN {print (atime (y)? (x) : (y)) 4 | #define min(x,y) ((x) < (y)? (x) : (y)) 5 | #define ceil(a,b) ((a) % (b) == 0 ? (a) / (b) : ((a) / (b)) + 1) 6 | 7 | void check_error (const char* message) { 8 | cudaError_t error = cudaGetLastError (); 9 | if (error != cudaSuccess) { 10 | printf ("CUDA error : %s, %s\n", message, cudaGetErrorString (error)); 11 | exit(-1); 12 | } 13 | } 14 | 15 | __global__ void j2d64pt (double * __restrict__ l_in, double * __restrict__ l_out, int N) { 16 | //Determing the block's indices 17 | int i0 = (int)(blockIdx.x)*(int)(blockDim.x) + 4; 18 | int i = max(i0,4) + (int)(threadIdx.x); 19 | int j0 = 4*(int)(blockIdx.y)*(int)(blockDim.y) + 4; 20 | int j = max(j0,4) + 4*(int)(threadIdx.y); 21 | 22 | double (*in)[8200] = (double (*)[8200]) l_in; 23 | double (*out)[8200] = (double (*)[8200]) l_out; 24 | 25 | if (i>=4 & j>=4 & i<=N-5 & j<=N-5) { 26 | #pragma begin stencil1 unroll j=4,i=1 27 | out[j][i] = 28 | (in[j-4][i-4] - in[j-4][i+4] - in[j+4][i-4] + in[j+4][i+4]) * 1.274495 + 29 | (-in[j-4][i-3] + in[j-4][i+3] + in[j-3][i+4] - in[j-3][i-4] + in[j+3][i-4] - in[j+3][i+4] + in[j+4][i-3] - in[j+4][i+3]) * 0.000136017 + 30 | (in[j-4][i-2] - in[j-4][i+2] + in[j-2][i-4] - in[j-2][i+4] - in[j+2][i-4] + in[j+2][i+4] - in[j+4][i-2] + in[j+4][i+2]) * 0.000714000 + 31 | (-in[j-4][i-1] + in[j-4][i+1] - in[j-1][i-4] + in[j-1][i+4] + in[j+1][i-4] - in[j+1][i+4] + in[j+4][i-1] - in[j+4][i+1]) * 0.00285600 + 32 | 
(in[j-3][i-3] - in[j-3][i+3] - in[j+3][i-3] + in[j+3][i+3]) * 0.00145161 + 33 | (-in[j-3][i-2] + in[j-3][i+2] - in[j-2][i-3] + in[j-2][i+3] + in[j+2][i-3] - in[j+2][i+3] + in[j+3][i-2] - in[j+3][i+2]) * 0.00762000 + 34 | (in[j-3][i-1] - in[j-3][i+1] + in[j-1][i-3] - in[j-1][i+3] - in[j+1][i-3] + in[j+1][i+3] - in[j+3][i-1] + in[j+3][i+1]) * 0.0304800 + 35 | (in[j-2][i-2] - in[j-2][i+2] - in[j+2][i-2] + in[j+2][i+2]) * 0.0400000 + 36 | (-in[j-2][i-1] + in[j-2][i+1] - in[j-1][i-2] + in[j-1][i+2] + in[j+1][i-2] - in[j+1][i+2] + in[j+2][i-1] - in[j+2][i+1]) * 0.160000 + 37 | (in[j-1][i-1] - in[j-1][i+1] - in[j+1][i-1] + in[j+1][i+1]) * 0.640000; 38 | #pragma end stencil1 39 | } 40 | } 41 | 42 | /* Host driver: allocates device buffers for the N*N input and output arrays, uploads h_in, launches j2d64pt with blockconfig-sized thread blocks, downloads the result into h_out and frees the device buffers. The grid's y extent is ceil(N, 4*blockconfig.y) because the kernel scales its j index by 4. */ extern "C" void host_code (double *h_in, double *h_out, int N) { 43 | double *in; 44 | cudaMalloc (&in, sizeof(double)*N*N); 45 | check_error ("Failed to allocate device memory for in\n"); 46 | cudaMemcpy (in, h_in, sizeof(double)*N*N, cudaMemcpyHostToDevice); 47 | double *out; 48 | cudaMalloc (&out, sizeof(double)*N*N); 49 | check_error ("Failed to allocate device memory for out\n"); 50 | 51 | dim3 blockconfig (16, 16); 52 | dim3 gridconfig (ceil(N, blockconfig.x), ceil(N, 4*blockconfig.y)); 53 | 54 | j2d64pt<<<gridconfig, blockconfig>>> (in, out, N); /* a bad launch configuration is only reported through cudaGetLastError */ check_error ("Failed to launch kernel j2d64pt\n"); 55 | 56 | cudaMemcpy (h_out, out, sizeof(double)*N*N, cudaMemcpyDeviceToHost); 57 | cudaFree (in); 58 | cudaFree (out); 59 | } 60 | -------------------------------------------------------------------------------- /examples/j2d81pt/j2d81pt-reg.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cuda.h" 3 | #define max(x,y) ((x) > (y)? (x) : (y)) 4 | #define min(x,y) ((x) < (y)? (x) : (y)) 5 | #define ceil(a,b) ((a) % (b) == 0 ? 
(a) / (b) : ((a) / (b)) + 1) 6 | 7 | void check_error (const char* message) { 8 | cudaError_t error = cudaGetLastError (); 9 | if (error != cudaSuccess) { 10 | printf ("CUDA error : %s, %s\n", message, cudaGetErrorString (error)); 11 | exit(-1); 12 | } 13 | } 14 | 15 | __global__ void j2d81pt (double * __restrict__ l_in, double * __restrict__ l_out, int N) { 16 | //Determing the block's indices 17 | int i0 = (int)(blockIdx.x)*(int)(blockDim.x); 18 | int i = max(i0,0) + (int)(threadIdx.x); 19 | int j0 = 4*(int)(blockIdx.y)*(int)(blockDim.y); 20 | int j = max(j0,0) + 4*(int)(threadIdx.y); 21 | 22 | double (*in)[8200] = (double (*)[8200]) l_in; 23 | double (*out)[8200] = (double (*)[8200]) l_out; 24 | 25 | if (i>=0 & j>=0 & i<=N-9 & j<=N-9) { 26 | #pragma begin stencil1 unroll j=4,i=1 27 | out[j][i] = (in[j][i] + in[j][i+8] + in[j+8][i] + in[j+8][i+8]) * 3.1862206 + 28 | (in[j][i+1] + in[j][i+7] + in[j+1][i] + in[j+1][i+8] + in[j+7][i] + in[j+7][i+8] + in[j+8][i+1] + in[j+8][i+7]) * 4.5339005 + 29 | (in[j][i+2] + in[j][i+6] + in[j+2][i] + in[j+2][i+8] + in[j+6][i] + in[j+6][i+8] + in[j+8][i+2] + in[j+8][i+6]) * -0.000357000 + 30 | (in[j][i+3] + in[j][i+5] + in[j+3][i] + in[j+3][i+8] + in[j+5][i] + in[j+5][i+8] + in[j+8][i+3] + in[j+8][i+5]) * 0.00285600 + 31 | (in[j][i+4] + in[j+4][i+8] + in[j+4][i] + in[j+8][i+4]) * -0.00508225 + 32 | (in[j+1][i+1] + in[j+1][i+7] + in[j+7][i+1] + in[j+7][i+7]) * 0.000645160 + 33 | (in[j+1][i+2] + in[j+1][i+6] + in[j+2][i+1] + in[j+2][i+7] + in[j+6][i+1] + in[j+6][i+7] + in[j+7][i+2] + in[j+7][i+6]) * -0.00508000 + 34 | (in[j+1][i+3] + in[j+1][i+5] + in[j+3][i+1] + in[j+3][i+7] + in[j+5][i+1] + in[j+5][i+7] + in[j+7][i+3] + in[j+7][i+5]) * 0.0406400 + 35 | (in[j+1][i+4] + in[j+4][i+1] + in[j+4][i+7] + in[j+7][i+4]) * -0.0723189 + 36 | (in[j+2][i+2] + in[j+2][i+6] + in[j+6][i+2] + in[j+6][i+6]) * 0.0400000 + 37 | (in[j+2][i+3] + in[j+2][i+5] + in[j+3][i+2] + in[j+3][i+6] + in[j+5][i+2] + in[j+5][i+6] + in[j+6][i+3] + in[j+6][i+5]) * 
-0.320000 + 38 | (in[j+2][i+4] + in[j+4][i+2] + in[j+4][i+6] + in[j+6][i+4]) * 0.569440 + 39 | (in[j+3][i+3] + in[j+3][i+5] + in[j+5][i+3] + in[j+5][i+5]) * 2.56000 + 40 | (in[j+3][i+4] + in[j+4][i+3] + in[j+4][i+5] + in[j+5][i+4]) * -4.55552 + 41 | in[j+4][i+4] * 8.10655; 42 | #pragma end stencil1 43 | } 44 | } 45 | 46 | /* Host driver: allocates device buffers for the N*N input and output arrays, uploads h_in, launches j2d81pt with blockconfig-sized thread blocks, downloads the result into h_out and frees the device buffers. The grid's y extent is ceil(N, 4*blockconfig.y) because the kernel scales its j index by 4. */ extern "C" void host_code (double *h_in, double *h_out, int N) { 47 | double *in; 48 | cudaMalloc (&in, sizeof(double)*N*N); 49 | check_error ("Failed to allocate device memory for in\n"); 50 | cudaMemcpy (in, h_in, sizeof(double)*N*N, cudaMemcpyHostToDevice); 51 | double *out; 52 | cudaMalloc (&out, sizeof(double)*N*N); 53 | check_error ("Failed to allocate device memory for out\n"); 54 | 55 | dim3 blockconfig (16, 8); 56 | dim3 gridconfig (ceil(N, blockconfig.x), ceil(N, 4*blockconfig.y)); 57 | 58 | j2d81pt<<<gridconfig, blockconfig>>> (in, out, N); /* a bad launch configuration is only reported through cudaGetLastError */ check_error ("Failed to launch kernel j2d81pt\n"); 59 | 60 | cudaMemcpy (h_out, out, sizeof(double)*N*N, cudaMemcpyDeviceToHost); 61 | cudaFree (in); 62 | cudaFree (out); 63 | } 64 | -------------------------------------------------------------------------------- /symtab.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __SYMTAB_HPP__ 2 | #define __SYMTAB_HPP__ 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | /* A class that represents vector. 
Needed for parsing */ 12 | template <typename T> 13 | class vectab { 14 | protected: 15 | std::vector<T> vec_list; 16 | public: 17 | virtual void push_back (T); 18 | virtual std::vector<T> get_list (void); 19 | }; 20 | 21 | /* Appends value to the end of vec_list. */ template <typename T> 22 | inline void vectab<T>::push_back (T value) { 23 | vec_list.push_back (value); 24 | } 25 | 26 | /* Returns the stored elements by value (a copy of vec_list). */ template <typename T> 27 | inline std::vector<T> vectab<T>::get_list (void) { 28 | return vec_list; 29 | } 30 | 31 | /* A class that represents a vector of string */ 32 | class string_list : public vectab<std::string> { 33 | public: 34 | void push_back (std::string); 35 | std::vector<std::string> get_list (void); 36 | }; 37 | 38 | inline void string_list::push_back (std::string value) { 39 | vec_list.push_back (value); 40 | } 41 | 42 | inline std::vector<std::string> string_list::get_list (void) { 43 | return vec_list; 44 | } 45 | 46 | /* A class that represents a vector of expressions */ 47 | class expr_list : public vectab<expr_node *> { 48 | public: 49 | void push_back (expr_node *); 50 | std::vector<expr_node *> get_list (void); 51 | }; 52 | 53 | inline void expr_list::push_back (expr_node *value) { 54 | vec_list.push_back (value); 55 | } 56 | 57 | inline std::vector<expr_node *> expr_list::get_list (void) { 58 | return vec_list; 59 | } 60 | 61 | /* A class that represents symbol table. Basically a map from string to 62 | any data structure. 
*/ 63 | template <typename T> 64 | class symtab { 65 | protected: 66 | std::map<std::string, T> symbol_list; 67 | public: 68 | void push_symbol (char *str, T); 69 | void delete_symbol (char *str); 70 | T find_symbol (char *str); 71 | std::map<std::string, T> get_symbol_list (void); 72 | }; 73 | 74 | /* Returns the whole table by value (a copy of symbol_list). */ template <typename T> 75 | inline std::map<std::string, T> symtab<T>::get_symbol_list (void) { 76 | return symbol_list; 77 | } 78 | 79 | /* Inserts (s, value); asserts that s is not already present. */ template <typename T> 80 | inline void symtab<T>::push_symbol (char *s, T value) { 81 | std::string key = std::string (s); 82 | assert (symbol_list.find (key) == symbol_list.end () && "Assigned name already exists"); 83 | symbol_list.insert (make_pair (key, value)); 84 | } 85 | 86 | /* Removes s from the table; erasing a name that is not present does nothing. */ template <typename T> 87 | inline void symtab<T>::delete_symbol (char *s) { 88 | std::string key = std::string (s); 89 | symbol_list.erase (key); 90 | } 91 | 92 | /* Looks up s and returns the mapped value, or NULL when the name is absent (so T must be constructible from NULL, e.g. a pointer type). */ template <typename T> 93 | inline T symtab<T>::find_symbol (char *s) { 94 | std::string key = std::string (s); 95 | typename std::map <std::string, T>::iterator it = symbol_list.find (key); 96 | if (it != symbol_list.end ()) 97 | return it->second; 98 | return NULL; 99 | } 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /examples/j3d125pt/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | NVPROF=nvprof 3 | NOPTFLAGS=-O3 -ccbin=g++ -std=c++11 -Xcompiler "-fPIC -fopenmp -O3 -fno-strict-aliasing" --use_fast_math -Xptxas "-dlcm=ca" 4 | NCOMPUTEFLAGS=-gencode arch=compute_$(CAPABILITY),code=sm_$(CAPABILITY) 5 | CLANG=clang++ 6 | LOPTFLAGS=-O3 -ffp-contract=fast --cuda-path=$(CUDAHOME) -L$(CUDAHOME)/lib64 -L$(CUDAHOME)/nvvm -lcudart 7 | LCOMPUTEFLAGS=--cuda-gpu-arch=sm_$(CAPABILITY) 8 | 9 | all: 10 | #./reorder.sh j3d125pt-reg.cu 11 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=128 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp j3d125pt-orig.cu -o nvcc-orig 12 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=255 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp j3d125pt-unroll.cu -o 
nvcc-unroll 13 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-a.cu -o nvcc-reorder-a 14 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-c.cu -o nvcc-reorder-c 15 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=255 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-d.cu -o nvcc-reorder-d 16 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=255 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-e.cu -o nvcc-reorder-e 17 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp j3d125pt-orig.cu -o llvm-orig 18 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=255 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp j3d125pt-unroll.cu -o llvm-unroll 19 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-a.cu -o llvm-reorder-a 20 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-c.cu -o llvm-reorder-c 21 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=255 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-d.cu -o llvm-reorder-d 22 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=255 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-e.cu -o llvm-reorder-e 23 | $(NVPROF) --print-gpu-trace ./nvcc-orig > /dev/null 2>nvcc-orig-results 24 | $(NVPROF) --print-gpu-trace ./nvcc-unroll > /dev/null 2>nvcc-unroll-results 25 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-a > /dev/null 2>nvcc-reorder-results-a 26 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-c > /dev/null 2>nvcc-reorder-results-c 27 | $(NVPROF) --print-gpu-trace 
./nvcc-reorder-d > /dev/null 2>nvcc-reorder-results-d 28 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-e > /dev/null 2>nvcc-reorder-results-e 29 | $(NVPROF) --print-gpu-trace ./llvm-orig > /dev/null 2>llvm-orig-results 30 | $(NVPROF) --print-gpu-trace ./llvm-unroll > /dev/null 2>llvm-unroll-results 31 | $(NVPROF) --print-gpu-trace ./llvm-reorder-a > /dev/null 2>llvm-reorder-results-a 32 | $(NVPROF) --print-gpu-trace ./llvm-reorder-c > /dev/null 2>llvm-reorder-results-c 33 | $(NVPROF) --print-gpu-trace ./llvm-reorder-d > /dev/null 2>llvm-reorder-results-d 34 | $(NVPROF) --print-gpu-trace ./llvm-reorder-e > /dev/null 2>llvm-reorder-results-e 35 | ./common/time.awk 36 | 37 | clean: 38 | rm test nvcc-* llvm-* *.idsl stencils stencilnames unrollfactors 2>/dev/null || true 39 | -------------------------------------------------------------------------------- /vardecl.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __VARDECL_HPP__ 2 | #define __VARDECL_HPP__ 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "utils.hpp" 9 | #include "exprnode.hpp" 10 | #include "symtab.hpp" 11 | 12 | class array_range { 13 | private: 14 | expr_node *lo; 15 | expr_node *hi; 16 | public: 17 | array_range (expr_node *, expr_node *); 18 | expr_node *get_lo_range (); 19 | expr_node *get_hi_range (); 20 | }; 21 | 22 | inline array_range::array_range (expr_node *lo_id, expr_node *hi_id) { 23 | lo = lo_id; 24 | hi = hi_id; 25 | } 26 | 27 | inline expr_node * array_range::get_lo_range (void) { 28 | return lo; 29 | } 30 | 31 | inline expr_node * array_range::get_hi_range (void) { 32 | return hi; 33 | } 34 | 35 | class range_list : public vectab { 36 | public: 37 | void push_back (array_range *); 38 | std::vector get_list (void); 39 | }; 40 | 41 | inline void range_list::push_back (array_range *value) { 42 | vec_list.push_back (value); 43 | } 44 | 45 | inline std::vector range_list::get_list (void) { 46 | return vec_list; 47 
| } 48 | 49 | class array_decl { 50 | private: 51 | range_list *range; 52 | std::string name; 53 | DATA_TYPE data_type; 54 | public: 55 | array_decl (DATA_TYPE, char *, range_list *); 56 | void push_range (array_range *); 57 | std::vector get_array_range (void); 58 | range_list *get_range_list (void); 59 | std::string get_array_name (void); 60 | DATA_TYPE get_array_type (void); 61 | }; 62 | 63 | inline array_decl::array_decl (DATA_TYPE t, char *str, range_list *r) { 64 | data_type = t; 65 | range= r; 66 | name = std::string (str); 67 | } 68 | 69 | inline void array_decl::push_range (array_range *r) { 70 | range->push_back (r); 71 | } 72 | 73 | inline DATA_TYPE array_decl::get_array_type (void) { 74 | return data_type; 75 | } 76 | 77 | inline std::vector array_decl::get_array_range (void) { 78 | return range->get_list (); 79 | } 80 | 81 | inline range_list *array_decl::get_range_list (void) { 82 | return range; 83 | } 84 | 85 | inline std::string array_decl::get_array_name (void) { 86 | return name; 87 | } 88 | 89 | class func_call { 90 | private: 91 | std::string func_name; 92 | string_list *out_list, *args; 93 | public: 94 | func_call (char *, string_list *, string_list *); 95 | func_call (char *, string_list *); 96 | void set_name (char *); 97 | std::string get_name (void); 98 | void set_out_list (string_list *); 99 | void push_arg (char *); 100 | void push_out_var (char *); 101 | std::vector get_out_list (void); 102 | std::vector get_arg_list (void); 103 | }; 104 | 105 | inline func_call::func_call (char *s, string_list *arg) { 106 | func_name = std::string (s); 107 | args = arg; 108 | out_list = new string_list (); 109 | } 110 | 111 | inline func_call::func_call (char *s, string_list *arg, string_list *out) { 112 | func_name = std::string (s); 113 | out_list = out; 114 | args = arg; 115 | } 116 | 117 | inline void func_call::set_name (char *s) { 118 | func_name = std::string (s); 119 | } 120 | 121 | inline std::string func_call::get_name (void) { 122 | return 
func_name; 123 | } 124 | 125 | inline void func_call::set_out_list (string_list *s) { 126 | out_list = s; 127 | } 128 | 129 | inline void func_call::push_arg (char *s) { 130 | args->push_back (std::string (s)); 131 | } 132 | 133 | inline void func_call::push_out_var (char *s) { 134 | out_list->push_back (std::string (s)); 135 | } 136 | 137 | inline std::vector func_call::get_out_list (void) { 138 | return out_list->get_list (); 139 | } 140 | 141 | inline std::vector func_call::get_arg_list (void) { 142 | return args->get_list (); 143 | } 144 | 145 | #endif 146 | -------------------------------------------------------------------------------- /examples/j2d25pt/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | NVPROF=nvprof 3 | NOPTFLAGS=-O3 -ccbin=g++ -std=c++11 -Xcompiler "-fPIC -fopenmp -O3 -fno-strict-aliasing" --use_fast_math -Xptxas "-dlcm=ca" 4 | NCOMPUTEFLAGS=-gencode arch=compute_$(CAPABILITY),code=sm_$(CAPABILITY) 5 | CLANG=clang++ 6 | LOPTFLAGS=-O3 -ffp-contract=fast --cuda-path=$(CUDAHOME) -L$(CUDAHOME)/lib64 -L$(CUDAHOME)/nvvm -lcudart 7 | LCOMPUTEFLAGS=--cuda-gpu-arch=sm_$(CAPABILITY) 8 | 9 | all: 10 | #./reorder.sh j2d25pt-reg.cu 11 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=32 common/cuda_header.cu j2d25pt.driver.cpp j2d25pt_gold.cpp j2d25pt-orig.cu -o nvcc-orig 12 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=48 common/cuda_header.cu j2d25pt.driver.cpp j2d25pt_gold.cpp j2d25pt-unroll.cu -o nvcc-unroll 13 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j2d25pt.driver.cpp j2d25pt_gold.cpp reordered-a.cu -o nvcc-reorder-a 14 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j2d25pt.driver.cpp j2d25pt_gold.cpp reordered-b.cu -o nvcc-reorder-b 15 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j2d25pt.driver.cpp j2d25pt_gold.cpp reordered-c.cu -o nvcc-reorder-c 16 | $(NVCC) $(NOPTFLAGS) 
$(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j2d25pt.driver.cpp j2d25pt_gold.cpp reordered-d.cu -o nvcc-reorder-d 17 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j2d25pt.driver.cpp j2d25pt_gold.cpp reordered-e.cu -o nvcc-reorder-e 18 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=48 common/cuda_header.cu j2d25pt.driver.cpp j2d25pt_gold.cpp j2d25pt-orig.cu -o llvm-orig 19 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=128 common/cuda_header.cu j2d25pt.driver.cpp j2d25pt_gold.cpp j2d25pt-unroll.cu -o llvm-unroll 20 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j2d25pt.driver.cpp j2d25pt_gold.cpp reordered-a.cu -o llvm-reorder-a 21 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j2d25pt.driver.cpp j2d25pt_gold.cpp reordered-b.cu -o llvm-reorder-b 22 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j2d25pt.driver.cpp j2d25pt_gold.cpp reordered-c.cu -o llvm-reorder-c 23 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j2d25pt.driver.cpp j2d25pt_gold.cpp reordered-d.cu -o llvm-reorder-d 24 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j2d25pt.driver.cpp j2d25pt_gold.cpp reordered-e.cu -o llvm-reorder-e 25 | $(NVPROF) --print-gpu-trace ./nvcc-orig > /dev/null 2>nvcc-orig-results 26 | $(NVPROF) --print-gpu-trace ./nvcc-unroll > /dev/null 2>nvcc-unroll-results 27 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-a > /dev/null 2>nvcc-reorder-results-a 28 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-b > /dev/null 2>nvcc-reorder-results-b 29 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-c > /dev/null 2>nvcc-reorder-results-c 30 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-d > /dev/null 2>nvcc-reorder-results-d 31 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-e > /dev/null 
2>nvcc-reorder-results-e 32 | $(NVPROF) --print-gpu-trace ./llvm-orig > /dev/null 2>llvm-orig-results 33 | $(NVPROF) --print-gpu-trace ./llvm-unroll > /dev/null 2>llvm-unroll-results 34 | $(NVPROF) --print-gpu-trace ./llvm-reorder-a > /dev/null 2>llvm-reorder-results-a 35 | $(NVPROF) --print-gpu-trace ./llvm-reorder-b > /dev/null 2>llvm-reorder-results-b 36 | $(NVPROF) --print-gpu-trace ./llvm-reorder-c > /dev/null 2>llvm-reorder-results-c 37 | $(NVPROF) --print-gpu-trace ./llvm-reorder-d > /dev/null 2>llvm-reorder-results-d 38 | $(NVPROF) --print-gpu-trace ./llvm-reorder-e > /dev/null 2>llvm-reorder-results-e 39 | ./common/time.awk 40 | 41 | clean: 42 | rm test nvcc-* llvm-* *.idsl stencils stencilnames unrollfactors 2>/dev/null || true 43 | -------------------------------------------------------------------------------- /examples/j2d64pt/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | NVPROF=nvprof 3 | NOPTFLAGS=-O3 -ccbin=g++ -std=c++11 -Xcompiler "-fPIC -fopenmp -O3 -fno-strict-aliasing" --use_fast_math -Xptxas "-dlcm=ca" 4 | NCOMPUTEFLAGS=-gencode arch=compute_$(CAPABILITY),code=sm_$(CAPABILITY) 5 | CLANG=clang++ 6 | LOPTFLAGS=-O3 -ffp-contract=fast --cuda-path=$(CUDAHOME) -L$(CUDAHOME)/lib64 -L$(CUDAHOME)/nvvm -lcudart 7 | LCOMPUTEFLAGS=--cuda-gpu-arch=sm_$(CAPABILITY) 8 | 9 | all: 10 | #./reorder.sh j2d64pt-reg.cu 11 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j2d64pt.driver.cpp j2d64pt_gold.cpp j2d64pt-orig.cu -o nvcc-orig 12 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=128 common/cuda_header.cu j2d64pt.driver.cpp j2d64pt_gold.cpp j2d64pt-unroll.cu -o nvcc-unroll 13 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j2d64pt.driver.cpp j2d64pt_gold.cpp reordered-a.cu -o nvcc-reorder-a 14 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j2d64pt.driver.cpp j2d64pt_gold.cpp reordered-b.cu -o 
nvcc-reorder-b 15 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j2d64pt.driver.cpp j2d64pt_gold.cpp reordered-c.cu -o nvcc-reorder-c 16 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j2d64pt.driver.cpp j2d64pt_gold.cpp reordered-d.cu -o nvcc-reorder-d 17 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j2d64pt.driver.cpp j2d64pt_gold.cpp reordered-e.cu -o nvcc-reorder-e 18 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j2d64pt.driver.cpp j2d64pt_gold.cpp j2d64pt-orig.cu -o llvm-orig 19 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=128 common/cuda_header.cu j2d64pt.driver.cpp j2d64pt_gold.cpp j2d64pt-unroll.cu -o llvm-unroll 20 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j2d64pt.driver.cpp j2d64pt_gold.cpp reordered-a.cu -o llvm-reorder-a 21 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j2d64pt.driver.cpp j2d64pt_gold.cpp reordered-b.cu -o llvm-reorder-b 22 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j2d64pt.driver.cpp j2d64pt_gold.cpp reordered-c.cu -o llvm-reorder-c 23 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j2d64pt.driver.cpp j2d64pt_gold.cpp reordered-d.cu -o llvm-reorder-d 24 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j2d64pt.driver.cpp j2d64pt_gold.cpp reordered-e.cu -o llvm-reorder-e 25 | $(NVPROF) --print-gpu-trace ./nvcc-orig > /dev/null 2>nvcc-orig-results 26 | $(NVPROF) --print-gpu-trace ./nvcc-unroll > /dev/null 2>nvcc-unroll-results 27 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-a > /dev/null 2>nvcc-reorder-results-a 28 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-b > /dev/null 2>nvcc-reorder-results-b 29 | $(NVPROF) --print-gpu-trace 
./nvcc-reorder-c > /dev/null 2>nvcc-reorder-results-c 30 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-d > /dev/null 2>nvcc-reorder-results-d 31 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-e > /dev/null 2>nvcc-reorder-results-e 32 | $(NVPROF) --print-gpu-trace ./llvm-orig > /dev/null 2>llvm-orig-results 33 | $(NVPROF) --print-gpu-trace ./llvm-unroll > /dev/null 2>llvm-unroll-results 34 | $(NVPROF) --print-gpu-trace ./llvm-reorder-a > /dev/null 2>llvm-reorder-results-a 35 | $(NVPROF) --print-gpu-trace ./llvm-reorder-b > /dev/null 2>llvm-reorder-results-b 36 | $(NVPROF) --print-gpu-trace ./llvm-reorder-c > /dev/null 2>llvm-reorder-results-c 37 | $(NVPROF) --print-gpu-trace ./llvm-reorder-d > /dev/null 2>llvm-reorder-results-d 38 | $(NVPROF) --print-gpu-trace ./llvm-reorder-e > /dev/null 2>llvm-reorder-results-e 39 | ./common/time.awk 40 | 41 | clean: 42 | rm test nvcc-* llvm-* *.idsl stencils stencilnames unrollfactors 2>/dev/null || true 43 | -------------------------------------------------------------------------------- /examples/j2d81pt/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | NVPROF=nvprof 3 | NOPTFLAGS=-O3 -ccbin=g++ -std=c++11 -Xcompiler "-fPIC -fopenmp -O3 -fno-strict-aliasing" --use_fast_math -Xptxas "-dlcm=ca" 4 | NCOMPUTEFLAGS=-gencode arch=compute_$(CAPABILITY),code=sm_$(CAPABILITY) 5 | CLANG=clang++ 6 | LOPTFLAGS=-O3 -ffp-contract=fast --cuda-path=$(CUDAHOME) -L$(CUDAHOME)/lib64 -L$(CUDAHOME)/nvvm -lcudart 7 | LCOMPUTEFLAGS=--cuda-gpu-arch=sm_$(CAPABILITY) 8 | 9 | all: 10 | #./reorder.sh j2d81pt-reg.cu 11 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j2d81pt.driver.cpp j2d81pt_gold.cpp j2d81pt-orig.cu -o nvcc-orig 12 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=255 common/cuda_header.cu j2d81pt.driver.cpp j2d81pt_gold.cpp j2d81pt-unroll.cu -o nvcc-unroll 13 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu 
j2d81pt.driver.cpp j2d81pt_gold.cpp reordered-a.cu -o nvcc-reorder-a 14 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=128 common/cuda_header.cu j2d81pt.driver.cpp j2d81pt_gold.cpp reordered-b.cu -o nvcc-reorder-b 15 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j2d81pt.driver.cpp j2d81pt_gold.cpp reordered-c.cu -o nvcc-reorder-c 16 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=128 common/cuda_header.cu j2d81pt.driver.cpp j2d81pt_gold.cpp reordered-d.cu -o nvcc-reorder-d 17 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=128 common/cuda_header.cu j2d81pt.driver.cpp j2d81pt_gold.cpp reordered-e.cu -o nvcc-reorder-e 18 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j2d81pt.driver.cpp j2d81pt_gold.cpp j2d81pt-orig.cu -o llvm-orig 19 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=255 common/cuda_header.cu j2d81pt.driver.cpp j2d81pt_gold.cpp j2d81pt-unroll.cu -o llvm-unroll 20 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j2d81pt.driver.cpp j2d81pt_gold.cpp reordered-a.cu -o llvm-reorder-a 21 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=128 common/cuda_header.cu j2d81pt.driver.cpp j2d81pt_gold.cpp reordered-b.cu -o llvm-reorder-b 22 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j2d81pt.driver.cpp j2d81pt_gold.cpp reordered-c.cu -o llvm-reorder-c 23 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=128 common/cuda_header.cu j2d81pt.driver.cpp j2d81pt_gold.cpp reordered-d.cu -o llvm-reorder-d 24 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=128 common/cuda_header.cu j2d81pt.driver.cpp j2d81pt_gold.cpp reordered-e.cu -o llvm-reorder-e 25 | $(NVPROF) --print-gpu-trace ./nvcc-orig > /dev/null 2>nvcc-orig-results 26 | $(NVPROF) --print-gpu-trace ./nvcc-unroll > /dev/null 2>nvcc-unroll-results 27 | 
$(NVPROF) --print-gpu-trace ./nvcc-reorder-a > /dev/null 2>nvcc-reorder-results-a 28 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-b > /dev/null 2>nvcc-reorder-results-b 29 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-c > /dev/null 2>nvcc-reorder-results-c 30 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-d > /dev/null 2>nvcc-reorder-results-d 31 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-e > /dev/null 2>nvcc-reorder-results-e 32 | $(NVPROF) --print-gpu-trace ./llvm-orig > /dev/null 2>llvm-orig-results 33 | $(NVPROF) --print-gpu-trace ./llvm-unroll > /dev/null 2>llvm-unroll-results 34 | $(NVPROF) --print-gpu-trace ./llvm-reorder-a > /dev/null 2>llvm-reorder-results-a 35 | $(NVPROF) --print-gpu-trace ./llvm-reorder-b > /dev/null 2>llvm-reorder-results-b 36 | $(NVPROF) --print-gpu-trace ./llvm-reorder-c > /dev/null 2>llvm-reorder-results-c 37 | $(NVPROF) --print-gpu-trace ./llvm-reorder-d > /dev/null 2>llvm-reorder-results-d 38 | $(NVPROF) --print-gpu-trace ./llvm-reorder-e > /dev/null 2>llvm-reorder-results-e 39 | ./common/time.awk 40 | 41 | clean: 42 | rm test nvcc-* llvm-* *.idsl stencils stencilnames unrollfactors 2>/dev/null || true 43 | -------------------------------------------------------------------------------- /examples/j3d27pt/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | NVPROF=nvprof 3 | NOPTFLAGS=-O3 -ccbin=g++ -std=c++11 -Xcompiler "-fPIC -fopenmp -O3 -fno-strict-aliasing" --use_fast_math -Xptxas "-dlcm=ca" 4 | NCOMPUTEFLAGS=-gencode arch=compute_$(CAPABILITY),code=sm_$(CAPABILITY) 5 | CLANG=clang++ 6 | LOPTFLAGS=-O3 -ffp-contract=fast --cuda-path=$(CUDAHOME) -L$(CUDAHOME)/lib64 -L$(CUDAHOME)/nvvm -lcudart 7 | LCOMPUTEFLAGS=--cuda-gpu-arch=sm_$(CAPABILITY) 8 | 9 | all: 10 | #./reorder.sh j3d27pt-reg.cu 11 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=48 common/cuda_header.cu j3d27pt.driver.cpp j3d27pt_gold.cpp j3d27pt-orig.cu -o nvcc-orig 12 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) 
-maxrregcount=128 common/cuda_header.cu j3d27pt.driver.cpp j3d27pt_gold.cpp j3d27pt-unroll.cu -o nvcc-unroll 13 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j3d27pt.driver.cpp j3d27pt_gold.cpp reordered-a.cu -o nvcc-reorder-a 14 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=128 common/cuda_header.cu j3d27pt.driver.cpp j3d27pt_gold.cpp reordered-b.cu -o nvcc-reorder-b 15 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j3d27pt.driver.cpp j3d27pt_gold.cpp reordered-c.cu -o nvcc-reorder-c 16 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=128 common/cuda_header.cu j3d27pt.driver.cpp j3d27pt_gold.cpp reordered-d.cu -o nvcc-reorder-d 17 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=128 common/cuda_header.cu j3d27pt.driver.cpp j3d27pt_gold.cpp reordered-e.cu -o nvcc-reorder-e 18 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j3d27pt.driver.cpp j3d27pt_gold.cpp j3d27pt-orig.cu -o llvm-orig 19 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=255 common/cuda_header.cu j3d27pt.driver.cpp j3d27pt_gold.cpp j3d27pt-unroll.cu -o llvm-unroll 20 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j3d27pt.driver.cpp j3d27pt_gold.cpp reordered-a.cu -o llvm-reorder-a 21 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=128 common/cuda_header.cu j3d27pt.driver.cpp j3d27pt_gold.cpp reordered-b.cu -o llvm-reorder-b 22 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j3d27pt.driver.cpp j3d27pt_gold.cpp reordered-c.cu -o llvm-reorder-c 23 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=128 common/cuda_header.cu j3d27pt.driver.cpp j3d27pt_gold.cpp reordered-d.cu -o llvm-reorder-d 24 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=128 common/cuda_header.cu j3d27pt.driver.cpp j3d27pt_gold.cpp 
reordered-e.cu -o llvm-reorder-e 25 | $(NVPROF) --print-gpu-trace ./nvcc-orig > /dev/null 2>nvcc-orig-results 26 | $(NVPROF) --print-gpu-trace ./nvcc-unroll > /dev/null 2>nvcc-unroll-results 27 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-a > /dev/null 2>nvcc-reorder-results-a 28 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-b > /dev/null 2>nvcc-reorder-results-b 29 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-c > /dev/null 2>nvcc-reorder-results-c 30 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-d > /dev/null 2>nvcc-reorder-results-d 31 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-e > /dev/null 2>nvcc-reorder-results-e 32 | $(NVPROF) --print-gpu-trace ./llvm-orig > /dev/null 2>llvm-orig-results 33 | $(NVPROF) --print-gpu-trace ./llvm-unroll > /dev/null 2>llvm-unroll-results 34 | $(NVPROF) --print-gpu-trace ./llvm-reorder-a > /dev/null 2>llvm-reorder-results-a 35 | $(NVPROF) --print-gpu-trace ./llvm-reorder-b > /dev/null 2>llvm-reorder-results-b 36 | $(NVPROF) --print-gpu-trace ./llvm-reorder-c > /dev/null 2>llvm-reorder-results-c 37 | $(NVPROF) --print-gpu-trace ./llvm-reorder-d > /dev/null 2>llvm-reorder-results-d 38 | $(NVPROF) --print-gpu-trace ./llvm-reorder-e > /dev/null 2>llvm-reorder-results-e 39 | ./common/time.awk 40 | 41 | clean: 42 | rm test nvcc-* llvm-* *.idsl stencils stencilnames unrollfactors 2>/dev/null || true 43 | -------------------------------------------------------------------------------- /examples/rhs4th3fort-maxfuse/reorder.sh: --------------------------------------------------------------------------------
#!/bin/bash
#
# Usage: ./reorder.sh <annotated-source.cu>
#
# Extracts every region delimited by "#pragma begin <name> ..." and
# "#pragma end <name>" from the given source, runs the code generator
# (../../test) on each region under several option combinations, and splices
# the generated code back into one output file per combination:
#
#   reordered-a.cu  heuristic 0, distribute-rhs true,  topo-sort false
#   reordered-b.cu  heuristic 0, distribute-rhs true,  topo-sort true
#   reordered-e.cu  heuristic 1, distribute-rhs true,  topo-sort false
#   reordered-g.cu  heuristic 1, distribute-rhs true,  topo-sort true
#   reordered-i.cu  heuristic 1, distribute-rhs false, topo-sort false
#   reordered-k.cu  heuristic 1, distribute-rhs false, topo-sort true
#
# reordered-f/h/j/l.cu receive orig_<name>.cu instead of <name>.cu after the
# e/g/i/k runs respectively (orig_<name>.cu is presumably written by
# ../../test next to the --out-file; verify against the generator).
#
# Scratch files left in the current directory: stencils, stencilnames,
# unrollfactors, <name>.idsl, <name>.cu.

if [ "$#" -lt 1 ]; then
	# Without a file argument awk/sed would silently wait on stdin.
	echo "usage: $0 <annotated-source.cu>" >&2
	exit 1
fi

src="$1"
# Output variants, in the order the original script produced them.
variants=(a b e f g h i j k l)

# All pragma-delimited regions, markers included.
awk '/#pragma begin/,/#pragma end/' "$src" > stencils

# Each output starts as the source with the region bodies removed but the
# begin/end marker lines kept (they are the splice anchors used below).
for v in "${variants[@]}"
do
	sed '/#pragma begin/,/#pragma end/{//!d}' "$src" > "reordered-$v.cu"
done

# Field 3 of a begin line is the region name; field 5 of a line mentioning
# "unroll" is its unroll specification.
awk '/#pragma begin/{print $3}' stencils > stencilnames
awk '/unroll/{print $5}' stencils > unrollfactors

# generate HEURISTIC DISTRIBUTE_RHS TOPO_SORT
# Runs the generator on $name.idsl with unroll spec $uf, writing $name.cu.
generate () {
	../../test "$name.idsl" --out-file "$name.cu" --unroll "$uf" --heuristic "$1" --distribute-rhs "$2" --topo-sort "$3" --split false
}

# splice FILE VARIANT
# Inserts FILE after the "#pragma begin $name" line of reordered-VARIANT.cu.
splice () {
	sed -i '/#pragma begin '"$name"'/r '"$1" "reordered-$2.cu"
}

while read -r name
do
	# Unroll specs are consumed front-to-back, one per region name.
	uf=$(awk 'NR==1' unrollfactors)
	sed -i '1d' unrollfactors
	awk '/#pragma begin '"$name"'/{flag=1;next} /#pragma end '"$name"'/{flag=0} flag' stencils > "$name.idsl"

	generate 0 true false
	splice "$name.cu" a
	generate 0 true true
	splice "$name.cu" b

	generate 1 true false
	splice "$name.cu" e
	splice "orig_$name.cu" f
	generate 1 true true
	splice "$name.cu" g
	splice "orig_$name.cu" h
	generate 1 false false
	splice "$name.cu" i
	splice "orig_$name.cu" j
	generate 1 false true
	splice "$name.cu" k
	splice "orig_$name.cu" l

done < stencilnames

# Drop the marker lines now that every region has been spliced in.
for v in "${variants[@]}"
do
	sed -i '/#pragma begin stencil/d' "reordered-$v.cu"
	sed -i '/#pragma end stencil/d' "reordered-$v.cu"
	#indent -kr -i8 "reordered-$v.cu"
done
#rm *~
-------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "grammar.hpp" 8 | #include "codegen.hpp" 9 | 10 | using namespace std; 11 | 12 | start_node * grammar::start = NULL; 13 | 14 | int main (int 
argc, char **argv) { 15 | string outfile ("--out-file"); 16 | string out_name ("out.cu"); 17 | string datatype ("--datatype"); 18 | string data_type ("double"); 19 | string unroll ("--unroll"); 20 | map unroll_decls; 21 | string dist_rhs ("--distribute-rhs"); 22 | bool distribute_rhs = true; 23 | string heuristic_used ("--heuristic"); 24 | int heuristic = 0; 25 | string split_accs ("--split"); 26 | bool split = false; 27 | string top_sort ("--topo-sort"); 28 | bool topo_sort = false; 29 | 30 | if (DEBUG) printf ("filename : %s\n", argv[1]); 31 | for (int i=2; igenerate_code (reorder, original, unroll_decls, gdata_type, heuristic, distribute_rhs, split, topo_sort); 107 | original_out << original.rdbuf (); 108 | reorder_out << reorder.rdbuf (); 109 | original_out.close (); 110 | reorder_out.close (); 111 | fclose (in); 112 | return 0; 113 | } 114 | -------------------------------------------------------------------------------- /examples/j3d125pt-new/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | NVPROF=nvprof 3 | NOPTFLAGS=-O3 -ccbin=g++ -std=c++11 -Xcompiler "-fPIC -fopenmp -O3 -fno-strict-aliasing" --use_fast_math -Xptxas "-dlcm=ca" 4 | NCOMPUTEFLAGS=-gencode arch=compute_$(CAPABILITY),code=sm_$(CAPABILITY) 5 | CLANG=clang++ 6 | LOPTFLAGS=-O3 -ffp-contract=fast --cuda-path=$(CUDAHOME) -L$(CUDAHOME)/lib64 -L$(CUDAHOME)/nvvm -lcudart 7 | LCOMPUTEFLAGS=--cuda-gpu-arch=sm_$(CAPABILITY) 8 | 9 | all: 10 | #./reorder.sh j3d125pt-reg.cu 11 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=128 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp j3d125pt-orig.cu -o nvcc-orig 12 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=255 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp j3d125pt-unroll.cu -o nvcc-unroll 13 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-a.cu -o nvcc-reorder-a 14 | $(NVCC) 
$(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=128 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-b.cu -o nvcc-reorder-b 15 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=64 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-c.cu -o nvcc-reorder-c 16 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=128 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-d.cu -o nvcc-reorder-d 17 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=128 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-e.cu -o nvcc-reorder-e 18 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=128 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp j3d125pt-orig.cu -o llvm-orig 19 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=255 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp j3d125pt-unroll.cu -o llvm-unroll 20 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-a.cu -o llvm-reorder-a 21 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=128 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-b.cu -o llvm-reorder-b 22 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=64 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-c.cu -o llvm-reorder-c 23 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=128 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-d.cu -o llvm-reorder-d 24 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=128 common/cuda_header.cu j3d125pt.driver.cpp j3d125pt_gold.cpp reordered-e.cu -o llvm-reorder-e 25 | $(NVPROF) --print-gpu-trace ./nvcc-orig > /dev/null 2>nvcc-orig-results 26 | $(NVPROF) --print-gpu-trace ./nvcc-unroll > /dev/null 2>nvcc-unroll-results 27 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-a > /dev/null 
2>nvcc-reorder-results-a 28 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-b > /dev/null 2>nvcc-reorder-results-b 29 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-c > /dev/null 2>nvcc-reorder-results-c 30 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-d > /dev/null 2>nvcc-reorder-results-d 31 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-e > /dev/null 2>nvcc-reorder-results-e 32 | $(NVPROF) --print-gpu-trace ./llvm-orig > /dev/null 2>llvm-orig-results 33 | $(NVPROF) --print-gpu-trace ./llvm-unroll > /dev/null 2>llvm-unroll-results 34 | $(NVPROF) --print-gpu-trace ./llvm-reorder-a > /dev/null 2>llvm-reorder-results-a 35 | $(NVPROF) --print-gpu-trace ./llvm-reorder-b > /dev/null 2>llvm-reorder-results-b 36 | $(NVPROF) --print-gpu-trace ./llvm-reorder-c > /dev/null 2>llvm-reorder-results-c 37 | $(NVPROF) --print-gpu-trace ./llvm-reorder-d > /dev/null 2>llvm-reorder-results-d 38 | $(NVPROF) --print-gpu-trace ./llvm-reorder-e > /dev/null 2>llvm-reorder-results-e 39 | ./common/time.awk 40 | 41 | clean: 42 | rm test nvcc-* llvm-* *.idsl stencils stencilnames unrollfactors 2>/dev/null || true 43 | -------------------------------------------------------------------------------- /examples/j3d125pt/j3d125pt-orig.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cuda.h" 3 | #define max(x,y) ((x) > (y)? (x) : (y)) 4 | #define min(x,y) ((x) < (y)? (x) : (y)) 5 | #define ceil(a,b) ((a) % (b) == 0 ? 
(a) / (b) : ((a) / (b)) + 1)

/*
 * Aborts the program if the CUDA runtime has a pending error.
 *
 * Reads (and clears) the last runtime error with cudaGetLastError(); on
 * anything other than cudaSuccess it prints `message` together with the
 * runtime's error string and exits with status -1.
 */
void check_error (const char* message) {
    cudaError_t error = cudaGetLastError ();
    if (error != cudaSuccess) {
        printf ("CUDA error : %s, %s\n", message, cudaGetErrorString (error));
        exit(-1);
    }
}

/*
 * 3D stencil kernel: each thread computes one output point out[k][j][i].
 *
 * Launch layout (see host_code): 3D grid of 3D blocks; thread (x,y,z) maps to
 * (i,j,k) with an offset of 2 in every dimension, so the first two planes on
 * each low side are never written. Points with any index above N-3 are
 * skipped.
 *
 * Preconditions:
 *   - t_in / t_out are device pointers.
 *   - Both buffers are indexed as [..][516][516], i.e. the row and plane
 *     pitch is hard-coded to 516 regardless of N. host_code allocates
 *     N*N*N doubles, so the two only agree when N == 516.
 *
 * NOTE(review): the weighted sums read planes k-2 .. k+1 only; the k+2 plane
 * is never touched and several k-2 groups are added twice. Left exactly as
 * in the original - confirm against the matching gold implementation before
 * changing it.
 */
__global__ void j3d125pt (double * __restrict__ t_in, double * __restrict__ t_out, int N) {
    // Determine this thread's (i, j, k) coordinates, offset by the halo of 2.
    int i0 = (int)(blockIdx.x)*(int)(blockDim.x) + 2;
    int i = max(i0,2) + (int)(threadIdx.x);
    int j0 = (int)(blockIdx.y)*(int)(blockDim.y) + 2;
    int j = max(j0,2) + (int)(threadIdx.y);
    int k0 = (int)(blockIdx.z)*(int)(blockDim.z) + 2;
    int k = max(k0,2) + (int)(threadIdx.z);

    // View the flat buffers as 3D arrays with fixed 516 x 516 planes.
    double (*in)[516][516] = (double (*)[516][516])t_in;
    double (*out)[516][516] = (double (*)[516][516])t_out;

    // Logical AND throughout (the original mixed in a bitwise '&').
    if (i<=N-3 && j<=N-3 && k<=N-3) {
        out[k][j][i] =
            0.75 * (in[k-2][j-2][i-2] + in[k-2][j-2][i+2] + in[k-2][j+2][i-2] + in[k-2][j+2][i+2] + in[k-1][j-1][i-1] + in[k-1][j-1][i+1] + in[k-1][j+1][i-1] + in[k-1][j+1][i+1] +
                in[k][j-1][i] + in[k][j][i-1] + in[k][j][i+1] + in[k][j+1][i] +
                in[k+1][j-1][i-1] + in[k+1][j-1][i+1] + in[k+1][j+1][i-1] + in[k+1][j+1][i+1]) +
            0.76 * (in[k-2][j-2][i-2] + in[k-2][j-2][i+2] + in[k-2][j+2][i-2] + in[k-2][j+2][i+2]) +

            1.132 * (in[k-2][j-2][i-1] + in[k-2][j-2][i+1] + in[k-2][j-1][i-2] + in[k-2][j-1][i+2] + in[k-2][j][i] + in[k-2][j+1][i-2] + in[k-2][j+1][i+2] + in[k-2][j+2][i-1] + in[k-2][j+2][i+1] +
                in[k-1][j-2][i-2] + in[k-1][j-2][i+2] + in[k-1][j+2][i-2] + in[k-1][j+2][i+2] +
                in[k][j-2][i] + in[k][j][i-2] + in[k][j][i+2] + in[k][j+2][i] +
                in[k+1][j-2][i-2] + in[k+1][j-2][i+2] + in[k+1][j+2][i-2] + in[k+1][j+2][i+2] +
                in[k-2][j-2][i-1] + in[k-2][j-2][i+1] + in[k-2][j-1][i-2] + in[k-2][j-1][i+2] + in[k-2][j][i] + in[k-2][j+1][i-2] + in[k-2][j+1][i+2] + in[k-2][j+2][i-1] + in[k-2][j+2][i+1]) +

            0.217 * (in[k-2][j-2][i] + in[k-2][j][i-2] + in[k-2][j][i+2] + in[k-2][j+2][i] +
                in[k-1][j-1][i] + in[k-1][j][i-1] + in[k-1][j][i+1] + in[k-1][j+1][i] +
                in[k][j-2][i-2] + in[k][j-2][i+2] + in[k][j+2][i-2] + in[k][j+2][i+2] +
                in[k+1][j-1][i] + in[k+1][j][i-1] + in[k+1][j][i+1] + in[k+1][j+1][i] +
                in[k-2][j-2][i] + in[k-2][j][i-2] + in[k-2][j][i+2] + in[k-2][j+2][i]) +

            2.13 * (in[k-2][j-1][i] + in[k-2][j][i-1] + in[k-2][j][i+1] + in[k-2][j+1][i] +
                in[k-1][j-2][i] + in[k-1][j][i-2] + in[k-1][j][i+2] + in[k-1][j+2][i] +
                in[k][j-2][i-1] + in[k][j-2][i+1] + in[k][j-1][i-2] + in[k][j-1][i+2] + in[k][j][i] + in[k][j+1][i-2] + in[k][j+1][i+2] + in[k][j+2][i-1] + in[k][j+2][i+1] +
                in[k+1][j-2][i] + in[k+1][j][i-2] + in[k+1][j][i+2] + in[k+1][j+2][i] +
                in[k-2][j-1][i] + in[k-2][j][i-1] + in[k-2][j][i+1] + in[k-2][j+1][i]) +

            0.331 * (in[k-2][j-1][i-1] + in[k-2][j-1][i+1] + in[k-2][j+1][i-1] + in[k-2][j+1][i+1] +
                in[k-1][j-2][i-1] + in[k-1][j-2][i+1] + in[k-1][j-1][i-2] + in[k-1][j-1][i+2] + in[k-1][j][i] + in[k-1][j+1][i-2] + in[k-1][j+1][i+2] + in[k-1][j+2][i-1] + in[k-1][j+2][i+1] +
                in[k][j-1][i-1] + in[k][j-1][i+1] + in[k][j+1][i-1] + in[k][j+1][i+1] +
                in[k+1][j-2][i-1] + in[k+1][j-2][i+1] + in[k+1][j-1][i-2] + in[k+1][j-1][i+2] + in[k+1][j][i] + in[k+1][j+1][i-2] + in[k+1][j+1][i+2] + in[k+1][j+2][i-1] + in[k+1][j+2][i+1]) +
            0.332*(in[k-2][j-1][i-1] + in[k-2][j-1][i+1] + in[k-2][j+1][i-1] + in[k-2][j+1][i+1]);
    }
}

/*
 * Host entry point: runs the j3d125pt kernel once on an N x N x N grid.
 *
 * h_in  - host buffer of N*N*N doubles, copied to the device.
 * h_out - host buffer of N*N*N doubles, overwritten with the device result.
 *         The device output buffer is not cleared first, so points the kernel
 *         does not write come back with unspecified contents.
 * N     - edge length; the kernel's fixed 516 pitch means N is expected to be
 *         516.
 *
 * Every runtime call and the kernel launch are followed by check_error, which
 * terminates the process on failure.
 */
extern "C" void host_code (double *h_in, double *h_out, int N) {
    double *in;
    cudaMalloc (&in, sizeof(double)*N*N*N);
    check_error ("Failed to allocate device memory for in\n");
    cudaMemcpy (in, h_in, sizeof(double)*N*N*N, cudaMemcpyHostToDevice);
    check_error ("Failed to copy in to device memory");
    double *out;
    cudaMalloc (&out, sizeof(double)*N*N*N);
    check_error ("Failed to allocate device memory for out\n");

    // One thread per interior point: N-4 points per dimension, rounded up to
    // whole blocks.
    dim3 blockconfig (16, 4, 4);
    dim3 gridconfig (ceil(N-4, blockconfig.x), ceil(N-4, blockconfig.y), ceil(N-4, blockconfig.z));

    j3d125pt<<<gridconfig, blockconfig>>> (in, out, N);
    // Launches do not return a status; bad configurations surface here.
    check_error ("Failed to launch j3d125pt kernel");

    // Blocking copy: also waits for the kernel and reports execution faults.
    cudaMemcpy (h_out, out, sizeof(double)*N*N*N, cudaMemcpyDeviceToHost);
    check_error ("Failed to copy out back to host memory");

    cudaFree (in);
    cudaFree (out);
}
-------------------------------------------------------------------------------- /examples/hypterm-maxfuse/hypterm.driver.cpp: -------------------------------------------------------------------------------- 1 | #include "common/common.hpp" 2 | #include 3 | #include 4 | 5 | extern "C" void hypterm_gold (double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, int); 6 | extern "C" void host_code (double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double, double, double, int, int, int); 7 | 8 | int main(int argc, char** argv) { 9 | int N = 308; 10 | 11 | double (*cons_1)[308][308] = (double (*)[308][308]) getRandom3DArray(308, 308, 308); 12 | double (*cons_2)[308][308] = (double (*)[308][308]) getRandom3DArray(308, 308, 308); 13 | double (*cons_3)[308][308] = (double (*)[308][308]) getRandom3DArray(308, 308, 308); 14 | double (*cons_4)[308][308] = (double (*)[308][308]) getRandom3DArray(308, 308, 308); 15 | double (*q_1)[308][308] = (double (*)[308][308]) getRandom3DArray(308, 308, 308); 16 | double (*q_2)[308][308] = (double (*)[308][308]) getRandom3DArray(308, 308, 308); 17 | double (*q_3)[308][308] = (double (*)[308][308]) getRandom3DArray(308, 308, 308); 18 | double (*q_4)[308][308] = (double (*)[308][308]) getRandom3DArray(308, 308, 308); 19 | double (*flux_0)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 20 | double (*flux_1)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 21 | double (*flux_2)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 22 | double (*flux_3)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 23 | double (*flux_4)[308][308] = (double (*)[308][308]) 
getZero3DArray(308, 308, 308); 24 | double (*flux_gold_0)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 25 | double (*flux_gold_1)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 26 | double (*flux_gold_2)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 27 | double (*flux_gold_3)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 28 | double (*flux_gold_4)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 29 | double (*dxinv) = (double*) malloc (sizeof (double) * 3); 30 | dxinv[0] = 0.01f; 31 | dxinv[1] = 0.02f; 32 | dxinv[2] = 0.03f; 33 | 34 | hypterm_gold ((double*)flux_gold_0, (double*)flux_gold_1, (double*)flux_gold_2, (double*)flux_gold_3, (double*)flux_gold_4, (double*)cons_1, (double*)cons_2, (double*)cons_3, (double*)cons_4, (double*)q_1, (double*)q_2, (double*)q_3, (double*)q_4, dxinv, N); 35 | host_code ((double*)flux_0, (double*)flux_1, (double*)flux_2, (double*)flux_3, (double*)flux_4, (double*)cons_1, (double*)cons_2, (double*)cons_3, (double*)cons_4, (double*)q_1, (double*)q_2, (double*)q_3, (double*)q_4, dxinv[0], dxinv[1], dxinv[2], N, N, N); 36 | 37 | double error_0 = checkError3D (N, N, (double*)flux_0, (double*)flux_gold_0, 4, N-4, 4, N-4, 4, N-4); 38 | printf("[Test] RMS Error : %e\n",error_0); 39 | if (error_0 > TOLERANCE) 40 | return -1; 41 | double error_1 = checkError3D (N, N, (double*)flux_1, (double*)flux_gold_1, 4, N-4, 4, N-4, 4, N-4); 42 | printf("[Test] RMS Error : %e\n",error_1); 43 | if (error_1 > TOLERANCE) 44 | return -1; 45 | double error_2 = checkError3D (N, N, (double*)flux_2, (double*)flux_gold_2, 4, N-4, 4, N-4, 4, N-4); 46 | printf("[Test] RMS Error : %e\n",error_2); 47 | if (error_2 > TOLERANCE) 48 | return -1; 49 | double error_3 = checkError3D (N, N, (double*)flux_3, (double*)flux_gold_3, 4, N-4, 4, N-4, 4, N-4); 50 | printf("[Test] RMS Error : %e\n",error_3); 51 | if (error_3 > TOLERANCE) 52 | return -1; 53 | double error_4 = 
checkError3D (N, N, (double*)flux_4, (double*)flux_gold_4, 4, N-4, 4, N-4, 4, N-4); 54 | printf("[Test] RMS Error : %e\n",error_4); 55 | if (error_4 > TOLERANCE) 56 | return -1; 57 | 58 | delete[] cons_1; 59 | delete[] cons_2; 60 | delete[] cons_3; 61 | delete[] cons_4; 62 | delete[] q_1; 63 | delete[] q_2; 64 | delete[] q_3; 65 | delete[] q_4; 66 | delete[] flux_0; 67 | delete[] flux_1; 68 | delete[] flux_2; 69 | delete[] flux_3; 70 | delete[] flux_4; 71 | delete[] flux_gold_0; 72 | delete[] flux_gold_1; 73 | delete[] flux_gold_2; 74 | delete[] flux_gold_3; 75 | delete[] flux_gold_4; 76 | } 77 | -------------------------------------------------------------------------------- /examples/j3d125pt-new/j3d125pt-orig.cu: --------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#define max(x,y)    ((x) > (y)? (x) : (y))
#define min(x,y)    ((x) < (y)? (x) : (y))
#define ceil(a,b)   ((a) % (b) == 0 ? (a) / (b) : ((a) / (b)) + 1)

// Reports the most recent CUDA runtime error, if any, and terminates.
//   message : context string printed in front of the CUDA error text.
// cudaGetLastError() also clears the error state, so each call only sees
// errors raised since the previous check.
void check_error (const char* message) {
	cudaError_t error = cudaGetLastError ();
	if (error != cudaSuccess) {
		printf ("CUDA error : %s, %s\n", message, cudaGetErrorString (error));
		exit(-1);
	}
}

// Stencil kernel: every thread computes one output point out[k][j][i] from
// the 5x5x5 neighbourhood of in[k][j][i] (offsets -2..+2 on each axis).
//   t_in  : input volume, read only.
//   t_out : output volume; only points with 2 <= i,j,k <= N-3 are written.
//   N     : extent of the volume along each axis.
// Launch layout (see host_code): one thread per interior point, x -> i,
// y -> j, z -> k.
// The row and plane strides are hard-coded to 516 by the casts below, while
// host_code sizes the buffers as N*N*N, so the indexing only matches the host
// layout when N == 516.
__global__ void j3d125pt (double * __restrict__ t_in, double * __restrict__ t_out, int N) {
	// Determine this thread's point; the +2 skips the two-cell halo.
	int i0 = (int)(blockIdx.x)*(int)(blockDim.x) + 2;
	int i = max(i0,2) + (int)(threadIdx.x);
	int j0 = (int)(blockIdx.y)*(int)(blockDim.y) + 2;
	int j = max(j0,2) + (int)(threadIdx.y);
	int k0 = (int)(blockIdx.z)*(int)(blockDim.z) + 2;
	int k = max(k0,2) + (int)(threadIdx.z);

	double (*in)[516][516] = (double (*)[516][516])t_in;
	double (*out)[516][516] = (double (*)[516][516])t_out;

	// Logical AND on all three bounds; threads past the interior do nothing.
	if (i<=N-3 && j<=N-3 && k<=N-3) {
		out[k][j][i] =
			0.75 * (in[k-2][j-2][i-2] + in[k-2][j-2][i+2] + in[k-2][j+2][i-2] + in[k-2][j+2][i+2] +
				in[k-1][j-1][i-1] + in[k-1][j-1][i+1] + in[k-1][j+1][i-1] + in[k-1][j+1][i+1] +
				in[k][j-1][i] + in[k][j][i-1] + in[k][j][i+1] + in[k][j+1][i] +
				in[k+1][j-1][i-1] + in[k+1][j-1][i+1] + in[k+1][j+1][i-1] + in[k+1][j+1][i+1] +
				in[k+2][j-2][i-2] + in[k+2][j-2][i+2] + in[k+2][j+2][i-2] + in[k+2][j+2][i+2]) +

			1.132 * (in[k-2][j-2][i-1] + in[k-2][j-2][i+1] + in[k-2][j-1][i-2] + in[k-2][j-1][i+2] +
				in[k-2][j][i] + in[k-2][j+1][i-2] + in[k-2][j+1][i+2] + in[k-2][j+2][i-1] + in[k-2][j+2][i+1] +
				in[k-1][j-2][i-2] + in[k-1][j-2][i+2] + in[k-1][j+2][i-2] + in[k-1][j+2][i+2] +
				in[k][j-2][i] + in[k][j][i-2] + in[k][j][i+2] + in[k][j+2][i] +
				in[k+1][j-2][i-2] + in[k+1][j-2][i+2] + in[k+1][j+2][i-2] + in[k+1][j+2][i+2] +
				in[k+2][j-2][i-1] + in[k+2][j-2][i+1] + in[k+2][j-1][i-2] + in[k+2][j-1][i+2] + in[k+2][j][i] +
				in[k+2][j+1][i-2] + in[k+2][j+1][i+2] + in[k+2][j+2][i-1] + in[k+2][j+2][i+1]) +

			0.217 * (in[k-2][j-2][i] + in[k-2][j][i-2] + in[k-2][j][i+2] + in[k-2][j+2][i] +
				in[k-1][j-1][i] + in[k-1][j][i-1] + in[k-1][j][i+1] + in[k-1][j+1][i] +
				in[k][j-2][i-2] + in[k][j-2][i+2] + in[k][j+2][i-2] + in[k][j+2][i+2] +
				in[k+1][j-1][i] + in[k+1][j][i-1] + in[k+1][j][i+1] + in[k+1][j+1][i] +
				in[k+2][j-2][i] + in[k+2][j][i-2] + in[k+2][j][i+2] + in[k+2][j+2][i]) +

			2.13 * (in[k-2][j-1][i] + in[k-2][j][i-1] + in[k-2][j][i+1] + in[k-2][j+1][i] +
				in[k-1][j-2][i] + in[k-1][j][i-2] + in[k-1][j][i+2] + in[k-1][j+2][i] +
				in[k][j-2][i-1] + in[k][j-2][i+1] + in[k][j-1][i-2] + in[k][j-1][i+2] +
				in[k][j][i] + in[k][j+1][i-2] + in[k][j+1][i+2] + in[k][j+2][i-1] + in[k][j+2][i+1] +
				in[k+1][j-2][i] + in[k+1][j][i-2] + in[k+1][j][i+2] + in[k+1][j+2][i] +
				in[k+2][j-1][i] + in[k+2][j][i-1] + in[k+2][j][i+1] + in[k+2][j+1][i]) +

			0.331 * (in[k-2][j-1][i-1] + in[k-2][j-1][i+1] + in[k-2][j+1][i-1] + in[k-2][j+1][i+1] +
				in[k-1][j-2][i-1] + in[k-1][j-2][i+1] + in[k-1][j-1][i-2] + in[k-1][j-1][i+2] + in[k-1][j][i] +
				in[k-1][j+1][i-2] + in[k-1][j+1][i+2] + in[k-1][j+2][i-1] + in[k-1][j+2][i+1] +
				in[k][j-1][i-1] + in[k][j-1][i+1] + in[k][j+1][i-1] + in[k][j+1][i+1] +
				in[k+1][j-2][i-1] + in[k+1][j-2][i+1] + in[k+1][j-1][i-2] + in[k+1][j-1][i+2] + in[k+1][j][i] +
				in[k+1][j+1][i-2] + in[k+1][j+1][i+2] + in[k+1][j+2][i-1] + in[k+1][j+2][i+1] +
				in[k+2][j-1][i-1] + in[k+2][j-1][i+1] + in[k+2][j+1][i-1] + in[k+2][j+1][i+1]);
	}
}

// Host entry point: copies h_in to the device, runs j3d125pt once and copies
// the result back into h_out.
//   h_in  : host input volume of N*N*N doubles.
//   h_out : host output volume of N*N*N doubles; the device buffer is not
//           cleared, so cells the kernel does not write come back undefined.
//   N     : extent along each axis (see the stride note on the kernel).
// Any CUDA failure is reported through check_error, which exits.
extern "C" void host_code (double *h_in, double *h_out, int N) {
	double *in;
	cudaMalloc (&in, sizeof(double)*N*N*N);
	check_error ("Failed to allocate device memory for in\n");
	cudaMemcpy (in, h_in, sizeof(double)*N*N*N, cudaMemcpyHostToDevice);
	check_error ("Failed to copy in to the device\n");
	double *out;
	cudaMalloc (&out, sizeof(double)*N*N*N);
	check_error ("Failed to allocate device memory for out\n");

	// One thread per interior point: N-4 points per axis, rounded up to
	// whole blocks.
	dim3 blockconfig (16, 4, 4);
	dim3 gridconfig (ceil(N-4, blockconfig.x), ceil(N-4, blockconfig.y), ceil(N-4, blockconfig.z));

	j3d125pt<<<gridconfig, blockconfig>>> (in, out, N);
	// A bad launch configuration is only visible through cudaGetLastError.
	check_error ("Failed to launch j3d125pt\n");

	// The blocking copy also surfaces faults raised while the kernel ran.
	cudaMemcpy (h_out, out, sizeof(double)*N*N*N, cudaMemcpyDeviceToHost);
	check_error ("Failed to copy out back to the host\n");

	cudaFree (in);
	cudaFree (out);
}
-------------------------------------------------------------------------------- /examples/j3d125pt/j3d125pt-reg.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cuda.h" 3 | #define max(x,y) ((x) > (y)? (x) : (y)) 4 | #define min(x,y) ((x) < (y)? (x) : (y)) 5 | #define ceil(a,b) ((a) % (b) == 0 ? 
(a) / (b) : ((a) / (b)) + 1)

// Reports the most recent CUDA runtime error, if any, and terminates.
//   message : context string printed in front of the CUDA error text.
// cudaGetLastError() also clears the error state, so each call only sees
// errors raised since the previous check.
void check_error (const char* message) {
	cudaError_t error = cudaGetLastError ();
	if (error != cudaSuccess) {
		printf ("CUDA error : %s, %s\n", message, cudaGetErrorString (error));
		exit(-1);
	}
}

// Stencil kernel in its annotated, pre-generation form.
//   t_in  : input volume, read only.
//   t_out : output volume.
//   N     : extent of the volume along each axis.
// Thread mapping: x -> i and z -> k advance by one point per thread, while
// y -> j advances by four (both the block offset and threadIdx.y are scaled
// by 4), matching the "unroll k=1,j=4,i=1" factors on the pragma line.
// As written only out[k][j][i] is stored, i.e. one j row out of every four.
// NOTE(review): presumably a reorder script replaces the region between the
// two pragmas with code covering rows j..j+3 -- confirm against the generator.
// The guard tests j only, not j+3.
// The row and plane strides are hard-coded to 516 by the casts below.
__global__ void j3d125pt (double * __restrict__ t_in, double * __restrict__ t_out, int N) {
	// Determine this thread's point; the +2 skips the two-cell halo.
	int i0 = (int)(blockIdx.x)*(int)(blockDim.x) + 2;
	int i = max(i0,2) + (int)(threadIdx.x);
	int j0 = 4*(int)(blockIdx.y)*(int)(blockDim.y) + 2;
	int j = max(j0,2) + 4*(int)(threadIdx.y);
	int k0 = (int)(blockIdx.z)*(int)(blockDim.z) + 2;
	int k = max(k0,2) + (int)(threadIdx.z);

	double (*in)[516][516] = (double (*)[516][516])t_in;
	double (*out)[516][516] = (double (*)[516][516])t_out;
	if (i>=2 && i<=N-3 && j>=2 && j<=N-3 && k>=2 && k<=N-3) {
#pragma begin stencil1 unroll k=1,j=4,i=1
		out[k][j][i] =
			0.75 * (in[k-2][j-2][i-2] + in[k-2][j-2][i+2] + in[k-2][j+2][i-2] + in[k-2][j+2][i+2] + in[k-1][j-1][i-1] + in[k-1][j-1][i+1] + in[k-1][j+1][i-1] + in[k-1][j+1][i+1] +
				in[k][j-1][i] + in[k][j][i-1] + in[k][j][i+1] + in[k][j+1][i] +
				in[k+1][j-1][i-1] + in[k+1][j-1][i+1] + in[k+1][j+1][i-1] + in[k+1][j+1][i+1]) +
			0.76 * (in[k-2][j-2][i-2] + in[k-2][j-2][i+2] + in[k-2][j+2][i-2] + in[k-2][j+2][i+2]) +

			1.132 * (in[k-2][j-2][i-1] + in[k-2][j-2][i+1] + in[k-2][j-1][i-2] + in[k-2][j-1][i+2] + in[k-2][j][i] + in[k-2][j+1][i-2] + in[k-2][j+1][i+2] + in[k-2][j+2][i-1] + in[k-2][j+2][i+1] +
				in[k-1][j-2][i-2] + in[k-1][j-2][i+2] + in[k-1][j+2][i-2] + in[k-1][j+2][i+2] +
				in[k][j-2][i] + in[k][j][i-2] + in[k][j][i+2] + in[k][j+2][i] +
				in[k+1][j-2][i-2] + in[k+1][j-2][i+2] + in[k+1][j+2][i-2] + in[k+1][j+2][i+2] +
				in[k-2][j-2][i-1] + in[k-2][j-2][i+1] + in[k-2][j-1][i-2] + in[k-2][j-1][i+2] + in[k-2][j][i] + in[k-2][j+1][i-2] + in[k-2][j+1][i+2] + in[k-2][j+2][i-1] + in[k-2][j+2][i+1]) +

			0.217 * (in[k-2][j-2][i] + in[k-2][j][i-2] + in[k-2][j][i+2] + in[k-2][j+2][i] +
				in[k-1][j-1][i] + in[k-1][j][i-1] + in[k-1][j][i+1] + in[k-1][j+1][i] +
				in[k][j-2][i-2] + in[k][j-2][i+2] + in[k][j+2][i-2] + in[k][j+2][i+2] +
				in[k+1][j-1][i] + in[k+1][j][i-1] + in[k+1][j][i+1] + in[k+1][j+1][i] +
				in[k-2][j-2][i] + in[k-2][j][i-2] + in[k-2][j][i+2] + in[k-2][j+2][i]) +

			2.13 * (in[k-2][j-1][i] + in[k-2][j][i-1] + in[k-2][j][i+1] + in[k-2][j+1][i] +
				in[k-1][j-2][i] + in[k-1][j][i-2] + in[k-1][j][i+2] + in[k-1][j+2][i] +
				in[k][j-2][i-1] + in[k][j-2][i+1] + in[k][j-1][i-2] + in[k][j-1][i+2] + in[k][j][i] + in[k][j+1][i-2] + in[k][j+1][i+2] + in[k][j+2][i-1] + in[k][j+2][i+1] +
				in[k+1][j-2][i] + in[k+1][j][i-2] + in[k+1][j][i+2] + in[k+1][j+2][i] +
				in[k-2][j-1][i] + in[k-2][j][i-1] + in[k-2][j][i+1] + in[k-2][j+1][i]) +

			0.331 * (in[k-2][j-1][i-1] + in[k-2][j-1][i+1] + in[k-2][j+1][i-1] + in[k-2][j+1][i+1] +
				in[k-1][j-2][i-1] + in[k-1][j-2][i+1] + in[k-1][j-1][i-2] + in[k-1][j-1][i+2] + in[k-1][j][i] + in[k-1][j+1][i-2] + in[k-1][j+1][i+2] + in[k-1][j+2][i-1] + in[k-1][j+2][i+1] +
				in[k][j-1][i-1] + in[k][j-1][i+1] + in[k][j+1][i-1] + in[k][j+1][i+1] +
				in[k+1][j-2][i-1] + in[k+1][j-2][i+1] + in[k+1][j-1][i-2] + in[k+1][j-1][i+2] + in[k+1][j][i] + in[k+1][j+1][i-2] + in[k+1][j+1][i+2] + in[k+1][j+2][i-1] + in[k+1][j+2][i+1]) +
			0.332*(in[k-2][j-1][i-1] + in[k-2][j-1][i+1] + in[k-2][j+1][i-1] + in[k-2][j+1][i+1]);
#pragma end stencil1
	}
}

// Host entry point: copies h_in to the device, runs j3d125pt once and copies
// the result back into h_out.
//   h_in  : host input volume of N*N*N doubles.
//   h_out : host output volume of N*N*N doubles; the device buffer is not
//           cleared, so cells the kernel does not write come back undefined.
//   N     : extent along each axis (see the stride note on the kernel).
// Any CUDA failure is reported through check_error, which exits.
extern "C" void host_code (double *h_in, double *h_out, int N) {
	double *in;
	cudaMalloc (&in, sizeof(double)*N*N*N);
	check_error ("Failed to allocate device memory for in\n");
	cudaMemcpy (in, h_in, sizeof(double)*N*N*N, cudaMemcpyHostToDevice);
	check_error ("Failed to copy in to the device\n");
	double *out;
	cudaMalloc (&out, sizeof(double)*N*N*N);
	check_error ("Failed to allocate device memory for out\n");

	// Each thread covers four j rows, so the y extent of the grid is
	// divided by 4*blockconfig.y; x and z stay at one point per thread.
	dim3 blockconfig (16, 4, 4);
	dim3 gridconfig (ceil(N-4, blockconfig.x), ceil(N-4, 4*blockconfig.y), ceil(N-4, blockconfig.z));

	j3d125pt<<<gridconfig, blockconfig>>> (in, out, N);
	// A bad launch configuration is only visible through cudaGetLastError.
	check_error ("Failed to launch j3d125pt\n");

	// The blocking copy also surfaces faults raised while the kernel ran.
	cudaMemcpy (h_out, out, sizeof(double)*N*N*N, cudaMemcpyDeviceToHost);
	check_error ("Failed to copy out back to the host\n");

	cudaFree (in);
	cudaFree (out);
}
-------------------------------------------------------------------------------- /examples/derivative-maxfuse/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | NVPROF=nvprof 3 | NOPTFLAGS=-O3 -ccbin=g++ -std=c++11 -Xcompiler "-fPIC -fopenmp -O3 -fno-strict-aliasing" --use_fast_math -Xptxas "-dlcm=ca" 4 | NCOMPUTEFLAGS=-gencode arch=compute_$(CAPABILITY),code=sm_$(CAPABILITY) 5 | CLANG=clang++ 6 | LOPTFLAGS=-O3 -ffp-contract=fast --cuda-path=$(CUDAHOME) -L$(CUDAHOME)/lib64 -L$(CUDAHOME)/nvvm -lcudart 7 | LCOMPUTEFLAGS=--cuda-gpu-arch=sm_$(CAPABILITY) 8 | 9 | all: 10 | #./reorder.sh derivative-reg-maxfuse.cu 11 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp derivative_gold.cpp derivative-orig.cu -o nvcc-orig 12 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp derivative_gold.cpp reordered.cu -o nvcc-reorder 13 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp derivative_gold.cpp reordered-a.cu -o nvcc-reorder-a 14 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp derivative_gold.cpp reordered-b.cu -o nvcc-reorder-b 15 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp derivative_gold.cpp reordered-c.cu -o nvcc-reorder-c 16 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp derivative_gold.cpp reordered-d.cu -o nvcc-reorder-d 17 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp 
derivative_gold.cpp reordered-g.cu -o nvcc-reorder-g 18 | $(NVCC) $(NOPTFLAGS) $(NCOMPUTEFLAGS) -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp derivative_gold.cpp reordered-h.cu -o nvcc-reorder-h 19 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp derivative_gold.cpp derivative-orig.cu -o llvm-orig 20 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp derivative_gold.cpp reordered.cu -o llvm-reorder 21 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp derivative_gold.cpp reordered-a.cu -o llvm-reorder-a 22 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp derivative_gold.cpp reordered-b.cu -o llvm-reorder-b 23 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp derivative_gold.cpp reordered-c.cu -o llvm-reorder-c 24 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp derivative_gold.cpp reordered-d.cu -o llvm-reorder-d 25 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp derivative_gold.cpp reordered-g.cu -o llvm-reorder-g 26 | $(CLANG) $(LOPTFLAGS) $(LCOMPUTEFLAGS) -Xcuda-ptxas -maxrregcount=255 common/cuda_header.cu derivative.driver.cpp derivative_gold.cpp reordered-h.cu -o llvm-reorder-h 27 | $(NVPROF) --print-gpu-trace ./nvcc-orig > /dev/null 2>nvcc-orig-results 28 | $(NVPROF) --print-gpu-trace ./nvcc-reorder > /dev/null 2>nvcc-reorder-results 29 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-a > /dev/null 2>nvcc-reorder-results-a 30 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-b > /dev/null 2>nvcc-reorder-results-b 31 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-c > /dev/null 
2>nvcc-reorder-results-c 32 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-d > /dev/null 2>nvcc-reorder-results-d 33 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-g > /dev/null 2>nvcc-reorder-results-g 34 | $(NVPROF) --print-gpu-trace ./nvcc-reorder-h > /dev/null 2>nvcc-reorder-results-h 35 | $(NVPROF) --print-gpu-trace ./llvm-orig > /dev/null 2>llvm-orig-results 36 | $(NVPROF) --print-gpu-trace ./llvm-reorder > /dev/null 2>llvm-reorder-results 37 | $(NVPROF) --print-gpu-trace ./llvm-reorder-a > /dev/null 2>llvm-reorder-results-a 38 | $(NVPROF) --print-gpu-trace ./llvm-reorder-b > /dev/null 2>llvm-reorder-results-b 39 | $(NVPROF) --print-gpu-trace ./llvm-reorder-c > /dev/null 2>llvm-reorder-results-c 40 | $(NVPROF) --print-gpu-trace ./llvm-reorder-d > /dev/null 2>llvm-reorder-results-d 41 | $(NVPROF) --print-gpu-trace ./llvm-reorder-g > /dev/null 2>llvm-reorder-results-g 42 | $(NVPROF) --print-gpu-trace ./llvm-reorder-h > /dev/null 2>llvm-reorder-results-h 43 | ./common/time.awk 44 | 45 | clean: 46 | rm test nvcc-* llvm-* *.idsl stencils stencilnames unrollfactors 2>/dev/null || true 47 | -------------------------------------------------------------------------------- /examples/hypterm-3/hypterm.driver.cpp: -------------------------------------------------------------------------------- 1 | #include "common/common.hpp" 2 | #include 3 | #include 4 | 5 | extern "C" void hypterm_gold (double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, int); 6 | extern "C" void host_code (double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double, double, double, int, int, int); 7 | 8 | int main(int argc, char** argv) { 9 | int N = 308; 10 | 11 | double (*cons_1)[308][308] = (double (*)[308][308]) getRandom3DArray(308, 308, 308); 12 | double (*cons_2)[308][308] = (double (*)[308][308]) getRandom3DArray(308, 308, 308); 13 | double (*cons_3)[308][308] = 
(double (*)[308][308]) getRandom3DArray(308, 308, 308); 14 | double (*cons_4)[308][308] = (double (*)[308][308]) getRandom3DArray(308, 308, 308); 15 | double (*q_1)[308][308] = (double (*)[308][308]) getRandom3DArray(308, 308, 308); 16 | double (*q_2)[308][308] = (double (*)[308][308]) getRandom3DArray(308, 308, 308); 17 | double (*q_3)[308][308] = (double (*)[308][308]) getRandom3DArray(308, 308, 308); 18 | double (*q_4)[308][308] = (double (*)[308][308]) getRandom3DArray(308, 308, 308); 19 | double (*flux_0)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 20 | double (*flux_1)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 21 | double (*flux_2)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 22 | double (*flux_3)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 23 | double (*flux_4)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 24 | double (*flux_gold_0)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 25 | double (*flux_gold_1)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 26 | double (*flux_gold_2)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 27 | double (*flux_gold_3)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 28 | double (*flux_gold_4)[308][308] = (double (*)[308][308]) getZero3DArray(308, 308, 308); 29 | double (*dxinv) = (double*) malloc (sizeof (double) * 3); 30 | dxinv[0] = 0.01f; 31 | dxinv[1] = 0.02f; 32 | dxinv[2] = 0.03f; 33 | 34 | hypterm_gold ((double*)flux_gold_0, (double*)flux_gold_1, (double*)flux_gold_2, (double*)flux_gold_3, (double*)flux_gold_4, (double*)cons_1, (double*)cons_2, (double*)cons_3, (double*)cons_4, (double*)q_1, (double*)q_2, (double*)q_3, (double*)q_4, dxinv, N); 35 | host_code ((double*)flux_0, (double*)flux_1, (double*)flux_2, (double*)flux_3, (double*)flux_4, (double*)cons_1, (double*)cons_2, (double*)cons_3, (double*)cons_4, (double*)q_1, (double*)q_2, 
(double*)q_3, (double*)q_4, dxinv[0], dxinv[1], dxinv[2], N, N, N); 36 | 37 | printf ("checking flux_0\n"); 38 | double error_0 = checkError3D (N, N, (double*)flux_0, (double*)flux_gold_0, 4, N-4, 4, N-4, 4, N-4); 39 | printf("[Test] RMS Error : %e\n",error_0); 40 | if (error_0 > TOLERANCE) 41 | return -1; 42 | printf ("checking flux_1\n"); 43 | double error_1 = checkError3D (N, N, (double*)flux_1, (double*)flux_gold_1, 4, N-4, 4, N-4, 4, N-4); 44 | printf("[Test] RMS Error : %e\n",error_1); 45 | if (error_1 > TOLERANCE) 46 | return -1; 47 | printf ("checking flux_2\n"); 48 | double error_2 = checkError3D (N, N, (double*)flux_2, (double*)flux_gold_2, 4, N-4, 4, N-4, 4, N-4); 49 | printf("[Test] RMS Error : %e\n",error_2); 50 | if (error_2 > TOLERANCE) 51 | return -1; 52 | printf ("checking flux_3\n"); 53 | double error_3 = checkError3D (N, N, (double*)flux_3, (double*)flux_gold_3, 4, N-4, 4, N-4, 4, N-4); 54 | printf("[Test] RMS Error : %e\n",error_3); 55 | if (error_3 > TOLERANCE) 56 | return -1; 57 | printf ("checking flux_4\n"); 58 | double error_4 = checkError3D (N, N, (double*)flux_4, (double*)flux_gold_4, 4, N-4, 4, N-4, 4, N-4); 59 | printf("[Test] RMS Error : %e\n",error_4); 60 | if (error_4 > TOLERANCE) 61 | return -1; 62 | 63 | delete[] cons_1; 64 | delete[] cons_2; 65 | delete[] cons_3; 66 | delete[] cons_4; 67 | delete[] q_1; 68 | delete[] q_2; 69 | delete[] q_3; 70 | delete[] q_4; 71 | delete[] flux_0; 72 | delete[] flux_1; 73 | delete[] flux_2; 74 | delete[] flux_3; 75 | delete[] flux_4; 76 | delete[] flux_gold_0; 77 | delete[] flux_gold_1; 78 | delete[] flux_gold_2; 79 | delete[] flux_gold_3; 80 | delete[] flux_gold_4; 81 | } 82 | -------------------------------------------------------------------------------- /examples/derivative-2/reorder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | awk '/#pragma begin/,/#pragma end/' $1 > stencils 4 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > 
reordered-a.cu 5 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-b.cu 6 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-c.cu 7 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-d.cu 8 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-e.cu 9 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-f.cu 10 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-g.cu 11 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-h.cu 12 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-i.cu 13 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-j.cu 14 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-k.cu 15 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-l.cu 16 | 17 | awk '/#pragma begin/{print $3}' stencils > stencilnames 18 | awk '/unroll/{print $5}' stencils > unrollfactors 19 | 20 | while read -r name 21 | do 22 | uf=`awk 'NR==1' unrollfactors` 23 | sed -i '1d' unrollfactors 24 | awk '/#pragma begin '"$name"'/{flag=1;next} /#pragma end '"$name"'/{flag=0} flag' stencils > $name.idsl 25 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs true --topo-sort false --split false 26 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-a.cu 27 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs true --topo-sort true --split false 28 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-b.cu 29 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs false --topo-sort false --split false 30 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-c.cu 31 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs false --topo-sort true --split false 32 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-d.cu 33 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs true --topo-sort false --split false 34 
| sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-e.cu 35 | sed -i '/#pragma begin '"$name"'/r orig_'"$name"'.cu' reordered-f.cu 36 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs true --topo-sort true --split false 37 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-g.cu 38 | sed -i '/#pragma begin '"$name"'/r orig_'"$name"'.cu' reordered-h.cu 39 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs false --topo-sort false --split false 40 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-i.cu 41 | sed -i '/#pragma begin '"$name"'/r orig_'"$name"'.cu' reordered-j.cu 42 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs false --topo-sort true --split false 43 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-k.cu 44 | sed -i '/#pragma begin '"$name"'/r orig_'"$name"'.cu' reordered-l.cu 45 | 46 | done < stencilnames 47 | 48 | sed -i '/#pragma begin stencil/d' reordered-a.cu 49 | sed -i '/#pragma end stencil/d' reordered-a.cu 50 | #indent -kr -i8 reordered-a.cu 51 | sed -i '/#pragma begin stencil/d' reordered-b.cu 52 | sed -i '/#pragma end stencil/d' reordered-b.cu 53 | #indent -kr -i8 reordered-b.cu 54 | sed -i '/#pragma begin stencil/d' reordered-c.cu 55 | sed -i '/#pragma end stencil/d' reordered-c.cu 56 | #indent -kr -i8 reordered-c.cu 57 | sed -i '/#pragma begin stencil/d' reordered-d.cu 58 | sed -i '/#pragma end stencil/d' reordered-d.cu 59 | #indent -kr -i8 reordered-d.cu 60 | sed -i '/#pragma begin stencil/d' reordered-e.cu 61 | sed -i '/#pragma end stencil/d' reordered-e.cu 62 | #indent -kr -i8 reordered-e.cu 63 | sed -i '/#pragma begin stencil/d' reordered-f.cu 64 | sed -i '/#pragma end stencil/d' reordered-f.cu 65 | #indent -kr -i8 reordered-f.cu 66 | sed -i '/#pragma begin stencil/d' reordered-g.cu 67 | sed -i '/#pragma end stencil/d' reordered-g.cu 68 | #indent -kr -i8 reordered-g.cu 69 | sed -i 
'/#pragma begin stencil/d' reordered-h.cu 70 | sed -i '/#pragma end stencil/d' reordered-h.cu 71 | #indent -kr -i8 reordered-h.cu 72 | sed -i '/#pragma begin stencil/d' reordered-i.cu 73 | sed -i '/#pragma end stencil/d' reordered-i.cu 74 | #indent -kr -i8 reordered-i.cu 75 | sed -i '/#pragma begin stencil/d' reordered-j.cu 76 | sed -i '/#pragma end stencil/d' reordered-j.cu 77 | #indent -kr -i8 reordered-j.cu 78 | sed -i '/#pragma begin stencil/d' reordered-k.cu 79 | sed -i '/#pragma end stencil/d' reordered-k.cu 80 | #indent -kr -i8 reordered-k.cu 81 | sed -i '/#pragma begin stencil/d' reordered-l.cu 82 | sed -i '/#pragma end stencil/d' reordered-l.cu 83 | #indent -kr -i8 reordered-l.cu 84 | #rm *~ 85 | -------------------------------------------------------------------------------- /examples/hypterm-3/reorder-3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | awk '/#pragma begin/,/#pragma end/' $1 > stencils 4 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-a.cu 5 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-b.cu 6 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-c.cu 7 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-d.cu 8 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-e.cu 9 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-f.cu 10 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-g.cu 11 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-h.cu 12 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-i.cu 13 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-j.cu 14 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-k.cu 15 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-l.cu 16 | 17 | awk '/#pragma begin/{print $3}' stencils > stencilnames 18 | awk '/unroll/{print $5}' stencils > unrollfactors 19 | 20 | while read -r name 21 | do 22 | uf=`awk 'NR==1' unrollfactors` 23 | sed -i '1d' 
unrollfactors 24 | awk '/#pragma begin '"$name"'/{flag=1;next} /#pragma end '"$name"'/{flag=0} flag' stencils > $name.idsl 25 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs true --topo-sort false --split false 26 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-a.cu 27 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs true --topo-sort true --split false 28 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-b.cu 29 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs false --topo-sort false --split false 30 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-c.cu 31 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs false --topo-sort true --split false 32 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-d.cu 33 | 34 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs true --topo-sort false --split false 35 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-e.cu 36 | sed -i '/#pragma begin '"$name"'/r orig_'"$name"'.cu' reordered-f.cu 37 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs true --topo-sort true --split false 38 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-g.cu 39 | sed -i '/#pragma begin '"$name"'/r orig_'"$name"'.cu' reordered-h.cu 40 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs false --topo-sort false --split false 41 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-i.cu 42 | sed -i '/#pragma begin '"$name"'/r orig_'"$name"'.cu' reordered-j.cu 43 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs false --topo-sort true --split false 44 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-k.cu 45 | sed -i '/#pragma begin '"$name"'/r orig_'"$name"'.cu' reordered-l.cu 
46 | 47 | done < stencilnames 48 | 49 | sed -i '/#pragma begin stencil/d' reordered-a.cu 50 | sed -i '/#pragma end stencil/d' reordered-a.cu 51 | #indent -kr -i8 reordered-a.cu 52 | sed -i '/#pragma begin stencil/d' reordered-b.cu 53 | sed -i '/#pragma end stencil/d' reordered-b.cu 54 | #indent -kr -i8 reordered-b.cu 55 | sed -i '/#pragma begin stencil/d' reordered-c.cu 56 | sed -i '/#pragma end stencil/d' reordered-c.cu 57 | #indent -kr -i8 reordered-c.cu 58 | sed -i '/#pragma begin stencil/d' reordered-d.cu 59 | sed -i '/#pragma end stencil/d' reordered-d.cu 60 | #indent -kr -i8 reordered-d.cu 61 | sed -i '/#pragma begin stencil/d' reordered-e.cu 62 | sed -i '/#pragma end stencil/d' reordered-e.cu 63 | #indent -kr -i8 reordered-e.cu 64 | sed -i '/#pragma begin stencil/d' reordered-f.cu 65 | sed -i '/#pragma end stencil/d' reordered-f.cu 66 | #indent -kr -i8 reordered-f.cu 67 | sed -i '/#pragma begin stencil/d' reordered-g.cu 68 | sed -i '/#pragma end stencil/d' reordered-g.cu 69 | #indent -kr -i8 reordered-g.cu 70 | sed -i '/#pragma begin stencil/d' reordered-h.cu 71 | sed -i '/#pragma end stencil/d' reordered-h.cu 72 | #indent -kr -i8 reordered-h.cu 73 | sed -i '/#pragma begin stencil/d' reordered-i.cu 74 | sed -i '/#pragma end stencil/d' reordered-i.cu 75 | #indent -kr -i8 reordered-i.cu 76 | sed -i '/#pragma begin stencil/d' reordered-j.cu 77 | sed -i '/#pragma end stencil/d' reordered-j.cu 78 | #indent -kr -i8 reordered-j.cu 79 | sed -i '/#pragma begin stencil/d' reordered-k.cu 80 | sed -i '/#pragma end stencil/d' reordered-k.cu 81 | #indent -kr -i8 reordered-k.cu 82 | sed -i '/#pragma begin stencil/d' reordered-l.cu 83 | sed -i '/#pragma end stencil/d' reordered-l.cu 84 | #indent -kr -i8 reordered-l.cu 85 | #rm *~ 86 | -------------------------------------------------------------------------------- /examples/hypterm-maxfuse/reorder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | awk '/#pragma 
begin/,/#pragma end/' $1 > stencils 4 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-a.cu 5 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-b.cu 6 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-c.cu 7 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-d.cu 8 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-e.cu 9 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-f.cu 10 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-g.cu 11 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-h.cu 12 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-i.cu 13 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-j.cu 14 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-k.cu 15 | sed '/#pragma begin/,/#pragma end/{//!d}' $1 > reordered-l.cu 16 | 17 | awk '/#pragma begin/{print $3}' stencils > stencilnames 18 | awk '/unroll/{print $5}' stencils > unrollfactors 19 | 20 | while read -r name 21 | do 22 | uf=`awk 'NR==1' unrollfactors` 23 | sed -i '1d' unrollfactors 24 | awk '/#pragma begin '"$name"'/{flag=1;next} /#pragma end '"$name"'/{flag=0} flag' stencils > $name.idsl 25 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs true --topo-sort false --split false 26 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-a.cu 27 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs true --topo-sort true --split false 28 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-b.cu 29 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs false --topo-sort false --split false 30 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-c.cu 31 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 0 --distribute-rhs false --topo-sort true --split false 32 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-d.cu 33 | ../../test $name.idsl --out-file 
$name.cu --unroll $uf --heuristic 1 --distribute-rhs true --topo-sort false --split false 34 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-e.cu 35 | sed -i '/#pragma begin '"$name"'/r orig_'"$name"'.cu' reordered-f.cu 36 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs true --topo-sort true --split false 37 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-g.cu 38 | sed -i '/#pragma begin '"$name"'/r orig_'"$name"'.cu' reordered-h.cu 39 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs false --topo-sort false --split false 40 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-i.cu 41 | sed -i '/#pragma begin '"$name"'/r orig_'"$name"'.cu' reordered-j.cu 42 | ../../test $name.idsl --out-file $name.cu --unroll $uf --heuristic 1 --distribute-rhs false --topo-sort true --split false 43 | sed -i '/#pragma begin '"$name"'/r '"$name"'.cu' reordered-k.cu 44 | sed -i '/#pragma begin '"$name"'/r orig_'"$name"'.cu' reordered-l.cu 45 | 46 | done < stencilnames 47 | 48 | sed -i '/#pragma begin stencil/d' reordered-a.cu 49 | sed -i '/#pragma end stencil/d' reordered-a.cu 50 | #indent -kr -i8 reordered-a.cu 51 | sed -i '/#pragma begin stencil/d' reordered-b.cu 52 | sed -i '/#pragma end stencil/d' reordered-b.cu 53 | #indent -kr -i8 reordered-b.cu 54 | sed -i '/#pragma begin stencil/d' reordered-c.cu 55 | sed -i '/#pragma end stencil/d' reordered-c.cu 56 | #indent -kr -i8 reordered-c.cu 57 | sed -i '/#pragma begin stencil/d' reordered-d.cu 58 | sed -i '/#pragma end stencil/d' reordered-d.cu 59 | #indent -kr -i8 reordered-d.cu 60 | sed -i '/#pragma begin stencil/d' reordered-e.cu 61 | sed -i '/#pragma end stencil/d' reordered-e.cu 62 | #indent -kr -i8 reordered-e.cu 63 | sed -i '/#pragma begin stencil/d' reordered-f.cu 64 | sed -i '/#pragma end stencil/d' reordered-f.cu 65 | #indent -kr -i8 reordered-f.cu 66 | sed -i '/#pragma begin stencil/d' reordered-g.cu 67 | sed -i 
'/#pragma end stencil/d' reordered-g.cu 68 | #indent -kr -i8 reordered-g.cu 69 | sed -i '/#pragma begin stencil/d' reordered-h.cu 70 | sed -i '/#pragma end stencil/d' reordered-h.cu 71 | #indent -kr -i8 reordered-h.cu 72 | sed -i '/#pragma begin stencil/d' reordered-i.cu 73 | sed -i '/#pragma end stencil/d' reordered-i.cu 74 | #indent -kr -i8 reordered-i.cu 75 | sed -i '/#pragma begin stencil/d' reordered-j.cu 76 | sed -i '/#pragma end stencil/d' reordered-j.cu 77 | #indent -kr -i8 reordered-j.cu 78 | sed -i '/#pragma begin stencil/d' reordered-k.cu 79 | sed -i '/#pragma end stencil/d' reordered-k.cu 80 | #indent -kr -i8 reordered-k.cu 81 | sed -i '/#pragma begin stencil/d' reordered-l.cu 82 | sed -i '/#pragma end stencil/d' reordered-l.cu 83 | #indent -kr -i8 reordered-l.cu 84 | #rm *~ 85 | -------------------------------------------------------------------------------- /examples/rhs4th3fort-3/reorder.sh: --------------------------------------------------------------------------------
#!/bin/bash
#
# reorder.sh <annotated-source.cu>
#
# Builds reordered-a.cu ... reordered-l.cu from a source whose stencil
# statements are wrapped in
#     #pragma begin <name> unroll <factors>
#     ...
#     #pragma end <name>
# For every stencil the body is written to <name>.idsl, handed to ../../test
# under eight option combinations, and the generated code is spliced back in
# at the matching '#pragma begin' line of the corresponding variant.
# Scratch files written in the current directory: stencils, stencilnames,
# unrollfactors, <name>.idsl, <name>.cu.

if [ $# -lt 1 ]; then
	echo "usage: $0 <annotated-source.cu>" >&2
	exit 1
fi
src="$1"
variants="a b c d e f g h i j k l"

# All pragma-delimited regions, marker lines included.
awk '/#pragma begin/,/#pragma end/' "$src" > stencils

# Every variant starts as the source with the stencil bodies removed; the
# pragma lines stay behind as insertion anchors.
for v in $variants
do
	sed '/#pragma begin/,/#pragma end/{//!d}' "$src" > "reordered-$v.cu"
done

# Field 3 of a begin line is the stencil name, field 5 its unroll factors.
awk '/#pragma begin/{print $3}' stencils > stencilnames
awk '/unroll/{print $5}' stencils > unrollfactors

# generate HEURISTIC DISTRIBUTE_RHS TOPO_SORT
# Runs the generator for the current $name / $uf, writing $name.cu.
generate () {
	../../test "$name.idsl" --out-file "$name.cu" --unroll "$uf" --heuristic "$1" --distribute-rhs "$2" --topo-sort "$3" --split false
}

# splice FILE VARIANT
# Inserts FILE after the '#pragma begin $name' line of reordered-VARIANT.cu.
# The name must be followed by whitespace or end of line, so that "stencil1"
# does not also match "stencil10".
splice () {
	sed -i -E '/#pragma begin '"$name"'([[:space:]]|$)/r '"$1" "reordered-$2.cu"
}

while read -r name
do
	# Unroll factors are consumed in the same order as the names.
	uf=`awk 'NR==1' unrollfactors`
	sed -i '1d' unrollfactors
	awk '/#pragma begin '"$name"'([ \t\r]|$)/{flag=1;next} /#pragma end '"$name"'([ \t\r]|$)/{flag=0} flag' stencils > "$name.idsl"

	# Heuristic 0: only the generated file is used.
	generate 0 true  false; splice "$name.cu" a
	generate 0 true  true;  splice "$name.cu" b
	generate 0 false false; splice "$name.cu" c
	generate 0 false true;  splice "$name.cu" d

	# Heuristic 1: orig_$name.cu is spliced into the paired variant as well.
	generate 1 true  false; splice "$name.cu" e; splice "orig_$name.cu" f
	generate 1 true  true;  splice "$name.cu" g; splice "orig_$name.cu" h
	generate 1 false false; splice "$name.cu" i; splice "orig_$name.cu" j
	generate 1 false true;  splice "$name.cu" k; splice "orig_$name.cu" l

done < stencilnames

# Drop the anchor lines now that every stencil has been spliced in.
for v in $variants
do
	sed -i -e '/#pragma begin stencil/d' -e '/#pragma end stencil/d' "reordered-$v.cu"
	#indent -kr -i8 "reordered-$v.cu"
done
#rm *~
-------------------------------------------------------------------------------- /examples/rhs4th3fort-3/common/time.awk: --------------------------------------------------------------------------------
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo "-------------------- NVCC ---------------------" 5 | 6 | time=`grep -E 'float|double' nvcc-orig-results | awk 'BEGIN {time = 0.0} {time += $2} END {print time}'` 7 | awk -v otime=$time 'BEGIN {print "Original GFlops = " (300*300*300*687/10^6/otime)}' 8 | 9 | time0=`grep -E 'float|double' nvcc-reorder-results | awk 'BEGIN {time0 = 0.0} {time0 += $2} END {print time0}'` 10 | timea=`grep -E 'float|double' nvcc-reorder-results-a | awk 'BEGIN {timea = 0.0} {timea += $2} END {print timea}'` 11 | timeb=`grep -E 'float|double' nvcc-reorder-results-b | awk 'BEGIN {timeb = 0.0} {timeb += $2} END {print timeb}'` 12 | timee=`grep -E 'float|double' nvcc-reorder-results-e | awk 'BEGIN {timee = 0.0} {timee += $2} END {print timee}'` 13 | timef=`grep -E 'float|double' nvcc-reorder-results-f | awk 'BEGIN {timef = 0.0} {timef += $2} END {print timef}'` 14 | timeg=`grep -E 'float|double' nvcc-reorder-results-g | awk 'BEGIN {timeg = 0.0} {timeg += $2} END {print timeg}'` 15 | timeh=`grep -E 'float|double' nvcc-reorder-results-h | awk 'BEGIN {timeh = 0.0} {timeh += $2} END {print timeh}'` 16 | timei=`grep -E 'float|double' nvcc-reorder-results-i | awk 'BEGIN {timei = 0.0} {timei += $2} END {print timei}'` 17 | timej=`grep -E 'float|double' nvcc-reorder-results-j | awk 'BEGIN {timej = 0.0} {timej += $2} END {print timej}'` 18 | timek=`grep -E 'float|double' nvcc-reorder-results-k | awk 'BEGIN {timek = 0.0} {timek += $2} END {print timek}'` 19 | timel=`grep -E 'float|double' nvcc-reorder-results-l | awk 'BEGIN {timel = 0.0} {timel += $2} END {print timel}'` 20 | 21 | min1=`awk -v atime=$timea -v btime=$timeb 'BEGIN {print (atime (y)? (x) : (y)) 4 | #define min(x,y) ((x) < (y)? (x) : (y)) 5 | #define ceil(a,b) ((a) % (b) == 0 ? 
(a) / (b) : ((a) / (b)) + 1)

/* Edge length baked into the kernel's array casts; host_code rejects any other N. */
#define J3D125PT_DIM 516

/*
 * Terminates the program if the CUDA runtime reports a pending error.
 *
 * message: caller-supplied context printed ahead of the CUDA error string.
 *
 * Reads (and thereby clears) the error with cudaGetLastError(). On failure the
 * text is printed to stdout and the process exits with status -1.
 */
void check_error (const char* message) {
	cudaError_t error = cudaGetLastError ();
	if (error != cudaSuccess) {
		printf ("CUDA error : %s, %s\n", message, cudaGetErrorString (error));
		exit(-1);
	}
}

/*
 * Stencil kernel: every guarded thread writes four consecutive rows
 * out[k][j..j+3][i], each a weighted sum of neighbours of `in` taken from
 * planes k-2 .. k+1, rows j+jj-2 .. j+jj+2 and columns i-2 .. i+2.
 *
 * t_in, t_out: device buffers, reinterpreted as [][516][516] arrays, so the
 *              row and plane strides are fixed regardless of N.
 * N:           only used for the bounds guard (i, j, k <= N-3).
 *
 * Index mapping: i and k advance by one per thread in x and z; j advances by
 * four per thread in y. All three start at 2. host_code launches this with
 * 16x4x4 blocks.
 */
__global__ void j3d125pt (double * __restrict__ t_in, double * __restrict__ t_out, int N) {
	//Determining the block's indices
	int i0 = (int)(blockIdx.x)*(int)(blockDim.x) + 2;
	int i = max(i0,2) + (int)(threadIdx.x);
	int j0 = 4*(int)(blockIdx.y)*(int)(blockDim.y) + 2;
	int j = max(j0,2) + 4*(int)(threadIdx.y);
	int k0 = (int)(blockIdx.z)*(int)(blockDim.z) + 2;
	int k = max(k0,2) + (int)(threadIdx.z);

	double (*in)[J3D125PT_DIM][J3D125PT_DIM] = (double (*)[J3D125PT_DIM][J3D125PT_DIM])t_in;
	double (*out)[J3D125PT_DIM][J3D125PT_DIM] = (double (*)[J3D125PT_DIM][J3D125PT_DIM])t_out;

	// Logical AND throughout (the first operator used to be a bitwise '&').
	// Only row j is tested: with the y tiling used by host_code and N == 516
	// the largest j is 510, so j+3 never exceeds N-3.
	if (i<=N-3 && j<=N-3 && k<=N-3) {
#pragma unroll 4
		for (int jj=0; jj<=3; jj++) {
			out[k][j+jj][i] =
				0.75 * (in[k-2][j+jj-2][i-2] + in[k-2][j+jj-2][i+2] + in[k-2][j+jj+2][i-2] + in[k-2][j+jj+2][i+2] + in[k-1][j+jj-1][i-1] + in[k-1][j+jj-1][i+1] + in[k-1][j+jj+1][i-1] + in[k-1][j+jj+1][i+1] +
					in[k][j+jj-1][i] + in[k][j+jj][i-1] + in[k][j+jj][i+1] + in[k][j+jj+1][i] +
					in[k+1][j+jj-1][i-1] + in[k+1][j+jj-1][i+1] + in[k+1][j+jj+1][i-1] + in[k+1][j+jj+1][i+1]) +
				0.76 * (in[k-2][j+jj-2][i-2] + in[k-2][j+jj-2][i+2] + in[k-2][j+jj+2][i-2] + in[k-2][j+jj+2][i+2]) +

				1.132 * (in[k-2][j+jj-2][i-1] + in[k-2][j+jj-2][i+1] + in[k-2][j+jj-1][i-2] + in[k-2][j+jj-1][i+2] + in[k-2][j+jj][i] + in[k-2][j+jj+1][i-2] + in[k-2][j+jj+1][i+2] + in[k-2][j+jj+2][i-1] + in[k-2][j+jj+2][i+1] +
					in[k-1][j+jj-2][i-2] + in[k-1][j+jj-2][i+2] + in[k-1][j+jj+2][i-2] + in[k-1][j+jj+2][i+2] +
					in[k][j+jj-2][i] + in[k][j+jj][i-2] + in[k][j+jj][i+2] + in[k][j+jj+2][i] +
					in[k+1][j+jj-2][i-2] + in[k+1][j+jj-2][i+2] + in[k+1][j+jj+2][i-2] + in[k+1][j+jj+2][i+2] +
					in[k-2][j+jj-2][i-1] + in[k-2][j+jj-2][i+1] + in[k-2][j+jj-1][i-2] + in[k-2][j+jj-1][i+2] + in[k-2][j+jj][i] + in[k-2][j+jj+1][i-2] + in[k-2][j+jj+1][i+2] + in[k-2][j+jj+2][i-1] + in[k-2][j+jj+2][i+1]) +

				0.217 * (in[k-2][j+jj-2][i] + in[k-2][j+jj][i-2] + in[k-2][j+jj][i+2] + in[k-2][j+jj+2][i] +
					in[k-1][j+jj-1][i] + in[k-1][j+jj][i-1] + in[k-1][j+jj][i+1] + in[k-1][j+jj+1][i] +
					in[k][j+jj-2][i-2] + in[k][j+jj-2][i+2] + in[k][j+jj+2][i-2] + in[k][j+jj+2][i+2] +
					in[k+1][j+jj-1][i] + in[k+1][j+jj][i-1] + in[k+1][j+jj][i+1] + in[k+1][j+jj+1][i] +
					in[k-2][j+jj-2][i] + in[k-2][j+jj][i-2] + in[k-2][j+jj][i+2] + in[k-2][j+jj+2][i]) +

				2.13 * (in[k-2][j+jj-1][i] + in[k-2][j+jj][i-1] + in[k-2][j+jj][i+1] + in[k-2][j+jj+1][i] +
					in[k-1][j+jj-2][i] + in[k-1][j+jj][i-2] + in[k-1][j+jj][i+2] + in[k-1][j+jj+2][i] +
					in[k][j+jj-2][i-1] + in[k][j+jj-2][i+1] + in[k][j+jj-1][i-2] + in[k][j+jj-1][i+2] + in[k][j+jj][i] + in[k][j+jj+1][i-2] + in[k][j+jj+1][i+2] + in[k][j+jj+2][i-1] + in[k][j+jj+2][i+1] +
					in[k+1][j+jj-2][i] + in[k+1][j+jj][i-2] + in[k+1][j+jj][i+2] + in[k+1][j+jj+2][i] +
					in[k-2][j+jj-1][i] + in[k-2][j+jj][i-1] + in[k-2][j+jj][i+1] + in[k-2][j+jj+1][i]) +

				0.331 * (in[k-2][j+jj-1][i-1] + in[k-2][j+jj-1][i+1] + in[k-2][j+jj+1][i-1] + in[k-2][j+jj+1][i+1] +
					in[k-1][j+jj-2][i-1] + in[k-1][j+jj-2][i+1] + in[k-1][j+jj-1][i-2] + in[k-1][j+jj-1][i+2] + in[k-1][j+jj][i] + in[k-1][j+jj+1][i-2] + in[k-1][j+jj+1][i+2] + in[k-1][j+jj+2][i-1] + in[k-1][j+jj+2][i+1] +
					in[k][j+jj-1][i-1] + in[k][j+jj-1][i+1] + in[k][j+jj+1][i-1] + in[k][j+jj+1][i+1] +
					in[k+1][j+jj-2][i-1] + in[k+1][j+jj-2][i+1] + in[k+1][j+jj-1][i-2] + in[k+1][j+jj-1][i+2] + in[k+1][j+jj][i] + in[k+1][j+jj+1][i-2] + in[k+1][j+jj+1][i+2] + in[k+1][j+jj+2][i-1] + in[k+1][j+jj+2][i+1]) +
				0.332 * (in[k-2][j+jj-1][i-1] + in[k-2][j+jj-1][i+1] + in[k-2][j+jj+1][i-1] + in[k-2][j+jj+1][i+1]);
		}
	}
}

/*
 * Host entry point: copies h_in to the device, runs j3d125pt once and copies
 * the result into h_out.
 *
 * h_in, h_out: host buffers holding N*N*N doubles each.
 * N:           edge length; must equal J3D125PT_DIM (516) because the kernel
 *              indexes with fixed 516x516 planes while the device buffers are
 *              sized N*N*N. Any other value would make the kernel address the
 *              buffers with the wrong stride, so it is rejected up front.
 *
 * Any CUDA failure is reported through check_error, which exits the process.
 * The kernel never writes cells outside its guarded interior, so those cells
 * of h_out receive whatever the fresh device allocation contained.
 */
extern "C" void host_code (double *h_in, double *h_out, int N) {
	if (N != J3D125PT_DIM) {
		printf ("host_code : N must be %d, got %d\n", J3D125PT_DIM, N);
		exit(-1);
	}

	size_t bytes = sizeof(double)*N*N*N;

	double *in;
	cudaMalloc (&in, bytes);
	check_error ("Failed to allocate device memory for in\n");
	cudaMemcpy (in, h_in, bytes, cudaMemcpyHostToDevice);
	check_error ("Failed to copy in to device\n");
	double *out;
	cudaMalloc (&out, bytes);
	check_error ("Failed to allocate device memory for out\n");

	// One thread per (i, 4 rows of j, k) over the N-4 interior points per axis.
	dim3 blockconfig (16, 4, 4);
	dim3 gridconfig (ceil(N-4, blockconfig.x), ceil(N-4, 4*blockconfig.y), ceil(N-4, blockconfig.z));

	j3d125pt<<<gridconfig, blockconfig>>> (in, out, N);
	check_error ("Failed to launch j3d125pt\n");

	// Blocking copy: also surfaces errors raised while the kernel executed.
	cudaMemcpy (h_out, out, bytes, cudaMemcpyDeviceToHost);
	check_error ("Failed to copy out to host\n");

	cudaFree (in);
	check_error ("Failed to free device memory for in\n");
	cudaFree (out);
	check_error ("Failed to free device memory for out\n");
}
--------------------------------------------------------------------------------