├── LICENSE ├── NPB-FF ├── CG │ ├── Makefile │ └── cg.cpp ├── EP │ ├── Makefile │ └── ep.cpp ├── FT │ ├── Makefile │ ├── ft.cpp │ └── global.hpp ├── IS │ ├── Makefile │ └── is.cpp ├── MG │ ├── Makefile │ ├── globals.hpp │ └── mg.cpp ├── Makefile ├── README.md ├── bin │ └── README.md ├── common │ ├── c_print_results.cpp │ ├── c_randdp.cpp │ ├── c_timers.cpp │ ├── npb-CPP.hpp │ ├── wtime.cpp │ ├── wtime.hpp │ └── wtime_sgi64.cpp ├── config │ ├── make.def │ └── suite.def └── sys │ ├── Makefile │ ├── README │ ├── make.common │ ├── print_header │ ├── print_instructions │ └── setparams.cpp ├── NPB-OMP ├── CG │ ├── Makefile │ └── cg.cpp ├── EP │ ├── Makefile │ └── ep.cpp ├── FT │ ├── Makefile │ ├── ft.cpp │ └── global.hpp ├── IS │ ├── Makefile │ └── is.cpp ├── MG │ ├── Makefile │ ├── globals.hpp │ └── mg.cpp ├── Makefile ├── README.md ├── bin │ └── README.md ├── common │ ├── c_print_results.cpp │ ├── c_randdp.cpp │ ├── c_timers.cpp │ ├── npb-CPP.hpp │ ├── wtime.cpp │ ├── wtime.hpp │ └── wtime_sgi64.cpp ├── config │ ├── make.def │ └── suite.def └── sys │ ├── Makefile │ ├── README │ ├── make.common │ ├── print_header │ ├── print_instructions │ └── setparams.cpp ├── NPB-SER ├── CG │ ├── Makefile │ └── cg.cpp ├── EP │ ├── Makefile │ └── ep.cpp ├── FT │ ├── Makefile │ ├── ft.cpp │ └── global.hpp ├── IS │ ├── Makefile │ └── is.cpp ├── MG │ ├── Makefile │ ├── globals.hpp │ └── mg.cpp ├── Makefile ├── README.md ├── bin │ └── README.md ├── common │ ├── c_print_results.cpp │ ├── c_randdp.cpp │ ├── c_timers.cpp │ ├── npb-CPP.hpp │ ├── wtime.cpp │ ├── wtime.hpp │ └── wtime_sgi64.cpp ├── config │ ├── make.def │ └── suite.def └── sys │ ├── Makefile │ ├── README │ ├── make.common │ ├── print_header │ ├── print_instructions │ └── setparams.cpp ├── NPB-TBB ├── CG │ ├── Makefile │ └── cg.cpp ├── EP │ ├── Makefile │ └── ep.cpp ├── FT │ ├── Makefile │ ├── ft.cpp │ └── global.hpp ├── IS │ ├── Makefile │ └── is.cpp ├── MG │ ├── Makefile │ ├── globals.hpp │ └── mg.cpp ├── Makefile ├── README.md 
├── bin │ └── README.md ├── common │ ├── c_print_results.cpp │ ├── c_randdp.cpp │ ├── c_timers.cpp │ ├── npb-CPP.hpp │ ├── wtime.cpp │ ├── wtime.hpp │ └── wtime_sgi64.cpp ├── config │ ├── make.def │ └── suite.def └── sys │ ├── Makefile │ ├── README │ ├── make.common │ ├── print_header │ ├── print_instructions │ └── setparams.cpp └── README.md /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Dalvan Griebler 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /NPB-FF/CG/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=cg 3 | BENCHMARKU=CG 4 | 5 | include ../config/make.def 6 | 7 | OBJS = cg.o ${COMMON}/c_print_results.o \ 8 | ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | cg.o: cg.cpp npbparams.hpp 16 | ${CCOMPILE} cg.cpp 17 | 18 | clean: 19 | - rm -f *.o *~ 20 | - rm -f npbparams.hpp core 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /NPB-FF/EP/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=ep 3 | BENCHMARKU=EP 4 | 5 | include ../config/make.def 6 | 7 | OBJS = ep.o ${COMMON}/c_print_results.o ${COMMON}/c_${RAND}.o \ 8 | ${COMMON}/c_timers.o ${COMMON}/c_wtime.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | 16 | ep.o: ep.cpp npbparams.hpp 17 | ${CCOMPILE} ep.cpp 18 | 19 | clean: 20 | - rm -f *.o *~ 21 | - rm -f npbparams.hpp core 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /NPB-FF/EP/ep.cpp: -------------------------------------------------------------------------------- 1 | /*-------------------------------------------------------------------- 2 | 3 | Information on NAS Parallel Benchmarks is available at: 4 | 5 | http://www.nas.nasa.gov/Software/NPB/ 6 | 7 | Authors: P. O. Frederickson 8 | D. H. Bailey 9 | A. C. 
Woo 10 | 11 | CPP and FastFlow version: 12 | Dalvan Griebler 13 | Júnior Löff 14 | 15 | --------------------------------------------------------------------*/ 16 | 17 | 18 | #include 19 | #include "npbparams.hpp" 20 | #include 21 | #include <../common/npb-CPP.hpp> 22 | 23 | /* parameters */ 24 | #define MK 16 25 | #define MM (M - MK) 26 | #define NN (1 << MM) 27 | #define NK (1 << MK) 28 | #define NQ 10 29 | #define EPSILON 1.0e-8 30 | #define A 1220703125.0 31 | #define S 271828183.0 32 | #define TIMERS_ENABLED FALSE 33 | 34 | /* global variables */ 35 | /* common /storage/ */ 36 | static double x[2*NK]; 37 | 38 | static double q[NQ]; 39 | 40 | const int _cache_line_size = 64; 41 | 42 | typedef struct packing_t{ 43 | double _qq; 44 | char pad[_cache_line_size-sizeof(_qq)]; //fills the rest of the cache line 45 | }; 46 | 47 | packing_t qq[100][NQ]; 48 | packing_t sxx[100]; 49 | packing_t syy[100]; 50 | /*-------------------------------------------------------------------- 51 | program EMBAR 52 | c-------------------------------------------------------------------*/ 53 | /* 54 | c This is the serial version of the APP Benchmark 1, 55 | c the "embarassingly parallel" benchmark. 56 | c 57 | c M is the Log_2 of the number of complex pairs of uniform (0, 1) random 58 | c numbers. MK is the Log_2 of the size of each batch of uniform random 59 | c numbers. MK can be set for convenience on a given system, since it does 60 | c not affect the results. 61 | */ 62 | int main(int argc, char **argv) { 63 | double Mops, t1, sx, sy, tm, an, gc; 64 | double dum[3] = { 1.0, 1.0, 1.0 }; 65 | int np,i, nit, k_offset, j; 66 | boolean verified; 67 | char size[13+1]; /* character*13 */ 68 | 69 | /* 70 | c Because the size of the problem is too large to store in a 32-bit 71 | c integer for some classes, we put it into a string (for printing). 
72 | c Have to strip off the decimal point put in there by the floating 73 | c point print statement (internal file) 74 | */ 75 | 76 | printf("NAS Parallel Benchmarks 4.0 OpenMP C++ version"" - EP Benchmark\n"); 77 | printf("Developed by: Dalvan Griebler & Júnior Löff \n\n"); 78 | sprintf(size, "%12.0f", pow(2.0, M+1)); 79 | for (j = 13; j >= 1; j--) { 80 | if (size[j] == '.') size[j] = ' '; 81 | } 82 | printf(" Number of random numbers generated: %13s\n", size); 83 | 84 | verified = FALSE; 85 | 86 | /* 87 | c Compute the number of "batches" of random number pairs generated 88 | c per processor. Adjust if the number of processors does not evenly 89 | c divide the total number 90 | */ 91 | np = NN; 92 | /* 93 | c Call the random number generator functions and initialize 94 | c the x-array to reduce the effects of paging on the timings. 95 | c Also, call all mathematical functions that are used. Make 96 | c sure these initializations cannot be eliminated as dead code. 97 | */ 98 | vranlc(0, &(dum[0]), dum[1], &(dum[2])); 99 | dum[0] = randlc(&(dum[1]), dum[2]); 100 | for (i = 0; i < 2*NK; i++) x[i] = -1.0e99; 101 | Mops = log(sqrt(fabs(max(1.0, 1.0)))); 102 | 103 | 104 | 105 | timer_clear(1); 106 | timer_clear(2); 107 | timer_clear(3); 108 | 109 | timer_start(1); 110 | 111 | vranlc(0, &t1, A, x); 112 | 113 | /* Compute AN = A ^ (2 * NK) (mod 2^46). */ 114 | 115 | t1 = A; 116 | 117 | for ( i = 1; i <= MK+1; i++) { 118 | an = randlc(&t1, t1); 119 | } 120 | 121 | an = t1; 122 | gc = 0.0; 123 | sx = 0.0; 124 | sy = 0.0; 125 | 126 | for ( i = 0; i <= NQ - 1; i++) { 127 | q[i] = 0.0; 128 | } 129 | 130 | /* 131 | c Each instance of this loop may be performed independently. 
We compute 132 | c the k offsets separately to take into account the fact that some nodes 133 | c have more numbers to generate than others 134 | */ 135 | k_offset = -1; 136 | 137 | int num_workers; 138 | if(const char * nw = std::getenv("FF_NUM_THREADS")) { 139 | num_workers = atoi(nw); 140 | } else { 141 | num_workers = 1; 142 | } 143 | 144 | ff::ParallelFor pf(num_workers, true); 145 | for(int i=0; i 0D grid decomposition 6 | 7 | 8 | c Cache blocking params. These values are good for most 9 | c RISC processors. 10 | c FFT parameters: 11 | c fftblock controls how many ffts are done at a time. 12 | c The default is appropriate for most cache-based machines 13 | c On vector machines, the FFT can be vectorized with vector 14 | c length equal to the block size, so the block size should 15 | c be as large as possible. This is the size of the smallest 16 | c dimension of the problem: 128 for class A, 256 for class B and 17 | c 512 for class C. 18 | */ 19 | 20 | #define FFTBLOCK_DEFAULT 16 21 | #define FFTBLOCKPAD_DEFAULT 18 22 | 23 | #define FFTBLOCK FFTBLOCK_DEFAULT 24 | #define FFTBLOCKPAD FFTBLOCKPAD_DEFAULT 25 | 26 | /* COMMON block: blockinfo */ 27 | int fftblock; 28 | int fftblockpad; 29 | 30 | /* 31 | c we need a bunch of logic to keep track of how 32 | c arrays are laid out. 33 | 34 | 35 | c Note: this serial version is the derived from the parallel 0D case 36 | c of the ft NPB. 
37 | c The computation proceeds logically as 38 | 39 | c set up initial conditions 40 | c fftx(1) 41 | c transpose (1->2) 42 | c ffty(2) 43 | c transpose (2->3) 44 | c fftz(3) 45 | c time evolution 46 | c fftz(3) 47 | c transpose (3->2) 48 | c ffty(2) 49 | c transpose (2->1) 50 | c fftx(1) 51 | c compute residual(1) 52 | 53 | c for the 0D, 1D, 2D strategies, the layouts look like xxx 54 | c 55 | c 0D 1D 2D 56 | c 1: xyz xyz xyz 57 | c 2: xyz xyz yxz 58 | c 3: xyz zyx zxy 59 | 60 | c the array dimensions are stored in dims(coord, phase) 61 | */ 62 | 63 | /* COMMON block: layout */ 64 | static int dims[3][3]; 65 | static int xstart[3]; 66 | static int ystart[3]; 67 | static int zstart[3]; 68 | static int xend[3]; 69 | static int yend[3]; 70 | static int zend[3]; 71 | 72 | #define T_TOTAL 0 73 | #define T_SETUP 1 74 | #define T_FFT 2 75 | #define T_EVOLVE 3 76 | #define T_CHECKSUM 4 77 | #define T_FFTLOW 5 78 | #define T_FFTCOPY 6 79 | #define T_MAX 7 80 | 81 | #define TIMERS_ENABLED TRUE 82 | 83 | /* other stuff */ 84 | 85 | #define SEED 314159265.0 86 | #define A 1220703125.0 87 | #define PI 3.141592653589793238 88 | #define ALPHA 1.0e-6 89 | 90 | #define EXPMAX (NITER_DEFAULT*(NX*NX/4+NY*NY/4+NZ*NZ/4)) 91 | 92 | /* COMMON block: excomm */ 93 | static double ex[EXPMAX+1]; /* ex(0:expmax) */ 94 | 95 | /* 96 | c roots of unity array 97 | c relies on x being largest dimension? 
98 | */ 99 | 100 | /* COMMON block: ucomm */ 101 | static dcomplex u[NX]; 102 | 103 | /* for checksum data */ 104 | 105 | /* COMMON block: sumcomm */ 106 | static dcomplex sums[NITER_DEFAULT+1]; /* sums(0:niter_default) */ 107 | 108 | /* number of iterations*/ 109 | 110 | /* COMMON block: iter */ 111 | static int niter; 112 | 113 | -------------------------------------------------------------------------------- /NPB-FF/IS/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=is 3 | BENCHMARKU=IS 4 | 5 | include ../config/make.def 6 | 7 | include ../sys/make.common 8 | 9 | OBJS = is.o \ 10 | ${COMMON}/c_print_results.o \ 11 | ${COMMON}/c_timers.o \ 12 | ${COMMON}/c_wtime.o 13 | 14 | 15 | ${PROGRAM}: config ${OBJS} 16 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 17 | 18 | .c.o: 19 | ${CCOMPILE} $< 20 | 21 | is.o: is.cpp npbparams.hpp 22 | ${CCOMPILE} is.cpp 23 | 24 | 25 | clean: 26 | - rm -f *.o *~ mputil* 27 | - rm -f npbparams.hpp core 28 | - if [ -d rii_files ]; then rm -r rii_files; fi 29 | -------------------------------------------------------------------------------- /NPB-FF/MG/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=mg 3 | BENCHMARKU=MG 4 | 5 | include ../config/make.def 6 | 7 | OBJS = mg.o ${COMMON}/c_print_results.o \ 8 | ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | mg.o: mg.cpp npbparams.hpp 16 | ${CCOMPILE} mg.cpp 17 | 18 | clean: 19 | - rm -f *.o *~ 20 | - rm -f npbparams.hpp core 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /NPB-FF/MG/globals.hpp: -------------------------------------------------------------------------------- 1 | 
/*-------------------------------------------------------------------- 2 | c Parameter lm (declared and set in "npbparams.h") is the log-base2 of 3 | c the edge size max for the partition on a given node, so must be changed 4 | c either to save space (if running a small case) or made bigger for larger 5 | c cases, for example, 512^3. Thus lm=7 means that the largest dimension 6 | c of a partition that can be solved on a node is 2^7 = 128. lm is set 7 | c automatically in npbparams.h 8 | c Parameters ndim1, ndim2, ndim3 are the local problem dimensions. 9 | c-------------------------------------------------------------------*/ 10 | 11 | #include "npbparams.hpp" 12 | 13 | /* parameters */ 14 | /* actual dimension including ghost cells for communications */ 15 | #define NM (2+(2<<(LM-1))) 16 | /* size of rhs array */ 17 | #define NV (2+(2<<(NDIM1-1))*(2+(2<<(NDIM2-1)))*(2+(2<<(NDIM3-1)))) 18 | /* size of residual array */ 19 | #define NR ((8*(NV+(NM*NM)+5*NM+7*LM))/7) 20 | /* size of communication buffer */ 21 | #define NM2 (2*NM*NM) 22 | /* maximum number of levels */ 23 | #define MAXLEVEL 11 24 | 25 | /*---------------------------------------------------------------------*/ 26 | /* common /mg3/ */ 27 | static int nx[MAXLEVEL+1], ny[MAXLEVEL+1], nz[MAXLEVEL+1]; 28 | /* common /ClassType/ */ 29 | static char class_npb; 30 | /* common /my_debug/ */ 31 | static int debug_vec[8]; 32 | /* common /fap/ */ 33 | /*static int ir[MAXLEVEL], m1[MAXLEVEL], m2[MAXLEVEL], m3[MAXLEVEL];*/ 34 | static int m1[MAXLEVEL+1], m2[MAXLEVEL+1], m3[MAXLEVEL+1]; 35 | static int lt, lb; 36 | 37 | /*c--------------------------------------------------------------------- 38 | c Set at m=1024, can handle cases up to 1024^3 case 39 | c---------------------------------------------------------------------*/ 40 | #define M 1037 41 | 42 | /* common /buffer/ */ 43 | /*static double buff[4][NM2];*/ 44 | -------------------------------------------------------------------------------- /NPB-FF/Makefile: 
-------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | CLASS=S 3 | SFILE=config/suite.def 4 | 5 | default: header 6 | @ $(SHELL) sys/print_instructions 7 | 8 | BT: bt 9 | bt: header 10 | cd BT; $(MAKE) CLASS=$(CLASS) 11 | 12 | SP: sp 13 | sp: header 14 | cd SP; $(MAKE) CLASS=$(CLASS) 15 | 16 | LU: lu 17 | lu: header 18 | cd LU; $(MAKE) CLASS=$(CLASS) 19 | 20 | MG: mg 21 | mg: header 22 | cd MG; $(MAKE) CLASS=$(CLASS) 23 | 24 | FT: ft 25 | ft: header 26 | cd FT; $(MAKE) CLASS=$(CLASS) 27 | 28 | IS: is 29 | is: header 30 | cd IS; $(MAKE) CLASS=$(CLASS) 31 | 32 | CG: cg 33 | cg: header 34 | cd CG; $(MAKE) CLASS=$(CLASS) 35 | 36 | EP: ep 37 | ep: header 38 | cd EP; $(MAKE) CLASS=$(CLASS) 39 | DC: dc 40 | dc: header 41 | cd DC; $(MAKE) CLASS=$(CLASS) 42 | 43 | # Awk script courtesy cmg@cray.com 44 | suite: 45 | @ awk '{ if ($$1 !~ /^#/ && NF > 0) \ 46 | printf "make %s CLASS=%s\n", $$1, $$2 }' $(SFILE) \ 47 | | $(SHELL) 48 | 49 | 50 | # It would be nice to make clean in each subdirectory (the targets 51 | # are defined) but on a really clean system this will won't work 52 | # because those makefiles need config/make.def 53 | clean: 54 | - rm -f core 55 | - rm -f *~ */core */*~ */*.o */npbparams.hpp */*.obj */*.exe 56 | - rm -f sys/setparams sys/makesuite sys/setparams.hpp 57 | 58 | cleanall: clean 59 | - rm -r bin/* 60 | 61 | veryclean: clean 62 | - rm config/make.def config/suite.def Part* 63 | - rm bin/sp.* bin/lu.* bin/mg.* bin/ft.* bin/bt.* bin/is.* bin/ep.* bin/cg.* 64 | 65 | header: 66 | @ $(SHELL) sys/print_header 67 | 68 | kit: 69 | - makekit -s100k -k30 * */* */*/* 70 | 71 | 72 | -------------------------------------------------------------------------------- /NPB-FF/README.md: -------------------------------------------------------------------------------- 1 | # Warning: this project is continued at [NPB-CPP](https://github.com/GMAP/NPB-CPP) 2 | 3 | ## We are happy to announce that both NPB Kernels and 
pseudo-applications are available at our new repository [NPB-CPP](https://github.com/GMAP/NPB-CPP). 4 | 5 | This was our first work on the NAS Parallel Benchmarks (NPB) suite, and many other works are now continuing this project in many different ways. 6 | 7 | *Note: this repository will no longer be updated; please follow us at [NPB-CPP](https://github.com/GMAP/NPB-CPP) instead.* 8 | 9 | 10 | ## How to cite this work 11 | 12 | [[DOI]](https://doi.org/10.1109/PDP2018.2018.00120) D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In Proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018. 13 | 14 | ## The NPB-CPP Benchmark 15 | 16 | These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version. 
17 | 18 | ================================================================== 19 | NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB 20 | 21 | Code contributors: 22 | Dalvan Griebler 23 | Júnior Löff 24 | 25 | Warning: in case of problems, send us an email: 26 | dalvan.griebler@acad.pucrs.br 27 | junior.loff@acad.pucrs.br 28 | ================================================================== 29 | 30 | 31 | This folder contains: 32 | 33 | - NPB-FF - Directory with the parallel version implemented in FastFlow 34 | - NPB-OMP - Directory with the parallel version translated from the original NPB version 35 | - NPB-SER - Directory with the serial version of the NPB ported to C++ 36 | - NPB-TBB - Directory with the parallel version implemented in Threading Building Blocks 37 | 38 | Each directory is independent and contains its own implemented version of the kernels: 39 | 40 | IS - Integer Sort, random memory access 41 | EP - Embarrassingly Parallel 42 | CG - Conjugate Gradient, irregular memory access and communication 43 | MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive 44 | FT - discrete 3D fast Fourier Transform, all-to-all communication 45 | 46 | ## Software Requirements 47 | 48 | *Warning: our tests were made with GCC-5* 49 | 50 | **TBB** 51 | 52 | *Installation* 53 | 54 | apt-get install libtbb-dev 55 | 56 | **FastFlow** 57 | 58 | *Installation* 59 | 60 | svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow 61 | 62 | 63 | ## How to Compile 64 | 65 | Enter the directory of the desired version and execute: 66 | 67 | make _BENCHMARK CLASS=_VERSION 68 | 69 | 70 | _BENCHMARKs are: 71 | 72 | EP, CG, MG, IS and FT 73 | 74 | _VERSIONs are: 75 | 76 | Class S: small for quick test purposes 77 | Class W: workstation size (a 90's workstation; now likely too small) 78 | Classes A, B, C: standard test problems; ~4X size increase going from one class to the next 79 | Classes D, E, F: large test problems; 
~16X size increase over each of the previous classes 80 | 81 | 82 | Command: 83 | 84 | make ep CLASS=B 85 | -------------------------------------------------------------------------------- /NPB-FF/bin/README.md: -------------------------------------------------------------------------------- 1 | # How to Cite our Work 2 | 3 | D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In Proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018. 4 | 5 | # The NPB-CPP Benchmark 6 | 7 | These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version. 8 | 9 | ================================================================== 10 | NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB 11 | 12 | Code contributors: 13 | Dalvan Griebler 14 | Júnior Löff 15 | 16 | Warning: in case of problems, send us an email: 17 | dalvan.griebler@acad.pucrs.br 18 | junior.loff@acad.pucrs.br 19 | ================================================================== 20 | 21 | 22 | This folder contains: 23 | 24 | - NPB-FF - Directory with the parallel version implemented in FastFlow 25 | - NPB-OMP - Directory with the parallel version translated from the original NPB version 26 | - NPB-SER - Directory with the serial version of the NPB ported to C++ 27 | - NPB-TBB - Directory with the parallel version implemented in Threading Building Blocks 28 | 29 | Each directory is independent and contains its own implemented version of the kernels: 30 | 31 | IS - Integer Sort, random memory access 32 | EP - Embarrassingly Parallel 33 | CG - Conjugate Gradient, irregular memory access and communication 34 | MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive 35 
| FT - discrete 3D fast Fourier Transform, all-to-all communication 36 | 37 | # Software Requirements 38 | 39 | *Warning: our tests were made with GCC-5* 40 | 41 | **TBB** 42 | 43 | *Installation* 44 | 45 | apt-get install libtbb-dev 46 | 47 | **FastFlow** 48 | 49 | *Installation* 50 | 51 | svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow 52 | 53 | 54 | # How to Compile 55 | 56 | Enter the directory of the desired version and execute: 57 | 58 | make _BENCHMARK CLASS=_VERSION 59 | 60 | 61 | _BENCHMARKs are: 62 | 63 | EP, CG, MG, IS and FT 64 | 65 | _VERSIONs are: 66 | 67 | Class S: small for quick test purposes 68 | Class W: workstation size (a 90's workstation; now likely too small) 69 | Classes A, B, C: standard test problems; ~4X size increase going from one class to the next 70 | Classes D, E, F: large test problems; ~16X size increase over each of the previous classes 71 | 72 | 73 | Command: 74 | 75 | make ep CLASS=B -------------------------------------------------------------------------------- /NPB-FF/common/c_print_results.cpp: -------------------------------------------------------------------------------- 1 | /*****************************************************************/ 2 | /****** C _ P R I N T _ R E S U L T S ******/ 3 | /*****************************************************************/ 4 | #include 5 | #include 6 | 7 | void c_print_results( char *name, char class_npb, int n1, int n2, int n3, int niter, double t, 8 | double mops, char *optype, int passed_verification, char *npbversion, char *compiletime, char *cc, 9 | char *clink, char *c_lib, char *c_inc, char *cflags, char *clinkflags, char *rand) 10 | { 11 | 12 | printf( "\n\n %s Benchmark Completed\n", name ); 13 | 14 | printf( " class_npb = %c\n", class_npb ); 15 | 16 | if( n2 == 0 && n3 == 0 ) 17 | printf( " Size = %12d\n", n1 ); /* as in IS */ 18 | else 19 | printf( " Size = %3dx%3dx%3d\n", n1,n2,n3 ); 20 | 21 | printf( " Iterations = %12d\n", niter ); 22 | 23 | printf( 
" Time in seconds = %12.2f\n", t ); 24 | 25 | printf( " Mop/s total = %12.2f\n", mops ); 26 | 27 | printf( " Operation type = %24s\n", optype); 28 | 29 | if( passed_verification ) 30 | printf( " Verification = SUCCESSFUL\n" ); 31 | else 32 | printf( " Verification = UNSUCCESSFUL\n" ); 33 | 34 | printf( " Version = %12s\n", npbversion ); 35 | 36 | printf( " Compile date = %12s\n", compiletime ); 37 | 38 | printf( "\n Compile options:\n" ); 39 | 40 | printf( " CC = %s\n", cc ); 41 | 42 | printf( " CLINK = %s\n", clink ); 43 | 44 | printf( " C_LIB = %s\n", c_lib ); 45 | 46 | printf( " C_INC = %s\n", c_inc ); 47 | 48 | printf( " CFLAGS = %s\n", cflags ); 49 | 50 | printf( " CLINKFLAGS = %s\n", clinkflags ); 51 | 52 | printf( " RAND = %s\n", rand ); 53 | #ifdef SMP 54 | char *evalue = getenv("MP_SET_NUMTHREADS"); 55 | printf( " MULTICPUS = %s\n", evalue ); 56 | #endif 57 | 58 | /* printf( "\n\n" ); 59 | printf( " Please send the results of this run to:\n\n" ); 60 | printf( " NPB Development Team\n" ); 61 | printf( " Internet: npb@nas.nasa.gov\n \n" ); 62 | printf( " If email is not available, send this to:\n\n" ); 63 | printf( " MS T27A-1\n" ); 64 | printf( " NASA Ames Research Center\n" ); 65 | printf( " Moffett Field, CA 94035-1000\n\n" ); 66 | printf( " Fax: 415-604-3957\n\n" );*/ 67 | } 68 | 69 | -------------------------------------------------------------------------------- /NPB-FF/common/c_randdp.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | */ 3 | #if defined(USE_POW) 4 | #define r23 pow(0.5, 23.0) 5 | #define r46 (r23*r23) 6 | #define t23 pow(2.0, 23.0) 7 | #define t46 (t23*t23) 8 | #else 9 | #define r23 (0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5) 10 | #define r46 (r23*r23) 11 | #define t23 (2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0) 12 | #define t46 (t23*t23) 13 | #endif 14 | 15 | 
/*c--------------------------------------------------------------------- 16 | c---------------------------------------------------------------------*/ 17 | 18 | double randlc (double *x, double a) { 19 | 20 | /*c--------------------------------------------------------------------- 21 | c---------------------------------------------------------------------*/ 22 | 23 | /*c--------------------------------------------------------------------- 24 | c 25 | c This routine returns a uniform pseudorandom double precision number in the 26 | c range (0, 1) by using the linear congruential generator 27 | c 28 | c x_{k+1} = a x_k (mod 2^46) 29 | c 30 | c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers 31 | c before repeating. The argument A is the same as 'a' in the above formula, 32 | c and X is the same as x_0. A and X must be odd double precision integers 33 | c in the range (1, 2^46). The returned value RANDLC is normalized to be 34 | c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain 35 | c the new seed x_1, so that subsequent calls to RANDLC using the same 36 | c arguments will generate a continuous sequence. 37 | c 38 | c This routine should produce the same results on any computer with at least 39 | c 48 mantissa bits in double precision floating point data. On 64 bit 40 | c systems, double precision should be disabled. 41 | c 42 | c David H. Bailey October 26, 1990 43 | c 44 | c---------------------------------------------------------------------*/ 45 | 46 | double t1,t2,t3,t4,a1,a2,x1,x2,z; 47 | 48 | /*c--------------------------------------------------------------------- 49 | c Break A into two parts such that A = 2^23 * A1 + A2. 
50 | c---------------------------------------------------------------------*/ 51 | t1 = r23 * a; 52 | a1 = (int)t1; 53 | a2 = a - t23 * a1; 54 | 55 | /*c--------------------------------------------------------------------- 56 | c Break X into two parts such that X = 2^23 * X1 + X2, compute 57 | c Z = A1 * X2 + A2 * X1 (mod 2^23), and then 58 | c X = 2^23 * Z + A2 * X2 (mod 2^46). 59 | c---------------------------------------------------------------------*/ 60 | t1 = r23 * (*x); 61 | x1 = (int)t1; 62 | x2 = (*x) - t23 * x1; 63 | t1 = a1 * x2 + a2 * x1; 64 | t2 = (int)(r23 * t1); 65 | z = t1 - t23 * t2; 66 | t3 = t23 * z + a2 * x2; 67 | t4 = (int)(r46 * t3); 68 | (*x) = t3 - t46 * t4; 69 | 70 | return (r46 * (*x)); 71 | } 72 | 73 | /*c--------------------------------------------------------------------- 74 | c---------------------------------------------------------------------*/ 75 | 76 | void vranlc (int n, double *x_seed, double a, double y[]) { 77 | 78 | /*c--------------------------------------------------------------------- 79 | c---------------------------------------------------------------------*/ 80 | 81 | /*c--------------------------------------------------------------------- 82 | c 83 | c This routine generates N uniform pseudorandom double precision numbers in 84 | c the range (0, 1) by using the linear congruential generator 85 | c 86 | c x_{k+1} = a x_k (mod 2^46) 87 | c 88 | c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers 89 | c before repeating. The argument A is the same as 'a' in the above formula, 90 | c and X is the same as x_0. A and X must be odd double precision integers 91 | c in the range (1, 2^46). The N results are placed in Y and are normalized 92 | c to be between 0 and 1. X is updated to contain the new seed, so that 93 | c subsequent calls to VRANLC using the same arguments will generate a 94 | c continuous sequence. 
If N is zero, only initialization is performed, and 95 | c the variables X, A and Y are ignored. 96 | c 97 | c This routine is the standard version designed for scalar or RISC systems. 98 | c However, it should produce the same results on any single processor 99 | c computer with at least 48 mantissa bits in double precision floating point 100 | c data. On 64 bit systems, double precision should be disabled. 101 | c 102 | c---------------------------------------------------------------------*/ 103 | 104 | int i; 105 | double x,t1,t2,t3,t4,a1,a2,x1,x2,z; 106 | 107 | /*c--------------------------------------------------------------------- 108 | c Break A into two parts such that A = 2^23 * A1 + A2. 109 | c---------------------------------------------------------------------*/ 110 | t1 = r23 * a; 111 | a1 = (int)t1; 112 | a2 = a - t23 * a1; 113 | x = *x_seed; 114 | 115 | /*c--------------------------------------------------------------------- 116 | c Generate N results. This loop is not vectorizable. 117 | c---------------------------------------------------------------------*/ 118 | for (i = 1; i <= n; i++) { 119 | 120 | /*c--------------------------------------------------------------------- 121 | c Break X into two parts such that X = 2^23 * X1 + X2, compute 122 | c Z = A1 * X2 + A2 * X1 (mod 2^23), and then 123 | c X = 2^23 * Z + A2 * X2 (mod 2^46). 
124 | c---------------------------------------------------------------------*/ 125 | t1 = r23 * x; 126 | x1 = (int)t1; 127 | x2 = x - t23 * x1; 128 | t1 = a1 * x2 + a2 * x1; 129 | t2 = (int)(r23 * t1); 130 | z = t1 - t23 * t2; 131 | t3 = t23 * z + a2 * x2; 132 | t4 = (int)(r46 * t3); 133 | x = t3 - t46 * t4; 134 | y[i] = r46 * x; 135 | } 136 | *x_seed = x; 137 | } 138 | -------------------------------------------------------------------------------- /NPB-FF/common/c_timers.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #include "wtime.hpp" 5 | #include 6 | 7 | /* Prototype */ 8 | void wtime( double * ); 9 | 10 | 11 | 12 | /*****************************************************************/ 13 | /****** E L A P S E D _ T I M E ******/ 14 | /*****************************************************************/ 15 | double elapsed_time( void ) 16 | { 17 | double t; 18 | 19 | wtime( &t ); 20 | return( t ); 21 | } 22 | 23 | 24 | double start[64], elapsed[64]; 25 | 26 | /*****************************************************************/ 27 | /****** T I M E R _ C L E A R ******/ 28 | /*****************************************************************/ 29 | void timer_clear( int n ) 30 | { 31 | elapsed[n] = 0.0; 32 | } 33 | 34 | 35 | /*****************************************************************/ 36 | /****** T I M E R _ S T A R T ******/ 37 | /*****************************************************************/ 38 | void timer_start( int n ) 39 | { 40 | start[n] = elapsed_time(); 41 | } 42 | 43 | 44 | /*****************************************************************/ 45 | /****** T I M E R _ S T O P ******/ 46 | /*****************************************************************/ 47 | void timer_stop( int n ) 48 | { 49 | double t, now; 50 | 51 | now = elapsed_time(); 52 | t = now - start[n]; 53 | elapsed[n] += t; 54 | 55 | } 56 | 57 | 58 | /*****************************************************************/ 59 | /****** 
T I M E R _ R E A D ******/ 60 | /*****************************************************************/ 61 | double timer_read( int n ) 62 | { 63 | return( elapsed[n] ); 64 | } 65 | 66 | -------------------------------------------------------------------------------- /NPB-FF/common/npb-CPP.hpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | 6 | typedef int boolean; 7 | typedef struct { double real; double imag; } dcomplex; 8 | 9 | #define TRUE 1 10 | #define FALSE 0 11 | 12 | #define max(a,b) (((a) > (b)) ? (a) : (b)) 13 | #define min(a,b) (((a) < (b)) ? (a) : (b)) 14 | #define pow2(a) ((a)*(a)) 15 | 16 | #define get_real(c) c.real 17 | #define get_imag(c) c.imag 18 | #define cadd(c,a,b) (c.real = a.real + b.real, c.imag = a.imag + b.imag) 19 | #define csub(c,a,b) (c.real = a.real - b.real, c.imag = a.imag - b.imag) 20 | #define cmul(c,a,b) (c.real = a.real * b.real - a.imag * b.imag, \ 21 | c.imag = a.real * b.imag + a.imag * b.real) 22 | #define crmul(c,a,b) (c.real = a.real * b, c.imag = a.imag * b) 23 | 24 | extern double randlc(double *, double); 25 | extern void vranlc(int, double *, double, double *); 26 | extern void timer_clear(int); 27 | extern void timer_start(int); 28 | extern void timer_stop(int); 29 | extern double timer_read(int); 30 | 31 | extern void c_print_results(char *name, char class_npb, int n1, int n2, 32 | int n3, int niter, double t, 33 | double mops, char *optype, int passed_verification, 34 | char *npbversion, char *compiletime, char *cc, 35 | char *clink, char *c_lib, char *c_inc, 36 | char *cflags, char *clinkflags, char *rand); 37 | -------------------------------------------------------------------------------- /NPB-FF/common/wtime.cpp: -------------------------------------------------------------------------------- 1 | #include "wtime.hpp" 2 | #include 3 | 4 | void wtime(double *t) 5 | { 6 | static int sec = -1; 7 | struct timeval tv; 8 | gettimeofday(&tv, 
0); 9 | if (sec < 0) sec = tv.tv_sec; 10 | *t = (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec; 11 | } 12 | 13 | 14 | -------------------------------------------------------------------------------- /NPB-FF/common/wtime.hpp: -------------------------------------------------------------------------------- 1 | /* C/Fortran interface is different on different machines. 2 | * You may need to tweak this. 3 | */ 4 | 5 | 6 | #if defined(IBM) 7 | #define wtime wtime 8 | #elif defined(CRAY) 9 | #define wtime WTIME 10 | #else 11 | #define wtime wtime_ 12 | #endif 13 | -------------------------------------------------------------------------------- /NPB-FF/common/wtime_sgi64.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | /* The following works on SGI Power Challenge systems */ 10 | 11 | typedef unsigned long iotimer_t; 12 | 13 | unsigned int cycleval; 14 | volatile iotimer_t *iotimer_addr, base_counter; 15 | double resolution; 16 | 17 | /* address_t is an integer type big enough to hold an address */ 18 | typedef unsigned long address_t; 19 | 20 | 21 | 22 | void timer_init() 23 | { 24 | 25 | int fd; 26 | char *virt_addr; 27 | address_t phys_addr, page_offset, pagemask, pagebase_addr; 28 | 29 | pagemask = getpagesize() - 1; 30 | errno = 0; 31 | phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval); 32 | if (errno != 0) { 33 | perror("SGI_QUERY_CYCLECNTR"); 34 | exit(1); 35 | } 36 | /* rel_addr = page offset of physical address */ 37 | page_offset = phys_addr & pagemask; 38 | pagebase_addr = phys_addr - page_offset; 39 | fd = open("/dev/mmem", O_RDONLY); 40 | 41 | virt_addr = mmap(0, pagemask, PROT_READ, MAP_PRIVATE, fd, pagebase_addr); 42 | virt_addr = virt_addr + page_offset; 43 | iotimer_addr = (iotimer_t *)virt_addr; 44 | /* cycleval in picoseconds to this gives resolution in seconds */ 45 | resolution = 1.0e-12*cycleval; 46 | base_counter = 
*iotimer_addr; 47 | } 48 | 49 | void wtime_(double *time) 50 | { 51 | static int initialized = 0; 52 | volatile iotimer_t counter_value; 53 | if (!initialized) { 54 | timer_init(); 55 | initialized = 1; 56 | } 57 | counter_value = *iotimer_addr - base_counter; 58 | *time = (double)counter_value * resolution; 59 | } 60 | 61 | 62 | void wtime(double *time) 63 | { 64 | static int initialized = 0; 65 | volatile iotimer_t counter_value; 66 | if (!initialized) { 67 | timer_init(); 68 | initialized = 1; 69 | } 70 | counter_value = *iotimer_addr - base_counter; 71 | *time = (double)counter_value * resolution; 72 | } 73 | 74 | 75 | -------------------------------------------------------------------------------- /NPB-FF/config/make.def: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------------------------- 2 | # 3 | # SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. 4 | # 5 | #--------------------------------------------------------------------------- 6 | 7 | #--------------------------------------------------------------------------- 8 | # Items in this file will need to be changed for each platform. 9 | # (Note these definitions are inconsistent with NPB2.1.) 
10 | #--------------------------------------------------------------------------- 11 | 12 | #--------------------------------------------------------------------------- 13 | # Parallel C: 14 | # 15 | # CC - C compiler 16 | # CFLAGS - C compilation arguments 17 | # C_INC - any -I arguments required for compiling C 18 | # CLINK - C linker 19 | # CLINKFLAGS - C linker flags 20 | # C_LIB - any -L and -l arguments required for linking C 21 | # 22 | # compilations are done with $(CC) $(C_INC) $(CFLAGS) or 23 | # $(CC) $(CFLAGS) 24 | # linking is done with $(CLINK) $(C_LIB) $(CLINKFLAGS) 25 | #--------------------------------------------------------------------------- 26 | 27 | #--------------------------------------------------------------------------- 28 | # This is the C compiler used for OpenMP programs 29 | #--------------------------------------------------------------------------- 30 | CC = g++ -std=c++14 31 | #gcc #cc 32 | # This links C programs; usually the same as ${CC} 33 | CLINK = $(CC) 34 | 35 | #--------------------------------------------------------------------------- 36 | # These macros are passed to the linker 37 | #--------------------------------------------------------------------------- 38 | C_LIB = -lm 39 | 40 | #--------------------------------------------------------------------------- 41 | # These macros are passed to the compiler 42 | #--------------------------------------------------------------------------- 43 | C_INC = -I../common 44 | 45 | #--------------------------------------------------------------------------- 46 | # Global *compile time* flags for C programs 47 | #--------------------------------------------------------------------------- 48 | CFLAGS = -O3 -I $(HOME)/fastflow -DBLOCKING_MODE -pthread 49 | # CFLAGS = -g 50 | 51 | #--------------------------------------------------------------------------- 52 | # Global *link time* flags. Flags for increasing maximum executable 53 | # size usually go here. 
54 | #--------------------------------------------------------------------------- 55 | CLINKFLAGS = -O3 -I $(HOME)/fastflow -DBLOCKING_MODE -pthread 56 | 57 | 58 | #--------------------------------------------------------------------------- 59 | # Utilities C: 60 | # 61 | # This is the C compiler used to compile C utilities. Flags required by 62 | # this compiler go here also; typically there are few flags required; hence 63 | # there are no separate macros provided for such flags. 64 | #--------------------------------------------------------------------------- 65 | UCC = cc 66 | 67 | 68 | #--------------------------------------------------------------------------- 69 | # Destination of executables, relative to subdirs of the main directory. . 70 | #--------------------------------------------------------------------------- 71 | BINDIR = ../bin 72 | 73 | 74 | #--------------------------------------------------------------------------- 75 | # The variable RAND controls which random number generator 76 | # is used. It is described in detail in Doc/README.install. 77 | # Use "randi8" unless there is a reason to use another one. 78 | # Other allowed values are "randi8_safe", "randdp" and "randdpvec" 79 | #--------------------------------------------------------------------------- 80 | # RAND = randi8 81 | # The following is highly reliable but may be slow: 82 | RAND = randdp 83 | 84 | 85 | #--------------------------------------------------------------------------- 86 | # The variable WTIME is the name of the wtime source code module in the 87 | # NPB2.x/common directory. 
88 | # For most machines, use wtime.cpp 89 | # For SGI power challenge: use wtime_sgi64.cpp 90 | #--------------------------------------------------------------------------- 91 | WTIME = wtime.cpp 92 | 93 | 94 | #--------------------------------------------------------------------------- 95 | # Enable if either Cray or IBM: 96 | # (no such flag for most machines: see common/wtime.hpp) 97 | # This is used by the C compiler to pass the machine name to common/wtime.hpp, 98 | # where the C/Fortran binding interface format is determined 99 | #--------------------------------------------------------------------------- 100 | # MACHINE = -DCRAY 101 | # MACHINE = -DIBM 102 | 103 | 104 | -------------------------------------------------------------------------------- /NPB-FF/config/suite.def: -------------------------------------------------------------------------------- 1 | # config/suite.def 2 | # This file is used to build several benchmarks with a single command. 3 | # Typing "make suite" in the main directory will build all the benchmarks 4 | # specified in this file. 5 | # Each line of this file contains a benchmark name, class, and number 6 | # of nodes. The name is one of "cg", "is", "ep", "mg", "ft" 7 | # The class is one of "S", "W", "A", "B", and "C". 8 | # No blank lines. 9 | # The following example builds serial sample sizes of all benchmarks. 10 | ft A 11 | mg A 12 | is A 13 | ep A 14 | cg A 15 | -------------------------------------------------------------------------------- /NPB-FF/sys/Makefile: -------------------------------------------------------------------------------- 1 | include ../config/make.def 2 | 3 | # Note that COMPILE is also defined in make.common and should 4 | # be the same. We can't include make.common because it has a lot 5 | # of other garbage. 6 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS) 7 | 8 | all: setparams 9 | 10 | # setparams creates an npbparams.hpp file for each benchmark 11 | # configuration.
npbparams.h also contains info about how a benchmark 12 | # was compiled and linked 13 | 14 | setparams: setparams.cpp ../config/make.def 15 | $(UCC) -o setparams setparams.cpp 16 | 17 | 18 | clean: 19 | -rm -f setparams setparams.hpp npbparams.hpp 20 | -rm -f *~ *.o 21 | 22 | -------------------------------------------------------------------------------- /NPB-FF/sys/README: -------------------------------------------------------------------------------- 1 | This directory contains utilities and files used by the 2 | build process. You should not need to change anything 3 | in this directory. 4 | 5 | Original Files 6 | -------------- 7 | setparams.c: 8 | Source for the setparams program. This program is used internally 9 | in the build process to create the file "npbparams.h" for each 10 | benchmark. npbparams.h contains Fortran or C parameters to build a 11 | benchmark for a specific class. The setparams program is never run 12 | directly by a user. Its invocation syntax is 13 | 14 | "setparams benchmark-name class". 15 | 16 | It examines the file "npbparams.h" in the current directory. If 17 | the specified parameters are the same as those in the npbparams.h 18 | file, nothing is changed. If the file does not exist or corresponds 19 | to a different class/number of nodes, it is (re)built. 20 | One of the more complicated things in npbparams.h is that it 21 | contains, in a Fortran string, the compiler flags used to build a 22 | benchmark, so that a benchmark can print out how it was compiled. 23 | 24 | make.common 25 | A makefile segment that is included in each individual benchmark 26 | program makefile. It sets up some standard macros (COMPILE, etc) 27 | and makes sure everything is configured correctly (npbparams.h) 28 | 29 | Makefile 30 | Builds setparams 31 | 32 | README 33 | This file.
34 | 35 | 36 | Created files 37 | ------------- 38 | 39 | setparams 40 | See descriptions above 41 | 42 | -------------------------------------------------------------------------------- /NPB-FF/sys/make.common: -------------------------------------------------------------------------------- 1 | PROGRAM = $(BINDIR)/$(BENCHMARK).$(CLASS) 2 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS) 3 | CCOMPILE = $(CC) -c $(C_INC) $(CFLAGS) 4 | 5 | # Class "U" is used internally by the setparams program to mean 6 | # "unknown". This means that if you don't specify CLASS= 7 | # on the command line, you'll get an error. It would be nice 8 | # to be able to avoid this, but we'd have to get information 9 | # from the setparams back to the make program, which isn't easy. 10 | CLASS=U 11 | 12 | default:: ${PROGRAM} 13 | 14 | # This makes sure the configuration utility setparams 15 | # is up to date. 16 | # Note that this must be run every time, which is why the 17 | # target does not exist and is not created. 18 | # If you create a file called "config" you will break things. 
19 | config: 20 | @cd ../sys; ${MAKE} all 21 | ../sys/setparams ${BENCHMARK} ${CLASS} 22 | 23 | COMMON=../common 24 | ${COMMON}/${RAND}.o: ${COMMON}/${RAND}.f 25 | cd ${COMMON}; ${FCOMPILE} ${RAND}.f 26 | 27 | ${COMMON}/c_${RAND}.o: ${COMMON}/c_${RAND}.cpp 28 | cd ${COMMON}; ${CCOMPILE} c_${RAND}.cpp 29 | 30 | ${COMMON}/print_results.o: ${COMMON}/print_results.f 31 | cd ${COMMON}; ${FCOMPILE} print_results.f 32 | 33 | ${COMMON}/c_print_results.o: ${COMMON}/c_print_results.cpp 34 | cd ${COMMON}; ${CCOMPILE} c_print_results.cpp 35 | 36 | ${COMMON}/timers.o: ${COMMON}/timers.f 37 | cd ${COMMON}; ${FCOMPILE} timers.f 38 | 39 | ${COMMON}/c_timers.o: ${COMMON}/c_timers.cpp 40 | cd ${COMMON}; ${CCOMPILE} c_timers.cpp 41 | 42 | ${COMMON}/wtime.o: ${COMMON}/${WTIME} 43 | cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/${WTIME} 44 | # For most machines or CRAY or IBM 45 | # cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/wtime.c 46 | # For a precise timer on an SGI Power Challenge, try: 47 | # cd ${COMMON}; ${CCOMPILE} -o wtime.o ${COMMON}/wtime_sgi64.c 48 | 49 | ${COMMON}/c_wtime.o: ${COMMON}/${WTIME} 50 | cd ${COMMON}; ${CCOMPILE} -o c_wtime.o ${COMMON}/${WTIME} 51 | 52 | 53 | # Normally setparams updates npbparams.h only if the settings (CLASS) 54 | # have changed. However, we also want to update if the compile options 55 | # may have changed (set in ../config/make.def). 56 | npbparams.hpp: ../config/make.def 57 | @ echo make.def modified. 
Rebuilding npbparams.hpp just in case 58 | rm -f npbparams.hpp 59 | ../sys/setparams ${BENCHMARK} ${CLASS} 60 | 61 | # So that "make benchmark-name" works 62 | ${BENCHMARK}: default 63 | ${BENCHMARKU}: default 64 | 65 | 66 | -------------------------------------------------------------------------------- /NPB-FF/sys/print_header: -------------------------------------------------------------------------------- 1 | echo '' 2 | echo ' =========================================' 3 | echo ' = NAS Parallel Benchmarks =' 4 | echo ' = FastFlow C++ Versions =' 5 | echo ' = Developed by: Dalvan Griebler =' 6 | echo ' = Júnior Löff =' 7 | echo ' = =' 8 | echo ' = Warning: in case of problems =' 9 | echo ' = send an email to us: =' 10 | echo ' = dalvan.griebler@acad.pucrs.br =' 11 | echo ' = junior.loff@acad.pucrs.br =' 12 | echo ' =========================================' 13 | echo '' 14 | -------------------------------------------------------------------------------- /NPB-FF/sys/print_instructions: -------------------------------------------------------------------------------- 1 | echo '' 2 | echo ' To make a NAS benchmark type ' 3 | echo '' 4 | echo ' make CLASS=' 5 | echo '' 6 | echo ' where is "cg", "ep", "ft", "is", or "mg"' 7 | echo ' is "S", "W", "A", "B" or "C"' 8 | echo '' 9 | echo ' To make a set of benchmarks, create the file config/suite.def' 10 | echo ' according to the instructions in config/suite.def.template and type' 11 | echo '' 12 | echo ' make suite' 13 | echo '' 14 | echo ' ***************************************************************' 15 | echo ' * Remember to edit the file config/make.def for site specific *' 16 | echo ' * information as described in the README file *' 17 | echo ' ***************************************************************' 18 | 19 | -------------------------------------------------------------------------------- /NPB-OMP/CG/Makefile: -------------------------------------------------------------------------------- 1 | 
SHELL=/bin/sh 2 | BENCHMARK=cg 3 | BENCHMARKU=CG 4 | 5 | include ../config/make.def 6 | 7 | OBJS = cg.o ${COMMON}/c_print_results.o \ 8 | ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | cg.o: cg.cpp npbparams.hpp 16 | ${CCOMPILE} cg.cpp 17 | 18 | clean: 19 | - rm -f *.o *~ 20 | - rm -f npbparams.hpp core 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /NPB-OMP/EP/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=ep 3 | BENCHMARKU=EP 4 | 5 | include ../config/make.def 6 | 7 | OBJS = ep.o ${COMMON}/c_print_results.o ${COMMON}/c_${RAND}.o \ 8 | ${COMMON}/c_timers.o ${COMMON}/c_wtime.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | 16 | ep.o: ep.cpp npbparams.hpp 17 | ${CCOMPILE} ep.cpp 18 | 19 | clean: 20 | - rm -f *.o *~ 21 | - rm -f npbparams.hpp core 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /NPB-OMP/EP/ep.cpp: -------------------------------------------------------------------------------- 1 | /*-------------------------------------------------------------------- 2 | 3 | Information on NAS Parallel Benchmarks is available at: 4 | 5 | http://www.nas.nasa.gov/Software/NPB/ 6 | 7 | Authors: P. O. Frederickson 8 | D. H. Bailey 9 | A. C. 
Woo 10 | 11 | CPP and OpenMP version: 12 | Dalvan Griebler 13 | Júnior Löff 14 | 15 | --------------------------------------------------------------------*/ 16 | 17 | 18 | 19 | #include "npbparams.hpp" 20 | #include 21 | #include <../common/npb-CPP.hpp> 22 | 23 | /* parameters */ 24 | #define MK 16 25 | #define MM (M - MK) 26 | #define NN (1 << MM) 27 | #define NK (1 << MK) 28 | #define NQ 10 29 | #define EPSILON 1.0e-8 30 | #define A 1220703125.0 31 | #define S 271828183.0 32 | #define TIMERS_ENABLED FALSE 33 | 34 | /* global variables */ 35 | /* common /storage/ */ 36 | static double x[(2*NK)+1]; 37 | #pragma omp threadprivate(x) 38 | static double q[NQ]; 39 | 40 | /*-------------------------------------------------------------------- 41 | program EMBAR 42 | c-------------------------------------------------------------------*/ 43 | /* 44 | c This is the serial version of the APP Benchmark 1, 45 | c the "embarassingly parallel" benchmark. 46 | c 47 | c M is the Log_2 of the number of complex pairs of uniform (0, 1) random 48 | c numbers. MK is the Log_2 of the size of each batch of uniform random 49 | c numbers. MK can be set for convenience on a given system, since it does 50 | c not affect the results. 51 | */ 52 | int main(int argc, char **argv) { 53 | double Mops, t1, sx, sy, tm, an, gc; 54 | double dum[3] = { 1.0, 1.0, 1.0 }; 55 | int np,i, k, nit, k_offset, j; 56 | int nthreads = 1; 57 | boolean verified; 58 | char size[13+1]; /* character*13 */ 59 | 60 | /* 61 | c Because the size of the problem is too large to store in a 32-bit 62 | c integer for some classes, we put it into a string (for printing). 
63 | c Have to strip off the decimal point put in there by the floating 64 | c point print statement (internal file) 65 | */ 66 | 67 | printf("\n\n NAS Parallel Benchmarks 4.0 OpenMP C++ version"" - EP Benchmark\n"); 68 | printf("\n\n Developed by: Dalvan Griebler \n"); 69 | sprintf(size, "%12.0f", pow(2.0, M+1)); 70 | for (j = 13; j >= 1; j--) { 71 | if (size[j] == '.') size[j] = ' '; 72 | } 73 | printf(" Number of random numbers generated: %13s\n", size); 74 | 75 | verified = FALSE; 76 | 77 | /* 78 | c Compute the number of "batches" of random number pairs generated 79 | c per processor. Adjust if the number of processors does not evenly 80 | c divide the total number 81 | */ 82 | np = NN; 83 | 84 | /* 85 | c Call the random number generator functions and initialize 86 | c the x-array to reduce the effects of paging on the timings. 87 | c Also, call all mathematical functions that are used. Make 88 | c sure these initializations cannot be eliminated as dead code. 89 | */ 90 | vranlc(0, &(dum[0]), dum[1], &(dum[2])); 91 | dum[0] = randlc(&(dum[1]), dum[2]); 92 | for (i = 0; i < 2*NK; i++) x[i] = -1.0e99; 93 | Mops = log(sqrt(fabs(max(1.0, 1.0)))); 94 | 95 | 96 | 97 | timer_clear(1); 98 | timer_clear(2); 99 | timer_clear(3); 100 | 101 | timer_start(1); 102 | 103 | vranlc(0, &t1, A, x); 104 | 105 | /* Compute AN = A ^ (2 * NK) (mod 2^46). */ 106 | 107 | t1 = A; 108 | 109 | for ( i = 1; i <= MK+1; i++) { 110 | an = randlc(&t1, t1); 111 | } 112 | 113 | an = t1; 114 | gc = 0.0; 115 | sx = 0.0; 116 | sy = 0.0; 117 | 118 | for ( i = 0; i <= NQ - 1; i++) { 119 | q[i] = 0.0; 120 | } 121 | 122 | /* 123 | c Each instance of this loop may be performed independently. 
We compute 124 | c the k offsets separately to take into account the fact that some nodes 125 | c have more numbers to generate than others 126 | */ 127 | k_offset = -1; 128 | 129 | #pragma omp parallel copyin(x) 130 | { 131 | double t1, t2, t3, t4, x1, x2; 132 | int kk, i, ik, l; 133 | double qq[NQ]; /* private copy of q[0:NQ-1] */ 134 | 135 | for (i = 0; i < NQ; i++) qq[i] = 0.0; 136 | 137 | #pragma omp for reduction(+:sx,sy) 138 | for (k = 1; k <= np; k++) { 139 | kk = k_offset + k; 140 | t1 = S; 141 | t2 = an; 142 | 143 | /* Find starting seed t1 for this kk. */ 144 | 145 | for (i = 1; i <= 100; i++) { 146 | ik = kk / 2; 147 | if (2 * ik != kk) t3 = randlc(&t1, t2); 148 | if (ik == 0) break; 149 | t3 = randlc(&t2, t2); 150 | kk = ik; 151 | } 152 | 153 | /* Compute uniform pseudorandom numbers. */ 154 | 155 | if (TIMERS_ENABLED == TRUE) timer_start(3); 156 | vranlc(2*NK, &t1, A, x); 157 | if (TIMERS_ENABLED == TRUE) timer_stop(3); 158 | 159 | /* 160 | c Compute Gaussian deviates by acceptance-rejection method and 161 | c tally counts in concentric square annuli. This loop is not 162 | c vectorizable. 
163 | */ 164 | if (TIMERS_ENABLED == TRUE) timer_start(2); 165 | 166 | for ( i = 1; i <= NK; i++) { 167 | x1 = 2.0 * x[2*i-1] - 1.0; 168 | x2 = 2.0 * x[2*i] - 1.0; 169 | t1 = pow2(x1) + pow2(x2); 170 | if (t1 <= 1.0) { 171 | t2 = sqrt(-2.0 * log(t1) / t1); 172 | t3 = (x1 * t2); /* Xi */ 173 | t4 = (x2 * t2); /* Yi */ 174 | l = max(fabs(t3), fabs(t4)); 175 | qq[l] += 1.0; /* counts */ 176 | sx = sx + t3; /* sum of Xi */ 177 | sy = sy + t4; /* sum of Yi */ 178 | } 179 | } 180 | if (TIMERS_ENABLED == TRUE) timer_stop(2); 181 | } 182 | #pragma omp critical 183 | { 184 | for (i = 0; i <= NQ - 1; i++) q[i] += qq[i]; 185 | } 186 | #if defined(_OPENMP) 187 | #pragma omp master 188 | nthreads = omp_get_num_threads(); 189 | #endif /* _OPENMP */ 190 | } /* end of parallel region */ 191 | for (i = 0; i <= NQ-1; i++) { 192 | gc = gc + q[i]; 193 | } 194 | 195 | timer_stop(1); 196 | tm = timer_read(1); 197 | 198 | 199 | 200 | nit = 0; 201 | if (M == 24) { 202 | if((fabs((sx- (-3.247834652034740e3))/-3.247834652034740e3) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/-6.958407078382297e3) <= EPSILON)) { 203 | verified = TRUE; 204 | } 205 | } else if (M == 25) { 206 | if ((fabs((sx- (-2.863319731645753e3))/-2.863319731645753e3) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/-6.320053679109499e3) <= EPSILON)) { 207 | verified = TRUE; 208 | } 209 | } else if (M == 28) { 210 | //if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) { 211 | if ((fabs((sx- (-4.295875165629892e3))/-4.295875165629892e3) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/-1.580732573678431e4) <= EPSILON)) { 212 | verified = TRUE; 213 | } 214 | } else if (M == 30) { 215 | if ((fabs((sx- (4.033815542441498e4))/4.033815542441498e4) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/-2.660669192809235e4) <= EPSILON)) { 216 | verified = TRUE; 217 | } 218 | } else if (M == 32) { 219 | if ((fabs((sx- (4.764367927995374e4))/4.764367927995374e4) <= 
EPSILON) && (fabs((sy- (-8.084072988043731e4))/-8.084072988043731e4) <= EPSILON)) { 220 | verified = TRUE; 221 | } 222 | } else if (M == 36) { 223 | if ((fabs((sx- (1.982481200946593e5))/1.982481200946593e5) <= EPSILON) && (fabs((sy- (-1.020596636361769e5))/-1.020596636361769e5) <= EPSILON)) { 224 | verified = TRUE; 225 | } 226 | } else if (M == 40) { 227 | if ((fabs((sx- (-5.319717441530e5))/-5.319717441530e5) <= EPSILON) && (fabs((sy- (-3.688834557731e5))/-3.688834557731e5) <= EPSILON)) { 228 | verified = TRUE; 229 | } 230 | } 231 | 232 | Mops = pow(2.0, M+1)/tm/1000000.0; 233 | 234 | printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. Gaussian Pairs = %15.0f\n" 235 | "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy); 236 | for (i = 0; i <= NQ-1; i++) { 237 | printf("%3d %15.0f\n", i, q[i]); 238 | } 239 | 240 | c_print_results((char*)"EP", CLASS, M+1, 0, 0, nit, nthreads, tm, Mops, (char*)"Random numbers generated", 241 | verified, (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7); 242 | 243 | if (TIMERS_ENABLED == TRUE) { 244 | printf("Total time: %f", timer_read(1)); 245 | printf("Gaussian pairs: %f", timer_read(2)); 246 | printf("Random numbers: %f", timer_read(3)); 247 | } 248 | return 0; 249 | } 250 | -------------------------------------------------------------------------------- /NPB-OMP/FT/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=ft 3 | BENCHMARKU=FT 4 | 5 | include ../config/make.def 6 | 7 | OBJS = ft.o ${COMMON}/c_${RAND}.o ${COMMON}/c_print_results.o \ 8 | ${COMMON}/c_timers.o ${COMMON}/c_wtime.o #../omp-prof.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | ft.o: ft.cpp global.hpp npbparams.hpp 16 | ${CCOMPILE} ft.cpp 17 | 18 | clean: 19 | - rm -f *.o *~ mputil* 20 | - rm -f 
ft npbparams.hpp core 21 | -------------------------------------------------------------------------------- /NPB-OMP/FT/global.hpp: -------------------------------------------------------------------------------- 1 | #include "npbparams.hpp" 2 | 3 | 4 | /* 5 | c If processor array is 1x1 -> 0D grid decomposition 6 | 7 | 8 | c Cache blocking params. These values are good for most 9 | c RISC processors. 10 | c FFT parameters: 11 | c fftblock controls how many ffts are done at a time. 12 | c The default is appropriate for most cache-based machines 13 | c On vector machines, the FFT can be vectorized with vector 14 | c length equal to the block size, so the block size should 15 | c be as large as possible. This is the size of the smallest 16 | c dimension of the problem: 128 for class A, 256 for class B and 17 | c 512 for class C. 18 | */ 19 | 20 | #define FFTBLOCK_DEFAULT 16 21 | #define FFTBLOCKPAD_DEFAULT 18 22 | 23 | #define FFTBLOCK FFTBLOCK_DEFAULT 24 | #define FFTBLOCKPAD FFTBLOCKPAD_DEFAULT 25 | 26 | /* COMMON block: blockinfo */ 27 | int fftblock; 28 | int fftblockpad; 29 | 30 | /* 31 | c we need a bunch of logic to keep track of how 32 | c arrays are laid out. 33 | 34 | 35 | c Note: this serial version is the derived from the parallel 0D case 36 | c of the ft NPB. 
37 | c The computation proceeds logically as 38 | 39 | c set up initial conditions 40 | c fftx(1) 41 | c transpose (1->2) 42 | c ffty(2) 43 | c transpose (2->3) 44 | c fftz(3) 45 | c time evolution 46 | c fftz(3) 47 | c transpose (3->2) 48 | c ffty(2) 49 | c transpose (2->1) 50 | c fftx(1) 51 | c compute residual(1) 52 | 53 | c for the 0D, 1D, 2D strategies, the layouts look like xxx 54 | c 55 | c 0D 1D 2D 56 | c 1: xyz xyz xyz 57 | c 2: xyz xyz yxz 58 | c 3: xyz zyx zxy 59 | 60 | c the array dimensions are stored in dims(coord, phase) 61 | */ 62 | 63 | /* COMMON block: layout */ 64 | static int dims[3][3]; 65 | static int xstart[3]; 66 | static int ystart[3]; 67 | static int zstart[3]; 68 | static int xend[3]; 69 | static int yend[3]; 70 | static int zend[3]; 71 | 72 | #define T_TOTAL 0 73 | #define T_SETUP 1 74 | #define T_FFT 2 75 | #define T_EVOLVE 3 76 | #define T_CHECKSUM 4 77 | #define T_FFTLOW 5 78 | #define T_FFTCOPY 6 79 | #define T_MAX 7 80 | 81 | #define TIMERS_ENABLED TRUE 82 | 83 | /* other stuff */ 84 | 85 | #define SEED 314159265.0 86 | #define A 1220703125.0 87 | #define PI 3.141592653589793238 88 | #define ALPHA 1.0e-6 89 | 90 | #define EXPMAX (NITER_DEFAULT*(NX*NX/4+NY*NY/4+NZ*NZ/4)) 91 | 92 | /* COMMON block: excomm */ 93 | static double ex[EXPMAX+1]; /* ex(0:expmax) */ 94 | 95 | /* 96 | c roots of unity array 97 | c relies on x being largest dimension? 
98 | */ 99 | 100 | /* COMMON block: ucomm */ 101 | static dcomplex u[NX]; 102 | 103 | /* for checksum data */ 104 | 105 | /* COMMON block: sumcomm */ 106 | static dcomplex sums[NITER_DEFAULT+1]; /* sums(0:niter_default) */ 107 | 108 | /* number of iterations*/ 109 | 110 | /* COMMON block: iter */ 111 | static int niter; 112 | 113 | -------------------------------------------------------------------------------- /NPB-OMP/IS/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=is 3 | BENCHMARKU=IS 4 | 5 | include ../config/make.def 6 | 7 | include ../sys/make.common 8 | 9 | OBJS = is.o \ 10 | ${COMMON}/c_print_results.o \ 11 | ${COMMON}/c_timers.o \ 12 | ${COMMON}/c_wtime.o 13 | 14 | 15 | ${PROGRAM}: config ${OBJS} 16 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 17 | 18 | is.o: is.cpp npbparams.hpp 19 | ${CCOMPILE} is.cpp 20 | 21 | clean: 22 | - rm -f *.o *~ mputil* 23 | - rm -f npbparams.hpp core 24 | - if [ -d rii_files ]; then rm -r rii_files; fi 25 | -------------------------------------------------------------------------------- /NPB-OMP/MG/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=mg 3 | BENCHMARKU=MG 4 | 5 | include ../config/make.def 6 | 7 | OBJS = mg.o ${COMMON}/c_print_results.o \ 8 | ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | mg.o: mg.cpp npbparams.hpp 16 | ${CCOMPILE} mg.cpp 17 | 18 | clean: 19 | - rm -f *.o *~ 20 | - rm -f npbparams.hpp core 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /NPB-OMP/MG/globals.hpp: -------------------------------------------------------------------------------- 1 | /*-------------------------------------------------------------------- 2 | c Parameter lm 
(declared and set in "npbparams.h") is the log-base2 of 3 | c the edge size max for the partition on a given node, so must be changed 4 | c either to save space (if running a small case) or made bigger for larger 5 | c cases, for example, 512^3. Thus lm=7 means that the largest dimension 6 | c of a partition that can be solved on a node is 2^7 = 128. lm is set 7 | c automatically in npbparams.h 8 | c Parameters ndim1, ndim2, ndim3 are the local problem dimensions. 9 | c-------------------------------------------------------------------*/ 10 | 11 | #include "npbparams.hpp" 12 | 13 | /* parameters */ 14 | /* actual dimension including ghost cells for communications */ 15 | #define NM (2+(2<<(LM-1))) 16 | /* size of rhs array */ 17 | #define NV (2+(2<<(NDIM1-1))*(2+(2<<(NDIM2-1)))*(2+(2<<(NDIM3-1)))) 18 | /* size of residual array */ 19 | #define NR ((8*(NV+(NM*NM)+5*NM+7*LM))/7) 20 | /* size of communication buffer */ 21 | #define NM2 (2*NM*NM) 22 | /* maximum number of levels */ 23 | #define MAXLEVEL 11 24 | 25 | /*---------------------------------------------------------------------*/ 26 | /* common /mg3/ */ 27 | static int nx[MAXLEVEL+1], ny[MAXLEVEL+1], nz[MAXLEVEL+1]; 28 | /* common /ClassType/ */ 29 | static char class_npb; 30 | /* common /my_debug/ */ 31 | static int debug_vec[8]; 32 | /* common /fap/ */ 33 | /*static int ir[MAXLEVEL], m1[MAXLEVEL], m2[MAXLEVEL], m3[MAXLEVEL];*/ 34 | static int m1[MAXLEVEL+1], m2[MAXLEVEL+1], m3[MAXLEVEL+1]; 35 | static int lt, lb; 36 | 37 | /*c--------------------------------------------------------------------- 38 | c Set at m=1024, can handle cases up to 1024^3 case 39 | c---------------------------------------------------------------------*/ 40 | #define M 1037 41 | 42 | /* common /buffer/ */ 43 | /*static double buff[4][NM2];*/ 44 | -------------------------------------------------------------------------------- /NPB-OMP/Makefile: -------------------------------------------------------------------------------- 1 | 
SHELL=/bin/sh 2 | CLASS=S 3 | SFILE=config/suite.def 4 | 5 | default: header 6 | @ $(SHELL) sys/print_instructions 7 | 8 | BT: bt 9 | bt: header 10 | cd BT; $(MAKE) CLASS=$(CLASS) 11 | 12 | SP: sp 13 | sp: header 14 | cd SP; $(MAKE) CLASS=$(CLASS) 15 | 16 | LU: lu 17 | lu: header 18 | cd LU; $(MAKE) CLASS=$(CLASS) 19 | 20 | MG: mg 21 | mg: header 22 | cd MG; $(MAKE) CLASS=$(CLASS) 23 | 24 | FT: ft 25 | ft: header 26 | cd FT; $(MAKE) CLASS=$(CLASS) 27 | 28 | IS: is 29 | is: header 30 | cd IS; $(MAKE) CLASS=$(CLASS) 31 | 32 | CG: cg 33 | cg: header 34 | cd CG; $(MAKE) CLASS=$(CLASS) 35 | 36 | EP: ep 37 | ep: header 38 | cd EP; $(MAKE) CLASS=$(CLASS) 39 | DC: dc 40 | dc: header 41 | cd DC; $(MAKE) CLASS=$(CLASS) 42 | 43 | # Awk script courtesy cmg@cray.com 44 | suite: 45 | @ awk '{ if ($$1 !~ /^#/ && NF > 0) \ 46 | printf "make %s CLASS=%s\n", $$1, $$2 }' $(SFILE) \ 47 | | $(SHELL) 48 | 49 | 50 | # It would be nice to make clean in each subdirectory (the targets 51 | # are defined) but on a really clean system this won't work 52 | # because those makefiles need config/make.def 53 | clean: 54 | - rm -f core 55 | - rm -f *~ */core */*~ */*.o */npbparams.hpp */*.obj */*.exe 56 | - rm -f sys/setparams sys/makesuite sys/setparams.hpp 57 | 58 | cleanall: clean 59 | - rm -r bin/* 60 | 61 | veryclean: clean 62 | - rm config/make.def config/suite.def Part* 63 | - rm bin/sp.* bin/lu.* bin/mg.* bin/ft.* bin/bt.* bin/is.* bin/ep.* bin/cg.* 64 | 65 | header: 66 | @ $(SHELL) sys/print_header 67 | 68 | kit: 69 | - makekit -s100k -k30 * */* */*/* 70 | 71 | 72 | -------------------------------------------------------------------------------- /NPB-OMP/README.md: -------------------------------------------------------------------------------- 1 | # Warning: this project is continued at [NPB-CPP](https://github.com/GMAP/NPB-CPP) 2 | 3 | ## We are happy to announce that both NPB Kernels and pseudo-application are available at our new repository 
[NPB-CPP](https://github.com/GMAP/NPB-CPP). 4 | 5 | This was our first work on NAS Parallel Benchmark (NPB) suite and many other works are now continuing this project in many different ways. 6 | 7 | *Note: this repository will no longer be updated, therefore, follow us at [NPB-CPP](https://github.com/GMAP/NPB-CPP)* 8 | 9 | 10 | ## How to cite this work 11 | 12 | [[DOI]](https://doi.org/10.1109/PDP2018.2018.00120) D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018. 13 | 14 | ## The NPB-CPP Benchmark 15 | 16 | These codes were converted to **C++** from the original [NPB3.3.1](https://doi.org/10.1109/PDP2018.2018.00120). We achieved similar performance in **C++** compared to the **Fortran** version. 17 | 18 | ================================================================== 19 | NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB 20 | 21 | Code contributors: 22 | Dalvan Griebler 23 | Júnior Löff 24 | 25 | Warning: in case of problems send an email to us: 26 | dalvan.griebler@acad.pucrs.br 27 | junior.loff@acad.pucrs.br 28 | ================================================================== 29 | 30 | 31 | This folder contains: 32 | 33 | - NPB-FF - Directory with the parallel version implemented in FastFlow 34 | - NPB-OMP - Directory with the parallel version translated from the original NPB version 35 | - NPB-SER - Directory with the serial version of the NPB ported to C++ 36 | - NPB-TBB - Directory with the parallel version implemented in Thread Building Blocks 37 | 38 | Each directory is independent and contains its own implemented version of the kernels: 39 | 40 | IS - Integer Sort, random memory access 41 | EP - Embarrassingly Parallel 42 | CG - Conjugate Gradient, irregular memory access and communication 43 | MG - 
Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive 44 | FT - discrete 3D fast Fourier Transform, all-to-all communication 45 | 46 | ## Software Requirements 47 | 48 | *Warning: our tests were made with GCC-5* 49 | 50 | **TBB** 51 | 52 | *Installation* 53 | 54 | apt-get install libtbb-dev 55 | 56 | **FastFlow** 57 | 58 | *Installation* 59 | 60 | svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow 61 | 62 | 63 | ## How to Compile 64 | 65 | Enter the directory from the version desired and execute: 66 | 67 | make _BENCHMARK CLASS=_VERSION 68 | 69 | 70 | _BENCHMARKs are: 71 | 72 | EP, CG, MG, IS and FT 73 | 74 | _VERSIONs are: 75 | 76 | Class S: small for quick test purposes 77 | Class W: workstation size (a 90's workstation; now likely too small) 78 | Classes A, B, C: standard test problems; ~4X size increase going from one class to the next 79 | Classes D, E, F: large test problems; ~16X size increase from each of the previous Classes 80 | 81 | 82 | Command: 83 | 84 | make ep CLASS=B 85 | -------------------------------------------------------------------------------- /NPB-OMP/bin/README.md: -------------------------------------------------------------------------------- 1 | # How to Cite our Work 2 | 3 | D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018. 4 | 5 | # The NPB-CPP Benchmark 6 | 7 | These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version. 
8 | 9 | ================================================================== 10 | NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB 11 | 12 | Code contributors: 13 | Dalvan Griebler 14 | Júnior Löff 15 | 16 | Warning: in case of problems send an email to us: 17 | dalvan.griebler@acad.pucrs.br 18 | junior.loff@acad.pucrs.br 19 | ================================================================== 20 | 21 | 22 | This folder contains: 23 | 24 | - NPB-FF - Directory with the parallel version implemented in FastFlow 25 | - NPB-OMP - Directory with the parallel version translated from the original NPB version 26 | - NPB-SER - Directory with the serial version of the NPB ported to C++ 27 | - NPB-TBB - Directory with the parallel version implemented in Thread Building Blocks 28 | 29 | Each directory is independent and contains its own implemented version of the kernels: 30 | 31 | IS - Integer Sort, random memory access 32 | EP - Embarrassingly Parallel 33 | CG - Conjugate Gradient, irregular memory access and communication 34 | MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive 35 | FT - discrete 3D fast Fourier Transform, all-to-all communication 36 | 37 | # Software Requirements 38 | 39 | *Warning: our tests were made with GCC-5* 40 | 41 | **TBB** 42 | 43 | *Installation* 44 | 45 | apt-get install libtbb-dev 46 | 47 | **FastFlow** 48 | 49 | *Installation* 50 | 51 | svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow 52 | 53 | 54 | # How to Compile 55 | 56 | Enter the directory from the version desired and execute: 57 | 58 | make _BENCHMARK CLASS=_VERSION 59 | 60 | 61 | _BENCHMARKs are: 62 | 63 | EP, CG, MG, IS and FT 64 | 65 | _VERSIONs are: 66 | 67 | Class S: small for quick test purposes 68 | Class W: workstation size (a 90's workstation; now likely too small) 69 | Classes A, B, C: standard test problems; ~4X size increase going from one class to the next 70 | Classes D, E, F: large test problems; ~16X 
size increase from each of the previous Classes 71 | 72 | 73 | Command: 74 | 75 | make ep CLASS=B -------------------------------------------------------------------------------- /NPB-OMP/common/c_print_results.cpp: -------------------------------------------------------------------------------- 1 | /*****************************************************************/ 2 | /****** C _ P R I N T _ R E S U L T S ******/ 3 | /*****************************************************************/ 4 | #include 5 | #include 6 | 7 | void c_print_results( char *name, char class_npb, int n1, int n2, int n3, int niter, int nthreads, double t, 8 | double mops, char *optype, int passed_verification, char *npbversion, char *compiletime, char *cc, 9 | char *clink, char *c_lib, char *c_inc, char *cflags, char *clinkflags, char *rand) 10 | { 11 | 12 | printf( "\n\n %s Benchmark Completed\n", name ); 13 | 14 | printf( " class_npb = %c\n", class_npb ); 15 | 16 | if( n2 == 0 && n3 == 0 ) 17 | printf( " Size = %12d\n", n1 ); /* as in IS */ 18 | else 19 | printf( " Size = %3dx%3dx%3d\n", n1,n2,n3 ); 20 | 21 | printf( " Iterations = %12d\n", niter ); 22 | 23 | printf( " Threads = %12d\n", nthreads ); 24 | 25 | printf( " Time in seconds = %12.2f\n", t ); 26 | 27 | printf( " Mop/s total = %12.2f\n", mops ); 28 | 29 | printf( " Operation type = %24s\n", optype); 30 | 31 | if( passed_verification ) 32 | printf( " Verification = SUCCESSFUL\n" ); 33 | else 34 | printf( " Verification = UNSUCCESSFUL\n" ); 35 | 36 | printf( " Version = %12s\n", npbversion ); 37 | 38 | printf( " Compile date = %12s\n", compiletime ); 39 | 40 | printf( "\n Compile options:\n" ); 41 | 42 | printf( " CC = %s\n", cc ); 43 | 44 | printf( " CLINK = %s\n", clink ); 45 | 46 | printf( " C_LIB = %s\n", c_lib ); 47 | 48 | printf( " C_INC = %s\n", c_inc ); 49 | 50 | printf( " CFLAGS = %s\n", cflags ); 51 | 52 | printf( " CLINKFLAGS = %s\n", clinkflags ); 53 | 54 | printf( " RAND = %s\n", rand ); 55 | #ifdef SMP 56 | char *evalue = 
/*
 * Linear congruential pseudorandom number generators
 *
 *     x_{k+1} = a * x_k  (mod 2^46)
 *
 * r23/r46 are 2^-23 / 2^-46 and t23/t46 are 2^23 / 2^46.
 */
#if defined(USE_POW)
#include <math.h>  /* pow() must be declared before the macros below are used */
#define r23 pow(0.5, 23.0)
#define r46 (r23*r23)
#define t23 pow(2.0, 23.0)
#define t46 (t23*t23)
#else
#define r23 (0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5* \
             0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5* \
             0.5*0.5*0.5)
#define r46 (r23*r23)
#define t23 (2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0* \
             2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0* \
             2.0*2.0*2.0)
#define t46 (t23*t23)
#endif

/*c---------------------------------------------------------------------
c
c   randlc returns a uniform pseudorandom double precision number in the
c   range (0, 1) by using the linear congruential generator
c
c   x_{k+1} = a x_k  (mod 2^46)
c
c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44
c   numbers before repeating.  The argument a is the same as 'a' in the
c   above formula, and *x is x_0.  Both must be odd double precision
c   integers in the range (1, 2^46).
c
c   Parameters:
c     x  in/out: current seed; overwritten with the new seed x_1 so that
c        subsequent calls continue the same sequence.
c     a  multiplier.
c   Returns 2^(-46) * x_1.
c
c   The routine needs at least 48 mantissa bits in double precision.
c   Each split uses an (int) truncation, so operands outside the
c   documented range are not supported.
c
c   David H. Bailey     October 26, 1990
c
c---------------------------------------------------------------------*/
double randlc (double *x, double a) {

    double t1,t2,t3,t4,a1,a2,x1,x2,z;

/*c---------------------------------------------------------------------
c   Break A into two parts such that A = 2^23 * A1 + A2.
c---------------------------------------------------------------------*/
    t1 = r23 * a;
    a1 = (int)t1;
    a2 = a - t23 * a1;

/*c---------------------------------------------------------------------
c   Break X into two parts such that X = 2^23 * X1 + X2, compute
c   Z = A1 * X2 + A2 * X1  (mod 2^23), and then
c   X = 2^23 * Z + A2 * X2  (mod 2^46).
c---------------------------------------------------------------------*/
    t1 = r23 * (*x);
    x1 = (int)t1;
    x2 = (*x) - t23 * x1;
    t1 = a1 * x2 + a2 * x1;
    t2 = (int)(r23 * t1);
    z = t1 - t23 * t2;
    t3 = t23 * z + a2 * x2;
    t4 = (int)(r46 * t3);
    (*x) = t3 - t46 * t4;

    return (r46 * (*x));
}

/*c---------------------------------------------------------------------
c
c   vranlc generates n uniform pseudorandom double precision numbers in
c   the range (0, 1) using the same generator as randlc:
c
c   x_{k+1} = a x_k  (mod 2^46)
c
c   Parameters:
c     n       number of values to generate.
c     x_seed  in/out: starting seed; overwritten with the seed reached
c             after n steps so that later calls continue the sequence.
c     a       multiplier (same constraints as for randlc).
c     y       output array.  Results are stored in y[1] .. y[n]
c             (1-based, as in the Fortran original); y[0] is never
c             written, so y must provide room for n+1 elements.
c
c   If n <= 0 the loop body never runs and *x_seed is written back
c   unchanged.
c
c---------------------------------------------------------------------*/
void vranlc (int n, double *x_seed, double a, double y[]) {

    int i;
    double x,t1,t2,t3,t4,a1,a2,x1,x2,z;

/*c---------------------------------------------------------------------
c   Break A into two parts such that A = 2^23 * A1 + A2.
c---------------------------------------------------------------------*/
    t1 = r23 * a;
    a1 = (int)t1;
    a2 = a - t23 * a1;
    x = *x_seed;

/*c---------------------------------------------------------------------
c   Generate N results.   This loop is not vectorizable: every step
c   depends on the seed produced by the previous one.
c---------------------------------------------------------------------*/
    for (i = 1; i <= n; i++) {

/*c---------------------------------------------------------------------
c   Break X into two parts such that X = 2^23 * X1 + X2, compute
c   Z = A1 * X2 + A2 * X1  (mod 2^23), and then
c   X = 2^23 * Z + A2 * X2  (mod 2^46).
c---------------------------------------------------------------------*/
        t1 = r23 * x;
        x1 = (int)t1;
        x2 = x - t23 * x1;
        t1 = a1 * x2 + a2 * x1;
        t2 = (int)(r23 * t1);
        z = t1 - t23 * t2;
        t3 = t23 * z + a2 * x2;
        t4 = (int)(r46 * t3);
        x = t3 - t46 * t4;
        y[i] = r46 * x;
    }
    *x_seed = x;
}
/*****************************************************************/ 27 | /****** T I M E R _ C L E A R ******/ 28 | /*****************************************************************/ 29 | void timer_clear( int n ) 30 | { 31 | elapsed[n] = 0.0; 32 | } 33 | 34 | 35 | /*****************************************************************/ 36 | /****** T I M E R _ S T A R T ******/ 37 | /*****************************************************************/ 38 | void timer_start( int n ) 39 | { 40 | start[n] = elapsed_time(); 41 | } 42 | 43 | 44 | /*****************************************************************/ 45 | /****** T I M E R _ S T O P ******/ 46 | /*****************************************************************/ 47 | void timer_stop( int n ) 48 | { 49 | double t, now; 50 | 51 | now = elapsed_time(); 52 | t = now - start[n]; 53 | elapsed[n] += t; 54 | 55 | } 56 | 57 | 58 | /*****************************************************************/ 59 | /****** T I M E R _ R E A D ******/ 60 | /*****************************************************************/ 61 | double timer_read( int n ) 62 | { 63 | return( elapsed[n] ); 64 | } 65 | 66 | -------------------------------------------------------------------------------- /NPB-OMP/common/npb-CPP.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #if defined(_OPENMP) 5 | #include 6 | #endif /* _OPENMP */ 7 | 8 | typedef int boolean; 9 | typedef struct { double real; double imag; } dcomplex; 10 | 11 | #define TRUE 1 12 | #define FALSE 0 13 | 14 | #define max(a,b) (((a) > (b)) ? (a) : (b)) 15 | #define min(a,b) (((a) < (b)) ? 
/*
 * wtime: report elapsed wall-clock time in seconds.
 *
 * The first call records the current whole second as a base; every call
 * (including the first) stores in *t the time elapsed since that base,
 * i.e. (seconds since base) + microseconds * 1e-6.
 *
 * The base is kept as a time_t with a separate "recorded" flag, so the
 * seconds value returned by gettimeofday() is never narrowed to int.
 *
 * Parameters:
 *   t  out: receives the elapsed time in seconds.
 *
 * The base lives in function-local static storage and is set without any
 * synchronization.
 */
void wtime(double *t)
{
  static int have_base = 0;   /* 1 once base_sec has been recorded */
  static time_t base_sec;     /* whole second of the first call */
  struct timeval tv;

  gettimeofday(&tv, 0);       /* time-zone argument is unused */
  if (!have_base) {
    base_sec = tv.tv_sec;
    have_base = 1;
  }
  *t = (double)(tv.tv_sec - base_sec) + 1.0e-6*tv.tv_usec;
}
3 | */ 4 | 5 | 6 | #if defined(IBM) 7 | #define wtime wtime 8 | #elif defined(CRAY) 9 | #define wtime WTIME 10 | #else 11 | #define wtime wtime_ 12 | #endif 13 | -------------------------------------------------------------------------------- /NPB-OMP/common/wtime_sgi64.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | /* The following works on SGI Power Challenge systems */ 10 | 11 | typedef unsigned long iotimer_t; 12 | 13 | unsigned int cycleval; 14 | volatile iotimer_t *iotimer_addr, base_counter; 15 | double resolution; 16 | 17 | /* address_t is an integer type big enough to hold an address */ 18 | typedef unsigned long address_t; 19 | 20 | 21 | 22 | void timer_init() 23 | { 24 | 25 | int fd; 26 | char *virt_addr; 27 | address_t phys_addr, page_offset, pagemask, pagebase_addr; 28 | 29 | pagemask = getpagesize() - 1; 30 | errno = 0; 31 | phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval); 32 | if (errno != 0) { 33 | perror("SGI_QUERY_CYCLECNTR"); 34 | exit(1); 35 | } 36 | /* rel_addr = page offset of physical address */ 37 | page_offset = phys_addr & pagemask; 38 | pagebase_addr = phys_addr - page_offset; 39 | fd = open("/dev/mmem", O_RDONLY); 40 | 41 | virt_addr = mmap(0, pagemask, PROT_READ, MAP_PRIVATE, fd, pagebase_addr); 42 | virt_addr = virt_addr + page_offset; 43 | iotimer_addr = (iotimer_t *)virt_addr; 44 | /* cycleval in picoseconds to this gives resolution in seconds */ 45 | resolution = 1.0e-12*cycleval; 46 | base_counter = *iotimer_addr; 47 | } 48 | 49 | void wtime_(double *time) 50 | { 51 | static int initialized = 0; 52 | volatile iotimer_t counter_value; 53 | if (!initialized) { 54 | timer_init(); 55 | initialized = 1; 56 | } 57 | counter_value = *iotimer_addr - base_counter; 58 | *time = (double)counter_value * resolution; 59 | } 60 | 61 | 62 | void wtime(double *time) 63 | { 64 | static int initialized = 0; 65 | 
volatile iotimer_t counter_value; 66 | if (!initialized) { 67 | timer_init(); 68 | initialized = 1; 69 | } 70 | counter_value = *iotimer_addr - base_counter; 71 | *time = (double)counter_value * resolution; 72 | } 73 | 74 | 75 | -------------------------------------------------------------------------------- /NPB-OMP/config/make.def: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------------------------- 2 | # 3 | # SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. 4 | # 5 | #--------------------------------------------------------------------------- 6 | 7 | #--------------------------------------------------------------------------- 8 | # Items in this file will need to be changed for each platform. 9 | # (Note these definitions are inconsistent with NPB2.1.) 10 | #--------------------------------------------------------------------------- 11 | 12 | #--------------------------------------------------------------------------- 13 | # Parallel C: 14 | # 15 | # CC - C compiler 16 | # CFLAGS - C compilation arguments 17 | # C_INC - any -I arguments required for compiling C 18 | # CLINK - C linker 19 | # CLINKFLAGS - C linker flags 20 | # C_LIB - any -L and -l arguments required for linking C 21 | # 22 | # compilations are done with $(CC) $(C_INC) $(CFLAGS) or 23 | # $(CC) $(CFLAGS) 24 | # linking is done with $(CLINK) $(C_LIB) $(CLINKFLAGS) 25 | #--------------------------------------------------------------------------- 26 | 27 | #--------------------------------------------------------------------------- 28 | # This is the C compiler used for OpenMP programs 29 | #--------------------------------------------------------------------------- 30 | CC = g++ -std=c++14 31 | #gcc #cc 32 | # This links C programs; usually the same as ${CC} 33 | CLINK = $(CC) 34 | 35 | #--------------------------------------------------------------------------- 36 | # These macros are passed to the linker 37 | 
#--------------------------------------------------------------------------- 38 | C_LIB = -lm 39 | 40 | #--------------------------------------------------------------------------- 41 | # These macros are passed to the compiler 42 | #--------------------------------------------------------------------------- 43 | C_INC = -I../common 44 | 45 | #--------------------------------------------------------------------------- 46 | # Global *compile time* flags for C programs 47 | #--------------------------------------------------------------------------- 48 | CFLAGS = -O3 -fopenmp 49 | # CFLAGS = -g 50 | 51 | #--------------------------------------------------------------------------- 52 | # Global *link time* flags. Flags for increasing maximum executable 53 | # size usually go here. 54 | #--------------------------------------------------------------------------- 55 | CLINKFLAGS = -O3 -fopenmp 56 | 57 | 58 | #--------------------------------------------------------------------------- 59 | # Utilities C: 60 | # 61 | # This is the C compiler used to compile C utilities. Flags required by 62 | # this compiler go here also; typically there are few flags required; hence 63 | # there are no separate macros provided for such flags. 64 | #--------------------------------------------------------------------------- 65 | UCC = cc 66 | 67 | 68 | #--------------------------------------------------------------------------- 69 | # Destination of executables, relative to subdirs of the main directory. . 70 | #--------------------------------------------------------------------------- 71 | BINDIR = ../bin 72 | 73 | 74 | #--------------------------------------------------------------------------- 75 | # The variable RAND controls which random number generator 76 | # is used. It is described in detail in Doc/README.install. 77 | # Use "randi8" unless there is a reason to use another one. 
78 | # Other allowed values are "randi8_safe", "randdp" and "randdpvec" 79 | #--------------------------------------------------------------------------- 80 | # RAND = randi8 81 | # The following is highly reliable but may be slow: 82 | RAND = randdp 83 | 84 | 85 | #--------------------------------------------------------------------------- 86 | # The variable WTIME is the name of the wtime source code module in the 87 | # NPB2.x/common directory. 88 | # For most machines, use wtime.c 89 | # For SGI power challenge: use wtime_sgi64.c 90 | #--------------------------------------------------------------------------- 91 | WTIME = wtime.cpp 92 | 93 | 94 | #--------------------------------------------------------------------------- 95 | # Enable if either Cray or IBM: 96 | # (no such flag for most machines: see common/wtime.h) 97 | # This is used by the C compiler to pass the machine name to common/wtime.h, 98 | # where the C/Fortran binding interface format is determined 99 | #--------------------------------------------------------------------------- 100 | # MACHINE = -DCRAY 101 | # MACHINE = -DIBM 102 | 103 | 104 | -------------------------------------------------------------------------------- /NPB-OMP/config/suite.def: -------------------------------------------------------------------------------- 1 | # config/suite.def 2 | # This file is used to build several benchmarks with a single command. 3 | # Typing "make suite" in the main directory will build all the benchmarks 4 | # specified in this file. 5 | # Each line of this file contains a benchmark name, class, and number 6 | # of nodes. The name is one of "cg", "is", "ep", "mg", "ft" 7 | # The class is one of "S", "W", "A", "B", and "C". 8 | # No blank lines. 9 | # The following example builds serial sample sizes of all benchmarks. 
10 | ft B 11 | mg B 12 | is B 13 | ep B 14 | cg B 15 | -------------------------------------------------------------------------------- /NPB-OMP/sys/Makefile: -------------------------------------------------------------------------------- 1 | include ../config/make.def 2 | 3 | # Note that COMPILE is also defined in make.common and should 4 | # be the same. We can't include make.common because it has a lot 5 | # of other garbage. 6 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS) 7 | 8 | all: setparams 9 | 10 | # setparams creates an npbparam.h file for each benchmark 11 | # configuration. npbparams.h also contains info about how a benchmark 12 | # was compiled and linked 13 | 14 | setparams: setparams.cpp ../config/make.def 15 | $(UCC) -o setparams setparams.cpp 16 | 17 | 18 | clean: 19 | -rm -f setparams setparams.hpp npbparams.hpp 20 | -rm -f *~ *.o 21 | 22 | -------------------------------------------------------------------------------- /NPB-OMP/sys/README: -------------------------------------------------------------------------------- 1 | This directory contains utilities and files used by the 2 | build process. You should not need to change anything 3 | in this directory. 4 | 5 | Original Files 6 | -------------- 7 | setparams.c: 8 | Source for the setparams program. This program is used internally 9 | in the build process to create the file "npbparams.h" for each 10 | benchmark. npbparams.h contains Fortran or C parameters to build a 11 | benchmark for a specific class. The setparams program is never run 12 | directly by a user. Its invocation syntax is 13 | 14 | "setparams benchmark-name class". 15 | 16 | It examines the file "npbparams.h" in the current directory. If 17 | the specified parameters are the same as those in the npbparams.h 18 | file, nothing is changed. If the file does not exist or corresponds 19 | to a different class/number of nodes, it is (re)built. 
20 | One of the more complicated things in npbparams.h is that it 21 | contains, in a Fortran string, the compiler flags used to build a 22 | benchmark, so that a benchmark can print out how it was compiled. 23 | 24 | make.common 25 | A makefile segment that is included in each individual benchmark 26 | program makefile. It sets up some standard macros (COMPILE, etc) 27 | and makes sure everything is configured correctly (npbparams.h) 28 | 29 | Makefile 30 | Builds setparams 31 | 32 | README 33 | This file. 34 | 35 | 36 | Created files 37 | ------------- 38 | 39 | setparams 40 | See descriptions above 41 | 42 | -------------------------------------------------------------------------------- /NPB-OMP/sys/make.common: -------------------------------------------------------------------------------- 1 | PROGRAM = $(BINDIR)/$(BENCHMARK).$(CLASS) 2 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS) 3 | CCOMPILE = $(CC) -c $(C_INC) $(CFLAGS) 4 | 5 | # Class "U" is used internally by the setparams program to mean 6 | # "unknown". This means that if you don't specify CLASS= 7 | # on the command line, you'll get an error. It would be nice 8 | # to be able to avoid this, but we'd have to get information 9 | # from the setparams back to the make program, which isn't easy. 10 | CLASS=U 11 | 12 | default:: ${PROGRAM} 13 | 14 | # This makes sure the configuration utility setparams 15 | # is up to date. 16 | # Note that this must be run every time, which is why the 17 | # target does not exist and is not created. 18 | # If you create a file called "config" you will break things. 
19 | config: 20 | @cd ../sys; ${MAKE} all 21 | ../sys/setparams ${BENCHMARK} ${CLASS} 22 | 23 | COMMON=../common 24 | ${COMMON}/${RAND}.o: ${COMMON}/${RAND}.f 25 | cd ${COMMON}; ${FCOMPILE} ${RAND}.f 26 | 27 | ${COMMON}/c_${RAND}.o: ${COMMON}/c_${RAND}.cpp 28 | cd ${COMMON}; ${CCOMPILE} c_${RAND}.cpp 29 | 30 | ${COMMON}/print_results.o: ${COMMON}/print_results.f 31 | cd ${COMMON}; ${FCOMPILE} print_results.f 32 | 33 | ${COMMON}/c_print_results.o: ${COMMON}/c_print_results.cpp 34 | cd ${COMMON}; ${CCOMPILE} c_print_results.cpp 35 | 36 | ${COMMON}/timers.o: ${COMMON}/timers.f 37 | cd ${COMMON}; ${FCOMPILE} timers.f 38 | 39 | ${COMMON}/c_timers.o: ${COMMON}/c_timers.cpp 40 | cd ${COMMON}; ${CCOMPILE} c_timers.cpp 41 | 42 | ${COMMON}/wtime.o: ${COMMON}/${WTIME} 43 | cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/${WTIME} 44 | # For most machines or CRAY or IBM 45 | # cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/wtime.c 46 | # For a precise timer on an SGI Power Challenge, try: 47 | # cd ${COMMON}; ${CCOMPILE} -o wtime.o ${COMMON}/wtime_sgi64.c 48 | 49 | ${COMMON}/c_wtime.o: ${COMMON}/${WTIME} 50 | cd ${COMMON}; ${CCOMPILE} -o c_wtime.o ${COMMON}/${WTIME} 51 | 52 | 53 | # Normally setparams updates npbparams.h only if the settings (CLASS) 54 | # have changed. However, we also want to update if the compile options 55 | # may have changed (set in ../config/make.def). 56 | npbparams.hpp: ../config/make.def 57 | @ echo make.def modified. 
Rebuilding npbparams.hpp just in case 58 | rm -f npbparams.hpp 59 | ../sys/setparams ${BENCHMARK} ${CLASS} 60 | 61 | # So that "make benchmark-name" works 62 | ${BENCHMARK}: default 63 | ${BENCHMARKU}: default 64 | 65 | 66 | -------------------------------------------------------------------------------- /NPB-OMP/sys/print_header: -------------------------------------------------------------------------------- 1 | echo '' 2 | echo ' =========================================' 3 | echo ' = NAS Parallel Benchmarks =' 4 | echo ' = OpenMP C++ Versions =' 5 | echo ' = Developed by: Dalvan Griebler =' 6 | echo ' = Júnior Löff =' 7 | echo ' = =' 8 | echo ' = Warning: in case of problems =' 9 | echo ' = send an email to us: =' 10 | echo ' = dalvan.griebler@acad.pucrs.br =' 11 | echo ' = junior.loff@acad.pucrs.br =' 12 | echo ' =========================================' 13 | echo '' 14 | -------------------------------------------------------------------------------- /NPB-OMP/sys/print_instructions: -------------------------------------------------------------------------------- 1 | echo '' 2 | echo ' To make a NAS benchmark type ' 3 | echo '' 4 | echo ' make <benchmark-name> CLASS=<class>' 5 | echo '' 6 | echo ' where <benchmark-name> is "cg", "ep", "ft", "is", or "mg"' 7 | echo ' <class> is "S", "W", "A", "B" or "C"' 8 | echo '' 9 | echo ' To make a set of benchmarks, create the file config/suite.def' 10 | echo ' according to the instructions in config/suite.def.template and type' 11 | echo '' 12 | echo ' make suite' 13 | echo '' 14 | echo ' ***************************************************************' 15 | echo ' * Remember to edit the file config/make.def for site specific *' 16 | echo ' * information as described in the README file *' 17 | echo ' ***************************************************************' 18 | 19 | -------------------------------------------------------------------------------- /NPB-SER/CG/Makefile: -------------------------------------------------------------------------------- 1 | 
SHELL=/bin/sh 2 | BENCHMARK=cg 3 | BENCHMARKU=CG 4 | 5 | include ../config/make.def 6 | 7 | OBJS = cg.o ${COMMON}/c_print_results.o \ 8 | ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | cg.o: cg.cpp npbparams.hpp 16 | ${CCOMPILE} cg.cpp 17 | 18 | clean: 19 | - rm -f *.o *~ 20 | - rm -f npbparams.hpp core 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /NPB-SER/EP/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=ep 3 | BENCHMARKU=EP 4 | 5 | include ../config/make.def 6 | 7 | OBJS = ep.o ${COMMON}/c_print_results.o ${COMMON}/c_${RAND}.o \ 8 | ${COMMON}/c_timers.o ${COMMON}/c_wtime.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | 16 | ep.o: ep.cpp npbparams.hpp 17 | ${CCOMPILE} ep.cpp 18 | 19 | clean: 20 | - rm -f *.o *~ 21 | - rm -f npbparams.hpp core 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /NPB-SER/EP/ep.cpp: -------------------------------------------------------------------------------- 1 | /*-------------------------------------------------------------------- 2 | 3 | Information on NAS Parallel Benchmarks is available at: 4 | 5 | http://www.nas.nasa.gov/Software/NPB/ 6 | 7 | Authors: P. O. Frederickson 8 | D. H. Bailey 9 | A. C. 
Woo 10 | 11 | CPP version: 12 | Dalvan Griebler 13 | Júnior Löff 14 | 15 | --------------------------------------------------------------------*/ 16 | 17 | #include "npbparams.hpp" 18 | #include 19 | #include <../common/npb-CPP.hpp> 20 | 21 | /* parameters */ 22 | #define MK 16 23 | #define MM (M - MK) 24 | #define NN (1 << MM) 25 | #define NK (1 << MK) 26 | #define NQ 10 27 | #define EPSILON 1.0e-8 28 | #define A 1220703125.0 29 | #define S 271828183.0 30 | #define TIMERS_ENABLED FALSE 31 | 32 | /* global variables */ 33 | /* common /storage/ */ 34 | static double x[(2*NK)+1]; 35 | static double q[NQ]; 36 | 37 | /*-------------------------------------------------------------------- 38 | program EMBAR 39 | c-------------------------------------------------------------------*/ 40 | /* 41 | c This is the serial version of the APP Benchmark 1, 42 | c the "embarrassingly parallel" benchmark. 43 | c 44 | c M is the Log_2 of the number of complex pairs of uniform (0, 1) random 45 | c numbers. MK is the Log_2 of the size of each batch of uniform random 46 | c numbers. MK can be set for convenience on a given system, since it does 47 | c not affect the results. 48 | */ 49 | int main(int argc, char **argv) { 50 | double Mops, t1, sx, sy, tm, an, gc; 51 | double dum[3] = { 1.0, 1.0, 1.0 }; 52 | int np,i, k, nit, k_offset, j; 53 | boolean verified; 54 | char size[13+1]; /* character*13 */ 55 | 56 | /* 57 | c Because the size of the problem is too large to store in a 32-bit 58 | c integer for some classes, we put it into a string (for printing). 
59 | c Have to strip off the decimal point put in there by the floating 60 | c point print statement (internal file) 61 | */ 62 | 63 | printf("NAS Parallel Benchmarks 4.0 OpenMP C++ version"" - EP Benchmark\n"); 64 | printf("Developed by: Dalvan Griebler & Júnior Löff \n\n"); 65 | sprintf(size, "%12.0f", pow(2.0, M+1)); 66 | for (j = 13; j >= 1; j--) { 67 | if (size[j] == '.') size[j] = ' '; 68 | } 69 | printf(" Number of random numbers generated: %13s\n", size); 70 | 71 | verified = FALSE; 72 | 73 | /* 74 | c Compute the number of "batches" of random number pairs generated 75 | c per processor. Adjust if the number of processors does not evenly 76 | c divide the total number 77 | */ 78 | np = NN; 79 | 80 | /* 81 | c Call the random number generator functions and initialize 82 | c the x-array to reduce the effects of paging on the timings. 83 | c Also, call all mathematical functions that are used. Make 84 | c sure these initializations cannot be eliminated as dead code. 85 | */ 86 | vranlc(0, &(dum[0]), dum[1], &(dum[2])); 87 | dum[0] = randlc(&(dum[1]), dum[2]); 88 | for (i = 0; i < 2*NK; i++) x[i] = -1.0e99; 89 | Mops = log(sqrt(fabs(max(1.0, 1.0)))); 90 | 91 | 92 | 93 | timer_clear(1); 94 | timer_clear(2); 95 | timer_clear(3); 96 | 97 | timer_start(1); 98 | 99 | vranlc(0, &t1, A, x); 100 | 101 | /* Compute AN = A ^ (2 * NK) (mod 2^46). */ 102 | 103 | t1 = A; 104 | 105 | for ( i = 1; i <= MK+1; i++) { 106 | an = randlc(&t1, t1); 107 | } 108 | 109 | an = t1; 110 | gc = 0.0; 111 | sx = 0.0; 112 | sy = 0.0; 113 | 114 | for ( i = 0; i <= NQ - 1; i++) { 115 | q[i] = 0.0; 116 | } 117 | 118 | /* 119 | c Each instance of this loop may be performed independently. 
We compute 120 | c the k offsets separately to take into account the fact that some nodes 121 | c have more numbers to generate than others 122 | */ 123 | k_offset = -1; 124 | 125 | double t2, t3, t4, x1, x2; 126 | int kk, ik, l; 127 | double qq[NQ]; /* private copy of q[0:NQ-1] */ 128 | 129 | for (i = 0; i < NQ; i++) qq[i] = 0.0; 130 | 131 | for (k = 1; k <= np; k++) { 132 | kk = k_offset + k; 133 | t1 = S; 134 | t2 = an; 135 | 136 | /* Find starting seed t1 for this kk. */ 137 | 138 | for (i = 1; i <= 100; i++) { 139 | ik = kk / 2; 140 | if (2 * ik != kk) t3 = randlc(&t1, t2); 141 | if (ik == 0) break; 142 | t3 = randlc(&t2, t2); 143 | kk = ik; 144 | } 145 | 146 | /* Compute uniform pseudorandom numbers. */ 147 | 148 | if (TIMERS_ENABLED == TRUE) timer_start(3); 149 | vranlc(2*NK, &t1, A, x); 150 | if (TIMERS_ENABLED == TRUE) timer_stop(3); 151 | 152 | /* 153 | c Compute Gaussian deviates by acceptance-rejection method and 154 | c tally counts in concentric square annuli. This loop is not 155 | c vectorizable. 
156 | */ 157 | if (TIMERS_ENABLED == TRUE) timer_start(2); 158 | 159 | for ( i = 1; i <= NK; i++) { 160 | x1 = 2.0 * x[2*i-1] - 1.0; 161 | x2 = 2.0 * x[2*i] - 1.0; 162 | t1 = pow2(x1) + pow2(x2); 163 | if (t1 <= 1.0) { 164 | t2 = sqrt(-2.0 * log(t1) / t1); 165 | t3 = (x1 * t2); /* Xi */ 166 | t4 = (x2 * t2); /* Yi */ 167 | l = max(fabs(t3), fabs(t4)); 168 | qq[l] += 1.0; /* counts */ 169 | sx = sx + t3; /* sum of Xi */ 170 | sy = sy + t4; /* sum of Yi */ 171 | } 172 | } 173 | if (TIMERS_ENABLED == TRUE) timer_stop(2); 174 | } 175 | for (i = 0; i <= NQ-1; i++) q[i] += qq[i]; 176 | 177 | for (i = 0; i <= NQ-1; i++) { 178 | gc = gc + q[i]; 179 | } 180 | 181 | timer_stop(1); 182 | 183 | tm = timer_read(1); 184 | 185 | 186 | nit = 0; 187 | if (M == 24) { 188 | if((fabs((sx- (-3.247834652034740e3))/-3.247834652034740e3) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/-6.958407078382297e3) <= EPSILON)) { 189 | verified = TRUE; 190 | } 191 | } else if (M == 25) { 192 | if ((fabs((sx- (-2.863319731645753e3))/-2.863319731645753e3) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/-6.320053679109499e3) <= EPSILON)) { 193 | verified = TRUE; 194 | } 195 | } else if (M == 28) { 196 | //if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) { 197 | if ((fabs((sx- (-4.295875165629892e3))/-4.295875165629892e3) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/-1.580732573678431e4) <= EPSILON)) { 198 | verified = TRUE; 199 | } 200 | } else if (M == 30) { 201 | if ((fabs((sx- (4.033815542441498e4))/4.033815542441498e4) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/-2.660669192809235e4) <= EPSILON)) { 202 | verified = TRUE; 203 | } 204 | } else if (M == 32) { 205 | if ((fabs((sx- (4.764367927995374e4))/4.764367927995374e4) <= EPSILON) && (fabs((sy- (-8.084072988043731e4))/-8.084072988043731e4) <= EPSILON)) { 206 | verified = TRUE; 207 | } 208 | } else if (M == 36) { 209 | if ((fabs((sx- 
(1.982481200946593e5))/1.982481200946593e5) <= EPSILON) && (fabs((sy- (-1.020596636361769e5))/-1.020596636361769e5) <= EPSILON)) { 210 | verified = TRUE; 211 | } 212 | } else if (M == 40) { 213 | if ((fabs((sx- (-5.319717441530e5))/-5.319717441530e5) <= EPSILON) && (fabs((sy- (-3.688834557731e5))/-3.688834557731e5) <= EPSILON)) { 214 | verified = TRUE; 215 | } 216 | } 217 | 218 | Mops = pow(2.0, M+1)/tm/1000000.0; 219 | 220 | printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. Gaussian Pairs = %15.0f\n" 221 | "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy); 222 | for (i = 0; i <= NQ-1; i++) { 223 | printf("%3d %15.0f\n", i, q[i]); 224 | } 225 | 226 | c_print_results((char*)"EP", CLASS, M+1, 0, 0, nit, tm, Mops, (char*)"Random numbers generated", 227 | verified, (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7); 228 | 229 | if (TIMERS_ENABLED == TRUE) { 230 | printf("Total time: %f", timer_read(1)); 231 | printf("Gaussian pairs: %f", timer_read(2)); 232 | printf("Random numbers: %f", timer_read(3)); 233 | } 234 | return 0; 235 | } 236 | -------------------------------------------------------------------------------- /NPB-SER/FT/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=ft 3 | BENCHMARKU=FT 4 | 5 | include ../config/make.def 6 | 7 | OBJS = ft.o ${COMMON}/c_${RAND}.o ${COMMON}/c_print_results.o \ 8 | ${COMMON}/c_timers.o ${COMMON}/c_wtime.o #../omp-prof.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | ft.o: ft.cpp global.hpp npbparams.hpp 16 | ${CCOMPILE} ft.cpp 17 | 18 | clean: 19 | - rm -f *.o *~ mputil* 20 | - rm -f ft npbparams.hpp core 21 | -------------------------------------------------------------------------------- /NPB-SER/FT/global.hpp: 
-------------------------------------------------------------------------------- 1 | #include "npbparams.hpp" 2 | 3 | 4 | /* 5 | c If processor array is 1x1 -> 0D grid decomposition 6 | 7 | 8 | c Cache blocking params. These values are good for most 9 | c RISC processors. 10 | c FFT parameters: 11 | c fftblock controls how many ffts are done at a time. 12 | c The default is appropriate for most cache-based machines 13 | c On vector machines, the FFT can be vectorized with vector 14 | c length equal to the block size, so the block size should 15 | c be as large as possible. This is the size of the smallest 16 | c dimension of the problem: 128 for class A, 256 for class B and 17 | c 512 for class C. 18 | */ 19 | 20 | #define FFTBLOCK_DEFAULT 16 21 | #define FFTBLOCKPAD_DEFAULT 18 22 | 23 | #define FFTBLOCK FFTBLOCK_DEFAULT 24 | #define FFTBLOCKPAD FFTBLOCKPAD_DEFAULT 25 | 26 | /* COMMON block: blockinfo */ 27 | int fftblock; 28 | int fftblockpad; 29 | 30 | /* 31 | c we need a bunch of logic to keep track of how 32 | c arrays are laid out. 33 | 34 | 35 | c Note: this serial version is derived from the parallel 0D case 36 | c of the ft NPB. 
37 | c The computation proceeds logically as 38 | 39 | c set up initial conditions 40 | c fftx(1) 41 | c transpose (1->2) 42 | c ffty(2) 43 | c transpose (2->3) 44 | c fftz(3) 45 | c time evolution 46 | c fftz(3) 47 | c transpose (3->2) 48 | c ffty(2) 49 | c transpose (2->1) 50 | c fftx(1) 51 | c compute residual(1) 52 | 53 | c for the 0D, 1D, 2D strategies, the layouts look like xxx 54 | c 55 | c 0D 1D 2D 56 | c 1: xyz xyz xyz 57 | c 2: xyz xyz yxz 58 | c 3: xyz zyx zxy 59 | 60 | c the array dimensions are stored in dims(coord, phase) 61 | */ 62 | 63 | /* COMMON block: layout */ 64 | static int dims[3][3]; 65 | static int xstart[3]; 66 | static int ystart[3]; 67 | static int zstart[3]; 68 | static int xend[3]; 69 | static int yend[3]; 70 | static int zend[3]; 71 | 72 | #define T_TOTAL 0 73 | #define T_SETUP 1 74 | #define T_FFT 2 75 | #define T_EVOLVE 3 76 | #define T_CHECKSUM 4 77 | #define T_FFTLOW 5 78 | #define T_FFTCOPY 6 79 | #define T_MAX 7 80 | 81 | #define TIMERS_ENABLED FALSE 82 | 83 | /* other stuff */ 84 | 85 | #define SEED 314159265.0 86 | #define A 1220703125.0 87 | #define PI 3.141592653589793238 88 | #define ALPHA 1.0e-6 89 | 90 | #define EXPMAX (NITER_DEFAULT*(NX*NX/4+NY*NY/4+NZ*NZ/4)) 91 | 92 | /* COMMON block: excomm */ 93 | static double ex[EXPMAX+1]; /* ex(0:expmax) */ 94 | 95 | /* 96 | c roots of unity array 97 | c relies on x being largest dimension? 
98 | */ 99 | 100 | /* COMMON block: ucomm */ 101 | static dcomplex u[NX]; 102 | 103 | /* for checksum data */ 104 | 105 | /* COMMON block: sumcomm */ 106 | static dcomplex sums[NITER_DEFAULT+1]; /* sums(0:niter_default) */ 107 | 108 | /* number of iterations*/ 109 | 110 | /* COMMON block: iter */ 111 | static int niter; 112 | 113 | -------------------------------------------------------------------------------- /NPB-SER/IS/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=is 3 | BENCHMARKU=IS 4 | 5 | include ../config/make.def 6 | 7 | OBJS = is.o ${COMMON}/c_print_results.o ${COMMON}/c_${RAND}.o \ 8 | ${COMMON}/c_timers.o ${COMMON}/c_wtime.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | 16 | is.o: is.cpp npbparams.hpp 17 | ${CCOMPILE} is.cpp 18 | 19 | clean: 20 | - rm -f *.o *~ 21 | - rm -f npbparams.hpp core -------------------------------------------------------------------------------- /NPB-SER/MG/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=mg 3 | BENCHMARKU=MG 4 | 5 | include ../config/make.def 6 | 7 | OBJS = mg.o ${COMMON}/c_print_results.o \ 8 | ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | mg.o: mg.cpp npbparams.hpp 16 | ${CCOMPILE} mg.cpp 17 | 18 | clean: 19 | - rm -f *.o *~ 20 | - rm -f npbparams.hpp core 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /NPB-SER/MG/globals.hpp: -------------------------------------------------------------------------------- 1 | /*-------------------------------------------------------------------- 2 | c Parameter lm (declared and set in "npbparams.h") is the log-base2 of 3 
| c the edge size max for the partition on a given node, so must be changed 4 | c either to save space (if running a small case) or made bigger for larger 5 | c cases, for example, 512^3. Thus lm=7 means that the largest dimension 6 | c of a partition that can be solved on a node is 2^7 = 128. lm is set 7 | c automatically in npbparams.h 8 | c Parameters ndim1, ndim2, ndim3 are the local problem dimensions. 9 | c-------------------------------------------------------------------*/ 10 | 11 | #include "npbparams.hpp" 12 | 13 | /* parameters */ 14 | /* actual dimension including ghost cells for communications */ 15 | #define NM (2+(2<<(LM-1))) 16 | /* size of rhs array */ 17 | #define NV (2+(2<<(NDIM1-1))*(2+(2<<(NDIM2-1)))*(2+(2<<(NDIM3-1)))) 18 | /* size of residual array */ 19 | #define NR ((8*(NV+(NM*NM)+5*NM+7*LM))/7) 20 | /* size of communication buffer */ 21 | #define NM2 (2*NM*NM) 22 | /* maximum number of levels */ 23 | #define MAXLEVEL 11 24 | 25 | /*---------------------------------------------------------------------*/ 26 | /* common /mg3/ */ 27 | static int nx[MAXLEVEL+1], ny[MAXLEVEL+1], nz[MAXLEVEL+1]; 28 | /* common /ClassType/ */ 29 | static char class_npb; 30 | /* common /my_debug/ */ 31 | static int debug_vec[8]; 32 | /* common /fap/ */ 33 | /*static int ir[MAXLEVEL], m1[MAXLEVEL], m2[MAXLEVEL], m3[MAXLEVEL];*/ 34 | static int m1[MAXLEVEL+1], m2[MAXLEVEL+1], m3[MAXLEVEL+1]; 35 | static int lt, lb; 36 | 37 | /*c--------------------------------------------------------------------- 38 | c Set at m=1024, can handle cases up to 1024^3 case 39 | c---------------------------------------------------------------------*/ 40 | #define M 1037 41 | 42 | /* common /buffer/ */ 43 | /*static double buff[4][NM2];*/ 44 | -------------------------------------------------------------------------------- /NPB-SER/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | CLASS=S 3 | SFILE=config/suite.def 4 | 5 | 
default: header 6 | @ $(SHELL) sys/print_instructions 7 | 8 | 9 | MG: mg 10 | mg: header 11 | cd MG; $(MAKE) CLASS=$(CLASS) 12 | 13 | FT: ft 14 | ft: header 15 | cd FT; $(MAKE) CLASS=$(CLASS) 16 | 17 | IS: is 18 | is: header 19 | cd IS; $(MAKE) CLASS=$(CLASS) 20 | 21 | CG: cg 22 | cg: header 23 | cd CG; $(MAKE) CLASS=$(CLASS) 24 | 25 | EP: ep 26 | ep: header 27 | cd EP; $(MAKE) CLASS=$(CLASS) 28 | 29 | 30 | # Awk script courtesy cmg@cray.com 31 | suite: 32 | @ awk '{ if ($$1 !~ /^#/ && NF > 0) \ 33 | printf "make %s CLASS=%s\n", $$1, $$2 }' $(SFILE) \ 34 | | $(SHELL) 35 | 36 | 37 | # It would be nice to make clean in each subdirectory (the targets 38 | # are defined) but on a really clean system this won't work 39 | # because those makefiles need config/make.def 40 | clean: 41 | - rm -f core 42 | - rm -f *~ */core */*~ */*.o */npbparams.hpp */*.obj */*.exe 43 | - rm -f sys/setparams sys/makesuite sys/setparams.hpp 44 | 45 | cleanall: clean 46 | - rm -r bin/* 47 | 48 | veryclean: clean 49 | - rm config/make.def config/suite.def Part* 50 | - rm bin/mg.* bin/ft.* bin/is.* bin/ep.* bin/cg.* 51 | 52 | header: 53 | @ $(SHELL) sys/print_header 54 | 55 | kit: 56 | - makekit -s100k -k30 * */* */*/* 57 | 58 | 59 | -------------------------------------------------------------------------------- /NPB-SER/README.md: -------------------------------------------------------------------------------- 1 | # Warning: this project is continued at [NPB-CPP](https://github.com/GMAP/NPB-CPP) 2 | 3 | ## We are happy to announce that both NPB Kernels and pseudo-applications are available at our new repository [NPB-CPP](https://github.com/GMAP/NPB-CPP). 4 | 5 | This was our first work on the NAS Parallel Benchmarks (NPB) suite and many other works are now continuing this project in many different ways. 
6 | 7 | *Note: this repository will no longer be updated, therefore, follow us at [NPB-CPP](https://github.com/GMAP/NPB-CPP)* 8 | 9 | 10 | ## How to cite this work 11 | 12 | [[DOI]](https://doi.org/10.1109/PDP2018.2018.00120) D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018. 13 | 14 | ## The NPB-CPP Benchmark 15 | 16 | These codes were converted to **C++** from the original [NPB3.3.1](https://doi.org/10.1109/PDP2018.2018.00120). We achieved similar performance in **C++** compared to the **Fortran** version. 17 | 18 | ================================================================== 19 | NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB 20 | 21 | Code contributors: 22 | Dalvan Griebler 23 | Júnior Löff 24 | 25 | Warning: in case of problems send an email to us: 26 | dalvan.griebler@acad.pucrs.br 27 | junior.loff@acad.pucrs.br 28 | ================================================================== 29 | 30 | 31 | This folder contains: 32 | 33 | - NPB-FF - Directory with the parallel version implemented in FastFlow 34 | - NPB-OMP - Directory with the parallel version translated from the original NPB version 35 | - NPB-SER - Directory with the serial version of the NPB ported to C++ 36 | - NPB-TBB - Directory with the parallel version implemented in Thread Building Blocks 37 | 38 | Each directory is independent and contains its own implemented version of the kernels: 39 | 40 | IS - Integer Sort, random memory access 41 | EP - Embarrassingly Parallel 42 | CG - Conjugate Gradient, irregular memory access and communication 43 | MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive 44 | FT - discrete 3D fast Fourier Transform, all-to-all communication 45 | 46 | ## Software 
Requirements 47 | 48 | *Warning: our tests were made with GCC-5* 49 | 50 | **TBB** 51 | 52 | *Installation* 53 | 54 | apt-get install libtbb-dev 55 | 56 | **FastFlow** 57 | 58 | *Installation* 59 | 60 | svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow 61 | 62 | 63 | ## How to Compile 64 | 65 | Enter the directory from the version desired and execute: 66 | 67 | make _BENCHMARK CLASS=_VERSION 68 | 69 | 70 | _BENCHMARKs are: 71 | 72 | EP, CG, MG, IS and FT 73 | 74 | _VERSIONs are: 75 | 76 | Class S: small for quick test purposes 77 | Class W: workstation size (a 90's workstation; now likely too small) 78 | Classes A, B, C: standard test problems; ~4X size increase going from one class to the next 79 | Classes D, E, F: large test problems; ~16X size increase from each of the previous Classes 80 | 81 | 82 | Command: 83 | 84 | make ep CLASS=B 85 | -------------------------------------------------------------------------------- /NPB-SER/bin/README.md: -------------------------------------------------------------------------------- 1 | # How to Cite our Work 2 | 3 | D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018. 4 | 5 | # The NPB-CPP Benchmark 6 | 7 | These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version. 
8 | 9 | ================================================================== 10 | NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB 11 | 12 | Code contributors: 13 | Dalvan Griebler 14 | Júnior Löff 15 | 16 | Warning: in case of problems send an email to us: 17 | dalvan.griebler@acad.pucrs.br 18 | junior.loff@acad.pucrs.br 19 | ================================================================== 20 | 21 | 22 | This folder contains: 23 | 24 | - NPB-FF - Directory with the parallel version implemented in FastFlow 25 | - NPB-OMP - Directory with the parallel version translated from the original NPB version 26 | - NPB-SER - Directory with the serial version of the NPB ported to C++ 27 | - NPB-TBB - Directory with the parallel version implemented in Thread Building Blocks 28 | 29 | Each directory is independent and contains its own implemented version of the kernels: 30 | 31 | IS - Integer Sort, random memory access 32 | EP - Embarrassingly Parallel 33 | CG - Conjugate Gradient, irregular memory access and communication 34 | MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive 35 | FT - discrete 3D fast Fourier Transform, all-to-all communication 36 | 37 | # Software Requirements 38 | 39 | *Warning: our tests were made with GCC-5* 40 | 41 | **TBB** 42 | 43 | *Installation* 44 | 45 | apt-get install libtbb-dev 46 | 47 | **FastFlow** 48 | 49 | *Installation* 50 | 51 | svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow 52 | 53 | 54 | # How to Compile 55 | 56 | Enter the directory from the version desired and execute: 57 | 58 | make _BENCHMARK CLASS=_VERSION 59 | 60 | 61 | _BENCHMARKs are: 62 | 63 | EP, CG, MG, IS and FT 64 | 65 | _VERSIONs are: 66 | 67 | Class S: small for quick test purposes 68 | Class W: workstation size (a 90's workstation; now likely too small) 69 | Classes A, B, C: standard test problems; ~4X size increase going from one class to the next 70 | Classes D, E, F: large test problems; ~16X 
size increase from each of the previous Classes 71 | 72 | 73 | Command: 74 | 75 | make ep CLASS=B -------------------------------------------------------------------------------- /NPB-SER/common/c_print_results.cpp: -------------------------------------------------------------------------------- 1 | /*****************************************************************/ 2 | /****** C _ P R I N T _ R E S U L T S ******/ 3 | /*****************************************************************/ 4 | #include <stdlib.h> 5 | #include <stdio.h> 6 | 7 | void c_print_results( char *name, char class_npb, int n1, int n2, int n3, int niter, double t, 8 | double mops, char *optype, int passed_verification, char *npbversion, char *compiletime, char *cc, 9 | char *clink, char *c_lib, char *c_inc, char *cflags, char *clinkflags, char *rand) 10 | { 11 | 12 | printf( "\n\n %s Benchmark Completed\n", name ); 13 | 14 | printf( " class_npb = %c\n", class_npb ); 15 | 16 | if( n2 == 0 && n3 == 0 ) 17 | printf( " Size = %12d\n", n1 ); /* as in IS */ 18 | else 19 | printf( " Size = %3dx%3dx%3d\n", n1,n2,n3 ); 20 | 21 | printf( " Iterations = %12d\n", niter ); 22 | 23 | printf( " Time in seconds = %12.2f\n", t ); 24 | 25 | printf( " Mop/s total = %12.2f\n", mops ); 26 | 27 | printf( " Operation type = %24s\n", optype); 28 | 29 | if( passed_verification ) 30 | printf( " Verification = SUCCESSFUL\n" ); 31 | else 32 | printf( " Verification = UNSUCCESSFUL\n" ); 33 | 34 | printf( " Version = %12s\n", npbversion ); 35 | 36 | printf( " Compile date = %12s\n", compiletime ); 37 | 38 | printf( "\n Compile options:\n" ); 39 | 40 | printf( " CC = %s\n", cc ); 41 | 42 | printf( " CLINK = %s\n", clink ); 43 | 44 | printf( " C_LIB = %s\n", c_lib ); 45 | 46 | printf( " C_INC = %s\n", c_inc ); 47 | 48 | printf( " CFLAGS = %s\n", cflags ); 49 | 50 | printf( " CLINKFLAGS = %s\n", clinkflags ); 51 | 52 | printf( " RAND = %s\n", rand ); 53 | #ifdef SMP 54 | char *evalue = getenv("MP_SET_NUMTHREADS"); 55 | printf( " MULTICPUS = %s\n", 
evalue ); 56 | #endif 57 | 58 | /* printf( "\n\n" ); 59 | printf( " Please send the results of this run to:\n\n" ); 60 | printf( " NPB Development Team\n" ); 61 | printf( " Internet: npb@nas.nasa.gov\n \n" ); 62 | printf( " If email is not available, send this to:\n\n" ); 63 | printf( " MS T27A-1\n" ); 64 | printf( " NASA Ames Research Center\n" ); 65 | printf( " Moffett Field, CA 94035-1000\n\n" ); 66 | printf( " Fax: 415-604-3957\n\n" );*/ 67 | } 68 | 69 | -------------------------------------------------------------------------------- /NPB-SER/common/c_randdp.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | */ 3 | #if defined(USE_POW) 4 | #define r23 pow(0.5, 23.0) 5 | #define r46 (r23*r23) 6 | #define t23 pow(2.0, 23.0) 7 | #define t46 (t23*t23) 8 | #else 9 | #define r23 (0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5) 10 | #define r46 (r23*r23) 11 | #define t23 (2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0) 12 | #define t46 (t23*t23) 13 | #endif 14 | 15 | /*c--------------------------------------------------------------------- 16 | c---------------------------------------------------------------------*/ 17 | 18 | double randlc (double *x, double a) { 19 | 20 | /*c--------------------------------------------------------------------- 21 | c---------------------------------------------------------------------*/ 22 | 23 | /*c--------------------------------------------------------------------- 24 | c 25 | c This routine returns a uniform pseudorandom double precision number in the 26 | c range (0, 1) by using the linear congruential generator 27 | c 28 | c x_{k+1} = a x_k (mod 2^46) 29 | c 30 | c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers 31 | c before repeating. The argument A is the same as 'a' in the above formula, 32 | c and X is the same as x_0. 
A and X must be odd double precision integers 33 | c in the range (1, 2^46). The returned value RANDLC is normalized to be 34 | c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain 35 | c the new seed x_1, so that subsequent calls to RANDLC using the same 36 | c arguments will generate a continuous sequence. 37 | c 38 | c This routine should produce the same results on any computer with at least 39 | c 48 mantissa bits in double precision floating point data. On 64 bit 40 | c systems, double precision should be disabled. 41 | c 42 | c David H. Bailey October 26, 1990 43 | c 44 | c---------------------------------------------------------------------*/ 45 | 46 | double t1,t2,t3,t4,a1,a2,x1,x2,z; 47 | 48 | /*c--------------------------------------------------------------------- 49 | c Break A into two parts such that A = 2^23 * A1 + A2. 50 | c---------------------------------------------------------------------*/ 51 | t1 = r23 * a; 52 | a1 = (int)t1; 53 | a2 = a - t23 * a1; 54 | 55 | /*c--------------------------------------------------------------------- 56 | c Break X into two parts such that X = 2^23 * X1 + X2, compute 57 | c Z = A1 * X2 + A2 * X1 (mod 2^23), and then 58 | c X = 2^23 * Z + A2 * X2 (mod 2^46). 
59 | c---------------------------------------------------------------------*/ 60 | t1 = r23 * (*x); 61 | x1 = (int)t1; 62 | x2 = (*x) - t23 * x1; 63 | t1 = a1 * x2 + a2 * x1; 64 | t2 = (int)(r23 * t1); 65 | z = t1 - t23 * t2; 66 | t3 = t23 * z + a2 * x2; 67 | t4 = (int)(r46 * t3); 68 | (*x) = t3 - t46 * t4; 69 | 70 | return (r46 * (*x)); 71 | } 72 | 73 | /*c--------------------------------------------------------------------- 74 | c---------------------------------------------------------------------*/ 75 | 76 | void vranlc (int n, double *x_seed, double a, double y[]) { 77 | 78 | /*c--------------------------------------------------------------------- 79 | c---------------------------------------------------------------------*/ 80 | 81 | /*c--------------------------------------------------------------------- 82 | c 83 | c This routine generates N uniform pseudorandom double precision numbers in 84 | c the range (0, 1) by using the linear congruential generator 85 | c 86 | c x_{k+1} = a x_k (mod 2^46) 87 | c 88 | c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers 89 | c before repeating. The argument A is the same as 'a' in the above formula, 90 | c and X is the same as x_0. A and X must be odd double precision integers 91 | c in the range (1, 2^46). The N results are placed in Y and are normalized 92 | c to be between 0 and 1. X is updated to contain the new seed, so that 93 | c subsequent calls to VRANLC using the same arguments will generate a 94 | c continuous sequence. If N is zero, only initialization is performed, and 95 | c the variables X, A and Y are ignored. 96 | c 97 | c This routine is the standard version designed for scalar or RISC systems. 98 | c However, it should produce the same results on any single processor 99 | c computer with at least 48 mantissa bits in double precision floating point 100 | c data. On 64 bit systems, double precision should be disabled. 
101 | c 102 | c---------------------------------------------------------------------*/ 103 | 104 | int i; 105 | double x,t1,t2,t3,t4,a1,a2,x1,x2,z; 106 | 107 | /*c--------------------------------------------------------------------- 108 | c Break A into two parts such that A = 2^23 * A1 + A2. 109 | c---------------------------------------------------------------------*/ 110 | t1 = r23 * a; 111 | a1 = (int)t1; 112 | a2 = a - t23 * a1; 113 | x = *x_seed; 114 | 115 | /*c--------------------------------------------------------------------- 116 | c Generate N results. This loop is not vectorizable. 117 | c---------------------------------------------------------------------*/ 118 | for (i = 1; i <= n; i++) { 119 | 120 | /*c--------------------------------------------------------------------- 121 | c Break X into two parts such that X = 2^23 * X1 + X2, compute 122 | c Z = A1 * X2 + A2 * X1 (mod 2^23), and then 123 | c X = 2^23 * Z + A2 * X2 (mod 2^46). 124 | c---------------------------------------------------------------------*/ 125 | t1 = r23 * x; 126 | x1 = (int)t1; 127 | x2 = x - t23 * x1; 128 | t1 = a1 * x2 + a2 * x1; 129 | t2 = (int)(r23 * t1); 130 | z = t1 - t23 * t2; 131 | t3 = t23 * z + a2 * x2; 132 | t4 = (int)(r46 * t3); 133 | x = t3 - t46 * t4; 134 | y[i] = r46 * x; 135 | } 136 | *x_seed = x; 137 | } 138 | -------------------------------------------------------------------------------- /NPB-SER/common/c_timers.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #include "wtime.hpp" 5 | #include 6 | 7 | /* Prototype */ 8 | void wtime( double * ); 9 | 10 | 11 | 12 | /*****************************************************************/ 13 | /****** E L A P S E D _ T I M E ******/ 14 | /*****************************************************************/ 15 | double elapsed_time( void ) 16 | { 17 | double t; 18 | 19 | wtime( &t ); 20 | return( t ); 21 | } 22 | 23 | 24 | double start[64], elapsed[64]; 25 | 26 | 
/*****************************************************************/ 27 | /****** T I M E R _ C L E A R ******/ 28 | /*****************************************************************/ 29 | void timer_clear( int n ) 30 | { 31 | elapsed[n] = 0.0; 32 | } 33 | 34 | 35 | /*****************************************************************/ 36 | /****** T I M E R _ S T A R T ******/ 37 | /*****************************************************************/ 38 | void timer_start( int n ) 39 | { 40 | start[n] = elapsed_time(); 41 | } 42 | 43 | 44 | /*****************************************************************/ 45 | /****** T I M E R _ S T O P ******/ 46 | /*****************************************************************/ 47 | void timer_stop( int n ) 48 | { 49 | double t, now; 50 | 51 | now = elapsed_time(); 52 | t = now - start[n]; 53 | elapsed[n] += t; 54 | 55 | } 56 | 57 | 58 | /*****************************************************************/ 59 | /****** T I M E R _ R E A D ******/ 60 | /*****************************************************************/ 61 | double timer_read( int n ) 62 | { 63 | return( elapsed[n] ); 64 | } 65 | 66 | -------------------------------------------------------------------------------- /NPB-SER/common/npb-CPP.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | typedef int boolean; 6 | typedef struct { double real; double imag; } dcomplex; 7 | 8 | #define TRUE 1 9 | #define FALSE 0 10 | 11 | #define max(a,b) (((a) > (b)) ? (a) : (b)) 12 | #define min(a,b) (((a) < (b)) ? 
(a) : (b)) 13 | #define pow2(a) ((a)*(a)) 14 | 15 | #define get_real(c) c.real 16 | #define get_imag(c) c.imag 17 | #define cadd(c,a,b) (c.real = a.real + b.real, c.imag = a.imag + b.imag) 18 | #define csub(c,a,b) (c.real = a.real - b.real, c.imag = a.imag - b.imag) 19 | #define cmul(c,a,b) (c.real = a.real * b.real - a.imag * b.imag, \ 20 | c.imag = a.real * b.imag + a.imag * b.real) 21 | #define crmul(c,a,b) (c.real = a.real * b, c.imag = a.imag * b) 22 | 23 | extern double randlc(double *, double); 24 | extern void vranlc(int, double *, double, double *); 25 | extern void timer_clear(int); 26 | extern void timer_start(int); 27 | extern void timer_stop(int); 28 | extern double timer_read(int); 29 | 30 | extern void c_print_results(char *name, char class_npb, int n1, int n2, 31 | int n3, int niter, double t, 32 | double mops, char *optype, int passed_verification, 33 | char *npbversion, char *compiletime, char *cc, 34 | char *clink, char *c_lib, char *c_inc, 35 | char *cflags, char *clinkflags, char *rand); 36 | -------------------------------------------------------------------------------- /NPB-SER/common/wtime.cpp: -------------------------------------------------------------------------------- 1 | #include "wtime.hpp" 2 | #include 3 | 4 | void wtime(double *t) 5 | { 6 | static int sec = -1; 7 | struct timeval tv; 8 | gettimeofday(&tv, 0); 9 | if (sec < 0) sec = tv.tv_sec; 10 | *t = (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec; 11 | } 12 | 13 | 14 | -------------------------------------------------------------------------------- /NPB-SER/common/wtime.hpp: -------------------------------------------------------------------------------- 1 | /* C/Fortran interface is different on different machines. 2 | * You may need to tweak this. 
3 | */ 4 | 5 | 6 | #if defined(IBM) 7 | #define wtime wtime 8 | #elif defined(CRAY) 9 | #define wtime WTIME 10 | #else 11 | #define wtime wtime_ 12 | #endif 13 | -------------------------------------------------------------------------------- /NPB-SER/common/wtime_sgi64.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | /* The following works on SGI Power Challenge systems */ 10 | 11 | typedef unsigned long iotimer_t; 12 | 13 | unsigned int cycleval; 14 | volatile iotimer_t *iotimer_addr, base_counter; 15 | double resolution; 16 | 17 | /* address_t is an integer type big enough to hold an address */ 18 | typedef unsigned long address_t; 19 | 20 | 21 | 22 | void timer_init() 23 | { 24 | 25 | int fd; 26 | char *virt_addr; 27 | address_t phys_addr, page_offset, pagemask, pagebase_addr; 28 | 29 | pagemask = getpagesize() - 1; 30 | errno = 0; 31 | phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval); 32 | if (errno != 0) { 33 | perror("SGI_QUERY_CYCLECNTR"); 34 | exit(1); 35 | } 36 | /* rel_addr = page offset of physical address */ 37 | page_offset = phys_addr & pagemask; 38 | pagebase_addr = phys_addr - page_offset; 39 | fd = open("/dev/mmem", O_RDONLY); 40 | 41 | virt_addr = mmap(0, pagemask, PROT_READ, MAP_PRIVATE, fd, pagebase_addr); 42 | virt_addr = virt_addr + page_offset; 43 | iotimer_addr = (iotimer_t *)virt_addr; 44 | /* cycleval in picoseconds to this gives resolution in seconds */ 45 | resolution = 1.0e-12*cycleval; 46 | base_counter = *iotimer_addr; 47 | } 48 | 49 | void wtime_(double *time) 50 | { 51 | static int initialized = 0; 52 | volatile iotimer_t counter_value; 53 | if (!initialized) { 54 | timer_init(); 55 | initialized = 1; 56 | } 57 | counter_value = *iotimer_addr - base_counter; 58 | *time = (double)counter_value * resolution; 59 | } 60 | 61 | 62 | void wtime(double *time) 63 | { 64 | static int initialized = 0; 65 | 
volatile iotimer_t counter_value; 66 | if (!initialized) { 67 | timer_init(); 68 | initialized = 1; 69 | } 70 | counter_value = *iotimer_addr - base_counter; 71 | *time = (double)counter_value * resolution; 72 | } 73 | 74 | 75 | -------------------------------------------------------------------------------- /NPB-SER/config/make.def: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------------------------- 2 | # 3 | # SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. 4 | # 5 | #--------------------------------------------------------------------------- 6 | 7 | #--------------------------------------------------------------------------- 8 | # Items in this file will need to be changed for each platform. 9 | # (Note these definitions are inconsistent with NPB2.1.) 10 | #--------------------------------------------------------------------------- 11 | 12 | #--------------------------------------------------------------------------- 13 | # Parallel C: 14 | # 15 | # CC - C compiler 16 | # CFLAGS - C compilation arguments 17 | # C_INC - any -I arguments required for compiling C 18 | # CLINK - C linker 19 | # CLINKFLAGS - C linker flags 20 | # C_LIB - any -L and -l arguments required for linking C 21 | # 22 | # compilations are done with $(CC) $(C_INC) $(CFLAGS) or 23 | # $(CC) $(CFLAGS) 24 | # linking is done with $(CLINK) $(C_LIB) $(CLINKFLAGS) 25 | #--------------------------------------------------------------------------- 26 | 27 | #--------------------------------------------------------------------------- 28 | # This is the C compiler used for OpenMP programs 29 | #--------------------------------------------------------------------------- 30 | CC = g++ -std=c++14 31 | #gcc #cc 32 | # This links C programs; usually the same as ${CC} 33 | CLINK = $(CC) 34 | 35 | #--------------------------------------------------------------------------- 36 | # These macros are passed to the linker 37 | 
#--------------------------------------------------------------------------- 38 | C_LIB = -lm 39 | 40 | #--------------------------------------------------------------------------- 41 | # These macros are passed to the compiler 42 | #--------------------------------------------------------------------------- 43 | C_INC = -I../common 44 | 45 | #--------------------------------------------------------------------------- 46 | # Global *compile time* flags for C programs 47 | #--------------------------------------------------------------------------- 48 | CFLAGS = -O3 49 | # CFLAGS = -g 50 | 51 | #--------------------------------------------------------------------------- 52 | # Global *link time* flags. Flags for increasing maximum executable 53 | # size usually go here. 54 | #--------------------------------------------------------------------------- 55 | CLINKFLAGS = -O3 56 | 57 | 58 | #--------------------------------------------------------------------------- 59 | # Utilities C: 60 | # 61 | # This is the C compiler used to compile C utilities. Flags required by 62 | # this compiler go here also; typically there are few flags required; hence 63 | # there are no separate macros provided for such flags. 64 | #--------------------------------------------------------------------------- 65 | UCC = cc 66 | 67 | 68 | #--------------------------------------------------------------------------- 69 | # Destination of executables, relative to subdirs of the main directory. . 70 | #--------------------------------------------------------------------------- 71 | BINDIR = ../bin 72 | 73 | 74 | #--------------------------------------------------------------------------- 75 | # The variable RAND controls which random number generator 76 | # is used. It is described in detail in Doc/README.install. 77 | # Use "randi8" unless there is a reason to use another one. 
78 | # Other allowed values are "randi8_safe", "randdp" and "randdpvec" 79 | #--------------------------------------------------------------------------- 80 | # RAND = randi8 81 | # The following is highly reliable but may be slow: 82 | RAND = randdp 83 | 84 | 85 | #--------------------------------------------------------------------------- 86 | # The variable WTIME is the name of the wtime source code module in the 87 | # NPB2.x/common directory. 88 | # For most machines, use wtime.cpp 89 | # For SGI power challenge: use wtime_sgi64.cpp 90 | #--------------------------------------------------------------------------- 91 | WTIME = wtime.cpp 92 | 93 | 94 | #--------------------------------------------------------------------------- 95 | # Enable if either Cray or IBM: 96 | # (no such flag for most machines: see common/wtime.hpp) 97 | # This is used by the C compiler to pass the machine name to common/wtime.hpp, 98 | # where the C/Fortran binding interface format is determined 99 | #--------------------------------------------------------------------------- 100 | # MACHINE = -DCRAY 101 | # MACHINE = -DIBM 102 | 103 | 104 | -------------------------------------------------------------------------------- /NPB-SER/config/suite.def: -------------------------------------------------------------------------------- 1 | # config/suite.def 2 | # This file is used to build several benchmarks with a single command. 3 | # Typing "make suite" in the main directory will build all the benchmarks 4 | # specified in this file. 5 | # Each line of this file contains a benchmark name, class, and number 6 | # of nodes. The name is one of "cg", "is", "ep", "mg", "ft". 7 | # The class is one of "S", "W", "A", "B", and "C". 8 | # No blank lines. 9 | # The following example builds serial sample sizes of all benchmarks. 
10 | ft B 11 | mg B 12 | is B 13 | ep B 14 | cg B 15 | -------------------------------------------------------------------------------- /NPB-SER/sys/Makefile: -------------------------------------------------------------------------------- 1 | include ../config/make.def 2 | 3 | # Note that COMPILE is also defined in make.common and should 4 | # be the same. We can't include make.common because it has a lot 5 | # of other garbage. 6 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS) 7 | 8 | all: setparams 9 | 10 | # setparams creates an npbparams.h file for each benchmark 11 | # configuration. npbparams.h also contains info about how a benchmark 12 | # was compiled and linked 13 | 14 | setparams: setparams.cpp ../config/make.def 15 | $(UCC) -o setparams setparams.cpp 16 | 17 | 18 | clean: 19 | -rm -f setparams setparams.hpp npbparams.hpp 20 | -rm -f *~ *.o 21 | 22 | -------------------------------------------------------------------------------- /NPB-SER/sys/README: -------------------------------------------------------------------------------- 1 | This directory contains utilities and files used by the 2 | build process. You should not need to change anything 3 | in this directory. 4 | 5 | Original Files 6 | -------------- 7 | setparams.c: 8 | Source for the setparams program. This program is used internally 9 | in the build process to create the file "npbparams.h" for each 10 | benchmark. npbparams.h contains Fortran or C parameters to build a 11 | benchmark for a specific class. The setparams program is never run 12 | directly by a user. Its invocation syntax is 13 | 14 | "setparams benchmark-name class". 15 | 16 | It examines the file "npbparams.h" in the current directory. If 17 | the specified parameters are the same as those in the npbparams.h 18 | file, nothing is changed. If the file does not exist or corresponds 19 | to a different class/number of nodes, it is (re)built. 
20 | One of the more complicated things in npbparams.h is that it 21 | contains, in a Fortran string, the compiler flags used to build a 22 | benchmark, so that a benchmark can print out how it was compiled. 23 | 24 | make.common 25 | A makefile segment that is included in each individual benchmark 26 | program makefile. It sets up some standard macros (COMPILE, etc) 27 | and makes sure everything is configured correctly (npbparams.h) 28 | 29 | Makefile 30 | Builds setparams 31 | 32 | README 33 | This file. 34 | 35 | 36 | Created files 37 | ------------- 38 | 39 | setparams 40 | See descriptions above 41 | 42 | -------------------------------------------------------------------------------- /NPB-SER/sys/make.common: -------------------------------------------------------------------------------- 1 | PROGRAM = $(BINDIR)/$(BENCHMARK).$(CLASS) 2 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS) 3 | CCOMPILE = $(CC) -c $(C_INC) $(CFLAGS) 4 | 5 | # Class "U" is used internally by the setparams program to mean 6 | # "unknown". This means that if you don't specify CLASS= 7 | # on the command line, you'll get an error. It would be nice 8 | # to be able to avoid this, but we'd have to get information 9 | # from the setparams back to the make program, which isn't easy. 10 | CLASS=U 11 | 12 | default:: ${PROGRAM} 13 | 14 | # This makes sure the configuration utility setparams 15 | # is up to date. 16 | # Note that this must be run every time, which is why the 17 | # target does not exist and is not created. 18 | # If you create a file called "config" you will break things. 
19 | config: 20 | @cd ../sys; ${MAKE} all 21 | ../sys/setparams ${BENCHMARK} ${CLASS} 22 | 23 | COMMON=../common 24 | ${COMMON}/${RAND}.o: ${COMMON}/${RAND}.f 25 | cd ${COMMON}; ${FCOMPILE} ${RAND}.f 26 | 27 | ${COMMON}/c_${RAND}.o: ${COMMON}/c_${RAND}.cpp 28 | cd ${COMMON}; ${CCOMPILE} c_${RAND}.cpp 29 | 30 | ${COMMON}/print_results.o: ${COMMON}/print_results.f 31 | cd ${COMMON}; ${FCOMPILE} print_results.f 32 | 33 | ${COMMON}/c_print_results.o: ${COMMON}/c_print_results.cpp 34 | cd ${COMMON}; ${CCOMPILE} c_print_results.cpp 35 | 36 | ${COMMON}/timers.o: ${COMMON}/timers.f 37 | cd ${COMMON}; ${FCOMPILE} timers.f 38 | 39 | ${COMMON}/c_timers.o: ${COMMON}/c_timers.cpp 40 | cd ${COMMON}; ${CCOMPILE} c_timers.cpp 41 | 42 | ${COMMON}/wtime.o: ${COMMON}/${WTIME} 43 | cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/${WTIME} 44 | # For most machines or CRAY or IBM 45 | # cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/wtime.c 46 | # For a precise timer on an SGI Power Challenge, try: 47 | # cd ${COMMON}; ${CCOMPILE} -o wtime.o ${COMMON}/wtime_sgi64.c 48 | 49 | ${COMMON}/c_wtime.o: ${COMMON}/${WTIME} 50 | cd ${COMMON}; ${CCOMPILE} -o c_wtime.o ${COMMON}/${WTIME} 51 | 52 | 53 | # Normally setparams updates npbparams.h only if the settings (CLASS) 54 | # have changed. However, we also want to update if the compile options 55 | # may have changed (set in ../config/make.def). 56 | npbparams.hpp: ../config/make.def 57 | @ echo make.def modified. 
Rebuilding npbparams.hpp just in case 58 | rm -f npbparams.hpp 59 | ../sys/setparams ${BENCHMARK} ${CLASS} 60 | 61 | # So that "make benchmark-name" works 62 | ${BENCHMARK}: default 63 | ${BENCHMARKU}: default 64 | 65 | 66 | -------------------------------------------------------------------------------- /NPB-SER/sys/print_header: -------------------------------------------------------------------------------- 1 | echo '' 2 | echo ' =========================================' 3 | echo ' = NAS Parallel Benchmarks =' 4 | echo ' = Serial C++ Versions =' 5 | echo ' = Developed by: Dalvan Griebler =' 6 | echo ' = Júnior Löff =' 7 | echo ' = =' 8 | echo ' = Warning: in case of problems =' 9 | echo ' = send an email to us: =' 10 | echo ' = dalvan.griebler@acad.pucrs.br =' 11 | echo ' = junior.loff@acad.pucrs.br =' 12 | echo ' =========================================' 13 | echo '' 14 | -------------------------------------------------------------------------------- /NPB-SER/sys/print_instructions: -------------------------------------------------------------------------------- 1 | echo '' 2 | echo ' To make a NAS benchmark type ' 3 | echo '' 4 | echo ' make <benchmark-name> CLASS=<class>' 5 | echo '' 6 | echo ' where <benchmark-name> is "cg", "ep", "ft", "is", or "mg"' 7 | echo ' <class> is "S", "W", "A", "B" or "C"' 8 | echo '' 9 | echo ' To make a set of benchmarks, create the file config/suite.def' 10 | echo ' according to the instructions in config/suite.def.template and type' 11 | echo '' 12 | echo ' make suite' 13 | echo '' 14 | echo ' ***************************************************************' 15 | echo ' * Remember to edit the file config/make.def for site specific *' 16 | echo ' * information as described in the README file *' 17 | echo ' ***************************************************************' 18 | 19 | -------------------------------------------------------------------------------- /NPB-TBB/CG/Makefile: -------------------------------------------------------------------------------- 1 | 
SHELL=/bin/sh 2 | BENCHMARK=cg 3 | BENCHMARKU=CG 4 | 5 | include ../config/make.def 6 | 7 | OBJS = cg.o ${COMMON}/c_print_results.o \ 8 | ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | cg.o: cg.cpp npbparams.hpp 16 | ${CCOMPILE} cg.cpp 17 | 18 | clean: 19 | - rm -f *.o *~ 20 | - rm -f npbparams.hpp core 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /NPB-TBB/EP/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=ep 3 | BENCHMARKU=EP 4 | 5 | include ../config/make.def 6 | 7 | OBJS = ep.o ${COMMON}/c_print_results.o ${COMMON}/c_${RAND}.o \ 8 | ${COMMON}/c_timers.o ${COMMON}/c_wtime.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | 16 | ep.o: ep.cpp npbparams.hpp 17 | ${CCOMPILE} ep.cpp 18 | 19 | clean: 20 | - rm -f *.o *~ 21 | - rm -f npbparams.hpp core 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /NPB-TBB/EP/ep.cpp: -------------------------------------------------------------------------------- 1 | /*-------------------------------------------------------------------- 2 | 3 | Information on NAS Parallel Benchmarks is available at: 4 | 5 | http://www.nas.nasa.gov/Software/NPB/ 6 | 7 | Authors: P. O. Frederickson 8 | D. H. Bailey 9 | A. C. 
Woo 10 | 11 | CPP and TBB version: 12 | Dalvan Griebler 13 | Júnior Löff 14 | 15 | --------------------------------------------------------------------*/ 16 | 17 | 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "npbparams.hpp" 24 | #include 25 | #include <../common/npb-CPP.hpp> 26 | 27 | /* parameters */ 28 | #define MK 16 29 | #define MM (M - MK) 30 | #define NN (1 << MM) 31 | #define NK (1 << MK) 32 | #define NQ 10 33 | #define EPSILON 1.0e-8 34 | #define A 1220703125.0 35 | #define S 271828183.0 36 | #define TIMERS_ENABLED FALSE 37 | 38 | /* global variables */ 39 | /* common /storage/ */ 40 | static double x[2*NK]; 41 | static double q[NQ]; 42 | 43 | /*-------------------------------------------------------------------- 44 | program EMBAR 45 | c-------------------------------------------------------------------*/ 46 | /* 47 | c This is the serial version of the APP Benchmark 1, 48 | c the "embarassingly parallel" benchmark. 49 | c 50 | c M is the Log_2 of the number of complex pairs of uniform (0, 1) random 51 | c numbers. MK is the Log_2 of the size of each batch of uniform random 52 | c numbers. MK can be set for convenience on a given system, since it does 53 | c not affect the results. 54 | */ 55 | int main(int argc, char **argv) { 56 | double Mops, t1, sx, sy, tm, an, gc; 57 | double dum[3] = { 1.0, 1.0, 1.0 }; 58 | int np,i, k, nit, k_offset, j; 59 | boolean verified; 60 | char size[13+1]; /* character*13 */ 61 | 62 | int num_workers; 63 | if(const char * nw = std::getenv("TBB_NUM_THREADS")) { 64 | num_workers = atoi(nw); 65 | } else { 66 | num_workers = 1; 67 | } 68 | 69 | tbb::task_scheduler_init init(num_workers); 70 | tbb::mutex critical_section; 71 | 72 | /* 73 | c Because the size of the problem is too large to store in a 32-bit 74 | c integer for some classes, we put it into a string (for printing). 
75 | c Have to strip off the decimal point put in there by the floating 76 | c point print statement (internal file) 77 | */ 78 | 79 | printf("NAS Parallel Benchmarks 4.0 OpenMP C++ version"" - EP Benchmark\n"); 80 | printf("Developed by: Dalvan Griebler & Júnior Löff \n\n"); 81 | sprintf(size, "%12.0f", pow(2.0, M+1)); 82 | for (j = 13; j >= 1; j--) { 83 | if (size[j] == '.') size[j] = ' '; 84 | } 85 | printf(" Number of random numbers generated: %13s\n", size); 86 | 87 | verified = FALSE; 88 | 89 | /* 90 | c Compute the number of "batches" of random number pairs generated 91 | c per processor. Adjust if the number of processors does not evenly 92 | c divide the total number 93 | */ 94 | np = NN; 95 | 96 | /* 97 | c Call the random number generator functions and initialize 98 | c the x-array to reduce the effects of paging on the timings. 99 | c Also, call all mathematical functions that are used. Make 100 | c sure these initializations cannot be eliminated as dead code. 101 | */ 102 | vranlc(0, &(dum[0]), dum[1], &(dum[2])); 103 | dum[0] = randlc(&(dum[1]), dum[2]); 104 | for (i = 0; i < 2*NK; i++) x[i] = -1.0e99; 105 | Mops = log(sqrt(fabs(max(1.0, 1.0)))); 106 | 107 | 108 | 109 | timer_clear(1); 110 | timer_clear(2); 111 | timer_clear(3); 112 | 113 | timer_start(1); 114 | 115 | vranlc(0, &t1, A, x); 116 | 117 | /* Compute AN = A ^ (2 * NK) (mod 2^46). */ 118 | 119 | t1 = A; 120 | 121 | for ( i = 1; i <= MK+1; i++) { 122 | an = randlc(&t1, t1); 123 | } 124 | 125 | an = t1; 126 | gc = 0.0; 127 | sx = 0.0; 128 | sy = 0.0; 129 | 130 | for ( i = 0; i <= NQ - 1; i++) { 131 | q[i] = 0.0; 132 | } 133 | 134 | /* 135 | c Each instance of this loop may be performed independently. 
We compute 136 | c the k offsets separately to take into account the fact that some nodes 137 | c have more numbers to generate than others 138 | */ 139 | k_offset = -1; 140 | 141 | tbb::parallel_for(tbb::blocked_range(1,np+1),[&](const tbb::blocked_range& r){ 142 | double t2, t3, t4, x1, x2; 143 | int kk, ik, l; 144 | double qq[NQ]; /* private copy of q[0:NQ-1] */ 145 | double sx_tbb, sy_tbb; 146 | double x[(2*NK)+1]; 147 | 148 | for (int i = 0; i < NQ; i++) 149 | qq[i] = 0.0; 150 | 151 | sx_tbb = 0.0; 152 | sy_tbb = 0.0; 153 | 154 | for(int k=r.begin(); k != r.end(); k++){ 155 | kk = k_offset + k; 156 | double t1 = S; 157 | t2 = an; 158 | 159 | /* Find starting seed t1 for this kk. */ 160 | 161 | for (int i = 1; i <= 100; i++) { 162 | ik = kk / 2; 163 | if (2 * ik != kk) t3 = randlc(&t1, t2); 164 | if (ik == 0) break; 165 | t3 = randlc(&t2, t2); 166 | kk = ik; 167 | } 168 | 169 | /* Compute uniform pseudorandom numbers. */ 170 | 171 | if (TIMERS_ENABLED == TRUE) timer_start(3); 172 | vranlc(2*NK, &t1, A, x); 173 | if (TIMERS_ENABLED == TRUE) timer_stop(3); 174 | 175 | /* 176 | c Compute Gaussian deviates by acceptance-rejection method and 177 | c tally counts in concentric square annuli. This loop is not 178 | c vectorizable. 
179 | */ 180 | if (TIMERS_ENABLED == TRUE) timer_start(2); 181 | 182 | for (int i = 1; i <= NK; i++) { 183 | x1 = 2.0 * x[2*i-1] - 1.0; 184 | x2 = 2.0 * x[2*i] - 1.0; 185 | t1 = pow2(x1) + pow2(x2); 186 | if (t1 <= 1.0) { 187 | t2 = sqrt(-2.0 * log(t1) / t1); 188 | t3 = (x1 * t2); /* Xi */ 189 | t4 = (x2 * t2); /* Yi */ 190 | l = max(fabs(t3), fabs(t4)); 191 | qq[l] += 1.0; /* counts */ 192 | sx_tbb = sx_tbb + t3; /* sum of Xi */ 193 | sy_tbb = sy_tbb + t4; /* sum of Yi */ 194 | } 195 | } 196 | if (TIMERS_ENABLED == TRUE) timer_stop(2); 197 | 198 | } 199 | 200 | critical_section.lock(); 201 | for (int i = 0; i < NQ; i++){ 202 | q[i] += qq[i]; 203 | } 204 | sx += sx_tbb; 205 | sy += sy_tbb; 206 | critical_section.unlock(); 207 | 208 | }); 209 | 210 | for (i = 0; i <= NQ-1; i++) { 211 | gc = gc + q[i]; 212 | } 213 | 214 | timer_stop(1); 215 | 216 | tm = timer_read(1); 217 | 218 | 219 | nit = 0; 220 | if (M == 24) { 221 | if((fabs((sx- (-3.247834652034740e3))/-3.247834652034740e3) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/-6.958407078382297e3) <= EPSILON)) { 222 | verified = TRUE; 223 | } 224 | } else if (M == 25) { 225 | if ((fabs((sx- (-2.863319731645753e3))/-2.863319731645753e3) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/-6.320053679109499e3) <= EPSILON)) { 226 | verified = TRUE; 227 | } 228 | } else if (M == 28) { 229 | //if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) { 230 | if ((fabs((sx- (-4.295875165629892e3))/-4.295875165629892e3) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/-1.580732573678431e4) <= EPSILON)) { 231 | verified = TRUE; 232 | } 233 | } else if (M == 30) { 234 | if ((fabs((sx- (4.033815542441498e4))/4.033815542441498e4) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/-2.660669192809235e4) <= EPSILON)) { 235 | verified = TRUE; 236 | } 237 | } else if (M == 32) { 238 | if ((fabs((sx- (4.764367927995374e4))/4.764367927995374e4) <= EPSILON) && (fabs((sy- 
(-8.084072988043731e4))/-8.084072988043731e4) <= EPSILON)) { 239 | verified = TRUE; 240 | } 241 | } else if (M == 36) { 242 | if ((fabs((sx- (1.982481200946593e5))/1.982481200946593e5) <= EPSILON) && (fabs((sy- (-1.020596636361769e5))/-1.020596636361769e5) <= EPSILON)) { 243 | verified = TRUE; 244 | } 245 | } else if (M == 40) { 246 | if ((fabs((sx- (-5.319717441530e5))/-5.319717441530e5) <= EPSILON) && (fabs((sy- (-3.688834557731e5))/-3.688834557731e5) <= EPSILON)) { 247 | verified = TRUE; 248 | } 249 | } 250 | 251 | Mops = pow(2.0, M+1)/tm/1000000.0; 252 | 253 | printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. Gaussian Pairs = %15.0f\n" 254 | "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy); 255 | for (i = 0; i <= NQ-1; i++) { 256 | printf("%3d %15.0f\n", i, q[i]); 257 | } 258 | 259 | c_print_results((char*)"EP", CLASS, M+1, 0, 0, nit, tm, Mops, (char*)"Random numbers generated", 260 | verified, (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7); 261 | 262 | if (TIMERS_ENABLED == TRUE) { 263 | printf("Total time: %f", timer_read(1)); 264 | printf("Gaussian pairs: %f", timer_read(2)); 265 | printf("Random numbers: %f", timer_read(3)); 266 | } 267 | return 0; 268 | } 269 | -------------------------------------------------------------------------------- /NPB-TBB/FT/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=ft 3 | BENCHMARKU=FT 4 | 5 | include ../config/make.def 6 | 7 | OBJS = ft.o ${COMMON}/c_${RAND}.o ${COMMON}/c_print_results.o \ 8 | ${COMMON}/c_timers.o ${COMMON}/c_wtime.o #../omp-prof.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | ft.o: ft.cpp global.hpp npbparams.hpp 16 | ${CCOMPILE} ft.cpp 17 | 18 | clean: 19 | - rm -f *.o *~ mputil* 20 | - rm -f ft npbparams.hpp core 21 | 
-------------------------------------------------------------------------------- /NPB-TBB/FT/global.hpp: -------------------------------------------------------------------------------- 1 | #include "npbparams.hpp" 2 | 3 | 4 | /* 5 | c If processor array is 1x1 -> 0D grid decomposition 6 | 7 | 8 | c Cache blocking params. These values are good for most 9 | c RISC processors. 10 | c FFT parameters: 11 | c fftblock controls how many ffts are done at a time. 12 | c The default is appropriate for most cache-based machines 13 | c On vector machines, the FFT can be vectorized with vector 14 | c length equal to the block size, so the block size should 15 | c be as large as possible. This is the size of the smallest 16 | c dimension of the problem: 128 for class A, 256 for class B and 17 | c 512 for class C. 18 | */ 19 | 20 | #define FFTBLOCK_DEFAULT 16 21 | #define FFTBLOCKPAD_DEFAULT 18 22 | 23 | #define FFTBLOCK FFTBLOCK_DEFAULT 24 | #define FFTBLOCKPAD FFTBLOCKPAD_DEFAULT 25 | 26 | /* COMMON block: blockinfo */ 27 | int fftblock; 28 | int fftblockpad; 29 | 30 | /* 31 | c we need a bunch of logic to keep track of how 32 | c arrays are laid out. 33 | 34 | 35 | c Note: this serial version is the derived from the parallel 0D case 36 | c of the ft NPB. 
37 | c The computation proceeds logically as 38 | 39 | c set up initial conditions 40 | c fftx(1) 41 | c transpose (1->2) 42 | c ffty(2) 43 | c transpose (2->3) 44 | c fftz(3) 45 | c time evolution 46 | c fftz(3) 47 | c transpose (3->2) 48 | c ffty(2) 49 | c transpose (2->1) 50 | c fftx(1) 51 | c compute residual(1) 52 | 53 | c for the 0D, 1D, 2D strategies, the layouts look like xxx 54 | c 55 | c 0D 1D 2D 56 | c 1: xyz xyz xyz 57 | c 2: xyz xyz yxz 58 | c 3: xyz zyx zxy 59 | 60 | c the array dimensions are stored in dims(coord, phase) 61 | */ 62 | 63 | /* COMMON block: layout */ 64 | static int dims[3][3]; 65 | static int xstart[3]; 66 | static int ystart[3]; 67 | static int zstart[3]; 68 | static int xend[3]; 69 | static int yend[3]; 70 | static int zend[3]; 71 | 72 | #define T_TOTAL 0 73 | #define T_SETUP 1 74 | #define T_FFT 2 75 | #define T_EVOLVE 3 76 | #define T_CHECKSUM 4 77 | #define T_FFTLOW 5 78 | #define T_FFTCOPY 6 79 | #define T_MAX 7 80 | 81 | #define TIMERS_ENABLED FALSE 82 | 83 | /* other stuff */ 84 | 85 | #define SEED 314159265.0 86 | #define A 1220703125.0 87 | #define PI 3.141592653589793238 88 | #define ALPHA 1.0e-6 89 | 90 | #define EXPMAX (NITER_DEFAULT*(NX*NX/4+NY*NY/4+NZ*NZ/4)) 91 | 92 | /* COMMON block: excomm */ 93 | static double ex[EXPMAX+1]; /* ex(0:expmax) */ 94 | 95 | /* 96 | c roots of unity array 97 | c relies on x being largest dimension? 
98 | */ 99 | 100 | /* COMMON block: ucomm */ 101 | static dcomplex u[NX]; 102 | 103 | /* for checksum data */ 104 | 105 | /* COMMON block: sumcomm */ 106 | static dcomplex sums[NITER_DEFAULT+1]; /* sums(0:niter_default) */ 107 | 108 | /* number of iterations*/ 109 | 110 | /* COMMON block: iter */ 111 | static int niter; 112 | 113 | -------------------------------------------------------------------------------- /NPB-TBB/IS/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=is 3 | BENCHMARKU=IS 4 | 5 | include ../config/make.def 6 | 7 | include ../sys/make.common 8 | 9 | OBJS = is.o \ 10 | ${COMMON}/c_print_results.o \ 11 | ${COMMON}/c_timers.o \ 12 | ${COMMON}/c_wtime.o 13 | 14 | 15 | ${PROGRAM}: config ${OBJS} 16 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 17 | 18 | .c.o: 19 | ${CCOMPILE} $< 20 | 21 | is.o: is.cpp npbparams.hpp 22 | ${CCOMPILE} is.cpp 23 | 24 | 25 | clean: 26 | - rm -f *.o *~ mputil* 27 | - rm -f npbparams.hpp core 28 | - if [ -d rii_files ]; then rm -r rii_files; fi 29 | -------------------------------------------------------------------------------- /NPB-TBB/MG/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=mg 3 | BENCHMARKU=MG 4 | 5 | include ../config/make.def 6 | 7 | OBJS = mg.o ${COMMON}/c_print_results.o \ 8 | ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o 9 | 10 | include ../sys/make.common 11 | 12 | ${PROGRAM}: config ${OBJS} 13 | ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB} 14 | 15 | mg.o: mg.cpp npbparams.hpp 16 | ${CCOMPILE} mg.cpp 17 | 18 | clean: 19 | - rm -f *.o *~ 20 | - rm -f npbparams.hpp core 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /NPB-TBB/MG/globals.hpp: -------------------------------------------------------------------------------- 1 | 
/*-------------------------------------------------------------------- 2 | c Parameter lm (declared and set in "npbparams.h") is the log-base2 of 3 | c the edge size max for the partition on a given node, so must be changed 4 | c either to save space (if running a small case) or made bigger for larger 5 | c cases, for example, 512^3. Thus lm=7 means that the largest dimension 6 | c of a partition that can be solved on a node is 2^7 = 128. lm is set 7 | c automatically in npbparams.h 8 | c Parameters ndim1, ndim2, ndim3 are the local problem dimensions. 9 | c-------------------------------------------------------------------*/ 10 | 11 | #include "npbparams.hpp" 12 | 13 | /* parameters */ 14 | /* actual dimension including ghost cells for communications */ 15 | #define NM (2+(2<<(LM-1))) 16 | /* size of rhs array */ 17 | #define NV (2+(2<<(NDIM1-1))*(2+(2<<(NDIM2-1)))*(2+(2<<(NDIM3-1)))) 18 | /* size of residual array */ 19 | #define NR ((8*(NV+(NM*NM)+5*NM+7*LM))/7) 20 | /* size of communication buffer */ 21 | #define NM2 (2*NM*NM) 22 | /* maximum number of levels */ 23 | #define MAXLEVEL 11 24 | 25 | /*---------------------------------------------------------------------*/ 26 | /* common /mg3/ */ 27 | static int nx[MAXLEVEL+1], ny[MAXLEVEL+1], nz[MAXLEVEL+1]; 28 | /* common /ClassType/ */ 29 | static char class_npb; 30 | /* common /my_debug/ */ 31 | static int debug_vec[8]; 32 | /* common /fap/ */ 33 | /*static int ir[MAXLEVEL], m1[MAXLEVEL], m2[MAXLEVEL], m3[MAXLEVEL];*/ 34 | static int m1[MAXLEVEL+1], m2[MAXLEVEL+1], m3[MAXLEVEL+1]; 35 | static int lt, lb; 36 | 37 | /*c--------------------------------------------------------------------- 38 | c Set at m=1024, can handle cases up to 1024^3 case 39 | c---------------------------------------------------------------------*/ 40 | #define M 1037 41 | 42 | /* common /buffer/ */ 43 | /*static double buff[4][NM2];*/ 44 | -------------------------------------------------------------------------------- /NPB-TBB/Makefile: 
-------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | CLASS=S 3 | SFILE=config/suite.def 4 | 5 | default: header 6 | @ $(SHELL) sys/print_instructions 7 | 8 | BT: bt 9 | bt: header 10 | cd BT; $(MAKE) CLASS=$(CLASS) 11 | 12 | SP: sp 13 | sp: header 14 | cd SP; $(MAKE) CLASS=$(CLASS) 15 | 16 | LU: lu 17 | lu: header 18 | cd LU; $(MAKE) CLASS=$(CLASS) 19 | 20 | MG: mg 21 | mg: header 22 | cd MG; $(MAKE) CLASS=$(CLASS) 23 | 24 | FT: ft 25 | ft: header 26 | cd FT; $(MAKE) CLASS=$(CLASS) 27 | 28 | IS: is 29 | is: header 30 | cd IS; $(MAKE) CLASS=$(CLASS) 31 | 32 | CG: cg 33 | cg: header 34 | cd CG; $(MAKE) CLASS=$(CLASS) 35 | 36 | EP: ep 37 | ep: header 38 | cd EP; $(MAKE) CLASS=$(CLASS) 39 | DC: dc 40 | dc: header 41 | cd DC; $(MAKE) CLASS=$(CLASS) 42 | 43 | # Awk script courtesy cmg@cray.com 44 | suite: 45 | @ awk '{ if ($$1 !~ /^#/ && NF > 0) \ 46 | printf "make %s CLASS=%s\n", $$1, $$2 }' $(SFILE) \ 47 | | $(SHELL) 48 | 49 | 50 | # It would be nice to make clean in each subdirectory (the targets 51 | # are defined) but on a really clean system this won't work 52 | # because those makefiles need config/make.def 53 | clean: 54 | - rm -f core 55 | - rm -f *~ */core */*~ */*.o */npbparams.hpp */*.obj */*.exe 56 | - rm -f sys/setparams sys/makesuite sys/setparams.hpp 57 | 58 | cleanall: clean 59 | - rm -r bin/* 60 | 61 | veryclean: clean 62 | - rm config/make.def config/suite.def Part* 63 | - rm bin/sp.* bin/lu.* bin/mg.* bin/ft.* bin/bt.* bin/is.* bin/ep.* bin/cg.* 64 | 65 | header: 66 | @ $(SHELL) sys/print_header 67 | 68 | kit: 69 | - makekit -s100k -k30 * */* */*/* 70 | 71 | 72 | -------------------------------------------------------------------------------- /NPB-TBB/README.md: -------------------------------------------------------------------------------- 1 | # Warning: this project is continued at [NPB-CPP](https://github.com/GMAP/NPB-CPP) 2 | 3 | ## We are happy to announce that both NPB Kernels and 
pseudo-applications are available at our new repository [NPB-CPP](https://github.com/GMAP/NPB-CPP). 4 | 5 | This was our first work on the NAS Parallel Benchmarks (NPB) suite and many other works are now continuing this project in many different ways. 6 | 7 | *Note: this repository will no longer be updated; therefore, follow us at [NPB-CPP](https://github.com/GMAP/NPB-CPP)* 8 | 9 | 10 | ## How to cite this work 11 | 12 | [[DOI]](https://doi.org/10.1109/PDP2018.2018.00120) D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018. 13 | 14 | ## The NPB-CPP Benchmark 15 | 16 | These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version. 
17 | 18 | ================================================================== 19 | NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB 20 | 21 | Code contributors: 22 | Dalvan Griebler 23 | Júnior Löff 24 | 25 | Warning: in case of problems send an email to us: 26 | dalvan.griebler@acad.pucrs.br 27 | junior.loff@acad.pucrs.br 28 | ================================================================== 29 | 30 | 31 | This folder contains: 32 | 33 | - NPB-FF - Directory with the parallel version implemented in FastFlow 34 | - NPB-OMP - Directory with the parallel version translated from the original NPB version 35 | - NPB-SER - Directory with the serial version of the NPB ported to C++ 36 | - NPB-TBB - Directory with the parallel version implemented in Thread Building Blocks 37 | 38 | Each directory is independent and contains its own implemented version of the kernels: 39 | 40 | IS - Integer Sort, random memory access 41 | EP - Embarrassingly Parallel 42 | CG - Conjugate Gradient, irregular memory access and communication 43 | MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive 44 | FT - discrete 3D fast Fourier Transform, all-to-all communication 45 | 46 | ## Software Requirements 47 | 48 | *Warning: our tests were made with GCC-5* 49 | 50 | **TBB** 51 | 52 | *Installation* 53 | 54 | apt-get install libtbb-dev 55 | 56 | **FastFlow** 57 | 58 | *Installation* 59 | 60 | svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow 61 | 62 | 63 | ## How to Compile 64 | 65 | Enter the directory from the version desired and execute: 66 | 67 | make _BENCHMARK CLASS=_VERSION 68 | 69 | 70 | _BENCHMARKs are: 71 | 72 | EP, CG, MG, IS and FT 73 | 74 | _VERSIONs are: 75 | 76 | Class S: small for quick test purposes 77 | Class W: workstation size (a 90's workstation; now likely too small) 78 | Classes A, B, C: standard test problems; ~4X size increase going from one class to the next 79 | Classes D, E, F: large test problems; 
~16X size increase from each of the previous Classes 80 | 81 | 82 | Command: 83 | 84 | make ep CLASS=B 85 | -------------------------------------------------------------------------------- /NPB-TBB/bin/README.md: -------------------------------------------------------------------------------- 1 | # How to Cite our Work 2 | 3 | D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018. 4 | 5 | # The NPB-CPP Benchmark 6 | 7 | These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version. 8 | 9 | ================================================================== 10 | NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB 11 | 12 | Code contributors: 13 | Dalvan Griebler 14 | Júnior Löff 15 | 16 | Warning: in case of problems send an email to us: 17 | dalvan.griebler@acad.pucrs.br 18 | junior.loff@acad.pucrs.br 19 | ================================================================== 20 | 21 | 22 | This folder contains: 23 | 24 | - NPB-FF - Directory with the parallel version implemented in FastFlow 25 | - NPB-OMP - Directory with the parallel version translated from the original NPB version 26 | - NPB-SER - Directory with the serial version of the NPB ported to C++ 27 | - NPB-TBB - Directory with the parallel version implemented in Thread Building Blocks 28 | 29 | Each directory is independent and contains its own implemented version of the kernels: 30 | 31 | IS - Integer Sort, random memory access 32 | EP - Embarrassingly Parallel 33 | CG - Conjugate Gradient, irregular memory access and communication 34 | MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive 35 
| FT - discrete 3D fast Fourier Transform, all-to-all communication 36 | 37 | # Software Requirements 38 | 39 | *Warning: our tests were made with GCC-5* 40 | 41 | **TBB** 42 | 43 | *Installation* 44 | 45 | apt-get install libtbb-dev 46 | 47 | **FastFlow** 48 | 49 | *Installation* 50 | 51 | svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow 52 | 53 | 54 | # How to Compile 55 | 56 | Enter the directory from the version desired and execute: 57 | 58 | make _BENCHMARK CLASS=_VERSION 59 | 60 | 61 | _BENCHMARKs are: 62 | 63 | EP, CG, MG, IS and FT 64 | 65 | _VERSIONs are: 66 | 67 | Class S: small for quick test purposes 68 | Class W: workstation size (a 90's workstation; now likely too small) 69 | Classes A, B, C: standard test problems; ~4X size increase going from one class to the next 70 | Classes D, E, F: large test problems; ~16X size increase from each of the previous Classes 71 | 72 | 73 | Command: 74 | 75 | make ep CLASS=B -------------------------------------------------------------------------------- /NPB-TBB/common/c_print_results.cpp: -------------------------------------------------------------------------------- 1 | /*****************************************************************/ 2 | /****** C _ P R I N T _ R E S U L T S ******/ 3 | /*****************************************************************/ 4 | #include 5 | #include 6 | 7 | void c_print_results( char *name, char class_npb, int n1, int n2, int n3, int niter, double t, 8 | double mops, char *optype, int passed_verification, char *npbversion, char *compiletime, char *cc, 9 | char *clink, char *c_lib, char *c_inc, char *cflags, char *clinkflags, char *rand) 10 | { 11 | 12 | printf( "\n\n %s Benchmark Completed\n", name ); 13 | 14 | printf( " class_npb = %c\n", class_npb ); 15 | 16 | if( n2 == 0 && n3 == 0 ) 17 | printf( " Size = %12d\n", n1 ); /* as in IS */ 18 | else 19 | printf( " Size = %3dx%3dx%3d\n", n1,n2,n3 ); 20 | 21 | printf( " Iterations = %12d\n", niter ); 22 | 23 | printf( 
" Time in seconds = %12.2f\n", t ); 24 | 25 | printf( " Mop/s total = %12.2f\n", mops ); 26 | 27 | printf( " Operation type = %24s\n", optype); 28 | 29 | if( passed_verification ) 30 | printf( " Verification = SUCCESSFUL\n" ); 31 | else 32 | printf( " Verification = UNSUCCESSFUL\n" ); 33 | 34 | printf( " Version = %12s\n", npbversion ); 35 | 36 | printf( " Compile date = %12s\n", compiletime ); 37 | 38 | printf( "\n Compile options:\n" ); 39 | 40 | printf( " CC = %s\n", cc ); 41 | 42 | printf( " CLINK = %s\n", clink ); 43 | 44 | printf( " C_LIB = %s\n", c_lib ); 45 | 46 | printf( " C_INC = %s\n", c_inc ); 47 | 48 | printf( " CFLAGS = %s\n", cflags ); 49 | 50 | printf( " CLINKFLAGS = %s\n", clinkflags ); 51 | 52 | printf( " RAND = %s\n", rand ); 53 | #ifdef SMP 54 | char *evalue = getenv("MP_SET_NUMTHREADS"); 55 | printf( " MULTICPUS = %s\n", evalue ); 56 | #endif 57 | 58 | /* printf( "\n\n" ); 59 | printf( " Please send the results of this run to:\n\n" ); 60 | printf( " NPB Development Team\n" ); 61 | printf( " Internet: npb@nas.nasa.gov\n \n" ); 62 | printf( " If email is not available, send this to:\n\n" ); 63 | printf( " MS T27A-1\n" ); 64 | printf( " NASA Ames Research Center\n" ); 65 | printf( " Moffett Field, CA 94035-1000\n\n" ); 66 | printf( " Fax: 415-604-3957\n\n" );*/ 67 | } 68 | 69 | -------------------------------------------------------------------------------- /NPB-TBB/common/c_randdp.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | */ 3 | #if defined(USE_POW) 4 | #define r23 pow(0.5, 23.0) 5 | #define r46 (r23*r23) 6 | #define t23 pow(2.0, 23.0) 7 | #define t46 (t23*t23) 8 | #else 9 | #define r23 (0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5) 10 | #define r46 (r23*r23) 11 | #define t23 (2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0) 12 | #define t46 (t23*t23) 13 | #endif 14 | 15 | 
/*c--------------------------------------------------------------------- 16 | c---------------------------------------------------------------------*/ 17 | 18 | double randlc (double *x, double a) { 19 | 20 | /*c--------------------------------------------------------------------- 21 | c---------------------------------------------------------------------*/ 22 | 23 | /*c--------------------------------------------------------------------- 24 | c 25 | c This routine returns a uniform pseudorandom double precision number in the 26 | c range (0, 1) by using the linear congruential generator 27 | c 28 | c x_{k+1} = a x_k (mod 2^46) 29 | c 30 | c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers 31 | c before repeating. The argument A is the same as 'a' in the above formula, 32 | c and X is the same as x_0. A and X must be odd double precision integers 33 | c in the range (1, 2^46). The returned value RANDLC is normalized to be 34 | c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain 35 | c the new seed x_1, so that subsequent calls to RANDLC using the same 36 | c arguments will generate a continuous sequence. 37 | c 38 | c This routine should produce the same results on any computer with at least 39 | c 48 mantissa bits in double precision floating point data. On 64 bit 40 | c systems, double precision should be disabled. 41 | c 42 | c David H. Bailey October 26, 1990 43 | c 44 | c---------------------------------------------------------------------*/ 45 | 46 | double t1,t2,t3,t4,a1,a2,x1,x2,z; 47 | 48 | /*c--------------------------------------------------------------------- 49 | c Break A into two parts such that A = 2^23 * A1 + A2. 
50 | c---------------------------------------------------------------------*/ 51 | t1 = r23 * a; 52 | a1 = (int)t1; 53 | a2 = a - t23 * a1; 54 | 55 | /*c--------------------------------------------------------------------- 56 | c Break X into two parts such that X = 2^23 * X1 + X2, compute 57 | c Z = A1 * X2 + A2 * X1 (mod 2^23), and then 58 | c X = 2^23 * Z + A2 * X2 (mod 2^46). 59 | c---------------------------------------------------------------------*/ 60 | t1 = r23 * (*x); 61 | x1 = (int)t1; 62 | x2 = (*x) - t23 * x1; 63 | t1 = a1 * x2 + a2 * x1; 64 | t2 = (int)(r23 * t1); 65 | z = t1 - t23 * t2; 66 | t3 = t23 * z + a2 * x2; 67 | t4 = (int)(r46 * t3); 68 | (*x) = t3 - t46 * t4; 69 | 70 | return (r46 * (*x)); 71 | } 72 | 73 | /*c--------------------------------------------------------------------- 74 | c---------------------------------------------------------------------*/ 75 | 76 | void vranlc (int n, double *x_seed, double a, double y[]) { 77 | 78 | /*c--------------------------------------------------------------------- 79 | c---------------------------------------------------------------------*/ 80 | 81 | /*c--------------------------------------------------------------------- 82 | c 83 | c This routine generates N uniform pseudorandom double precision numbers in 84 | c the range (0, 1) by using the linear congruential generator 85 | c 86 | c x_{k+1} = a x_k (mod 2^46) 87 | c 88 | c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers 89 | c before repeating. The argument A is the same as 'a' in the above formula, 90 | c and X is the same as x_0. A and X must be odd double precision integers 91 | c in the range (1, 2^46). The N results are placed in Y and are normalized 92 | c to be between 0 and 1. X is updated to contain the new seed, so that 93 | c subsequent calls to VRANLC using the same arguments will generate a 94 | c continuous sequence. 
If N is zero, only initialization is performed, and 95 | c the variables X, A and Y are ignored. 96 | c 97 | c This routine is the standard version designed for scalar or RISC systems. 98 | c However, it should produce the same results on any single processor 99 | c computer with at least 48 mantissa bits in double precision floating point 100 | c data. On 64 bit systems, double precision should be disabled. 101 | c 102 | c---------------------------------------------------------------------*/ 103 | 104 | int i; 105 | double x,t1,t2,t3,t4,a1,a2,x1,x2,z; 106 | 107 | /*c--------------------------------------------------------------------- 108 | c Break A into two parts such that A = 2^23 * A1 + A2. 109 | c---------------------------------------------------------------------*/ 110 | t1 = r23 * a; 111 | a1 = (int)t1; 112 | a2 = a - t23 * a1; 113 | x = *x_seed; 114 | 115 | /*c--------------------------------------------------------------------- 116 | c Generate N results. This loop is not vectorizable. 117 | c---------------------------------------------------------------------*/ 118 | for (i = 1; i <= n; i++) { 119 | 120 | /*c--------------------------------------------------------------------- 121 | c Break X into two parts such that X = 2^23 * X1 + X2, compute 122 | c Z = A1 * X2 + A2 * X1 (mod 2^23), and then 123 | c X = 2^23 * Z + A2 * X2 (mod 2^46). 
124 | c---------------------------------------------------------------------*/ 125 | t1 = r23 * x; 126 | x1 = (int)t1; 127 | x2 = x - t23 * x1; 128 | t1 = a1 * x2 + a2 * x1; 129 | t2 = (int)(r23 * t1); 130 | z = t1 - t23 * t2; 131 | t3 = t23 * z + a2 * x2; 132 | t4 = (int)(r46 * t3); 133 | x = t3 - t46 * t4; 134 | y[i] = r46 * x; 135 | } 136 | *x_seed = x; 137 | } 138 | -------------------------------------------------------------------------------- /NPB-TBB/common/c_timers.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #include "wtime.hpp" 5 | #include 6 | 7 | /* Prototype */ 8 | void wtime( double * ); 9 | 10 | 11 | 12 | /*****************************************************************/ 13 | /****** E L A P S E D _ T I M E ******/ 14 | /*****************************************************************/ 15 | double elapsed_time( void ) 16 | { 17 | double t; 18 | 19 | wtime( &t ); 20 | return( t ); 21 | } 22 | 23 | 24 | double start[64], elapsed[64]; 25 | 26 | /*****************************************************************/ 27 | /****** T I M E R _ C L E A R ******/ 28 | /*****************************************************************/ 29 | void timer_clear( int n ) 30 | { 31 | elapsed[n] = 0.0; 32 | } 33 | 34 | 35 | /*****************************************************************/ 36 | /****** T I M E R _ S T A R T ******/ 37 | /*****************************************************************/ 38 | void timer_start( int n ) 39 | { 40 | start[n] = elapsed_time(); 41 | } 42 | 43 | 44 | /*****************************************************************/ 45 | /****** T I M E R _ S T O P ******/ 46 | /*****************************************************************/ 47 | void timer_stop( int n ) 48 | { 49 | double t, now; 50 | 51 | now = elapsed_time(); 52 | t = now - start[n]; 53 | elapsed[n] += t; 54 | 55 | } 56 | 57 | 58 | /*****************************************************************/ 59 | /****** 
T I M E R _ R E A D ******/ 60 | /*****************************************************************/ 61 | double timer_read( int n ) 62 | { 63 | return( elapsed[n] ); 64 | } 65 | 66 | -------------------------------------------------------------------------------- /NPB-TBB/common/npb-CPP.hpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | 6 | typedef int boolean; 7 | typedef struct { double real; double imag; } dcomplex; 8 | 9 | #define TRUE 1 10 | #define FALSE 0 11 | 12 | #define max(a,b) (((a) > (b)) ? (a) : (b)) 13 | #define min(a,b) (((a) < (b)) ? (a) : (b)) 14 | #define pow2(a) ((a)*(a)) 15 | 16 | #define get_real(c) c.real 17 | #define get_imag(c) c.imag 18 | #define cadd(c,a,b) (c.real = a.real + b.real, c.imag = a.imag + b.imag) 19 | #define csub(c,a,b) (c.real = a.real - b.real, c.imag = a.imag - b.imag) 20 | #define cmul(c,a,b) (c.real = a.real * b.real - a.imag * b.imag, \ 21 | c.imag = a.real * b.imag + a.imag * b.real) 22 | #define crmul(c,a,b) (c.real = a.real * b, c.imag = a.imag * b) 23 | 24 | extern double randlc(double *, double); 25 | extern void vranlc(int, double *, double, double *); 26 | extern void timer_clear(int); 27 | extern void timer_start(int); 28 | extern void timer_stop(int); 29 | extern double timer_read(int); 30 | 31 | extern void c_print_results(char *name, char class_npb, int n1, int n2, 32 | int n3, int niter, double t, 33 | double mops, char *optype, int passed_verification, 34 | char *npbversion, char *compiletime, char *cc, 35 | char *clink, char *c_lib, char *c_inc, 36 | char *cflags, char *clinkflags, char *rand); 37 | -------------------------------------------------------------------------------- /NPB-TBB/common/wtime.cpp: -------------------------------------------------------------------------------- 1 | #include "wtime.hpp" 2 | #include 3 | 4 | void wtime(double *t) 5 | { 6 | static int sec = -1; 7 | struct timeval tv; 8 | 
gettimeofday(&tv, 0); 9 | if (sec < 0) sec = tv.tv_sec; 10 | *t = (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec; 11 | } 12 | 13 | 14 | -------------------------------------------------------------------------------- /NPB-TBB/common/wtime.hpp: -------------------------------------------------------------------------------- 1 | /* C/Fortran interface is different on different machines. 2 | * You may need to tweak this. 3 | */ 4 | 5 | 6 | #if defined(IBM) 7 | #define wtime wtime 8 | #elif defined(CRAY) 9 | #define wtime WTIME 10 | #else 11 | #define wtime wtime_ 12 | #endif 13 | -------------------------------------------------------------------------------- /NPB-TBB/common/wtime_sgi64.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | /* The following works on SGI Power Challenge systems */ 10 | 11 | typedef unsigned long iotimer_t; 12 | 13 | unsigned int cycleval; 14 | volatile iotimer_t *iotimer_addr, base_counter; 15 | double resolution; 16 | 17 | /* address_t is an integer type big enough to hold an address */ 18 | typedef unsigned long address_t; 19 | 20 | 21 | 22 | void timer_init() 23 | { 24 | 25 | int fd; 26 | char *virt_addr; 27 | address_t phys_addr, page_offset, pagemask, pagebase_addr; 28 | 29 | pagemask = getpagesize() - 1; 30 | errno = 0; 31 | phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval); 32 | if (errno != 0) { 33 | perror("SGI_QUERY_CYCLECNTR"); 34 | exit(1); 35 | } 36 | /* rel_addr = page offset of physical address */ 37 | page_offset = phys_addr & pagemask; 38 | pagebase_addr = phys_addr - page_offset; 39 | fd = open("/dev/mmem", O_RDONLY); 40 | 41 | virt_addr = mmap(0, pagemask, PROT_READ, MAP_PRIVATE, fd, pagebase_addr); 42 | virt_addr = virt_addr + page_offset; 43 | iotimer_addr = (iotimer_t *)virt_addr; 44 | /* cycleval in picoseconds to this gives resolution in seconds */ 45 | resolution = 1.0e-12*cycleval; 46 | 
base_counter = *iotimer_addr; 47 | } 48 | 49 | void wtime_(double *time) 50 | { 51 | static int initialized = 0; 52 | volatile iotimer_t counter_value; 53 | if (!initialized) { 54 | timer_init(); 55 | initialized = 1; 56 | } 57 | counter_value = *iotimer_addr - base_counter; 58 | *time = (double)counter_value * resolution; 59 | } 60 | 61 | 62 | void wtime(double *time) 63 | { 64 | static int initialized = 0; 65 | volatile iotimer_t counter_value; 66 | if (!initialized) { 67 | timer_init(); 68 | initialized = 1; 69 | } 70 | counter_value = *iotimer_addr - base_counter; 71 | *time = (double)counter_value * resolution; 72 | } 73 | 74 | 75 | -------------------------------------------------------------------------------- /NPB-TBB/config/make.def: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------------------------- 2 | # 3 | # SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. 4 | # 5 | #--------------------------------------------------------------------------- 6 | 7 | #--------------------------------------------------------------------------- 8 | # Items in this file will need to be changed for each platform. 9 | # (Note these definitions are inconsistent with NPB2.1.) 
10 | #--------------------------------------------------------------------------- 11 | 12 | #--------------------------------------------------------------------------- 13 | # Parallel C: 14 | # 15 | # CC - C compiler 16 | # CFLAGS - C compilation arguments 17 | # C_INC - any -I arguments required for compiling C 18 | # CLINK - C linker 19 | # CLINKFLAGS - C linker flags 20 | # C_LIB - any -L and -l arguments required for linking C 21 | # 22 | # compilations are done with $(CC) $(C_INC) $(CFLAGS) or 23 | # $(CC) $(CFLAGS) 24 | # linking is done with $(CLINK) $(C_LIB) $(CLINKFLAGS) 25 | #--------------------------------------------------------------------------- 26 | 27 | #--------------------------------------------------------------------------- 28 | # This is the C compiler used for OpenMP programs 29 | #--------------------------------------------------------------------------- 30 | CC = g++ -std=c++14 31 | #gcc #cc 32 | # This links C programs; usually the same as ${CC} 33 | CLINK = $(CC) 34 | 35 | #--------------------------------------------------------------------------- 36 | # These macros are passed to the linker 37 | #--------------------------------------------------------------------------- 38 | C_LIB = -lm -ltbb 39 | 40 | #--------------------------------------------------------------------------- 41 | # These macros are passed to the compiler 42 | #--------------------------------------------------------------------------- 43 | C_INC = -I../common 44 | 45 | #--------------------------------------------------------------------------- 46 | # Global *compile time* flags for C programs 47 | #--------------------------------------------------------------------------- 48 | CFLAGS = -O3 49 | # CFLAGS = -g 50 | 51 | #--------------------------------------------------------------------------- 52 | # Global *link time* flags. Flags for increasing maximum executable 53 | # size usually go here. 
54 | #--------------------------------------------------------------------------- 55 | CLINKFLAGS = -O3 56 | 57 | 58 | #--------------------------------------------------------------------------- 59 | # Utilities C: 60 | # 61 | # This is the C compiler used to compile C utilities. Flags required by 62 | # this compiler go here also; typically there are few flags required; hence 63 | # there are no separate macros provided for such flags. 64 | #--------------------------------------------------------------------------- 65 | UCC = cc 66 | 67 | 68 | #--------------------------------------------------------------------------- 69 | # Destination of executables, relative to subdirs of the main directory. . 70 | #--------------------------------------------------------------------------- 71 | BINDIR = ../bin 72 | 73 | 74 | #--------------------------------------------------------------------------- 75 | # The variable RAND controls which random number generator 76 | # is used. It is described in detail in Doc/README.install. 77 | # Use "randi8" unless there is a reason to use another one. 78 | # Other allowed values are "randi8_safe", "randdp" and "randdpvec" 79 | #--------------------------------------------------------------------------- 80 | # RAND = randi8 81 | # The following is highly reliable but may be slow: 82 | RAND = randdp 83 | 84 | 85 | #--------------------------------------------------------------------------- 86 | # The variable WTIME is the name of the wtime source code module in the 87 | # NPB2.x/common directory. 
88 | # For most machines, use wtime.cpp 89 | # For SGI power challenge: use wtime_sgi64.cpp 90 | #--------------------------------------------------------------------------- 91 | WTIME = wtime.cpp 92 | 93 | 94 | #--------------------------------------------------------------------------- 95 | # Enable if either Cray or IBM: 96 | # (no such flag for most machines: see common/wtime.hpp) 97 | # This is used by the C compiler to pass the machine name to common/wtime.hpp, 98 | # where the C/Fortran binding interface format is determined 99 | #--------------------------------------------------------------------------- 100 | # MACHINE = -DCRAY 101 | # MACHINE = -DIBM 102 | 103 | 104 | -------------------------------------------------------------------------------- /NPB-TBB/config/suite.def: -------------------------------------------------------------------------------- 1 | # config/suite.def 2 | # This file is used to build several benchmarks with a single command. 3 | # Typing "make suite" in the main directory will build all the benchmarks 4 | # specified in this file. 5 | # Each line of this file contains a benchmark name, class, and number 6 | # of nodes. The name is one of "cg", "is", "ep", "mg", "ft" 7 | # The class is one of "S", "W", "A", "B", and "C". 8 | # No blank lines. 9 | # The following example builds serial sample sizes of all benchmarks. 10 | ft A 11 | mg A 12 | is A 13 | ep A 14 | cg A 15 | -------------------------------------------------------------------------------- /NPB-TBB/sys/Makefile: -------------------------------------------------------------------------------- 1 | include ../config/make.def 2 | 3 | # Note that COMPILE is also defined in make.common and should 4 | # be the same. We can't include make.common because it has a lot 5 | # of other garbage. 6 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS) 7 | 8 | all: setparams 9 | 10 | # setparams creates an npbparam.h file for each benchmark 11 | # configuration. 
npbparams.h also contains info about how a benchmark 12 | # was compiled and linked 13 | 14 | setparams: setparams.cpp ../config/make.def 15 | $(UCC) -o setparams setparams.cpp 16 | 17 | 18 | clean: 19 | -rm -f setparams setparams.hpp npbparams.hpp 20 | -rm -f *~ *.o 21 | 22 | -------------------------------------------------------------------------------- /NPB-TBB/sys/README: -------------------------------------------------------------------------------- 1 | This directory contains utilities and files used by the 2 | build process. You should not need to change anything 3 | in this directory. 4 | 5 | Original Files 6 | -------------- 7 | setparams.cpp: 8 | Source for the setparams program. This program is used internally 9 | in the build process to create the file "npbparams.h" for each 10 | benchmark. npbparams.h contains Fortran or C parameters to build a 11 | benchmark for a specific class. The setparams program is never run 12 | directly by a user. Its invocation syntax is 13 | 14 | "setparams benchmark-name class". 15 | 16 | It examines the file "npbparams.h" in the current directory. If 17 | the specified parameters are the same as those in the npbparams.h 18 | file, nothing is changed. If the file does not exist or corresponds 19 | to a different class/number of nodes, it is (re)built. 20 | One of the more complicated things in npbparams.h is that it 21 | contains, in a Fortran string, the compiler flags used to build a 22 | benchmark, so that a benchmark can print out how it was compiled. 23 | 24 | make.common 25 | A makefile segment that is included in each individual benchmark 26 | program makefile. It sets up some standard macros (COMPILE, etc) 27 | and makes sure everything is configured correctly (npbparams.h) 28 | 29 | Makefile 30 | Builds setparams 31 | 32 | README 33 | This file. 
34 | 35 | 36 | Created files 37 | ------------- 38 | 39 | setparams 40 | See descriptions above 41 | 42 | -------------------------------------------------------------------------------- /NPB-TBB/sys/make.common: -------------------------------------------------------------------------------- 1 | PROGRAM = $(BINDIR)/$(BENCHMARK).$(CLASS) 2 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS) 3 | CCOMPILE = $(CC) -c $(C_INC) $(CFLAGS) 4 | 5 | # Class "U" is used internally by the setparams program to mean 6 | # "unknown". This means that if you don't specify CLASS= 7 | # on the command line, you'll get an error. It would be nice 8 | # to be able to avoid this, but we'd have to get information 9 | # from the setparams back to the make program, which isn't easy. 10 | CLASS=U 11 | 12 | default:: ${PROGRAM} 13 | 14 | # This makes sure the configuration utility setparams 15 | # is up to date. 16 | # Note that this must be run every time, which is why the 17 | # target does not exist and is not created. 18 | # If you create a file called "config" you will break things. 
19 | config: 20 | @cd ../sys; ${MAKE} all 21 | ../sys/setparams ${BENCHMARK} ${CLASS} 22 | 23 | COMMON=../common 24 | ${COMMON}/${RAND}.o: ${COMMON}/${RAND}.f 25 | cd ${COMMON}; ${FCOMPILE} ${RAND}.f 26 | 27 | ${COMMON}/c_${RAND}.o: ${COMMON}/c_${RAND}.cpp 28 | cd ${COMMON}; ${CCOMPILE} c_${RAND}.cpp 29 | 30 | ${COMMON}/print_results.o: ${COMMON}/print_results.f 31 | cd ${COMMON}; ${FCOMPILE} print_results.f 32 | 33 | ${COMMON}/c_print_results.o: ${COMMON}/c_print_results.cpp 34 | cd ${COMMON}; ${CCOMPILE} c_print_results.cpp 35 | 36 | ${COMMON}/timers.o: ${COMMON}/timers.f 37 | cd ${COMMON}; ${FCOMPILE} timers.f 38 | 39 | ${COMMON}/c_timers.o: ${COMMON}/c_timers.cpp 40 | cd ${COMMON}; ${CCOMPILE} c_timers.cpp 41 | 42 | ${COMMON}/wtime.o: ${COMMON}/${WTIME} 43 | cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/${WTIME} 44 | # For most machines or CRAY or IBM 45 | # cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/wtime.c 46 | # For a precise timer on an SGI Power Challenge, try: 47 | # cd ${COMMON}; ${CCOMPILE} -o wtime.o ${COMMON}/wtime_sgi64.c 48 | 49 | ${COMMON}/c_wtime.o: ${COMMON}/${WTIME} 50 | cd ${COMMON}; ${CCOMPILE} -o c_wtime.o ${COMMON}/${WTIME} 51 | 52 | 53 | # Normally setparams updates npbparams.h only if the settings (CLASS) 54 | # have changed. However, we also want to update if the compile options 55 | # may have changed (set in ../config/make.def). 56 | npbparams.hpp: ../config/make.def 57 | @ echo make.def modified. 
Rebuilding npbparams.hpp just in case 58 | rm -f npbparams.hpp 59 | ../sys/setparams ${BENCHMARK} ${CLASS} 60 | 61 | # So that "make benchmark-name" works 62 | ${BENCHMARK}: default 63 | ${BENCHMARKU}: default 64 | 65 | 66 | -------------------------------------------------------------------------------- /NPB-TBB/sys/print_header: -------------------------------------------------------------------------------- 1 | echo '' 2 | echo ' =========================================' 3 | echo ' = NAS Parallel Benchmarks =' 4 | echo ' = TBB C++ Versions =' 5 | echo ' = Developed by: Dalvan Griebler =' 6 | echo ' = Júnior Löff =' 7 | echo ' = =' 8 | echo ' = Warning: in case of problems =' 9 | echo ' = send an email to us: =' 10 | echo ' = dalvan.griebler@acad.pucrs.br =' 11 | echo ' = junior.loff@acad.pucrs.br =' 12 | echo ' =========================================' 13 | echo '' 14 | -------------------------------------------------------------------------------- /NPB-TBB/sys/print_instructions: -------------------------------------------------------------------------------- 1 | echo '' 2 | echo ' To make a NAS benchmark type ' 3 | echo '' 4 | echo ' make <benchmark-name> CLASS=<class>' 5 | echo '' 6 | echo ' where <benchmark-name> is "cg", "ep", "ft", "is", or "mg"' 7 | echo ' <class> is "S", "W", "A", "B" or "C"' 8 | echo '' 9 | echo ' To make a set of benchmarks, create the file config/suite.def' 10 | echo ' according to the instructions in config/suite.def.template and type' 11 | echo '' 12 | echo ' make suite' 13 | echo '' 14 | echo ' ***************************************************************' 15 | echo ' * Remember to edit the file config/make.def for site specific *' 16 | echo ' * information as described in the README file *' 17 | echo ' ***************************************************************' 18 | 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Warning: this project 
is continued at [NPB-CPP](https://github.com/GMAP/NPB-CPP) 2 | 3 | ## :sound: We are happy to announce that both the NPB kernels and pseudo-applications are available at our new repository [NPB-CPP](https://github.com/GMAP/NPB-CPP). :smile: 4 | 5 | This was our first work on the NAS Parallel Benchmarks (NPB) suite, and many other works are now continuing this project in many different ways. 6 | 7 | :sound:*Note: this repository will no longer be updated; please follow us at [NPB-CPP](https://github.com/GMAP/NPB-CPP)* 8 | 9 | 10 | ## How to cite this work 11 | 12 | [[DOI]](https://doi.org/10.1109/PDP2018.2018.00120) D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018. 13 | 14 | ## The NPB-CPP Benchmark 15 | 16 | These codes were converted to **C++** from the original [NPB3.3.1](https://doi.org/10.1109/PDP2018.2018.00120). We achieved similar performance in **C++** compared to the **Fortran** version. 
17 | 18 | ================================================================== 19 | NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB 20 | 21 | Code contributors: 22 | Dalvan Griebler 23 | Júnior Löff 24 | 25 | Warning: in case of problems send an email to us: 26 | dalvan.griebler@acad.pucrs.br 27 | junior.loff@acad.pucrs.br 28 | ================================================================== 29 | 30 | 31 | This folder contains: 32 | 33 | - NPB-FF - Directory with the parallel version implemented in FastFlow 34 | - NPB-OMP - Directory with the parallel version translated from the original NPB version 35 | - NPB-SER - Directory with the serial version of the NPB ported to C++ 36 | - NPB-TBB - Directory with the parallel version implemented in Threading Building Blocks 37 | 38 | Each directory is independent and contains its own implemented version of the kernels: 39 | 40 | IS - Integer Sort, random memory access 41 | EP - Embarrassingly Parallel 42 | CG - Conjugate Gradient, irregular memory access and communication 43 | MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive 44 | FT - discrete 3D fast Fourier Transform, all-to-all communication 45 | 46 | ## Software Requirements 47 | 48 | *Warning: our tests were made with GCC-5* 49 | 50 | **TBB** 51 | 52 | *Installation* 53 | 54 | apt-get install libtbb-dev 55 | 56 | **FastFlow** 57 | 58 | *Installation* 59 | 60 | svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow 61 | 62 | 63 | ## How to Compile 64 | 65 | Enter the directory of the desired version and execute: 66 | 67 | make _BENCHMARK CLASS=_VERSION 68 | 69 | 70 | _BENCHMARKs are: 71 | 72 | EP, CG, MG, IS and FT 73 | 74 | _VERSIONs are: 75 | 76 | Class S: small for quick test purposes 77 | Class W: workstation size (a 90's workstation; now likely too small) 78 | Classes A, B, C: standard test problems; ~4X size increase going from one class to the next 79 | Classes D, E, F: large test problems; 
~16X size increase from each of the previous Classes 80 | 81 | 82 | Command: 83 | 84 | make ep CLASS=B 85 | --------------------------------------------------------------------------------