├── LICENSE
├── NPB-FF
    ├── CG
    │   ├── Makefile
    │   └── cg.cpp
    ├── EP
    │   ├── Makefile
    │   └── ep.cpp
    ├── FT
    │   ├── Makefile
    │   ├── ft.cpp
    │   └── global.hpp
    ├── IS
    │   ├── Makefile
    │   └── is.cpp
    ├── MG
    │   ├── Makefile
    │   ├── globals.hpp
    │   └── mg.cpp
    ├── Makefile
    ├── README.md
    ├── bin
    │   └── README.md
    ├── common
    │   ├── c_print_results.cpp
    │   ├── c_randdp.cpp
    │   ├── c_timers.cpp
    │   ├── npb-CPP.hpp
    │   ├── wtime.cpp
    │   ├── wtime.hpp
    │   └── wtime_sgi64.cpp
    ├── config
    │   ├── make.def
    │   └── suite.def
    └── sys
    │   ├── Makefile
    │   ├── README
    │   ├── make.common
    │   ├── print_header
    │   ├── print_instructions
    │   └── setparams.cpp
├── NPB-OMP
    ├── CG
    │   ├── Makefile
    │   └── cg.cpp
    ├── EP
    │   ├── Makefile
    │   └── ep.cpp
    ├── FT
    │   ├── Makefile
    │   ├── ft.cpp
    │   └── global.hpp
    ├── IS
    │   ├── Makefile
    │   └── is.cpp
    ├── MG
    │   ├── Makefile
    │   ├── globals.hpp
    │   └── mg.cpp
    ├── Makefile
    ├── README.md
    ├── bin
    │   └── README.md
    ├── common
    │   ├── c_print_results.cpp
    │   ├── c_randdp.cpp
    │   ├── c_timers.cpp
    │   ├── npb-CPP.hpp
    │   ├── wtime.cpp
    │   ├── wtime.hpp
    │   └── wtime_sgi64.cpp
    ├── config
    │   ├── make.def
    │   └── suite.def
    └── sys
    │   ├── Makefile
    │   ├── README
    │   ├── make.common
    │   ├── print_header
    │   ├── print_instructions
    │   └── setparams.cpp
├── NPB-SER
    ├── CG
    │   ├── Makefile
    │   └── cg.cpp
    ├── EP
    │   ├── Makefile
    │   └── ep.cpp
    ├── FT
    │   ├── Makefile
    │   ├── ft.cpp
    │   └── global.hpp
    ├── IS
    │   ├── Makefile
    │   └── is.cpp
    ├── MG
    │   ├── Makefile
    │   ├── globals.hpp
    │   └── mg.cpp
    ├── Makefile
    ├── README.md
    ├── bin
    │   └── README.md
    ├── common
    │   ├── c_print_results.cpp
    │   ├── c_randdp.cpp
    │   ├── c_timers.cpp
    │   ├── npb-CPP.hpp
    │   ├── wtime.cpp
    │   ├── wtime.hpp
    │   └── wtime_sgi64.cpp
    ├── config
    │   ├── make.def
    │   └── suite.def
    └── sys
    │   ├── Makefile
    │   ├── README
    │   ├── make.common
    │   ├── print_header
    │   ├── print_instructions
    │   └── setparams.cpp
├── NPB-TBB
    ├── CG
    │   ├── Makefile
    │   └── cg.cpp
    ├── EP
    │   ├── Makefile
    │   └── ep.cpp
    ├── FT
    │   ├── Makefile
    │   ├── ft.cpp
    │   └── global.hpp
    ├── IS
    │   ├── Makefile
    │   └── is.cpp
    ├── MG
    │   ├── Makefile
    │   ├── globals.hpp
    │   └── mg.cpp
    ├── Makefile
    ├── README.md
    ├── bin
    │   └── README.md
    ├── common
    │   ├── c_print_results.cpp
    │   ├── c_randdp.cpp
    │   ├── c_timers.cpp
    │   ├── npb-CPP.hpp
    │   ├── wtime.cpp
    │   ├── wtime.hpp
    │   └── wtime_sgi64.cpp
    ├── config
    │   ├── make.def
    │   └── suite.def
    └── sys
    │   ├── Makefile
    │   ├── README
    │   ├── make.common
    │   ├── print_header
    │   ├── print_instructions
    │   └── setparams.cpp
└── README.md


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Dalvan Griebler
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NPB-FF/CG/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=cg
 3 | BENCHMARKU=CG
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = cg.o ${COMMON}/c_print_results.o  \
 8 |        ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | cg.o:		cg.cpp  npbparams.hpp
16 | 	${CCOMPILE} cg.cpp
17 | 
18 | clean:
19 | 	- rm -f *.o *~ 
20 | 	- rm -f npbparams.hpp core
21 | 
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/NPB-FF/EP/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=ep
 3 | BENCHMARKU=EP
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = ep.o ${COMMON}/c_print_results.o ${COMMON}/c_${RAND}.o \
 8 |        ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | 
16 | ep.o:		ep.cpp npbparams.hpp
17 | 	${CCOMPILE} ep.cpp   
18 | 
19 | clean:
20 | 	- rm -f *.o *~ 
21 | 	- rm -f npbparams.hpp core
22 | 
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/NPB-FF/EP/ep.cpp:
--------------------------------------------------------------------------------
  1 | /*--------------------------------------------------------------------
  2 | 
  3 |     Information on NAS Parallel Benchmarks is available at:
  4 | 
  5 |     http://www.nas.nasa.gov/Software/NPB/
  6 | 
  7 |     Authors: P. O. Frederickson
  8 |            D. H. Bailey
  9 |            A. C. Woo
 10 | 
 11 |     CPP and FastFlow version:
 12 |             Dalvan Griebler <dalvangriebler@gmail.com>
 13 |             Júnior Löff <loffjh@gmail.com>
 14 | 
 15 | --------------------------------------------------------------------*/
 16 | 
 17 | 
 18 | #include <ff/parallel_for.hpp>
 19 | #include "npbparams.hpp"
 20 | #include <iostream>
 21 | #include <../common/npb-CPP.hpp>
 22 | 
 23 | /* parameters */
 24 | #define MK      16
 25 | #define MM      (M - MK)
 26 | #define NN      (1 << MM)
 27 | #define NK      (1 << MK)
 28 | #define NQ      10
 29 | #define EPSILON     1.0e-8
 30 | #define A       1220703125.0
 31 | #define S       271828183.0
 32 | #define TIMERS_ENABLED  FALSE
 33 | 
 34 | /* global variables */
 35 | /* common /storage/ */
 36 | static double x[2*NK];
 37 | 
 38 | static double q[NQ];
 39 | 
 40 | const int _cache_line_size = 64;   /* padded size of one accumulator slot, in bytes */
 41 | /* A single double accumulator padded out to _cache_line_size bytes (one slot per worker id). */
 42 | struct packing_t {
 43 |     double _qq;                                   /* the accumulated value */
 44 |     char pad[_cache_line_size - sizeof(double)];  /* fills the rest of the cache line */
 45 | };
 46 | 
 47 | packing_t qq[100][NQ];
 48 | packing_t sxx[100];
 49 | packing_t syy[100];
 50 | /*--------------------------------------------------------------------
 51 |       program EMBAR
 52 | c-------------------------------------------------------------------*/
 53 | /*
 54 | c   This is the FastFlow (parallel) version of the APP Benchmark 1,
 55 | c   the "embarrassingly parallel" benchmark.
 56 | c
 57 | c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
 58 | c   numbers.  MK is the Log_2 of the size of each batch of uniform random
 59 | c   numbers.  MK can be set for convenience on a given system, since it does
 60 | c   not affect the results.
 61 | */
 62 | int main(int argc, char **argv) {
 63 |     double Mops, t1, sx, sy, tm, an, gc;
 64 |     double dum[3] = { 1.0, 1.0, 1.0 };
 65 |     int np,i, nit, k_offset, j;
 66 |     boolean verified;
 67 |     char size[13+1] = {0};    /* character*13; zero-filled so the scan below never reads garbage */
 68 | 
 69 |     /*
 70 |     c   Because the size of the problem is too large to store in a 32-bit
 71 |     c   integer for some classes, we put it into a string (for printing).
 72 |     c   Have to strip off the decimal point put in there by the floating
 73 |     c   point print statement (internal file)
 74 |     */
 75 | 
 76 |     printf("NAS Parallel Benchmarks 4.0 FastFlow C++ version"" - EP Benchmark\n");
 77 |     printf("Developed by: Dalvan Griebler <dalvan.griebler@acad.pucrs.br> & Júnior Löff <loffjh@gmail.com>\n\n");
 78 |     snprintf(size, sizeof(size), "%12.0f", pow(2.0, M+1));
 79 |     for (j = 13; j >= 1; j--) {
 80 |         if (size[j] == '.') size[j] = ' ';
 81 |     }
 82 |     printf(" Number of random numbers generated: %13s\n", size);
 83 | 
 84 |     verified = FALSE;
 85 | 
 86 |     /*
 87 |     c   Compute the number of "batches" of random number pairs generated
 88 |     c   per processor. Adjust if the number of processors does not evenly
 89 |     c   divide the total number
 90 |     */
 91 |     np = NN;
 92 |     /*
 93 |     c   Call the random number generator functions and initialize
 94 |     c   the x-array to reduce the effects of paging on the timings.
 95 |     c   Also, call all mathematical functions that are used. Make
 96 |     c   sure these initializations cannot be eliminated as dead code.
 97 |     */
 98 |     vranlc(0, &(dum[0]), dum[1], &(dum[2]));
 99 |     dum[0] = randlc(&(dum[1]), dum[2]);
100 |     for (i = 0; i < 2*NK; i++) x[i] = -1.0e99;
101 |     Mops = log(sqrt(fabs(max(1.0, 1.0))));
102 | 
103 |     /* Timer 1 = total, timer 2 = Gaussian pairs, timer 3 = random numbers (see report at the end). */
104 | 
105 |     timer_clear(1);
106 |     timer_clear(2);
107 |     timer_clear(3);
108 |     
109 |     timer_start(1);
110 | 
111 |     vranlc(0, &t1, A, x);
112 | 
113 |     /*   Compute AN = A ^ (2 * NK) (mod 2^46). */
114 | 
115 |     t1 = A;
116 | 
117 |     for ( i = 1; i <= MK+1; i++) {
118 |         an = randlc(&t1, t1);
119 |     }
120 | 
121 |     an = t1;
122 |     gc = 0.0;
123 |     sx = 0.0;
124 |     sy = 0.0;
125 | 
126 |     for ( i = 0; i <= NQ - 1; i++) {
127 |         q[i] = 0.0;
128 |     }
129 | 
130 |     /*
131 |     c   Each instance of this loop may be performed independently. We compute
132 |     c   the k offsets separately to take into account the fact that some nodes
133 |     c   have more numbers to generate than others
134 |     */
135 |     k_offset = -1;
136 | 
137 |     /* Worker count comes from FF_NUM_THREADS and is clamped to [1, 100]: the per-worker */
138 |     /* accumulators qq/sxx/syy have 100 slots, and 0 would divide by zero in the chunk size. */
139 |     const char *nw = std::getenv("FF_NUM_THREADS");
140 |     int num_workers = nw ? atoi(nw) : 1;
141 |     if (num_workers < 1) num_workers = 1;
142 |     if (num_workers > 100) { printf(" Warning: FF_NUM_THREADS capped at 100\n"); num_workers = 100; }
143 | 
144 |     ff::ParallelFor pf(num_workers, true);
145 |     for(int i=0; i<num_workers; i++) {
146 |         sxx[i]._qq = 0.0;
147 |         syy[i]._qq = 0.0;
148 |         for(int j=0; j<=NQ-1; j++) qq[i][j]._qq = 0.0;
149 |     }
150 | 
151 |     pf.parallel_for_thid(1, np+1, 1, (int)((np+1)/num_workers)+1, [&](int k, int id) {
152 |         int kk = k_offset + k;
153 |         double t1 = S;
154 |         double t2 = an;
155 |         double t3, t4, x1, x2;
156 |         int i, ik, l;
157 |         double x[(2*NK)+1];
158 |         /*  Find starting seed t1 for this kk. */
159 | 
160 |         for (i = 1; i <= 100; i++) {
161 |             ik = kk / 2;
162 |             if (2 * ik != kk) t3 = randlc(&t1, t2);
163 |             if (ik == 0) break;
164 |             t3 = randlc(&t2, t2);
165 |             kk = ik;
166 |         }
167 | 
168 |         /*      Compute uniform pseudorandom numbers. */
169 | 
170 |         if (TIMERS_ENABLED == TRUE) timer_start(3);
171 |         vranlc(2*NK, &t1, A, x);
172 |         if (TIMERS_ENABLED == TRUE) timer_stop(3);
173 | 
174 |         /*
175 |         c       Compute Gaussian deviates by acceptance-rejection method and
176 |         c       tally counts in concentric square annuli.  This loop is not
177 |         c       vectorizable.
178 |         */
179 |         if (TIMERS_ENABLED == TRUE) timer_start(2);
180 | 
181 |         for (i = 1; i <= NK; i++) {
182 |             x1 = 2.0 * x[2*i-1] - 1.0;
183 |             x2 = 2.0 * x[2*i] - 1.0;
184 |             t1 = pow2(x1) + pow2(x2);
185 |             if (t1 <= 1.0) {
186 |                 t2 = sqrt(-2.0 * log(t1) / t1);
187 |                 t3 = (x1 * t2);       /* Xi */
188 |                 t4 = (x2 * t2);       /* Yi */
189 |                 l = max(fabs(t3), fabs(t4));
190 |                 qq[id][l]._qq += 1.0;       /* counts */
191 |                 sxx[id]._qq = sxx[id]._qq + t3;       /* sum of Xi */
192 |                 syy[id]._qq = syy[id]._qq + t4;       /* sum of Yi */
193 | 
194 |             }
195 |         }
196 |         if (TIMERS_ENABLED == TRUE) timer_stop(2);
197 |     });
198 | 
199 |     for(i=0; i<num_workers; i++) {
200 |         sx += sxx[i]._qq;
201 |         sy += syy[i]._qq;
202 |         for(int j=0; j<=NQ-1; j++) q[j] += qq[i][j]._qq;
203 |     }
204 |     for (i = 0; i <= NQ-1; i++) {
205 |         gc = gc + q[i];
206 |     }
207 | 
208 |     timer_stop(1);
209 | 
210 |     tm = timer_read(1);
211 | 
212 |     nit = 0;
213 |     if (M == 24) {
214 |         if((fabs((sx- (-3.247834652034740e3))/-3.247834652034740e3) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/-6.958407078382297e3) <= EPSILON)) {
215 |             verified = TRUE;
216 |         }
217 |     } else if (M == 25) {
218 |         if ((fabs((sx- (-2.863319731645753e3))/-2.863319731645753e3) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/-6.320053679109499e3) <= EPSILON)) {
219 |             verified = TRUE;
220 |         }
221 |     } else if (M == 28) {
222 |         //if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
223 |         if ((fabs((sx- (-4.295875165629892e3))/-4.295875165629892e3) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/-1.580732573678431e4) <= EPSILON)) {
224 |             verified = TRUE;
225 |         }
226 |     } else if (M == 30) {
227 |         if ((fabs((sx- (4.033815542441498e4))/4.033815542441498e4) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/-2.660669192809235e4) <= EPSILON)) {
228 |             verified = TRUE;
229 |         }
230 |     } else if (M == 32) {
231 |         if ((fabs((sx- (4.764367927995374e4))/4.764367927995374e4) <= EPSILON) && (fabs((sy- (-8.084072988043731e4))/-8.084072988043731e4) <= EPSILON)) {
232 |             verified = TRUE;
233 |         }
234 |     } else if (M == 36) {
235 |         if ((fabs((sx- (1.982481200946593e5))/1.982481200946593e5) <= EPSILON) && (fabs((sy- (-1.020596636361769e5))/-1.020596636361769e5) <= EPSILON)) {
236 |             verified = TRUE;
237 |         }
238 |     } else if (M == 40) {
239 |         if ((fabs((sx- (-5.319717441530e5))/-5.319717441530e5) <= EPSILON) && (fabs((sy- (-3.688834557731e5))/-3.688834557731e5) <= EPSILON)) {
240 |             verified = TRUE;
241 |         }
242 |     }
243 | 
244 |     Mops = pow(2.0, M+1)/tm/1000000.0;
245 | 
246 |     printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. Gaussian Pairs = %15.0f\n"
247 |            "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy);
248 |     for (i = 0; i  <= NQ-1; i++) {
249 |         printf("%3d %15.0f\n", i, q[i]);
250 |     }
251 | 
252 |     c_print_results((char*)"EP", CLASS, M+1, 0, 0, nit, tm, Mops, (char*)"Random numbers generated",
253 |                     verified, (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7);
254 | 
255 |     if (TIMERS_ENABLED == TRUE) {
256 |         printf("Total time:     %f\n", timer_read(1));
257 |         printf("Gaussian pairs: %f\n", timer_read(2));
258 |         printf("Random numbers: %f\n", timer_read(3));
259 |     }
260 |     return 0;
261 | }
262 | 


--------------------------------------------------------------------------------
/NPB-FF/FT/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=ft
 3 | BENCHMARKU=FT
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = ft.o ${COMMON}/c_${RAND}.o ${COMMON}/c_print_results.o \
 8 |        ${COMMON}/c_timers.o ${COMMON}/c_wtime.o #../omp-prof.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | ft.o:             ft.cpp  global.hpp npbparams.hpp
16 | 	${CCOMPILE} ft.cpp
17 | 
18 | clean:
19 | 	- rm -f *.o *~ mputil*
20 | 	- rm -f ft npbparams.hpp core
21 | 


--------------------------------------------------------------------------------
/NPB-FF/FT/global.hpp:
--------------------------------------------------------------------------------
  1 | #include "npbparams.hpp"
  2 | 
  3 | 
  4 | /*
  5 | c If processor array is 1x1 -> 0D grid decomposition
  6 | 
  7 | 
  8 | c Cache blocking params. These values are good for most
  9 | c RISC processors.  
 10 | c FFT parameters:
 11 | c  fftblock controls how many ffts are done at a time. 
 12 | c  The default is appropriate for most cache-based machines
 13 | c  On vector machines, the FFT can be vectorized with vector
 14 | c  length equal to the block size, so the block size should
 15 | c  be as large as possible. This is the size of the smallest
 16 | c  dimension of the problem: 128 for class A, 256 for class B and
 17 | c  512 for class C.
 18 | */
 19 | 
 20 | #define	FFTBLOCK_DEFAULT	16
 21 | #define	FFTBLOCKPAD_DEFAULT	18
 22 | 
 23 | #define FFTBLOCK	FFTBLOCK_DEFAULT
 24 | #define FFTBLOCKPAD	FFTBLOCKPAD_DEFAULT
 25 | 
 26 | /* COMMON block: blockinfo -- internal linkage, like every other global in this header */
 27 | static int fftblock;
 28 | static int fftblockpad;
 29 |       
 30 | /*
 31 | c we need a bunch of logic to keep track of how
 32 | c arrays are laid out. 
 33 | 
 34 | 
 35 | c Note: this serial version is derived from the parallel 0D case
 36 | c of the ft NPB.
 37 | c The computation proceeds logically as
 38 | 
 39 | c set up initial conditions
 40 | c fftx(1)
 41 | c transpose (1->2)
 42 | c ffty(2)
 43 | c transpose (2->3)
 44 | c fftz(3)
 45 | c time evolution
 46 | c fftz(3)
 47 | c transpose (3->2)
 48 | c ffty(2)
 49 | c transpose (2->1)
 50 | c fftx(1)
 51 | c compute residual(1)
 52 | 
 53 | c for the 0D, 1D, 2D strategies, the layouts look like xxx
 54 | c        
 55 | c            0D        1D        2D
 56 | c 1:        xyz       xyz       xyz
 57 | c 2:        xyz       xyz       yxz
 58 | c 3:        xyz       zyx       zxy
 59 | 
 60 | c the array dimensions are stored in dims(coord, phase)
 61 | */
 62 | 
 63 | /* COMMON block: layout */
 64 | static int dims[3][3];
 65 | static int xstart[3];
 66 | static int ystart[3];
 67 | static int zstart[3];
 68 | static int xend[3];
 69 | static int yend[3];
 70 | static int zend[3];
 71 | 
 72 | #define	T_TOTAL		0
 73 | #define	T_SETUP		1
 74 | #define	T_FFT		2
 75 | #define	T_EVOLVE	3
 76 | #define	T_CHECKSUM	4
 77 | #define	T_FFTLOW	5
 78 | #define	T_FFTCOPY	6
 79 | #define	T_MAX		7
 80 | 
 81 | #define	TIMERS_ENABLED	TRUE
 82 | 
 83 | /* other stuff */
 84 | 
 85 | #define	SEED	314159265.0
 86 | #define	A	1220703125.0
 87 | #define	PI	3.141592653589793238
 88 | #define	ALPHA	1.0e-6
 89 | 
 90 | #define	EXPMAX	(NITER_DEFAULT*(NX*NX/4+NY*NY/4+NZ*NZ/4))
 91 | 
 92 | /* COMMON block: excomm */
 93 | static double ex[EXPMAX+1];	/* ex(0:expmax) */
 94 | 
 95 | /*
 96 | c roots of unity array
 97 | c relies on x being largest dimension?
 98 | */
 99 | 
100 | /* COMMON block: ucomm */
101 | static dcomplex u[NX];
102 | 
103 | /* for checksum data */
104 | 
105 | /* COMMON block: sumcomm */
106 | static dcomplex sums[NITER_DEFAULT+1]; /* sums(0:niter_default) */
107 | 
108 | /* number of iterations*/
109 | 
110 | /* COMMON block: iter */
111 | static int niter;
112 | 
113 | 


--------------------------------------------------------------------------------
/NPB-FF/IS/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=is
 3 | BENCHMARKU=IS
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | include ../sys/make.common
 8 | 
 9 | OBJS = is.o \
10 |        ${COMMON}/c_print_results.o \
11 |        ${COMMON}/c_timers.o \
12 |        ${COMMON}/c_wtime.o
13 | 
14 | 
15 | ${PROGRAM}: config ${OBJS}
16 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
17 | 
18 | .c.o:
19 | 	${CCOMPILE} $<
20 | 
21 | is.o:             is.cpp  npbparams.hpp
22 | 	${CCOMPILE} is.cpp
23 | 
24 | 
25 | clean:
26 | 	- rm -f *.o *~ mputil*
27 | 	- rm -f npbparams.hpp core
28 | 	- if [ -d rii_files ]; then rm -r rii_files; fi
29 | 


--------------------------------------------------------------------------------
/NPB-FF/MG/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=mg
 3 | BENCHMARKU=MG
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = mg.o ${COMMON}/c_print_results.o  \
 8 |        ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | mg.o:		mg.cpp npbparams.hpp
16 | 	${CCOMPILE} mg.cpp
17 | 
18 | clean:
19 | 	- rm -f *.o *~ 
20 | 	- rm -f npbparams.hpp core
21 | 
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/NPB-FF/MG/globals.hpp:
--------------------------------------------------------------------------------
 1 | /*--------------------------------------------------------------------
 2 | c  Parameter lm (declared and set in "npbparams.hpp") is the log-base2 of 
 3 | c  the edge size max for the partition on a given node, so must be changed 
 4 | c  either to save space (if running a small case) or made bigger for larger 
 5 | c  cases, for example, 512^3. Thus lm=7 means that the largest dimension 
 6 | c  of a partition that can be solved on a node is 2^7 = 128. lm is set 
 7 | c  automatically in npbparams.hpp
 8 | c  Parameters ndim1, ndim2, ndim3 are the local problem dimensions. 
 9 | c-------------------------------------------------------------------*/
10 | 
11 | #include "npbparams.hpp"
12 | 
13 | /* parameters (LM, NDIM1, NDIM2, NDIM3 come from npbparams.hpp) */
14 | /* actual dimension including ghost cells for communications: 2 + 2^LM */
15 | #define	NM	(2+(2<<(LM-1)))
16 | /* size of rhs array: (2+2^NDIM1)*(2+2^NDIM2)*(2+2^NDIM3); each factor must be parenthesized */
17 | #define	NV	((2+(2<<(NDIM1-1)))*(2+(2<<(NDIM2-1)))*(2+(2<<(NDIM3-1))))
18 | /* size of residual array -- NOTE(review): 8*NV may exceed a 32-bit int for the largest classes; confirm */
19 | #define	NR	((8*(NV+(NM*NM)+5*NM+7*LM))/7)
20 | /* size of communication buffer */
21 | #define	NM2	(2*NM*NM)
22 | /* maximum number of levels */
23 | #define	MAXLEVEL	11
24 | 
25 | /*---------------------------------------------------------------------*/
26 | /* common /mg3/ */
27 | static int nx[MAXLEVEL+1], ny[MAXLEVEL+1], nz[MAXLEVEL+1];
28 | /* common /ClassType/ */
29 | static char class_npb;
30 | /* common /my_debug/ */
31 | static int debug_vec[8];
32 | /* common /fap/ */
33 | /*static int ir[MAXLEVEL], m1[MAXLEVEL], m2[MAXLEVEL], m3[MAXLEVEL];*/
34 | static int m1[MAXLEVEL+1], m2[MAXLEVEL+1], m3[MAXLEVEL+1];
35 | static int lt, lb;
36 | 
37 | /*c---------------------------------------------------------------------
38 | c  Set at m=1024, can handle cases up to 1024^3 case
39 | c---------------------------------------------------------------------*/
40 | #define	M	1037
41 | 
42 | /* common /buffer/ */
43 | /*static double buff[4][NM2];*/
44 | 


--------------------------------------------------------------------------------
/NPB-FF/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | CLASS=S
 3 | SFILE=config/suite.def
 4 | 
 5 | default: header
 6 | 	@ $(SHELL) sys/print_instructions
 7 | 
 8 | BT: bt
 9 | bt: header
10 | 	cd BT; $(MAKE) CLASS=$(CLASS)
11 | 		       
12 | SP: sp		       
13 | sp: header	       
14 | 	cd SP; $(MAKE) CLASS=$(CLASS)
15 | 		       
16 | LU: lu		       
17 | lu: header	       
18 | 	cd LU; $(MAKE) CLASS=$(CLASS)
19 | 		       
20 | MG: mg		       
21 | mg: header	       
22 | 	cd MG; $(MAKE) CLASS=$(CLASS)
23 | 		       
24 | FT: ft		       
25 | ft: header	       
26 | 	cd FT; $(MAKE) CLASS=$(CLASS)
27 | 		       
28 | IS: is		       
29 | is: header	       
30 | 	cd IS; $(MAKE) CLASS=$(CLASS)
31 | 		       
32 | CG: cg		       
33 | cg: header	       
34 | 	cd CG; $(MAKE) CLASS=$(CLASS)
35 | 		       
36 | EP: ep		       
37 | ep: header	       
38 | 	cd EP; $(MAKE) CLASS=$(CLASS)
39 | DC: dc
40 | dc: header	       
41 | 	cd DC; $(MAKE) CLASS=$(CLASS)
42 | 
43 | # Awk script courtesy cmg@cray.com
44 | suite:
45 | 	@ awk '{ if ($$1 !~ /^#/ &&  NF > 0)                              \
46 | 	printf "make %s CLASS=%s\n", $$1, $$2 }' $(SFILE)  \
47 | 	| $(SHELL)
48 | 
49 | 
50 | # It would be nice to make clean in each subdirectory (the targets
51 | # are defined) but on a really clean system this won't work
52 | # because those makefiles need config/make.def
53 | clean:
54 | 	- rm -f core 
55 | 	- rm -f *~ */core */*~ */*.o */npbparams.hpp */*.obj */*.exe
56 | 	- rm -f sys/setparams sys/makesuite sys/setparams.hpp
57 | 
58 | cleanall: clean
59 | 	- rm -r bin/*
60 | 
61 | veryclean: clean
62 | 	- rm config/make.def config/suite.def Part*
63 | 	- rm bin/sp.* bin/lu.* bin/mg.* bin/ft.* bin/bt.* bin/is.* bin/ep.* bin/cg.*
64 | 
65 | header:
66 | 	@ $(SHELL) sys/print_header
67 | 
68 | kit: 
69 | 	- makekit -s100k -k30 * */* */*/*
70 | 
71 | 
72 | 


--------------------------------------------------------------------------------
/NPB-FF/README.md:
--------------------------------------------------------------------------------
 1 | # Warning: this project is continued at [NPB-CPP](https://github.com/GMAP/NPB-CPP)
 2 | 
 3 | ## We are happy to announce that both the NPB kernels and pseudo-applications are available at our new repository [NPB-CPP](https://github.com/GMAP/NPB-CPP).
 4 | 
 5 | This was our first work on NAS Parallel Benchmark (NPB) suite and many other works are now continuing this project in many different ways.
 6 | 
 7 | *Note: this repository will no longer be updated, therefore, follow us at [NPB-CPP](https://github.com/GMAP/NPB-CPP)*
 8 | 
 9 | 
10 | ## How to cite this work
11 | 	
12 | [[DOI]](https://doi.org/10.1109/PDP2018.2018.00120) D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018.
13 | 
14 | ## The NPB-CPP Benchmark
15 | 
16 | These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version.
17 | 
18 | 	==================================================================
19 | 		NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB
20 | 	 												
21 | 			Code contributors: 
22 | 					Dalvan Griebler    		
23 | 					Júnior Löff
24 | 													
25 | 		Warning: in case of problems send an email to us:					
26 | 			dalvan.griebler@acad.pucrs.br			
27 | 			junior.loff@acad.pucrs.br				
28 | 	==================================================================
29 | 
30 | 
31 | This folder contains:
32 | 
33 | 	- NPB-FF - Directory with the parallel version implemented in FastFlow
34 | 	- NPB-OMP - Directory with the parallel version translated from the original NPB version
35 | 	- NPB-SER - Directory with the serial version of the NPB ported to C++
36 | 	- NPB-TBB - Directory with the parallel version implemented in Threading Building Blocks
37 | 
38 | Each directory is independent and contains its own implemented version of the kernels:
39 | 
40 | 	IS - Integer Sort, random memory access
41 | 	EP - Embarrassingly Parallel
42 | 	CG - Conjugate Gradient, irregular memory access and communication
43 | 	MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive
44 | 	FT - discrete 3D fast Fourier Transform, all-to-all communication
45 | 
46 | ## Software Requirements
47 | 
48 | *Warning: our tests were made with GCC-5*
49 | 
50 | **TBB**
51 | 
52 | *Installation*
53 | 
54 | 	apt-get install libtbb-dev
55 | 
56 | **FastFlow** 
57 | 
58 | *Installation*
59 | 
60 | 	svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow
61 | 
62 | 
63 | ## How to Compile 
64 | 
65 | Enter the directory of the desired version and execute:
66 | 
67 | 	make _BENCHMARK CLASS=_VERSION
68 | 
69 | 
70 | _BENCHMARKs are: 
71 | 		
72 | 	EP, CG, MG, IS and FT 
73 | 																										
74 | _VERSIONs are: 
75 | 	
76 | 	Class S: small for quick test purposes
77 | 	Class W: workstation size (a 90's workstation; now likely too small)	
78 | 	Classes A, B, C: standard test problems; ~4X size increase going from one class to the next	
79 | 	Classes D, E, F: large test problems; ~16X size increase from each of the previous Classes  
80 | 
81 | 
82 | Command:
83 | 
84 | 	make ep CLASS=B
85 | 


--------------------------------------------------------------------------------
/NPB-FF/bin/README.md:
--------------------------------------------------------------------------------
 1 | # How to Cite our Work
 2 | 	
 3 | D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018.
 4 | 
 5 | # The NPB-CPP Benchmark
 6 | 
 7 | These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version.
 8 | 
 9 | 	==================================================================
10 | 		NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB
11 | 	 												
12 | 			Code contributors: 
13 | 					Dalvan Griebler    		
14 | 					Júnior Löff
15 | 													
16 | 		Warning: in case of problems send an email to us:					
17 | 			dalvan.griebler@acad.pucrs.br			
18 | 			junior.loff@acad.pucrs.br				
19 | 	==================================================================
20 | 
21 | 
22 | This folder contains:
23 | 
24 | 	- NPB-FF - Directory with the parallel version implemented in FastFlow
25 | 	- NPB-OMP - Directory with the parallel version translated from the original NPB version
26 | 	- NPB-SER - Directory with the serial version of the NPB ported to C++
27 | 	- NPB-TBB - Directory with the parallel version implemented in Threading Building Blocks
28 | 
29 | Each directory is independent and contains its own implemented version of the kernels:
30 | 
31 | 	IS - Integer Sort, random memory access
32 | 	EP - Embarrassingly Parallel
33 | 	CG - Conjugate Gradient, irregular memory access and communication
34 | 	MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive
35 | 	FT - discrete 3D fast Fourier Transform, all-to-all communication
36 | 
37 | # Software Requirements
38 | 
39 | *Warning: our tests were made with GCC-5*
40 | 
41 | **TBB**
42 | 
43 | *Installation*
44 | 
45 | 	apt-get install libtbb-dev
46 | 
47 | **FastFlow** 
48 | 
49 | *Installation*
50 | 
51 | 	svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow
52 | 
53 | 
54 | # How to Compile 
55 | 
56 | Enter the directory of the desired version and execute:
57 | 
58 | 	make _BENCHMARK CLASS=_VERSION
59 | 
60 | 
61 | _BENCHMARKs are: 
62 | 		
63 | 	EP, CG, MG, IS and FT 
64 | 																										
65 | _VERSIONs are: 
66 | 	
67 | 	Class S: small for quick test purposes
68 | 	Class W: workstation size (a 90's workstation; now likely too small)	
69 | 	Classes A, B, C: standard test problems; ~4X size increase going from one class to the next	
70 | 	Classes D, E, F: large test problems; ~16X size increase from each of the previous Classes  
71 | 
72 | 
73 | Command:
74 | 
75 | 	make ep CLASS=B


--------------------------------------------------------------------------------
/NPB-FF/common/c_print_results.cpp:
--------------------------------------------------------------------------------
 1 | /*****************************************************************/
 2 | /******     C  _  P  R  I  N  T  _  R  E  S  U  L  T  S     ******/
 3 | /*****************************************************************/
 4 | #include <cstdlib>
 5 | #include <cstdio>
 6 | 
 7 | void c_print_results( char   *name, char   class_npb, int    n1, int n2, int n3, int niter, double t,
 8 |   double mops, char   *optype, int    passed_verification, char   *npbversion, char   *compiletime, char   *cc,
 9 |   char   *clink, char   *c_lib, char   *c_inc, char   *cflags, char   *clinkflags, char   *rand)
10 | {
11 | 
12 |     printf( "\n\n %s Benchmark Completed\n", name ); 
13 | 
14 |     printf( " class_npb           =                        %c\n", class_npb );
15 | 
16 |     if( n2 == 0 && n3 == 0 )
17 |         printf( " Size            =             %12d\n", n1 );   /* as in IS */
18 |     else
19 |         printf( " Size            =              %3dx%3dx%3d\n", n1,n2,n3 );
20 | 
21 |     printf( " Iterations      =             %12d\n", niter );
22 |  
23 |     printf( " Time in seconds =             %12.2f\n", t );
24 | 
25 |     printf( " Mop/s total     =             %12.2f\n", mops );
26 | 
27 |     printf( " Operation type  = %24s\n", optype);
28 | 
29 |     if( passed_verification )
30 |         printf( " Verification    =               SUCCESSFUL\n" );
31 |     else
32 |         printf( " Verification    =             UNSUCCESSFUL\n" );
33 | 
34 |     printf( " Version         =             %12s\n", npbversion );
35 | 
36 |     printf( " Compile date    =             %12s\n", compiletime );
37 | 
38 |     printf( "\n Compile options:\n" );
39 | 
40 |     printf( "    CC           = %s\n", cc );
41 | 
42 |     printf( "    CLINK        = %s\n", clink );
43 | 
44 |     printf( "    C_LIB        = %s\n", c_lib );
45 | 
46 |     printf( "    C_INC        = %s\n", c_inc );
47 | 
48 |     printf( "    CFLAGS       = %s\n", cflags );
49 | 
50 |     printf( "    CLINKFLAGS   = %s\n", clinkflags );
51 | 
52 |     printf( "    RAND         = %s\n", rand );
53 | #ifdef SMP
54 |     char *evalue = getenv("MP_SET_NUMTHREADS");
55 |     printf( "   MULTICPUS = %s\n", evalue );
56 | #endif
57 | 
58 | /*    printf( "\n\n" );
59 |     printf( " Please send the results of this run to:\n\n" );
60 |     printf( " NPB Development Team\n" );
61 |     printf( " Internet: npb@nas.nasa.gov\n \n" );
62 |     printf( " If email is not available, send this to:\n\n" );
63 |     printf( " MS T27A-1\n" );
64 |     printf( " NASA Ames Research Center\n" );
65 |     printf( " Moffett Field, CA  94035-1000\n\n" );
66 |     printf( " Fax: 415-604-3957\n\n" );*/
67 | }
68 |  
69 | 


--------------------------------------------------------------------------------
/NPB-FF/common/c_randdp.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | */
  3 | #if defined(USE_POW)
  4 | #define r23 pow(0.5, 23.0)
  5 | #define r46 (r23*r23)
  6 | #define t23 pow(2.0, 23.0)
  7 | #define t46 (t23*t23)
  8 | #else
  9 | #define r23 (0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5)
 10 | #define r46 (r23*r23)
 11 | #define t23 (2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0)
 12 | #define t46 (t23*t23)
 13 | #endif
 14 | 
 15 | /*c---------------------------------------------------------------------
 16 | c---------------------------------------------------------------------*/
 17 | 
 18 | double randlc (double *x, double a) {
 19 | 
 20 | /*c---------------------------------------------------------------------
 21 | c---------------------------------------------------------------------*/
 22 | 
 23 | /*c---------------------------------------------------------------------
 24 | c
 25 | c   This routine returns a uniform pseudorandom double precision number in the
 26 | c   range (0, 1) by using the linear congruential generator
 27 | c
 28 | c   x_{k+1} = a x_k  (mod 2^46)
 29 | c
 30 | c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
 31 | c   before repeating.  The argument A is the same as 'a' in the above formula,
 32 | c   and X is the same as x_0.  A and X must be odd double precision integers
 33 | c   in the range (1, 2^46).  The returned value RANDLC is normalized to be
 34 | c   between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
 35 | c   the new seed x_1, so that subsequent calls to RANDLC using the same
 36 | c   arguments will generate a continuous sequence.
 37 | c
 38 | c   This routine should produce the same results on any computer with at least
 39 | c   48 mantissa bits in double precision floating point data.  On 64 bit
 40 | c   systems, double precision should be disabled.
 41 | c
 42 | c   David H. Bailey     October 26, 1990
 43 | c
 44 | c---------------------------------------------------------------------*/
 45 | 
 46 |     double t1,t2,t3,t4,a1,a2,x1,x2,z;
 47 | 
 48 | /*c---------------------------------------------------------------------
 49 | c   Break A into two parts such that A = 2^23 * A1 + A2.
 50 | c---------------------------------------------------------------------*/
 51 |     t1 = r23 * a;
 52 |     a1 = (int)t1;
 53 |     a2 = a - t23 * a1;
 54 | 
 55 | /*c---------------------------------------------------------------------
 56 | c   Break X into two parts such that X = 2^23 * X1 + X2, compute
 57 | c   Z = A1 * X2 + A2 * X1  (mod 2^23), and then
 58 | c   X = 2^23 * Z + A2 * X2  (mod 2^46).
 59 | c---------------------------------------------------------------------*/
 60 |     t1 = r23 * (*x);
 61 |     x1 = (int)t1;
 62 |     x2 = (*x) - t23 * x1;
 63 |     t1 = a1 * x2 + a2 * x1;
 64 |     t2 = (int)(r23 * t1);
 65 |     z = t1 - t23 * t2;
 66 |     t3 = t23 * z + a2 * x2;
 67 |     t4 = (int)(r46 * t3);
 68 |     (*x) = t3 - t46 * t4;
 69 | 
 70 |     return (r46 * (*x));
 71 | }
 72 | 
 73 | /*c---------------------------------------------------------------------
 74 | c---------------------------------------------------------------------*/
 75 | 
 76 | void vranlc (int n, double *x_seed, double a, double y[]) {
 77 | 
 78 | /*c---------------------------------------------------------------------
 79 | c---------------------------------------------------------------------*/
 80 | 
 81 | /*c---------------------------------------------------------------------
 82 | c
 83 | c   This routine generates N uniform pseudorandom double precision numbers in
 84 | c   the range (0, 1) by using the linear congruential generator
 85 | c
 86 | c   x_{k+1} = a x_k  (mod 2^46)
 87 | c
 88 | c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
 89 | c   before repeating.  The argument A is the same as 'a' in the above formula,
 90 | c   and X is the same as x_0.  A and X must be odd double precision integers
 91 | c   in the range (1, 2^46).  The N results are placed in Y and are normalized
 92 | c   to be between 0 and 1.  X is updated to contain the new seed, so that
 93 | c   subsequent calls to VRANLC using the same arguments will generate a
 94 | c   continuous sequence.  If N is zero, only initialization is performed, and
 95 | c   the variables X, A and Y are ignored.
 96 | c
 97 | c   This routine is the standard version designed for scalar or RISC systems.
 98 | c   However, it should produce the same results on any single processor
 99 | c   computer with at least 48 mantissa bits in double precision floating point
100 | c   data.  On 64 bit systems, double precision should be disabled.
101 | c
102 | c---------------------------------------------------------------------*/
103 | 
104 |     int i;
105 |     double x,t1,t2,t3,t4,a1,a2,x1,x2,z;
106 | 
107 | /*c---------------------------------------------------------------------
108 | c   Break A into two parts such that A = 2^23 * A1 + A2.
109 | c---------------------------------------------------------------------*/
110 |     t1 = r23 * a;
111 |     a1 = (int)t1;
112 |     a2 = a - t23 * a1;
113 |     x = *x_seed;
114 | 
115 | /*c---------------------------------------------------------------------
116 | c   Generate N results.   This loop is not vectorizable.
117 | c---------------------------------------------------------------------*/
118 |     for (i = 1; i <= n; i++) {
119 | 
120 | /*c---------------------------------------------------------------------
121 | c   Break X into two parts such that X = 2^23 * X1 + X2, compute
122 | c   Z = A1 * X2 + A2 * X1  (mod 2^23), and then
123 | c   X = 2^23 * Z + A2 * X2  (mod 2^46).
124 | c---------------------------------------------------------------------*/
125 |         t1 = r23 * x;
126 |         x1 = (int)t1;
127 |         x2 = x - t23 * x1;
128 |         t1 = a1 * x2 + a2 * x1;
129 |         t2 = (int)(r23 * t1);
130 |         z = t1 - t23 * t2;
131 |         t3 = t23 * z + a2 * x2;
132 |         t4 = (int)(r46 * t3);
133 |         x = t3 - t46 * t4;
134 |         y[i] = r46 * x;
135 |     }
136 |     *x_seed = x;
137 | }
138 | 


--------------------------------------------------------------------------------
/NPB-FF/common/c_timers.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | #include "wtime.hpp"
 5 | #include <cstdlib>
 6 | 
 7 | /*  Prototype  */
 8 | void wtime( double * );
 9 | 
10 | 
11 | 
12 | /*****************************************************************/
13 | /******         E  L  A  P  S  E  D  _  T  I  M  E          ******/
14 | /*****************************************************************/
15 | double elapsed_time( void )
16 | {
17 |     double t;
18 | 
19 |     wtime( &t );
20 |     return( t );
21 | }
22 | 
23 | 
24 | double start[64], elapsed[64];
25 | 
26 | /*****************************************************************/
27 | /******            T  I  M  E  R  _  C  L  E  A  R          ******/
28 | /*****************************************************************/
29 | void timer_clear( int n )
30 | {
31 |     elapsed[n] = 0.0;
32 | }
33 | 
34 | 
35 | /*****************************************************************/
36 | /******            T  I  M  E  R  _  S  T  A  R  T          ******/
37 | /*****************************************************************/
38 | void timer_start( int n )
39 | {
40 |     start[n] = elapsed_time();
41 | }
42 | 
43 | 
44 | /*****************************************************************/
45 | /******            T  I  M  E  R  _  S  T  O  P             ******/
46 | /*****************************************************************/
47 | void timer_stop( int n )
48 | {
49 |     double t, now;
50 | 
51 |     now = elapsed_time();
52 |     t = now - start[n];
53 |     elapsed[n] += t;
54 | 
55 | }
56 | 
57 | 
58 | /*****************************************************************/
59 | /******            T  I  M  E  R  _  R  E  A  D             ******/
60 | /*****************************************************************/
61 | double timer_read( int n )
62 | {
63 |     return( elapsed[n] );
64 | }
65 | 
66 | 


--------------------------------------------------------------------------------
/NPB-FF/common/npb-CPP.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <cstdio>
 3 | #include <cstdlib>
 4 | #include <cmath>
 5 | 
 6 | typedef int boolean;
 7 | typedef struct { double real; double imag; } dcomplex;
 8 | 
 9 | #define TRUE	1
10 | #define FALSE	0
11 | 
12 | #define max(a,b) (((a) > (b)) ? (a) : (b))
13 | #define min(a,b) (((a) < (b)) ? (a) : (b))
14 | #define	pow2(a) ((a)*(a))
15 | 
16 | #define get_real(c) c.real
17 | #define get_imag(c) c.imag
18 | #define cadd(c,a,b) (c.real = a.real + b.real, c.imag = a.imag + b.imag)
19 | #define csub(c,a,b) (c.real = a.real - b.real, c.imag = a.imag - b.imag)
20 | #define cmul(c,a,b) (c.real = a.real * b.real - a.imag * b.imag, \
21 |                      c.imag = a.real * b.imag + a.imag * b.real)
22 | #define crmul(c,a,b) (c.real = a.real * b, c.imag = a.imag * b)
23 | 
24 | extern double randlc(double *, double);
25 | extern void vranlc(int, double *, double, double *);
26 | extern void timer_clear(int);
27 | extern void timer_start(int);
28 | extern void timer_stop(int);
29 | extern double timer_read(int);
30 | 
31 | extern void c_print_results(char *name, char class_npb, int n1, int n2,
32 | 			    int n3, int niter, double t,
33 | 			    double mops, char *optype, int passed_verification,
34 | 			    char *npbversion, char *compiletime, char *cc,
35 | 			    char *clink, char *c_lib, char *c_inc,
36 | 			    char *cflags, char *clinkflags, char *rand);
37 | 


--------------------------------------------------------------------------------
/NPB-FF/common/wtime.cpp:
--------------------------------------------------------------------------------
 1 | #include "wtime.hpp"
 2 | #include <sys/time.h>
 3 | 
 4 | void wtime(double *t)
 5 | {
 6 |   static int sec = -1;
 7 |   struct timeval tv;
 8 |   gettimeofday(&tv, 0);
 9 |   if (sec < 0) sec = tv.tv_sec;
10 |   *t = (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec;
11 | }
12 | 
13 |     
14 | 


--------------------------------------------------------------------------------
/NPB-FF/common/wtime.hpp:
--------------------------------------------------------------------------------
 1 | /* C/Fortran interface is different on different machines. 
 2 |  * You may need to tweak this.
 3 |  */
 4 | 
 5 | 
 6 | #if defined(IBM)
 7 | #define wtime wtime
 8 | #elif defined(CRAY)
 9 | #define wtime WTIME
10 | #else
11 | #define wtime wtime_
12 | #endif
13 | 


--------------------------------------------------------------------------------
/NPB-FF/common/wtime_sgi64.cpp:
--------------------------------------------------------------------------------
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syssgi.h>
#include <sys/immu.h>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
 8 | 
 9 | /* The following works on SGI Power Challenge systems */
10 | 
11 | typedef unsigned long iotimer_t;
12 | 
13 | unsigned int cycleval;
14 | volatile iotimer_t *iotimer_addr, base_counter;
15 | double resolution;
16 | 
17 | /* address_t is an integer type big enough to hold an address */
18 | typedef unsigned long address_t;
19 | 
20 | 
21 | 
22 | void timer_init() 
23 | {
24 |   
25 |   int fd;
26 |   char *virt_addr;
27 |   address_t phys_addr, page_offset, pagemask, pagebase_addr;
28 |   
29 |   pagemask = getpagesize() - 1;
30 |   errno = 0;
31 |   phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval);
32 |   if (errno != 0) {
33 |     perror("SGI_QUERY_CYCLECNTR");
34 |     exit(1);
35 |   }
36 |   /* rel_addr = page offset of physical address */
37 |   page_offset = phys_addr & pagemask;
38 |   pagebase_addr = phys_addr - page_offset;
39 |   fd = open("/dev/mmem", O_RDONLY);
40 | 
41 |   virt_addr = mmap(0, pagemask, PROT_READ, MAP_PRIVATE, fd, pagebase_addr);
42 |   virt_addr = virt_addr + page_offset;
43 |   iotimer_addr = (iotimer_t *)virt_addr;
44 |   /* cycleval in picoseconds to this gives resolution in seconds */
45 |   resolution = 1.0e-12*cycleval; 
46 |   base_counter = *iotimer_addr;
47 | }
48 | 
49 | void wtime_(double *time) 
50 | {
51 |   static int initialized = 0;
52 |   volatile iotimer_t counter_value;
53 |   if (!initialized) { 
54 |     timer_init();
55 |     initialized = 1;
56 |   }
57 |   counter_value = *iotimer_addr - base_counter;
58 |   *time = (double)counter_value * resolution;
59 | }
60 | 
61 | 
62 | void wtime(double *time) 
63 | {
64 |   static int initialized = 0;
65 |   volatile iotimer_t counter_value;
66 |   if (!initialized) { 
67 |     timer_init();
68 |     initialized = 1;
69 |   }
70 |   counter_value = *iotimer_addr - base_counter;
71 |   *time = (double)counter_value * resolution;
72 | }
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/NPB-FF/config/make.def:
--------------------------------------------------------------------------------
  1 | #---------------------------------------------------------------------------
  2 | #
  3 | #                SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. 
  4 | #
  5 | #---------------------------------------------------------------------------
  6 | 
  7 | #---------------------------------------------------------------------------
  8 | # Items in this file will need to be changed for each platform.
  9 | # (Note these definitions are inconsistent with NPB2.1.)
 10 | #---------------------------------------------------------------------------
 11 | 
 12 | #---------------------------------------------------------------------------
 13 | # Parallel C:
 14 | #
 15 | # CC         - C compiler 
 16 | # CFLAGS     - C compilation arguments
 17 | # C_INC      - any -I arguments required for compiling C 
 18 | # CLINK      - C linker
 19 | # CLINKFLAGS - C linker flags
 20 | # C_LIB      - any -L and -l arguments required for linking C 
 21 | #
 22 | # compilations are done with $(CC) $(C_INC) $(CFLAGS) or
 23 | #                            $(CC) $(CFLAGS)
 24 | # linking is done with       $(CLINK) $(C_LIB) $(CLINKFLAGS)
 25 | #---------------------------------------------------------------------------
 26 | 
 27 | #---------------------------------------------------------------------------
 28 | # This is the C compiler used for OpenMP programs
 29 | #---------------------------------------------------------------------------
 30 | CC = g++ -std=c++14
 31 | #gcc #cc
 32 | # This links C programs; usually the same as ${CC}
 33 | CLINK	= $(CC)
 34 | 
 35 | #---------------------------------------------------------------------------
 36 | # These macros are passed to the linker 
 37 | #---------------------------------------------------------------------------
 38 | C_LIB  = -lm
 39 | 
 40 | #---------------------------------------------------------------------------
 41 | # These macros are passed to the compiler 
 42 | #---------------------------------------------------------------------------
 43 | C_INC = -I../common 
 44 | 
 45 | #---------------------------------------------------------------------------
 46 | # Global *compile time* flags for C programs
 47 | #---------------------------------------------------------------------------
 48 | CFLAGS	= -O3 -I $(HOME)/fastflow -DBLOCKING_MODE -pthread
 49 | # CFLAGS = -g
 50 | 
 51 | #---------------------------------------------------------------------------
 52 | # Global *link time* flags. Flags for increasing maximum executable 
 53 | # size usually go here. 
 54 | #---------------------------------------------------------------------------
 55 | CLINKFLAGS = -O3 -I $(HOME)/fastflow -DBLOCKING_MODE -pthread 
 56 | 
 57 | 
 58 | #---------------------------------------------------------------------------
 59 | # Utilities C:
 60 | #
 61 | # This is the C compiler used to compile C utilities.  Flags required by 
 62 | # this compiler go here also; typically there are few flags required; hence 
 63 | # there are no separate macros provided for such flags.
 64 | #---------------------------------------------------------------------------
 65 | UCC	= cc
 66 | 
 67 | 
 68 | #---------------------------------------------------------------------------
 69 | # Destination of executables, relative to subdirs of the main directory. . 
 70 | #---------------------------------------------------------------------------
 71 | BINDIR	= ../bin
 72 | 
 73 | 
 74 | #---------------------------------------------------------------------------
 75 | # The variable RAND controls which random number generator 
 76 | # is used. It is described in detail in Doc/README.install. 
 77 | # Use "randi8" unless there is a reason to use another one. 
 78 | # Other allowed values are "randi8_safe", "randdp" and "randdpvec"
 79 | #---------------------------------------------------------------------------
 80 | # RAND   = randi8
 81 | # The following is highly reliable but may be slow:
 82 | RAND   = randdp
 83 | 
 84 | 
 85 | #---------------------------------------------------------------------------
 86 | # The variable WTIME is the name of the wtime source code module in the
 87 | # NPB2.x/common directory.  
 88 | # For most machines,       use wtime.cpp
 89 | # For SGI power challenge: use wtime_sgi64.cpp
 90 | #---------------------------------------------------------------------------
 91 | WTIME  = wtime.cpp
 92 | 
 93 | 
 94 | #---------------------------------------------------------------------------
 95 | # Enable if either Cray or IBM: 
 96 | # (no such flag for most machines: see common/wtime.h)
 97 | # This is used by the C compiler to pass the machine name to common/wtime.h,
 98 | # where the C/Fortran binding interface format is determined
 99 | #---------------------------------------------------------------------------
100 | # MACHINE	=	-DCRAY
101 | # MACHINE	=	-DIBM
102 | 
103 | 
104 | 


--------------------------------------------------------------------------------
/NPB-FF/config/suite.def:
--------------------------------------------------------------------------------
 1 | # config/suite.def
 2 | # This file is used to build several benchmarks with a single command. 
 3 | # Typing "make suite" in the main directory will build all the benchmarks
 4 | # specified in this file. 
 5 | # Each line of this file contains a benchmark name, class, and number
 6 | # of nodes. The name is one of "cg", "is", "ep", "mg", "ft" 
 7 | # The class is one of "S", "W", "A", "B", and "C". 
 8 | # No blank lines. 
 9 | # The following example builds serial sample sizes of all benchmarks. 
10 | ft	A
11 | mg	A
12 | is	A
13 | ep	A
14 | cg	A
15 | 


--------------------------------------------------------------------------------
/NPB-FF/sys/Makefile:
--------------------------------------------------------------------------------
 1 | include ../config/make.def
 2 | 
 3 | # Note that COMPILE is also defined in make.common and should
 4 | # be the same. We can't include make.common because it has a lot
 5 | # of other garbage. 
 6 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS)
 7 | 
 8 | all: setparams 
 9 | 
10 | # setparams creates an npbparam.h file for each benchmark 
11 | # configuration. npbparams.h also contains info about how a benchmark
12 | # was compiled and linked
13 | 
14 | setparams: setparams.cpp ../config/make.def
15 | 	$(UCC) -o setparams setparams.cpp
16 | 
17 | 
18 | clean: 
19 | 	-rm -f setparams setparams.hpp npbparams.hpp
20 | 	-rm -f *~ *.o
21 | 
22 | 


--------------------------------------------------------------------------------
/NPB-FF/sys/README:
--------------------------------------------------------------------------------
 1 | This directory contains utilities and files used by the 
 2 | build process. You should not need to change anything
 3 | in this directory. 
 4 | 
 5 | Original Files
 6 | --------------
 7 | setparams.c:
 8 |         Source for the setparams program. This program is used internally
 9 |         in the build process to create the file "npbparams.h" for each 
10 |         benchmark. npbparams.h contains Fortran or C parameters to build a 
11 |         benchmark for a specific class. The setparams program is never run 
12 |         directly by a user. Its invocation syntax is 
13 | 
14 |             "setparams benchmark-name class". 
15 | 
16 |         It examines the file "npbparams.h" in the current directory. If 
17 |         the specified parameters are the same as those in the npbparams.h 
18 |         file, nothing is changed. If the file does not exist or corresponds 
19 |         to a different class/number of nodes, it is (re)built. 
20 | 	One of the more complicated things in npbparams.h is that it 
21 |         contains, in a Fortran string, the compiler flags used to build a 
22 |         benchmark, so that a benchmark can print out how it was compiled. 
23 | 
24 | make.common
25 |         A makefile segment that is included in each individual benchmark
26 |         program makefile. It sets up some standard macros (COMPILE, etc) 
27 |         and makes sure everything is configured correctly (npbparams.h)
28 | 
29 | Makefile
30 |         Builds  setparams
31 | 
32 | README
33 |         This file. 
34 | 
35 | 
36 | Created files
37 | -------------
38 | 
39 | setparams
40 | 	See descriptions above
41 | 
42 | 


--------------------------------------------------------------------------------
/NPB-FF/sys/make.common:
--------------------------------------------------------------------------------
 1 | PROGRAM  = $(BINDIR)/$(BENCHMARK).$(CLASS)
 2 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS)
 3 | CCOMPILE = $(CC)  -c $(C_INC) $(CFLAGS)
 4 | 
 5 | # Class "U" is used internally by the setparams program to mean
 6 | # "unknown". This means that if you don't specify CLASS=
 7 | # on the command line, you'll get an error. It would be nice
 8 | # to be able to avoid this, but we'd have to get information
 9 | # from the setparams back to the make program, which isn't easy. 
10 | CLASS=U
11 | 
12 | default:: ${PROGRAM}
13 | 
14 | # This makes sure the configuration utility setparams 
15 | # is up to date. 
16 | # Note that this must be run every time, which is why the
17 | # target does not exist and is not created. 
18 | # If you create a file called "config" you will break things. 
19 | config:
20 | 	@cd ../sys; ${MAKE} all
21 | 	../sys/setparams ${BENCHMARK} ${CLASS}
22 | 
23 | COMMON=../common
24 | ${COMMON}/${RAND}.o: ${COMMON}/${RAND}.f
25 | 	cd ${COMMON}; ${FCOMPILE} ${RAND}.f
26 | 
27 | ${COMMON}/c_${RAND}.o: ${COMMON}/c_${RAND}.cpp
28 | 	cd ${COMMON}; ${CCOMPILE} c_${RAND}.cpp
29 | 
30 | ${COMMON}/print_results.o: ${COMMON}/print_results.f
31 | 	cd ${COMMON}; ${FCOMPILE} print_results.f
32 | 
33 | ${COMMON}/c_print_results.o: ${COMMON}/c_print_results.cpp
34 | 	cd ${COMMON}; ${CCOMPILE} c_print_results.cpp
35 | 
36 | ${COMMON}/timers.o: ${COMMON}/timers.f
37 | 	cd ${COMMON}; ${FCOMPILE} timers.f
38 | 
39 | ${COMMON}/c_timers.o: ${COMMON}/c_timers.cpp
40 | 	cd ${COMMON}; ${CCOMPILE} c_timers.cpp
41 | 
42 | ${COMMON}/wtime.o: ${COMMON}/${WTIME}
43 | 	cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/${WTIME}
44 | # For most machines or CRAY or IBM
45 | #	cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/wtime.c
46 | # For a precise timer on an SGI Power Challenge, try:
47 | #	cd ${COMMON}; ${CCOMPILE} -o wtime.o ${COMMON}/wtime_sgi64.c
48 | 
49 | ${COMMON}/c_wtime.o: ${COMMON}/${WTIME}
50 | 	cd ${COMMON}; ${CCOMPILE} -o c_wtime.o ${COMMON}/${WTIME}
51 | 
52 | 
53 | # Normally setparams updates npbparams.h only if the settings (CLASS)
54 | # have changed. However, we also want to update if the compile options
55 | # may have changed (set in ../config/make.def). 
56 | npbparams.hpp: ../config/make.def
57 | 	@ echo make.def modified. Rebuilding npbparams.hpp just in case
58 | 	rm -f npbparams.hpp
59 | 	../sys/setparams ${BENCHMARK} ${CLASS}
60 | 
61 | # So that "make benchmark-name" works
62 | ${BENCHMARK}:  default
63 | ${BENCHMARKU}: default
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/NPB-FF/sys/print_header:
--------------------------------------------------------------------------------
 1 | echo ''
 2 | echo '   ========================================='
 3 | echo '   =      NAS Parallel Benchmarks          ='
 4 | echo '   =      FastFlow C++ Versions            ='
 5 | echo '   =      Developed by: Dalvan Griebler    ='
 6 | echo '   =                    Júnior Löff        ='
 7 | echo '   =                                       ='
 8 | echo '   =      Warning: in case of problems     ='
 9 | echo '   =      send an email to us:             ='
10 | echo '   =      dalvan.griebler@acad.pucrs.br    ='
11 | echo '   =      junior.loff@acad.pucrs.br        ='
12 | echo '   ========================================='
13 | echo ''
14 | 


--------------------------------------------------------------------------------
/NPB-FF/sys/print_instructions:
--------------------------------------------------------------------------------
 1 | echo ''
 2 | echo '   To make a NAS benchmark type '
 3 | echo ''
 4 | echo '         make <benchmark-name> CLASS=<class>'
 5 | echo ''
 6 | echo '   where <benchmark-name> is "cg", "ep", "ft", "is", or "mg"'
 7 | echo '         <class>          is "S", "W", "A", "B" or "C"'
 8 | echo ''
 9 | echo '   To make a set of benchmarks, create the file config/suite.def'
10 | echo '   according to the instructions in config/suite.def.template and type'
11 | echo ''
12 | echo '         make suite'
13 | echo ''
14 | echo ' ***************************************************************'
15 | echo ' * Remember to edit the file config/make.def for site specific *'
16 | echo ' * information as described in the README file                 *'
17 | echo ' ***************************************************************'
18 | 
19 | 


--------------------------------------------------------------------------------
/NPB-OMP/CG/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=cg
 3 | BENCHMARKU=CG
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = cg.o ${COMMON}/c_print_results.o  \
 8 |        ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | cg.o:		cg.cpp  npbparams.hpp
16 | 	${CCOMPILE} cg.cpp
17 | 
18 | clean:
19 | 	- rm -f *.o *~ 
20 | 	- rm -f npbparams.hpp core
21 | 
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/NPB-OMP/EP/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=ep
 3 | BENCHMARKU=EP
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = ep.o ${COMMON}/c_print_results.o ${COMMON}/c_${RAND}.o \
 8 |        ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | 
16 | ep.o:		ep.cpp npbparams.hpp
17 | 	${CCOMPILE} ep.cpp
18 | 
19 | clean:
20 | 	- rm -f *.o *~ 
21 | 	- rm -f npbparams.hpp core
22 | 
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/NPB-OMP/EP/ep.cpp:
--------------------------------------------------------------------------------
  1 | /*--------------------------------------------------------------------
  2 | 
  3 |     Information on NAS Parallel Benchmarks is available at:
  4 | 
  5 |     http://www.nas.nasa.gov/Software/NPB/
  6 | 
  7 |     Authors: P. O. Frederickson
  8 |            D. H. Bailey
  9 |            A. C. Woo
 10 | 
 11 |     CPP and OpenMP version:
 12 |             Dalvan Griebler <dalvangriebler@gmail.com>
 13 |             Júnior Löff <loffjh@gmail.com>
 14 | 
 15 | --------------------------------------------------------------------*/
 16 | 
 17 | 
 18 | 
 19 | #include "npbparams.hpp"
 20 | #include <iostream>
 21 | #include <../common/npb-CPP.hpp>
 22 | 
 23 | /* parameters */
 24 | #define	MK		16
 25 | #define	MM		(M - MK)
 26 | #define	NN		(1 << MM)
 27 | #define	NK		(1 << MK)
 28 | #define	NQ		10
 29 | #define EPSILON		1.0e-8
 30 | #define	A		1220703125.0
 31 | #define	S		271828183.0
 32 | #define	TIMERS_ENABLED	FALSE
 33 | 
 34 | /* global variables */
 35 | /* common /storage/ */
 36 | static double x[(2*NK)+1];
 37 | #pragma omp threadprivate(x)
 38 | static double q[NQ];
 39 | 
 40 | /*--------------------------------------------------------------------
 41 |       program EMBAR
 42 | c-------------------------------------------------------------------*/
 43 | /*
 44 | c   This is the serial version of the APP Benchmark 1,
 45 | c   the "embarassingly parallel" benchmark.
 46 | c
 47 | c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
 48 | c   numbers.  MK is the Log_2 of the size of each batch of uniform random
 49 | c   numbers.  MK can be set for convenience on a given system, since it does
 50 | c   not affect the results.
 51 | */
 52 | int main(int argc, char **argv) {
 53 |     double Mops, t1, sx, sy, tm, an, gc;
 54 |     double dum[3] = { 1.0, 1.0, 1.0 };
 55 |     int np,i, k, nit, k_offset, j;
 56 |     int nthreads = 1;
 57 |     boolean verified;
 58 |     char size[13+1];	/* character*13 */
 59 | 
 60 |     /*
 61 |     c   Because the size of the problem is too large to store in a 32-bit
 62 |     c   integer for some classes, we put it into a string (for printing).
 63 |     c   Have to strip off the decimal point put in there by the floating
 64 |     c   point print statement (internal file)
 65 |     */
 66 | 
 67 |     printf("\n\n NAS Parallel Benchmarks 4.0 OpenMP C++ version"" - EP Benchmark\n");
 68 |     printf("\n\n Developed by: Dalvan Griebler <dalvan.griebler@acad.pucrs.br>\n");
 69 |     sprintf(size, "%12.0f", pow(2.0, M+1));
 70 |     for (j = 13; j >= 1; j--) {
 71 |         if (size[j] == '.') size[j] = ' ';
 72 |     }
 73 |     printf(" Number of random numbers generated: %13s\n", size);
 74 | 
 75 |     verified = FALSE;
 76 | 
 77 |     /*
 78 |     c   Compute the number of "batches" of random number pairs generated
 79 |     c   per processor. Adjust if the number of processors does not evenly
 80 |     c   divide the total number
 81 |     */
 82 |     np = NN;
 83 | 
 84 |     /*
 85 |     c   Call the random number generator functions and initialize
 86 |     c   the x-array to reduce the effects of paging on the timings.
 87 |     c   Also, call all mathematical functions that are used. Make
 88 |     c   sure these initializations cannot be eliminated as dead code.
 89 |     */
 90 |     vranlc(0, &(dum[0]), dum[1], &(dum[2]));
 91 |     dum[0] = randlc(&(dum[1]), dum[2]);
 92 |     for (i = 0; i < 2*NK; i++) x[i] = -1.0e99;
 93 |     Mops = log(sqrt(fabs(max(1.0, 1.0))));
 94 | 
 95 | 
 96 | 
 97 |     timer_clear(1);
 98 |     timer_clear(2);
 99 |     timer_clear(3);
100 | 
101 |     timer_start(1);
102 | 
103 |     vranlc(0, &t1, A, x);
104 | 
105 |     /*   Compute AN = A ^ (2 * NK) (mod 2^46). */
106 | 
107 |     t1 = A;
108 | 
109 |     for ( i = 1; i <= MK+1; i++) {
110 |         an = randlc(&t1, t1);
111 |     }
112 | 
113 |     an = t1;
114 |     gc = 0.0;
115 |     sx = 0.0;
116 |     sy = 0.0;
117 | 
118 |     for ( i = 0; i <= NQ - 1; i++) {
119 |         q[i] = 0.0;
120 |     }
121 | 
122 |     /*
123 |     c   Each instance of this loop may be performed independently. We compute
124 |     c   the k offsets separately to take into account the fact that some nodes
125 |     c   have more numbers to generate than others
126 |     */
127 |     k_offset = -1;
128 | 
129 |     #pragma omp parallel copyin(x)
130 |     {
131 |         double t1, t2, t3, t4, x1, x2;
132 |         int kk, i, ik, l;
133 |         double qq[NQ];		/* private copy of q[0:NQ-1] */
134 | 
135 |         for (i = 0; i < NQ; i++) qq[i] = 0.0;
136 | 
137 |         #pragma omp for reduction(+:sx,sy)
138 |         for (k = 1; k <= np; k++) {
139 |             kk = k_offset + k;
140 |             t1 = S;
141 |             t2 = an;
142 | 
143 |             /*  Find starting seed t1 for this kk. */
144 | 
145 |             for (i = 1; i <= 100; i++) {
146 |                 ik = kk / 2;
147 |                 if (2 * ik != kk) t3 = randlc(&t1, t2);
148 |                 if (ik == 0) break;
149 |                 t3 = randlc(&t2, t2);
150 |                 kk = ik;
151 |             }
152 | 
153 |             /*      Compute uniform pseudorandom numbers. */
154 | 
155 |             if (TIMERS_ENABLED == TRUE) timer_start(3);
156 |             vranlc(2*NK, &t1, A, x);
157 |             if (TIMERS_ENABLED == TRUE) timer_stop(3);
158 | 
159 |             /*
160 |             c       Compute Gaussian deviates by acceptance-rejection method and
161 |             c       tally counts in concentric square annuli.  This loop is not
162 |             c       vectorizable.
163 |             */
164 |             if (TIMERS_ENABLED == TRUE) timer_start(2);
165 | 
166 |             for ( i = 1; i <= NK; i++) {
167 |                 x1 = 2.0 * x[2*i-1] - 1.0;
168 |                 x2 = 2.0 * x[2*i] - 1.0;
169 |                 t1 = pow2(x1) + pow2(x2);
170 |                 if (t1 <= 1.0) {
171 |                     t2 = sqrt(-2.0 * log(t1) / t1);
172 |                     t3 = (x1 * t2);				/* Xi */
173 |                     t4 = (x2 * t2);				/* Yi */
174 |                     l = max(fabs(t3), fabs(t4));
175 |                     qq[l] += 1.0;				/* counts */
176 |                     sx = sx + t3;				/* sum of Xi */
177 |                     sy = sy + t4;				/* sum of Yi */
178 |                 }
179 |             }
180 |             if (TIMERS_ENABLED == TRUE) timer_stop(2);
181 |         }
182 |         #pragma omp critical
183 |         {
184 |             for (i = 0; i <= NQ - 1; i++) q[i] += qq[i];
185 |         }
186 | #if defined(_OPENMP)
187 |         #pragma omp master
188 |         nthreads = omp_get_num_threads();
189 | #endif /* _OPENMP */
190 |     } /* end of parallel region */
191 |     for (i = 0; i <= NQ-1; i++) {
192 |         gc = gc + q[i];
193 |     }
194 | 
195 |     timer_stop(1);
196 |     tm = timer_read(1);
197 | 
198 | 
199 | 
200 |     nit = 0;
201 |     if (M == 24) {
202 |         if((fabs((sx- (-3.247834652034740e3))/-3.247834652034740e3) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/-6.958407078382297e3) <= EPSILON)) {
203 |             verified = TRUE;
204 |         }
205 |     } else if (M == 25) {
206 |         if ((fabs((sx- (-2.863319731645753e3))/-2.863319731645753e3) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/-6.320053679109499e3) <= EPSILON)) {
207 |             verified = TRUE;
208 |         }
209 |     } else if (M == 28) {
210 |         //if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
211 |         if ((fabs((sx- (-4.295875165629892e3))/-4.295875165629892e3) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/-1.580732573678431e4) <= EPSILON)) {
212 |             verified = TRUE;
213 |         }
214 |     } else if (M == 30) {
215 |         if ((fabs((sx- (4.033815542441498e4))/4.033815542441498e4) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/-2.660669192809235e4) <= EPSILON)) {
216 |             verified = TRUE;
217 |         }
218 |     } else if (M == 32) {
219 |         if ((fabs((sx- (4.764367927995374e4))/4.764367927995374e4) <= EPSILON) && (fabs((sy- (-8.084072988043731e4))/-8.084072988043731e4) <= EPSILON)) {
220 |             verified = TRUE;
221 |         }
222 |     } else if (M == 36) {
223 |         if ((fabs((sx- (1.982481200946593e5))/1.982481200946593e5) <= EPSILON) && (fabs((sy- (-1.020596636361769e5))/-1.020596636361769e5) <= EPSILON)) {
224 |             verified = TRUE;
225 |         }
226 |     } else if (M == 40) {
227 |         if ((fabs((sx- (-5.319717441530e5))/-5.319717441530e5) <= EPSILON) && (fabs((sy- (-3.688834557731e5))/-3.688834557731e5) <= EPSILON)) {
228 |             verified = TRUE;
229 |         }
230 |     }
231 | 
232 |     Mops = pow(2.0, M+1)/tm/1000000.0;
233 | 
234 |     printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. Gaussian Pairs = %15.0f\n"
235 |            "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy);
236 |     for (i = 0; i  <= NQ-1; i++) {
237 |         printf("%3d %15.0f\n", i, q[i]);
238 |     }
239 | 
240 |     c_print_results((char*)"EP", CLASS, M+1, 0, 0, nit, nthreads, tm, Mops, (char*)"Random numbers generated",
241 |                     verified, (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7);
242 | 
243 |     if (TIMERS_ENABLED == TRUE) {
244 |         printf("Total time:     %f", timer_read(1));
245 |         printf("Gaussian pairs: %f", timer_read(2));
246 |         printf("Random numbers: %f", timer_read(3));
247 |     }
248 |     return 0;
249 | }
250 | 


--------------------------------------------------------------------------------
/NPB-OMP/FT/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=ft
 3 | BENCHMARKU=FT
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = ft.o ${COMMON}/c_${RAND}.o ${COMMON}/c_print_results.o \
 8 |        ${COMMON}/c_timers.o ${COMMON}/c_wtime.o #../omp-prof.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | ft.o:             ft.cpp  global.hpp npbparams.hpp
16 | 	${CCOMPILE} ft.cpp
17 | 
18 | clean:
19 | 	- rm -f *.o *~ mputil*
20 | 	- rm -f ft npbparams.hpp core
21 | 


--------------------------------------------------------------------------------
/NPB-OMP/FT/global.hpp:
--------------------------------------------------------------------------------
  1 | #include "npbparams.hpp"
  2 | 
  3 | 
  4 | /*
  5 | c If processor array is 1x1 -> 0D grid decomposition
  6 | 
  7 | 
  8 | c Cache blocking params. These values are good for most
  9 | c RISC processors.  
 10 | c FFT parameters:
 11 | c  fftblock controls how many ffts are done at a time. 
 12 | c  The default is appropriate for most cache-based machines
 13 | c  On vector machines, the FFT can be vectorized with vector
 14 | c  length equal to the block size, so the block size should
 15 | c  be as large as possible. This is the size of the smallest
 16 | c  dimension of the problem: 128 for class A, 256 for class B and
 17 | c  512 for class C.
 18 | */
 19 | 
 20 | #define	FFTBLOCK_DEFAULT	16
 21 | #define	FFTBLOCKPAD_DEFAULT	18
 22 | 
 23 | #define FFTBLOCK	FFTBLOCK_DEFAULT
 24 | #define FFTBLOCKPAD	FFTBLOCKPAD_DEFAULT
 25 | 
 26 | /* COMMON block: blockinfo */
 27 | int fftblock;
 28 | int fftblockpad;
 29 |       
 30 | /*
 31 | c we need a bunch of logic to keep track of how
 32 | c arrays are laid out. 
 33 | 
 34 | 
 35 | c Note: this serial version is the derived from the parallel 0D case
 36 | c of the ft NPB.
 37 | c The computation proceeds logically as
 38 | 
 39 | c set up initial conditions
 40 | c fftx(1)
 41 | c transpose (1->2)
 42 | c ffty(2)
 43 | c transpose (2->3)
 44 | c fftz(3)
 45 | c time evolution
 46 | c fftz(3)
 47 | c transpose (3->2)
 48 | c ffty(2)
 49 | c transpose (2->1)
 50 | c fftx(1)
 51 | c compute residual(1)
 52 | 
 53 | c for the 0D, 1D, 2D strategies, the layouts look like xxx
 54 | c        
 55 | c            0D        1D        2D
 56 | c 1:        xyz       xyz       xyz
 57 | c 2:        xyz       xyz       yxz
 58 | c 3:        xyz       zyx       zxy
 59 | 
 60 | c the array dimensions are stored in dims(coord, phase)
 61 | */
 62 | 
 63 | /* COMMON block: layout */
 64 | static int dims[3][3];
 65 | static int xstart[3];
 66 | static int ystart[3];
 67 | static int zstart[3];
 68 | static int xend[3];
 69 | static int yend[3];
 70 | static int zend[3];
 71 | 
 72 | #define	T_TOTAL		0
 73 | #define	T_SETUP		1
 74 | #define	T_FFT		2
 75 | #define	T_EVOLVE	3
 76 | #define	T_CHECKSUM	4
 77 | #define	T_FFTLOW	5
 78 | #define	T_FFTCOPY	6
 79 | #define	T_MAX		7
 80 | 
 81 | #define	TIMERS_ENABLED	TRUE
 82 | 
 83 | /* other stuff */
 84 | 
 85 | #define	SEED	314159265.0
 86 | #define	A	1220703125.0
 87 | #define	PI	3.141592653589793238
 88 | #define	ALPHA	1.0e-6
 89 | 
 90 | #define	EXPMAX	(NITER_DEFAULT*(NX*NX/4+NY*NY/4+NZ*NZ/4))
 91 | 
 92 | /* COMMON block: excomm */
 93 | static double ex[EXPMAX+1];	/* ex(0:expmax) */
 94 | 
 95 | /*
 96 | c roots of unity array
 97 | c relies on x being largest dimension?
 98 | */
 99 | 
100 | /* COMMON block: ucomm */
101 | static dcomplex u[NX];
102 | 
103 | /* for checksum data */
104 | 
105 | /* COMMON block: sumcomm */
106 | static dcomplex sums[NITER_DEFAULT+1]; /* sums(0:niter_default) */
107 | 
108 | /* number of iterations*/
109 | 
110 | /* COMMON block: iter */
111 | static int niter;
112 | 
113 | 


--------------------------------------------------------------------------------
/NPB-OMP/IS/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=is
 3 | BENCHMARKU=IS
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | include ../sys/make.common
 8 | 
 9 | OBJS = is.o \
10 |        ${COMMON}/c_print_results.o \
11 |        ${COMMON}/c_timers.o \
12 |        ${COMMON}/c_wtime.o
13 | 
14 | 
15 | ${PROGRAM}: config ${OBJS}
16 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
17 | 
18 | is.o:             is.cpp  npbparams.hpp
19 | 	${CCOMPILE} is.cpp
20 | 
21 | clean:
22 | 	- rm -f *.o *~ mputil*
23 | 	- rm -f npbparams.hpp core
24 | 	- if [ -d rii_files ]; then rm -r rii_files; fi
25 | 


--------------------------------------------------------------------------------
/NPB-OMP/MG/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=mg
 3 | BENCHMARKU=MG
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = mg.o ${COMMON}/c_print_results.o  \
 8 |        ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | mg.o:		mg.cpp npbparams.hpp
16 | 	${CCOMPILE} mg.cpp
17 | 
18 | clean:
19 | 	- rm -f *.o *~ 
20 | 	- rm -f npbparams.hpp core
21 | 
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/NPB-OMP/MG/globals.hpp:
--------------------------------------------------------------------------------
 1 | /*--------------------------------------------------------------------
 2 | c  Parameter lm (declared and set in "npbparams.h") is the log-base2 of 
 3 | c  the edge size max for the partition on a given node, so must be changed 
 4 | c  either to save space (if running a small case) or made bigger for larger 
 5 | c  cases, for example, 512^3. Thus lm=7 means that the largest dimension 
 6 | c  of a partition that can be solved on a node is 2^7 = 128. lm is set 
 7 | c  automatically in npbparams.h
 8 | c  Parameters ndim1, ndim2, ndim3 are the local problem dimensions. 
 9 | c-------------------------------------------------------------------*/
10 | 
11 | #include "npbparams.hpp"
12 | 
13 | /* parameters */
14 | /* actual dimension including ghost cells for communications */
15 | #define	NM	(2+(2<<(LM-1)))
16 | /* size of rhs array */
17 | #define	NV	(2+(2<<(NDIM1-1))*(2+(2<<(NDIM2-1)))*(2+(2<<(NDIM3-1))))
18 | /* size of residual array */
19 | #define	NR	((8*(NV+(NM*NM)+5*NM+7*LM))/7)
20 | /* size of communication buffer */
21 | #define	NM2	(2*NM*NM)
22 | /* maximum number of levels */
23 | #define	MAXLEVEL	11
24 | 
25 | /*---------------------------------------------------------------------*/
26 | /* common /mg3/ */
27 | static int nx[MAXLEVEL+1], ny[MAXLEVEL+1], nz[MAXLEVEL+1];
28 | /* common /ClassType/ */
29 | static char class_npb;
30 | /* common /my_debug/ */
31 | static int debug_vec[8];
32 | /* common /fap/ */
33 | /*static int ir[MAXLEVEL], m1[MAXLEVEL], m2[MAXLEVEL], m3[MAXLEVEL];*/
34 | static int m1[MAXLEVEL+1], m2[MAXLEVEL+1], m3[MAXLEVEL+1];
35 | static int lt, lb;
36 | 
37 | /*c---------------------------------------------------------------------
38 | c  Set at m=1024, can handle cases up to 1024^3 case
39 | c---------------------------------------------------------------------*/
40 | #define	M	1037
41 | 
42 | /* common /buffer/ */
43 | /*static double buff[4][NM2];*/
44 | 


--------------------------------------------------------------------------------
/NPB-OMP/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | CLASS=S
 3 | SFILE=config/suite.def
 4 | 
 5 | default: header
 6 | 	@ $(SHELL) sys/print_instructions
 7 | 
 8 | BT: bt
 9 | bt: header
10 | 	cd BT; $(MAKE) CLASS=$(CLASS)
11 | 		       
12 | SP: sp		       
13 | sp: header	       
14 | 	cd SP; $(MAKE) CLASS=$(CLASS)
15 | 		       
16 | LU: lu		       
17 | lu: header	       
18 | 	cd LU; $(MAKE) CLASS=$(CLASS)
19 | 		       
20 | MG: mg		       
21 | mg: header	       
22 | 	cd MG; $(MAKE) CLASS=$(CLASS)
23 | 		       
24 | FT: ft		       
25 | ft: header	       
26 | 	cd FT; $(MAKE) CLASS=$(CLASS)
27 | 		       
28 | IS: is		       
29 | is: header	       
30 | 	cd IS; $(MAKE) CLASS=$(CLASS)
31 | 		       
32 | CG: cg		       
33 | cg: header	       
34 | 	cd CG; $(MAKE) CLASS=$(CLASS)
35 | 		       
36 | EP: ep		       
37 | ep: header	       
38 | 	cd EP; $(MAKE) CLASS=$(CLASS)
39 | DC: dc
40 | dc: header	       
41 | 	cd DC; $(MAKE) CLASS=$(CLASS)
42 | 
43 | # Awk script courtesy cmg@cray.com
44 | suite:
45 | 	@ awk '{ if ($$1 !~ /^#/ &&  NF > 0)                              \
46 | 	printf "make %s CLASS=%s\n", $$1, $$2 }' $(SFILE)  \
47 | 	| $(SHELL)
48 | 
49 | 
50 | # It would be nice to make clean in each subdirectory (the targets
51 | # are defined) but on a really clean system this will won't work
52 | # because those makefiles need config/make.def
53 | clean:
54 | 	- rm -f core 
55 | 	- rm -f *~ */core */*~ */*.o */npbparams.hpp */*.obj */*.exe
56 | 	- rm -f sys/setparams sys/makesuite sys/setparams.hpp
57 | 
58 | cleanall: clean
59 | 	- rm -r bin/*
60 | 
61 | veryclean: clean
62 | 	- rm config/make.def config/suite.def Part*
63 | 	- rm bin/sp.* bin/lu.* bin/mg.* bin/ft.* bin/bt.* bin/is.* bin/ep.* bin/cg.*
64 | 
65 | header:
66 | 	@ $(SHELL) sys/print_header
67 | 
68 | kit: 
69 | 	- makekit -s100k -k30 * */* */*/*
70 | 
71 | 
72 | 


--------------------------------------------------------------------------------
/NPB-OMP/README.md:
--------------------------------------------------------------------------------
 1 | # Warning: this project is continued at [NPB-CPP](https://github.com/GMAP/NPB-CPP)
 2 | 
 3 | ## We are happy to announce that both NPB Kernels and pseudo-application are available at our new repository [NPB-CPP](https://github.com/GMAP/NPB-CPP).
 4 | 
 5 | This was our first work on NAS Parallel Benchmark (NPB) suite and many other works are now continuing this project in many different ways.
 6 | 
 7 | *Note: this repository will no longer be updated, therefore, follow us at [NPB-CPP](https://github.com/GMAP/NPB-CPP)*
 8 | 
 9 | 
10 | ## How to cite this work
11 | 	
12 | [[DOI]](https://doi.org/10.1109/PDP2018.2018.00120) D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018.
13 | 
14 | ## The NPB-CPP Benchmark
15 | 
16 | These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version.
17 | 
18 | 	==================================================================
19 | 		NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB
20 | 	 												
21 | 			Code contributors: 
22 | 					Dalvan Griebler    		
23 | 					Júnior Löff
24 | 													
25 | 		Warning: in case of problems send an email to us:					
26 | 			dalvan.griebler@acad.pucrs.br			
27 | 			junior.loff@acad.pucrs.br				
28 | 	==================================================================
29 | 
30 | 
31 | This folder contains:
32 | 
33 | 	- NPB-FF - Directory with the parallel version implemented in FastFlow
34 | 	- NPB-OMP - Directory with the parallel version translated from the original NPB version
35 | 	- NPB-SER - Directory with the serial version of the NPB ported to C++
36 | 	- NPB-TBB - Directory with the parallel version implemented in Threading Building Blocks
37 | 
38 | Each directory is independent and contains its own implemented version of the kernels:
39 | 
40 | 	IS - Integer Sort, random memory access
41 | 	EP - Embarrassingly Parallel
42 | 	CG - Conjugate Gradient, irregular memory access and communication
43 | 	MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive
44 | 	FT - discrete 3D fast Fourier Transform, all-to-all communication
45 | 
46 | ## Software Requirements
47 | 
48 | *Warning: our tests were made with GCC-5*
49 | 
50 | **TBB**
51 | 
52 | *Installation*
53 | 
54 | 	apt-get install libtbb-dev
55 | 
56 | **FastFlow** 
57 | 
58 | *Installation*
59 | 
60 | 	svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow
61 | 
62 | 
63 | ## How to Compile 
64 | 
65 | Enter the directory of the desired version and run:
66 | 
67 | 	make _BENCHMARK CLASS=_VERSION
68 | 
69 | 
70 | _BENCHMARKs are: 
71 | 		
72 | 	EP, CG, MG, IS and FT 
73 | 																										
74 | _VERSIONs are: 
75 | 	
76 | 	Class S: small for quick test purposes
77 | 	Class W: workstation size (a 90's workstation; now likely too small)	
78 | 	Classes A, B, C: standard test problems; ~4X size increase going from one class to the next	
79 | 	Classes D, E, F: large test problems; ~16X size increase from each of the previous Classes  
80 | 
81 | 
82 | Command:
83 | 
84 | 	make ep CLASS=B
85 | 


--------------------------------------------------------------------------------
/NPB-OMP/bin/README.md:
--------------------------------------------------------------------------------
 1 | # How to Cite our Work
 2 | 	
 3 | D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018.
 4 | 
 5 | # The NPB-CPP Benchmark
 6 | 
 7 | These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version.
 8 | 
 9 | 	==================================================================
10 | 		NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB
11 | 	 												
12 | 			Code contributors: 
13 | 					Dalvan Griebler    		
14 | 					Júnior Löff
15 | 													
16 | 		Warning: in case of problems send an email to us:					
17 | 			dalvan.griebler@acad.pucrs.br			
18 | 			junior.loff@acad.pucrs.br				
19 | 	==================================================================
20 | 
21 | 
22 | This folder contains:
23 | 
24 | 	- NPB-FF - Directory with the parallel version implemented in FastFlow
25 | 	- NPB-OMP - Directory with the parallel version translated from the original NPB version
26 | 	- NPB-SER - Directory with the serial version of the NPB ported to C++
27 | 	- NPB-TBB - Directory with the parallel version implemented in Threading Building Blocks
28 | 
29 | Each directory is independent and contains its own implemented version of the kernels:
30 | 
31 | 	IS - Integer Sort, random memory access
32 | 	EP - Embarrassingly Parallel
33 | 	CG - Conjugate Gradient, irregular memory access and communication
34 | 	MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive
35 | 	FT - discrete 3D fast Fourier Transform, all-to-all communication
36 | 
37 | # Software Requirements
38 | 
39 | *Warning: our tests were made with GCC-5*
40 | 
41 | **TBB**
42 | 
43 | *Installation*
44 | 
45 | 	apt-get install libtbb-dev
46 | 
47 | **FastFlow** 
48 | 
49 | *Installation*
50 | 
51 | 	svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow
52 | 
53 | 
54 | # How to Compile 
55 | 
56 | Enter the directory of the desired version and run:
57 | 
58 | 	make _BENCHMARK CLASS=_VERSION
59 | 
60 | 
61 | _BENCHMARKs are: 
62 | 		
63 | 	EP, CG, MG, IS and FT 
64 | 																										
65 | _VERSIONs are: 
66 | 	
67 | 	Class S: small for quick test purposes
68 | 	Class W: workstation size (a 90's workstation; now likely too small)	
69 | 	Classes A, B, C: standard test problems; ~4X size increase going from one class to the next	
70 | 	Classes D, E, F: large test problems; ~16X size increase from each of the previous Classes  
71 | 
72 | 
73 | Command:
74 | 
75 | 	make ep CLASS=B


--------------------------------------------------------------------------------
/NPB-OMP/common/c_print_results.cpp:
--------------------------------------------------------------------------------
 1 | /*****************************************************************/
 2 | /******     C  _  P  R  I  N  T  _  R  E  S  U  L  T  S     ******/
 3 | /*****************************************************************/
 4 | #include <cstdlib>
 5 | #include <cstdio>
 6 | 
 7 | void c_print_results( char   *name, char   class_npb, int    n1, int n2, int n3, int niter, int  nthreads, double t,
 8 |   double mops, char   *optype, int    passed_verification, char   *npbversion, char   *compiletime, char   *cc,
 9 |   char   *clink, char   *c_lib, char   *c_inc, char   *cflags, char   *clinkflags, char   *rand)
10 | {
11 | 
12 |     printf( "\n\n %s Benchmark Completed\n", name ); 
13 | 
14 |     printf( " class_npb           =                        %c\n", class_npb );
15 | 
16 |     if( n2 == 0 && n3 == 0 )
17 |         printf( " Size            =             %12d\n", n1 );   /* as in IS */
18 |     else
19 |         printf( " Size            =              %3dx%3dx%3d\n", n1,n2,n3 );
20 | 
21 |     printf( " Iterations      =             %12d\n", niter );
22 |     
23 |     printf( " Threads         =             %12d\n", nthreads );
24 |  
25 |     printf( " Time in seconds =             %12.2f\n", t );
26 | 
27 |     printf( " Mop/s total     =             %12.2f\n", mops );
28 | 
29 |     printf( " Operation type  = %24s\n", optype);
30 | 
31 |     if( passed_verification )
32 |         printf( " Verification    =               SUCCESSFUL\n" );
33 |     else
34 |         printf( " Verification    =             UNSUCCESSFUL\n" );
35 | 
36 |     printf( " Version         =             %12s\n", npbversion );
37 | 
38 |     printf( " Compile date    =             %12s\n", compiletime );
39 | 
40 |     printf( "\n Compile options:\n" );
41 | 
42 |     printf( "    CC           = %s\n", cc );
43 | 
44 |     printf( "    CLINK        = %s\n", clink );
45 | 
46 |     printf( "    C_LIB        = %s\n", c_lib );
47 | 
48 |     printf( "    C_INC        = %s\n", c_inc );
49 | 
50 |     printf( "    CFLAGS       = %s\n", cflags );
51 | 
52 |     printf( "    CLINKFLAGS   = %s\n", clinkflags );
53 | 
54 |     printf( "    RAND         = %s\n", rand );
55 | #ifdef SMP
56 |     char *evalue = getenv("MP_SET_NUMTHREADS");
57 |     printf( "   MULTICPUS = %s\n", evalue );
58 | #endif
59 | 
60 | /*    printf( "\n\n" );
61 |     printf( " Please send the results of this run to:\n\n" );
62 |     printf( " NPB Development Team\n" );
63 |     printf( " Internet: npb@nas.nasa.gov\n \n" );
64 |     printf( " If email is not available, send this to:\n\n" );
65 |     printf( " MS T27A-1\n" );
66 |     printf( " NASA Ames Research Center\n" );
67 |     printf( " Moffett Field, CA  94035-1000\n\n" );
68 |     printf( " Fax: 415-604-3957\n\n" );*/
69 | }
70 |  
71 | 


--------------------------------------------------------------------------------
/NPB-OMP/common/c_randdp.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | */
  3 | #if defined(USE_POW)
  4 | #define r23 pow(0.5, 23.0)
  5 | #define r46 (r23*r23)
  6 | #define t23 pow(2.0, 23.0)
  7 | #define t46 (t23*t23)
  8 | #else
  9 | #define r23 (0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5)
 10 | #define r46 (r23*r23)
 11 | #define t23 (2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0)
 12 | #define t46 (t23*t23)
 13 | #endif
 14 | 
 15 | /*c---------------------------------------------------------------------
 16 | c---------------------------------------------------------------------*/
 17 | 
 18 | double randlc (double *x, double a) {
 19 | 
 20 | /*c---------------------------------------------------------------------
 21 | c---------------------------------------------------------------------*/
 22 | 
 23 | /*c---------------------------------------------------------------------
 24 | c
 25 | c   This routine returns a uniform pseudorandom double precision number in the
 26 | c   range (0, 1) by using the linear congruential generator
 27 | c
 28 | c   x_{k+1} = a x_k  (mod 2^46)
 29 | c
 30 | c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
 31 | c   before repeating.  The argument A is the same as 'a' in the above formula,
 32 | c   and X is the same as x_0.  A and X must be odd double precision integers
 33 | c   in the range (1, 2^46).  The returned value RANDLC is normalized to be
 34 | c   between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
 35 | c   the new seed x_1, so that subsequent calls to RANDLC using the same
 36 | c   arguments will generate a continuous sequence.
 37 | c
 38 | c   This routine should produce the same results on any computer with at least
 39 | c   48 mantissa bits in double precision floating point data.  On 64 bit
 40 | c   systems, double precision should be disabled.
 41 | c
 42 | c   David H. Bailey     October 26, 1990
 43 | c
 44 | c---------------------------------------------------------------------*/
 45 | 
 46 |     double t1,t2,t3,t4,a1,a2,x1,x2,z;
 47 | 
 48 | /*c---------------------------------------------------------------------
 49 | c   Break A into two parts such that A = 2^23 * A1 + A2.
 50 | c---------------------------------------------------------------------*/
 51 |     t1 = r23 * a;
 52 |     a1 = (int)t1;
 53 |     a2 = a - t23 * a1;
 54 | 
 55 | /*c---------------------------------------------------------------------
 56 | c   Break X into two parts such that X = 2^23 * X1 + X2, compute
 57 | c   Z = A1 * X2 + A2 * X1  (mod 2^23), and then
 58 | c   X = 2^23 * Z + A2 * X2  (mod 2^46).
 59 | c---------------------------------------------------------------------*/
 60 |     t1 = r23 * (*x);
 61 |     x1 = (int)t1;
 62 |     x2 = (*x) - t23 * x1;
 63 |     t1 = a1 * x2 + a2 * x1;
 64 |     t2 = (int)(r23 * t1);
 65 |     z = t1 - t23 * t2;
 66 |     t3 = t23 * z + a2 * x2;
 67 |     t4 = (int)(r46 * t3);
 68 |     (*x) = t3 - t46 * t4;
 69 | 
 70 |     return (r46 * (*x));
 71 | }
 72 | 
 73 | /*c---------------------------------------------------------------------
 74 | c---------------------------------------------------------------------*/
 75 | 
 76 | void vranlc (int n, double *x_seed, double a, double y[]) {
 77 | 
 78 | /*c   N <= 0: nothing to generate, so X, A and Y are left untouched. */
 79 |     if (n <= 0) return;
 80 | 
 81 | /*c---------------------------------------------------------------------
 82 | c
 83 | c   This routine generates N uniform pseudorandom double precision numbers in
 84 | c   the range (0, 1) by using the linear congruential generator
 85 | c
 86 | c   x_{k+1} = a x_k  (mod 2^46)
 87 | c
 88 | c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
 89 | c   before repeating.  The argument A is the same as 'a' in the above formula,
 90 | c   and X (passed as *x_seed) is the same as x_0.  A and X must be odd double
 91 | c   precision integers in the range (1, 2^46).  The N results are normalized
 92 | c   to be between 0 and 1 and are placed in Y[1] .. Y[N]; Y[0] is never
 93 | c   written, so Y must provide at least N+1 elements.  X is updated to contain
 94 | c   the new seed, so that subsequent calls to VRANLC using the same arguments
 95 | c   will generate a continuous sequence.  If N is zero or negative, nothing is
 96 | c   done and the variables X, A and Y are ignored.
 97 | c
 98 | c   This routine is the standard version designed for scalar or RISC systems.
 99 | c   However, it should produce the same results on any single processor
100 | c   computer with at least 48 mantissa bits in double precision floating point
101 | c   data.  On 64 bit systems, double precision should be disabled.
102 | c---------------------------------------------------------------------*/
103 | 
104 |     int i;
105 |     double x,t1,t2,t3,t4,a1,a2,x1,x2,z;
106 | 
107 | /*c---------------------------------------------------------------------
108 | c   Break A into two parts such that A = 2^23 * A1 + A2.
109 | c---------------------------------------------------------------------*/
110 |     t1 = r23 * a;
111 |     a1 = (int)t1;
112 |     a2 = a - t23 * a1;
113 |     x = *x_seed;
114 | 
115 | /*c---------------------------------------------------------------------
116 | c   Generate N results.   This loop is not vectorizable.
117 | c---------------------------------------------------------------------*/
118 |     for (i = 1; i <= n; i++) {
119 | 
120 | /*c---------------------------------------------------------------------
121 | c   Break X into two parts such that X = 2^23 * X1 + X2, compute
122 | c   Z = A1 * X2 + A2 * X1  (mod 2^23), and then
123 | c   X = 2^23 * Z + A2 * X2  (mod 2^46).
124 | c---------------------------------------------------------------------*/
125 |         t1 = r23 * x;
126 |         x1 = (int)t1;
127 |         x2 = x - t23 * x1;
128 |         t1 = a1 * x2 + a2 * x1;
129 |         t2 = (int)(r23 * t1);
130 |         z = t1 - t23 * t2;
131 |         t3 = t23 * z + a2 * x2;
132 |         t4 = (int)(r46 * t3);
133 |         x = t3 - t46 * t4;
134 |         y[i] = r46 * x;
135 |     }
136 |     *x_seed = x;
137 | }
138 | 


--------------------------------------------------------------------------------
/NPB-OMP/common/c_timers.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | #include "wtime.hpp"
 5 | #include <cstdlib>
 6 | 
 7 | /*  Prototype  */
 8 | void wtime( double * );
 9 | 
10 | 
11 | 
12 | /*
13 |  * elapsed_time: return the current reading of the wtime() clock.
14 |  */
15 | double elapsed_time( void )
16 | {
17 |     /* wtime() delivers its reading through the pointer it is given. */
18 |     double now;
19 |     wtime( &now );
20 |     return now;
21 | }
22 | 
23 | 
24 | double start[64], elapsed[64];
25 | 
26 | /*
27 |  * timer_clear: reset the accumulated time of timer n to zero.
28 |  */
29 | void timer_clear( int n ) {
30 |     double &total = elapsed[n];
31 |     total = 0.0;
32 | }
33 | 
34 | 
35 | /*
36 |  * timer_start: record the current clock reading as the start mark of timer n.
37 |  */
38 | void timer_start( int n ) {
39 |     const double now = elapsed_time();
40 |     start[n] = now;
41 | }
42 | 
43 | 
44 | /*
45 |  * timer_stop: add the time since the start mark of timer n to its total.
46 |  */
47 | void timer_stop( int n )
48 | {
49 |     /* Interval between the recorded start mark and the current reading. */
50 |     const double interval = elapsed_time() - start[n];
51 | 
52 |     /* Accumulate rather than overwrite, so that repeated start/stop */
53 |     /* pairs on the same timer add up.                               */
54 |     elapsed[n] += interval;
55 | }
56 | 
57 | 
58 | /*
59 |  * timer_read: return the total time accumulated so far by timer n.
60 |  */
61 | double timer_read( int n ) {
62 |     const double total = elapsed[n];
63 |     return total;
64 | }
65 | 
66 | 


--------------------------------------------------------------------------------
/NPB-OMP/common/npb-CPP.hpp:
--------------------------------------------------------------------------------
 1 | #include <cstdio>
 2 | #include <cstdlib>
 3 | #include <cmath>
 4 | #if defined(_OPENMP)
 5 | #include <omp.h>
 6 | #endif /* _OPENMP */
 7 | 
 8 | typedef int boolean;
 9 | typedef struct { double real; double imag; } dcomplex;
10 | /* Truth values for the "boolean" typedef above. */
11 | #define TRUE	1
12 | #define FALSE	0
13 | /* Function-like macros: an argument may be evaluated more than once, so pass no side effects. */
14 | #define max(a,b) (((a) > (b)) ? (a) : (b))
15 | #define min(a,b) (((a) < (b)) ? (a) : (b))
16 | #define	pow2(a) ((a)*(a))
17 | /* dcomplex helpers; arguments are parenthesized so expressions such as x+y or *p expand correctly.  In cmul, c must not alias a or b. */
18 | #define get_real(c) ((c).real)
19 | #define get_imag(c) ((c).imag)
20 | #define cadd(c,a,b) ((c).real = (a).real + (b).real, (c).imag = (a).imag + (b).imag)
21 | #define csub(c,a,b) ((c).real = (a).real - (b).real, (c).imag = (a).imag - (b).imag)
22 | #define cmul(c,a,b) ((c).real = (a).real * (b).real - (a).imag * (b).imag, \
23 |                      (c).imag = (a).real * (b).imag + (a).imag * (b).real)
24 | #define crmul(c,a,b) ((c).real = (a).real * (b), (c).imag = (a).imag * (b))
25 | 
26 | extern double randlc(double *, double);
27 | extern void vranlc(int, double *, double, double *);
28 | extern void timer_clear(int);
29 | extern void timer_start(int);
30 | extern void timer_stop(int);
31 | extern double timer_read(int);
32 | 
33 | extern void c_print_results(char *name, char class_npb, int n1, int n2,
34 | 			    int n3, int niter, int nthreads, double t,
35 | 			    double mops, char *optype, int passed_verification,
36 | 			    char *npbversion, char *compiletime, char *cc,
37 | 			    char *clink, char *c_lib, char *c_inc,
38 | 			    char *cflags, char *clinkflags, char *rand);
39 | 


--------------------------------------------------------------------------------
/NPB-OMP/common/wtime.cpp:
--------------------------------------------------------------------------------
 1 | #include "wtime.hpp"
 2 | #include <sys/time.h>
 3 | /* wtime: store in *t the seconds elapsed since the whole second of the first call. */
 4 | void wtime(double *t)
 5 | {
 6 |   static long long sec = -1;  /* tv_sec of the first call; -1 means "not latched yet" */
 7 |   struct timeval tv;
 8 |   gettimeofday(&tv, 0);
 9 |   if (sec < 0) sec = tv.tv_sec;  /* latch once; an int here would go negative after 2038 and re-latch on every call */
10 |   *t = (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec;
11 | }
12 | 
13 |     
14 | 


--------------------------------------------------------------------------------
/NPB-OMP/common/wtime.hpp:
--------------------------------------------------------------------------------
 1 | /* C/Fortran interface is different on different machines. 
 2 |  * You may need to tweak this.
 3 |  */
 4 | 
 5 | 
 6 | #if defined(IBM)
 7 | #define wtime wtime
 8 | #elif defined(CRAY)
 9 | #define wtime WTIME
10 | #else
11 | #define wtime wtime_
12 | #endif
13 | 


--------------------------------------------------------------------------------
/NPB-OMP/common/wtime_sgi64.cpp:
--------------------------------------------------------------------------------
 1 | #include <sys/types.h>
 2 | #include <fcntl.h>
 3 | #include <sys/mman.h>
 4 | #include <sys/syssgi.h>
 5 | #include <sys/immu.h>
 6 | #include <cerrno>
 7 | #include <cstdio>
 8 | 
 9 | /* The following works on SGI Power Challenge systems */
10 | 
11 | typedef unsigned long iotimer_t;
12 | 
13 | unsigned int cycleval;
14 | volatile iotimer_t *iotimer_addr, base_counter;
15 | double resolution;
16 | 
17 | /* address_t is an integer type big enough to hold an address */
18 | typedef unsigned long address_t;
19 | 
20 | 
21 | 
22 | void timer_init() 
23 | {
24 |   /* Map the cycle counter reported by syssgi() into this process, read-only; exits on any failure. */
25 |   int fd;
26 |   char *virt_addr;
27 |   address_t phys_addr, page_offset, pagemask, pagebase_addr;
28 |   
29 |   pagemask = getpagesize() - 1;
30 |   errno = 0;
31 |   phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval);
32 |   if (errno != 0) {
33 |     perror("SGI_QUERY_CYCLECNTR");
34 |     exit(1);
35 |   }
36 |   /* Split the counter's physical address into page base + offset within the page. */
37 |   page_offset = phys_addr & pagemask;
38 |   pagebase_addr = phys_addr - page_offset;
39 |   fd = open("/dev/mmem", O_RDONLY);
40 |   if (fd < 0) { perror("/dev/mmem"); exit(1); }
41 |   virt_addr = (char *)mmap(0, pagemask + 1, PROT_READ, MAP_PRIVATE, fd, pagebase_addr);  /* map one whole page */
42 |   if (virt_addr == (char *)MAP_FAILED) { perror("mmap"); exit(1); }
43 |   iotimer_addr = (iotimer_t *)(virt_addr + page_offset);
44 |   /* cycleval is in picoseconds; scaling by 1e-12 gives the resolution in seconds */
45 |   resolution = 1.0e-12*cycleval; 
46 |   base_counter = *iotimer_addr;
47 | }
48 | 
49 | void wtime_(double *time) 
50 | {
51 |   /* The counter is mapped lazily, on the first call through this entry point. */
52 |   static int ready = 0;
53 |   if (ready == 0) {
54 |     timer_init();
55 |     ready = 1;
56 |   }
57 |   volatile iotimer_t ticks = *iotimer_addr - base_counter;
58 |   *time = (double)ticks * resolution;  /* ticks since base_counter, scaled by resolution */
59 | }
60 | 
61 | 
62 | void wtime(double *time) 
63 | {
64 |   /*
65 |    * Un-suffixed alias of wtime_().  Delegating keeps a single
66 |    * "initialized" flag, so timer_init() maps the counter and captures
67 |    * base_counter only once even if a program calls both entry points;
68 |    * with separate flags the second first-call would map it again and
69 |    * reset the time origin seen by the other entry point.
70 |    */
71 |   wtime_(time);
72 | }
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/NPB-OMP/config/make.def:
--------------------------------------------------------------------------------
  1 | #---------------------------------------------------------------------------
  2 | #
  3 | #                SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. 
  4 | #
  5 | #---------------------------------------------------------------------------
  6 | 
  7 | #---------------------------------------------------------------------------
  8 | # Items in this file will need to be changed for each platform.
  9 | # (Note these definitions are inconsistent with NPB2.1.)
 10 | #---------------------------------------------------------------------------
 11 | 
 12 | #---------------------------------------------------------------------------
 13 | # Parallel C:
 14 | #
 15 | # CC         - C compiler 
 16 | # CFLAGS     - C compilation arguments
 17 | # C_INC      - any -I arguments required for compiling C 
 18 | # CLINK      - C linker
 19 | # CLINKFLAGS - C linker flags
 20 | # C_LIB      - any -L and -l arguments required for linking C 
 21 | #
 22 | # compilations are done with $(CC) $(C_INC) $(CFLAGS) or
 23 | #                            $(CC) $(CFLAGS)
 24 | # linking is done with       $(CLINK) $(C_LIB) $(CLINKFLAGS)
 25 | #---------------------------------------------------------------------------
 26 | 
 27 | #---------------------------------------------------------------------------
 28 | # This is the C compiler used for OpenMP programs
 29 | #---------------------------------------------------------------------------
 30 | CC = g++ -std=c++14
 31 | #gcc #cc
 32 | # This links C programs; usually the same as ${CC}
 33 | CLINK	= $(CC)
 34 | 
 35 | #---------------------------------------------------------------------------
 36 | # These macros are passed to the linker 
 37 | #---------------------------------------------------------------------------
 38 | C_LIB  = -lm
 39 | 
 40 | #---------------------------------------------------------------------------
 41 | # These macros are passed to the compiler 
 42 | #---------------------------------------------------------------------------
 43 | C_INC = -I../common 
 44 | 
 45 | #---------------------------------------------------------------------------
 46 | # Global *compile time* flags for C programs
 47 | #---------------------------------------------------------------------------
 48 | CFLAGS	= -O3 -fopenmp
 49 | # CFLAGS = -g
 50 | 
 51 | #---------------------------------------------------------------------------
 52 | # Global *link time* flags. Flags for increasing maximum executable 
 53 | # size usually go here. 
 54 | #---------------------------------------------------------------------------
 55 | CLINKFLAGS = -O3 -fopenmp
 56 | 
 57 | 
 58 | #---------------------------------------------------------------------------
 59 | # Utilities C:
 60 | #
 61 | # This is the C compiler used to compile C utilities.  Flags required by 
 62 | # this compiler go here also; typically there are few flags required; hence 
 63 | # there are no separate macros provided for such flags.
 64 | #---------------------------------------------------------------------------
 65 | UCC	= cc
 66 | 
 67 | 
 68 | #---------------------------------------------------------------------------
 69 | # Destination of executables, relative to subdirs of the main directory. . 
 70 | #---------------------------------------------------------------------------
 71 | BINDIR	= ../bin
 72 | 
 73 | 
 74 | #---------------------------------------------------------------------------
 75 | # The variable RAND controls which random number generator 
 76 | # is used. It is described in detail in Doc/README.install. 
 77 | # Use "randi8" unless there is a reason to use another one. 
 78 | # Other allowed values are "randi8_safe", "randdp" and "randdpvec"
 79 | #---------------------------------------------------------------------------
 80 | # RAND   = randi8
 81 | # The following is highly reliable but may be slow:
 82 | RAND   = randdp
 83 | 
 84 | 
 85 | #---------------------------------------------------------------------------
 86 | # The variable WTIME is the name of the wtime source code module in the
 87 | # common directory.
 88 | # For most machines,       use wtime.cpp
 89 | # For SGI Power Challenge: use wtime_sgi64.cpp
 90 | #---------------------------------------------------------------------------
 91 | WTIME  = wtime.cpp
 92 | 
 93 | 
 94 | #---------------------------------------------------------------------------
 95 | # Enable one of the following if the machine is either a Cray or an IBM:
 96 | # (no such flag is needed for most machines: see common/wtime.hpp)
 97 | # This is used by the compiler to pass the machine name to common/wtime.hpp,
 98 | # where the C/Fortran binding interface format is determined.
 99 | #---------------------------------------------------------------------------
100 | # MACHINE	=	-DCRAY
101 | # MACHINE	=	-DIBM
102 | 
103 | 
104 | 


--------------------------------------------------------------------------------
/NPB-OMP/config/suite.def:
--------------------------------------------------------------------------------
 1 | # config/suite.def
 2 | # This file is used to build several benchmarks with a single command. 
 3 | # Typing "make suite" in the main directory will build all the benchmarks
 4 | # specified in this file. 
 5 | # Each line of this file contains a benchmark name, class, and number
 6 | # of nodes. The name is one of "cg", "is", "ep", "mg", "ft".
 7 | # The class is one of "S", "W", "A", "B", and "C". 
 8 | # No blank lines. 
 9 | # The following example builds serial sample sizes of all benchmarks. 
10 | ft	B	
11 | mg	B
12 | is	B
13 | ep	B
14 | cg	B
15 | 


--------------------------------------------------------------------------------
/NPB-OMP/sys/Makefile:
--------------------------------------------------------------------------------
 1 | include ../config/make.def
 2 | 
 3 | # Note that COMPILE is also defined in make.common and should
 4 | # be the same. We can't include make.common because it has a lot
 5 | # of other garbage. 
 6 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS)
 7 | 
 8 | all: setparams 
 9 | 
10 | # setparams creates an npbparams.hpp file for each benchmark 
11 | # configuration. npbparams.hpp also contains info about how a benchmark
12 | # was compiled and linked.
13 | 
14 | setparams: setparams.cpp ../config/make.def
15 | 	$(UCC) -o setparams setparams.cpp
16 | 
17 | 
18 | clean: 
19 | 	-rm -f setparams setparams.hpp npbparams.hpp
20 | 	-rm -f *~ *.o
21 | 
22 | 


--------------------------------------------------------------------------------
/NPB-OMP/sys/README:
--------------------------------------------------------------------------------
 1 | This directory contains utilities and files used by the 
 2 | build process. You should not need to change anything
 3 | in this directory. 
 4 | 
 5 | Original Files
 6 | --------------
 7 | setparams.c:
 8 |         Source for the setparams program. This program is used internally
 9 |         in the build process to create the file "npbparams.h" for each 
10 |         benchmark. npbparams.h contains Fortran or C parameters to build a 
11 |         benchmark for a specific class. The setparams program is never run 
12 |         directly by a user. Its invocation syntax is 
13 | 
14 |             "setparams benchmark-name class". 
15 | 
16 |         It examines the file "npbparams.h" in the current directory. If 
17 |         the specified parameters are the same as those in the npbparams.h 
18 |         file, nothing is changed. If the file does not exist or corresponds 
19 |         to a different class/number of nodes, it is (re)built. 
20 | 	One of the more complicated things in npbparams.h is that it 
21 |         contains, in a Fortran string, the compiler flags used to build a 
22 |         benchmark, so that a benchmark can print out how it was compiled. 
23 | 
24 | make.common
25 |         A makefile segment that is included in each individual benchmark
26 |         program makefile. It sets up some standard macros (COMPILE, etc) 
27 |         and makes sure everything is configured correctly (npbparams.h)
28 | 
29 | Makefile
30 |         Builds  setparams
31 | 
32 | README
33 |         This file. 
34 | 
35 | 
36 | Created files
37 | -------------
38 | 
39 | setparams
40 | 	See descriptions above
41 | 
42 | 


--------------------------------------------------------------------------------
/NPB-OMP/sys/make.common:
--------------------------------------------------------------------------------
 1 | PROGRAM  = $(BINDIR)/$(BENCHMARK).$(CLASS)
 2 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS)
 3 | CCOMPILE = $(CC)  -c $(C_INC) $(CFLAGS)
 4 | 
 5 | # Class "U" is used internally by the setparams program to mean
 6 | # "unknown". This means that if you don't specify CLASS=
 7 | # on the command line, you'll get an error. It would be nice
 8 | # to be able to avoid this, but we'd have to get information
 9 | # from the setparams back to the make program, which isn't easy. 
10 | CLASS=U
11 | 
12 | default:: ${PROGRAM}
13 | 
14 | # This makes sure the configuration utility setparams 
15 | # is up to date. 
16 | # Note that this must be run every time, which is why the
17 | # target does not exist and is not created. 
18 | # If you create a file called "config" you will break things. 
19 | config:
20 | 	@cd ../sys; ${MAKE} all
21 | 	../sys/setparams ${BENCHMARK} ${CLASS}
22 | 
23 | COMMON=../common
24 | ${COMMON}/${RAND}.o: ${COMMON}/${RAND}.f
25 | 	cd ${COMMON}; ${FCOMPILE} ${RAND}.f
26 | 
27 | ${COMMON}/c_${RAND}.o: ${COMMON}/c_${RAND}.cpp
28 | 	cd ${COMMON}; ${CCOMPILE} c_${RAND}.cpp
29 | 
30 | ${COMMON}/print_results.o: ${COMMON}/print_results.f
31 | 	cd ${COMMON}; ${FCOMPILE} print_results.f
32 | 
33 | ${COMMON}/c_print_results.o: ${COMMON}/c_print_results.cpp
34 | 	cd ${COMMON}; ${CCOMPILE} c_print_results.cpp
35 | 
36 | ${COMMON}/timers.o: ${COMMON}/timers.f
37 | 	cd ${COMMON}; ${FCOMPILE} timers.f
38 | 
39 | ${COMMON}/c_timers.o: ${COMMON}/c_timers.cpp
40 | 	cd ${COMMON}; ${CCOMPILE} c_timers.cpp
41 | 
42 | ${COMMON}/wtime.o: ${COMMON}/${WTIME}
43 | 	cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/${WTIME}
44 | # For most machines or CRAY or IBM
45 | #	cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/wtime.c
46 | # For a precise timer on an SGI Power Challenge, try:
47 | #	cd ${COMMON}; ${CCOMPILE} -o wtime.o ${COMMON}/wtime_sgi64.c
48 | 
49 | ${COMMON}/c_wtime.o: ${COMMON}/${WTIME}
50 | 	cd ${COMMON}; ${CCOMPILE} -o c_wtime.o ${COMMON}/${WTIME}
51 | 
52 | 
53 | # Normally setparams updates npbparams.h only if the settings (CLASS)
54 | # have changed. However, we also want to update if the compile options
55 | # may have changed (set in ../config/make.def). 
56 | npbparams.hpp: ../config/make.def
57 | 	@ echo make.def modified. Rebuilding npbparams.hpp just in case
58 | 	rm -f npbparams.hpp
59 | 	../sys/setparams ${BENCHMARK} ${CLASS}
60 | 
61 | # So that "make benchmark-name" works
62 | ${BENCHMARK}:  default
63 | ${BENCHMARKU}: default
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/NPB-OMP/sys/print_header:
--------------------------------------------------------------------------------
 1 | # Prints the NPB banner box; every row is padded with spaces only (no tabs) so the right border lines up.
 2 | echo ''
 3 | echo '   ========================================='
 4 | echo '   =      NAS Parallel Benchmarks          ='
 5 | echo '   =      OpenMP C++ Versions              ='
 6 | echo '   =      Developed by: Dalvan Griebler    ='
 7 | echo '   =                    Júnior Löff        ='
 8 | echo '   =                                       ='
 9 | echo '   =      Warning: in case of problems     ='
10 | echo '   =      send an email to us:             ='
11 | echo '   =      dalvan.griebler@acad.pucrs.br    ='
12 | echo '   =      junior.loff@acad.pucrs.br        ='
13 | echo '   ========================================='
14 | echo ''


--------------------------------------------------------------------------------
/NPB-OMP/sys/print_instructions:
--------------------------------------------------------------------------------
 1 | echo ''
 2 | echo '   To make a NAS benchmark type '
 3 | echo ''
 4 | echo '         make <benchmark-name> CLASS=<class>'
 5 | echo ''
 6 | echo '   where <benchmark-name> is "cg", "ep", "ft", "is", or "mg"'
 7 | echo '         <class>          is "S", "W", "A", "B" or "C"'
 8 | echo ''
 9 | echo '   To make a set of benchmarks, create the file config/suite.def'
10 | echo '   according to the instructions in config/suite.def.template and type'
11 | echo ''
12 | echo '         make suite'
13 | echo ''
14 | echo ' ***************************************************************'
15 | echo ' * Remember to edit the file config/make.def for site specific *'
16 | echo ' * information as described in the README file                 *'
17 | echo ' ***************************************************************'
18 | 
19 | 


--------------------------------------------------------------------------------
/NPB-SER/CG/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=cg
 3 | BENCHMARKU=CG
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = cg.o ${COMMON}/c_print_results.o  \
 8 |        ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | cg.o:		cg.cpp  npbparams.hpp
16 | 	${CCOMPILE} cg.cpp
17 | 
18 | clean:
19 | 	- rm -f *.o *~ 
20 | 	- rm -f npbparams.hpp core
21 | 
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/NPB-SER/EP/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=ep
 3 | BENCHMARKU=EP
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = ep.o ${COMMON}/c_print_results.o ${COMMON}/c_${RAND}.o \
 8 |        ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | 
16 | ep.o:		ep.cpp npbparams.hpp
17 | 	${CCOMPILE} ep.cpp
18 | 
19 | clean:
20 | 	- rm -f *.o *~ 
21 | 	- rm -f npbparams.hpp core
22 | 
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/NPB-SER/EP/ep.cpp:
--------------------------------------------------------------------------------
  1 | /*--------------------------------------------------------------------
  2 | 
  3 |     Information on NAS Parallel Benchmarks is available at:
  4 | 
  5 |     http://www.nas.nasa.gov/Software/NPB/
  6 | 
  7 |     Authors: P. O. Frederickson
  8 |            D. H. Bailey
  9 |            A. C. Woo
 10 | 
 11 |     CPP version:
 12 |             Dalvan Griebler <dalvangriebler@gmail.com>
 13 |             Júnior Löff <loffjh@gmail.com>
 14 | 
 15 | --------------------------------------------------------------------*/
 16 | 
 17 | #include "npbparams.hpp"
 18 | #include <iostream>
 19 | #include <../common/npb-CPP.hpp>
 20 | 
 21 | /* parameters */
 22 | #define	MK		16
 23 | #define	MM		(M - MK)
 24 | #define	NN		(1 << MM)
 25 | #define	NK		(1 << MK)
 26 | #define	NQ		10
 27 | #define EPSILON		1.0e-8
 28 | #define	A		1220703125.0
 29 | #define	S		271828183.0
 30 | #define	TIMERS_ENABLED	FALSE
 31 | 
 32 | /* global variables */
 33 | /* common /storage/ */
 34 | static double x[(2*NK)+1];
 35 | static double q[NQ];
 36 | 
 37 | /*--------------------------------------------------------------------
 38 |       program EMBAR
 39 | c-------------------------------------------------------------------*/
 40 | /*
 41 | c   This is the serial version of the APP Benchmark 1,
 42 | c   the "embarrassingly parallel" benchmark.
 43 | c
 44 | c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
 45 | c   numbers.  MK is the Log_2 of the size of each batch of uniform random
 46 | c   numbers.  MK can be set for convenience on a given system, since it does
 47 | c   not affect the results.
 48 | */
 49 | int main(int argc, char **argv) {
 50 |     double Mops, t1, sx, sy, tm, an, gc;
 51 |     double dum[3] = { 1.0, 1.0, 1.0 };
 52 |     int np,i, k, nit, k_offset, j;
 53 |     boolean verified;
 54 |     char size[13+1] = "";	/* character*13; zero-filled so the scan below never reads an unset byte */
 55 |     /* Flow: warm-up calls -> derive AN -> per batch: seed, vranlc, Gaussian tally -> verify -> report. */
 56 |     /*
 57 |     c   Because the size of the problem is too large to store in a 32-bit
 58 |     c   integer for some classes, we put it into a string (for printing).
 59 |     c   Have to strip off the decimal point put in there by the floating
 60 |     c   point print statement (internal file)
 61 |     */
 62 | 
 63 |     printf("NAS Parallel Benchmarks 4.0 OpenMP C++ version"" - EP Benchmark\n");
 64 |     printf("Developed by: Dalvan Griebler <dalvan.griebler@acad.pucrs.br> & Júnior Löff <loffjh@gmail.com>\n\n");
 65 |     sprintf(size, "%12.0f", pow(2.0, M+1));
 66 |     for (j = 13; j >= 1; j--) {
 67 |         if (size[j] == '.') size[j] = ' ';
 68 |     }
 69 |     printf(" Number of random numbers generated: %13s\n", size);
 70 | 
 71 |     verified = FALSE;
 72 | 
 73 |     /*
 74 |     c   Compute the number of "batches" of random number pairs generated
 75 |     c   per processor. Adjust if the number of processors does not evenly
 76 |     c   divide the total number
 77 |     */
 78 |     np = NN;
 79 | 
 80 |     /*
 81 |     c   Call the random number generator functions and initialize
 82 |     c   the x-array to reduce the effects of paging on the timings.
 83 |     c   Also, call all mathematical functions that are used. Make
 84 |     c   sure these initializations cannot be eliminated as dead code.
 85 |     */
 86 |     vranlc(0, &(dum[0]), dum[1], &(dum[2]));
 87 |     dum[0] = randlc(&(dum[1]), dum[2]);
 88 |     for (i = 0; i < 2*NK; i++) x[i] = -1.0e99;
 89 |     Mops = log(sqrt(fabs(max(1.0, 1.0))));
 90 | 
 91 | 
 92 | 
 93 |     timer_clear(1);
 94 |     timer_clear(2);
 95 |     timer_clear(3);
 96 | 
 97 |     timer_start(1);
 98 | 
 99 |     t1 = A;	/* define the seed first: vranlc may read *x_seed even when n == 0 */
100 |     vranlc(0, &t1, A, x);
101 | 
102 |     /*   Compute AN = A ^ (2 * NK) (mod 2^46). */
103 | 
104 | 
105 |     for ( i = 1; i <= MK+1; i++) {
106 |         an = randlc(&t1, t1);
107 |     }
108 | 
109 |     an = t1;
110 |     gc = 0.0;
111 |     sx = 0.0;
112 |     sy = 0.0;
113 | 
114 |     for ( i = 0; i <= NQ - 1; i++) {
115 |         q[i] = 0.0;
116 |     }
117 | 
118 |     /*
119 |     c   Each instance of this loop may be performed independently. We compute
120 |     c   the k offsets separately to take into account the fact that some nodes
121 |     c   have more numbers to generate than others
122 |     */
123 |     k_offset = -1;
124 | 
125 |     double t2, t3, t4, x1, x2;
126 |     int kk, ik, l;
127 |     double qq[NQ];		/* private copy of q[0:NQ-1] */
128 | 
129 |     for (i = 0; i < NQ; i++) qq[i] = 0.0;
130 | 
131 |     for (k = 1; k <= np; k++) {
132 |         kk = k_offset + k;
133 |         t1 = S;
134 |         t2 = an;
135 | 
136 |         /*  Find starting seed t1 for this kk. */
137 | 
138 |         for (i = 1; i <= 100; i++) {
139 |             ik = kk / 2;
140 |             if (2 * ik != kk) t3 = randlc(&t1, t2);
141 |             if (ik == 0) break;
142 |             t3 = randlc(&t2, t2);
143 |             kk = ik;
144 |         }
145 | 
146 |         /*      Compute uniform pseudorandom numbers. */
147 | 
148 |         if (TIMERS_ENABLED == TRUE) timer_start(3);
149 |         vranlc(2*NK, &t1, A, x);
150 |         if (TIMERS_ENABLED == TRUE) timer_stop(3);
151 | 
152 |         /*
153 |         c       Compute Gaussian deviates by acceptance-rejection method and
154 |         c       tally counts in concentric square annuli.  This loop is not
155 |         c       vectorizable.
156 |         */
157 |         if (TIMERS_ENABLED == TRUE) timer_start(2);
158 | 
159 |         for ( i = 1; i <= NK; i++) {
160 |             x1 = 2.0 * x[2*i-1] - 1.0;
161 |             x2 = 2.0 * x[2*i] - 1.0;
162 |             t1 = pow2(x1) + pow2(x2);
163 |             if (t1 <= 1.0) {
164 |                 t2 = sqrt(-2.0 * log(t1) / t1);
165 |                 t3 = (x1 * t2);				/* Xi */
166 |                 t4 = (x2 * t2);				/* Yi */
167 |                 l = max(fabs(t3), fabs(t4));
168 |                 qq[l] += 1.0;				/* counts */
169 |                 sx = sx + t3;				/* sum of Xi */
170 |                 sy = sy + t4;				/* sum of Yi */
171 |             }
172 |         }
173 |         if (TIMERS_ENABLED == TRUE) timer_stop(2);
174 |     }
175 |     for (i = 0; i <= NQ-1; i++) q[i] += qq[i];
176 | 
177 |     for (i = 0; i <= NQ-1; i++) {
178 |         gc = gc + q[i];
179 |     }
180 | 
181 |     timer_stop(1);
182 |         
183 |     tm = timer_read(1);
184 | 
185 | 
186 |     nit = 0;
187 |     if (M == 24) {
188 |         if((fabs((sx- (-3.247834652034740e3))/-3.247834652034740e3) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/-6.958407078382297e3) <= EPSILON)) {
189 |             verified = TRUE;
190 |         }
191 |     } else if (M == 25) {
192 |         if ((fabs((sx- (-2.863319731645753e3))/-2.863319731645753e3) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/-6.320053679109499e3) <= EPSILON)) {
193 |             verified = TRUE;
194 |         }
195 |     } else if (M == 28) {
196 |         //if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
197 |         if ((fabs((sx- (-4.295875165629892e3))/-4.295875165629892e3) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/-1.580732573678431e4) <= EPSILON)) {
198 |             verified = TRUE;
199 |         }
200 |     } else if (M == 30) {
201 |         if ((fabs((sx- (4.033815542441498e4))/4.033815542441498e4) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/-2.660669192809235e4) <= EPSILON)) {
202 |             verified = TRUE;
203 |         }
204 |     } else if (M == 32) {
205 |         if ((fabs((sx- (4.764367927995374e4))/4.764367927995374e4) <= EPSILON) && (fabs((sy- (-8.084072988043731e4))/-8.084072988043731e4) <= EPSILON)) {
206 |             verified = TRUE;
207 |         }
208 |     } else if (M == 36) {
209 |         if ((fabs((sx- (1.982481200946593e5))/1.982481200946593e5) <= EPSILON) && (fabs((sy- (-1.020596636361769e5))/-1.020596636361769e5) <= EPSILON)) {
210 |             verified = TRUE;
211 |         }
212 |     } else if (M == 40) {
213 |         if ((fabs((sx- (-5.319717441530e5))/-5.319717441530e5) <= EPSILON) && (fabs((sy- (-3.688834557731e5))/-3.688834557731e5) <= EPSILON)) {
214 |             verified = TRUE;
215 |         }
216 |     }
217 | 
218 |     Mops = pow(2.0, M+1)/tm/1000000.0;
219 | 
220 |     printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. Gaussian Pairs = %15.0f\n"
221 |            "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy);
222 |     for (i = 0; i  <= NQ-1; i++) {
223 |         printf("%3d %15.0f\n", i, q[i]);
224 |     }
225 | 
226 |     c_print_results((char*)"EP", CLASS, M+1, 0, 0, nit, tm, Mops, (char*)"Random numbers generated",
227 |                     verified, (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7);
228 | 
229 |     if (TIMERS_ENABLED == TRUE) {
230 |         printf("Total time:     %f\n", timer_read(1));
231 |         printf("Gaussian pairs: %f\n", timer_read(2));
232 |         printf("Random numbers: %f\n", timer_read(3));
233 |     }
234 |     return 0;
235 | }
236 | 


--------------------------------------------------------------------------------
/NPB-SER/FT/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=ft
 3 | BENCHMARKU=FT
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = ft.o ${COMMON}/c_${RAND}.o ${COMMON}/c_print_results.o \
 8 |        ${COMMON}/c_timers.o ${COMMON}/c_wtime.o #../omp-prof.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | ft.o:             ft.cpp  global.hpp npbparams.hpp
16 | 	${CCOMPILE} ft.cpp
17 | 
18 | clean:
19 | 	- rm -f *.o *~ mputil*
20 | 	- rm -f ft npbparams.hpp core
21 | 


--------------------------------------------------------------------------------
/NPB-SER/FT/global.hpp:
--------------------------------------------------------------------------------
  1 | #include "npbparams.hpp"
  2 | 
  3 | 
  4 | /*
  5 | c If processor array is 1x1 -> 0D grid decomposition
  6 | 
  7 | 
  8 | c Cache blocking params. These values are good for most
  9 | c RISC processors.  
 10 | c FFT parameters:
 11 | c  fftblock controls how many ffts are done at a time. 
 12 | c  The default is appropriate for most cache-based machines
 13 | c  On vector machines, the FFT can be vectorized with vector
 14 | c  length equal to the block size, so the block size should
 15 | c  be as large as possible. This is the size of the smallest
 16 | c  dimension of the problem: 128 for class A, 256 for class B and
 17 | c  512 for class C.
 18 | */
 19 | 
 20 | #define	FFTBLOCK_DEFAULT	16
 21 | #define	FFTBLOCKPAD_DEFAULT	18
 22 | 
 23 | #define FFTBLOCK	FFTBLOCK_DEFAULT
 24 | #define FFTBLOCKPAD	FFTBLOCKPAD_DEFAULT
 25 | 
 26 | /* COMMON block: blockinfo */
 27 | int fftblock;
 28 | int fftblockpad;
 29 |       
 30 | /*
 31 | c we need a bunch of logic to keep track of how
 32 | c arrays are laid out. 
 33 | 
 34 | 
 35 | c Note: this serial version is derived from the parallel 0D case
 36 | c of the FT NPB.
 37 | c The computation proceeds logically as
 38 | 
 39 | c set up initial conditions
 40 | c fftx(1)
 41 | c transpose (1->2)
 42 | c ffty(2)
 43 | c transpose (2->3)
 44 | c fftz(3)
 45 | c time evolution
 46 | c fftz(3)
 47 | c transpose (3->2)
 48 | c ffty(2)
 49 | c transpose (2->1)
 50 | c fftx(1)
 51 | c compute residual(1)
 52 | 
 53 | c for the 0D, 1D, 2D strategies, the layouts look like xxx
 54 | c        
 55 | c            0D        1D        2D
 56 | c 1:        xyz       xyz       xyz
 57 | c 2:        xyz       xyz       yxz
 58 | c 3:        xyz       zyx       zxy
 59 | 
 60 | c the array dimensions are stored in dims(coord, phase)
 61 | */
 62 | 
 63 | /* COMMON block: layout */
 64 | static int dims[3][3];
 65 | static int xstart[3];
 66 | static int ystart[3];
 67 | static int zstart[3];
 68 | static int xend[3];
 69 | static int yend[3];
 70 | static int zend[3];
 71 | 
 72 | #define	T_TOTAL		0
 73 | #define	T_SETUP		1
 74 | #define	T_FFT		2
 75 | #define	T_EVOLVE	3
 76 | #define	T_CHECKSUM	4
 77 | #define	T_FFTLOW	5
 78 | #define	T_FFTCOPY	6
 79 | #define	T_MAX		7
 80 | 
 81 | #define	TIMERS_ENABLED	FALSE
 82 | 
 83 | /* other stuff */
 84 | 
 85 | #define	SEED	314159265.0
 86 | #define	A	1220703125.0
 87 | #define	PI	3.141592653589793238
 88 | #define	ALPHA	1.0e-6
 89 | 
 90 | #define	EXPMAX	(NITER_DEFAULT*(NX*NX/4+NY*NY/4+NZ*NZ/4))
 91 | 
 92 | /* COMMON block: excomm */
 93 | static double ex[EXPMAX+1];	/* ex(0:expmax) */
 94 | 
 95 | /*
 96 | c roots of unity array
 97 | c relies on x being largest dimension?
 98 | */
 99 | 
100 | /* COMMON block: ucomm */
101 | static dcomplex u[NX];
102 | 
103 | /* for checksum data */
104 | 
105 | /* COMMON block: sumcomm */
106 | static dcomplex sums[NITER_DEFAULT+1]; /* sums(0:niter_default) */
107 | 
108 | /* number of iterations*/
109 | 
110 | /* COMMON block: iter */
111 | static int niter;
112 | 
113 | 


--------------------------------------------------------------------------------
/NPB-SER/IS/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=is
 3 | BENCHMARKU=IS
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = is.o ${COMMON}/c_print_results.o ${COMMON}/c_${RAND}.o \
 8 |        ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | 
16 | is.o:		is.cpp npbparams.hpp
17 | 	${CCOMPILE} is.cpp
18 | 
19 | clean:
20 | 	- rm -f *.o *~ 
21 | 	- rm -f npbparams.hpp core


--------------------------------------------------------------------------------
/NPB-SER/MG/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=mg
 3 | BENCHMARKU=MG
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = mg.o ${COMMON}/c_print_results.o  \
 8 |        ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | mg.o:		mg.cpp npbparams.hpp
16 | 	${CCOMPILE} mg.cpp
17 | 
18 | clean:
19 | 	- rm -f *.o *~ 
20 | 	- rm -f npbparams.hpp core
21 | 
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/NPB-SER/MG/globals.hpp:
--------------------------------------------------------------------------------
 1 | /*--------------------------------------------------------------------
 2 | c  Parameter lm (declared and set in "npbparams.hpp") is the log-base2 of 
 3 | c  the maximum edge size for the partition on a given node, so must be changed 
 4 | c  either to save space (if running a small case) or made bigger for larger 
 5 | c  cases, for example, 512^3. Thus lm=7 means that the largest dimension 
 6 | c  of a partition that can be solved on a node is 2^7 = 128. lm is set 
 7 | c  automatically in npbparams.hpp
 8 | c  Parameters ndim1, ndim2, ndim3 are the local problem dimensions. 
 9 | c-------------------------------------------------------------------*/
10 | 
11 | #include "npbparams.hpp"
12 | 
/* parameters */
/* actual dimension including ghost cells for communications:
   2^LM interior points plus 2 ghost cells (2<<(k-1) == 2^k for k >= 1) */
#define	NM	(2+(2<<(LM-1)))
/* size of rhs array: the product of the three local extents, each padded
   with 2 ghost cells, i.e. (2+2^NDIM1)*(2+2^NDIM2)*(2+2^NDIM3).
   Every factor needs its own parentheses; without them the leading "2+"
   is added to the whole product instead of to the first extent only,
   which makes NV (and NR below) too small. */
#define	NV	((2+(2<<(NDIM1-1)))*(2+(2<<(NDIM2-1)))*(2+(2<<(NDIM3-1))))
/* size of residual array */
/* NOTE(review): evaluated in int arithmetic; 8*NV may exceed INT_MAX for
   the largest problem classes - confirm before relying on NR there. */
#define	NR	((8*(NV+(NM*NM)+5*NM+7*LM))/7)
/* size of communication buffer */
#define	NM2	(2*NM*NM)
/* maximum number of levels */
#define	MAXLEVEL	11
24 | 
25 | /*---------------------------------------------------------------------*/
26 | /* common /mg3/ */
27 | static int nx[MAXLEVEL+1], ny[MAXLEVEL+1], nz[MAXLEVEL+1];
28 | /* common /ClassType/ */
29 | static char class_npb;
30 | /* common /my_debug/ */
31 | static int debug_vec[8];
32 | /* common /fap/ */
33 | /*static int ir[MAXLEVEL], m1[MAXLEVEL], m2[MAXLEVEL], m3[MAXLEVEL];*/
34 | static int m1[MAXLEVEL+1], m2[MAXLEVEL+1], m3[MAXLEVEL+1];
35 | static int lt, lb;
36 | 
37 | /*c---------------------------------------------------------------------
38 | c  Set at m=1024, can handle cases up to 1024^3 case
39 | c---------------------------------------------------------------------*/
40 | #define	M	1037
41 | 
42 | /* common /buffer/ */
43 | /*static double buff[4][NM2];*/
44 | 


--------------------------------------------------------------------------------
/NPB-SER/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | CLASS=S
 3 | SFILE=config/suite.def
 4 | 
 5 | default: header
 6 | 	@ $(SHELL) sys/print_instructions
 7 | 
 8 | 	       
 9 | MG: mg		       
10 | mg: header	       
11 | 	cd MG; $(MAKE) CLASS=$(CLASS)
12 | 		       
13 | FT: ft		       
14 | ft: header	       
15 | 	cd FT; $(MAKE) CLASS=$(CLASS)
16 | 		       
17 | IS: is		       
18 | is: header	       
19 | 	cd IS; $(MAKE) CLASS=$(CLASS)
20 | 		       
21 | CG: cg		       
22 | cg: header	       
23 | 	cd CG; $(MAKE) CLASS=$(CLASS)
24 | 		       
25 | EP: ep		       
26 | ep: header	       
27 | 	cd EP; $(MAKE) CLASS=$(CLASS)
28 | 
29 | 
30 | # Awk script courtesy cmg@cray.com
31 | suite:
32 | 	@ awk '{ if ($$1 !~ /^#/ &&  NF > 0)                              \
33 | 	printf "make %s CLASS=%s\n", $$1, $$2 }' $(SFILE)  \
34 | 	| $(SHELL)
35 | 
36 | 
37 | # It would be nice to run "make clean" in each subdirectory (the targets
38 | # are defined), but on a really clean system this won't work
39 | # because those makefiles need config/make.def
40 | clean:
41 | 	- rm -f core 
42 | 	- rm -f *~ */core */*~ */*.o */npbparams.hpp */*.obj */*.exe
43 | 	- rm -f sys/setparams sys/makesuite sys/setparams.hpp
44 | 
45 | cleanall: clean
46 | 	- rm -r bin/*
47 | 
48 | veryclean: clean
49 | 	- rm config/make.def config/suite.def Part*
50 | 	- rm bin/mg.* bin/ft.* bin/is.* bin/ep.* bin/cg.*
51 | 
52 | header:
53 | 	@ $(SHELL) sys/print_header
54 | 
55 | kit: 
56 | 	- makekit -s100k -k30 * */* */*/*
57 | 
58 | 
59 | 


--------------------------------------------------------------------------------
/NPB-SER/README.md:
--------------------------------------------------------------------------------
 1 | # Warning: this project is continued at [NPB-CPP](https://github.com/GMAP/NPB-CPP)
 2 | 
 3 | ## We are happy to announce that both NPB Kernels and pseudo-application are available at our new repository [NPB-CPP](https://github.com/GMAP/NPB-CPP).
 4 | 
 5 | This was our first work on NAS Parallel Benchmark (NPB) suite and many other works are now continuing this project in many different ways.
 6 | 
 7 | *Note: this repository will no longer be updated, therefore, follow us at [NPB-CPP](https://github.com/GMAP/NPB-CPP)*
 8 | 
 9 | 
10 | ## How to cite this work
11 | 	
12 | [[DOI]](https://doi.org/10.1109/PDP2018.2018.00120) D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018.
13 | 
14 | ## The NPB-CPP Benchmark
15 | 
16 | These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version.
17 | 
18 | 	==================================================================
19 | 		NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB
20 | 	 												
21 | 			Code contributors: 
22 | 					Dalvan Griebler    		
23 | 					Júnior Löff
24 | 													
25 | 		Warning: in case of problems send an email to us:					
26 | 			dalvan.griebler@acad.pucrs.br			
27 | 			junior.loff@acad.pucrs.br				
28 | 	==================================================================
29 | 
30 | 
31 | This folder contains:
32 | 
33 | 	- NPB-FF - Directory with the parallel version implemented in FastFlow
34 | 	- NPB-OMP - Directory with the parallel version translated from the original NPB version
35 | 	- NPB-SER - Directory with the serial version of the NPB ported to C++
36 | 	- NPB-TBB - Directory with the parallel version implemented in Threading Building Blocks
37 | 
38 | Each directory is independent and contains its own implemented version of the kernels:
39 | 
40 | 	IS - Integer Sort, random memory access
41 | 	EP - Embarrassingly Parallel
42 | 	CG - Conjugate Gradient, irregular memory access and communication
43 | 	MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive
44 | 	FT - discrete 3D fast Fourier Transform, all-to-all communication
45 | 
46 | ## Software Requirements
47 | 
48 | *Warning: our tests were made with GCC-5*
49 | 
50 | **TBB**
51 | 
52 | *Installation*
53 | 
54 | 	apt-get install libtbb-dev
55 | 
56 | **FastFlow** 
57 | 
58 | *Installation*
59 | 
60 | 	svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow
61 | 
62 | 
63 | ## How to Compile 
64 | 
65 | Enter the directory of the desired version and run:
66 | 
67 | 	make _BENCHMARK CLASS=_VERSION
68 | 
69 | 
70 | _BENCHMARKs are: 
71 | 		
72 | 	EP, CG, MG, IS and FT 
73 | 																										
74 | _VERSIONs are: 
75 | 	
76 | 	Class S: small for quick test purposes
77 | 	Class W: workstation size (a 90's workstation; now likely too small)	
78 | 	Classes A, B, C: standard test problems; ~4X size increase going from one class to the next	
79 | 	Classes D, E, F: large test problems; ~16X size increase from each of the previous Classes  
80 | 
81 | 
82 | Command:
83 | 
84 | 	make ep CLASS=B
85 | 


--------------------------------------------------------------------------------
/NPB-SER/bin/README.md:
--------------------------------------------------------------------------------
 1 | # How to Cite our Work
 2 | 	
 3 | D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018.
 4 | 
 5 | # The NPB-CPP Benchmark
 6 | 
 7 | These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version.
 8 | 
 9 | 	==================================================================
10 | 		NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB
11 | 	 												
12 | 			Code contributors: 
13 | 					Dalvan Griebler    		
14 | 					Júnior Löff
15 | 													
16 | 		Warning: in case of problems send an email to us:					
17 | 			dalvan.griebler@acad.pucrs.br			
18 | 			junior.loff@acad.pucrs.br				
19 | 	==================================================================
20 | 
21 | 
22 | This folder contains:
23 | 
24 | 	- NPB-FF - Directory with the parallel version implemented in FastFlow
25 | 	- NPB-OMP - Directory with the parallel version translated from the original NPB version
26 | 	- NPB-SER - Directory with the serial version of the NPB ported to C++
27 | 	- NPB-TBB - Directory with the parallel version implemented in Threading Building Blocks
28 | 
29 | Each directory is independent and contains its own implemented version of the kernels:
30 | 
31 | 	IS - Integer Sort, random memory access
32 | 	EP - Embarrassingly Parallel
33 | 	CG - Conjugate Gradient, irregular memory access and communication
34 | 	MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive
35 | 	FT - discrete 3D fast Fourier Transform, all-to-all communication
36 | 
37 | # Software Requirements
38 | 
39 | *Warning: our tests were made with GCC-5*
40 | 
41 | **TBB**
42 | 
43 | *Installation*
44 | 
45 | 	apt-get install libtbb-dev
46 | 
47 | **FastFlow** 
48 | 
49 | *Installation*
50 | 
51 | 	svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow
52 | 
53 | 
54 | # How to Compile 
55 | 
56 | Enter the directory of the desired version and run:
57 | 
58 | 	make _BENCHMARK CLASS=_VERSION
59 | 
60 | 
61 | _BENCHMARKs are: 
62 | 		
63 | 	EP, CG, MG, IS and FT 
64 | 																										
65 | _VERSIONs are: 
66 | 	
67 | 	Class S: small for quick test purposes
68 | 	Class W: workstation size (a 90's workstation; now likely too small)	
69 | 	Classes A, B, C: standard test problems; ~4X size increase going from one class to the next	
70 | 	Classes D, E, F: large test problems; ~16X size increase from each of the previous Classes  
71 | 
72 | 
73 | Command:
74 | 
75 | 	make ep CLASS=B


--------------------------------------------------------------------------------
/NPB-SER/common/c_print_results.cpp:
--------------------------------------------------------------------------------
 1 | /*****************************************************************/
 2 | /******     C  _  P  R  I  N  T  _  R  E  S  U  L  T  S     ******/
 3 | /*****************************************************************/
 4 | #include <cstdlib>
 5 | #include <cstdio>
 6 | 
 7 | void c_print_results( char   *name, char   class_npb, int    n1, int n2, int n3, int niter, double t,
 8 |   double mops, char   *optype, int    passed_verification, char   *npbversion, char   *compiletime, char   *cc,
 9 |   char   *clink, char   *c_lib, char   *c_inc, char   *cflags, char   *clinkflags, char   *rand)
10 | {
11 | 
12 |     printf( "\n\n %s Benchmark Completed\n", name ); 
13 | 
14 |     printf( " class_npb           =                        %c\n", class_npb );
15 | 
16 |     if( n2 == 0 && n3 == 0 )
17 |         printf( " Size            =             %12d\n", n1 );   /* as in IS */
18 |     else
19 |         printf( " Size            =              %3dx%3dx%3d\n", n1,n2,n3 );
20 | 
21 |     printf( " Iterations      =             %12d\n", niter );
22 |  
23 |     printf( " Time in seconds =             %12.2f\n", t );
24 | 
25 |     printf( " Mop/s total     =             %12.2f\n", mops );
26 | 
27 |     printf( " Operation type  = %24s\n", optype);
28 | 
29 |     if( passed_verification )
30 |         printf( " Verification    =               SUCCESSFUL\n" );
31 |     else
32 |         printf( " Verification    =             UNSUCCESSFUL\n" );
33 | 
34 |     printf( " Version         =             %12s\n", npbversion );
35 | 
36 |     printf( " Compile date    =             %12s\n", compiletime );
37 | 
38 |     printf( "\n Compile options:\n" );
39 | 
40 |     printf( "    CC           = %s\n", cc );
41 | 
42 |     printf( "    CLINK        = %s\n", clink );
43 | 
44 |     printf( "    C_LIB        = %s\n", c_lib );
45 | 
46 |     printf( "    C_INC        = %s\n", c_inc );
47 | 
48 |     printf( "    CFLAGS       = %s\n", cflags );
49 | 
50 |     printf( "    CLINKFLAGS   = %s\n", clinkflags );
51 | 
52 |     printf( "    RAND         = %s\n", rand );
53 | #ifdef SMP
54 |     char *evalue = getenv("MP_SET_NUMTHREADS");
55 |     printf( "   MULTICPUS = %s\n", evalue );
56 | #endif
57 | 
58 | /*    printf( "\n\n" );
59 |     printf( " Please send the results of this run to:\n\n" );
60 |     printf( " NPB Development Team\n" );
61 |     printf( " Internet: npb@nas.nasa.gov\n \n" );
62 |     printf( " If email is not available, send this to:\n\n" );
63 |     printf( " MS T27A-1\n" );
64 |     printf( " NASA Ames Research Center\n" );
65 |     printf( " Moffett Field, CA  94035-1000\n\n" );
66 |     printf( " Fax: 415-604-3957\n\n" );*/
67 | }
68 |  
69 | 


--------------------------------------------------------------------------------
/NPB-SER/common/c_randdp.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | */
  3 | #if defined(USE_POW)
  4 | #define r23 pow(0.5, 23.0)
  5 | #define r46 (r23*r23)
  6 | #define t23 pow(2.0, 23.0)
  7 | #define t46 (t23*t23)
  8 | #else
  9 | #define r23 (0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5)
 10 | #define r46 (r23*r23)
 11 | #define t23 (2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0)
 12 | #define t46 (t23*t23)
 13 | #endif
 14 | 
 15 | /*c---------------------------------------------------------------------
 16 | c---------------------------------------------------------------------*/
 17 | 
 18 | double randlc (double *x, double a) {
 19 | 
 20 | /*c---------------------------------------------------------------------
 21 | c---------------------------------------------------------------------*/
 22 | 
 23 | /*c---------------------------------------------------------------------
 24 | c
 25 | c   This routine returns a uniform pseudorandom double precision number in the
 26 | c   range (0, 1) by using the linear congruential generator
 27 | c
 28 | c   x_{k+1} = a x_k  (mod 2^46)
 29 | c
 30 | c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
 31 | c   before repeating.  The argument A is the same as 'a' in the above formula,
 32 | c   and X is the same as x_0.  A and X must be odd double precision integers
 33 | c   in the range (1, 2^46).  The returned value RANDLC is normalized to be
 34 | c   between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
 35 | c   the new seed x_1, so that subsequent calls to RANDLC using the same
 36 | c   arguments will generate a continuous sequence.
 37 | c
 38 | c   This routine should produce the same results on any computer with at least
 39 | c   48 mantissa bits in double precision floating point data.  On 64 bit
 40 | c   systems, double precision should be disabled.
 41 | c
 42 | c   David H. Bailey     October 26, 1990
 43 | c
 44 | c---------------------------------------------------------------------*/
 45 | 
 46 |     double t1,t2,t3,t4,a1,a2,x1,x2,z;
 47 | 
 48 | /*c---------------------------------------------------------------------
 49 | c   Break A into two parts such that A = 2^23 * A1 + A2.
 50 | c---------------------------------------------------------------------*/
 51 |     t1 = r23 * a;
 52 |     a1 = (int)t1;
 53 |     a2 = a - t23 * a1;
 54 | 
 55 | /*c---------------------------------------------------------------------
 56 | c   Break X into two parts such that X = 2^23 * X1 + X2, compute
 57 | c   Z = A1 * X2 + A2 * X1  (mod 2^23), and then
 58 | c   X = 2^23 * Z + A2 * X2  (mod 2^46).
 59 | c---------------------------------------------------------------------*/
 60 |     t1 = r23 * (*x);
 61 |     x1 = (int)t1;
 62 |     x2 = (*x) - t23 * x1;
 63 |     t1 = a1 * x2 + a2 * x1;
 64 |     t2 = (int)(r23 * t1);
 65 |     z = t1 - t23 * t2;
 66 |     t3 = t23 * z + a2 * x2;
 67 |     t4 = (int)(r46 * t3);
 68 |     (*x) = t3 - t46 * t4;
 69 | 
 70 |     return (r46 * (*x));
 71 | }
 72 | 
 73 | /*c---------------------------------------------------------------------
 74 | c---------------------------------------------------------------------*/
 75 | 
 76 | void vranlc (int n, double *x_seed, double a, double y[]) {
 77 | 
 78 | /*c---------------------------------------------------------------------
 79 | c---------------------------------------------------------------------*/
 80 | 
 81 | /*c---------------------------------------------------------------------
 82 | c
 83 | c   This routine generates N uniform pseudorandom double precision numbers in
 84 | c   the range (0, 1) by using the linear congruential generator
 85 | c
 86 | c   x_{k+1} = a x_k  (mod 2^46)
 87 | c
 88 | c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
 89 | c   before repeating.  The argument A is the same as 'a' in the above formula,
 90 | c   and X is the same as x_0.  A and X must be odd double precision integers
 91 | c   in the range (1, 2^46).  The N results are placed in Y and are normalized
 92 | c   to be between 0 and 1.  X is updated to contain the new seed, so that
 93 | c   subsequent calls to VRANLC using the same arguments will generate a
 94 | c   continuous sequence.  If N is zero, only initialization is performed, and
 95 | c   the variables X, A and Y are ignored.
 96 | c
 97 | c   This routine is the standard version designed for scalar or RISC systems.
 98 | c   However, it should produce the same results on any single processor
 99 | c   computer with at least 48 mantissa bits in double precision floating point
100 | c   data.  On 64 bit systems, double precision should be disabled.
101 | c
102 | c---------------------------------------------------------------------*/
103 | 
104 |     int i;
105 |     double x,t1,t2,t3,t4,a1,a2,x1,x2,z;
106 | 
107 | /*c---------------------------------------------------------------------
108 | c   Break A into two parts such that A = 2^23 * A1 + A2.
109 | c---------------------------------------------------------------------*/
110 |     t1 = r23 * a;
111 |     a1 = (int)t1;
112 |     a2 = a - t23 * a1;
113 |     x = *x_seed;
114 | 
115 | /*c---------------------------------------------------------------------
116 | c   Generate N results.   This loop is not vectorizable.
117 | c---------------------------------------------------------------------*/
118 |     for (i = 1; i <= n; i++) {
119 | 
120 | /*c---------------------------------------------------------------------
121 | c   Break X into two parts such that X = 2^23 * X1 + X2, compute
122 | c   Z = A1 * X2 + A2 * X1  (mod 2^23), and then
123 | c   X = 2^23 * Z + A2 * X2  (mod 2^46).
124 | c---------------------------------------------------------------------*/
125 |         t1 = r23 * x;
126 |         x1 = (int)t1;
127 |         x2 = x - t23 * x1;
128 |         t1 = a1 * x2 + a2 * x1;
129 |         t2 = (int)(r23 * t1);
130 |         z = t1 - t23 * t2;
131 |         t3 = t23 * z + a2 * x2;
132 |         t4 = (int)(r46 * t3);
133 |         x = t3 - t46 * t4;
134 |         y[i] = r46 * x;
135 |     }
136 |     *x_seed = x;
137 | }
138 | 


--------------------------------------------------------------------------------
/NPB-SER/common/c_timers.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | #include "wtime.hpp"
 5 | #include <cstdlib>
 6 | 
 7 | /*  Prototype  */
 8 | void wtime( double * );
 9 | 
10 | 
11 | 
12 | /*****************************************************************/
13 | /******         E  L  A  P  S  E  D  _  T  I  M  E          ******/
14 | /*****************************************************************/
15 | double elapsed_time( void )
16 | {
17 |     double t;
18 | 
19 |     wtime( &t );
20 |     return( t );
21 | }
22 | 
23 | 
24 | double start[64], elapsed[64];
25 | 
26 | /*****************************************************************/
27 | /******            T  I  M  E  R  _  C  L  E  A  R          ******/
28 | /*****************************************************************/
29 | void timer_clear( int n )
30 | {
31 |     elapsed[n] = 0.0;
32 | }
33 | 
34 | 
35 | /*****************************************************************/
36 | /******            T  I  M  E  R  _  S  T  A  R  T          ******/
37 | /*****************************************************************/
38 | void timer_start( int n )
39 | {
40 |     start[n] = elapsed_time();
41 | }
42 | 
43 | 
44 | /*****************************************************************/
45 | /******            T  I  M  E  R  _  S  T  O  P             ******/
46 | /*****************************************************************/
47 | void timer_stop( int n )
48 | {
49 |     double t, now;
50 | 
51 |     now = elapsed_time();
52 |     t = now - start[n];
53 |     elapsed[n] += t;
54 | 
55 | }
56 | 
57 | 
58 | /*****************************************************************/
59 | /******            T  I  M  E  R  _  R  E  A  D             ******/
60 | /*****************************************************************/
61 | double timer_read( int n )
62 | {
63 |     return( elapsed[n] );
64 | }
65 | 
66 | 


--------------------------------------------------------------------------------
/NPB-SER/common/npb-CPP.hpp:
--------------------------------------------------------------------------------
 1 | #include <cstdio>
 2 | #include <cstdlib>
 3 | #include <cmath>
 4 | 
 5 | typedef int boolean;
 6 | typedef struct { double real; double imag; } dcomplex;
 7 | 
 8 | #define TRUE	1
 9 | #define FALSE	0
10 | 
11 | #define max(a,b) (((a) > (b)) ? (a) : (b))
12 | #define min(a,b) (((a) < (b)) ? (a) : (b))
13 | #define	pow2(a) ((a)*(a))
14 | 
/* Complex-number helpers for any struct with `real` and `imag` members.
   Every macro argument is parenthesised so that expression arguments
   (e.g. `x + y` as a scale factor, or `*p` as an operand) expand with the
   intended precedence.
   c is the destination, a and b are the operands; each macro is a single
   comma expression that assigns c.real first and c.imag second.
   NOTE: arguments are evaluated more than once, so they must be free of
   side effects.  In cmul, c must not be the same object as a or b, because
   c.real is overwritten before c.imag is computed from the operands. */
#define get_real(c) ((c).real)
#define get_imag(c) ((c).imag)
/* c = a + b */
#define cadd(c,a,b) ((c).real = (a).real + (b).real, (c).imag = (a).imag + (b).imag)
/* c = a - b */
#define csub(c,a,b) ((c).real = (a).real - (b).real, (c).imag = (a).imag - (b).imag)
/* c = a * b (complex product) */
#define cmul(c,a,b) ((c).real = (a).real * (b).real - (a).imag * (b).imag, \
                     (c).imag = (a).real * (b).imag + (a).imag * (b).real)
/* c = a * b where b is a real scalar */
#define crmul(c,a,b) ((c).real = (a).real * (b), (c).imag = (a).imag * (b))
22 | 
23 | extern double randlc(double *, double);
24 | extern void vranlc(int, double *, double, double *);
25 | extern void timer_clear(int);
26 | extern void timer_start(int);
27 | extern void timer_stop(int);
28 | extern double timer_read(int);
29 | 
30 | extern void c_print_results(char *name, char class_npb, int n1, int n2,
31 | 			    int n3, int niter, double t,
32 | 			    double mops, char *optype, int passed_verification,
33 | 			    char *npbversion, char *compiletime, char *cc,
34 | 			    char *clink, char *c_lib, char *c_inc,
35 | 			    char *cflags, char *clinkflags, char *rand);
36 | 


--------------------------------------------------------------------------------
/NPB-SER/common/wtime.cpp:
--------------------------------------------------------------------------------
 1 | #include "wtime.hpp"
 2 | #include <sys/time.h>
 3 | 
 4 | void wtime(double *t)
 5 | {
 6 |   static int sec = -1;
 7 |   struct timeval tv;
 8 |   gettimeofday(&tv, 0);
 9 |   if (sec < 0) sec = tv.tv_sec;
10 |   *t = (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec;
11 | }
12 | 
13 |     
14 | 


--------------------------------------------------------------------------------
/NPB-SER/common/wtime.hpp:
--------------------------------------------------------------------------------
 1 | /* C/Fortran interface is different on different machines. 
 2 |  * You may need to tweak this.
 3 |  */
 4 | 
 5 | 
 6 | #if defined(IBM)
 7 | #define wtime wtime
 8 | #elif defined(CRAY)
 9 | #define wtime WTIME
10 | #else
11 | #define wtime wtime_
12 | #endif
13 | 


--------------------------------------------------------------------------------
/NPB-SER/common/wtime_sgi64.cpp:
--------------------------------------------------------------------------------
 1 | #include <sys/types.h>
 2 | #include <fcntl.h>
 3 | #include <sys/mman.h>
 4 | #include <sys/syssgi.h>
 5 | #include <sys/immu.h>
 6 | #include <cerrno>
 7 | #include <cstdio>
 8 | 
 9 | /* The following works on SGI Power Challenge systems */
10 | 
11 | typedef unsigned long iotimer_t;
12 | 
13 | unsigned int cycleval;
14 | volatile iotimer_t *iotimer_addr, base_counter;
15 | double resolution;
16 | 
17 | /* address_t is an integer type big enough to hold an address */
18 | typedef unsigned long address_t;
19 | 
20 | 
21 | 
22 | void timer_init() 
23 | {
24 |   /* Map the page holding the SGI cycle counter and record its current value; exits on any failure. */
25 |   int fd;
26 |   char *virt_addr;
27 |   address_t phys_addr, page_offset, pagemask, pagebase_addr;
28 |   pagemask = getpagesize() - 1;
29 |   errno = 0;
30 |   phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval);
31 |   if (errno != 0) {
32 |     perror("SGI_QUERY_CYCLECNTR");
33 |     exit(1);
34 |   }
35 |   /* page_offset = offset of the counter's physical address within its page */
36 |   page_offset = phys_addr & pagemask;
37 |   pagebase_addr = phys_addr - page_offset;
38 |   fd = open("/dev/mmem", O_RDONLY);
39 |   if (fd < 0) { perror("/dev/mmem"); exit(1); }   /* do not hand an invalid descriptor to mmap */
40 |   virt_addr = (char *)mmap(0, pagemask, PROT_READ, MAP_PRIVATE, fd, pagebase_addr);   /* cast: void * -> char * is not implicit in C++ */
41 |   if (virt_addr == (char *)MAP_FAILED) { perror("mmap"); exit(1); }   /* otherwise the read below dereferences MAP_FAILED */
42 |   virt_addr = virt_addr + page_offset;
43 |   iotimer_addr = (iotimer_t *)virt_addr;
44 |   /* cycleval in picoseconds to this gives resolution in seconds */
45 |   resolution = 1.0e-12*cycleval; 
46 |   base_counter = *iotimer_addr;
47 | }
48 | 
49 | void wtime_(double *time) 
50 | {
51 |   /* Stores in *time the counter ticks since base_counter was captured, scaled by resolution. */
52 |   static int timer_ready = 0;
53 |   if (timer_ready == 0) {
54 |     timer_init();   /* first call through this function sets up the counter */
55 |     timer_ready = 1;
56 |   }
57 |   volatile iotimer_t ticks = *iotimer_addr - base_counter;
58 |   *time = resolution * (double)ticks;
59 | }
60 | 
61 | 
62 | void wtime(double *time) 
63 | {
64 |   /*
65 |    * Unsuffixed entry point for the same timer as wtime_().  It forwards so
66 |    * that only one "initialized" flag exists: with a private flag here, the
67 |    * first call through this name would run timer_init() again even after
68 |    * wtime_() had done so, mapping /dev/mmem a second time and resetting
69 |    * base_counter, which moves the zero point of every later reading.
70 |    */
71 |   wtime_(time);
72 | }
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/NPB-SER/config/make.def:
--------------------------------------------------------------------------------
  1 | #---------------------------------------------------------------------------
  2 | #
  3 | #                SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. 
  4 | #
  5 | #---------------------------------------------------------------------------
  6 | 
  7 | #---------------------------------------------------------------------------
  8 | # Items in this file will need to be changed for each platform.
  9 | # (Note these definitions are inconsistent with NPB2.1.)
 10 | #---------------------------------------------------------------------------
 11 | 
 12 | #---------------------------------------------------------------------------
 13 | # Parallel C:
 14 | #
 15 | # CC         - C compiler 
 16 | # CFLAGS     - C compilation arguments
 17 | # C_INC      - any -I arguments required for compiling C 
 18 | # CLINK      - C linker
 19 | # CLINKFLAGS - C linker flags
 20 | # C_LIB      - any -L and -l arguments required for linking C 
 21 | #
 22 | # compilations are done with $(CC) $(C_INC) $(CFLAGS) or
 23 | #                            $(CC) $(CFLAGS)
 24 | # linking is done with       $(CLINK) $(C_LIB) $(CLINKFLAGS)
 25 | #---------------------------------------------------------------------------
 26 | 
 27 | #---------------------------------------------------------------------------
 28 | # This is the C++ compiler used to build the benchmarks
 29 | #---------------------------------------------------------------------------
 30 | CC = g++ -std=c++14
 31 | #gcc #cc
 32 | # This links C programs; usually the same as ${CC}
 33 | CLINK	= $(CC)
 34 | 
 35 | #---------------------------------------------------------------------------
 36 | # These macros are passed to the linker 
 37 | #---------------------------------------------------------------------------
 38 | C_LIB  = -lm 
 39 | 
 40 | #---------------------------------------------------------------------------
 41 | # These macros are passed to the compiler 
 42 | #---------------------------------------------------------------------------
 43 | C_INC = -I../common 
 44 | 
 45 | #---------------------------------------------------------------------------
 46 | # Global *compile time* flags for C programs
 47 | #---------------------------------------------------------------------------
 48 | CFLAGS	= -O3
 49 | # CFLAGS = -g
 50 | 
 51 | #---------------------------------------------------------------------------
 52 | # Global *link time* flags. Flags for increasing maximum executable 
 53 | # size usually go here. 
 54 | #---------------------------------------------------------------------------
 55 | CLINKFLAGS = -O3
 56 | 
 57 | 
 58 | #---------------------------------------------------------------------------
 59 | # Utilities C:
 60 | #
 61 | # This is the C compiler used to compile C utilities.  Flags required by 
 62 | # this compiler go here also; typically there are few flags required; hence 
 63 | # there are no separate macros provided for such flags.
 64 | #---------------------------------------------------------------------------
 65 | UCC	= cc
 66 | 
 67 | 
 68 | #---------------------------------------------------------------------------
 69 | # Destination of executables, relative to subdirs of the main directory. . 
 70 | #---------------------------------------------------------------------------
 71 | BINDIR	= ../bin
 72 | 
 73 | 
 74 | #---------------------------------------------------------------------------
 75 | # The variable RAND controls which random number generator 
 76 | # is used. It is described in detail in Doc/README.install. 
 77 | # Use "randi8" unless there is a reason to use another one. 
 78 | # Other allowed values are "randi8_safe", "randdp" and "randdpvec"
 79 | #---------------------------------------------------------------------------
 80 | # RAND   = randi8
 81 | # The following is highly reliable but may be slow:
 82 | RAND   = randdp
 83 | 
 84 | 
 85 | #---------------------------------------------------------------------------
 86 | # The variable WTIME is the name of the wtime source code module in the
 87 | # common directory.  
 88 | # For most machines,       use wtime.cpp
 89 | # For SGI power challenge: use wtime_sgi64.cpp
 90 | #---------------------------------------------------------------------------
 91 | WTIME  = wtime.cpp
 92 | 
 93 | 
 94 | #---------------------------------------------------------------------------
 95 | # Enable if either Cray or IBM: 
 96 | # (no such flag for most machines: see common/wtime.hpp)
 97 | # This is used by the compiler to pass the machine name to common/wtime.hpp,
 98 | # where the C/Fortran binding interface format is determined
 99 | #---------------------------------------------------------------------------
100 | # MACHINE	=	-DCRAY
101 | # MACHINE	=	-DIBM
102 | 
103 | 
104 | 


--------------------------------------------------------------------------------
/NPB-SER/config/suite.def:
--------------------------------------------------------------------------------
 1 | # config/suite.def
 2 | # This file is used to build several benchmarks with a single command. 
 3 | # Typing "make suite" in the main directory will build all the benchmarks
 4 | # specified in this file. 
 5 | # Each line of this file contains a benchmark name, class, and number
 6 | # of nodes. The name is one of "cg", "is", "ep", "mg", "ft". 
 7 | # The class is one of "S", "W", "A", "B", and "C". 
 8 | # No blank lines. 
 9 | # The following example builds serial sample sizes of all benchmarks. 
10 | ft	B
11 | mg	B
12 | is	B
13 | ep	B
14 | cg	B
15 | 


--------------------------------------------------------------------------------
/NPB-SER/sys/Makefile:
--------------------------------------------------------------------------------
 1 | include ../config/make.def
 2 | 
 3 | # Note that COMPILE is also defined in make.common and should
 4 | # be the same. We can't include make.common because it has a lot
 5 | # of other garbage. 
 6 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS)
 7 | 
 8 | all: setparams 
 9 | 
10 | # setparams creates an npbparams.hpp file for each benchmark 
11 | # configuration. npbparams.hpp also contains info about how a benchmark
12 | # was compiled and linked
13 | 
14 | setparams: setparams.cpp ../config/make.def
15 | 	$(UCC) -o setparams setparams.cpp
16 | 
17 | 
18 | clean: 
19 | 	-rm -f setparams setparams.hpp npbparams.hpp
20 | 	-rm -f *~ *.o
21 | 
22 | 


--------------------------------------------------------------------------------
/NPB-SER/sys/README:
--------------------------------------------------------------------------------
 1 | This directory contains utilities and files used by the 
 2 | build process. You should not need to change anything
 3 | in this directory. 
 4 | 
 5 | Original Files
 6 | --------------
 7 | setparams.c:
 8 |         Source for the setparams program. This program is used internally
 9 |         in the build process to create the file "npbparams.h" for each 
10 |         benchmark. npbparams.h contains Fortran or C parameters to build a 
11 |         benchmark for a specific class. The setparams program is never run 
12 |         directly by a user. Its invocation syntax is 
13 | 
14 |             "setparams benchmark-name class". 
15 | 
16 |         It examines the file "npbparams.h" in the current directory. If 
17 |         the specified parameters are the same as those in the npbparams.h 
18 |         file, nothing is changed. If the file does not exist or corresponds 
19 |         to a different class/number of nodes, it is (re)built. 
20 | 	One of the more complicated things in npbparams.h is that it 
21 |         contains, in a Fortran string, the compiler flags used to build a 
22 |         benchmark, so that a benchmark can print out how it was compiled. 
23 | 
24 | make.common
25 |         A makefile segment that is included in each individual benchmark
26 |         program makefile. It sets up some standard macros (COMPILE, etc) 
27 |         and makes sure everything is configured correctly (npbparams.h)
28 | 
29 | Makefile
30 |         Builds  setparams
31 | 
32 | README
33 |         This file. 
34 | 
35 | 
36 | Created files
37 | -------------
38 | 
39 | setparams
40 | 	See descriptions above
41 | 
42 | 


--------------------------------------------------------------------------------
/NPB-SER/sys/make.common:
--------------------------------------------------------------------------------
 1 | PROGRAM  = $(BINDIR)/$(BENCHMARK).$(CLASS)
 2 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS)
 3 | CCOMPILE = $(CC)  -c $(C_INC) $(CFLAGS)
 4 | 
 5 | # Class "U" is used internally by the setparams program to mean
 6 | # "unknown". This means that if you don't specify CLASS=
 7 | # on the command line, you'll get an error. It would be nice
 8 | # to be able to avoid this, but we'd have to get information
 9 | # from the setparams back to the make program, which isn't easy. 
10 | CLASS=U
11 | 
12 | default:: ${PROGRAM}
13 | 
14 | # This makes sure the configuration utility setparams 
15 | # is up to date. 
16 | # Note that this must be run every time, which is why the
17 | # target does not exist and is not created. 
18 | # If you create a file called "config" you will break things. 
19 | config:
20 | 	@cd ../sys; ${MAKE} all
21 | 	../sys/setparams ${BENCHMARK} ${CLASS}
22 | 
23 | COMMON=../common
24 | ${COMMON}/${RAND}.o: ${COMMON}/${RAND}.f
25 | 	cd ${COMMON}; ${FCOMPILE} ${RAND}.f
26 | 
27 | ${COMMON}/c_${RAND}.o: ${COMMON}/c_${RAND}.cpp
28 | 	cd ${COMMON}; ${CCOMPILE} c_${RAND}.cpp
29 | 
30 | ${COMMON}/print_results.o: ${COMMON}/print_results.f
31 | 	cd ${COMMON}; ${FCOMPILE} print_results.f
32 | 
33 | ${COMMON}/c_print_results.o: ${COMMON}/c_print_results.cpp
34 | 	cd ${COMMON}; ${CCOMPILE} c_print_results.cpp
35 | 
36 | ${COMMON}/timers.o: ${COMMON}/timers.f
37 | 	cd ${COMMON}; ${FCOMPILE} timers.f
38 | 
39 | ${COMMON}/c_timers.o: ${COMMON}/c_timers.cpp
40 | 	cd ${COMMON}; ${CCOMPILE} c_timers.cpp
41 | 
42 | ${COMMON}/wtime.o: ${COMMON}/${WTIME}
43 | 	cd ${COMMON}; ${CCOMPILE} ${MACHINE} -o wtime.o ${COMMON}/${WTIME}
44 | # -o wtime.o makes the object name match this target whatever source file
45 | # WTIME names; without it the compiler names the object after the source.
46 | # MACHINE may carry -DCRAY or -DIBM.  For a precise timer on an SGI Power
47 | # Challenge, set WTIME = wtime_sgi64.cpp in ../config/make.def.
48 | 
49 | ${COMMON}/c_wtime.o: ${COMMON}/${WTIME}
50 | 	cd ${COMMON}; ${CCOMPILE} -o c_wtime.o ${COMMON}/${WTIME}
51 | 
52 | 
53 | # Normally setparams updates npbparams.h only if the settings (CLASS)
54 | # have changed. However, we also want to update if the compile options
55 | # may have changed (set in ../config/make.def). 
56 | npbparams.hpp: ../config/make.def
57 | 	@ echo make.def modified. Rebuilding npbparams.hpp just in case
58 | 	rm -f npbparams.hpp
59 | 	../sys/setparams ${BENCHMARK} ${CLASS}
60 | 
61 | # So that "make benchmark-name" works
62 | ${BENCHMARK}:  default
63 | ${BENCHMARKU}: default
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/NPB-SER/sys/print_header:
--------------------------------------------------------------------------------
 1 | echo ''
 2 | echo '   ========================================='
 3 | echo '   =      NAS Parallel Benchmarks 	 ='
 4 | echo '   =      Serial C++ Versions              ='
 5 | echo '   =      Developed by: Dalvan Griebler    ='
 6 | echo '   =                    Júnior Löff        ='
 7 | echo '   =                                       ='
 8 | echo '   =      Warning: in case of problems     ='
 9 | echo '   =      send an email to us:             ='
10 | echo '   =      dalvan.griebler@acad.pucrs.br    ='
11 | echo '   =      junior.loff@acad.pucrs.br        ='
12 | echo '   ========================================='
13 | echo ''
14 | 


--------------------------------------------------------------------------------
/NPB-SER/sys/print_instructions:
--------------------------------------------------------------------------------
 1 | echo ''
 2 | echo '   To make a NAS benchmark type '
 3 | echo ''
 4 | echo '         make <benchmark-name> CLASS=<class>'
 5 | echo ''
 6 | echo '   where <benchmark-name> is "cg", "ep", "ft", "is", or "mg"'
 7 | echo '         <class>          is "S", "W", "A", "B" or "C"'
 8 | echo ''
 9 | echo '   To make a set of benchmarks, create the file config/suite.def'
10 | echo '   according to the instructions in config/suite.def.template and type'
11 | echo ''
12 | echo '         make suite'
13 | echo ''
14 | echo ' ***************************************************************'
15 | echo ' * Remember to edit the file config/make.def for site specific *'
16 | echo ' * information as described in the README file                 *'
17 | echo ' ***************************************************************'
18 | 
19 | 


--------------------------------------------------------------------------------
/NPB-TBB/CG/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=cg
 3 | BENCHMARKU=CG
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = cg.o ${COMMON}/c_print_results.o  \
 8 |        ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | cg.o:		cg.cpp  npbparams.hpp
16 | 	${CCOMPILE} cg.cpp
17 | 
18 | clean:
19 | 	- rm -f *.o *~ 
20 | 	- rm -f npbparams.hpp core
21 | 
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/NPB-TBB/EP/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=ep
 3 | BENCHMARKU=EP
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = ep.o ${COMMON}/c_print_results.o ${COMMON}/c_${RAND}.o \
 8 |        ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | 
16 | ep.o:		ep.cpp npbparams.hpp
17 | 	${CCOMPILE} ep.cpp
18 | 
19 | clean:
20 | 	- rm -f *.o *~ 
21 | 	- rm -f npbparams.hpp core
22 | 
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/NPB-TBB/EP/ep.cpp:
--------------------------------------------------------------------------------
  1 | /*--------------------------------------------------------------------
  2 | 
  3 |     Information on NAS Parallel Benchmarks is available at:
  4 | 
  5 |     http://www.nas.nasa.gov/Software/NPB/
  6 | 
  7 |     Authors: P. O. Frederickson
  8 |            D. H. Bailey
  9 |            A. C. Woo
 10 | 
 11 |     CPP and TBB version:
 12 |             Dalvan Griebler <dalvangriebler@gmail.com>
 13 |             Júnior Löff <loffjh@gmail.com>
 14 | 
 15 | --------------------------------------------------------------------*/
 16 | 
 17 | 
 18 | 
 19 | #include <tbb/parallel_for.h>
 20 | #include <tbb/blocked_range.h>
 21 | #include <tbb/task_scheduler_init.h>
 22 | #include <tbb/mutex.h>
 23 | #include "npbparams.hpp"
 24 | #include <iostream>
 25 | #include <../common/npb-CPP.hpp>
 26 | 
 27 | /* parameters */
 28 | #define	MK		16
 29 | #define	MM		(M - MK)
 30 | #define	NN		(1 << MM)
 31 | #define	NK		(1 << MK)
 32 | #define	NQ		10
 33 | #define EPSILON		1.0e-8
 34 | #define	A		1220703125.0
 35 | #define	S		271828183.0
 36 | #define	TIMERS_ENABLED	FALSE
 37 | 
 38 | /* global variables */
 39 | /* common /storage/ */
 40 | static double x[2*NK];
 41 | static double q[NQ];
 42 | 
 43 | /*--------------------------------------------------------------------
 44 |       program EMBAR
 45 | c-------------------------------------------------------------------*/
 46 | /*
 47 | c   TBB version of APP Benchmark 1, the "embarrassingly parallel"
 48 | c   benchmark: the NN batches are spread over tbb::parallel_for and each
 49 | c   task's partial sums and counts are merged into sx, sy and q[] under a mutex.
 50 | c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
 51 | c   numbers.  MK is the Log_2 of the size of each batch of uniform random
 52 | c   numbers.  MK can be set for convenience on a given system, since it does
 53 | c   not affect the results.  TBB_NUM_THREADS selects the worker count.
 54 | */
 55 | int main(int argc, char **argv) {
 56 |     double Mops, t1, sx, sy, tm, an, gc;
 57 |     double dum[3] = { 1.0, 1.0, 1.0 };
 58 |     int np, i, nit, k_offset, j;
 59 |     boolean verified;
 60 |     char size[13+1] = { 0 };	/* character*13; zeroed so the scan below never reads an unset byte */
 61 | 
 62 |     /* Worker count: TBB_NUM_THREADS if set, otherwise one worker. */
 63 |     int num_workers = 1;
 64 |     if(const char * nw = std::getenv("TBB_NUM_THREADS")) {
 65 |         num_workers = atoi(nw);
 66 |         /* atoi() returns 0 for non-numeric text; TBB accepts only a positive count or its two sentinel values. */
 67 |         if (num_workers < 1 && num_workers != tbb::task_scheduler_init::automatic && num_workers != tbb::task_scheduler_init::deferred) num_workers = 1;
 68 |     }
 69 |     tbb::task_scheduler_init init(num_workers);
 70 |     tbb::mutex critical_section;
 71 | 
 72 |     /*
 73 |     c   Because the size of the problem is too large to store in a 32-bit
 74 |     c   integer for some classes, we put it into a string (for printing).
 75 |     c   Have to strip off the decimal point put in there by the floating
 76 |     c   point print statement (internal file)
 77 |     */
 78 | 
 79 |     printf("NAS Parallel Benchmarks 4.0 TBB C++ version"" - EP Benchmark\n");
 80 |     printf("Developed by: Dalvan Griebler <dalvan.griebler@acad.pucrs.br> & Júnior Löff <loffjh@gmail.com>\n\n");
 81 |     sprintf(size, "%12.0f", pow(2.0, M+1));
 82 |     for (j = 13; j >= 1; j--) {
 83 |         if (size[j] == '.') size[j] = ' ';
 84 |     }
 85 |     printf(" Number of random numbers generated: %13s\n", size);
 86 | 
 87 |     verified = FALSE;
 88 | 
 89 |     /*
 90 |     c   Compute the number of "batches" of random number pairs generated
 91 |     c   per processor. Adjust if the number of processors does not evenly
 92 |     c   divide the total number
 93 |     */
 94 |     np = NN;
 95 | 
 96 |     /*
 97 |     c   Call the random number generator functions and initialize
 98 |     c   the x-array to reduce the effects of paging on the timings.
 99 |     c   Also, call all mathematical functions that are used. Make
100 |     c   sure these initializations cannot be eliminated as dead code.
101 |     */
102 |     vranlc(0, &(dum[0]), dum[1], &(dum[2]));
103 |     dum[0] = randlc(&(dum[1]), dum[2]);
104 |     for (i = 0; i < 2*NK; i++) x[i] = -1.0e99;
105 |     Mops = log(sqrt(fabs(max(1.0, 1.0))));
106 | 
107 | 
108 | 
109 |     timer_clear(1);
110 |     timer_clear(2);
111 |     timer_clear(3);
112 | 
113 |     timer_start(1);
114 | 
115 |     vranlc(0, &t1, A, x);
116 | 
117 |     /*   Compute AN = A ^ (2 * NK) (mod 2^46). */
118 | 
119 |     t1 = A;
120 | 
121 |     for ( i = 1; i <= MK+1; i++) {
122 |         an = randlc(&t1, t1);
123 |     }
124 | 
125 |     an = t1;
126 |     gc = 0.0;
127 |     sx = 0.0;
128 |     sy = 0.0;
129 | 
130 |     for ( i = 0; i <= NQ - 1; i++) {
131 |         q[i] = 0.0;
132 |     }
133 | 
134 |     /*
135 |     c   Each instance of this loop may be performed independently. We compute
136 |     c   the k offsets separately to take into account the fact that some nodes
137 |     c   have more numbers to generate than others
138 |     */
139 |     k_offset = -1;
140 | 
141 |     tbb::parallel_for(tbb::blocked_range<size_t>(1,np+1),[&](const tbb::blocked_range<size_t>& r){
142 |         double t2, t3, t4, x1, x2;
143 |         int kk, ik, l;
144 |         double qq[NQ];		/* private copy of q[0:NQ-1] */
145 |         double sx_tbb, sy_tbb;
146 |         double x[(2*NK)+1];	/* per-task batch buffer; hides the file-scope x */
147 | 
148 |         for (int i = 0; i < NQ; i++)
149 |             qq[i] = 0.0;
150 |         
151 |         sx_tbb = 0.0;
152 |         sy_tbb = 0.0;
153 | 
154 |         for(int k=r.begin(); k != r.end(); k++){
155 |             kk = k_offset + k;
156 |             double t1 = S;
157 |             t2 = an;
158 | 
159 |             /*  Find starting seed t1 for this kk. */
160 | 
161 |             for (int i = 1; i <= 100; i++) {
162 |                 ik = kk / 2;
163 |                 if (2 * ik != kk) t3 = randlc(&t1, t2);
164 |                 if (ik == 0) break;
165 |                 t3 = randlc(&t2, t2);
166 |                 kk = ik;
167 |             }
168 | 
169 |             /*      Compute uniform pseudorandom numbers. */
170 | 
171 |             if (TIMERS_ENABLED == TRUE) timer_start(3);
172 |             vranlc(2*NK, &t1, A, x);
173 |             if (TIMERS_ENABLED == TRUE) timer_stop(3);
174 | 
175 |             /*
176 |             c       Compute Gaussian deviates by acceptance-rejection method and
177 |             c       tally counts in concentric square annuli.  This loop is not
178 |             c       vectorizable.
179 |             */
180 |             if (TIMERS_ENABLED == TRUE) timer_start(2);
181 | 
182 |             for (int i = 1; i <= NK; i++) {
183 |                 x1 = 2.0 * x[2*i-1] - 1.0;
184 |                 x2 = 2.0 * x[2*i] - 1.0;
185 |                 t1 = pow2(x1) + pow2(x2);
186 |                 if (t1 <= 1.0) {
187 |                     t2 = sqrt(-2.0 * log(t1) / t1);
188 |                     t3 = (x1 * t2);				/* Xi */
189 |                     t4 = (x2 * t2);				/* Yi */
190 |                     l = max(fabs(t3), fabs(t4));
191 |                     qq[l] += 1.0;				/* counts */
192 |                     sx_tbb = sx_tbb + t3;		/* sum of Xi */
193 |                     sy_tbb = sy_tbb + t4;		/* sum of Yi */
194 |                 }
195 |             }
196 |             if (TIMERS_ENABLED == TRUE) timer_stop(2);
197 | 
198 |         }
199 | 
200 |         /* Fold this task's partial results into the shared totals; the scoped_lock releases the mutex when the lambda returns. */
201 |         tbb::mutex::scoped_lock guard(critical_section);
202 |         for (int i = 0; i < NQ; i++){
203 |             q[i] += qq[i];
204 |         }
205 |         sx += sx_tbb;
206 |         sy += sy_tbb;
207 |     
208 |     });
209 | 
210 |     for (i = 0; i <= NQ-1; i++) {
211 |         gc = gc + q[i];
212 |     }
213 | 
214 |     timer_stop(1);
215 | 
216 |     tm = timer_read(1);
217 | 
218 | 
219 |     nit = 0;
220 |     if (M == 24) {
221 |         if((fabs((sx- (-3.247834652034740e3))/-3.247834652034740e3) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/-6.958407078382297e3) <= EPSILON)) {
222 |             verified = TRUE;
223 |         }
224 |     } else if (M == 25) {
225 |         if ((fabs((sx- (-2.863319731645753e3))/-2.863319731645753e3) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/-6.320053679109499e3) <= EPSILON)) {
226 |             verified = TRUE;
227 |         }
228 |     } else if (M == 28) {
229 |         //if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
230 |         if ((fabs((sx- (-4.295875165629892e3))/-4.295875165629892e3) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/-1.580732573678431e4) <= EPSILON)) {
231 |             verified = TRUE;
232 |         }
233 |     } else if (M == 30) {
234 |         if ((fabs((sx- (4.033815542441498e4))/4.033815542441498e4) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/-2.660669192809235e4) <= EPSILON)) {
235 |             verified = TRUE;
236 |         }
237 |     } else if (M == 32) {
238 |         if ((fabs((sx- (4.764367927995374e4))/4.764367927995374e4) <= EPSILON) && (fabs((sy- (-8.084072988043731e4))/-8.084072988043731e4) <= EPSILON)) {
239 |             verified = TRUE;
240 |         }
241 |     } else if (M == 36) {
242 |         if ((fabs((sx- (1.982481200946593e5))/1.982481200946593e5) <= EPSILON) && (fabs((sy- (-1.020596636361769e5))/-1.020596636361769e5) <= EPSILON)) {
243 |             verified = TRUE;
244 |         }
245 |     } else if (M == 40) {
246 |         if ((fabs((sx- (-5.319717441530e5))/-5.319717441530e5) <= EPSILON) && (fabs((sy- (-3.688834557731e5))/-3.688834557731e5) <= EPSILON)) {
247 |             verified = TRUE;
248 |         }
249 |     }
250 | 
251 |     Mops = pow(2.0, M+1)/tm/1000000.0;
252 | 
253 |     printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. Gaussian Pairs = %15.0f\n"
254 |            "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy);
255 |     for (i = 0; i  <= NQ-1; i++) {
256 |         printf("%3d %15.0f\n", i, q[i]);
257 |     }
258 | 
259 |     c_print_results((char*)"EP", CLASS, M+1, 0, 0, nit, tm, Mops, (char*)"Random numbers generated",
260 |                     verified, (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7);
261 | 
262 |     if (TIMERS_ENABLED == TRUE) {
263 |         printf("Total time:     %f\n", timer_read(1));
264 |         printf("Gaussian pairs: %f\n", timer_read(2));
265 |         printf("Random numbers: %f\n", timer_read(3));
266 |     }
267 |     return 0;
268 | }
269 | 


--------------------------------------------------------------------------------
/NPB-TBB/FT/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=ft
 3 | BENCHMARKU=FT
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = ft.o ${COMMON}/c_${RAND}.o ${COMMON}/c_print_results.o \
 8 |        ${COMMON}/c_timers.o ${COMMON}/c_wtime.o #../omp-prof.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | ft.o:             ft.cpp  global.hpp npbparams.hpp
16 | 	${CCOMPILE} ft.cpp
17 | 
18 | clean:
19 | 	- rm -f *.o *~ mputil*
20 | 	- rm -f ft npbparams.hpp core
21 | 


--------------------------------------------------------------------------------
/NPB-TBB/FT/global.hpp:
--------------------------------------------------------------------------------
  1 | #include "npbparams.hpp"
  2 | 
  3 | 
  4 | /*
  5 | c If processor array is 1x1 -> 0D grid decomposition
  6 | 
  7 | 
  8 | c Cache blocking params. These values are good for most
  9 | c RISC processors.  
 10 | c FFT parameters:
 11 | c  fftblock controls how many ffts are done at a time. 
 12 | c  The default is appropriate for most cache-based machines
 13 | c  On vector machines, the FFT can be vectorized with vector
 14 | c  length equal to the block size, so the block size should
 15 | c  be as large as possible. This is the size of the smallest
 16 | c  dimension of the problem: 128 for class A, 256 for class B and
 17 | c  512 for class C.
 18 | */
 19 | 
 20 | #define	FFTBLOCK_DEFAULT	16
 21 | #define	FFTBLOCKPAD_DEFAULT	18
 22 | 
 23 | #define FFTBLOCK	FFTBLOCK_DEFAULT
 24 | #define FFTBLOCKPAD	FFTBLOCKPAD_DEFAULT
 25 | 
 26 | /* COMMON block: blockinfo */
 27 | int fftblock;
 28 | int fftblockpad;
 29 |       
 30 | /*
 31 | c we need a bunch of logic to keep track of how
 32 | c arrays are laid out. 
 33 | 
 34 | 
 35 | c Note: this serial version is derived from the parallel 0D case
 36 | c of the ft NPB.
 37 | c The computation proceeds logically as
 38 | 
 39 | c set up initial conditions
 40 | c fftx(1)
 41 | c transpose (1->2)
 42 | c ffty(2)
 43 | c transpose (2->3)
 44 | c fftz(3)
 45 | c time evolution
 46 | c fftz(3)
 47 | c transpose (3->2)
 48 | c ffty(2)
 49 | c transpose (2->1)
 50 | c fftx(1)
 51 | c compute residual(1)
 52 | 
 53 | c for the 0D, 1D, 2D strategies, the layouts look like xxx
 54 | c        
 55 | c            0D        1D        2D
 56 | c 1:        xyz       xyz       xyz
 57 | c 2:        xyz       xyz       yxz
 58 | c 3:        xyz       zyx       zxy
 59 | 
 60 | c the array dimensions are stored in dims(coord, phase)
 61 | */
 62 | 
 63 | /* COMMON block: layout */
 64 | static int dims[3][3];
 65 | static int xstart[3];
 66 | static int ystart[3];
 67 | static int zstart[3];
 68 | static int xend[3];
 69 | static int yend[3];
 70 | static int zend[3];
 71 | 
 72 | #define	T_TOTAL		0
 73 | #define	T_SETUP		1
 74 | #define	T_FFT		2
 75 | #define	T_EVOLVE	3
 76 | #define	T_CHECKSUM	4
 77 | #define	T_FFTLOW	5
 78 | #define	T_FFTCOPY	6
 79 | #define	T_MAX		7
 80 | 
 81 | #define	TIMERS_ENABLED	FALSE
 82 | 
 83 | /* other stuff */
 84 | 
 85 | #define	SEED	314159265.0
 86 | #define	A	1220703125.0
 87 | #define	PI	3.141592653589793238
 88 | #define	ALPHA	1.0e-6
 89 | 
 90 | #define	EXPMAX	(NITER_DEFAULT*(NX*NX/4+NY*NY/4+NZ*NZ/4))
 91 | 
 92 | /* COMMON block: excomm */
 93 | static double ex[EXPMAX+1];	/* ex(0:expmax) */
 94 | 
 95 | /*
 96 | c roots of unity array
 97 | c relies on x being largest dimension?
 98 | */
 99 | 
100 | /* COMMON block: ucomm */
101 | static dcomplex u[NX];
102 | 
103 | /* for checksum data */
104 | 
105 | /* COMMON block: sumcomm */
106 | static dcomplex sums[NITER_DEFAULT+1]; /* sums(0:niter_default) */
107 | 
108 | /* number of iterations*/
109 | 
110 | /* COMMON block: iter */
111 | static int niter;
112 | 
113 | 


--------------------------------------------------------------------------------
/NPB-TBB/IS/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=is
 3 | BENCHMARKU=IS
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | include ../sys/make.common
 8 | 
 9 | OBJS = is.o \
10 |        ${COMMON}/c_print_results.o \
11 |        ${COMMON}/c_timers.o \
12 |        ${COMMON}/c_wtime.o
13 | 
14 | 
15 | ${PROGRAM}: config ${OBJS}
16 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
17 | 
18 | .c.o:
19 | 	${CCOMPILE} $<
20 | 
21 | is.o:             is.cpp  npbparams.hpp
22 | 	${CCOMPILE} is.cpp
23 | 
24 | 
25 | clean:
26 | 	- rm -f *.o *~ mputil*
27 | 	- rm -f npbparams.hpp core
28 | 	- if [ -d rii_files ]; then rm -r rii_files; fi
29 | 


--------------------------------------------------------------------------------
/NPB-TBB/MG/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | BENCHMARK=mg
 3 | BENCHMARKU=MG
 4 | 
 5 | include ../config/make.def
 6 | 
 7 | OBJS = mg.o ${COMMON}/c_print_results.o  \
 8 |        ${COMMON}/c_${RAND}.o ${COMMON}/c_timers.o ${COMMON}/c_wtime.o
 9 | 
10 | include ../sys/make.common
11 | 
12 | ${PROGRAM}: config ${OBJS}
13 | 	${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${C_LIB}
14 | 
15 | mg.o:		mg.cpp npbparams.hpp
16 | 	${CCOMPILE} mg.cpp
17 | 
18 | clean:
19 | 	- rm -f *.o *~ 
20 | 	- rm -f npbparams.hpp core
21 | 
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/NPB-TBB/MG/globals.hpp:
--------------------------------------------------------------------------------
 1 | /*--------------------------------------------------------------------
 2 | c  Compile-time sizes and file-scope state for the MG benchmark.
 3 | c
 4 | c  LM, NDIM1, NDIM2 and NDIM3 are not defined in this header; they are
 5 | c  expected to come from the generated "npbparams.hpp" included below.
 6 | c  LM is the log-base2 of the largest edge size of a partition, e.g.
 7 | c  LM=7 means the largest dimension that can be solved is 2^7 = 128.
 8 | c  NDIM1, NDIM2, NDIM3 are the log-base2 local problem dimensions.
 9 | c-------------------------------------------------------------------*/
10 | 
11 | #include "npbparams.hpp"
12 | 
13 | /* parameters */
14 | /* actual dimension including ghost cells for communications */
15 | #define	NM	(2+(2<<(LM-1)))
16 | /* size of rhs array: product of the three padded extents (2+2^NDIMk) */
17 | #define	NV	((2+(2<<(NDIM1-1)))*(2+(2<<(NDIM2-1)))*(2+(2<<(NDIM3-1))))
18 | /* size of residual array */
19 | #define	NR	((8*(NV+(NM*NM)+5*NM+7*LM))/7)
20 | /* size of communication buffer */
21 | #define	NM2	(2*NM*NM)
22 | /* maximum number of levels */
23 | #define	MAXLEVEL	11
24 | 
25 | /*---------------------------------------------------------------------*/
26 | /* common /mg3/ : one slot per level index 0..MAXLEVEL */
27 | static int nx[MAXLEVEL+1], ny[MAXLEVEL+1], nz[MAXLEVEL+1];
28 | /* common /ClassType/ */
29 | static char class_npb;
30 | /* common /my_debug/ */
31 | static int debug_vec[8];
32 | /* common /fap/ : one slot per level index 0..MAXLEVEL */
33 | /*static int ir[MAXLEVEL], m1[MAXLEVEL], m2[MAXLEVEL], m3[MAXLEVEL];*/
34 | static int m1[MAXLEVEL+1], m2[MAXLEVEL+1], m3[MAXLEVEL+1];
35 | static int lt, lb;
36 | 
37 | /*c---------------------------------------------------------------------
38 | c  Set at m=1024, can handle cases up to 1024^3 case
39 | c  NOTE(review): M below is 1037, not 1024 -- confirm the intent.      */
40 | #define	M	1037
41 | 
42 | /* common /buffer/ */
43 | /*static double buff[4][NM2];*/
44 | 


--------------------------------------------------------------------------------
/NPB-TBB/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL=/bin/sh
 2 | CLASS=S
 3 | SFILE=config/suite.def
 4 | .PHONY: default BT bt SP sp LU lu MG mg FT ft IS is CG cg EP ep DC dc suite clean cleanall veryclean header kit
 5 | default: header
 6 | 	@ $(SHELL) sys/print_instructions
 7 | # Benchmark targets: build in the benchmark directory for CLASS; "&&" skips make when cd fails.
 8 | BT: bt
 9 | bt: header
10 | 	cd BT && $(MAKE) CLASS=$(CLASS)
11 | # CLASS defaults to S above and can be overridden on the command line, e.g. "make mg CLASS=A".
12 | SP: sp
13 | sp: header
14 | 	cd SP && $(MAKE) CLASS=$(CLASS)
15 | 
16 | LU: lu
17 | lu: header
18 | 	cd LU && $(MAKE) CLASS=$(CLASS)
19 | 
20 | MG: mg
21 | mg: header
22 | 	cd MG && $(MAKE) CLASS=$(CLASS)
23 | 
24 | FT: ft
25 | ft: header
26 | 	cd FT && $(MAKE) CLASS=$(CLASS)
27 | 
28 | IS: is
29 | is: header
30 | 	cd IS && $(MAKE) CLASS=$(CLASS)
31 | 
32 | CG: cg
33 | cg: header
34 | 	cd CG && $(MAKE) CLASS=$(CLASS)
35 | 
36 | EP: ep
37 | ep: header
38 | 	cd EP && $(MAKE) CLASS=$(CLASS)
39 | DC: dc
40 | dc: header
41 | 	cd DC && $(MAKE) CLASS=$(CLASS)
42 | # Runs one "make <name> CLASS=<class>" per non-comment, non-empty line of $(SFILE).
43 | # Awk script courtesy cmg@cray.com
44 | suite:
45 | 	@ awk '{ if ($$1 !~ /^#/ &&  NF > 0)                              \
46 | 	printf "make %s CLASS=%s\n", $$1, $$2 }' $(SFILE)  \
47 | 	| $(SHELL)
48 | 
49 | 
50 | # It would be nice to make clean in each subdirectory (the targets
51 | # are defined) but on a really clean system this won't work
52 | # because those makefiles need config/make.def
53 | clean:
54 | 	- rm -f core 
55 | 	- rm -f *~ */core */*~ */*.o */npbparams.hpp */*.obj */*.exe
56 | 	- rm -f sys/setparams sys/makesuite sys/setparams.hpp
57 | # clean plus everything under bin/.  NOTE: bin/* also matches bin/README.md.
58 | cleanall: clean
59 | 	- rm -r bin/*
60 | # clean plus the site configuration files and the benchmark executables in bin/.
61 | veryclean: clean
62 | 	- rm config/make.def config/suite.def Part*
63 | 	- rm bin/sp.* bin/lu.* bin/mg.* bin/ft.* bin/bt.* bin/is.* bin/ep.* bin/cg.*
64 | 
65 | header:
66 | 	@ $(SHELL) sys/print_header
67 | 
68 | kit: 
69 | 	- makekit -s100k -k30 * */* */*/*
70 | 
71 | 
72 | 


--------------------------------------------------------------------------------
/NPB-TBB/README.md:
--------------------------------------------------------------------------------
 1 | # Warning: this project is continued at [NPB-CPP](https://github.com/GMAP/NPB-CPP)
 2 | 
 3 | ## We are happy to announce that both NPB Kernels and pseudo-application are available at our new repository [NPB-CPP](https://github.com/GMAP/NPB-CPP).
 4 | 
 5 | This was our first work on NAS Parallel Benchmark (NPB) suite and many other works are now continuing this project in many different ways.
 6 | 
 7 | *Note: this repository will no longer be updated, therefore, follow us at [NPB-CPP](https://github.com/GMAP/NPB-CPP)*
 8 | 
 9 | 
10 | ## How to cite this work
11 | 	
12 | [[DOI]](https://doi.org/10.1109/PDP2018.2018.00120) D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018.
13 | 
14 | ## The NPB-CPP Benchmark
15 | 
16 | These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version.
17 | 
18 | 	==================================================================
19 | 		NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB
20 | 	 												
21 | 			Code contributors: 
22 | 					Dalvan Griebler    		
23 | 					Júnior Löff
24 | 													
25 | 		Warning: in case of problems send an email to us:					
26 | 			dalvan.griebler@acad.pucrs.br			
27 | 			junior.loff@acad.pucrs.br				
28 | 	==================================================================
29 | 
30 | 
31 | This folder contains:
32 | 
33 | 	- NPB-FF - Directory with the parallel version implemented in FastFlow
34 | 	- NPB-OMP - Directory with the parallel version translated from the original NPB version
35 | 	- NPB-SER - Directory with the serial version of the NPB ported to C++
36 | 	- NPB-TBB - Directory with the parallel version implemented in Threading Building Blocks
37 | 
38 | Each directory is independent and contains its own implemented version of the kernels:
39 | 
40 | 	IS - Integer Sort, random memory access
41 | 	EP - Embarrassingly Parallel
42 | 	CG - Conjugate Gradient, irregular memory access and communication
43 | 	MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive
44 | 	FT - discrete 3D fast Fourier Transform, all-to-all communication
45 | 
46 | ## Software Requirements
47 | 
48 | *Warning: our tests were made with GCC-5*
49 | 
50 | **TBB**
51 | 
52 | *Installation*
53 | 
54 | 	apt-get install libtbb-dev
55 | 
56 | **FastFlow** 
57 | 
58 | *Installation*
59 | 
60 | 	svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow
61 | 
62 | 
63 | ## How to Compile 
64 | 
65 | Enter the directory of the desired version and execute:
66 | 
67 | 	make _BENCHMARK CLASS=_VERSION
68 | 
69 | 
70 | _BENCHMARKs are: 
71 | 		
72 | 	EP, CG, MG, IS and FT 
73 | 																										
74 | _VERSIONs are: 
75 | 	
76 | 	Class S: small for quick test purposes
77 | 	Class W: workstation size (a 90's workstation; now likely too small)	
78 | 	Classes A, B, C: standard test problems; ~4X size increase going from one class to the next	
79 | 	Classes D, E, F: large test problems; ~16X size increase from each of the previous Classes  
80 | 
81 | 
82 | Command:
83 | 
84 | 	make ep CLASS=B
85 | 


--------------------------------------------------------------------------------
/NPB-TBB/bin/README.md:
--------------------------------------------------------------------------------
 1 | # How to Cite our Work
 2 | 	
 3 | D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018.
 4 | 
 5 | # The NPB-CPP Benchmark
 6 | 
 7 | These codes were converted to **C++** from the original [NPB3.3.1](https://www.nas.nasa.gov/publications/npb.html). We achieved similar performance in **C++** compared to the **Fortran** version.
 8 | 
 9 | 	==================================================================
10 | 		NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB
11 | 	 												
12 | 			Code contributors: 
13 | 					Dalvan Griebler    		
14 | 					Júnior Löff
15 | 													
16 | 		Warning: in case of problems send an email to us:					
17 | 			dalvan.griebler@acad.pucrs.br			
18 | 			junior.loff@acad.pucrs.br				
19 | 	==================================================================
20 | 
21 | 
22 | This folder contains:
23 | 
24 | 	- NPB-FF - Directory with the parallel version implemented in FastFlow
25 | 	- NPB-OMP - Directory with the parallel version translated from the original NPB version
26 | 	- NPB-SER - Directory with the serial version of the NPB ported to C++
27 | 	- NPB-TBB - Directory with the parallel version implemented in Threading Building Blocks
28 | 
29 | Each directory is independent and contains its own implemented version of the kernels:
30 | 
31 | 	IS - Integer Sort, random memory access
32 | 	EP - Embarrassingly Parallel
33 | 	CG - Conjugate Gradient, irregular memory access and communication
34 | 	MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive
35 | 	FT - discrete 3D fast Fourier Transform, all-to-all communication
36 | 
37 | # Software Requirements
38 | 
39 | *Warning: our tests were made with GCC-5*
40 | 
41 | **TBB**
42 | 
43 | *Installation*
44 | 
45 | 	apt-get install libtbb-dev
46 | 
47 | **FastFlow** 
48 | 
49 | *Installation*
50 | 
51 | 	svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow
52 | 
53 | 
54 | # How to Compile 
55 | 
56 | Enter the directory of the desired version and execute:
57 | 
58 | 	make _BENCHMARK CLASS=_VERSION
59 | 
60 | 
61 | _BENCHMARKs are: 
62 | 		
63 | 	EP, CG, MG, IS and FT 
64 | 																										
65 | _VERSIONs are: 
66 | 	
67 | 	Class S: small for quick test purposes
68 | 	Class W: workstation size (a 90's workstation; now likely too small)	
69 | 	Classes A, B, C: standard test problems; ~4X size increase going from one class to the next	
70 | 	Classes D, E, F: large test problems; ~16X size increase from each of the previous Classes  
71 | 
72 | 
73 | Command:
74 | 
75 | 	make ep CLASS=B


--------------------------------------------------------------------------------
/NPB-TBB/common/c_print_results.cpp:
--------------------------------------------------------------------------------
 1 | /*****************************************************************/
 2 | /******     C  _  P  R  I  N  T  _  R  E  S  U  L  T  S     ******/
 3 | /*****************************************************************/
 4 | #include <cstdlib>
 5 | #include <cstdio>
 6 | 
 7 | void c_print_results( char   *name, char   class_npb, int    n1, int n2, int n3, int niter, double t,
 8 |   double mops, char   *optype, int    passed_verification, char   *npbversion, char   *compiletime, char   *cc,
 9 |   char   *clink, char   *c_lib, char   *c_inc, char   *cflags, char   *clinkflags, char   *rand)
10 | {
11 |     /* Prints the end-of-run report of one benchmark to stdout: name, class,
12 |        problem size, iteration count, time in seconds, Mop/s, operation type,
13 |        the verification verdict (passed_verification != 0 means success) and
14 |        the version, compile date and build settings passed in by the caller. */
15 | 
16 |     printf( "\n\n %s Benchmark Completed\n", name ); 
17 | 
18 |     printf( " class_npb           =                        %c\n", class_npb );
19 | 
20 |     if( n2 == 0 && n3 == 0 )
21 |         printf( " Size            =             %12d\n", n1 );   /* as in IS */
22 |     else
23 |         printf( " Size            =              %3dx%3dx%3d\n", n1,n2,n3 );
24 | 
25 |     printf( " Iterations      =             %12d\n", niter );
26 |  
27 |     printf( " Time in seconds =             %12.2f\n", t );
28 | 
29 |     printf( " Mop/s total     =             %12.2f\n", mops );
30 | 
31 |     printf( " Operation type  = %24s\n", optype);
32 | 
33 |     if( passed_verification )
34 |         printf( " Verification    =               SUCCESSFUL\n" );
35 |     else
36 |         printf( " Verification    =             UNSUCCESSFUL\n" );
37 | 
38 |     printf( " Version         =             %12s\n", npbversion );
39 | 
40 |     printf( " Compile date    =             %12s\n", compiletime );
41 | 
42 |     printf( "\n Compile options:\n" );
43 |     printf( "    CC           = %s\n", cc );
44 |     printf( "    CLINK        = %s\n", clink );
45 |     printf( "    C_LIB        = %s\n", c_lib );
46 |     printf( "    C_INC        = %s\n", c_inc );
47 |     printf( "    CFLAGS       = %s\n", cflags );
48 |     printf( "    CLINKFLAGS   = %s\n", clinkflags );
49 |     printf( "    RAND         = %s\n", rand );
50 | #ifdef SMP
51 |     /* getenv() returns a null pointer when the variable is not set, and a
52 |        null pointer must not be handed to %s, so print a placeholder instead. */
53 |     const char *evalue = getenv("MP_SET_NUMTHREADS");
54 |     printf( "   MULTICPUS = %s\n", evalue ? evalue : "(unset)" );
55 | #endif
56 | 
57 |     /* Result-submission banner, disabled and kept for reference only. */
58 | /*    printf( "\n\n" );
59 |     printf( " Please send the results of this run to:\n\n" );
60 |     printf( " NPB Development Team\n" );
61 |     printf( " Internet: npb@nas.nasa.gov\n \n" );
62 |     printf( " If email is not available, send this to:\n\n" );
63 |     printf( " MS T27A-1\n" );
64 |     printf( " NASA Ames Research Center\n" );
65 |     printf( " Moffett Field, CA  94035-1000\n\n" );
66 |     printf( " Fax: 415-604-3957\n\n" );*/
67 | }
68 |  
69 | 


--------------------------------------------------------------------------------
/NPB-TBB/common/c_randdp.cpp:
--------------------------------------------------------------------------------
  1 | /* Scaling constants used by randlc and vranlc: r23 = 2^-23, r46 = 2^-46, t23 = 2^23, t46 = 2^46.
  2 |    NOTE(review): the USE_POW variant calls pow(), but no <cmath> include is visible in this file. */
  3 | #if defined(USE_POW)
  4 | #define r23 pow(0.5, 23.0)
  5 | #define r46 (r23*r23)
  6 | #define t23 pow(2.0, 23.0)
  7 | #define t46 (t23*t23)
  8 | #else
  9 | #define r23 (0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5*0.5)
 10 | #define r46 (r23*r23)
 11 | #define t23 (2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0*2.0)
 12 | #define t46 (t23*t23)
 13 | #endif
 14 | 
 15 | /*c---------------------------------------------------------------------
 16 | c---------------------------------------------------------------------*/
 17 | 
 18 | double randlc (double *x, double a) {
 19 | 
 20 | /*c---------------------------------------------------------------------
 21 | c   Returns a uniform pseudorandom double precision number in the range
 22 | c   (0, 1), produced by one step of the linear congruential generator
 23 | c
 24 | c       x_{k+1} = a x_k  (mod 2^46)
 25 | c
 26 | c   The value returned is 2^(-46) * x_{k+1}.
 27 | c
 28 | c   x : in/out.  Holds x_k on entry and is overwritten with x_{k+1}, so
 29 | c       that repeated calls with the same arguments continue the
 30 | c       sequence.
 31 | c   a : the multiplier of the recurrence.
 32 | c
 33 | c   As stated in the original notes (David H. Bailey, October 26, 1990),
 34 | c   a and x must be odd integer-valued doubles in (1, 2^46), the
 35 | c   generator repeats after 2^44 numbers, and results match on any
 36 | c   computer with at least 48 mantissa bits in double precision (the
 37 | c   notes add that on 64 bit systems double precision should be
 38 | c   disabled).
 39 | c
 40 | c   Both 46-bit operands are split into 23-bit halves, and the product
 41 | c   is assembled from the partial products of those halves.
 42 | c---------------------------------------------------------------------*/
 43 | 
 44 |     double a_hi, a_lo;      /* A = 2^23 * a_hi + a_lo */
 45 |     double x_hi, x_lo;      /* X = 2^23 * x_hi + x_lo */
 46 |     double cross;           /* a_hi * x_lo + a_lo * x_hi, before reduction */
 47 |     double z;               /* cross reduced mod 2^23 */
 48 |     double sum;             /* 2^23 * z + a_lo * x_lo, before reduction */
 49 | 
 50 |     /* split the multiplier */
 51 |     a_hi = (int)(r23 * a);
 52 |     a_lo = a - t23 * a_hi;
 53 | 
 54 |     /* split the current seed */
 55 |     x_hi = (int)(r23 * (*x));
 56 |     x_lo = (*x) - t23 * x_hi;
 57 | 
 58 |     /* Z = a_hi * x_lo + a_lo * x_hi  (mod 2^23) */
 59 |     cross = a_hi * x_lo + a_lo * x_hi;
 60 |     z = cross - t23 * (int)(r23 * cross);
 61 | 
 62 |     /* X = 2^23 * Z + a_lo * x_lo  (mod 2^46) */
 63 |     sum = t23 * z + a_lo * x_lo;
 64 | 
 65 |     /* the new seed replaces the caller's value */
 66 |     (*x) = sum - t46 * (int)(r46 * sum);
 67 | 
 68 |     /* normalize the new seed by 2^(-46) */
 69 | 
 70 |     return (r46 * (*x));
 71 | }
 72 | 
 73 | /*c---------------------------------------------------------------------
 74 | c---------------------------------------------------------------------*/
 75 | 
 76 | void vranlc (int n, double *x_seed, double a, double y[]) {
 77 | 
 78 | /*c---------------------------------------------------------------------
 79 | c   Generates n uniform pseudorandom double precision numbers in the
 80 | c   range (0, 1) with the linear congruential generator
 81 | c
 82 | c       x_{k+1} = a x_k  (mod 2^46)
 83 | c
 84 | c   i.e. the recurrence is applied n times in a row.
 85 | c
 86 | c   n      : how many numbers to generate.  When n < 1 the loop body
 87 | c            never runs and the seed is written back unchanged.
 88 | c   x_seed : in/out.  Holds x_0 on entry and the last seed produced on
 89 | c            return, so that consecutive calls continue the sequence.
 90 | c   a      : the multiplier of the recurrence.
 91 | c   y      : output.  The results, each scaled by 2^(-46), are stored
 92 | c            in y[1] .. y[n].  y[0] is never written and y[n] is the
 93 | c            highest index written.
 94 | c
 95 | c   As stated in the original notes, a and the seed must be odd
 96 | c   integer-valued doubles in (1, 2^46), the generator repeats after
 97 | c   2^44 numbers, and results match on any single processor computer
 98 | c   with at least 48 mantissa bits in double precision (the notes add
 99 | c   that on 64 bit systems double precision should be disabled).
100 | c
101 | c   This is the standard version designed for scalar or RISC systems.
102 | c---------------------------------------------------------------------*/
103 | 
104 |     int k;
105 |     double a_hi, a_lo;      /* A = 2^23 * a_hi + a_lo */
106 |     double x_hi, x_lo;      /* X = 2^23 * x_hi + x_lo */
107 |     double cross;           /* a_hi * x_lo + a_lo * x_hi, before reduction */
108 |     double z;               /* cross reduced mod 2^23 */
109 |     double sum;             /* 2^23 * z + a_lo * x_lo, before reduction */
110 |     double seed = *x_seed;
111 | 
112 |     /* split the multiplier once; it does not change inside the loop */
113 |     a_hi = (int)(r23 * a);
114 |     a_lo = a - t23 * a_hi;
115 | 
116 |     /* each pass starts from the seed left by the previous one */
117 |     for (k = 1; k <= n; k++) {
118 | 
119 |         /* split the current seed */
120 |         x_hi = (int)(r23 * seed);
121 |         x_lo = seed - t23 * x_hi;
122 | 
123 |         /* Z = a_hi * x_lo + a_lo * x_hi  (mod 2^23) */
124 |         cross = a_hi * x_lo + a_lo * x_hi;
125 |         z = cross - t23 * (int)(r23 * cross);
126 | 
127 |         /* X = 2^23 * Z + a_lo * x_lo  (mod 2^46) */
128 |         sum = t23 * z + a_lo * x_lo;
129 |         seed = sum - t46 * (int)(r46 * sum);
130 | 
131 |         /* store the normalized value; indexing starts at 1 */
132 |         y[k] = r46 * seed;
133 |     }
134 | 
135 |     /* hand the final seed back to the caller */
136 |     *x_seed = seed;
137 | }
138 | 


--------------------------------------------------------------------------------
/NPB-TBB/common/c_timers.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | #include "wtime.hpp"
 5 | #include <cstdlib>
 6 | 
 7 | /*  Prototype  */
 8 | void wtime( double * );
 9 | 
10 | 
11 | 
12 | /*****************************************************************/
13 | /******         E  L  A  P  S  E  D  _  T  I  M  E          ******/
14 | /*****************************************************************/
15 | double elapsed_time( void )
16 | {
17 |     double now;   /* filled in by wtime() and returned unchanged */
18 | 
19 |     wtime( &now );
20 |     return now;
21 | }
22 | 
23 | /* Per-timer state indexed by timer id n: start[n] is written by timer_start(), elapsed[n] by timer_clear() and timer_stop(). Ids must be 0..63; no function checks this. */
24 | double start[64], elapsed[64];
25 | 
26 | /* timer_clear: reset the accumulated time of timer slot n to zero. */
27 | void timer_clear( int n )
28 | {
29 |     double *slot = &elapsed[n];
30 | 
31 |     *slot = 0.0;
32 | }
33 | 
34 | 
35 | /* timer_start: record the current clock reading as the start mark of
36 |  * timer slot n; timer_stop(n) measures from this mark. */
37 | void timer_start( int n )
38 | {
39 |     const double now = elapsed_time();
40 |     start[n] = now;
41 | }
42 | 
43 | 
44 | /* timer_stop: add the time that has passed since the start mark of
45 |  * timer slot n to the running total of that slot.  The start mark
46 |  * itself is left untouched.
47 |  */
48 | void timer_stop( int n )
49 | {
50 |     const double stopped_at = elapsed_time();
51 |     const double interval   = stopped_at - start[n];
52 | 
53 |     /* accumulate, so several start/stop pairs add up */
54 |     elapsed[n] += interval;
55 | }
56 | 
57 | 
58 | /* timer_read: return the total accumulated so far by timer slot n;
59 |  * the slot is not modified. */
60 | double timer_read( int n )
61 | {
62 |     const double total = elapsed[n];
63 |     return total;
64 | }
65 | 
66 | 


--------------------------------------------------------------------------------
/NPB-TBB/common/npb-CPP.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <cstdio>
 3 | #include <cstdlib>
 4 | #include <cmath>
 5 | 
 6 | typedef int boolean;                                   /* holds TRUE or FALSE */
 7 | typedef struct { double real; double imag; } dcomplex; /* complex number */
 8 | 
 9 | #define TRUE	1
10 | #define FALSE	0
11 | /* Function-like macros: arguments are parenthesized, but may be evaluated more than once. */
12 | #define max(a,b) (((a) > (b)) ? (a) : (b))
13 | #define min(a,b) (((a) < (b)) ? (a) : (b))
14 | #define	pow2(a) ((a)*(a))
15 | /* dcomplex helpers: c is the destination; b is a scalar in crmul.  In cmul, c must not alias a or b. */
16 | #define get_real(c) ((c).real)
17 | #define get_imag(c) ((c).imag)
18 | #define cadd(c,a,b) ((c).real = (a).real + (b).real, (c).imag = (a).imag + (b).imag)
19 | #define csub(c,a,b) ((c).real = (a).real - (b).real, (c).imag = (a).imag - (b).imag)
20 | #define cmul(c,a,b) ((c).real = (a).real * (b).real - (a).imag * (b).imag, \
21 |                      (c).imag = (a).real * (b).imag + (a).imag * (b).real)
22 | #define crmul(c,a,b) ((c).real = (a).real * (b), (c).imag = (a).imag * (b))
23 | /* Random-number and timer routines shared by the kernels. */
24 | extern double randlc(double *, double);
25 | extern void vranlc(int, double *, double, double *);
26 | extern void timer_clear(int);
27 | extern void timer_start(int);
28 | extern void timer_stop(int);
29 | extern double timer_read(int);
30 | /* End-of-run report printer. */
31 | extern void c_print_results(char *name, char class_npb, int n1, int n2,
32 | 			    int n3, int niter, double t,
33 | 			    double mops, char *optype, int passed_verification,
34 | 			    char *npbversion, char *compiletime, char *cc,
35 | 			    char *clink, char *c_lib, char *c_inc,
36 | 			    char *cflags, char *clinkflags, char *rand);
37 | 


--------------------------------------------------------------------------------
/NPB-TBB/common/wtime.cpp:
--------------------------------------------------------------------------------
 1 | #include "wtime.hpp"
 2 | #include <sys/time.h>
 3 | 
 4 | void wtime(double *t)   /* stores in *t the seconds elapsed since the whole second of the first call */
 5 | {
 6 |   static time_t sec = -1;   /* tv_sec of the first call; time_t instead of int, so it is not truncated */
 7 |   struct timeval tv;
 8 |   gettimeofday(&tv, 0);
 9 |   if (sec < 0) sec = tv.tv_sec;   /* first call only: fix the reference second */
10 |   *t = (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec;   /* so the first call yields just the microsecond part */
11 | }
12 | 
13 |     
14 | 


--------------------------------------------------------------------------------
/NPB-TBB/common/wtime.hpp:
--------------------------------------------------------------------------------
 1 | /* Selects the symbol name used for the wall-clock routine `wtime`.
 2 |  * The C/Fortran interface is different on different machines, so the name
 3 |  * is remapped here: unchanged when IBM is defined, upper-case when CRAY is
 4 |  * defined, and with a trailing underscore otherwise.  You may need to tweak this.
 5 |  */
 6 | #if defined(IBM)
 7 | #define wtime wtime
 8 | #elif defined(CRAY)
 9 | #define wtime WTIME
10 | #else
11 | #define wtime wtime_
12 | #endif
13 | 


--------------------------------------------------------------------------------
/NPB-TBB/common/wtime_sgi64.cpp:
--------------------------------------------------------------------------------
 1 | #include <sys/types.h>
 2 | #include <fcntl.h>
 3 | #include <sys/mman.h>
 4 | #include <sys/syssgi.h>
 5 | #include <sys/immu.h>
 6 | #include <cerrno>
 7 | #include <cstdio>
 8 | 
 9 | /* The following works on SGI Power Challenge systems */
10 | /* File-scope timer state; every variable below is filled in by timer_init(). */
11 | typedef unsigned long iotimer_t;
12 | 
13 | unsigned int cycleval;                           /* written by syssgi(); treated as picoseconds per tick */
14 | volatile iotimer_t *iotimer_addr, base_counter;  /* mapped counter address and its value at init time */
15 | double resolution;                               /* 1.0e-12 * cycleval */
16 | 
17 | /* address_t is an integer type big enough to hold an address */
18 | typedef unsigned long address_t;
19 | 
20 | 
21 | 
22 | void timer_init() 
23 | {
24 |   /* Maps the cycle counter read-only and sets iotimer_addr, resolution and base_counter; exits on failure. */
25 |   int fd;
26 |   void *mapped;
27 |   address_t phys_addr, page_offset, pagemask, pagebase_addr;
28 |   
29 |   pagemask = getpagesize() - 1;
30 |   errno = 0;
31 |   phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval);
32 |   if (errno != 0) {
33 |     perror("SGI_QUERY_CYCLECNTR");
34 |     exit(1);
35 |   }
36 |   /* split the physical address into its page base and the offset inside that page */
37 |   page_offset = phys_addr & pagemask;
38 |   pagebase_addr = phys_addr - page_offset;
39 |   fd = open("/dev/mmem", O_RDONLY);
40 |   if (fd < 0) { perror("/dev/mmem"); exit(1); }   /* previously an invalid fd went straight to mmap */
41 |   mapped = mmap(0, pagemask + 1, PROT_READ, MAP_PRIVATE, fd, pagebase_addr);   /* one whole page */
42 |   if (mapped == MAP_FAILED) { perror("mmap"); exit(1); }   /* previously dereferenced unchecked */
43 |   iotimer_addr = (iotimer_t *)((char *)mapped + page_offset);
44 |   /* cycleval in picoseconds to this gives resolution in seconds */
45 |   resolution = 1.0e-12*cycleval; 
46 |   base_counter = *iotimer_addr;
47 | }
48 | 
49 | void wtime_(double *time) 
50 | {
51 |   /* Stores in *time the counter ticks since base_counter, scaled by resolution. */
52 |   static int ready = 0;
53 |   if (ready == 0) {
54 |     timer_init();   /* first call only */
55 |     ready = 1;
56 |   }
57 |   volatile iotimer_t ticks = *iotimer_addr - base_counter;
58 |   *time = (double)ticks * resolution;
59 | }
60 | 
61 | 
62 | void wtime(double *time) 
63 | {
64 |   /*
65 |    * Same clock as wtime_(), under the un-suffixed name.  The call is
66 |    * forwarded rather than duplicated: a second private `initialized`
67 |    * flag would make timer_init() run again, resetting base_counter,
68 |    * if both entry points were used in one run, and the two names
69 |    * would then report times measured from different starting points.
70 |    */
71 |   wtime_(time);
72 | }
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/NPB-TBB/config/make.def:
--------------------------------------------------------------------------------
  1 | #---------------------------------------------------------------------------
  2 | #
  3 | #                SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS. 
  4 | #
  5 | #---------------------------------------------------------------------------
  6 | 
  7 | #---------------------------------------------------------------------------
  8 | # Items in this file will need to be changed for each platform.
  9 | # (Note these definitions are inconsistent with NPB2.1.)
 10 | #---------------------------------------------------------------------------
 11 | 
 12 | #---------------------------------------------------------------------------
 13 | # Parallel C:
 14 | #
 15 | # CC         - C compiler 
 16 | # CFLAGS     - C compilation arguments
 17 | # C_INC      - any -I arguments required for compiling C 
 18 | # CLINK      - C linker
 19 | # CLINKFLAGS - C linker flags
 20 | # C_LIB      - any -L and -l arguments required for linking C 
 21 | #
 22 | # compilations are done with $(CC) $(C_INC) $(CFLAGS) or
 23 | #                            $(CC) $(CFLAGS)
 24 | # linking is done with       $(CLINK) $(C_LIB) $(CLINKFLAGS)
 25 | #---------------------------------------------------------------------------
 26 | 
 27 | #---------------------------------------------------------------------------
 28 | # This is the C++ compiler used for the TBB programs
 29 | #---------------------------------------------------------------------------
 30 | CC = g++ -std=c++14
 31 | #gcc #cc
 32 | # This links C programs; usually the same as ${CC}
 33 | CLINK	= $(CC)
 34 | 
 35 | #---------------------------------------------------------------------------
 36 | # These macros are passed to the linker 
 37 | #---------------------------------------------------------------------------
 38 | C_LIB  = -lm -ltbb
 39 | 
 40 | #---------------------------------------------------------------------------
 41 | # These macros are passed to the compiler 
 42 | #---------------------------------------------------------------------------
 43 | C_INC = -I../common 
 44 | 
 45 | #---------------------------------------------------------------------------
 46 | # Global *compile time* flags for C programs
 47 | #---------------------------------------------------------------------------
 48 | CFLAGS	= -O3
 49 | # CFLAGS = -g
 50 | 
 51 | #---------------------------------------------------------------------------
 52 | # Global *link time* flags. Flags for increasing maximum executable 
 53 | # size usually go here. 
 54 | #---------------------------------------------------------------------------
 55 | CLINKFLAGS = -O3
 56 | 
 57 | 
 58 | #---------------------------------------------------------------------------
 59 | # Utilities C:
 60 | #
 61 | # This is the C compiler used to compile C utilities.  Flags required by 
 62 | # this compiler go here also; typically there are few flags required; hence 
 63 | # there are no separate macros provided for such flags.
 64 | #---------------------------------------------------------------------------
 65 | UCC	= cc
 66 | 
 67 | 
 68 | #---------------------------------------------------------------------------
 69 | # Destination of executables, relative to subdirs of the main directory. . 
 70 | #---------------------------------------------------------------------------
 71 | BINDIR	= ../bin
 72 | 
 73 | 
 74 | #---------------------------------------------------------------------------
 75 | # The variable RAND controls which random number generator 
 76 | # is used. It is described in detail in Doc/README.install. 
 77 | # Use "randi8" unless there is a reason to use another one. 
 78 | # Other allowed values are "randi8_safe", "randdp" and "randdpvec"
 79 | #---------------------------------------------------------------------------
 80 | # RAND   = randi8
 81 | # The following is highly reliable but may be slow:
 82 | RAND   = randdp
 83 | 
 84 | 
 85 | #---------------------------------------------------------------------------
 86 | # The variable WTIME is the name of the wtime source code module in the
 87 | # common directory.  
 88 | # For most machines,       use wtime.cpp
 89 | # For SGI power challenge: use wtime_sgi64.cpp
 90 | #---------------------------------------------------------------------------
 91 | WTIME  = wtime.cpp
 92 | 
 93 | 
 94 | #---------------------------------------------------------------------------
 95 | # Enable if either Cray or IBM: 
 96 | # (no such flag for most machines: see common/wtime.hpp)
 97 | # This is used by the C compiler to pass the machine name to common/wtime.hpp,
 98 | # where the C/Fortran binding interface format is determined
 99 | #---------------------------------------------------------------------------
100 | # MACHINE	=	-DCRAY
101 | # MACHINE	=	-DIBM
102 | 
103 | 
104 | 


--------------------------------------------------------------------------------
/NPB-TBB/config/suite.def:
--------------------------------------------------------------------------------
 1 | # config/suite.def
 2 | # This file is used to build several benchmarks with a single command. 
 3 | # Typing "make suite" in the main directory will build all the benchmarks
 4 | # specified in this file. 
 5 | # Each line of this file contains a benchmark name and a class.
 6 | # The name is one of "cg", "is", "ep", "mg", "ft"
 7 | # The class is one of "S", "W", "A", "B", and "C". 
 8 | # No blank lines. 
 9 | # The following example builds serial sample sizes of all benchmarks. 
10 | ft	A
11 | mg	A
12 | is	A
13 | ep	A
14 | cg	A
15 | 


--------------------------------------------------------------------------------
/NPB-TBB/sys/Makefile:
--------------------------------------------------------------------------------
 1 | include ../config/make.def
 2 | 
 3 | # Note that FCOMPILE is also defined in make.common and should
 4 | # be the same. We can't include make.common because it has a lot
 5 | # of other garbage. 
 6 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS)
 7 | 
 8 | all: setparams 
 9 | 
10 | # setparams creates an npbparams.hpp file for each benchmark 
11 | # configuration. npbparams.hpp also contains info about how a benchmark
12 | # was compiled and linked
13 | 
14 | setparams: setparams.cpp ../config/make.def
15 | 	$(UCC) -o setparams setparams.cpp
16 | 
17 | 
18 | clean: 
19 | 	-rm -f setparams setparams.hpp npbparams.hpp
20 | 	-rm -f *~ *.o
21 | 
22 | 


--------------------------------------------------------------------------------
/NPB-TBB/sys/README:
--------------------------------------------------------------------------------
 1 | This directory contains utilities and files used by the 
 2 | build process. You should not need to change anything
 3 | in this directory. 
 4 | 
 5 | Original Files
 6 | --------------
 7 | setparams.c:
 8 |         Source for the setparams program. This program is used internally
 9 |         in the build process to create the file "npbparams.h" for each 
10 |         benchmark. npbparams.h contains Fortran or C parameters to build a 
11 |         benchmark for a specific class. The setparams program is never run 
12 |         directly by a user. Its invocation syntax is 
13 | 
14 |             "setparams benchmark-name class". 
15 | 
16 |         It examines the file "npbparams.h" in the current directory. If 
17 |         the specified parameters are the same as those in the npbparams.h 
18 |         file, nothing is changed. If the file does not exist or corresponds 
19 |         to a different class/number of nodes, it is (re)built. 
20 | 	One of the more complicated things in npbparams.h is that it 
21 |         contains, in a Fortran string, the compiler flags used to build a 
22 |         benchmark, so that a benchmark can print out how it was compiled. 
23 | 
24 | make.common
25 |         A makefile segment that is included in each individual benchmark
26 |         program makefile. It sets up some standard macros (COMPILE, etc) 
27 |         and makes sure everything is configured correctly (npbparams.h)
28 | 
29 | Makefile
30 |         Builds  setparams
31 | 
32 | README
33 |         This file. 
34 | 
35 | 
36 | Created files
37 | -------------
38 | 
39 | setparams
40 | 	See descriptions above
41 | 
42 | 


--------------------------------------------------------------------------------
/NPB-TBB/sys/make.common:
--------------------------------------------------------------------------------
 1 | PROGRAM  = $(BINDIR)/$(BENCHMARK).$(CLASS)
 2 | FCOMPILE = $(F77) -c $(F_INC) $(FFLAGS)
 3 | CCOMPILE = $(CC)  -c $(C_INC) $(CFLAGS)
 4 | 
 5 | # Class "U" is used internally by the setparams program to mean
 6 | # "unknown". This means that if you don't specify CLASS=
 7 | # on the command line, you'll get an error. It would be nice
 8 | # to be able to avoid this, but we'd have to get information
 9 | # from the setparams back to the make program, which isn't easy. 
10 | CLASS=U
11 | 
12 | default:: ${PROGRAM}
13 | 
14 | # This makes sure the configuration utility setparams 
15 | # is up to date. 
16 | # Note that this must be run every time, which is why the
17 | # target does not exist and is not created. 
18 | # If you create a file called "config" you will break things. 
19 | config:
20 | 	@cd ../sys; ${MAKE} all
21 | 	../sys/setparams ${BENCHMARK} ${CLASS}
22 | 
23 | COMMON=../common
24 | ${COMMON}/${RAND}.o: ${COMMON}/${RAND}.f
25 | 	cd ${COMMON}; ${FCOMPILE} ${RAND}.f
26 | 
27 | ${COMMON}/c_${RAND}.o: ${COMMON}/c_${RAND}.cpp
28 | 	cd ${COMMON}; ${CCOMPILE} c_${RAND}.cpp
29 | 
30 | ${COMMON}/print_results.o: ${COMMON}/print_results.f
31 | 	cd ${COMMON}; ${FCOMPILE} print_results.f
32 | 
33 | ${COMMON}/c_print_results.o: ${COMMON}/c_print_results.cpp
34 | 	cd ${COMMON}; ${CCOMPILE} c_print_results.cpp
35 | 
36 | ${COMMON}/timers.o: ${COMMON}/timers.f
37 | 	cd ${COMMON}; ${FCOMPILE} timers.f
38 | 
39 | ${COMMON}/c_timers.o: ${COMMON}/c_timers.cpp
40 | 	cd ${COMMON}; ${CCOMPILE} c_timers.cpp
41 | 
42 | ${COMMON}/wtime.o: ${COMMON}/${WTIME}
43 | 	cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/${WTIME}
44 | # For most machines or CRAY or IBM
45 | #	cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/wtime.c
46 | # For a precise timer on an SGI Power Challenge, try:
47 | #	cd ${COMMON}; ${CCOMPILE} -o wtime.o ${COMMON}/wtime_sgi64.c
48 | 
49 | ${COMMON}/c_wtime.o: ${COMMON}/${WTIME}
50 | 	cd ${COMMON}; ${CCOMPILE} -o c_wtime.o ${COMMON}/${WTIME}
51 | 
52 | 
53 | # Normally setparams updates npbparams.hpp only if the settings (CLASS)
54 | # have changed. However, we also want to update if the compile options
55 | # may have changed (set in ../config/make.def). 
56 | npbparams.hpp: ../config/make.def
57 | 	@ echo make.def modified. Rebuilding npbparams.hpp just in case
58 | 	rm -f npbparams.hpp
59 | 	../sys/setparams ${BENCHMARK} ${CLASS}
60 | 
61 | # So that "make benchmark-name" works
62 | ${BENCHMARK}:  default
63 | ${BENCHMARKU}: default
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/NPB-TBB/sys/print_header:
--------------------------------------------------------------------------------
 1 | # Print the NPB TBB banner; each printf argument becomes one output line.
 2 | printf '%s\n' \
 3 |     '' \
 4 |     '   =========================================' \
 5 |     '   =      NAS Parallel Benchmarks          =' \
 6 |     '   =      TBB C++ Versions                 =' \
 7 |     '   =      Developed by: Dalvan Griebler    =' \
 8 |     '   =                    Júnior Löff        =' \
 9 |     '   =                                       =' \
10 |     '   =      Warning: in case of problems     =' \
11 |     '   =      send an email to us:             =' \
12 |     '   =      dalvan.griebler@acad.pucrs.br    =' \
13 |     '   =      junior.loff@acad.pucrs.br        =' \
14 |     '   =========================================' \
15 |     ''


--------------------------------------------------------------------------------
/NPB-TBB/sys/print_instructions:
--------------------------------------------------------------------------------
 1 | # Prints the usage instructions for building the NPB benchmarks.
 2 | # Output only; the script reads no arguments or environment variables.
 3 | echo ''
 4 | echo '   To make a NAS benchmark type '
 5 | echo ''
 6 | echo '         make <benchmark-name> CLASS=<class>'
 7 | echo ''
 8 | echo '   where <benchmark-name> is "cg", "ep", "ft", "is", or "mg"'
 9 | echo '         <class>          is "S", "W", "A", "B" or "C"'
10 | echo ''
11 | echo '   To make a set of benchmarks, edit the file config/suite.def'
12 | echo '   according to the instructions at the top of that file and type'
13 | echo ''
14 | echo '         make suite'
15 | echo ''
16 | echo ' ***************************************************************'
17 | echo ' * Remember to edit the file config/make.def for site specific *'
18 | echo ' * information as described in the README file                 *'
19 | echo ' ***************************************************************'


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Warning: this project is continued at [NPB-CPP](https://github.com/GMAP/NPB-CPP)
 2 | 
 3 | ## :sound: We are happy to announce that both NPB Kernels and pseudo-application are available at our new repository [NPB-CPP](https://github.com/GMAP/NPB-CPP). :smile:
 4 | 
 5 | This was our first work on the NAS Parallel Benchmarks (NPB) suite, and many other works are now continuing this project in many different ways.
 6 | 
 7 | :sound:*Note: this repository will no longer be updated; please follow us at [NPB-CPP](https://github.com/GMAP/NPB-CPP).*
 8 | 
 9 | 
10 | ## How to cite this work
11 | 	
12 | [[DOI]](https://doi.org/10.1109/PDP2018.2018.00120) D. Griebler, J. Loff, G. Mencagli, M. Danelutto and L. G. Fernandes. **Efficient NAS Benchmark Kernels with C++ Parallel Programming**. *In proceedings of the 26th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP)*. Cambridge, United Kingdom, 2018.
13 | 
14 | ## The NPB-CPP Benchmark
15 | 
16 | These codes were converted to **C++** from the original [NPB3.3.1](https://doi.org/10.1109/PDP2018.2018.00120). We achieved similar performance in **C++** compared to the **Fortran** version.
17 | 
18 | 	==================================================================
19 | 		NAS Parallel Benchmarks in C++, OpenMP, FastFlow, and TBB
20 | 	 												
21 | 			Code contributors: 
22 | 					Dalvan Griebler    		
23 | 					Júnior Löff
24 | 													
25 | 		Warning: in case of problems send an email to us:					
26 | 			dalvan.griebler@acad.pucrs.br			
27 | 			junior.loff@acad.pucrs.br				
28 | 	==================================================================
29 | 
30 | 
31 | This folder contains:
32 | 
33 | 	- NPB-FF - Directory with the parallel version implemented in FastFlow
34 | 	- NPB-OMP - Directory with the parallel version translated from the original NPB version
35 | 	- NPB-SER - Directory with the serial version of the NPB ported to C++
36 | 	- NPB-TBB - Directory with the parallel version implemented in Threading Building Blocks
37 | 
38 | Each directory is independent and contains its own implemented version of the kernels:
39 | 
40 | 	IS - Integer Sort, random memory access
41 | 	EP - Embarrassingly Parallel
42 | 	CG - Conjugate Gradient, irregular memory access and communication
43 | 	MG - Multi-Grid on a sequence of meshes, long- and short-distance communication, memory intensive
44 | 	FT - discrete 3D fast Fourier Transform, all-to-all communication
45 | 
46 | ## Software Requirements
47 | 
48 | *Warning: our tests were made with GCC-5*
49 | 
50 | **TBB**
51 | 
52 | *Installation*
53 | 
54 | 	apt-get install libtbb-dev
55 | 
56 | **FastFlow** 
57 | 
58 | *Installation*
59 | 
60 | 	svn co https://svn.code.sf.net/p/mc-fastflow/code/ $HOME/fastflow
61 | 
62 | 
63 | ## How to Compile 
64 | 
65 | Enter the directory of the desired version and execute:
66 | 
67 | 	make _BENCHMARK CLASS=_VERSION
68 | 
69 | 
70 | _BENCHMARKs are: 
71 | 		
72 | 	EP, CG, MG, IS and FT 
73 | 																										
74 | _VERSIONs are: 
75 | 	
76 | 	Class S: small for quick test purposes
77 | 	Class W: workstation size (a 90's workstation; now likely too small)	
78 | 	Classes A, B, C: standard test problems; ~4X size increase going from one class to the next	
79 | 	Classes D, E, F: large test problems; ~16X size increase from each of the previous Classes  
80 | 
81 | 
82 | Command:
83 | 
84 | 	make ep CLASS=B
85 | 


--------------------------------------------------------------------------------