├── Makefile.linux
├── Makefile.storm
├── Makefile.storm.opt
├── README
├── gups_vanilla.c
├── gups_nonpow2.c
├── gups_opt.c
├── MPIRandomAccess_vanilla.c
└── MPIRandomAccess_opt.c

/Makefile.linux:
--------------------------------------------------------------------------------
 1 | # Makefile for Linux
 2 | 
 3 | SHELL = /bin/sh
 4 | 
 5 | # System-specific settings
 6 | 
 7 | CC = gcc
 8 | CCFLAGS = -O -g -DCHECK
 9 | LINK = gcc
10 | LINKFLAGS = -O -g
11 | LIB = -lmpich
12 | 
13 | # Link target
14 | 
15 | gups_vanilla: gups_vanilla.o
16 | 	$(LINK) $(LINKFLAGS) gups_vanilla.o $(LIB) -o gups_vanilla
17 | 
18 | gups_nonpow2: gups_nonpow2.o
19 | 	$(LINK) $(LINKFLAGS) gups_nonpow2.o $(LIB) -o gups_nonpow2
20 | 
21 | gups_opt: gups_opt.o
22 | 	$(LINK) $(LINKFLAGS) gups_opt.o $(LIB) -o gups_opt
23 | 
24 | # Compilation rules
25 | 
26 | %.o:%.c
27 | 	$(CC) $(CCFLAGS) -c $<
28 | 
--------------------------------------------------------------------------------
/Makefile.storm:
--------------------------------------------------------------------------------
 1 | # Makefile for Red Storm (compile on reddish)
 2 | 
 3 | SHELL = /bin/sh
 4 | 
 5 | # System-specific settings
 6 | 
 7 | CC = CC
 8 | CCFLAGS = -fastsse -DMPICH_IGNORE_CXX_SEEK -DLONG64
 9 | #CCFLAGS = -fastsse -DMPICH_IGNORE_CXX_SEEK -DLONG64 -DCHECK
10 | LINK = CC
11 | LINKFLAGS = -O
12 | LIB = 
13 | 
14 | # Link target
15 | 
16 | gups_vanilla: gups_vanilla.o
17 | 	$(LINK) $(LINKFLAGS) gups_vanilla.o $(LIB) -o gups_vanilla
18 | 
19 | gups_nonpow2: gups_nonpow2.o
20 | 	$(LINK) $(LINKFLAGS) gups_nonpow2.o $(LIB) -o gups_nonpow2
21 | 
22 | gups_opt: gups_opt.o
23 | 	$(LINK) $(LINKFLAGS) gups_opt.o $(LIB) -o gups_opt
24 | 
25 | # Compilation rules
26 | 
27 | %.o:%.c
28 | 	$(CC) $(CCFLAGS) -c $<
29 | 
--------------------------------------------------------------------------------
/Makefile.storm.opt:
--------------------------------------------------------------------------------
 1 | # Makefile for Red Storm (compile on reddish)
 2 | 
 3 | SHELL = /bin/sh
 4 | 
 5 | # System-specific settings
 6 | 
 7 | CC = /home/rbbrigh/mpich-1.2.6/install/bin/mpicc
 8 | CCFLAGS = -O3 -fastsse -DMPICH_IGNORE_CXX_SEEK -DLONG64 -DNODEF \
 9 | 	-Msafeptr -Mipa=fast
10 | #CCFLAGS = -O3 -fastsse -DMPICH_IGNORE_CXX_SEEK -DLONG64 -DNODEF -DCHECK \
11 | #	-Msafeptr -Mipa=fast
12 | LINK = /home/rbbrigh/mpich-1.2.6/install/bin/mpicc
13 | LINKFLAGS = -O
14 | LIB = 
15 | 
16 | # Link target
17 | 
18 | gups_vanilla: gups_vanilla.o
19 | 	$(LINK) $(LINKFLAGS) gups_vanilla.o $(LIB) -o gups_vanilla
20 | 
21 | gups_nonpow2: gups_nonpow2.o
22 | 	$(LINK) $(LINKFLAGS) gups_nonpow2.o $(LIB) -o gups_nonpow2
23 | 
24 | gups_opt: gups_opt.o
25 | 	$(LINK) $(LINKFLAGS) gups_opt.o $(LIB) -o gups_opt
26 | 
27 | # Compilation rules
28 | 
29 | %.o:%.c
30 | 	$(CC) $(CCFLAGS) -c $<
31 | 
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | GUPS distribution - 13 Oct 2006
 2 | 
 3 | This directory contains several implementations of an algorithm that
 4 | can be used to run the HPCC RandomAccess (GUPS) benchmark.
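For orientation, the kernel that all of these codes time is the HPCC
read-modify-write table update sketched below. This is a minimal serial
illustration distilled from the update loop and POLY recurrence in
gups_vanilla.c, assuming a 64-bit "long" (the -DLONG64 case); the table size,
variable names, and the final printf are illustrative only and are not part
of any of the distributed codes.

  /* minimal serial sketch of the RandomAccess update (not one of the MPI codes) */
  #include <stdio.h>
  #include <stdlib.h>

  #define POLY 0x0000000000000007UL        /* same polynomial the codes use */

  int main()
  {
    int logtable = 20;                     /* table has 2^logtable 64-bit words */
    unsigned long n = ((unsigned long) 1) << logtable;
    unsigned long nm1 = n - 1;
    unsigned long ran = 1;                 /* HPCC_starts(0) returns 0x1 */
    unsigned long i;
    unsigned long *table = (unsigned long *) malloc(n*sizeof(unsigned long));

    for (i = 0; i < n; i++) table[i] = i;  /* table starts as the identity */

    for (i = 0; i < 4*n; i++) {            /* official runs do 4x the table size updates */
      ran = (ran << 1) ^ ((long) ran < 0 ? POLY : 0);  /* next value in random stream */
      table[ran & nm1] ^= ran;             /* read-modify-write (XOR) update */
    }

    printf("table[0] = %lu\n",table[0]);
    free(table);
    return 0;
  }

The MPI codes below perform this same update; the extra work they do is
routing each generated value to the processor that owns that portion of the
table before the XOR is applied.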
 5 | 
 6 | The algorithm is described on this WWW page:
 7 | www.cs.sandia.gov/~sjplimp/algorithms/html#gups
 8 | 
 9 | The tar file of codes can be downloaded from this WWW page:
10 | www.cs.sandia.gov/~sjplimp/download.html
11 | 
12 | These codes are distributed by Steve Plimpton
13 | of Sandia National Laboratories:
14 | sjplimp@sandia.gov, www.cs.sandia.gov/~sjplimp
15 | 
16 | --------------------------------------------------------------------------
17 | 
18 | This directory should contain the following files:
19 | 
20 | gups_vanilla.c             vanilla power-of-2 version of algorithm
21 | gups_nonpow2.c             non-power-of-2 version
22 | gups_opt.c                 optimized power-of-2 version
23 | 
24 | MPIRandomAccess_vanilla.c  implementation of gups_vanilla in HPCC harness
25 | MPIRandomAccess_opt.c      implementation of gups_opt in HPCC harness
26 | 
27 | Makefile.*                 Makefiles for various machines
28 | 
29 | --------------------------------------------------------------------------
30 | 
31 | The gups_* files are stand-alone single-file codes that can be built
32 | using a Makefile like those provided. E.g.
33 | 
34 |   make -f Makefile.linux gups_vanilla
35 | 
36 | You will need to create a Makefile.* appropriate to your platform,
37 | that points at the correct MPI library, etc. Note that these 3 codes
38 | support a -DLONG64 C compiler flag. If a "long" on your processor is
39 | 32-bit (presumably long long is 64 bits), then don't use -DLONG64; if
40 | a "long" is 64 bits, then use -DLONG64.
41 | 
42 | --------------------------------------------------------------------------
43 | 
44 | You can run any of the 3 gups* codes as follows:
45 | 
46 | 1 proc:
47 |   gups_vanilla N M chunk
48 | 
49 | P procs:
50 |   mpirun -np P gups_vanilla N M chunk
51 | 
52 | where
53 | 
54 | N = length of global table is 2^N
55 | M = # of update sets per proc
56 | chunk = # of updates in one set on each proc
57 | 
58 | Note that 2^N is the length of the global table across all processors.
59 | Thus N = 30 would run with a billion-element table.
60 | 
61 | Chunk is the number of updates each proc will do before communicating.
62 | In the official HPCC benchmark this is specified to be no larger than
63 | 1024, but you can run the code with any value you like. Your GUPS
64 | performance will typically decrease for smaller chunk size.
65 | 
66 | When each proc performs "chunk" updates, that is one "set" of updates.
67 | M determines how many sets are performed. The GUPS performance is a
68 | "rate", so it's independent of M, once M is large enough to get good
69 | statistics. So you can start your testing with a small M to see how
70 | fast your machine runs with this algorithm, then get better stats with
71 | longer runs with a larger M. An official HPCC benchmark run requires
72 | M be a large number (like the total number of updates = 4x the table
73 | size, if I recall), but your GUPS rate won't change.
74 | 
75 | After the code runs, it will print out some stats, like this:
76 | > mpirun -np 2 gups_vanilla 20 1000 1024
77 | Number of procs: 2
78 | Vector size: 1048576
79 | Max datums during comm: 1493
80 | Max datums after comm: 1493
81 | Excess datums (frac): 39395 (0.0192358)
82 | Bad locality count: 0
83 | Update time (secs): 0.383
84 | Gups: 0.005351
85 | 
86 | "Vector size" is the length of the global table.
87 | 
88 | The "max datums" values tell how message size varied as datums were
89 | routed thru the hypercube dimensions. They should only exceed "chunk"
90 | by a modest amount.
However the random number generation in the HPCC 91 | algorithm is not very random, so in the first few iterations a few 92 | procs tend to receive larger messages. 93 | 94 | The "excess datums" value is the number of updates (and fraction) that 95 | would have been missed if datums greater than the chunk size were 96 | discarded. It should typically be < 1% for long runs. The codes do 97 | not discard these excess updates. 98 | 99 | The "bad locality" should be 0. If the code was compiled with -DCHECK 100 | and a non-zero value results, it means some procs are trying to 101 | perform table updates on table indices they don't own, so something is 102 | wrong. 103 | 104 | The "update time" is how long the code ran. 105 | 106 | "Gups" is the GUPS performance rate, as HPCC defines it. Namely the 107 | total # of updates per second across all processors. The total # of 108 | updates is M*chunk*P, where P = # of processors. Once you run on 109 | enough processors (e.g. 32), you should see the GUPS rate nearly 110 | double each time you double the number of procs, unless communication 111 | on your machine is slowing things down. 112 | 113 | -------------------------------------------------------------------------- 114 | 115 | The MPIRandomAccess*.c codes are versions of the same algorithms 116 | implemented within the framework that HPCC provides to enable users to 117 | implement new optimized algorithms. 118 | 119 | In principle you can take these files and drop them into the HPCC 120 | harness, re-compile the HPCC suite, and run an official HPCC benchmark 121 | test with the new algorithms. In practice, I don't know the specifics 122 | of how to do that! Courtenay Vaughan at Sandia was the one who worked 123 | on that part of the project. You can email him if you have questions 124 | at ctvaugh@sandia.gov. 125 | 126 | You should get essentially the same GUPS number when running these 127 | algorithms in the HPCC harness as you get with the stand-alone codes. 128 | 129 | Note that we have only ported the vanilla and opt algorithms (not the 130 | non-power-of-2 version) to the HPCC framework. 131 | -------------------------------------------------------------------------------- /gups_vanilla.c: -------------------------------------------------------------------------------- 1 | /* ---------------------------------------------------------------------- 2 | gups = algorithm for the HPCC RandomAccess (GUPS) benchmark 3 | implements a hypercube-style synchronous all2all 4 | 5 | Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories 6 | www.cs.sandia.gov/~sjplimp 7 | Copyright (2006) Sandia Corporation 8 | ------------------------------------------------------------------------- */ 9 | 10 | /* random update GUPS code, power-of-2 number of procs 11 | compile with -DCHECK to check if table updates happen on correct proc */ 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | #define MAX(a,b) ((a) > (b) ? 
(a) : (b)) 18 | 19 | /* machine defs 20 | compile with -DLONG64 if a "long" is 64 bits 21 | else compile with no setting if "long long" is 64 bit */ 22 | 23 | #ifdef LONG64 24 | #define POLY 0x0000000000000007UL 25 | #define PERIOD 1317624576693539401L 26 | #define ZERO64B 0L 27 | typedef long s64Int; 28 | typedef unsigned long u64Int; 29 | #define U64INT MPI_UNSIGNED_LONG 30 | #else 31 | #define POLY 0x0000000000000007ULL 32 | #define PERIOD 1317624576693539401LL 33 | #define ZERO64B 0LL 34 | typedef long long s64Int; 35 | typedef unsigned long long u64Int; 36 | #define U64INT MPI_LONG_LONG_INT 37 | #endif 38 | 39 | u64Int HPCC_starts(s64Int n); 40 | 41 | int main(int narg, char **arg) 42 | { 43 | int me,nprocs; 44 | int i,j,iterate,niterate; 45 | int nlocal,nlocalm1,logtable,index,logtablelocal; 46 | int logprocs,ipartner,ndata,nsend,nkeep,nrecv,maxndata,maxnfinal,nexcess; 47 | int nbad,chunk,chunkbig; 48 | double t0,t0_all,Gups; 49 | u64Int *table,*data,*send; 50 | u64Int ran,datum,procmask,nglobal,offset,nupdates; 51 | u64Int ilong,nexcess_long,nbad_long; 52 | MPI_Status status; 53 | 54 | MPI_Init(&narg,&arg); 55 | MPI_Comm_rank(MPI_COMM_WORLD,&me); 56 | MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 57 | 58 | /* command line args = N M chunk 59 | N = length of global table is 2^N 60 | M = # of update sets per proc 61 | chunk = # of updates in one set */ 62 | 63 | if (narg != 4) { 64 | if (me == 0) printf("Syntax: gups N M chunk\n"); 65 | MPI_Abort(MPI_COMM_WORLD,1); 66 | } 67 | 68 | logtable = atoi(arg[1]); 69 | niterate = atoi(arg[2]); 70 | chunk = atoi(arg[3]); 71 | 72 | /* insure Nprocs is power of 2 */ 73 | 74 | i = 1; 75 | while (i < nprocs) i *= 2; 76 | if (i != nprocs) { 77 | if (me == 0) printf("Must run on power-of-2 procs\n"); 78 | MPI_Abort(MPI_COMM_WORLD,1); 79 | } 80 | 81 | /* nglobal = entire table 82 | nlocal = size of my portion 83 | nlocalm1 = local size - 1 (for index computation) 84 | logtablelocal = log of table size I store 85 | offset = starting index in global table of 1st entry in local table */ 86 | 87 | logprocs = 0; 88 | while (1 << logprocs < nprocs) logprocs++; 89 | 90 | nglobal = ((u64Int) 1) << logtable; 91 | nlocal = nglobal / nprocs; 92 | nlocalm1 = nlocal - 1; 93 | logtablelocal = logtable - logprocs; 94 | offset = (u64Int) nlocal * me; 95 | 96 | /* allocate local memory 97 | 16 factor insures space for messages that (randomly) exceed chunk size */ 98 | 99 | chunkbig = 16*chunk; 100 | 101 | table = (u64Int *) malloc(nlocal*sizeof(u64Int)); 102 | data = (u64Int *) malloc(chunkbig*sizeof(u64Int)); 103 | send = (u64Int *) malloc(chunkbig*sizeof(u64Int)); 104 | 105 | if (!table || !data || !send) { 106 | if (me == 0) printf("Table allocation failed\n"); 107 | MPI_Abort(MPI_COMM_WORLD,1); 108 | } 109 | 110 | /* initialize my portion of global array 111 | global array starts with table[i] = i */ 112 | 113 | for (i = 0; i < nlocal; i++) table[i] = i + offset; 114 | 115 | /* start my random # partway thru global stream */ 116 | 117 | nupdates = (u64Int) nprocs * chunk * niterate; 118 | ran = HPCC_starts(nupdates/nprocs*me); 119 | 120 | /* loop: 121 | generate chunk random values per proc 122 | communicate datums to correct processor via hypercube routing 123 | use received values to update local table */ 124 | 125 | maxndata = 0; 126 | maxnfinal = 0; 127 | nexcess = 0; 128 | nbad = 0; 129 | 130 | MPI_Barrier(MPI_COMM_WORLD); 131 | t0 = -MPI_Wtime(); 132 | 133 | for (iterate = 0; iterate < niterate; iterate++) { 134 | for (i = 0; i < chunk; i++) { 135 | ran = (ran << 
1) ^ ((s64Int) ran < ZERO64B ? POLY : ZERO64B); 136 | data[i] = ran; 137 | } 138 | ndata = chunk; 139 | 140 | for (j = 0; j < logprocs; j++) { 141 | nkeep = nsend = 0; 142 | ipartner = (1 << j) ^ me; 143 | procmask = ((u64Int) 1) << (logtablelocal + j); 144 | if (ipartner > me) { 145 | for (i = 0; i < ndata; i++) { 146 | if (data[i] & procmask) send[nsend++] = data[i]; 147 | else data[nkeep++] = data[i]; 148 | } 149 | } else { 150 | for (i = 0; i < ndata; i++) { 151 | if (data[i] & procmask) data[nkeep++] = data[i]; 152 | else send[nsend++] = data[i]; 153 | } 154 | } 155 | 156 | MPI_Sendrecv(send,nsend,U64INT,ipartner,0,&data[nkeep],chunkbig,U64INT, 157 | ipartner,0,MPI_COMM_WORLD,&status); 158 | MPI_Get_count(&status,U64INT,&nrecv); 159 | ndata = nkeep + nrecv; 160 | maxndata = MAX(maxndata,ndata); 161 | } 162 | maxnfinal = MAX(maxnfinal,ndata); 163 | if (ndata > chunk) nexcess += ndata - chunk; 164 | 165 | for (i = 0; i < ndata; i++) { 166 | datum = data[i]; 167 | index = datum & nlocalm1; 168 | table[index] ^= datum; 169 | } 170 | 171 | #ifdef CHECK 172 | procmask = ((u64Int) (nprocs-1)) << logtablelocal; 173 | for (i = 0; i < ndata; i++) 174 | if ((data[i] & procmask) >> logtablelocal != me) nbad++; 175 | #endif 176 | } 177 | 178 | MPI_Barrier(MPI_COMM_WORLD); 179 | t0 += MPI_Wtime(); 180 | 181 | /* stats */ 182 | 183 | MPI_Allreduce(&t0,&t0_all,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); 184 | t0 = t0_all/nprocs; 185 | 186 | i = maxndata; 187 | MPI_Allreduce(&i,&maxndata,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); 188 | i = maxnfinal; 189 | MPI_Allreduce(&i,&maxnfinal,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); 190 | ilong = nexcess; 191 | MPI_Allreduce(&ilong,&nexcess_long,1,U64INT,MPI_SUM,MPI_COMM_WORLD); 192 | ilong = nbad; 193 | MPI_Allreduce(&ilong,&nbad_long,1,U64INT,MPI_SUM,MPI_COMM_WORLD); 194 | 195 | nupdates = (u64Int) niterate * nprocs * chunk; 196 | Gups = nupdates / t0 / 1.0e9; 197 | 198 | if (me == 0) { 199 | printf("Number of procs: %d\n",nprocs); 200 | printf("Vector size: %lld\n",nglobal); 201 | printf("Max datums during comm: %d\n",maxndata); 202 | printf("Max datums after comm: %d\n",maxnfinal); 203 | printf("Excess datums (frac): %lld (%g)\n", 204 | nexcess_long,(double) nexcess_long / nupdates); 205 | printf("Bad locality count: %lld\n",nbad_long); 206 | printf("Update time (secs): %9.3f\n",t0); 207 | printf("Gups: %9.6f\n",Gups); 208 | } 209 | 210 | /* clean up */ 211 | 212 | free(table); 213 | free(data); 214 | free(send); 215 | MPI_Finalize(); 216 | } 217 | 218 | /* start random number generator at Nth step of stream 219 | routine provided by HPCC */ 220 | 221 | u64Int HPCC_starts(s64Int n) 222 | { 223 | int i, j; 224 | u64Int m2[64]; 225 | u64Int temp, ran; 226 | 227 | while (n < 0) n += PERIOD; 228 | while (n > PERIOD) n -= PERIOD; 229 | if (n == 0) return 0x1; 230 | 231 | temp = 0x1; 232 | for (i=0; i<64; i++) { 233 | m2[i] = temp; 234 | temp = (temp << 1) ^ ((s64Int) temp < 0 ? POLY : 0); 235 | temp = (temp << 1) ^ ((s64Int) temp < 0 ? POLY : 0); 236 | } 237 | 238 | for (i=62; i>=0; i--) 239 | if ((n >> i) & 1) 240 | break; 241 | 242 | ran = 0x2; 243 | while (i > 0) { 244 | temp = 0; 245 | for (j=0; j<64; j++) 246 | if ((ran >> j) & 1) 247 | temp ^= m2[j]; 248 | ran = temp; 249 | i -= 1; 250 | if ((n >> i) & 1) 251 | ran = (ran << 1) ^ ((s64Int) ran < 0 ? 
POLY : 0); 252 | } 253 | 254 | return ran; 255 | } 256 | -------------------------------------------------------------------------------- /gups_nonpow2.c: -------------------------------------------------------------------------------- 1 | /* ---------------------------------------------------------------------- 2 | gups = algorithm for the HPCC RandomAccess (GUPS) benchmark 3 | implements a hypercube-style synchronous all2all 4 | 5 | Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories 6 | www.cs.sandia.gov/~sjplimp 7 | Copyright (2006) Sandia Corporation 8 | ------------------------------------------------------------------------- */ 9 | 10 | /* random update GUPS code, non-power-of-2 number of procs (pow 2 is OK) 11 | compile with -DCHECK to check if table updates happen on correct proc */ 12 | 13 | #include "stdio.h" 14 | #include "stdlib.h" 15 | #include "mpi.h" 16 | 17 | #define MAX(a,b) ((a) > (b) ? (a) : (b)) 18 | 19 | /* machine defs 20 | compile with -DLONG64 if a "long" is 64 bits 21 | else compile with no setting if "long long" is 64 bit */ 22 | 23 | #ifdef LONG64 24 | #define POLY 0x0000000000000007UL 25 | #define PERIOD 1317624576693539401L 26 | #define ZERO64B 0L 27 | typedef long s64Int; 28 | typedef unsigned long u64Int; 29 | #define U64INT MPI_UNSIGNED_LONG 30 | #else 31 | #define POLY 0x0000000000000007ULL 32 | #define PERIOD 1317624576693539401LL 33 | #define ZERO64B 0LL 34 | typedef long long s64Int; 35 | typedef unsigned long long u64Int; 36 | #define U64INT MPI_LONG_LONG_INT 37 | #endif 38 | 39 | u64Int HPCC_starts(s64Int n); 40 | 41 | int main(int narg, char **arg) 42 | { 43 | int me,nprocs; 44 | int i,iterate,niterate; 45 | int nlocal,logtable,index; 46 | int ipartner,ndata,nsend,nkeep,nrecv,maxndata,maxnfinal,nexcess; 47 | int nbad,chunk,chunkbig,npartition,nlower,nupper,proclo,procmid,nfrac; 48 | double t0,t0_all,Gups; 49 | u64Int *table,*data,*send,*offsets; 50 | u64Int ran,datum,nglobal,nglobalm1,nupdates,offset,indexmid,nstart; 51 | u64Int ilong,nexcess_long,nbad_long; 52 | MPI_Status status; 53 | 54 | MPI_Init(&narg,&arg); 55 | MPI_Comm_rank(MPI_COMM_WORLD,&me); 56 | MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 57 | 58 | /* command line args = N M chunk 59 | N = length of global table is 2^N 60 | M = # of update sets per proc 61 | chunk = # of updates in one set */ 62 | 63 | if (narg != 4) { 64 | if (me == 0) printf("Syntax: gups N M chunk\n"); 65 | MPI_Abort(MPI_COMM_WORLD,1); 66 | } 67 | 68 | logtable = atoi(arg[1]); 69 | niterate = atoi(arg[2]); 70 | chunk = atoi(arg[3]); 71 | 72 | /* nglobal = entire table (power of 2) 73 | nlocal = size of my portion (not a power of 2) 74 | nglobalm1 = global size - 1 (for index computation) 75 | offsets[i] = starting index in global table of proc I's portion 76 | offset = starting index in global table of 1st entry in local table */ 77 | 78 | nglobal = ((u64Int) 1) << logtable; 79 | nglobalm1 = nglobal - 1; 80 | nstart = (double) me / nprocs * nglobal; 81 | offsets = (u64Int *) malloc((nprocs+1)*sizeof(u64Int)); 82 | MPI_Allgather(&nstart,1,U64INT,offsets,1,U64INT,MPI_COMM_WORLD); 83 | offsets[nprocs] = nglobal; 84 | nlocal = offsets[me+1] - offsets[me]; 85 | offset = offsets[me]; 86 | 87 | /* allocate local memory 88 | 16 factor insures space for messages that (randomly) exceed chunk size */ 89 | 90 | chunkbig = 16*chunk; 91 | 92 | table = (u64Int *) malloc(nlocal*sizeof(u64Int)); 93 | data = (u64Int *) malloc(chunkbig*sizeof(u64Int)); 94 | send = (u64Int *) malloc(chunkbig*sizeof(u64Int)); 95 | 96 | if (!table 
|| !data || !send) { 97 | if (me == 0) printf("Table allocation failed\n"); 98 | MPI_Abort(MPI_COMM_WORLD,1); 99 | } 100 | 101 | /* initialize my portion of global array 102 | global array starts with table[i] = i */ 103 | 104 | for (i = 0; i < nlocal; i++) table[i] = i + offset; 105 | 106 | /* start my random # partway thru global stream */ 107 | 108 | nupdates = (u64Int) nprocs * chunk * niterate; 109 | ran = HPCC_starts(nupdates/nprocs*me); 110 | 111 | /* loop: 112 | generate chunk random stream values per proc 113 | communicate datums to correct processor via hypercube routing 114 | use received values to update local table */ 115 | 116 | maxndata = 0; 117 | maxnfinal = 0; 118 | nexcess = 0; 119 | nbad = 0; 120 | 121 | MPI_Barrier(MPI_COMM_WORLD); 122 | t0 = -MPI_Wtime(); 123 | 124 | for (iterate = 0; iterate < niterate; iterate++) { 125 | for (i = 0; i < chunk; i++) { 126 | ran = (ran << 1) ^ ((s64Int) ran < ZERO64B ? POLY : ZERO64B); 127 | data[i] = ran; 128 | } 129 | ndata = chunk; 130 | 131 | npartition = nprocs; 132 | proclo = 0; 133 | while (npartition > 1) { 134 | nlower = npartition/2; 135 | nupper = npartition - nlower; 136 | procmid = proclo + nlower; 137 | indexmid = offsets[procmid]; 138 | 139 | nkeep = nsend = 0; 140 | if (me < procmid) { 141 | for (i = 0; i < ndata; i++) { 142 | if ((data[i] & nglobalm1) >= indexmid) send[nsend++] = data[i]; 143 | else data[nkeep++] = data[i]; 144 | } 145 | } else { 146 | for (i = 0; i < ndata; i++) { 147 | if ((data[i] & nglobalm1) < indexmid) send[nsend++] = data[i]; 148 | else data[nkeep++] = data[i]; 149 | } 150 | } 151 | 152 | /* if partition halves are equal, exchange message with 1 partner 153 | if upper half = lower half + 1: 154 | if in lower half, send/recv 2 messages 155 | 1st exchange with me+nlower, 2nd exchange with me+nlower+1 156 | 1st send has first 157 | nfrac = (nlower - (me-proclo)) / nupper of my data 158 | 2nd send has remainder of my data 159 | if not first or last proc of upper half, send/recv 2 messages 160 | 1st exchange with me-nlower, 2nd exchange with me-nlower-1 161 | 2nd send has first 162 | nfrac = (me-procmid) / nlower of my data 163 | 1st send has remainder of my data 164 | if first or last proc of upper half, send/recv 1 message 165 | each exchanges with first/last proc of lower half 166 | send all my data 167 | always recv whatever is sent */ 168 | 169 | if (nlower == nupper) { 170 | if (me < procmid) ipartner = me + nlower; 171 | else ipartner = me - nlower; 172 | MPI_Sendrecv(send,nsend,U64INT,ipartner,0,&data[nkeep], 173 | chunkbig,U64INT,ipartner,0,MPI_COMM_WORLD,&status); 174 | MPI_Get_count(&status,U64INT,&nrecv); 175 | ndata = nkeep + nrecv; 176 | maxndata = MAX(maxndata,ndata); 177 | } else { 178 | if (me < procmid) { 179 | nfrac = (nlower - (me-proclo)) * nsend / nupper; 180 | ipartner = me + nlower; 181 | MPI_Sendrecv(send,nfrac,U64INT,ipartner,0,&data[nkeep], 182 | chunkbig,U64INT,ipartner,0,MPI_COMM_WORLD,&status); 183 | MPI_Get_count(&status,U64INT,&nrecv); 184 | nkeep += nrecv; 185 | MPI_Sendrecv(&send[nfrac],nsend-nfrac,U64INT,ipartner+1,0, 186 | &data[nkeep],chunkbig,U64INT, 187 | ipartner+1,0,MPI_COMM_WORLD,&status); 188 | MPI_Get_count(&status,U64INT,&nrecv); 189 | ndata = nkeep + nrecv; 190 | } else if (me > procmid && me < procmid+nlower) { 191 | nfrac = (me - procmid) * nsend / nlower; 192 | ipartner = me - nlower; 193 | MPI_Sendrecv(&send[nfrac],nsend-nfrac,U64INT,ipartner,0,&data[nkeep], 194 | chunkbig,U64INT,ipartner,0,MPI_COMM_WORLD,&status); 195 | 
MPI_Get_count(&status,U64INT,&nrecv); 196 | nkeep += nrecv; 197 | MPI_Sendrecv(send,nfrac,U64INT,ipartner-1,0,&data[nkeep], 198 | chunkbig,U64INT,ipartner-1,0,MPI_COMM_WORLD,&status); 199 | MPI_Get_count(&status,U64INT,&nrecv); 200 | ndata = nkeep + nrecv; 201 | } else { 202 | if (me == procmid) ipartner = me - nlower; 203 | else ipartner = me - nupper; 204 | MPI_Sendrecv(send,nsend,U64INT,ipartner,0,&data[nkeep], 205 | chunkbig,U64INT,ipartner,0,MPI_COMM_WORLD,&status); 206 | MPI_Get_count(&status,U64INT,&nrecv); 207 | ndata = nkeep + nrecv; 208 | } 209 | } 210 | 211 | if (me < procmid) npartition = nlower; 212 | else { 213 | proclo = procmid; 214 | npartition = nupper; 215 | } 216 | } 217 | maxnfinal = MAX(maxnfinal,ndata); 218 | if (ndata > chunk) nexcess += ndata-chunk; 219 | 220 | for (i = 0; i < ndata; i++) { 221 | datum = data[i]; 222 | index = (datum & nglobalm1) - offset; 223 | table[index] ^= datum; 224 | } 225 | 226 | #ifdef CHECK 227 | for (i = 0; i < ndata; i++) { 228 | index = (datum & nglobalm1) - offset; 229 | if (index < 0 || index >= nlocal) nbad++; 230 | } 231 | #endif 232 | } 233 | 234 | MPI_Barrier(MPI_COMM_WORLD); 235 | t0 += MPI_Wtime(); 236 | 237 | /* stats */ 238 | 239 | MPI_Allreduce(&t0,&t0_all,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); 240 | t0 = t0_all/nprocs; 241 | 242 | i = maxndata; 243 | MPI_Allreduce(&i,&maxndata,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); 244 | i = maxnfinal; 245 | MPI_Allreduce(&i,&maxnfinal,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); 246 | ilong = nexcess; 247 | MPI_Allreduce(&ilong,&nexcess_long,1,U64INT,MPI_SUM,MPI_COMM_WORLD); 248 | ilong = nbad; 249 | MPI_Allreduce(&ilong,&nbad_long,1,U64INT,MPI_SUM,MPI_COMM_WORLD); 250 | 251 | nupdates = (u64Int) niterate * nprocs * chunk; 252 | Gups = nupdates / t0 / 1.0e9; 253 | 254 | if (me == 0) { 255 | printf("Number of procs: %d\n",nprocs); 256 | printf("Vector size: %lld\n",nglobal); 257 | printf("Max datums during comm: %d\n",maxndata); 258 | printf("Max datums after comm: %d\n",maxnfinal); 259 | printf("Excess datums (frac): %lld (%g)\n", 260 | nexcess_long,(double) nexcess_long / nupdates); 261 | printf("Bad locality count: %lld\n",nbad_long); 262 | printf("Update time (secs): %9.3f\n",t0); 263 | printf("Gups: %9.6f\n",Gups); 264 | } 265 | 266 | /* clean up */ 267 | 268 | free(table); 269 | free(data); 270 | free(send); 271 | free(offsets); 272 | MPI_Finalize(); 273 | } 274 | 275 | /* start random number generator at Nth step of stream 276 | routine provided by HPCC */ 277 | 278 | u64Int HPCC_starts(s64Int n) 279 | { 280 | int i, j; 281 | u64Int m2[64]; 282 | u64Int temp, ran; 283 | 284 | while (n < 0) n += PERIOD; 285 | while (n > PERIOD) n -= PERIOD; 286 | if (n == 0) return 0x1; 287 | 288 | temp = 0x1; 289 | for (i=0; i<64; i++) { 290 | m2[i] = temp; 291 | temp = (temp << 1) ^ ((s64Int) temp < 0 ? POLY : 0); 292 | temp = (temp << 1) ^ ((s64Int) temp < 0 ? POLY : 0); 293 | } 294 | 295 | for (i=62; i>=0; i--) 296 | if ((n >> i) & 1) 297 | break; 298 | 299 | ran = 0x2; 300 | while (i > 0) { 301 | temp = 0; 302 | for (j=0; j<64; j++) 303 | if ((ran >> j) & 1) 304 | temp ^= m2[j]; 305 | ran = temp; 306 | i -= 1; 307 | if ((n >> i) & 1) 308 | ran = (ran << 1) ^ ((s64Int) ran < 0 ? 
POLY : 0); 309 | } 310 | 311 | return ran; 312 | } 313 | -------------------------------------------------------------------------------- /gups_opt.c: -------------------------------------------------------------------------------- 1 | /* ---------------------------------------------------------------------- 2 | gups = algorithm for the HPCC RandomAccess (GUPS) benchmark 3 | implements a hypercube-style synchronous all2all 4 | 5 | Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories 6 | www.cs.sandia.gov/~sjplimp 7 | Copyright (2006) Sandia Corporation 8 | 9 | optimizations implemented by Ron Brightwell and Keith Underwood (SNL) 10 | ------------------------------------------------------------------------- */ 11 | 12 | /* random update GUPS code with optimizations, power-of-2 number of procs 13 | compile with -DCHECK to check if table updates happen on correct proc */ 14 | 15 | #include "stdio.h" 16 | #include "stdlib.h" 17 | #include "mpi.h" 18 | 19 | #define MAX(a,b) ((a) > (b) ? (a) : (b)) 20 | 21 | #define MAXLOGPROCS (20) 22 | #define CHUNK 1024 23 | /* Optimize CHUNK2 to make sure that we minimize aliasing in the cache -KDU */ 24 | #define CHUNKBIG 10240 25 | /* RCHUNK is constant because we have log2(N) of them, but they are used 26 | * independently on successive iterations (i.e. they don't interfere 27 | * with each other in cache -KDU 28 | */ 29 | #define RCHUNK 4096 30 | #define PITER 8 31 | 32 | /* machine defs 33 | compile with -DLONG64 if a "long" is 64 bits 34 | else compile with no setting if "long long" is 64 bit */ 35 | 36 | #ifdef LONG64 37 | #define POLY 0x0000000000000007UL 38 | #define PERIOD 1317624576693539401L 39 | #define ZERO64B 0L 40 | typedef long s64Int; 41 | typedef unsigned long u64Int; 42 | #define U64INT MPI_UNSIGNED_LONG 43 | #else 44 | #define POLY 0x0000000000000007ULL 45 | #define PERIOD 1317624576693539401LL 46 | #define ZERO64B 0LL 47 | typedef long long s64Int; 48 | typedef unsigned long long u64Int; 49 | #define U64INT MPI_LONG_LONG_INT 50 | #endif 51 | 52 | void sort_data (u64Int *source, u64Int *nomatch, u64Int *match, int number, 53 | int *nnomatch, int *nmatch, int mask_shift); 54 | inline update_table (u64Int *data, u64Int *table, int number, int nlocalm1); 55 | u64Int HPCC_starts(s64Int n); 56 | 57 | int main(int narg, char **arg) 58 | { 59 | int me,nprocs; 60 | int i,j,k,iterate,niterate; 61 | int nlocal,nlocalm1,logtable,index,logtablelocal; 62 | int logprocs,ipartner,ndata,nsend,nkeep,nkept,nrecv; 63 | int maxndata,maxnfinal,nexcess; 64 | int nbad; 65 | double t0,t0_all,Gups; 66 | u64Int *table,*data,*send, *keep_data; 67 | #ifndef USE_BLOCKING_SEND 68 | u64Int *send1,*send2; 69 | #endif 70 | u64Int *recv[PITER][MAXLOGPROCS]; 71 | u64Int ran,datum,procmask,nglobal,offset,nupdates; 72 | u64Int ilong,nexcess_long,nbad_long; 73 | MPI_Status status; 74 | MPI_Request request[PITER][MAXLOGPROCS]; 75 | MPI_Request srequest; 76 | 77 | MPI_Init(&narg,&arg); 78 | MPI_Comm_rank(MPI_COMM_WORLD,&me); 79 | MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 80 | 81 | /* command line args = N M 82 | N = length of global table is 2^N 83 | M = # of 1024-update sets per proc */ 84 | 85 | if (narg != 3) { 86 | if (me == 0) printf("Syntax: gups N M\n"); 87 | MPI_Abort(MPI_COMM_WORLD,1); 88 | } 89 | 90 | logtable = atoi(arg[1]); 91 | niterate = atoi(arg[2]); 92 | 93 | /* insure Nprocs is power of 2 */ 94 | 95 | i = 1; 96 | while (i < nprocs) i *= 2; 97 | if (i != nprocs) { 98 | if (me == 0) printf("Must run on power-of-2 procs\n"); 99 | 
MPI_Abort(MPI_COMM_WORLD,1); 100 | } 101 | 102 | /* nglobal = entire table 103 | nlocal = size of my portion 104 | nlocalm1 = local size - 1 (for index computation) 105 | logtablelocal = log of table size I store 106 | offset = starting index in global table of 1st entry in local table */ 107 | 108 | logprocs = 0; 109 | while (1 << logprocs < nprocs) logprocs++; 110 | 111 | nglobal = ((u64Int) 1) << logtable; 112 | nlocal = nglobal / nprocs; 113 | nlocalm1 = nlocal - 1; 114 | logtablelocal = logtable - logprocs; 115 | offset = (u64Int) nlocal * me; 116 | 117 | /* allocate local memory */ 118 | 119 | table = (u64Int *) malloc(nlocal*sizeof(u64Int)); 120 | data = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 121 | 122 | if (!table || !data) { 123 | if (me == 0) printf("Table allocation failed\n"); 124 | MPI_Abort(MPI_COMM_WORLD,1); 125 | } 126 | 127 | #ifdef USE_BLOCKING_SEND 128 | send = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 129 | if (!send) { 130 | if (me == 0) printf("Table allocation failed\n"); 131 | MPI_Abort(MPI_COMM_WORLD,1); 132 | } 133 | #else 134 | send1 = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 135 | send2 = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 136 | send = send1; 137 | if (!send1 || !send2) { 138 | if (me == 0) printf("Table allocation failed\n"); 139 | MPI_Abort(MPI_COMM_WORLD,1); 140 | } 141 | #endif 142 | 143 | for (j = 0; j < PITER; j++) 144 | for (i=0; i me) { 199 | sort_data(data,data,send,nkept,&nkeep,&nsend,logtablelocal+j); 200 | if (j > 0) { 201 | MPI_Wait(&request[iter_mod][j-1],&status); 202 | MPI_Get_count(&status,U64INT,&nrecv); 203 | 204 | 205 | sort_data(recv[iter_mod][j-1],data,send,nrecv,&nkeep, 206 | &nsend,logtablelocal+j); 207 | } 208 | } else { 209 | sort_data(data,send,data,nkept,&nsend,&nkeep,logtablelocal+j); 210 | if (j > 0) { 211 | MPI_Wait(&request[iter_mod][j-1],&status); 212 | MPI_Get_count(&status,U64INT,&nrecv); 213 | sort_data(recv[iter_mod][j-1],send,data,nrecv,&nsend, 214 | &nkeep,logtablelocal+j); 215 | } 216 | } 217 | #ifdef USE_BLOCKING_SEND 218 | MPI_Send(send,nsend,U64INT,ipartner,0,MPI_COMM_WORLD); 219 | #else 220 | if (j > 0) MPI_Wait(&srequest,&status); 221 | MPI_Isend(send,nsend,U64INT,ipartner,0,MPI_COMM_WORLD,&srequest); 222 | #endif 223 | if (j == (logprocs - 1)) { 224 | update_table(data, table, nkeep, nlocalm1); 225 | } 226 | maxndata = MAX(maxndata,nkept+nrecv); 227 | nkept = nkeep; 228 | } 229 | 230 | if (logprocs == 0) { 231 | update_table(data, table, nkept, nlocalm1); 232 | } else { 233 | MPI_Wait(&request[iter_mod][j-1],&status); 234 | MPI_Get_count(&status,U64INT,&nrecv); 235 | update_table(recv[iter_mod][j-1], table, nrecv, nlocalm1); 236 | #ifndef USE_BLOCKING_SEND 237 | MPI_Wait(&srequest,&status); 238 | #endif 239 | } 240 | 241 | ndata = nkept + nrecv; 242 | maxndata = MAX(maxndata,ndata); 243 | maxnfinal = MAX(maxnfinal,ndata); 244 | if (ndata > CHUNK) nexcess += ndata - CHUNK; 245 | 246 | #ifdef CHECK 247 | procmask = ((u64Int) (nprocs-1)) << logtablelocal; 248 | for (i = 0; i < nkept; i++) 249 | if ((data[i] & procmask) >> logtablelocal != me) nbad++; 250 | for (i = 0; i < nrecv; i++) 251 | if ((recv[iter_mod][j-1][i] & procmask) >> logtablelocal != me) nbad++; 252 | #endif 253 | } 254 | 255 | MPI_Barrier(MPI_COMM_WORLD); 256 | t0 += MPI_Wtime(); 257 | 258 | /* stats */ 259 | 260 | MPI_Allreduce(&t0,&t0_all,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); 261 | t0 = t0_all/nprocs; 262 | 263 | i = maxndata; 264 | MPI_Allreduce(&i,&maxndata,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); 265 | i = maxnfinal; 266 | 
MPI_Allreduce(&i,&maxnfinal,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); 267 | ilong = nexcess; 268 | MPI_Allreduce(&ilong,&nexcess_long,1,U64INT,MPI_SUM,MPI_COMM_WORLD); 269 | ilong = nbad; 270 | MPI_Allreduce(&ilong,&nbad_long,1,U64INT,MPI_SUM,MPI_COMM_WORLD); 271 | 272 | nupdates = (u64Int) niterate * nprocs * CHUNK; 273 | Gups = nupdates / t0 / 1.0e9; 274 | 275 | if (me == 0) { 276 | printf("Number of procs: %d\n",nprocs); 277 | printf("Vector size: %lld\n",nglobal); 278 | printf("Max datums during comm: %d\n",maxndata); 279 | printf("Max datums after comm: %d\n",maxnfinal); 280 | printf("Excess datums (frac): %lld (%g)\n", 281 | nexcess_long,(double) nexcess_long / nupdates); 282 | printf("Bad locality count: %lld\n",nbad_long); 283 | printf("Update time (secs): %9.3f\n",t0); 284 | printf("Gups: %9.6f\n",Gups); 285 | } 286 | 287 | /* clean up */ 288 | 289 | for (j = 0; j < PITER; j++) 290 | for (i = 0; i < logprocs; i++) free(recv[j][i]); 291 | free(table); 292 | free(data); 293 | #ifdef USE_BLOCKING_SEND 294 | free(send); 295 | #else 296 | free(send1); 297 | free(send2); 298 | #endif 299 | MPI_Finalize(); 300 | } 301 | 302 | /* This sort is manually unrolled to make sure the compiler can see 303 | * the parallelism -KDU 304 | */ 305 | 306 | void sort_data(u64Int *source, u64Int *nomatch, u64Int *match, int number, 307 | int *nnomatch, int *nmatch, int mask_shift) 308 | { 309 | int div_num = number / 8; 310 | int loop_total = div_num * 8; 311 | u64Int procmask = ((u64Int) 1) << mask_shift; 312 | int i; 313 | u64Int *buffers[2]; 314 | int counts[2]; 315 | 316 | buffers[0] = nomatch; 317 | counts[0] = *nnomatch; 318 | buffers[1] = match; 319 | counts[1] = *nmatch; 320 | 321 | for (i = 0; i < div_num; i++) { 322 | int dindex = i*8; 323 | int myselect[8]; 324 | myselect[0] = (source[dindex] & procmask) >> mask_shift; 325 | myselect[1] = (source[dindex+1] & procmask) >> mask_shift; 326 | myselect[2] = (source[dindex+2] & procmask) >> mask_shift; 327 | myselect[3] = (source[dindex+3] & procmask) >> mask_shift; 328 | myselect[4] = (source[dindex+4] & procmask) >> mask_shift; 329 | myselect[5] = (source[dindex+5] & procmask) >> mask_shift; 330 | myselect[6] = (source[dindex+6] & procmask) >> mask_shift; 331 | myselect[7] = (source[dindex+7] & procmask) >> mask_shift; 332 | buffers[myselect[0]][counts[myselect[0]]++] = source[dindex]; 333 | buffers[myselect[1]][counts[myselect[1]]++] = source[dindex+1]; 334 | buffers[myselect[2]][counts[myselect[2]]++] = source[dindex+2]; 335 | buffers[myselect[3]][counts[myselect[3]]++] = source[dindex+3]; 336 | buffers[myselect[4]][counts[myselect[4]]++] = source[dindex+4]; 337 | buffers[myselect[5]][counts[myselect[5]]++] = source[dindex+5]; 338 | buffers[myselect[6]][counts[myselect[6]]++] = source[dindex+6]; 339 | buffers[myselect[7]][counts[myselect[7]]++] = source[dindex+7]; 340 | } 341 | 342 | for (i = loop_total; i < number; i++) { 343 | u64Int mydata = source[i]; 344 | if (mydata & procmask) buffers[1][counts[1]++] = mydata; 345 | else buffers[0][counts[0]++] = mydata; 346 | } 347 | 348 | *nnomatch = counts[0]; 349 | *nmatch = counts[1]; 350 | } 351 | 352 | inline update_table(u64Int *data, u64Int *table, int number, int nlocalm1) 353 | { 354 | /* DEEP_UNROLL doesn't seem to improve anything at this time */ 355 | /* Manual unrolling is a significant win if -Msafeptr is used -KDU */ 356 | #ifdef DEEP_UNROLL 357 | int div_num = number / 16; 358 | int loop_total = div_num * 16; 359 | #else 360 | int div_num = number / 8; 361 | int loop_total = div_num * 8; 362 | 
#endif 363 | 364 | int i; 365 | for (i = 0; i < div_num; i++) { 366 | #ifdef DEEP_UNROLL 367 | const int dindex = i*16; 368 | #else 369 | const int dindex = i*8; 370 | #endif 371 | u64Int index0 = data[dindex] & nlocalm1; 372 | u64Int index1 = data[dindex+1] & nlocalm1; 373 | u64Int index2 = data[dindex+2] & nlocalm1; 374 | u64Int index3 = data[dindex+3] & nlocalm1; 375 | u64Int index4 = data[dindex+4] & nlocalm1; 376 | u64Int index5 = data[dindex+5] & nlocalm1; 377 | u64Int index6 = data[dindex+6] & nlocalm1; 378 | u64Int index7 = data[dindex+7] & nlocalm1; 379 | u64Int ltable0 = table[index0]; 380 | u64Int ltable1 = table[index1]; 381 | u64Int ltable2 = table[index2]; 382 | u64Int ltable3 = table[index3]; 383 | u64Int ltable4 = table[index4]; 384 | u64Int ltable5 = table[index5]; 385 | u64Int ltable6 = table[index6]; 386 | u64Int ltable7 = table[index7]; 387 | #ifdef DEEP_UNROLL 388 | u64Int index8 = data[dindex+8] & nlocalm1; 389 | u64Int index9 = data[dindex+9] & nlocalm1; 390 | u64Int index10 = data[dindex+10] & nlocalm1; 391 | u64Int index11 = data[dindex+11] & nlocalm1; 392 | u64Int index12 = data[dindex+12] & nlocalm1; 393 | u64Int index13 = data[dindex+13] & nlocalm1; 394 | u64Int index14 = data[dindex+14] & nlocalm1; 395 | u64Int index15 = data[dindex+15] & nlocalm1; 396 | u64Int ltable8 = table[index8]; 397 | u64Int ltable9 = table[index9]; 398 | u64Int ltable10 = table[index10]; 399 | u64Int ltable11 = table[index11]; 400 | u64Int ltable12 = table[index12]; 401 | u64Int ltable13 = table[index13]; 402 | u64Int ltable14 = table[index14]; 403 | u64Int ltable15 = table[index15]; 404 | #endif 405 | table[index0] = ltable0 ^ data[dindex]; 406 | table[index1] = ltable1 ^ data[dindex+1]; 407 | table[index2] = ltable2 ^ data[dindex+2]; 408 | table[index3] = ltable3 ^ data[dindex+3]; 409 | table[index4] = ltable4 ^ data[dindex+4]; 410 | table[index5] = ltable5 ^ data[dindex+5]; 411 | table[index6] = ltable6 ^ data[dindex+6]; 412 | table[index7] = ltable7 ^ data[dindex+7]; 413 | #ifdef DEEP_UNROLL 414 | table[index8] = ltable8 ^ data[dindex+8]; 415 | table[index9] = ltable9 ^ data[dindex+9]; 416 | table[index10] = ltable10 ^ data[dindex+10]; 417 | table[index11] = ltable11 ^ data[dindex+11]; 418 | table[index12] = ltable12 ^ data[dindex+12]; 419 | table[index13] = ltable13 ^ data[dindex+13]; 420 | table[index14] = ltable14 ^ data[dindex+14]; 421 | table[index15] = ltable15 ^ data[dindex+15]; 422 | #endif 423 | } 424 | 425 | for (i = loop_total; i < number; i++) { 426 | u64Int datum = data[i]; 427 | int index = datum & nlocalm1; 428 | table[index] ^= datum; 429 | } 430 | } 431 | 432 | /* start random number generator at Nth step of stream 433 | routine provided by HPCC */ 434 | 435 | u64Int HPCC_starts(s64Int n) 436 | { 437 | int i, j; 438 | u64Int m2[64]; 439 | u64Int temp, ran; 440 | 441 | while (n < 0) n += PERIOD; 442 | while (n > PERIOD) n -= PERIOD; 443 | if (n == 0) return 0x1; 444 | 445 | temp = 0x1; 446 | for (i=0; i<64; i++) { 447 | m2[i] = temp; 448 | temp = (temp << 1) ^ ((s64Int) temp < 0 ? POLY : 0); 449 | temp = (temp << 1) ^ ((s64Int) temp < 0 ? POLY : 0); 450 | } 451 | 452 | for (i=62; i>=0; i--) 453 | if ((n >> i) & 1) 454 | break; 455 | 456 | ran = 0x2; 457 | while (i > 0) { 458 | temp = 0; 459 | for (j=0; j<64; j++) 460 | if ((ran >> j) & 1) 461 | temp ^= m2[j]; 462 | ran = temp; 463 | i -= 1; 464 | if ((n >> i) & 1) 465 | ran = (ran << 1) ^ ((s64Int) ran < 0 ? 
POLY : 0); 466 | } 467 | 468 | return ran; 469 | } 470 | -------------------------------------------------------------------------------- /MPIRandomAccess_vanilla.c: -------------------------------------------------------------------------------- 1 | /* -*- mode: C; tab-width: 2; indent-tabs-mode: nil; -*- */ 2 | 3 | /* 4 | * This code has been contributed by the DARPA HPCS program. Contact 5 | * David Koester or Bob Lucas 6 | * if you have questions. 7 | * 8 | * 9 | * GUPS (Giga UPdates per Second) is a measurement that profiles the memory 10 | * architecture of a system and is a measure of performance similar to MFLOPS. 11 | * The HPCS HPCchallenge RandomAccess benchmark is intended to exercise the 12 | * GUPS capability of a system, much like the LINPACK benchmark is intended to 13 | * exercise the MFLOPS capability of a computer. In each case, we would 14 | * expect these benchmarks to achieve close to the "peak" capability of the 15 | * memory system. The extent of the similarities between RandomAccess and 16 | * LINPACK are limited to both benchmarks attempting to calculate a peak system 17 | * capability. 18 | * 19 | * GUPS is calculated by identifying the number of memory locations that can be 20 | * randomly updated in one second, divided by 1 billion (1e9). The term "randomly" 21 | * means that there is little relationship between one address to be updated and 22 | * the next, except that they occur in the space of one half the total system 23 | * memory. An update is a read-modify-write operation on a table of 64-bit words. 24 | * An address is generated, the value at that address read from memory, modified 25 | * by an integer operation (add, and, or, xor) with a literal value, and that 26 | * new value is written back to memory. 27 | * 28 | * We are interested in knowing the GUPS performance of both entire systems and 29 | * system subcomponents --- e.g., the GUPS rating of a distributed memory 30 | * multiprocessor the GUPS rating of an SMP node, and the GUPS rating of a 31 | * single processor. While there is typically a scaling of FLOPS with processor 32 | * count, a similar phenomenon may not always occur for GUPS. 33 | * 34 | * Select the memory size to be the power of two such that 2^n <= 1/2 of the 35 | * total memory. Each CPU operates on its own address stream, and the single 36 | * table may be distributed among nodes. The distribution of memory to nodes 37 | * is left to the implementer. A uniform data distribution may help balance 38 | * the workload, while non-uniform data distributions may simplify the 39 | * calculations that identify processor location by eliminating the requirement 40 | * for integer divides. A small (less than 1%) percentage of missed updates 41 | * are permitted. 42 | * 43 | * When implementing a benchmark that measures GUPS on a distributed memory 44 | * multiprocessor system, it may be required to define constraints as to how 45 | * far in the random address stream each node is permitted to "look ahead". 46 | * Likewise, it may be required to define a constraint as to the number of 47 | * update messages that can be stored before processing to permit multi-level 48 | * parallelism for those systems that support such a paradigm. The limits on 49 | * "look ahead" and "stored updates" are being implemented to assure that the 50 | * benchmark meets the intent to profile memory architecture and not induce 51 | * significant artificial data locality. 
For the purpose of measuring GUPS, 52 | * we will stipulate that each thread is permitted to look ahead no more than 53 | * 1024 random address stream samples with the same number of update messages 54 | * stored before processing. 55 | * 56 | * The supplied MPI-1 code generates the input stream {A} on all processors 57 | * and the global table has been distributed as uniformly as possible to 58 | * balance the workload and minimize any Amdahl fraction. This code does not 59 | * exploit "look-ahead". Addresses are sent to the appropriate processor 60 | * where the table entry resides as soon as each address is calculated. 61 | * Updates are performed as addresses are received. Each message is limited 62 | * to a single 64 bit long integer containing element ai from {A}. 63 | * Local offsets for T[ ] are extracted by the destination processor. 64 | * 65 | * If the number of processors is equal to a power of two, then the global 66 | * table can be distributed equally over the processors. In addition, the 67 | * processor number can be determined from that portion of the input stream 68 | * that identifies the address into the global table by masking off log2(p) 69 | * bits in the address. 70 | * 71 | * If the number of processors is not equal to a power of two, then the global 72 | * table cannot be equally distributed between processors. In the MPI-1 73 | * implementation provided, there has been an attempt to minimize the differences 74 | * in workloads and the largest difference in elements of T[ ] is one. The 75 | * number of values in the input stream generated by each processor will be 76 | * related to the number of global table entries on each processor. 77 | * 78 | * The MPI-1 version of RandomAccess treats the potential instance where the 79 | * number of processors is a power of two as a special case, because of the 80 | * significant simplifications possible because processor location and local 81 | * offset can be determined by applying masks to the input stream values. 82 | * The non power of two case uses an integer division to determine the processor 83 | * location. The integer division will be more costly in terms of machine 84 | * cycles to perform than the bit masking operations 85 | * 86 | * For additional information on the GUPS metric, the HPCchallenge RandomAccess 87 | * Benchmark,and the rules to run RandomAccess or modify it to optimize 88 | * performance -- see http://icl.cs.utk.edu/hpcc/ 89 | * 90 | */ 91 | 92 | /* Jan 2005 93 | * 94 | * This code has been modified to allow local bucket sorting of updates. 95 | * The total maximum number of updates in the local buckets of a process 96 | * is currently defined in "RandomAccess.h" as MAX_TOTAL_PENDING_UPDATES. 97 | * When the total maximum number of updates is reached, the process selects 98 | * the bucket (or destination process) with the largest number of 99 | * updates and sends out all the updates in that bucket. See buckets.c 100 | * for details about the buckets' implementation. 101 | * 102 | * This code also supports posting multiple MPI receive descriptors (based 103 | * on a contribution by David Addison). 104 | * 105 | * In addition, this implementation provides an option for limiting 106 | * the execution time of the benchmark to a specified time bound 107 | * (see time_bound.c). The time bound is currently defined in 108 | * time_bound.h, but it should be a benchmark parameter. 
By default 109 | * the benchmark will execute the recommended number of updates, 110 | * that is, four times the global table size. 111 | */ 112 | 113 | #include 114 | 115 | #include "RandomAccess.h" 116 | #include "buckets.h" 117 | #include "time_bound.h" 118 | #include "verification.h" 119 | 120 | #define CHUNK (1024) 121 | #define CHUNKBIG (32768) 122 | 123 | /* Allocate main table (in global memory) */ 124 | u64Int *HPCC_Table; 125 | 126 | u64Int LocalSendBuffer[LOCAL_BUFFER_SIZE]; 127 | u64Int LocalRecvBuffer[MAX_RECV*LOCAL_BUFFER_SIZE]; 128 | 129 | #ifndef LONG_IS_64BITS 130 | static void 131 | Sum64(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { 132 | int i, n = *len; s64Int *invec64 = (s64Int *)invec, *inoutvec64 = (s64Int *)inoutvec; 133 | for (i = n; i; i--, invec64++, inoutvec64++) *inoutvec64 += *invec64; 134 | } 135 | #endif 136 | 137 | static void 138 | AnyNodesMPIRandomAccessUpdate(u64Int logTableSize, 139 | u64Int TableSize, 140 | u64Int LocalTableSize, 141 | u64Int MinLocalTableSize, 142 | u64Int GlobalStartMyProc, 143 | u64Int Top, 144 | int logNumProcs, 145 | int NumProcs, 146 | int Remainder, 147 | int MyProc, 148 | s64Int ProcNumUpdates, 149 | MPI_Datatype INT64_DT) 150 | { 151 | int i,j; 152 | int ipartner,iterate,niterate,npartition,proclo,nlower,nupper,procmid; 153 | int ndata,nkeep,nsend,nrecv,index, nfrac; 154 | u64Int ran,datum,nglobalm1,indexmid; 155 | u64Int *data,*send, *offsets; 156 | MPI_Status status; 157 | 158 | /* setup: should not really be part of this timed routine 159 | NOTE: niterate must be computed from global TableSize * 4 160 | not from ProcNumUpdates since that can be different on each proc 161 | round niterate up by 1 to do slightly more than required updates */ 162 | 163 | data = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 164 | send = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 165 | 166 | for (i = 0; i < LocalTableSize; i++) 167 | HPCC_Table[i] = i + GlobalStartMyProc; 168 | 169 | ran = HPCC_starts(4*GlobalStartMyProc); 170 | 171 | offsets = (u64Int *) malloc((NumProcs+1)*sizeof(u64Int)); 172 | MPI_Allgather(&GlobalStartMyProc,1,INT64_DT,offsets,1,INT64_DT, 173 | MPI_COMM_WORLD); 174 | offsets[NumProcs] = TableSize; 175 | 176 | niterate = 4 * TableSize / NumProcs / CHUNK + 1; 177 | nglobalm1 = TableSize - 1; 178 | 179 | /* actual update loop: this is only section that should be timed */ 180 | 181 | for (iterate = 0; iterate < niterate; iterate++) { 182 | for (i = 0; i < CHUNK; i++) { 183 | ran = (ran << 1) ^ ((s64Int) ran < ZERO64B ? 
POLY : ZERO64B); 184 | data[i] = ran; 185 | } 186 | ndata = CHUNK; 187 | 188 | npartition = NumProcs; 189 | proclo = 0; 190 | while (npartition > 1) { 191 | nlower = npartition/2; 192 | nupper = npartition - nlower; 193 | procmid = proclo + nlower; 194 | indexmid = offsets[procmid]; 195 | 196 | nkeep = nsend = 0; 197 | if (MyProc < procmid) { 198 | for (i = 0; i < ndata; i++) { 199 | if ((data[i] & nglobalm1) >= indexmid) send[nsend++] = data[i]; 200 | else data[nkeep++] = data[i]; 201 | } 202 | } else { 203 | for (i = 0; i < ndata; i++) { 204 | if ((data[i] & nglobalm1) < indexmid) send[nsend++] = data[i]; 205 | else data[nkeep++] = data[i]; 206 | } 207 | } 208 | 209 | if (nlower == nupper) { 210 | if (MyProc < procmid) ipartner = MyProc + nlower; 211 | else ipartner = MyProc - nlower; 212 | MPI_Sendrecv(send,nsend,INT64_DT,ipartner,0,&data[nkeep], 213 | CHUNKBIG,INT64_DT,ipartner,0,MPI_COMM_WORLD,&status); 214 | MPI_Get_count(&status,INT64_DT,&nrecv); 215 | ndata = nkeep + nrecv; 216 | } else { 217 | if (MyProc < procmid) { 218 | nfrac = (nlower - (MyProc-proclo)) * nsend / nupper; 219 | ipartner = MyProc + nlower; 220 | MPI_Sendrecv(send,nfrac,INT64_DT,ipartner,0,&data[nkeep], 221 | CHUNKBIG,INT64_DT,ipartner,0,MPI_COMM_WORLD,&status); 222 | MPI_Get_count(&status,INT64_DT,&nrecv); 223 | nkeep += nrecv; 224 | MPI_Sendrecv(&send[nfrac],nsend-nfrac,INT64_DT,ipartner+1,0, 225 | &data[nkeep],CHUNKBIG,INT64_DT, 226 | ipartner+1,0,MPI_COMM_WORLD,&status); 227 | MPI_Get_count(&status,INT64_DT,&nrecv); 228 | ndata = nkeep + nrecv; 229 | } else if (MyProc > procmid && MyProc < procmid+nlower) { 230 | nfrac = (MyProc - procmid) * nsend / nlower; 231 | ipartner = MyProc - nlower; 232 | MPI_Sendrecv(&send[nfrac],nsend-nfrac,INT64_DT,ipartner,0, 233 | &data[nkeep],CHUNKBIG,INT64_DT, 234 | ipartner,0,MPI_COMM_WORLD,&status); 235 | MPI_Get_count(&status,INT64_DT,&nrecv); 236 | nkeep += nrecv; 237 | MPI_Sendrecv(send,nfrac,INT64_DT,ipartner-1,0,&data[nkeep], 238 | CHUNKBIG,INT64_DT,ipartner-1,0,MPI_COMM_WORLD,&status); 239 | MPI_Get_count(&status,INT64_DT,&nrecv); 240 | ndata = nkeep + nrecv; 241 | } else { 242 | if (MyProc == procmid) ipartner = MyProc - nlower; 243 | else ipartner = MyProc - nupper; 244 | MPI_Sendrecv(send,nsend,INT64_DT,ipartner,0,&data[nkeep], 245 | CHUNKBIG,INT64_DT,ipartner,0,MPI_COMM_WORLD,&status); 246 | MPI_Get_count(&status,INT64_DT,&nrecv); 247 | ndata = nkeep + nrecv; 248 | } 249 | } 250 | 251 | if (MyProc < procmid) npartition = nlower; 252 | else { 253 | proclo = procmid; 254 | npartition = nupper; 255 | } 256 | } 257 | 258 | for (i = 0; i < ndata; i++) { 259 | datum = data[i]; 260 | index = (datum & nglobalm1) - GlobalStartMyProc; 261 | HPCC_Table[index] ^= datum; 262 | } 263 | } 264 | 265 | /* clean up: should not really be part of this timed routine */ 266 | 267 | free(data); 268 | free(send); 269 | free(offsets); 270 | } 271 | 272 | static void 273 | Power2NodesMPIRandomAccessUpdate(u64Int logTableSize, 274 | u64Int TableSize, 275 | u64Int LocalTableSize, 276 | u64Int MinLocalTableSize, 277 | u64Int GlobalStartMyProc, 278 | u64Int Top, 279 | int logNumProcs, 280 | int NumProcs, 281 | int Remainder, 282 | int MyProc, 283 | s64Int ProcNumUpdates, 284 | MPI_Datatype INT64_DT) 285 | { 286 | int i,j; 287 | int logTableLocal,ipartner,iterate,niterate; 288 | int ndata,nkeep,nsend,nrecv,index,nlocalm1; 289 | u64Int ran,datum,procmask; 290 | u64Int *data,*send; 291 | MPI_Status status; 292 | 293 | /* setup: should not really be part of this timed routine */ 294 | 295 | data = 
(u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 296 | send = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 297 | 298 | for (i = 0; i < LocalTableSize; i++) 299 | HPCC_Table[i] = i + GlobalStartMyProc; 300 | 301 | ran = HPCC_starts(4*GlobalStartMyProc); 302 | 303 | niterate = ProcNumUpdates / CHUNK; 304 | logTableLocal = logTableSize - logNumProcs; 305 | nlocalm1 = LocalTableSize - 1; 306 | 307 | /* actual update loop: this is only section that should be timed */ 308 | 309 | for (iterate = 0; iterate < niterate; iterate++) { 310 | for (i = 0; i < CHUNK; i++) { 311 | ran = (ran << 1) ^ ((s64Int) ran < ZERO64B ? POLY : ZERO64B); 312 | data[i] = ran; 313 | } 314 | ndata = CHUNK; 315 | 316 | for (j = 0; j < logNumProcs; j++) { 317 | nkeep = nsend = 0; 318 | ipartner = (1 << j) ^ MyProc; 319 | procmask = ((u64Int) 1) << (logTableLocal + j); 320 | if (ipartner > MyProc) { 321 | for (i = 0; i < ndata; i++) { 322 | if (data[i] & procmask) send[nsend++] = data[i]; 323 | else data[nkeep++] = data[i]; 324 | } 325 | } else { 326 | for (i = 0; i < ndata; i++) { 327 | if (data[i] & procmask) data[nkeep++] = data[i]; 328 | else send[nsend++] = data[i]; 329 | } 330 | } 331 | 332 | MPI_Sendrecv(send,nsend,INT64_DT,ipartner,0, 333 | &data[nkeep],CHUNKBIG,INT64_DT, 334 | ipartner,0,MPI_COMM_WORLD,&status); 335 | MPI_Get_count(&status,INT64_DT,&nrecv); 336 | ndata = nkeep + nrecv; 337 | } 338 | 339 | for (i = 0; i < ndata; i++) { 340 | datum = data[i]; 341 | index = datum & nlocalm1; 342 | HPCC_Table[index] ^= datum; 343 | } 344 | } 345 | 346 | /* clean up: should not really be part of this timed routine */ 347 | 348 | free(data); 349 | free(send); 350 | } 351 | 352 | int 353 | HPCC_MPIRandomAccess(HPCC_Params *params) { 354 | s64Int i; 355 | s64Int NumErrors, GlbNumErrors; 356 | 357 | int NumProcs, logNumProcs, MyProc; 358 | u64Int GlobalStartMyProc; 359 | int Remainder; /* Number of processors with (LocalTableSize + 1) entries */ 360 | u64Int Top; /* Number of table entries in top of Table */ 361 | u64Int LocalTableSize; /* Local table width */ 362 | u64Int MinLocalTableSize; /* Integer ratio TableSize/NumProcs */ 363 | u64Int logTableSize, TableSize; 364 | 365 | double CPUTime; /* CPU time to update table */ 366 | double RealTime; /* Real time to update table */ 367 | 368 | double TotalMem; 369 | int sAbort, rAbort; 370 | int PowerofTwo; 371 | 372 | double timeBound; /* OPTIONAL time bound for execution time */ 373 | u64Int NumUpdates_Default; /* Number of updates to table (suggested: 4x number of table entries) */ 374 | u64Int NumUpdates; /* actual number of updates to table - may be smaller than 375 | * NumUpdates_Default due to execution time bounds */ 376 | s64Int ProcNumUpdates; /* number of updates per processor */ 377 | s64Int GlbNumUpdates; /* for reduction */ 378 | 379 | FILE *outFile = NULL; 380 | MPI_Op sum64; 381 | double *GUPs; 382 | 383 | MPI_Datatype INT64_DT; 384 | 385 | #ifdef LONG_IS_64BITS 386 | INT64_DT = MPI_LONG; 387 | #else 388 | INT64_DT = MPI_LONG_LONG_INT; 389 | #endif 390 | 391 | GUPs = ¶ms->MPIGUPs; 392 | 393 | MPI_Comm_size( MPI_COMM_WORLD, &NumProcs ); 394 | MPI_Comm_rank( MPI_COMM_WORLD, &MyProc ); 395 | 396 | if (0 == MyProc) { 397 | outFile = fopen( params->outFname, "a" ); 398 | if (! 
outFile) outFile = stderr; 399 | } 400 | 401 | TotalMem = params->HPLMaxProcMem; /* max single node memory */ 402 | TotalMem *= NumProcs; /* max memory in NumProcs nodes */ 403 | TotalMem /= sizeof(u64Int); 404 | 405 | /* calculate TableSize --- the size of update array (must be a power of 2) */ 406 | for (TotalMem *= 0.5, logTableSize = 0, TableSize = 1; 407 | TotalMem >= 1.0; 408 | TotalMem *= 0.5, logTableSize++, TableSize <<= 1) 409 | ; /* EMPTY */ 410 | 411 | 412 | /* determine whether the number of processors is a power of 2 */ 413 | for (i = 1, logNumProcs = 0; ; logNumProcs++, i <<= 1) { 414 | if (i == NumProcs) { 415 | PowerofTwo = HPCC_TRUE; 416 | Remainder = 0; 417 | Top = 0; 418 | MinLocalTableSize = (TableSize / NumProcs); 419 | LocalTableSize = MinLocalTableSize; 420 | GlobalStartMyProc = (MinLocalTableSize * MyProc); 421 | break; 422 | 423 | /* number of processes is not a power 2 (too many shifts may introduce negative values or 0) */ 424 | 425 | } 426 | else if (i > NumProcs || i <= 0) { 427 | PowerofTwo = HPCC_FALSE; 428 | /* Minimum local table size --- some processors have an additional entry */ 429 | MinLocalTableSize = (TableSize / NumProcs); 430 | /* Number of processors with (LocalTableSize + 1) entries */ 431 | Remainder = TableSize - (MinLocalTableSize * NumProcs); 432 | /* Number of table entries in top of Table */ 433 | Top = (MinLocalTableSize + 1) * Remainder; 434 | /* Local table size */ 435 | if (MyProc < Remainder) { 436 | LocalTableSize = (MinLocalTableSize + 1); 437 | GlobalStartMyProc = ( (MinLocalTableSize + 1) * MyProc); 438 | } 439 | else { 440 | LocalTableSize = MinLocalTableSize; 441 | GlobalStartMyProc = ( (MinLocalTableSize * MyProc) + Remainder ); 442 | } 443 | break; 444 | 445 | } /* end else if */ 446 | } /* end for i */ 447 | 448 | 449 | HPCC_Table = XMALLOC( u64Int, LocalTableSize); 450 | sAbort = 0; if (! 
HPCC_Table) sAbort = 1; 451 | 452 | MPI_Allreduce( &sAbort, &rAbort, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); 453 | if (rAbort > 0) { 454 | if (MyProc == 0) fprintf(outFile, "Failed to allocate memory for the main table.\n"); 455 | goto failed_table; 456 | } 457 | 458 | params->MPIRandomAccess_N = (s64Int)TableSize; 459 | 460 | /* Default number of global updates to table: 4x number of table entries */ 461 | NumUpdates_Default = 4 * TableSize; 462 | 463 | #ifdef RA_TIME_BOUND 464 | /* estimate number of updates such that execution time does not exceed time bound */ 465 | /* time_bound should be a parameter */ 466 | /* max run time in seconds */ 467 | timeBound = Mmax( 0.25 * params->HPLrdata.time, (double)TIME_BOUND ); 468 | if (PowerofTwo) { 469 | HPCC_Power2NodesTime(logTableSize, TableSize, LocalTableSize, 470 | MinLocalTableSize, GlobalStartMyProc, Top, 471 | logNumProcs, NumProcs, Remainder, 472 | MyProc, INT64_DT, timeBound, (u64Int *)&ProcNumUpdates); 473 | 474 | } else { 475 | HPCC_AnyNodesTime(logTableSize, TableSize, LocalTableSize, 476 | MinLocalTableSize, GlobalStartMyProc, Top, 477 | logNumProcs, NumProcs, Remainder, 478 | MyProc, INT64_DT, timeBound, (u64Int *)&ProcNumUpdates); 479 | } 480 | /* be conservative: get the smallest number of updates among all procs */ 481 | MPI_Reduce( &ProcNumUpdates, &GlbNumUpdates, 1, INT64_DT, 482 | MPI_MIN, 0, MPI_COMM_WORLD ); 483 | /* distribute number of updates per proc to all procs */ 484 | MPI_Bcast( &GlbNumUpdates, 1, INT64_DT, 0, MPI_COMM_WORLD ); 485 | ProcNumUpdates = Mmin(GlbNumUpdates, (4*LocalTableSize)); 486 | /* works for both PowerofTwo and AnyNodes */ 487 | NumUpdates = Mmin((ProcNumUpdates*NumProcs), NumUpdates_Default); 488 | 489 | #else 490 | ProcNumUpdates = 4*LocalTableSize; 491 | NumUpdates = NumUpdates_Default; 492 | #endif 493 | 494 | if (MyProc == 0) { 495 | fprintf( outFile, "Running on %d processors%s\n", NumProcs, PowerofTwo ? 
" (PowerofTwo)" : ""); 496 | fprintf( outFile, "Total Main table size = 2^" FSTR64 " = " FSTR64 " words\n", 497 | logTableSize, TableSize ); 498 | if (PowerofTwo) 499 | fprintf( outFile, "PE Main table size = 2^" FSTR64 " = " FSTR64 " words/PE\n", 500 | (logTableSize - logNumProcs), TableSize/NumProcs ); 501 | else 502 | fprintf( outFile, "PE Main table size = (2^" FSTR64 ")/%d = " FSTR64 " words/PE MAX\n", 503 | logTableSize, NumProcs, LocalTableSize); 504 | 505 | fprintf( outFile, "Default number of updates (RECOMMENDED) = " FSTR64 "\n", NumUpdates_Default); 506 | #ifdef RA_TIME_BOUND 507 | fprintf( outFile, "Number of updates EXECUTED = " FSTR64 " (for a TIME BOUND of %.2f secs)\n", 508 | NumUpdates, timeBound); 509 | #endif 510 | params->MPIRandomAccess_ExeUpdates = NumUpdates; 511 | params->MPIRandomAccess_TimeBound = timeBound; 512 | } 513 | 514 | MPI_Barrier( MPI_COMM_WORLD ); 515 | 516 | CPUTime = -CPUSEC(); 517 | RealTime = -RTSEC(); 518 | 519 | if (PowerofTwo) { 520 | Power2NodesMPIRandomAccessUpdate(logTableSize, TableSize, LocalTableSize, 521 | MinLocalTableSize, GlobalStartMyProc, Top, 522 | logNumProcs, NumProcs, Remainder, 523 | MyProc, ProcNumUpdates, INT64_DT); 524 | } else { 525 | AnyNodesMPIRandomAccessUpdate(logTableSize, TableSize, LocalTableSize, 526 | MinLocalTableSize, GlobalStartMyProc, Top, 527 | logNumProcs, NumProcs, Remainder, 528 | MyProc, ProcNumUpdates, INT64_DT); 529 | } 530 | 531 | 532 | MPI_Barrier( MPI_COMM_WORLD ); 533 | 534 | /* End timed section */ 535 | CPUTime += CPUSEC(); 536 | RealTime += RTSEC(); 537 | 538 | /* Print timing results */ 539 | if (MyProc == 0){ 540 | params->MPIRandomAccess_time = RealTime; 541 | *GUPs = 1e-9*NumUpdates / RealTime; 542 | fprintf( outFile, "CPU time used = %.6f seconds\n", CPUTime ); 543 | fprintf( outFile, "Real time used = %.6f seconds\n", RealTime ); 544 | fprintf( outFile, "%.9f Billion(10^9) Updates per second [GUP/s]\n", 545 | *GUPs ); 546 | fprintf( outFile, "%.9f Billion(10^9) Updates/PE per second [GUP/s]\n", 547 | *GUPs / NumProcs ); 548 | /* No longer reporting per CPU number */ 549 | /* *GUPs /= NumProcs; */ 550 | } 551 | /* distribute result to all nodes */ 552 | MPI_Bcast( GUPs, 1, MPI_INT, 0, MPI_COMM_WORLD ); 553 | 554 | 555 | /* Verification phase */ 556 | 557 | /* Begin timing here */ 558 | CPUTime = -CPUSEC(); 559 | RealTime = -RTSEC(); 560 | 561 | if (PowerofTwo) { 562 | HPCC_Power2NodesMPIRandomAccessCheck(logTableSize, TableSize, LocalTableSize, 563 | GlobalStartMyProc, 564 | logNumProcs, NumProcs, 565 | MyProc, ProcNumUpdates, 566 | INT64_DT, &NumErrors); 567 | } 568 | else { 569 | HPCC_AnyNodesMPIRandomAccessCheck(logTableSize, TableSize, LocalTableSize, 570 | MinLocalTableSize, GlobalStartMyProc, Top, 571 | logNumProcs, NumProcs, Remainder, 572 | MyProc, ProcNumUpdates, 573 | INT64_DT, &NumErrors); 574 | } 575 | 576 | 577 | #ifdef LONG_IS_64BITS 578 | MPI_Reduce( &NumErrors, &GlbNumErrors, 1, MPI_LONG, MPI_SUM, 0, MPI_COMM_WORLD ); 579 | #else 580 | /* MPI 1.1 standard (obsolete at this point) doesn't define MPI_SUM 581 | to work on `long long': 582 | http://www.mpi-forum.org/docs/mpi-11-html/node78.html and 583 | therefore LAM 6.5.6 chooses not to implement it (even though there 584 | is code for it in LAM and for other reductions work OK, 585 | e.g. MPI_MAX). 
MPICH 1.2.5 doesn't complain about MPI_SUM but it 586 | doesn't have MPI_UNSIGNED_LONG_LONG (but has MPI_LONG_LONG_INT): 587 | http://www.mpi-forum.org/docs/mpi-20-html/node84.htm So I need to 588 | create a trivial summation operation. */ 589 | MPI_Op_create( Sum64, 1, &sum64 ); 590 | MPI_Reduce( &NumErrors, &GlbNumErrors, 1, INT64_DT, sum64, 0, MPI_COMM_WORLD ); 591 | MPI_Op_free( &sum64 ); 592 | #endif 593 | 594 | /* End timed section */ 595 | CPUTime += CPUSEC(); 596 | RealTime += RTSEC(); 597 | 598 | if(MyProc == 0){ 599 | params->MPIRandomAccess_CheckTime = RealTime; 600 | fprintf( outFile, "Verification: CPU time used = %.6f seconds\n", CPUTime); 601 | fprintf( outFile, "Verification: Real time used = %.6f seconds\n", RealTime); 602 | fprintf( outFile, "Found " FSTR64 " errors in " FSTR64 " locations (%s).\n", 603 | GlbNumErrors, TableSize, (GlbNumErrors <= 0.01*TableSize) ? 604 | "passed" : "failed"); 605 | if (GlbNumErrors > 0.01*TableSize) params->Failure = 1; 606 | params->MPIRandomAccess_Errors = (s64Int)GlbNumErrors; 607 | params->MPIRandomAccess_ErrorsFraction = (double)GlbNumErrors / (double)TableSize; 608 | } 609 | /* End verification phase */ 610 | 611 | 612 | /* Deallocate memory (in reverse order of allocation which should 613 | help fragmentation) */ 614 | 615 | free( HPCC_Table ); 616 | 617 | failed_table: 618 | 619 | if (0 == MyProc) if (outFile != stderr) fclose( outFile ); 620 | 621 | MPI_Barrier( MPI_COMM_WORLD ); 622 | 623 | return 0; 624 | } 625 | -------------------------------------------------------------------------------- /MPIRandomAccess_opt.c: -------------------------------------------------------------------------------- 1 | /* -*- mode: C; tab-width: 2; indent-tabs-mode: nil; -*- */ 2 | 3 | /* 4 | * This code has been contributed by the DARPA HPCS program. Contact 5 | * David Koester or Bob Lucas 6 | * if you have questions. 7 | * 8 | * 9 | * GUPS (Giga UPdates per Second) is a measurement that profiles the memory 10 | * architecture of a system and is a measure of performance similar to MFLOPS. 11 | * The HPCS HPCchallenge RandomAccess benchmark is intended to exercise the 12 | * GUPS capability of a system, much like the LINPACK benchmark is intended to 13 | * exercise the MFLOPS capability of a computer. In each case, we would 14 | * expect these benchmarks to achieve close to the "peak" capability of the 15 | * memory system. The extent of the similarities between RandomAccess and 16 | * LINPACK are limited to both benchmarks attempting to calculate a peak system 17 | * capability. 18 | * 19 | * GUPS is calculated by identifying the number of memory locations that can be 20 | * randomly updated in one second, divided by 1 billion (1e9). The term "randomly" 21 | * means that there is little relationship between one address to be updated and 22 | * the next, except that they occur in the space of one half the total system 23 | * memory. An update is a read-modify-write operation on a table of 64-bit words. 24 | * An address is generated, the value at that address read from memory, modified 25 | * by an integer operation (add, and, or, xor) with a literal value, and that 26 | * new value is written back to memory. 27 | * 28 | * We are interested in knowing the GUPS performance of both entire systems and 29 | * system subcomponents --- e.g., the GUPS rating of a distributed memory 30 | * multiprocessor the GUPS rating of an SMP node, and the GUPS rating of a 31 | * single processor. 
While there is typically a scaling of FLOPS with processor 32 | * count, a similar phenomenon may not always occur for GUPS. 33 | * 34 | * Select the memory size to be the power of two such that 2^n <= 1/2 of the 35 | * total memory. Each CPU operates on its own address stream, and the single 36 | * table may be distributed among nodes. The distribution of memory to nodes 37 | * is left to the implementer. A uniform data distribution may help balance 38 | * the workload, while non-uniform data distributions may simplify the 39 | * calculations that identify processor location by eliminating the requirement 40 | * for integer divides. A small (less than 1%) percentage of missed updates 41 | * are permitted. 42 | * 43 | * When implementing a benchmark that measures GUPS on a distributed memory 44 | * multiprocessor system, it may be required to define constraints as to how 45 | * far in the random address stream each node is permitted to "look ahead". 46 | * Likewise, it may be required to define a constraint as to the number of 47 | * update messages that can be stored before processing to permit multi-level 48 | * parallelism for those systems that support such a paradigm. The limits on 49 | * "look ahead" and "stored updates" are being implemented to assure that the 50 | * benchmark meets the intent to profile memory architecture and not induce 51 | * significant artificial data locality. For the purpose of measuring GUPS, 52 | * we will stipulate that each thread is permitted to look ahead no more than 53 | * 1024 random address stream samples with the same number of update messages 54 | * stored before processing. 55 | * 56 | * The supplied MPI-1 code generates the input stream {A} on all processors 57 | * and the global table has been distributed as uniformly as possible to 58 | * balance the workload and minimize any Amdahl fraction. This code does not 59 | * exploit "look-ahead". Addresses are sent to the appropriate processor 60 | * where the table entry resides as soon as each address is calculated. 61 | * Updates are performed as addresses are received. Each message is limited 62 | * to a single 64 bit long integer containing element ai from {A}. 63 | * Local offsets for T[ ] are extracted by the destination processor. 64 | * 65 | * If the number of processors is equal to a power of two, then the global 66 | * table can be distributed equally over the processors. In addition, the 67 | * processor number can be determined from that portion of the input stream 68 | * that identifies the address into the global table by masking off log2(p) 69 | * bits in the address. 70 | * 71 | * If the number of processors is not equal to a power of two, then the global 72 | * table cannot be equally distributed between processors. In the MPI-1 73 | * implementation provided, there has been an attempt to minimize the differences 74 | * in workloads and the largest difference in elements of T[ ] is one. The 75 | * number of values in the input stream generated by each processor will be 76 | * related to the number of global table entries on each processor. 77 | * 78 | * The MPI-1 version of RandomAccess treats the potential instance where the 79 | * number of processors is a power of two as a special case, because of the 80 | * significant simplifications possible because processor location and local 81 | * offset can be determined by applying masks to the input stream values. 82 | * The non power of two case uses an integer division to determine the processor 83 | * location. 
The integer division will be more costly in terms of machine 84 | * cycles to perform than the bit masking operations 85 | * 86 | * For additional information on the GUPS metric, the HPCchallenge RandomAccess 87 | * Benchmark,and the rules to run RandomAccess or modify it to optimize 88 | * performance -- see http://icl.cs.utk.edu/hpcc/ 89 | * 90 | */ 91 | 92 | /* Jan 2005 93 | * 94 | * This code has been modified to allow local bucket sorting of updates. 95 | * The total maximum number of updates in the local buckets of a process 96 | * is currently defined in "RandomAccess.h" as MAX_TOTAL_PENDING_UPDATES. 97 | * When the total maximum number of updates is reached, the process selects 98 | * the bucket (or destination process) with the largest number of 99 | * updates and sends out all the updates in that bucket. See buckets.c 100 | * for details about the buckets' implementation. 101 | * 102 | * This code also supports posting multiple MPI receive descriptors (based 103 | * on a contribution by David Addison). 104 | * 105 | * In addition, this implementation provides an option for limiting 106 | * the execution time of the benchmark to a specified time bound 107 | * (see time_bound.c). The time bound is currently defined in 108 | * time_bound.h, but it should be a benchmark parameter. By default 109 | * the benchmark will execute the recommended number of updates, 110 | * that is, four times the global table size. 111 | */ 112 | 113 | #include 114 | 115 | #include "RandomAccess.h" 116 | #include "buckets.h" 117 | #include "time_bound.h" 118 | #include "verification.h" 119 | 120 | #define CHUNK (1024) 121 | #define CHUNKBIG (32768) 122 | #define RCHUNK (16384) 123 | #define PITER 8 124 | #define MAXLOGPROCS 20 125 | 126 | /* Allocate main table (in global memory) */ 127 | 128 | u64Int *HPCC_Table; 129 | u64Int LocalSendBuffer[LOCAL_BUFFER_SIZE]; 130 | u64Int LocalRecvBuffer[MAX_RECV*LOCAL_BUFFER_SIZE]; 131 | 132 | #ifndef LONG_IS_64BITS 133 | static void 134 | Sum64(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { 135 | int i, n = *len; s64Int *invec64 = (s64Int *)invec, *inoutvec64 = (s64Int *)inoutvec; 136 | for (i = n; i; i--, invec64++, inoutvec64++) *inoutvec64 += *invec64; 137 | } 138 | #endif 139 | 140 | static void 141 | AnyNodesMPIRandomAccessUpdate(u64Int logTableSize, 142 | u64Int TableSize, 143 | u64Int LocalTableSize, 144 | u64Int MinLocalTableSize, 145 | u64Int GlobalStartMyProc, 146 | u64Int Top, 147 | int logNumProcs, 148 | int NumProcs, 149 | int Remainder, 150 | int MyProc, 151 | s64Int ProcNumUpdates, 152 | MPI_Datatype INT64_DT) 153 | { 154 | int i,j; 155 | int ipartner,iterate,niterate,npartition,proclo,nlower,nupper,procmid; 156 | int ndata,nkeep,nsend,nrecv,index, nfrac; 157 | u64Int ran,datum,nglobalm1,indexmid; 158 | u64Int *data,*send, *offsets; 159 | MPI_Status status; 160 | 161 | /* setup: should not really be part of this timed routine 162 | NOTE: niterate must be computed from global TableSize * 4 163 | not from ProcNumUpdates since that can be different on each proc 164 | round niterate up by 1 to do slightly more than required updates */ 165 | 166 | data = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 167 | send = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 168 | 169 | for (i = 0; i < LocalTableSize; i++) 170 | HPCC_Table[i] = i + GlobalStartMyProc; 171 | 172 | ran = HPCC_starts(4*GlobalStartMyProc); 173 | 174 | offsets = (u64Int *) malloc((NumProcs+1)*sizeof(u64Int)); 175 | MPI_Allgather(&GlobalStartMyProc,1,INT64_DT,offsets,1,INT64_DT, 176 | 
MPI_COMM_WORLD); 177 | offsets[NumProcs] = TableSize; 178 | 179 | niterate = 4 * TableSize / NumProcs / CHUNK + 1; 180 | nglobalm1 = TableSize - 1; 181 | 182 | /* actual update loop: this is only section that should be timed */ 183 | 184 | for (iterate = 0; iterate < niterate; iterate++) { 185 | for (i = 0; i < CHUNK; i++) { 186 | ran = (ran << 1) ^ ((s64Int) ran < ZERO64B ? POLY : ZERO64B); 187 | data[i] = ran; 188 | } 189 | ndata = CHUNK; 190 | 191 | npartition = NumProcs; 192 | proclo = 0; 193 | while (npartition > 1) { 194 | nlower = npartition/2; 195 | nupper = npartition - nlower; 196 | procmid = proclo + nlower; 197 | indexmid = offsets[procmid]; 198 | 199 | nkeep = nsend = 0; 200 | if (MyProc < procmid) { 201 | for (i = 0; i < ndata; i++) { 202 | if ((data[i] & nglobalm1) >= indexmid) send[nsend++] = data[i]; 203 | else data[nkeep++] = data[i]; 204 | } 205 | } else { 206 | for (i = 0; i < ndata; i++) { 207 | if ((data[i] & nglobalm1) < indexmid) send[nsend++] = data[i]; 208 | else data[nkeep++] = data[i]; 209 | } 210 | } 211 | 212 | if (nlower == nupper) { 213 | if (MyProc < procmid) ipartner = MyProc + nlower; 214 | else ipartner = MyProc - nlower; 215 | MPI_Sendrecv(send,nsend,INT64_DT,ipartner,0,&data[nkeep], 216 | CHUNKBIG,INT64_DT,ipartner,0,MPI_COMM_WORLD,&status); 217 | MPI_Get_count(&status,INT64_DT,&nrecv); 218 | ndata = nkeep + nrecv; 219 | } else { 220 | if (MyProc < procmid) { 221 | nfrac = (nlower - (MyProc-proclo)) * nsend / nupper; 222 | ipartner = MyProc + nlower; 223 | MPI_Sendrecv(send,nfrac,INT64_DT,ipartner,0,&data[nkeep], 224 | CHUNKBIG,INT64_DT,ipartner,0,MPI_COMM_WORLD,&status); 225 | MPI_Get_count(&status,INT64_DT,&nrecv); 226 | nkeep += nrecv; 227 | MPI_Sendrecv(&send[nfrac],nsend-nfrac,INT64_DT,ipartner+1,0, 228 | &data[nkeep],CHUNKBIG,INT64_DT, 229 | ipartner+1,0,MPI_COMM_WORLD,&status); 230 | MPI_Get_count(&status,INT64_DT,&nrecv); 231 | ndata = nkeep + nrecv; 232 | } else if (MyProc > procmid && MyProc < procmid+nlower) { 233 | nfrac = (MyProc - procmid) * nsend / nlower; 234 | ipartner = MyProc - nlower; 235 | MPI_Sendrecv(&send[nfrac],nsend-nfrac,INT64_DT,ipartner,0, 236 | &data[nkeep],CHUNKBIG,INT64_DT, 237 | ipartner,0,MPI_COMM_WORLD,&status); 238 | MPI_Get_count(&status,INT64_DT,&nrecv); 239 | nkeep += nrecv; 240 | MPI_Sendrecv(send,nfrac,INT64_DT,ipartner-1,0,&data[nkeep], 241 | CHUNKBIG,INT64_DT,ipartner-1,0,MPI_COMM_WORLD,&status); 242 | MPI_Get_count(&status,INT64_DT,&nrecv); 243 | ndata = nkeep + nrecv; 244 | } else { 245 | if (MyProc == procmid) ipartner = MyProc - nlower; 246 | else ipartner = MyProc - nupper; 247 | MPI_Sendrecv(send,nsend,INT64_DT,ipartner,0,&data[nkeep], 248 | CHUNKBIG,INT64_DT,ipartner,0,MPI_COMM_WORLD,&status); 249 | MPI_Get_count(&status,INT64_DT,&nrecv); 250 | ndata = nkeep + nrecv; 251 | } 252 | } 253 | 254 | if (MyProc < procmid) npartition = nlower; 255 | else { 256 | proclo = procmid; 257 | npartition = nupper; 258 | } 259 | } 260 | 261 | for (i = 0; i < ndata; i++) { 262 | datum = data[i]; 263 | index = (datum & nglobalm1) - GlobalStartMyProc; 264 | HPCC_Table[index] ^= datum; 265 | } 266 | } 267 | 268 | /* clean up: should not really be part of this timed routine */ 269 | 270 | free(data); 271 | free(send); 272 | free(offsets); 273 | } 274 | 275 | /* This sort is manually unrolled to make sure the compiler can see 276 | * the parallelism -KDU 277 | */ 278 | 279 | void sort_data(u64Int *source, u64Int *nomatch, u64Int *match, int number, 280 | int *nnomatch, int *nmatch, int mask_shift) 281 | { 282 | int 
i,dindex,myselect[8],counts[2]; 283 | int div_num = number / 8; 284 | int loop_total = div_num * 8; 285 | u64Int procmask = ((u64Int) 1) << mask_shift; 286 | u64Int *buffers[2]; 287 | 288 | buffers[0] = nomatch; 289 | counts[0] = *nnomatch; 290 | buffers[1] = match; 291 | counts[1] = *nmatch; 292 | 293 | for (i = 0; i < div_num; i++) { 294 | dindex = i*8; 295 | myselect[0] = (source[dindex] & procmask) >> mask_shift; 296 | myselect[1] = (source[dindex+1] & procmask) >> mask_shift; 297 | myselect[2] = (source[dindex+2] & procmask) >> mask_shift; 298 | myselect[3] = (source[dindex+3] & procmask) >> mask_shift; 299 | myselect[4] = (source[dindex+4] & procmask) >> mask_shift; 300 | myselect[5] = (source[dindex+5] & procmask) >> mask_shift; 301 | myselect[6] = (source[dindex+6] & procmask) >> mask_shift; 302 | myselect[7] = (source[dindex+7] & procmask) >> mask_shift; 303 | buffers[myselect[0]][counts[myselect[0]]++] = source[dindex]; 304 | buffers[myselect[1]][counts[myselect[1]]++] = source[dindex+1]; 305 | buffers[myselect[2]][counts[myselect[2]]++] = source[dindex+2]; 306 | buffers[myselect[3]][counts[myselect[3]]++] = source[dindex+3]; 307 | buffers[myselect[4]][counts[myselect[4]]++] = source[dindex+4]; 308 | buffers[myselect[5]][counts[myselect[5]]++] = source[dindex+5]; 309 | buffers[myselect[6]][counts[myselect[6]]++] = source[dindex+6]; 310 | buffers[myselect[7]][counts[myselect[7]]++] = source[dindex+7]; 311 | } 312 | 313 | for (i = loop_total; i < number; i++) { 314 | u64Int mydata = source[i]; 315 | if (mydata & procmask) buffers[1][counts[1]++] = mydata; 316 | else buffers[0][counts[0]++] = mydata; 317 | } 318 | 319 | *nnomatch = counts[0]; 320 | *nmatch = counts[1]; 321 | } 322 | 323 | /* Manual unrolling is a significant win if -Msafeptr is used -KDU */ 324 | 325 | inline update_table(u64Int *data, u64Int *table, int number, int nlocalm1) 326 | { 327 | int i,dindex,index; 328 | int div_num = number / 8; 329 | int loop_total = div_num * 8; 330 | u64Int index0,index1,index2,index3,index4,index5,index6,index7; 331 | u64Int ltable0,ltable1,ltable2,ltable3,ltable4,ltable5,ltable6,ltable7; 332 | 333 | for (i = 0; i < div_num; i++) { 334 | dindex = i*8; 335 | 336 | index0 = data[dindex] & nlocalm1; 337 | index1 = data[dindex+1] & nlocalm1; 338 | index2 = data[dindex+2] & nlocalm1; 339 | index3 = data[dindex+3] & nlocalm1; 340 | index4 = data[dindex+4] & nlocalm1; 341 | index5 = data[dindex+5] & nlocalm1; 342 | index6 = data[dindex+6] & nlocalm1; 343 | index7 = data[dindex+7] & nlocalm1; 344 | ltable0 = table[index0]; 345 | ltable1 = table[index1]; 346 | ltable2 = table[index2]; 347 | ltable3 = table[index3]; 348 | ltable4 = table[index4]; 349 | ltable5 = table[index5]; 350 | ltable6 = table[index6]; 351 | ltable7 = table[index7]; 352 | 353 | table[index0] = ltable0 ^ data[dindex]; 354 | table[index1] = ltable1 ^ data[dindex+1]; 355 | table[index2] = ltable2 ^ data[dindex+2]; 356 | table[index3] = ltable3 ^ data[dindex+3]; 357 | table[index4] = ltable4 ^ data[dindex+4]; 358 | table[index5] = ltable5 ^ data[dindex+5]; 359 | table[index6] = ltable6 ^ data[dindex+6]; 360 | table[index7] = ltable7 ^ data[dindex+7]; 361 | } 362 | 363 | for (i = loop_total; i < number; i++) { 364 | u64Int datum = data[i]; 365 | index = datum & nlocalm1; 366 | table[index] ^= datum; 367 | } 368 | } 369 | 370 | static void 371 | Power2NodesMPIRandomAccessUpdate(u64Int logTableSize, 372 | u64Int TableSize, 373 | u64Int LocalTableSize, 374 | u64Int MinLocalTableSize, 375 | u64Int GlobalStartMyProc, 376 | u64Int 
Top, 377 | int logNumProcs, 378 | int NumProcs, 379 | int Remainder, 380 | int MyProc, 381 | s64Int ProcNumUpdates, 382 | MPI_Datatype INT64_DT) 383 | { 384 | int i,j,k; 385 | int logTableLocal,ipartner,iterate,niterate,iter_mod; 386 | int ndata,nkeep,nsend,nrecv,nlocalm1, nkept; 387 | u64Int ran,datum,procmask; 388 | u64Int *data,*send,*send1,*send2; 389 | u64Int *recv[PITER][MAXLOGPROCS]; 390 | MPI_Status status; 391 | MPI_Request request[PITER][MAXLOGPROCS]; 392 | MPI_Request srequest; 393 | 394 | /* setup: should not really be part of this timed routine */ 395 | 396 | data = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 397 | send1 = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 398 | send2 = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 399 | send = send1; 400 | 401 | for (j = 0; j < PITER; j++) 402 | for (i = 0; i < logNumProcs; i++) 403 | recv[j][i] = (u64Int *) malloc(sizeof(u64Int)*RCHUNK); 404 | 405 | for (i = 0; i < LocalTableSize; i++) 406 | HPCC_Table[i] = i + GlobalStartMyProc; 407 | 408 | ran = HPCC_starts(4*GlobalStartMyProc); 409 | 410 | niterate = ProcNumUpdates / CHUNK; 411 | logTableLocal = logTableSize - logNumProcs; 412 | nlocalm1 = LocalTableSize - 1; 413 | 414 | /* actual update loop: this is only section that should be timed */ 415 | 416 | for (iterate = 0; iterate < niterate; iterate++) { 417 | iter_mod = iterate % PITER; 418 | for (i = 0; i < CHUNK; i++) { 419 | ran = (ran << 1) ^ ((s64Int) ran < ZERO64B ? POLY : ZERO64B); 420 | data[i] = ran; 421 | } 422 | nkept = CHUNK; 423 | nrecv = 0; 424 | 425 | if (iter_mod == 0) 426 | for (k = 0; k < PITER; k++) 427 | for (j = 0; j < logNumProcs; j++) { 428 | ipartner = (1 << j) ^ MyProc; 429 | MPI_Irecv(recv[k][j],RCHUNK,INT64_DT,ipartner,0,MPI_COMM_WORLD, 430 | &request[k][j]); 431 | } 432 | 433 | for (j = 0; j < logNumProcs; j++) { 434 | nkeep = nsend = 0; 435 | send = (send == send1) ? 
send2 : send1; 436 | ipartner = (1 << j) ^ MyProc; 437 | procmask = ((u64Int) 1) << (logTableLocal + j); 438 | if (ipartner > MyProc) { 439 | sort_data(data,data,send,nkept,&nkeep,&nsend,logTableLocal+j); 440 | if (j > 0) { 441 | MPI_Wait(&request[iter_mod][j-1],&status); 442 | MPI_Get_count(&status,INT64_DT,&nrecv); 443 | sort_data(recv[iter_mod][j-1],data,send,nrecv,&nkeep, 444 | &nsend,logTableLocal+j); 445 | } 446 | } else { 447 | sort_data(data,send,data,nkept,&nsend,&nkeep,logTableLocal+j); 448 | if (j > 0) { 449 | MPI_Wait(&request[iter_mod][j-1],&status); 450 | MPI_Get_count(&status,INT64_DT,&nrecv); 451 | sort_data(recv[iter_mod][j-1],send,data,nrecv,&nsend, 452 | &nkeep,logTableLocal+j); 453 | } 454 | } 455 | if (j > 0) MPI_Wait(&srequest,&status); 456 | MPI_Isend(send,nsend,INT64_DT,ipartner,0,MPI_COMM_WORLD,&srequest); 457 | if (j == (logNumProcs - 1)) update_table(data,HPCC_Table,nkeep,nlocalm1); 458 | nkept = nkeep; 459 | } 460 | 461 | if (logNumProcs == 0) update_table(data,HPCC_Table,nkept,nlocalm1); 462 | else { 463 | MPI_Wait(&request[iter_mod][j-1],&status); 464 | MPI_Get_count(&status,INT64_DT,&nrecv); 465 | update_table(recv[iter_mod][j-1],HPCC_Table,nrecv,nlocalm1); 466 | MPI_Wait(&srequest,&status); 467 | } 468 | 469 | ndata = nkept + nrecv; 470 | } 471 | 472 | /* clean up: should not really be part of this timed routine */ 473 | 474 | for (j = 0; j < PITER; j++) 475 | for (i = 0; i < logNumProcs; i++) free(recv[j][i]); 476 | 477 | free(data); 478 | free(send1); 479 | free(send2); 480 | } 481 | 482 | int 483 | HPCC_MPIRandomAccess(HPCC_Params *params) { 484 | s64Int i; 485 | s64Int NumErrors, GlbNumErrors; 486 | 487 | int NumProcs, logNumProcs, MyProc; 488 | u64Int GlobalStartMyProc; 489 | int Remainder; /* Number of processors with (LocalTableSize + 1) entries */ 490 | u64Int Top; /* Number of table entries in top of Table */ 491 | u64Int LocalTableSize; /* Local table width */ 492 | u64Int MinLocalTableSize; /* Integer ratio TableSize/NumProcs */ 493 | u64Int logTableSize, TableSize; 494 | 495 | double CPUTime; /* CPU time to update table */ 496 | double RealTime; /* Real time to update table */ 497 | 498 | double TotalMem; 499 | int sAbort, rAbort; 500 | int PowerofTwo; 501 | 502 | double timeBound; /* OPTIONAL time bound for execution time */ 503 | u64Int NumUpdates_Default; /* Number of updates to table (suggested: 4x number of table entries) */ 504 | u64Int NumUpdates; /* actual number of updates to table - may be smaller than 505 | * NumUpdates_Default due to execution time bounds */ 506 | s64Int ProcNumUpdates; /* number of updates per processor */ 507 | s64Int GlbNumUpdates; /* for reduction */ 508 | 509 | FILE *outFile = NULL; 510 | MPI_Op sum64; 511 | double *GUPs; 512 | 513 | MPI_Datatype INT64_DT; 514 | 515 | #ifdef LONG_IS_64BITS 516 | INT64_DT = MPI_LONG; 517 | #else 518 | INT64_DT = MPI_LONG_LONG_INT; 519 | #endif 520 | 521 | GUPs = &params->MPIGUPs; 522 | 523 | MPI_Comm_size( MPI_COMM_WORLD, &NumProcs ); 524 | MPI_Comm_rank( MPI_COMM_WORLD, &MyProc ); 525 | 526 | if (0 == MyProc) { 527 | outFile = fopen( params->outFname, "a" ); 528 | if (!
outFile) outFile = stderr; 529 | } 530 | 531 | TotalMem = params->HPLMaxProcMem; /* max single node memory */ 532 | TotalMem *= NumProcs; /* max memory in NumProcs nodes */ 533 | TotalMem /= sizeof(u64Int); 534 | 535 | /* calculate TableSize --- the size of update array (must be a power of 2) */ 536 | for (TotalMem *= 0.5, logTableSize = 0, TableSize = 1; 537 | TotalMem >= 1.0; 538 | TotalMem *= 0.5, logTableSize++, TableSize <<= 1) 539 | ; /* EMPTY */ 540 | 541 | 542 | /* determine whether the number of processors is a power of 2 */ 543 | for (i = 1, logNumProcs = 0; ; logNumProcs++, i <<= 1) { 544 | if (i == NumProcs) { 545 | PowerofTwo = HPCC_TRUE; 546 | Remainder = 0; 547 | Top = 0; 548 | MinLocalTableSize = (TableSize / NumProcs); 549 | LocalTableSize = MinLocalTableSize; 550 | GlobalStartMyProc = (MinLocalTableSize * MyProc); 551 | break; 552 | 553 | /* number of processes is not a power 2 (too many shifts may introduce negative values or 0) */ 554 | 555 | } 556 | else if (i > NumProcs || i <= 0) { 557 | PowerofTwo = HPCC_FALSE; 558 | /* Minimum local table size --- some processors have an additional entry */ 559 | MinLocalTableSize = (TableSize / NumProcs); 560 | /* Number of processors with (LocalTableSize + 1) entries */ 561 | Remainder = TableSize - (MinLocalTableSize * NumProcs); 562 | /* Number of table entries in top of Table */ 563 | Top = (MinLocalTableSize + 1) * Remainder; 564 | /* Local table size */ 565 | if (MyProc < Remainder) { 566 | LocalTableSize = (MinLocalTableSize + 1); 567 | GlobalStartMyProc = ( (MinLocalTableSize + 1) * MyProc); 568 | } 569 | else { 570 | LocalTableSize = MinLocalTableSize; 571 | GlobalStartMyProc = ( (MinLocalTableSize * MyProc) + Remainder ); 572 | } 573 | break; 574 | 575 | } /* end else if */ 576 | } /* end for i */ 577 | 578 | 579 | HPCC_Table = XMALLOC( u64Int, LocalTableSize); 580 | sAbort = 0; if (! 
HPCC_Table) sAbort = 1; 581 | 582 | MPI_Allreduce( &sAbort, &rAbort, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); 583 | if (rAbort > 0) { 584 | if (MyProc == 0) fprintf(outFile, "Failed to allocate memory for the main table.\n"); 585 | goto failed_table; 586 | } 587 | 588 | params->MPIRandomAccess_N = (s64Int)TableSize; 589 | 590 | /* Default number of global updates to table: 4x number of table entries */ 591 | NumUpdates_Default = 4 * TableSize; 592 | 593 | #ifdef RA_TIME_BOUND 594 | /* estimate number of updates such that execution time does not exceed time bound */ 595 | /* time_bound should be a parameter */ 596 | /* max run time in seconds */ 597 | timeBound = Mmax( 0.25 * params->HPLrdata.time, (double)TIME_BOUND ); 598 | if (PowerofTwo) { 599 | HPCC_Power2NodesTime(logTableSize, TableSize, LocalTableSize, 600 | MinLocalTableSize, GlobalStartMyProc, Top, 601 | logNumProcs, NumProcs, Remainder, 602 | MyProc, INT64_DT, timeBound, (u64Int *)&ProcNumUpdates); 603 | 604 | } else { 605 | HPCC_AnyNodesTime(logTableSize, TableSize, LocalTableSize, 606 | MinLocalTableSize, GlobalStartMyProc, Top, 607 | logNumProcs, NumProcs, Remainder, 608 | MyProc, INT64_DT, timeBound, (u64Int *)&ProcNumUpdates); 609 | } 610 | /* be conservative: get the smallest number of updates among all procs */ 611 | MPI_Reduce( &ProcNumUpdates, &GlbNumUpdates, 1, INT64_DT, 612 | MPI_MIN, 0, MPI_COMM_WORLD ); 613 | /* distribute number of updates per proc to all procs */ 614 | MPI_Bcast( &GlbNumUpdates, 1, INT64_DT, 0, MPI_COMM_WORLD ); 615 | ProcNumUpdates = Mmin(GlbNumUpdates, (4*LocalTableSize)); 616 | /* works for both PowerofTwo and AnyNodes */ 617 | NumUpdates = Mmin((ProcNumUpdates*NumProcs), NumUpdates_Default); 618 | 619 | #else 620 | ProcNumUpdates = 4*LocalTableSize; 621 | NumUpdates = NumUpdates_Default; 622 | #endif 623 | 624 | if (MyProc == 0) { 625 | fprintf( outFile, "Running on %d processors%s\n", NumProcs, PowerofTwo ? 
" (PowerofTwo)" : ""); 626 | fprintf( outFile, "Total Main table size = 2^" FSTR64 " = " FSTR64 " words\n", 627 | logTableSize, TableSize ); 628 | if (PowerofTwo) 629 | fprintf( outFile, "PE Main table size = 2^" FSTR64 " = " FSTR64 " words/PE\n", 630 | (logTableSize - logNumProcs), TableSize/NumProcs ); 631 | else 632 | fprintf( outFile, "PE Main table size = (2^" FSTR64 ")/%d = " FSTR64 " words/PE MAX\n", 633 | logTableSize, NumProcs, LocalTableSize); 634 | 635 | fprintf( outFile, "Default number of updates (RECOMMENDED) = " FSTR64 "\n", NumUpdates_Default); 636 | #ifdef RA_TIME_BOUND 637 | fprintf( outFile, "Number of updates EXECUTED = " FSTR64 " (for a TIME BOUND of %.2f secs)\n", 638 | NumUpdates, timeBound); 639 | #endif 640 | params->MPIRandomAccess_ExeUpdates = NumUpdates; 641 | params->MPIRandomAccess_TimeBound = timeBound; 642 | } 643 | 644 | MPI_Barrier( MPI_COMM_WORLD ); 645 | 646 | CPUTime = -CPUSEC(); 647 | RealTime = -RTSEC(); 648 | 649 | if (PowerofTwo) { 650 | Power2NodesMPIRandomAccessUpdate(logTableSize, TableSize, LocalTableSize, 651 | MinLocalTableSize, GlobalStartMyProc, Top, 652 | logNumProcs, NumProcs, Remainder, 653 | MyProc, ProcNumUpdates, INT64_DT); 654 | } else { 655 | AnyNodesMPIRandomAccessUpdate(logTableSize, TableSize, LocalTableSize, 656 | MinLocalTableSize, GlobalStartMyProc, Top, 657 | logNumProcs, NumProcs, Remainder, 658 | MyProc, ProcNumUpdates, INT64_DT); 659 | } 660 | 661 | 662 | MPI_Barrier( MPI_COMM_WORLD ); 663 | 664 | /* End timed section */ 665 | CPUTime += CPUSEC(); 666 | RealTime += RTSEC(); 667 | 668 | /* Print timing results */ 669 | if (MyProc == 0){ 670 | params->MPIRandomAccess_time = RealTime; 671 | *GUPs = 1e-9*NumUpdates / RealTime; 672 | fprintf( outFile, "CPU time used = %.6f seconds\n", CPUTime ); 673 | fprintf( outFile, "Real time used = %.6f seconds\n", RealTime ); 674 | fprintf( outFile, "%.9f Billion(10^9) Updates per second [GUP/s]\n", 675 | *GUPs ); 676 | fprintf( outFile, "%.9f Billion(10^9) Updates/PE per second [GUP/s]\n", 677 | *GUPs / NumProcs ); 678 | /* No longer reporting per CPU number */ 679 | /* *GUPs /= NumProcs; */ 680 | } 681 | /* distribute result to all nodes */ 682 | MPI_Bcast( GUPs, 1, MPI_INT, 0, MPI_COMM_WORLD ); 683 | 684 | 685 | /* Verification phase */ 686 | 687 | /* Begin timing here */ 688 | CPUTime = -CPUSEC(); 689 | RealTime = -RTSEC(); 690 | 691 | if (PowerofTwo) { 692 | HPCC_Power2NodesMPIRandomAccessCheck(logTableSize, TableSize, LocalTableSize, 693 | GlobalStartMyProc, 694 | logNumProcs, NumProcs, 695 | MyProc, ProcNumUpdates, 696 | INT64_DT, &NumErrors); 697 | } 698 | else { 699 | HPCC_AnyNodesMPIRandomAccessCheck(logTableSize, TableSize, LocalTableSize, 700 | MinLocalTableSize, GlobalStartMyProc, Top, 701 | logNumProcs, NumProcs, Remainder, 702 | MyProc, ProcNumUpdates, 703 | INT64_DT, &NumErrors); 704 | } 705 | 706 | 707 | #ifdef LONG_IS_64BITS 708 | MPI_Reduce( &NumErrors, &GlbNumErrors, 1, MPI_LONG, MPI_SUM, 0, MPI_COMM_WORLD ); 709 | #else 710 | /* MPI 1.1 standard (obsolete at this point) doesn't define MPI_SUM 711 | to work on `long long': 712 | http://www.mpi-forum.org/docs/mpi-11-html/node78.html and 713 | therefore LAM 6.5.6 chooses not to implement it (even though there 714 | is code for it in LAM and for other reductions work OK, 715 | e.g. MPI_MAX). 
MPICH 1.2.5 doesn't complain about MPI_SUM but it 716 | doesn't have MPI_UNSIGNED_LONG_LONG (but has MPI_LONG_LONG_INT): 717 | http://www.mpi-forum.org/docs/mpi-20-html/node84.htm So I need to 718 | create a trivial summation operation. */ 719 | MPI_Op_create( Sum64, 1, &sum64 ); 720 | MPI_Reduce( &NumErrors, &GlbNumErrors, 1, INT64_DT, sum64, 0, MPI_COMM_WORLD ); 721 | MPI_Op_free( &sum64 ); 722 | #endif 723 | 724 | /* End timed section */ 725 | CPUTime += CPUSEC(); 726 | RealTime += RTSEC(); 727 | 728 | if(MyProc == 0){ 729 | params->MPIRandomAccess_CheckTime = RealTime; 730 | fprintf( outFile, "Verification: CPU time used = %.6f seconds\n", CPUTime); 731 | fprintf( outFile, "Verification: Real time used = %.6f seconds\n", RealTime); 732 | fprintf( outFile, "Found " FSTR64 " errors in " FSTR64 " locations (%s).\n", 733 | GlbNumErrors, TableSize, (GlbNumErrors <= 0.01*TableSize) ? 734 | "passed" : "failed"); 735 | if (GlbNumErrors > 0.01*TableSize) params->Failure = 1; 736 | params->MPIRandomAccess_Errors = (s64Int)GlbNumErrors; 737 | params->MPIRandomAccess_ErrorsFraction = (double)GlbNumErrors / (double)TableSize; 738 | } 739 | /* End verification phase */ 740 | 741 | 742 | /* Deallocate memory (in reverse order of allocation which should 743 | help fragmentation) */ 744 | 745 | free( HPCC_Table ); 746 | 747 | failed_table: 748 | 749 | if (0 == MyProc) if (outFile != stderr) fclose( outFile ); 750 | 751 | MPI_Barrier( MPI_COMM_WORLD ); 752 | 753 | return 0; 754 | } 755 | --------------------------------------------------------------------------------
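The update rule shared by all of the codes above can be summarized in a few lines of serial C. The sketch below is an illustration only and is not part of the distribution: it assumes POLY = 0x7 and ZERO64B = 0 as defined in RandomAccess.h, picks an arbitrary table size of 2^20 words, and starts the random stream at 1 instead of calling HPCC_starts(). Each 64-bit value from the shift-and-XOR recurrence is XORed into the table word selected by its low-order bits; the MPI versions above do exactly this, but first route each value to the processor that owns the addressed table word (bit-mask routing for a power-of-2 processor count, recursive halving otherwise). Because XOR is its own inverse, re-applying the identical stream restores the table to its initial contents, which is what the verification routines referenced above check.

/* Minimal serial sketch of the RandomAccess update rule used by the codes
   above.  NOT part of the HPCC distribution.  POLY and ZERO64B are assumed
   to match RandomAccess.h; the stream is started at 1 for simplicity
   instead of calling HPCC_starts(). */

#include <stdio.h>
#include <stdint.h>

#define POLY    0x0000000000000007ULL   /* assumed value from RandomAccess.h */
#define ZERO64B 0ULL

#define LOG_TABLE_SIZE 20
#define TABLE_SIZE (1ULL << LOG_TABLE_SIZE)

static uint64_t Table[TABLE_SIZE];

int main(void)
{
  uint64_t i, ran = 1;
  uint64_t NumUpdates = 4 * TABLE_SIZE;      /* recommended: 4x table entries */

  for (i = 0; i < TABLE_SIZE; i++)           /* same initialization as HPCC_Table */
    Table[i] = i;

  for (i = 0; i < NumUpdates; i++) {
    /* shift-and-XOR recurrence, same form as in the update loops above */
    ran = (ran << 1) ^ ((int64_t) ran < (int64_t) ZERO64B ? POLY : ZERO64B);
    /* read-modify-write: XOR the random value into the word it indexes */
    Table[ran & (TABLE_SIZE - 1)] ^= ran;
  }

  printf("performed %llu updates on a table of %llu words\n",
         (unsigned long long) NumUpdates, (unsigned long long) TABLE_SIZE);
  return 0;
}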