├── Makefile.linux
├── Makefile.storm
├── Makefile.storm.opt
├── README
├── gups_vanilla.c
├── gups_nonpow2.c
├── gups_opt.c
├── MPIRandomAccess_vanilla.c
└── MPIRandomAccess_opt.c

/Makefile.linux:
--------------------------------------------------------------------------------
 1 | # Makefile for Linux
 2 | 
 3 | SHELL = /bin/sh
 4 | 
 5 | # System-specific settings
 6 | 
 7 | CC = gcc
 8 | CCFLAGS = -O -g -DCHECK
 9 | LINK = gcc
10 | LINKFLAGS = -O -g
11 | LIB = -lmpich
12 | 
13 | # Link target
14 | 
15 | gups_vanilla: gups_vanilla.o
16 | 	$(LINK) $(LINKFLAGS) gups_vanilla.o $(LIB) -o gups_vanilla
17 | 
18 | gups_nonpow2: gups_nonpow2.o
19 | 	$(LINK) $(LINKFLAGS) gups_nonpow2.o $(LIB) -o gups_nonpow2
20 | 
21 | gups_opt: gups_opt.o
22 | 	$(LINK) $(LINKFLAGS) gups_opt.o $(LIB) -o gups_opt
23 | 
24 | # Compilation rules
25 | 
26 | %.o:%.c
27 | 	$(CC) $(CCFLAGS) -c $<
28 | 
--------------------------------------------------------------------------------
/Makefile.storm:
--------------------------------------------------------------------------------
 1 | # Makefile for Red Storm (compile on reddish)
 2 | 
 3 | SHELL = /bin/sh
 4 | 
 5 | # System-specific settings
 6 | 
 7 | CC = CC
 8 | CCFLAGS = -fastsse -DMPICH_IGNORE_CXX_SEEK -DLONG64
 9 | #CCFLAGS = -fastsse -DMPICH_IGNORE_CXX_SEEK -DLONG64 -DCHECK
10 | LINK = CC
11 | LINKFLAGS = -O
12 | LIB = 
13 | 
14 | # Link target
15 | 
16 | gups_vanilla: gups_vanilla.o
17 | 	$(LINK) $(LINKFLAGS) gups_vanilla.o $(LIB) -o gups_vanilla
18 | 
19 | gups_nonpow2: gups_nonpow2.o
20 | 	$(LINK) $(LINKFLAGS) gups_nonpow2.o $(LIB) -o gups_nonpow2
21 | 
22 | gups_opt: gups_opt.o
23 | 	$(LINK) $(LINKFLAGS) gups_opt.o $(LIB) -o gups_opt
24 | 
25 | # Compilation rules
26 | 
27 | %.o:%.c
28 | 	$(CC) $(CCFLAGS) -c $<
29 | 
--------------------------------------------------------------------------------
/Makefile.storm.opt:
--------------------------------------------------------------------------------
 1 | # Makefile for Red Storm (compile on reddish)
 2 | 
 3 | SHELL = /bin/sh
 4 | 
 5 | # System-specific settings
 6 | 
 7 | CC = /home/rbbrigh/mpich-1.2.6/install/bin/mpicc
 8 | CCFLAGS = -O3 -fastsse -DMPICH_IGNORE_CXX_SEEK -DLONG64 -DNODEF \
 9 | 	-Msafeptr -Mipa=fast
10 | #CCFLAGS = -O3 -fastsse -DMPICH_IGNORE_CXX_SEEK -DLONG64 -DNODEF -DCHECK \
11 | #	-Msafeptr -Mipa=fast
12 | LINK = /home/rbbrigh/mpich-1.2.6/install/bin/mpicc
13 | LINKFLAGS = -O
14 | LIB = 
15 | 
16 | # Link target
17 | 
18 | gups_vanilla: gups_vanilla.o
19 | 	$(LINK) $(LINKFLAGS) gups_vanilla.o $(LIB) -o gups_vanilla
20 | 
21 | gups_nonpow2: gups_nonpow2.o
22 | 	$(LINK) $(LINKFLAGS) gups_nonpow2.o $(LIB) -o gups_nonpow2
23 | 
24 | gups_opt: gups_opt.o
25 | 	$(LINK) $(LINKFLAGS) gups_opt.o $(LIB) -o gups_opt
26 | 
27 | # Compilation rules
28 | 
29 | %.o:%.c
30 | 	$(CC) $(CCFLAGS) -c $<
31 | 
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | GUPS distribution - 13 Oct 2006
 2 | 
 3 | This directory contains several implementations of an algorithm that
 4 | can be used to run the HPCC RandomAccess (GUPS) benchmark.
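For orientation, the kernel that all of these codes time is the HPCC
read-modify-write table update sketched below. This is a minimal serial
illustration distilled from the update loop and POLY recurrence in
gups_vanilla.c, assuming a 64-bit "long" (the -DLONG64 case); the table size,
variable names, and the final printf are illustrative only and are not part
of any of the distributed codes.

  /* minimal serial sketch of the RandomAccess update (not one of the MPI codes) */
  #include <stdio.h>
  #include <stdlib.h>

  #define POLY 0x0000000000000007UL        /* same polynomial the codes use */

  int main()
  {
    int logtable = 20;                     /* table has 2^logtable 64-bit words */
    unsigned long n = ((unsigned long) 1) << logtable;
    unsigned long nm1 = n - 1;
    unsigned long ran = 1;                 /* HPCC_starts(0) returns 0x1 */
    unsigned long i;
    unsigned long *table = (unsigned long *) malloc(n*sizeof(unsigned long));

    for (i = 0; i < n; i++) table[i] = i;  /* table starts as the identity */

    for (i = 0; i < 4*n; i++) {            /* official runs do 4x the table size updates */
      ran = (ran << 1) ^ ((long) ran < 0 ? POLY : 0);  /* next value in random stream */
      table[ran & nm1] ^= ran;             /* read-modify-write (XOR) update */
    }

    printf("table[0] = %lu\n",table[0]);
    free(table);
    return 0;
  }

The MPI codes below perform this same update; the extra work they do is
routing each generated value to the processor that owns that portion of the
table before the XOR is applied.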
 5 | 
 6 | The algorithm is described on this WWW page:
 7 | www.cs.sandia.gov/~sjplimp/algorithms/html#gups
 8 | 
 9 | The tar file of codes can be downloaded from this WWW page:
10 | www.cs.sandia.gov/~sjplimp/download.html
11 | 
12 | These codes are distributed by Steve Plimpton
13 | of Sandia National Laboratories:
14 | sjplimp@sandia.gov, www.cs.sandia.gov/~sjplimp
15 | 
16 | --------------------------------------------------------------------------
17 | 
18 | This directory should contain the following files:
19 | 
20 | gups_vanilla.c             vanilla power-of-2 version of algorithm
21 | gups_nonpow2.c             non-power-of-2 version
22 | gups_opt.c                 optimized power-of-2 version
23 | 
24 | MPIRandomAccess_vanilla.c  implementation of gups_vanilla in HPCC harness
25 | MPIRandomAccess_opt.c      implementation of gups_opt in HPCC harness
26 | 
27 | Makefile.*                 Makefiles for various machines
28 | 
29 | --------------------------------------------------------------------------
30 | 
31 | The gups_* files are stand-alone single-file codes that can be built
32 | using a Makefile like those provided. E.g.
33 | 
34 |   make -f Makefile.linux gups_vanilla
35 | 
36 | You will need to create a Makefile.* appropriate to your platform,
37 | that points at the correct MPI library, etc. Note that these 3 codes
38 | support a -DLONG64 C compiler flag. If a "long" on your processor is
39 | 32-bit (presumably long long is 64 bits), then don't use -DLONG64; if
40 | a "long" is 64 bits, then use -DLONG64.
41 | 
42 | --------------------------------------------------------------------------
43 | 
44 | You can run any of the 3 gups* codes as follows:
45 | 
46 | 1 proc:
47 |   gups_vanilla N M chunk
48 | 
49 | P procs:
50 |   mpirun -np P gups_vanilla N M chunk
51 | 
52 | where
53 | 
54 | N = length of global table is 2^N
55 | M = # of update sets per proc
56 | chunk = # of updates in one set on each proc
57 | 
58 | Note that 2^N is the length of the global table across all processors.
59 | Thus N = 30 would run with a billion-element table.
60 | 
61 | Chunk is the number of updates each proc will do before communicating.
62 | In the official HPCC benchmark this is specified to be no larger than
63 | 1024, but you can run the code with any value you like. Your GUPS
64 | performance will typically decrease for smaller chunk size.
65 | 
66 | When each proc performs "chunk" updates, that is one "set" of updates.
67 | M determines how many sets are performed. The GUPS performance is a
68 | "rate", so it's independent of M, once M is large enough to get good
69 | statistics. So you can start your testing with a small M to see how
70 | fast your machine runs with this algorithm, then get better stats with
71 | longer runs with a larger M. An official HPCC benchmark run requires
72 | M be a large number (like the total number of updates = 4x the table
73 | size, if I recall), but your GUPS rate won't change.
74 | 
75 | After the code runs, it will print out some stats, like this:
76 | > mpirun -np 2 gups_vanilla 20 1000 1024
77 | Number of procs: 2
78 | Vector size: 1048576
79 | Max datums during comm: 1493
80 | Max datums after comm: 1493
81 | Excess datums (frac): 39395 (0.0192358)
82 | Bad locality count: 0
83 | Update time (secs): 0.383
84 | Gups: 0.005351
85 | 
86 | "Vector size" is the length of the global table.
87 | 
88 | The "max datums" values tell how message size varied as datums were
89 | routed thru the hypercube dimensions. They should only exceed "chunk"
90 | by a modest amount.
However the random number generation in the HPCC 91 | algorithm is not very random, so in the first few iterations a few 92 | procs tend to receive larger messages. 93 | 94 | The "excess datums" value is the number of updates (and fraction) that 95 | would have been missed if datums greater than the chunk size were 96 | discarded. It should typically be < 1% for long runs. The codes do 97 | not discard these excess updates. 98 | 99 | The "bad locality" should be 0. If the code was compiled with -DCHECK 100 | and a non-zero value results, it means some procs are trying to 101 | perform table updates on table indices they don't own, so something is 102 | wrong. 103 | 104 | The "update time" is how long the code ran. 105 | 106 | "Gups" is the GUPS performance rate, as HPCC defines it. Namely the 107 | total # of updates per second across all processors. The total # of 108 | updates is M*chunk*P, where P = # of processors. Once you run on 109 | enough processors (e.g. 32), you should see the GUPS rate nearly 110 | double each time you double the number of procs, unless communication 111 | on your machine is slowing things down. 112 | 113 | -------------------------------------------------------------------------- 114 | 115 | The MPIRandomAccess*.c codes are versions of the same algorithms 116 | implemented within the framework that HPCC provides to enable users to 117 | implement new optimized algorithms. 118 | 119 | In principle you can take these files and drop them into the HPCC 120 | harness, re-compile the HPCC suite, and run an official HPCC benchmark 121 | test with the new algorithms. In practice, I don't know the specifics 122 | of how to do that! Courtenay Vaughan at Sandia was the one who worked 123 | on that part of the project. You can email him if you have questions 124 | at ctvaugh@sandia.gov. 125 | 126 | You should get essentially the same GUPS number when running these 127 | algorithms in the HPCC harness as you get with the stand-alone codes. 128 | 129 | Note that we have only ported the vanilla and opt algorithms (not the 130 | non-power-of-2 version) to the HPCC framework. 131 | -------------------------------------------------------------------------------- /gups_vanilla.c: -------------------------------------------------------------------------------- 1 | /* ---------------------------------------------------------------------- 2 | gups = algorithm for the HPCC RandomAccess (GUPS) benchmark 3 | implements a hypercube-style synchronous all2all 4 | 5 | Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories 6 | www.cs.sandia.gov/~sjplimp 7 | Copyright (2006) Sandia Corporation 8 | ------------------------------------------------------------------------- */ 9 | 10 | /* random update GUPS code, power-of-2 number of procs 11 | compile with -DCHECK to check if table updates happen on correct proc */ 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | #define MAX(a,b) ((a) > (b) ? 
(a) : (b)) 18 | 19 | /* machine defs 20 | compile with -DLONG64 if a "long" is 64 bits 21 | else compile with no setting if "long long" is 64 bit */ 22 | 23 | #ifdef LONG64 24 | #define POLY 0x0000000000000007UL 25 | #define PERIOD 1317624576693539401L 26 | #define ZERO64B 0L 27 | typedef long s64Int; 28 | typedef unsigned long u64Int; 29 | #define U64INT MPI_UNSIGNED_LONG 30 | #else 31 | #define POLY 0x0000000000000007ULL 32 | #define PERIOD 1317624576693539401LL 33 | #define ZERO64B 0LL 34 | typedef long long s64Int; 35 | typedef unsigned long long u64Int; 36 | #define U64INT MPI_LONG_LONG_INT 37 | #endif 38 | 39 | u64Int HPCC_starts(s64Int n); 40 | 41 | int main(int narg, char **arg) 42 | { 43 | int me,nprocs; 44 | int i,j,iterate,niterate; 45 | int nlocal,nlocalm1,logtable,index,logtablelocal; 46 | int logprocs,ipartner,ndata,nsend,nkeep,nrecv,maxndata,maxnfinal,nexcess; 47 | int nbad,chunk,chunkbig; 48 | double t0,t0_all,Gups; 49 | u64Int *table,*data,*send; 50 | u64Int ran,datum,procmask,nglobal,offset,nupdates; 51 | u64Int ilong,nexcess_long,nbad_long; 52 | MPI_Status status; 53 | 54 | MPI_Init(&narg,&arg); 55 | MPI_Comm_rank(MPI_COMM_WORLD,&me); 56 | MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 57 | 58 | /* command line args = N M chunk 59 | N = length of global table is 2^N 60 | M = # of update sets per proc 61 | chunk = # of updates in one set */ 62 | 63 | if (narg != 4) { 64 | if (me == 0) printf("Syntax: gups N M chunk\n"); 65 | MPI_Abort(MPI_COMM_WORLD,1); 66 | } 67 | 68 | logtable = atoi(arg[1]); 69 | niterate = atoi(arg[2]); 70 | chunk = atoi(arg[3]); 71 | 72 | /* insure Nprocs is power of 2 */ 73 | 74 | i = 1; 75 | while (i < nprocs) i *= 2; 76 | if (i != nprocs) { 77 | if (me == 0) printf("Must run on power-of-2 procs\n"); 78 | MPI_Abort(MPI_COMM_WORLD,1); 79 | } 80 | 81 | /* nglobal = entire table 82 | nlocal = size of my portion 83 | nlocalm1 = local size - 1 (for index computation) 84 | logtablelocal = log of table size I store 85 | offset = starting index in global table of 1st entry in local table */ 86 | 87 | logprocs = 0; 88 | while (1 << logprocs < nprocs) logprocs++; 89 | 90 | nglobal = ((u64Int) 1) << logtable; 91 | nlocal = nglobal / nprocs; 92 | nlocalm1 = nlocal - 1; 93 | logtablelocal = logtable - logprocs; 94 | offset = (u64Int) nlocal * me; 95 | 96 | /* allocate local memory 97 | 16 factor insures space for messages that (randomly) exceed chunk size */ 98 | 99 | chunkbig = 16*chunk; 100 | 101 | table = (u64Int *) malloc(nlocal*sizeof(u64Int)); 102 | data = (u64Int *) malloc(chunkbig*sizeof(u64Int)); 103 | send = (u64Int *) malloc(chunkbig*sizeof(u64Int)); 104 | 105 | if (!table || !data || !send) { 106 | if (me == 0) printf("Table allocation failed\n"); 107 | MPI_Abort(MPI_COMM_WORLD,1); 108 | } 109 | 110 | /* initialize my portion of global array 111 | global array starts with table[i] = i */ 112 | 113 | for (i = 0; i < nlocal; i++) table[i] = i + offset; 114 | 115 | /* start my random # partway thru global stream */ 116 | 117 | nupdates = (u64Int) nprocs * chunk * niterate; 118 | ran = HPCC_starts(nupdates/nprocs*me); 119 | 120 | /* loop: 121 | generate chunk random values per proc 122 | communicate datums to correct processor via hypercube routing 123 | use received values to update local table */ 124 | 125 | maxndata = 0; 126 | maxnfinal = 0; 127 | nexcess = 0; 128 | nbad = 0; 129 | 130 | MPI_Barrier(MPI_COMM_WORLD); 131 | t0 = -MPI_Wtime(); 132 | 133 | for (iterate = 0; iterate < niterate; iterate++) { 134 | for (i = 0; i < chunk; i++) { 135 | ran = (ran << 
1) ^ ((s64Int) ran < ZERO64B ? POLY : ZERO64B); 136 | data[i] = ran; 137 | } 138 | ndata = chunk; 139 | 140 | for (j = 0; j < logprocs; j++) { 141 | nkeep = nsend = 0; 142 | ipartner = (1 << j) ^ me; 143 | procmask = ((u64Int) 1) << (logtablelocal + j); 144 | if (ipartner > me) { 145 | for (i = 0; i < ndata; i++) { 146 | if (data[i] & procmask) send[nsend++] = data[i]; 147 | else data[nkeep++] = data[i]; 148 | } 149 | } else { 150 | for (i = 0; i < ndata; i++) { 151 | if (data[i] & procmask) data[nkeep++] = data[i]; 152 | else send[nsend++] = data[i]; 153 | } 154 | } 155 | 156 | MPI_Sendrecv(send,nsend,U64INT,ipartner,0,&data[nkeep],chunkbig,U64INT, 157 | ipartner,0,MPI_COMM_WORLD,&status); 158 | MPI_Get_count(&status,U64INT,&nrecv); 159 | ndata = nkeep + nrecv; 160 | maxndata = MAX(maxndata,ndata); 161 | } 162 | maxnfinal = MAX(maxnfinal,ndata); 163 | if (ndata > chunk) nexcess += ndata - chunk; 164 | 165 | for (i = 0; i < ndata; i++) { 166 | datum = data[i]; 167 | index = datum & nlocalm1; 168 | table[index] ^= datum; 169 | } 170 | 171 | #ifdef CHECK 172 | procmask = ((u64Int) (nprocs-1)) << logtablelocal; 173 | for (i = 0; i < ndata; i++) 174 | if ((data[i] & procmask) >> logtablelocal != me) nbad++; 175 | #endif 176 | } 177 | 178 | MPI_Barrier(MPI_COMM_WORLD); 179 | t0 += MPI_Wtime(); 180 | 181 | /* stats */ 182 | 183 | MPI_Allreduce(&t0,&t0_all,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); 184 | t0 = t0_all/nprocs; 185 | 186 | i = maxndata; 187 | MPI_Allreduce(&i,&maxndata,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); 188 | i = maxnfinal; 189 | MPI_Allreduce(&i,&maxnfinal,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); 190 | ilong = nexcess; 191 | MPI_Allreduce(&ilong,&nexcess_long,1,U64INT,MPI_SUM,MPI_COMM_WORLD); 192 | ilong = nbad; 193 | MPI_Allreduce(&ilong,&nbad_long,1,U64INT,MPI_SUM,MPI_COMM_WORLD); 194 | 195 | nupdates = (u64Int) niterate * nprocs * chunk; 196 | Gups = nupdates / t0 / 1.0e9; 197 | 198 | if (me == 0) { 199 | printf("Number of procs: %d\n",nprocs); 200 | printf("Vector size: %lld\n",nglobal); 201 | printf("Max datums during comm: %d\n",maxndata); 202 | printf("Max datums after comm: %d\n",maxnfinal); 203 | printf("Excess datums (frac): %lld (%g)\n", 204 | nexcess_long,(double) nexcess_long / nupdates); 205 | printf("Bad locality count: %lld\n",nbad_long); 206 | printf("Update time (secs): %9.3f\n",t0); 207 | printf("Gups: %9.6f\n",Gups); 208 | } 209 | 210 | /* clean up */ 211 | 212 | free(table); 213 | free(data); 214 | free(send); 215 | MPI_Finalize(); 216 | } 217 | 218 | /* start random number generator at Nth step of stream 219 | routine provided by HPCC */ 220 | 221 | u64Int HPCC_starts(s64Int n) 222 | { 223 | int i, j; 224 | u64Int m2[64]; 225 | u64Int temp, ran; 226 | 227 | while (n < 0) n += PERIOD; 228 | while (n > PERIOD) n -= PERIOD; 229 | if (n == 0) return 0x1; 230 | 231 | temp = 0x1; 232 | for (i=0; i<64; i++) { 233 | m2[i] = temp; 234 | temp = (temp << 1) ^ ((s64Int) temp < 0 ? POLY : 0); 235 | temp = (temp << 1) ^ ((s64Int) temp < 0 ? POLY : 0); 236 | } 237 | 238 | for (i=62; i>=0; i--) 239 | if ((n >> i) & 1) 240 | break; 241 | 242 | ran = 0x2; 243 | while (i > 0) { 244 | temp = 0; 245 | for (j=0; j<64; j++) 246 | if ((ran >> j) & 1) 247 | temp ^= m2[j]; 248 | ran = temp; 249 | i -= 1; 250 | if ((n >> i) & 1) 251 | ran = (ran << 1) ^ ((s64Int) ran < 0 ? 
POLY : 0); 252 | } 253 | 254 | return ran; 255 | } 256 | -------------------------------------------------------------------------------- /gups_nonpow2.c: -------------------------------------------------------------------------------- 1 | /* ---------------------------------------------------------------------- 2 | gups = algorithm for the HPCC RandomAccess (GUPS) benchmark 3 | implements a hypercube-style synchronous all2all 4 | 5 | Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories 6 | www.cs.sandia.gov/~sjplimp 7 | Copyright (2006) Sandia Corporation 8 | ------------------------------------------------------------------------- */ 9 | 10 | /* random update GUPS code, non-power-of-2 number of procs (pow 2 is OK) 11 | compile with -DCHECK to check if table updates happen on correct proc */ 12 | 13 | #include "stdio.h" 14 | #include "stdlib.h" 15 | #include "mpi.h" 16 | 17 | #define MAX(a,b) ((a) > (b) ? (a) : (b)) 18 | 19 | /* machine defs 20 | compile with -DLONG64 if a "long" is 64 bits 21 | else compile with no setting if "long long" is 64 bit */ 22 | 23 | #ifdef LONG64 24 | #define POLY 0x0000000000000007UL 25 | #define PERIOD 1317624576693539401L 26 | #define ZERO64B 0L 27 | typedef long s64Int; 28 | typedef unsigned long u64Int; 29 | #define U64INT MPI_UNSIGNED_LONG 30 | #else 31 | #define POLY 0x0000000000000007ULL 32 | #define PERIOD 1317624576693539401LL 33 | #define ZERO64B 0LL 34 | typedef long long s64Int; 35 | typedef unsigned long long u64Int; 36 | #define U64INT MPI_LONG_LONG_INT 37 | #endif 38 | 39 | u64Int HPCC_starts(s64Int n); 40 | 41 | int main(int narg, char **arg) 42 | { 43 | int me,nprocs; 44 | int i,iterate,niterate; 45 | int nlocal,logtable,index; 46 | int ipartner,ndata,nsend,nkeep,nrecv,maxndata,maxnfinal,nexcess; 47 | int nbad,chunk,chunkbig,npartition,nlower,nupper,proclo,procmid,nfrac; 48 | double t0,t0_all,Gups; 49 | u64Int *table,*data,*send,*offsets; 50 | u64Int ran,datum,nglobal,nglobalm1,nupdates,offset,indexmid,nstart; 51 | u64Int ilong,nexcess_long,nbad_long; 52 | MPI_Status status; 53 | 54 | MPI_Init(&narg,&arg); 55 | MPI_Comm_rank(MPI_COMM_WORLD,&me); 56 | MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 57 | 58 | /* command line args = N M chunk 59 | N = length of global table is 2^N 60 | M = # of update sets per proc 61 | chunk = # of updates in one set */ 62 | 63 | if (narg != 4) { 64 | if (me == 0) printf("Syntax: gups N M chunk\n"); 65 | MPI_Abort(MPI_COMM_WORLD,1); 66 | } 67 | 68 | logtable = atoi(arg[1]); 69 | niterate = atoi(arg[2]); 70 | chunk = atoi(arg[3]); 71 | 72 | /* nglobal = entire table (power of 2) 73 | nlocal = size of my portion (not a power of 2) 74 | nglobalm1 = global size - 1 (for index computation) 75 | offsets[i] = starting index in global table of proc I's portion 76 | offset = starting index in global table of 1st entry in local table */ 77 | 78 | nglobal = ((u64Int) 1) << logtable; 79 | nglobalm1 = nglobal - 1; 80 | nstart = (double) me / nprocs * nglobal; 81 | offsets = (u64Int *) malloc((nprocs+1)*sizeof(u64Int)); 82 | MPI_Allgather(&nstart,1,U64INT,offsets,1,U64INT,MPI_COMM_WORLD); 83 | offsets[nprocs] = nglobal; 84 | nlocal = offsets[me+1] - offsets[me]; 85 | offset = offsets[me]; 86 | 87 | /* allocate local memory 88 | 16 factor insures space for messages that (randomly) exceed chunk size */ 89 | 90 | chunkbig = 16*chunk; 91 | 92 | table = (u64Int *) malloc(nlocal*sizeof(u64Int)); 93 | data = (u64Int *) malloc(chunkbig*sizeof(u64Int)); 94 | send = (u64Int *) malloc(chunkbig*sizeof(u64Int)); 95 | 96 | if (!table 
|| !data || !send) { 97 | if (me == 0) printf("Table allocation failed\n"); 98 | MPI_Abort(MPI_COMM_WORLD,1); 99 | } 100 | 101 | /* initialize my portion of global array 102 | global array starts with table[i] = i */ 103 | 104 | for (i = 0; i < nlocal; i++) table[i] = i + offset; 105 | 106 | /* start my random # partway thru global stream */ 107 | 108 | nupdates = (u64Int) nprocs * chunk * niterate; 109 | ran = HPCC_starts(nupdates/nprocs*me); 110 | 111 | /* loop: 112 | generate chunk random stream values per proc 113 | communicate datums to correct processor via hypercube routing 114 | use received values to update local table */ 115 | 116 | maxndata = 0; 117 | maxnfinal = 0; 118 | nexcess = 0; 119 | nbad = 0; 120 | 121 | MPI_Barrier(MPI_COMM_WORLD); 122 | t0 = -MPI_Wtime(); 123 | 124 | for (iterate = 0; iterate < niterate; iterate++) { 125 | for (i = 0; i < chunk; i++) { 126 | ran = (ran << 1) ^ ((s64Int) ran < ZERO64B ? POLY : ZERO64B); 127 | data[i] = ran; 128 | } 129 | ndata = chunk; 130 | 131 | npartition = nprocs; 132 | proclo = 0; 133 | while (npartition > 1) { 134 | nlower = npartition/2; 135 | nupper = npartition - nlower; 136 | procmid = proclo + nlower; 137 | indexmid = offsets[procmid]; 138 | 139 | nkeep = nsend = 0; 140 | if (me < procmid) { 141 | for (i = 0; i < ndata; i++) { 142 | if ((data[i] & nglobalm1) >= indexmid) send[nsend++] = data[i]; 143 | else data[nkeep++] = data[i]; 144 | } 145 | } else { 146 | for (i = 0; i < ndata; i++) { 147 | if ((data[i] & nglobalm1) < indexmid) send[nsend++] = data[i]; 148 | else data[nkeep++] = data[i]; 149 | } 150 | } 151 | 152 | /* if partition halves are equal, exchange message with 1 partner 153 | if upper half = lower half + 1: 154 | if in lower half, send/recv 2 messages 155 | 1st exchange with me+nlower, 2nd exchange with me+nlower+1 156 | 1st send has first 157 | nfrac = (nlower - (me-proclo)) / nupper of my data 158 | 2nd send has remainder of my data 159 | if not first or last proc of upper half, send/recv 2 messages 160 | 1st exchange with me-nlower, 2nd exchange with me-nlower-1 161 | 2nd send has first 162 | nfrac = (me-procmid) / nlower of my data 163 | 1st send has remainder of my data 164 | if first or last proc of upper half, send/recv 1 message 165 | each exchanges with first/last proc of lower half 166 | send all my data 167 | always recv whatever is sent */ 168 | 169 | if (nlower == nupper) { 170 | if (me < procmid) ipartner = me + nlower; 171 | else ipartner = me - nlower; 172 | MPI_Sendrecv(send,nsend,U64INT,ipartner,0,&data[nkeep], 173 | chunkbig,U64INT,ipartner,0,MPI_COMM_WORLD,&status); 174 | MPI_Get_count(&status,U64INT,&nrecv); 175 | ndata = nkeep + nrecv; 176 | maxndata = MAX(maxndata,ndata); 177 | } else { 178 | if (me < procmid) { 179 | nfrac = (nlower - (me-proclo)) * nsend / nupper; 180 | ipartner = me + nlower; 181 | MPI_Sendrecv(send,nfrac,U64INT,ipartner,0,&data[nkeep], 182 | chunkbig,U64INT,ipartner,0,MPI_COMM_WORLD,&status); 183 | MPI_Get_count(&status,U64INT,&nrecv); 184 | nkeep += nrecv; 185 | MPI_Sendrecv(&send[nfrac],nsend-nfrac,U64INT,ipartner+1,0, 186 | &data[nkeep],chunkbig,U64INT, 187 | ipartner+1,0,MPI_COMM_WORLD,&status); 188 | MPI_Get_count(&status,U64INT,&nrecv); 189 | ndata = nkeep + nrecv; 190 | } else if (me > procmid && me < procmid+nlower) { 191 | nfrac = (me - procmid) * nsend / nlower; 192 | ipartner = me - nlower; 193 | MPI_Sendrecv(&send[nfrac],nsend-nfrac,U64INT,ipartner,0,&data[nkeep], 194 | chunkbig,U64INT,ipartner,0,MPI_COMM_WORLD,&status); 195 | 
MPI_Get_count(&status,U64INT,&nrecv); 196 | nkeep += nrecv; 197 | MPI_Sendrecv(send,nfrac,U64INT,ipartner-1,0,&data[nkeep], 198 | chunkbig,U64INT,ipartner-1,0,MPI_COMM_WORLD,&status); 199 | MPI_Get_count(&status,U64INT,&nrecv); 200 | ndata = nkeep + nrecv; 201 | } else { 202 | if (me == procmid) ipartner = me - nlower; 203 | else ipartner = me - nupper; 204 | MPI_Sendrecv(send,nsend,U64INT,ipartner,0,&data[nkeep], 205 | chunkbig,U64INT,ipartner,0,MPI_COMM_WORLD,&status); 206 | MPI_Get_count(&status,U64INT,&nrecv); 207 | ndata = nkeep + nrecv; 208 | } 209 | } 210 | 211 | if (me < procmid) npartition = nlower; 212 | else { 213 | proclo = procmid; 214 | npartition = nupper; 215 | } 216 | } 217 | maxnfinal = MAX(maxnfinal,ndata); 218 | if (ndata > chunk) nexcess += ndata-chunk; 219 | 220 | for (i = 0; i < ndata; i++) { 221 | datum = data[i]; 222 | index = (datum & nglobalm1) - offset; 223 | table[index] ^= datum; 224 | } 225 | 226 | #ifdef CHECK 227 | for (i = 0; i < ndata; i++) { 228 | index = (datum & nglobalm1) - offset; 229 | if (index < 0 || index >= nlocal) nbad++; 230 | } 231 | #endif 232 | } 233 | 234 | MPI_Barrier(MPI_COMM_WORLD); 235 | t0 += MPI_Wtime(); 236 | 237 | /* stats */ 238 | 239 | MPI_Allreduce(&t0,&t0_all,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); 240 | t0 = t0_all/nprocs; 241 | 242 | i = maxndata; 243 | MPI_Allreduce(&i,&maxndata,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); 244 | i = maxnfinal; 245 | MPI_Allreduce(&i,&maxnfinal,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); 246 | ilong = nexcess; 247 | MPI_Allreduce(&ilong,&nexcess_long,1,U64INT,MPI_SUM,MPI_COMM_WORLD); 248 | ilong = nbad; 249 | MPI_Allreduce(&ilong,&nbad_long,1,U64INT,MPI_SUM,MPI_COMM_WORLD); 250 | 251 | nupdates = (u64Int) niterate * nprocs * chunk; 252 | Gups = nupdates / t0 / 1.0e9; 253 | 254 | if (me == 0) { 255 | printf("Number of procs: %d\n",nprocs); 256 | printf("Vector size: %lld\n",nglobal); 257 | printf("Max datums during comm: %d\n",maxndata); 258 | printf("Max datums after comm: %d\n",maxnfinal); 259 | printf("Excess datums (frac): %lld (%g)\n", 260 | nexcess_long,(double) nexcess_long / nupdates); 261 | printf("Bad locality count: %lld\n",nbad_long); 262 | printf("Update time (secs): %9.3f\n",t0); 263 | printf("Gups: %9.6f\n",Gups); 264 | } 265 | 266 | /* clean up */ 267 | 268 | free(table); 269 | free(data); 270 | free(send); 271 | free(offsets); 272 | MPI_Finalize(); 273 | } 274 | 275 | /* start random number generator at Nth step of stream 276 | routine provided by HPCC */ 277 | 278 | u64Int HPCC_starts(s64Int n) 279 | { 280 | int i, j; 281 | u64Int m2[64]; 282 | u64Int temp, ran; 283 | 284 | while (n < 0) n += PERIOD; 285 | while (n > PERIOD) n -= PERIOD; 286 | if (n == 0) return 0x1; 287 | 288 | temp = 0x1; 289 | for (i=0; i<64; i++) { 290 | m2[i] = temp; 291 | temp = (temp << 1) ^ ((s64Int) temp < 0 ? POLY : 0); 292 | temp = (temp << 1) ^ ((s64Int) temp < 0 ? POLY : 0); 293 | } 294 | 295 | for (i=62; i>=0; i--) 296 | if ((n >> i) & 1) 297 | break; 298 | 299 | ran = 0x2; 300 | while (i > 0) { 301 | temp = 0; 302 | for (j=0; j<64; j++) 303 | if ((ran >> j) & 1) 304 | temp ^= m2[j]; 305 | ran = temp; 306 | i -= 1; 307 | if ((n >> i) & 1) 308 | ran = (ran << 1) ^ ((s64Int) ran < 0 ? 
POLY : 0); 309 | } 310 | 311 | return ran; 312 | } 313 | -------------------------------------------------------------------------------- /gups_opt.c: -------------------------------------------------------------------------------- 1 | /* ---------------------------------------------------------------------- 2 | gups = algorithm for the HPCC RandomAccess (GUPS) benchmark 3 | implements a hypercube-style synchronous all2all 4 | 5 | Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories 6 | www.cs.sandia.gov/~sjplimp 7 | Copyright (2006) Sandia Corporation 8 | 9 | optimizations implemented by Ron Brightwell and Keith Underwood (SNL) 10 | ------------------------------------------------------------------------- */ 11 | 12 | /* random update GUPS code with optimizations, power-of-2 number of procs 13 | compile with -DCHECK to check if table updates happen on correct proc */ 14 | 15 | #include "stdio.h" 16 | #include "stdlib.h" 17 | #include "mpi.h" 18 | 19 | #define MAX(a,b) ((a) > (b) ? (a) : (b)) 20 | 21 | #define MAXLOGPROCS (20) 22 | #define CHUNK 1024 23 | /* Optimize CHUNK2 to make sure that we minimize aliasing in the cache -KDU */ 24 | #define CHUNKBIG 10240 25 | /* RCHUNK is constant because we have log2(N) of them, but they are used 26 | * independently on successive iterations (i.e. they don't interfere 27 | * with each other in cache -KDU 28 | */ 29 | #define RCHUNK 4096 30 | #define PITER 8 31 | 32 | /* machine defs 33 | compile with -DLONG64 if a "long" is 64 bits 34 | else compile with no setting if "long long" is 64 bit */ 35 | 36 | #ifdef LONG64 37 | #define POLY 0x0000000000000007UL 38 | #define PERIOD 1317624576693539401L 39 | #define ZERO64B 0L 40 | typedef long s64Int; 41 | typedef unsigned long u64Int; 42 | #define U64INT MPI_UNSIGNED_LONG 43 | #else 44 | #define POLY 0x0000000000000007ULL 45 | #define PERIOD 1317624576693539401LL 46 | #define ZERO64B 0LL 47 | typedef long long s64Int; 48 | typedef unsigned long long u64Int; 49 | #define U64INT MPI_LONG_LONG_INT 50 | #endif 51 | 52 | void sort_data (u64Int *source, u64Int *nomatch, u64Int *match, int number, 53 | int *nnomatch, int *nmatch, int mask_shift); 54 | inline update_table (u64Int *data, u64Int *table, int number, int nlocalm1); 55 | u64Int HPCC_starts(s64Int n); 56 | 57 | int main(int narg, char **arg) 58 | { 59 | int me,nprocs; 60 | int i,j,k,iterate,niterate; 61 | int nlocal,nlocalm1,logtable,index,logtablelocal; 62 | int logprocs,ipartner,ndata,nsend,nkeep,nkept,nrecv; 63 | int maxndata,maxnfinal,nexcess; 64 | int nbad; 65 | double t0,t0_all,Gups; 66 | u64Int *table,*data,*send, *keep_data; 67 | #ifndef USE_BLOCKING_SEND 68 | u64Int *send1,*send2; 69 | #endif 70 | u64Int *recv[PITER][MAXLOGPROCS]; 71 | u64Int ran,datum,procmask,nglobal,offset,nupdates; 72 | u64Int ilong,nexcess_long,nbad_long; 73 | MPI_Status status; 74 | MPI_Request request[PITER][MAXLOGPROCS]; 75 | MPI_Request srequest; 76 | 77 | MPI_Init(&narg,&arg); 78 | MPI_Comm_rank(MPI_COMM_WORLD,&me); 79 | MPI_Comm_size(MPI_COMM_WORLD,&nprocs); 80 | 81 | /* command line args = N M 82 | N = length of global table is 2^N 83 | M = # of 1024-update sets per proc */ 84 | 85 | if (narg != 3) { 86 | if (me == 0) printf("Syntax: gups N M\n"); 87 | MPI_Abort(MPI_COMM_WORLD,1); 88 | } 89 | 90 | logtable = atoi(arg[1]); 91 | niterate = atoi(arg[2]); 92 | 93 | /* insure Nprocs is power of 2 */ 94 | 95 | i = 1; 96 | while (i < nprocs) i *= 2; 97 | if (i != nprocs) { 98 | if (me == 0) printf("Must run on power-of-2 procs\n"); 99 | 
MPI_Abort(MPI_COMM_WORLD,1); 100 | } 101 | 102 | /* nglobal = entire table 103 | nlocal = size of my portion 104 | nlocalm1 = local size - 1 (for index computation) 105 | logtablelocal = log of table size I store 106 | offset = starting index in global table of 1st entry in local table */ 107 | 108 | logprocs = 0; 109 | while (1 << logprocs < nprocs) logprocs++; 110 | 111 | nglobal = ((u64Int) 1) << logtable; 112 | nlocal = nglobal / nprocs; 113 | nlocalm1 = nlocal - 1; 114 | logtablelocal = logtable - logprocs; 115 | offset = (u64Int) nlocal * me; 116 | 117 | /* allocate local memory */ 118 | 119 | table = (u64Int *) malloc(nlocal*sizeof(u64Int)); 120 | data = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 121 | 122 | if (!table || !data) { 123 | if (me == 0) printf("Table allocation failed\n"); 124 | MPI_Abort(MPI_COMM_WORLD,1); 125 | } 126 | 127 | #ifdef USE_BLOCKING_SEND 128 | send = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 129 | if (!send) { 130 | if (me == 0) printf("Table allocation failed\n"); 131 | MPI_Abort(MPI_COMM_WORLD,1); 132 | } 133 | #else 134 | send1 = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 135 | send2 = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 136 | send = send1; 137 | if (!send1 || !send2) { 138 | if (me == 0) printf("Table allocation failed\n"); 139 | MPI_Abort(MPI_COMM_WORLD,1); 140 | } 141 | #endif 142 | 143 | for (j = 0; j < PITER; j++) 144 | for (i=0; i me) { 199 | sort_data(data,data,send,nkept,&nkeep,&nsend,logtablelocal+j); 200 | if (j > 0) { 201 | MPI_Wait(&request[iter_mod][j-1],&status); 202 | MPI_Get_count(&status,U64INT,&nrecv); 203 | 204 | 205 | sort_data(recv[iter_mod][j-1],data,send,nrecv,&nkeep, 206 | &nsend,logtablelocal+j); 207 | } 208 | } else { 209 | sort_data(data,send,data,nkept,&nsend,&nkeep,logtablelocal+j); 210 | if (j > 0) { 211 | MPI_Wait(&request[iter_mod][j-1],&status); 212 | MPI_Get_count(&status,U64INT,&nrecv); 213 | sort_data(recv[iter_mod][j-1],send,data,nrecv,&nsend, 214 | &nkeep,logtablelocal+j); 215 | } 216 | } 217 | #ifdef USE_BLOCKING_SEND 218 | MPI_Send(send,nsend,U64INT,ipartner,0,MPI_COMM_WORLD); 219 | #else 220 | if (j > 0) MPI_Wait(&srequest,&status); 221 | MPI_Isend(send,nsend,U64INT,ipartner,0,MPI_COMM_WORLD,&srequest); 222 | #endif 223 | if (j == (logprocs - 1)) { 224 | update_table(data, table, nkeep, nlocalm1); 225 | } 226 | maxndata = MAX(maxndata,nkept+nrecv); 227 | nkept = nkeep; 228 | } 229 | 230 | if (logprocs == 0) { 231 | update_table(data, table, nkept, nlocalm1); 232 | } else { 233 | MPI_Wait(&request[iter_mod][j-1],&status); 234 | MPI_Get_count(&status,U64INT,&nrecv); 235 | update_table(recv[iter_mod][j-1], table, nrecv, nlocalm1); 236 | #ifndef USE_BLOCKING_SEND 237 | MPI_Wait(&srequest,&status); 238 | #endif 239 | } 240 | 241 | ndata = nkept + nrecv; 242 | maxndata = MAX(maxndata,ndata); 243 | maxnfinal = MAX(maxnfinal,ndata); 244 | if (ndata > CHUNK) nexcess += ndata - CHUNK; 245 | 246 | #ifdef CHECK 247 | procmask = ((u64Int) (nprocs-1)) << logtablelocal; 248 | for (i = 0; i < nkept; i++) 249 | if ((data[i] & procmask) >> logtablelocal != me) nbad++; 250 | for (i = 0; i < nrecv; i++) 251 | if ((recv[iter_mod][j-1][i] & procmask) >> logtablelocal != me) nbad++; 252 | #endif 253 | } 254 | 255 | MPI_Barrier(MPI_COMM_WORLD); 256 | t0 += MPI_Wtime(); 257 | 258 | /* stats */ 259 | 260 | MPI_Allreduce(&t0,&t0_all,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); 261 | t0 = t0_all/nprocs; 262 | 263 | i = maxndata; 264 | MPI_Allreduce(&i,&maxndata,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); 265 | i = maxnfinal; 266 | 
MPI_Allreduce(&i,&maxnfinal,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); 267 | ilong = nexcess; 268 | MPI_Allreduce(&ilong,&nexcess_long,1,U64INT,MPI_SUM,MPI_COMM_WORLD); 269 | ilong = nbad; 270 | MPI_Allreduce(&ilong,&nbad_long,1,U64INT,MPI_SUM,MPI_COMM_WORLD); 271 | 272 | nupdates = (u64Int) niterate * nprocs * CHUNK; 273 | Gups = nupdates / t0 / 1.0e9; 274 | 275 | if (me == 0) { 276 | printf("Number of procs: %d\n",nprocs); 277 | printf("Vector size: %lld\n",nglobal); 278 | printf("Max datums during comm: %d\n",maxndata); 279 | printf("Max datums after comm: %d\n",maxnfinal); 280 | printf("Excess datums (frac): %lld (%g)\n", 281 | nexcess_long,(double) nexcess_long / nupdates); 282 | printf("Bad locality count: %lld\n",nbad_long); 283 | printf("Update time (secs): %9.3f\n",t0); 284 | printf("Gups: %9.6f\n",Gups); 285 | } 286 | 287 | /* clean up */ 288 | 289 | for (j = 0; j < PITER; j++) 290 | for (i = 0; i < logprocs; i++) free(recv[j][i]); 291 | free(table); 292 | free(data); 293 | #ifdef USE_BLOCKING_SEND 294 | free(send); 295 | #else 296 | free(send1); 297 | free(send2); 298 | #endif 299 | MPI_Finalize(); 300 | } 301 | 302 | /* This sort is manually unrolled to make sure the compiler can see 303 | * the parallelism -KDU 304 | */ 305 | 306 | void sort_data(u64Int *source, u64Int *nomatch, u64Int *match, int number, 307 | int *nnomatch, int *nmatch, int mask_shift) 308 | { 309 | int div_num = number / 8; 310 | int loop_total = div_num * 8; 311 | u64Int procmask = ((u64Int) 1) << mask_shift; 312 | int i; 313 | u64Int *buffers[2]; 314 | int counts[2]; 315 | 316 | buffers[0] = nomatch; 317 | counts[0] = *nnomatch; 318 | buffers[1] = match; 319 | counts[1] = *nmatch; 320 | 321 | for (i = 0; i < div_num; i++) { 322 | int dindex = i*8; 323 | int myselect[8]; 324 | myselect[0] = (source[dindex] & procmask) >> mask_shift; 325 | myselect[1] = (source[dindex+1] & procmask) >> mask_shift; 326 | myselect[2] = (source[dindex+2] & procmask) >> mask_shift; 327 | myselect[3] = (source[dindex+3] & procmask) >> mask_shift; 328 | myselect[4] = (source[dindex+4] & procmask) >> mask_shift; 329 | myselect[5] = (source[dindex+5] & procmask) >> mask_shift; 330 | myselect[6] = (source[dindex+6] & procmask) >> mask_shift; 331 | myselect[7] = (source[dindex+7] & procmask) >> mask_shift; 332 | buffers[myselect[0]][counts[myselect[0]]++] = source[dindex]; 333 | buffers[myselect[1]][counts[myselect[1]]++] = source[dindex+1]; 334 | buffers[myselect[2]][counts[myselect[2]]++] = source[dindex+2]; 335 | buffers[myselect[3]][counts[myselect[3]]++] = source[dindex+3]; 336 | buffers[myselect[4]][counts[myselect[4]]++] = source[dindex+4]; 337 | buffers[myselect[5]][counts[myselect[5]]++] = source[dindex+5]; 338 | buffers[myselect[6]][counts[myselect[6]]++] = source[dindex+6]; 339 | buffers[myselect[7]][counts[myselect[7]]++] = source[dindex+7]; 340 | } 341 | 342 | for (i = loop_total; i < number; i++) { 343 | u64Int mydata = source[i]; 344 | if (mydata & procmask) buffers[1][counts[1]++] = mydata; 345 | else buffers[0][counts[0]++] = mydata; 346 | } 347 | 348 | *nnomatch = counts[0]; 349 | *nmatch = counts[1]; 350 | } 351 | 352 | inline update_table(u64Int *data, u64Int *table, int number, int nlocalm1) 353 | { 354 | /* DEEP_UNROLL doesn't seem to improve anything at this time */ 355 | /* Manual unrolling is a significant win if -Msafeptr is used -KDU */ 356 | #ifdef DEEP_UNROLL 357 | int div_num = number / 16; 358 | int loop_total = div_num * 16; 359 | #else 360 | int div_num = number / 8; 361 | int loop_total = div_num * 8; 362 | 
#endif 363 | 364 | int i; 365 | for (i = 0; i < div_num; i++) { 366 | #ifdef DEEP_UNROLL 367 | const int dindex = i*16; 368 | #else 369 | const int dindex = i*8; 370 | #endif 371 | u64Int index0 = data[dindex] & nlocalm1; 372 | u64Int index1 = data[dindex+1] & nlocalm1; 373 | u64Int index2 = data[dindex+2] & nlocalm1; 374 | u64Int index3 = data[dindex+3] & nlocalm1; 375 | u64Int index4 = data[dindex+4] & nlocalm1; 376 | u64Int index5 = data[dindex+5] & nlocalm1; 377 | u64Int index6 = data[dindex+6] & nlocalm1; 378 | u64Int index7 = data[dindex+7] & nlocalm1; 379 | u64Int ltable0 = table[index0]; 380 | u64Int ltable1 = table[index1]; 381 | u64Int ltable2 = table[index2]; 382 | u64Int ltable3 = table[index3]; 383 | u64Int ltable4 = table[index4]; 384 | u64Int ltable5 = table[index5]; 385 | u64Int ltable6 = table[index6]; 386 | u64Int ltable7 = table[index7]; 387 | #ifdef DEEP_UNROLL 388 | u64Int index8 = data[dindex+8] & nlocalm1; 389 | u64Int index9 = data[dindex+9] & nlocalm1; 390 | u64Int index10 = data[dindex+10] & nlocalm1; 391 | u64Int index11 = data[dindex+11] & nlocalm1; 392 | u64Int index12 = data[dindex+12] & nlocalm1; 393 | u64Int index13 = data[dindex+13] & nlocalm1; 394 | u64Int index14 = data[dindex+14] & nlocalm1; 395 | u64Int index15 = data[dindex+15] & nlocalm1; 396 | u64Int ltable8 = table[index8]; 397 | u64Int ltable9 = table[index9]; 398 | u64Int ltable10 = table[index10]; 399 | u64Int ltable11 = table[index11]; 400 | u64Int ltable12 = table[index12]; 401 | u64Int ltable13 = table[index13]; 402 | u64Int ltable14 = table[index14]; 403 | u64Int ltable15 = table[index15]; 404 | #endif 405 | table[index0] = ltable0 ^ data[dindex]; 406 | table[index1] = ltable1 ^ data[dindex+1]; 407 | table[index2] = ltable2 ^ data[dindex+2]; 408 | table[index3] = ltable3 ^ data[dindex+3]; 409 | table[index4] = ltable4 ^ data[dindex+4]; 410 | table[index5] = ltable5 ^ data[dindex+5]; 411 | table[index6] = ltable6 ^ data[dindex+6]; 412 | table[index7] = ltable7 ^ data[dindex+7]; 413 | #ifdef DEEP_UNROLL 414 | table[index8] = ltable8 ^ data[dindex+8]; 415 | table[index9] = ltable9 ^ data[dindex+9]; 416 | table[index10] = ltable10 ^ data[dindex+10]; 417 | table[index11] = ltable11 ^ data[dindex+11]; 418 | table[index12] = ltable12 ^ data[dindex+12]; 419 | table[index13] = ltable13 ^ data[dindex+13]; 420 | table[index14] = ltable14 ^ data[dindex+14]; 421 | table[index15] = ltable15 ^ data[dindex+15]; 422 | #endif 423 | } 424 | 425 | for (i = loop_total; i < number; i++) { 426 | u64Int datum = data[i]; 427 | int index = datum & nlocalm1; 428 | table[index] ^= datum; 429 | } 430 | } 431 | 432 | /* start random number generator at Nth step of stream 433 | routine provided by HPCC */ 434 | 435 | u64Int HPCC_starts(s64Int n) 436 | { 437 | int i, j; 438 | u64Int m2[64]; 439 | u64Int temp, ran; 440 | 441 | while (n < 0) n += PERIOD; 442 | while (n > PERIOD) n -= PERIOD; 443 | if (n == 0) return 0x1; 444 | 445 | temp = 0x1; 446 | for (i=0; i<64; i++) { 447 | m2[i] = temp; 448 | temp = (temp << 1) ^ ((s64Int) temp < 0 ? POLY : 0); 449 | temp = (temp << 1) ^ ((s64Int) temp < 0 ? POLY : 0); 450 | } 451 | 452 | for (i=62; i>=0; i--) 453 | if ((n >> i) & 1) 454 | break; 455 | 456 | ran = 0x2; 457 | while (i > 0) { 458 | temp = 0; 459 | for (j=0; j<64; j++) 460 | if ((ran >> j) & 1) 461 | temp ^= m2[j]; 462 | ran = temp; 463 | i -= 1; 464 | if ((n >> i) & 1) 465 | ran = (ran << 1) ^ ((s64Int) ran < 0 ? 
POLY : 0); 466 | } 467 | 468 | return ran; 469 | } 470 | -------------------------------------------------------------------------------- /MPIRandomAccess_vanilla.c: -------------------------------------------------------------------------------- 1 | /* -*- mode: C; tab-width: 2; indent-tabs-mode: nil; -*- */ 2 | 3 | /* 4 | * This code has been contributed by the DARPA HPCS program. Contact 5 | * David Koester or Bob Lucas 6 | * if you have questions. 7 | * 8 | * 9 | * GUPS (Giga UPdates per Second) is a measurement that profiles the memory 10 | * architecture of a system and is a measure of performance similar to MFLOPS. 11 | * The HPCS HPCchallenge RandomAccess benchmark is intended to exercise the 12 | * GUPS capability of a system, much like the LINPACK benchmark is intended to 13 | * exercise the MFLOPS capability of a computer. In each case, we would 14 | * expect these benchmarks to achieve close to the "peak" capability of the 15 | * memory system. The extent of the similarities between RandomAccess and 16 | * LINPACK are limited to both benchmarks attempting to calculate a peak system 17 | * capability. 18 | * 19 | * GUPS is calculated by identifying the number of memory locations that can be 20 | * randomly updated in one second, divided by 1 billion (1e9). The term "randomly" 21 | * means that there is little relationship between one address to be updated and 22 | * the next, except that they occur in the space of one half the total system 23 | * memory. An update is a read-modify-write operation on a table of 64-bit words. 24 | * An address is generated, the value at that address read from memory, modified 25 | * by an integer operation (add, and, or, xor) with a literal value, and that 26 | * new value is written back to memory. 27 | * 28 | * We are interested in knowing the GUPS performance of both entire systems and 29 | * system subcomponents --- e.g., the GUPS rating of a distributed memory 30 | * multiprocessor the GUPS rating of an SMP node, and the GUPS rating of a 31 | * single processor. While there is typically a scaling of FLOPS with processor 32 | * count, a similar phenomenon may not always occur for GUPS. 33 | * 34 | * Select the memory size to be the power of two such that 2^n <= 1/2 of the 35 | * total memory. Each CPU operates on its own address stream, and the single 36 | * table may be distributed among nodes. The distribution of memory to nodes 37 | * is left to the implementer. A uniform data distribution may help balance 38 | * the workload, while non-uniform data distributions may simplify the 39 | * calculations that identify processor location by eliminating the requirement 40 | * for integer divides. A small (less than 1%) percentage of missed updates 41 | * are permitted. 42 | * 43 | * When implementing a benchmark that measures GUPS on a distributed memory 44 | * multiprocessor system, it may be required to define constraints as to how 45 | * far in the random address stream each node is permitted to "look ahead". 46 | * Likewise, it may be required to define a constraint as to the number of 47 | * update messages that can be stored before processing to permit multi-level 48 | * parallelism for those systems that support such a paradigm. The limits on 49 | * "look ahead" and "stored updates" are being implemented to assure that the 50 | * benchmark meets the intent to profile memory architecture and not induce 51 | * significant artificial data locality. 
For the purpose of measuring GUPS, 52 | * we will stipulate that each thread is permitted to look ahead no more than 53 | * 1024 random address stream samples with the same number of update messages 54 | * stored before processing. 55 | * 56 | * The supplied MPI-1 code generates the input stream {A} on all processors 57 | * and the global table has been distributed as uniformly as possible to 58 | * balance the workload and minimize any Amdahl fraction. This code does not 59 | * exploit "look-ahead". Addresses are sent to the appropriate processor 60 | * where the table entry resides as soon as each address is calculated. 61 | * Updates are performed as addresses are received. Each message is limited 62 | * to a single 64 bit long integer containing element ai from {A}. 63 | * Local offsets for T[ ] are extracted by the destination processor. 64 | * 65 | * If the number of processors is equal to a power of two, then the global 66 | * table can be distributed equally over the processors. In addition, the 67 | * processor number can be determined from that portion of the input stream 68 | * that identifies the address into the global table by masking off log2(p) 69 | * bits in the address. 70 | * 71 | * If the number of processors is not equal to a power of two, then the global 72 | * table cannot be equally distributed between processors. In the MPI-1 73 | * implementation provided, there has been an attempt to minimize the differences 74 | * in workloads and the largest difference in elements of T[ ] is one. The 75 | * number of values in the input stream generated by each processor will be 76 | * related to the number of global table entries on each processor. 77 | * 78 | * The MPI-1 version of RandomAccess treats the potential instance where the 79 | * number of processors is a power of two as a special case, because of the 80 | * significant simplifications possible because processor location and local 81 | * offset can be determined by applying masks to the input stream values. 82 | * The non power of two case uses an integer division to determine the processor 83 | * location. The integer division will be more costly in terms of machine 84 | * cycles to perform than the bit masking operations 85 | * 86 | * For additional information on the GUPS metric, the HPCchallenge RandomAccess 87 | * Benchmark,and the rules to run RandomAccess or modify it to optimize 88 | * performance -- see http://icl.cs.utk.edu/hpcc/ 89 | * 90 | */ 91 | 92 | /* Jan 2005 93 | * 94 | * This code has been modified to allow local bucket sorting of updates. 95 | * The total maximum number of updates in the local buckets of a process 96 | * is currently defined in "RandomAccess.h" as MAX_TOTAL_PENDING_UPDATES. 97 | * When the total maximum number of updates is reached, the process selects 98 | * the bucket (or destination process) with the largest number of 99 | * updates and sends out all the updates in that bucket. See buckets.c 100 | * for details about the buckets' implementation. 101 | * 102 | * This code also supports posting multiple MPI receive descriptors (based 103 | * on a contribution by David Addison). 104 | * 105 | * In addition, this implementation provides an option for limiting 106 | * the execution time of the benchmark to a specified time bound 107 | * (see time_bound.c). The time bound is currently defined in 108 | * time_bound.h, but it should be a benchmark parameter. 
By default 109 | * the benchmark will execute the recommended number of updates, 110 | * that is, four times the global table size. 111 | */ 112 | 113 | #include 114 | 115 | #include "RandomAccess.h" 116 | #include "buckets.h" 117 | #include "time_bound.h" 118 | #include "verification.h" 119 | 120 | #define CHUNK (1024) 121 | #define CHUNKBIG (32768) 122 | 123 | /* Allocate main table (in global memory) */ 124 | u64Int *HPCC_Table; 125 | 126 | u64Int LocalSendBuffer[LOCAL_BUFFER_SIZE]; 127 | u64Int LocalRecvBuffer[MAX_RECV*LOCAL_BUFFER_SIZE]; 128 | 129 | #ifndef LONG_IS_64BITS 130 | static void 131 | Sum64(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { 132 | int i, n = *len; s64Int *invec64 = (s64Int *)invec, *inoutvec64 = (s64Int *)inoutvec; 133 | for (i = n; i; i--, invec64++, inoutvec64++) *inoutvec64 += *invec64; 134 | } 135 | #endif 136 | 137 | static void 138 | AnyNodesMPIRandomAccessUpdate(u64Int logTableSize, 139 | u64Int TableSize, 140 | u64Int LocalTableSize, 141 | u64Int MinLocalTableSize, 142 | u64Int GlobalStartMyProc, 143 | u64Int Top, 144 | int logNumProcs, 145 | int NumProcs, 146 | int Remainder, 147 | int MyProc, 148 | s64Int ProcNumUpdates, 149 | MPI_Datatype INT64_DT) 150 | { 151 | int i,j; 152 | int ipartner,iterate,niterate,npartition,proclo,nlower,nupper,procmid; 153 | int ndata,nkeep,nsend,nrecv,index, nfrac; 154 | u64Int ran,datum,nglobalm1,indexmid; 155 | u64Int *data,*send, *offsets; 156 | MPI_Status status; 157 | 158 | /* setup: should not really be part of this timed routine 159 | NOTE: niterate must be computed from global TableSize * 4 160 | not from ProcNumUpdates since that can be different on each proc 161 | round niterate up by 1 to do slightly more than required updates */ 162 | 163 | data = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 164 | send = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 165 | 166 | for (i = 0; i < LocalTableSize; i++) 167 | HPCC_Table[i] = i + GlobalStartMyProc; 168 | 169 | ran = HPCC_starts(4*GlobalStartMyProc); 170 | 171 | offsets = (u64Int *) malloc((NumProcs+1)*sizeof(u64Int)); 172 | MPI_Allgather(&GlobalStartMyProc,1,INT64_DT,offsets,1,INT64_DT, 173 | MPI_COMM_WORLD); 174 | offsets[NumProcs] = TableSize; 175 | 176 | niterate = 4 * TableSize / NumProcs / CHUNK + 1; 177 | nglobalm1 = TableSize - 1; 178 | 179 | /* actual update loop: this is only section that should be timed */ 180 | 181 | for (iterate = 0; iterate < niterate; iterate++) { 182 | for (i = 0; i < CHUNK; i++) { 183 | ran = (ran << 1) ^ ((s64Int) ran < ZERO64B ? 
POLY : ZERO64B); 184 | data[i] = ran; 185 | } 186 | ndata = CHUNK; 187 | 188 | npartition = NumProcs; 189 | proclo = 0; 190 | while (npartition > 1) { 191 | nlower = npartition/2; 192 | nupper = npartition - nlower; 193 | procmid = proclo + nlower; 194 | indexmid = offsets[procmid]; 195 | 196 | nkeep = nsend = 0; 197 | if (MyProc < procmid) { 198 | for (i = 0; i < ndata; i++) { 199 | if ((data[i] & nglobalm1) >= indexmid) send[nsend++] = data[i]; 200 | else data[nkeep++] = data[i]; 201 | } 202 | } else { 203 | for (i = 0; i < ndata; i++) { 204 | if ((data[i] & nglobalm1) < indexmid) send[nsend++] = data[i]; 205 | else data[nkeep++] = data[i]; 206 | } 207 | } 208 | 209 | if (nlower == nupper) { 210 | if (MyProc < procmid) ipartner = MyProc + nlower; 211 | else ipartner = MyProc - nlower; 212 | MPI_Sendrecv(send,nsend,INT64_DT,ipartner,0,&data[nkeep], 213 | CHUNKBIG,INT64_DT,ipartner,0,MPI_COMM_WORLD,&status); 214 | MPI_Get_count(&status,INT64_DT,&nrecv); 215 | ndata = nkeep + nrecv; 216 | } else { 217 | if (MyProc < procmid) { 218 | nfrac = (nlower - (MyProc-proclo)) * nsend / nupper; 219 | ipartner = MyProc + nlower; 220 | MPI_Sendrecv(send,nfrac,INT64_DT,ipartner,0,&data[nkeep], 221 | CHUNKBIG,INT64_DT,ipartner,0,MPI_COMM_WORLD,&status); 222 | MPI_Get_count(&status,INT64_DT,&nrecv); 223 | nkeep += nrecv; 224 | MPI_Sendrecv(&send[nfrac],nsend-nfrac,INT64_DT,ipartner+1,0, 225 | &data[nkeep],CHUNKBIG,INT64_DT, 226 | ipartner+1,0,MPI_COMM_WORLD,&status); 227 | MPI_Get_count(&status,INT64_DT,&nrecv); 228 | ndata = nkeep + nrecv; 229 | } else if (MyProc > procmid && MyProc < procmid+nlower) { 230 | nfrac = (MyProc - procmid) * nsend / nlower; 231 | ipartner = MyProc - nlower; 232 | MPI_Sendrecv(&send[nfrac],nsend-nfrac,INT64_DT,ipartner,0, 233 | &data[nkeep],CHUNKBIG,INT64_DT, 234 | ipartner,0,MPI_COMM_WORLD,&status); 235 | MPI_Get_count(&status,INT64_DT,&nrecv); 236 | nkeep += nrecv; 237 | MPI_Sendrecv(send,nfrac,INT64_DT,ipartner-1,0,&data[nkeep], 238 | CHUNKBIG,INT64_DT,ipartner-1,0,MPI_COMM_WORLD,&status); 239 | MPI_Get_count(&status,INT64_DT,&nrecv); 240 | ndata = nkeep + nrecv; 241 | } else { 242 | if (MyProc == procmid) ipartner = MyProc - nlower; 243 | else ipartner = MyProc - nupper; 244 | MPI_Sendrecv(send,nsend,INT64_DT,ipartner,0,&data[nkeep], 245 | CHUNKBIG,INT64_DT,ipartner,0,MPI_COMM_WORLD,&status); 246 | MPI_Get_count(&status,INT64_DT,&nrecv); 247 | ndata = nkeep + nrecv; 248 | } 249 | } 250 | 251 | if (MyProc < procmid) npartition = nlower; 252 | else { 253 | proclo = procmid; 254 | npartition = nupper; 255 | } 256 | } 257 | 258 | for (i = 0; i < ndata; i++) { 259 | datum = data[i]; 260 | index = (datum & nglobalm1) - GlobalStartMyProc; 261 | HPCC_Table[index] ^= datum; 262 | } 263 | } 264 | 265 | /* clean up: should not really be part of this timed routine */ 266 | 267 | free(data); 268 | free(send); 269 | free(offsets); 270 | } 271 | 272 | static void 273 | Power2NodesMPIRandomAccessUpdate(u64Int logTableSize, 274 | u64Int TableSize, 275 | u64Int LocalTableSize, 276 | u64Int MinLocalTableSize, 277 | u64Int GlobalStartMyProc, 278 | u64Int Top, 279 | int logNumProcs, 280 | int NumProcs, 281 | int Remainder, 282 | int MyProc, 283 | s64Int ProcNumUpdates, 284 | MPI_Datatype INT64_DT) 285 | { 286 | int i,j; 287 | int logTableLocal,ipartner,iterate,niterate; 288 | int ndata,nkeep,nsend,nrecv,index,nlocalm1; 289 | u64Int ran,datum,procmask; 290 | u64Int *data,*send; 291 | MPI_Status status; 292 | 293 | /* setup: should not really be part of this timed routine */ 294 | 295 | data = 
(u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 296 | send = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 297 | 298 | for (i = 0; i < LocalTableSize; i++) 299 | HPCC_Table[i] = i + GlobalStartMyProc; 300 | 301 | ran = HPCC_starts(4*GlobalStartMyProc); 302 | 303 | niterate = ProcNumUpdates / CHUNK; 304 | logTableLocal = logTableSize - logNumProcs; 305 | nlocalm1 = LocalTableSize - 1; 306 | 307 | /* actual update loop: this is only section that should be timed */ 308 | 309 | for (iterate = 0; iterate < niterate; iterate++) { 310 | for (i = 0; i < CHUNK; i++) { 311 | ran = (ran << 1) ^ ((s64Int) ran < ZERO64B ? POLY : ZERO64B); 312 | data[i] = ran; 313 | } 314 | ndata = CHUNK; 315 | 316 | for (j = 0; j < logNumProcs; j++) { 317 | nkeep = nsend = 0; 318 | ipartner = (1 << j) ^ MyProc; 319 | procmask = ((u64Int) 1) << (logTableLocal + j); 320 | if (ipartner > MyProc) { 321 | for (i = 0; i < ndata; i++) { 322 | if (data[i] & procmask) send[nsend++] = data[i]; 323 | else data[nkeep++] = data[i]; 324 | } 325 | } else { 326 | for (i = 0; i < ndata; i++) { 327 | if (data[i] & procmask) data[nkeep++] = data[i]; 328 | else send[nsend++] = data[i]; 329 | } 330 | } 331 | 332 | MPI_Sendrecv(send,nsend,INT64_DT,ipartner,0, 333 | &data[nkeep],CHUNKBIG,INT64_DT, 334 | ipartner,0,MPI_COMM_WORLD,&status); 335 | MPI_Get_count(&status,INT64_DT,&nrecv); 336 | ndata = nkeep + nrecv; 337 | } 338 | 339 | for (i = 0; i < ndata; i++) { 340 | datum = data[i]; 341 | index = datum & nlocalm1; 342 | HPCC_Table[index] ^= datum; 343 | } 344 | } 345 | 346 | /* clean up: should not really be part of this timed routine */ 347 | 348 | free(data); 349 | free(send); 350 | } 351 | 352 | int 353 | HPCC_MPIRandomAccess(HPCC_Params *params) { 354 | s64Int i; 355 | s64Int NumErrors, GlbNumErrors; 356 | 357 | int NumProcs, logNumProcs, MyProc; 358 | u64Int GlobalStartMyProc; 359 | int Remainder; /* Number of processors with (LocalTableSize + 1) entries */ 360 | u64Int Top; /* Number of table entries in top of Table */ 361 | u64Int LocalTableSize; /* Local table width */ 362 | u64Int MinLocalTableSize; /* Integer ratio TableSize/NumProcs */ 363 | u64Int logTableSize, TableSize; 364 | 365 | double CPUTime; /* CPU time to update table */ 366 | double RealTime; /* Real time to update table */ 367 | 368 | double TotalMem; 369 | int sAbort, rAbort; 370 | int PowerofTwo; 371 | 372 | double timeBound; /* OPTIONAL time bound for execution time */ 373 | u64Int NumUpdates_Default; /* Number of updates to table (suggested: 4x number of table entries) */ 374 | u64Int NumUpdates; /* actual number of updates to table - may be smaller than 375 | * NumUpdates_Default due to execution time bounds */ 376 | s64Int ProcNumUpdates; /* number of updates per processor */ 377 | s64Int GlbNumUpdates; /* for reduction */ 378 | 379 | FILE *outFile = NULL; 380 | MPI_Op sum64; 381 | double *GUPs; 382 | 383 | MPI_Datatype INT64_DT; 384 | 385 | #ifdef LONG_IS_64BITS 386 | INT64_DT = MPI_LONG; 387 | #else 388 | INT64_DT = MPI_LONG_LONG_INT; 389 | #endif 390 | 391 | GUPs = ¶ms->MPIGUPs; 392 | 393 | MPI_Comm_size( MPI_COMM_WORLD, &NumProcs ); 394 | MPI_Comm_rank( MPI_COMM_WORLD, &MyProc ); 395 | 396 | if (0 == MyProc) { 397 | outFile = fopen( params->outFname, "a" ); 398 | if (! 
outFile) outFile = stderr; 399 | } 400 | 401 | TotalMem = params->HPLMaxProcMem; /* max single node memory */ 402 | TotalMem *= NumProcs; /* max memory in NumProcs nodes */ 403 | TotalMem /= sizeof(u64Int); 404 | 405 | /* calculate TableSize --- the size of update array (must be a power of 2) */ 406 | for (TotalMem *= 0.5, logTableSize = 0, TableSize = 1; 407 | TotalMem >= 1.0; 408 | TotalMem *= 0.5, logTableSize++, TableSize <<= 1) 409 | ; /* EMPTY */ 410 | 411 | 412 | /* determine whether the number of processors is a power of 2 */ 413 | for (i = 1, logNumProcs = 0; ; logNumProcs++, i <<= 1) { 414 | if (i == NumProcs) { 415 | PowerofTwo = HPCC_TRUE; 416 | Remainder = 0; 417 | Top = 0; 418 | MinLocalTableSize = (TableSize / NumProcs); 419 | LocalTableSize = MinLocalTableSize; 420 | GlobalStartMyProc = (MinLocalTableSize * MyProc); 421 | break; 422 | 423 | /* number of processes is not a power 2 (too many shifts may introduce negative values or 0) */ 424 | 425 | } 426 | else if (i > NumProcs || i <= 0) { 427 | PowerofTwo = HPCC_FALSE; 428 | /* Minimum local table size --- some processors have an additional entry */ 429 | MinLocalTableSize = (TableSize / NumProcs); 430 | /* Number of processors with (LocalTableSize + 1) entries */ 431 | Remainder = TableSize - (MinLocalTableSize * NumProcs); 432 | /* Number of table entries in top of Table */ 433 | Top = (MinLocalTableSize + 1) * Remainder; 434 | /* Local table size */ 435 | if (MyProc < Remainder) { 436 | LocalTableSize = (MinLocalTableSize + 1); 437 | GlobalStartMyProc = ( (MinLocalTableSize + 1) * MyProc); 438 | } 439 | else { 440 | LocalTableSize = MinLocalTableSize; 441 | GlobalStartMyProc = ( (MinLocalTableSize * MyProc) + Remainder ); 442 | } 443 | break; 444 | 445 | } /* end else if */ 446 | } /* end for i */ 447 | 448 | 449 | HPCC_Table = XMALLOC( u64Int, LocalTableSize); 450 | sAbort = 0; if (! 
HPCC_Table) sAbort = 1; 451 | 452 | MPI_Allreduce( &sAbort, &rAbort, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); 453 | if (rAbort > 0) { 454 | if (MyProc == 0) fprintf(outFile, "Failed to allocate memory for the main table.\n"); 455 | goto failed_table; 456 | } 457 | 458 | params->MPIRandomAccess_N = (s64Int)TableSize; 459 | 460 | /* Default number of global updates to table: 4x number of table entries */ 461 | NumUpdates_Default = 4 * TableSize; 462 | 463 | #ifdef RA_TIME_BOUND 464 | /* estimate number of updates such that execution time does not exceed time bound */ 465 | /* time_bound should be a parameter */ 466 | /* max run time in seconds */ 467 | timeBound = Mmax( 0.25 * params->HPLrdata.time, (double)TIME_BOUND ); 468 | if (PowerofTwo) { 469 | HPCC_Power2NodesTime(logTableSize, TableSize, LocalTableSize, 470 | MinLocalTableSize, GlobalStartMyProc, Top, 471 | logNumProcs, NumProcs, Remainder, 472 | MyProc, INT64_DT, timeBound, (u64Int *)&ProcNumUpdates); 473 | 474 | } else { 475 | HPCC_AnyNodesTime(logTableSize, TableSize, LocalTableSize, 476 | MinLocalTableSize, GlobalStartMyProc, Top, 477 | logNumProcs, NumProcs, Remainder, 478 | MyProc, INT64_DT, timeBound, (u64Int *)&ProcNumUpdates); 479 | } 480 | /* be conservative: get the smallest number of updates among all procs */ 481 | MPI_Reduce( &ProcNumUpdates, &GlbNumUpdates, 1, INT64_DT, 482 | MPI_MIN, 0, MPI_COMM_WORLD ); 483 | /* distribute number of updates per proc to all procs */ 484 | MPI_Bcast( &GlbNumUpdates, 1, INT64_DT, 0, MPI_COMM_WORLD ); 485 | ProcNumUpdates = Mmin(GlbNumUpdates, (4*LocalTableSize)); 486 | /* works for both PowerofTwo and AnyNodes */ 487 | NumUpdates = Mmin((ProcNumUpdates*NumProcs), NumUpdates_Default); 488 | 489 | #else 490 | ProcNumUpdates = 4*LocalTableSize; 491 | NumUpdates = NumUpdates_Default; 492 | #endif 493 | 494 | if (MyProc == 0) { 495 | fprintf( outFile, "Running on %d processors%s\n", NumProcs, PowerofTwo ? 
" (PowerofTwo)" : ""); 496 | fprintf( outFile, "Total Main table size = 2^" FSTR64 " = " FSTR64 " words\n", 497 | logTableSize, TableSize ); 498 | if (PowerofTwo) 499 | fprintf( outFile, "PE Main table size = 2^" FSTR64 " = " FSTR64 " words/PE\n", 500 | (logTableSize - logNumProcs), TableSize/NumProcs ); 501 | else 502 | fprintf( outFile, "PE Main table size = (2^" FSTR64 ")/%d = " FSTR64 " words/PE MAX\n", 503 | logTableSize, NumProcs, LocalTableSize); 504 | 505 | fprintf( outFile, "Default number of updates (RECOMMENDED) = " FSTR64 "\n", NumUpdates_Default); 506 | #ifdef RA_TIME_BOUND 507 | fprintf( outFile, "Number of updates EXECUTED = " FSTR64 " (for a TIME BOUND of %.2f secs)\n", 508 | NumUpdates, timeBound); 509 | #endif 510 | params->MPIRandomAccess_ExeUpdates = NumUpdates; 511 | params->MPIRandomAccess_TimeBound = timeBound; 512 | } 513 | 514 | MPI_Barrier( MPI_COMM_WORLD ); 515 | 516 | CPUTime = -CPUSEC(); 517 | RealTime = -RTSEC(); 518 | 519 | if (PowerofTwo) { 520 | Power2NodesMPIRandomAccessUpdate(logTableSize, TableSize, LocalTableSize, 521 | MinLocalTableSize, GlobalStartMyProc, Top, 522 | logNumProcs, NumProcs, Remainder, 523 | MyProc, ProcNumUpdates, INT64_DT); 524 | } else { 525 | AnyNodesMPIRandomAccessUpdate(logTableSize, TableSize, LocalTableSize, 526 | MinLocalTableSize, GlobalStartMyProc, Top, 527 | logNumProcs, NumProcs, Remainder, 528 | MyProc, ProcNumUpdates, INT64_DT); 529 | } 530 | 531 | 532 | MPI_Barrier( MPI_COMM_WORLD ); 533 | 534 | /* End timed section */ 535 | CPUTime += CPUSEC(); 536 | RealTime += RTSEC(); 537 | 538 | /* Print timing results */ 539 | if (MyProc == 0){ 540 | params->MPIRandomAccess_time = RealTime; 541 | *GUPs = 1e-9*NumUpdates / RealTime; 542 | fprintf( outFile, "CPU time used = %.6f seconds\n", CPUTime ); 543 | fprintf( outFile, "Real time used = %.6f seconds\n", RealTime ); 544 | fprintf( outFile, "%.9f Billion(10^9) Updates per second [GUP/s]\n", 545 | *GUPs ); 546 | fprintf( outFile, "%.9f Billion(10^9) Updates/PE per second [GUP/s]\n", 547 | *GUPs / NumProcs ); 548 | /* No longer reporting per CPU number */ 549 | /* *GUPs /= NumProcs; */ 550 | } 551 | /* distribute result to all nodes */ 552 | MPI_Bcast( GUPs, 1, MPI_INT, 0, MPI_COMM_WORLD ); 553 | 554 | 555 | /* Verification phase */ 556 | 557 | /* Begin timing here */ 558 | CPUTime = -CPUSEC(); 559 | RealTime = -RTSEC(); 560 | 561 | if (PowerofTwo) { 562 | HPCC_Power2NodesMPIRandomAccessCheck(logTableSize, TableSize, LocalTableSize, 563 | GlobalStartMyProc, 564 | logNumProcs, NumProcs, 565 | MyProc, ProcNumUpdates, 566 | INT64_DT, &NumErrors); 567 | } 568 | else { 569 | HPCC_AnyNodesMPIRandomAccessCheck(logTableSize, TableSize, LocalTableSize, 570 | MinLocalTableSize, GlobalStartMyProc, Top, 571 | logNumProcs, NumProcs, Remainder, 572 | MyProc, ProcNumUpdates, 573 | INT64_DT, &NumErrors); 574 | } 575 | 576 | 577 | #ifdef LONG_IS_64BITS 578 | MPI_Reduce( &NumErrors, &GlbNumErrors, 1, MPI_LONG, MPI_SUM, 0, MPI_COMM_WORLD ); 579 | #else 580 | /* MPI 1.1 standard (obsolete at this point) doesn't define MPI_SUM 581 | to work on `long long': 582 | http://www.mpi-forum.org/docs/mpi-11-html/node78.html and 583 | therefore LAM 6.5.6 chooses not to implement it (even though there 584 | is code for it in LAM and for other reductions work OK, 585 | e.g. MPI_MAX). 
MPICH 1.2.5 doesn't complain about MPI_SUM but it 586 | doesn't have MPI_UNSIGNED_LONG_LONG (but has MPI_LONG_LONG_INT): 587 | http://www.mpi-forum.org/docs/mpi-20-html/node84.htm So I need to 588 | create a trivial summation operation. */ 589 | MPI_Op_create( Sum64, 1, &sum64 ); 590 | MPI_Reduce( &NumErrors, &GlbNumErrors, 1, INT64_DT, sum64, 0, MPI_COMM_WORLD ); 591 | MPI_Op_free( &sum64 ); 592 | #endif 593 | 594 | /* End timed section */ 595 | CPUTime += CPUSEC(); 596 | RealTime += RTSEC(); 597 | 598 | if(MyProc == 0){ 599 | params->MPIRandomAccess_CheckTime = RealTime; 600 | fprintf( outFile, "Verification: CPU time used = %.6f seconds\n", CPUTime); 601 | fprintf( outFile, "Verification: Real time used = %.6f seconds\n", RealTime); 602 | fprintf( outFile, "Found " FSTR64 " errors in " FSTR64 " locations (%s).\n", 603 | GlbNumErrors, TableSize, (GlbNumErrors <= 0.01*TableSize) ? 604 | "passed" : "failed"); 605 | if (GlbNumErrors > 0.01*TableSize) params->Failure = 1; 606 | params->MPIRandomAccess_Errors = (s64Int)GlbNumErrors; 607 | params->MPIRandomAccess_ErrorsFraction = (double)GlbNumErrors / (double)TableSize; 608 | } 609 | /* End verification phase */ 610 | 611 | 612 | /* Deallocate memory (in reverse order of allocation which should 613 | help fragmentation) */ 614 | 615 | free( HPCC_Table ); 616 | 617 | failed_table: 618 | 619 | if (0 == MyProc) if (outFile != stderr) fclose( outFile ); 620 | 621 | MPI_Barrier( MPI_COMM_WORLD ); 622 | 623 | return 0; 624 | } 625 | -------------------------------------------------------------------------------- /MPIRandomAccess_opt.c: -------------------------------------------------------------------------------- 1 | /* -*- mode: C; tab-width: 2; indent-tabs-mode: nil; -*- */ 2 | 3 | /* 4 | * This code has been contributed by the DARPA HPCS program. Contact 5 | * David Koester or Bob Lucas 6 | * if you have questions. 7 | * 8 | * 9 | * GUPS (Giga UPdates per Second) is a measurement that profiles the memory 10 | * architecture of a system and is a measure of performance similar to MFLOPS. 11 | * The HPCS HPCchallenge RandomAccess benchmark is intended to exercise the 12 | * GUPS capability of a system, much like the LINPACK benchmark is intended to 13 | * exercise the MFLOPS capability of a computer. In each case, we would 14 | * expect these benchmarks to achieve close to the "peak" capability of the 15 | * memory system. The extent of the similarities between RandomAccess and 16 | * LINPACK are limited to both benchmarks attempting to calculate a peak system 17 | * capability. 18 | * 19 | * GUPS is calculated by identifying the number of memory locations that can be 20 | * randomly updated in one second, divided by 1 billion (1e9). The term "randomly" 21 | * means that there is little relationship between one address to be updated and 22 | * the next, except that they occur in the space of one half the total system 23 | * memory. An update is a read-modify-write operation on a table of 64-bit words. 24 | * An address is generated, the value at that address read from memory, modified 25 | * by an integer operation (add, and, or, xor) with a literal value, and that 26 | * new value is written back to memory. 27 | * 28 | * We are interested in knowing the GUPS performance of both entire systems and 29 | * system subcomponents --- e.g., the GUPS rating of a distributed memory 30 | * multiprocessor the GUPS rating of an SMP node, and the GUPS rating of a 31 | * single processor. 
While there is typically a scaling of FLOPS with processor 32 | * count, a similar phenomenon may not always occur for GUPS. 33 | * 34 | * Select the memory size to be the power of two such that 2^n <= 1/2 of the 35 | * total memory. Each CPU operates on its own address stream, and the single 36 | * table may be distributed among nodes. The distribution of memory to nodes 37 | * is left to the implementer. A uniform data distribution may help balance 38 | * the workload, while non-uniform data distributions may simplify the 39 | * calculations that identify processor location by eliminating the requirement 40 | * for integer divides. A small (less than 1%) percentage of missed updates 41 | * are permitted. 42 | * 43 | * When implementing a benchmark that measures GUPS on a distributed memory 44 | * multiprocessor system, it may be required to define constraints as to how 45 | * far in the random address stream each node is permitted to "look ahead". 46 | * Likewise, it may be required to define a constraint as to the number of 47 | * update messages that can be stored before processing to permit multi-level 48 | * parallelism for those systems that support such a paradigm. The limits on 49 | * "look ahead" and "stored updates" are being implemented to assure that the 50 | * benchmark meets the intent to profile memory architecture and not induce 51 | * significant artificial data locality. For the purpose of measuring GUPS, 52 | * we will stipulate that each thread is permitted to look ahead no more than 53 | * 1024 random address stream samples with the same number of update messages 54 | * stored before processing. 55 | * 56 | * The supplied MPI-1 code generates the input stream {A} on all processors 57 | * and the global table has been distributed as uniformly as possible to 58 | * balance the workload and minimize any Amdahl fraction. This code does not 59 | * exploit "look-ahead". Addresses are sent to the appropriate processor 60 | * where the table entry resides as soon as each address is calculated. 61 | * Updates are performed as addresses are received. Each message is limited 62 | * to a single 64 bit long integer containing element ai from {A}. 63 | * Local offsets for T[ ] are extracted by the destination processor. 64 | * 65 | * If the number of processors is equal to a power of two, then the global 66 | * table can be distributed equally over the processors. In addition, the 67 | * processor number can be determined from that portion of the input stream 68 | * that identifies the address into the global table by masking off log2(p) 69 | * bits in the address. 70 | * 71 | * If the number of processors is not equal to a power of two, then the global 72 | * table cannot be equally distributed between processors. In the MPI-1 73 | * implementation provided, there has been an attempt to minimize the differences 74 | * in workloads and the largest difference in elements of T[ ] is one. The 75 | * number of values in the input stream generated by each processor will be 76 | * related to the number of global table entries on each processor. 77 | * 78 | * The MPI-1 version of RandomAccess treats the potential instance where the 79 | * number of processors is a power of two as a special case, because of the 80 | * significant simplifications possible because processor location and local 81 | * offset can be determined by applying masks to the input stream values. 82 | * The non power of two case uses an integer division to determine the processor 83 | * location. 
The integer division will be more costly in terms of machine 84 | * cycles to perform than the bit masking operations 85 | * 86 | * For additional information on the GUPS metric, the HPCchallenge RandomAccess 87 | * Benchmark,and the rules to run RandomAccess or modify it to optimize 88 | * performance -- see http://icl.cs.utk.edu/hpcc/ 89 | * 90 | */ 91 | 92 | /* Jan 2005 93 | * 94 | * This code has been modified to allow local bucket sorting of updates. 95 | * The total maximum number of updates in the local buckets of a process 96 | * is currently defined in "RandomAccess.h" as MAX_TOTAL_PENDING_UPDATES. 97 | * When the total maximum number of updates is reached, the process selects 98 | * the bucket (or destination process) with the largest number of 99 | * updates and sends out all the updates in that bucket. See buckets.c 100 | * for details about the buckets' implementation. 101 | * 102 | * This code also supports posting multiple MPI receive descriptors (based 103 | * on a contribution by David Addison). 104 | * 105 | * In addition, this implementation provides an option for limiting 106 | * the execution time of the benchmark to a specified time bound 107 | * (see time_bound.c). The time bound is currently defined in 108 | * time_bound.h, but it should be a benchmark parameter. By default 109 | * the benchmark will execute the recommended number of updates, 110 | * that is, four times the global table size. 111 | */ 112 | 113 | #include 114 | 115 | #include "RandomAccess.h" 116 | #include "buckets.h" 117 | #include "time_bound.h" 118 | #include "verification.h" 119 | 120 | #define CHUNK (1024) 121 | #define CHUNKBIG (32768) 122 | #define RCHUNK (16384) 123 | #define PITER 8 124 | #define MAXLOGPROCS 20 125 | 126 | /* Allocate main table (in global memory) */ 127 | 128 | u64Int *HPCC_Table; 129 | u64Int LocalSendBuffer[LOCAL_BUFFER_SIZE]; 130 | u64Int LocalRecvBuffer[MAX_RECV*LOCAL_BUFFER_SIZE]; 131 | 132 | #ifndef LONG_IS_64BITS 133 | static void 134 | Sum64(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { 135 | int i, n = *len; s64Int *invec64 = (s64Int *)invec, *inoutvec64 = (s64Int *)inoutvec; 136 | for (i = n; i; i--, invec64++, inoutvec64++) *inoutvec64 += *invec64; 137 | } 138 | #endif 139 | 140 | static void 141 | AnyNodesMPIRandomAccessUpdate(u64Int logTableSize, 142 | u64Int TableSize, 143 | u64Int LocalTableSize, 144 | u64Int MinLocalTableSize, 145 | u64Int GlobalStartMyProc, 146 | u64Int Top, 147 | int logNumProcs, 148 | int NumProcs, 149 | int Remainder, 150 | int MyProc, 151 | s64Int ProcNumUpdates, 152 | MPI_Datatype INT64_DT) 153 | { 154 | int i,j; 155 | int ipartner,iterate,niterate,npartition,proclo,nlower,nupper,procmid; 156 | int ndata,nkeep,nsend,nrecv,index, nfrac; 157 | u64Int ran,datum,nglobalm1,indexmid; 158 | u64Int *data,*send, *offsets; 159 | MPI_Status status; 160 | 161 | /* setup: should not really be part of this timed routine 162 | NOTE: niterate must be computed from global TableSize * 4 163 | not from ProcNumUpdates since that can be different on each proc 164 | round niterate up by 1 to do slightly more than required updates */ 165 | 166 | data = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 167 | send = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 168 | 169 | for (i = 0; i < LocalTableSize; i++) 170 | HPCC_Table[i] = i + GlobalStartMyProc; 171 | 172 | ran = HPCC_starts(4*GlobalStartMyProc); 173 | 174 | offsets = (u64Int *) malloc((NumProcs+1)*sizeof(u64Int)); 175 | MPI_Allgather(&GlobalStartMyProc,1,INT64_DT,offsets,1,INT64_DT, 176 | 
MPI_COMM_WORLD); 177 | offsets[NumProcs] = TableSize; 178 | 179 | niterate = 4 * TableSize / NumProcs / CHUNK + 1; 180 | nglobalm1 = TableSize - 1; 181 | 182 | /* actual update loop: this is only section that should be timed */ 183 | 184 | for (iterate = 0; iterate < niterate; iterate++) { 185 | for (i = 0; i < CHUNK; i++) { 186 | ran = (ran << 1) ^ ((s64Int) ran < ZERO64B ? POLY : ZERO64B); 187 | data[i] = ran; 188 | } 189 | ndata = CHUNK; 190 | 191 | npartition = NumProcs; 192 | proclo = 0; 193 | while (npartition > 1) { 194 | nlower = npartition/2; 195 | nupper = npartition - nlower; 196 | procmid = proclo + nlower; 197 | indexmid = offsets[procmid]; 198 | 199 | nkeep = nsend = 0; 200 | if (MyProc < procmid) { 201 | for (i = 0; i < ndata; i++) { 202 | if ((data[i] & nglobalm1) >= indexmid) send[nsend++] = data[i]; 203 | else data[nkeep++] = data[i]; 204 | } 205 | } else { 206 | for (i = 0; i < ndata; i++) { 207 | if ((data[i] & nglobalm1) < indexmid) send[nsend++] = data[i]; 208 | else data[nkeep++] = data[i]; 209 | } 210 | } 211 | 212 | if (nlower == nupper) { 213 | if (MyProc < procmid) ipartner = MyProc + nlower; 214 | else ipartner = MyProc - nlower; 215 | MPI_Sendrecv(send,nsend,INT64_DT,ipartner,0,&data[nkeep], 216 | CHUNKBIG,INT64_DT,ipartner,0,MPI_COMM_WORLD,&status); 217 | MPI_Get_count(&status,INT64_DT,&nrecv); 218 | ndata = nkeep + nrecv; 219 | } else { 220 | if (MyProc < procmid) { 221 | nfrac = (nlower - (MyProc-proclo)) * nsend / nupper; 222 | ipartner = MyProc + nlower; 223 | MPI_Sendrecv(send,nfrac,INT64_DT,ipartner,0,&data[nkeep], 224 | CHUNKBIG,INT64_DT,ipartner,0,MPI_COMM_WORLD,&status); 225 | MPI_Get_count(&status,INT64_DT,&nrecv); 226 | nkeep += nrecv; 227 | MPI_Sendrecv(&send[nfrac],nsend-nfrac,INT64_DT,ipartner+1,0, 228 | &data[nkeep],CHUNKBIG,INT64_DT, 229 | ipartner+1,0,MPI_COMM_WORLD,&status); 230 | MPI_Get_count(&status,INT64_DT,&nrecv); 231 | ndata = nkeep + nrecv; 232 | } else if (MyProc > procmid && MyProc < procmid+nlower) { 233 | nfrac = (MyProc - procmid) * nsend / nlower; 234 | ipartner = MyProc - nlower; 235 | MPI_Sendrecv(&send[nfrac],nsend-nfrac,INT64_DT,ipartner,0, 236 | &data[nkeep],CHUNKBIG,INT64_DT, 237 | ipartner,0,MPI_COMM_WORLD,&status); 238 | MPI_Get_count(&status,INT64_DT,&nrecv); 239 | nkeep += nrecv; 240 | MPI_Sendrecv(send,nfrac,INT64_DT,ipartner-1,0,&data[nkeep], 241 | CHUNKBIG,INT64_DT,ipartner-1,0,MPI_COMM_WORLD,&status); 242 | MPI_Get_count(&status,INT64_DT,&nrecv); 243 | ndata = nkeep + nrecv; 244 | } else { 245 | if (MyProc == procmid) ipartner = MyProc - nlower; 246 | else ipartner = MyProc - nupper; 247 | MPI_Sendrecv(send,nsend,INT64_DT,ipartner,0,&data[nkeep], 248 | CHUNKBIG,INT64_DT,ipartner,0,MPI_COMM_WORLD,&status); 249 | MPI_Get_count(&status,INT64_DT,&nrecv); 250 | ndata = nkeep + nrecv; 251 | } 252 | } 253 | 254 | if (MyProc < procmid) npartition = nlower; 255 | else { 256 | proclo = procmid; 257 | npartition = nupper; 258 | } 259 | } 260 | 261 | for (i = 0; i < ndata; i++) { 262 | datum = data[i]; 263 | index = (datum & nglobalm1) - GlobalStartMyProc; 264 | HPCC_Table[index] ^= datum; 265 | } 266 | } 267 | 268 | /* clean up: should not really be part of this timed routine */ 269 | 270 | free(data); 271 | free(send); 272 | free(offsets); 273 | } 274 | 275 | /* This sort is manually unrolled to make sure the compiler can see 276 | * the parallelism -KDU 277 | */ 278 | 279 | void sort_data(u64Int *source, u64Int *nomatch, u64Int *match, int number, 280 | int *nnomatch, int *nmatch, int mask_shift) 281 | { 282 | int 
i,dindex,myselect[8],counts[2]; 283 | int div_num = number / 8; 284 | int loop_total = div_num * 8; 285 | u64Int procmask = ((u64Int) 1) << mask_shift; 286 | u64Int *buffers[2]; 287 | 288 | buffers[0] = nomatch; 289 | counts[0] = *nnomatch; 290 | buffers[1] = match; 291 | counts[1] = *nmatch; 292 | 293 | for (i = 0; i < div_num; i++) { 294 | dindex = i*8; 295 | myselect[0] = (source[dindex] & procmask) >> mask_shift; 296 | myselect[1] = (source[dindex+1] & procmask) >> mask_shift; 297 | myselect[2] = (source[dindex+2] & procmask) >> mask_shift; 298 | myselect[3] = (source[dindex+3] & procmask) >> mask_shift; 299 | myselect[4] = (source[dindex+4] & procmask) >> mask_shift; 300 | myselect[5] = (source[dindex+5] & procmask) >> mask_shift; 301 | myselect[6] = (source[dindex+6] & procmask) >> mask_shift; 302 | myselect[7] = (source[dindex+7] & procmask) >> mask_shift; 303 | buffers[myselect[0]][counts[myselect[0]]++] = source[dindex]; 304 | buffers[myselect[1]][counts[myselect[1]]++] = source[dindex+1]; 305 | buffers[myselect[2]][counts[myselect[2]]++] = source[dindex+2]; 306 | buffers[myselect[3]][counts[myselect[3]]++] = source[dindex+3]; 307 | buffers[myselect[4]][counts[myselect[4]]++] = source[dindex+4]; 308 | buffers[myselect[5]][counts[myselect[5]]++] = source[dindex+5]; 309 | buffers[myselect[6]][counts[myselect[6]]++] = source[dindex+6]; 310 | buffers[myselect[7]][counts[myselect[7]]++] = source[dindex+7]; 311 | } 312 | 313 | for (i = loop_total; i < number; i++) { 314 | u64Int mydata = source[i]; 315 | if (mydata & procmask) buffers[1][counts[1]++] = mydata; 316 | else buffers[0][counts[0]++] = mydata; 317 | } 318 | 319 | *nnomatch = counts[0]; 320 | *nmatch = counts[1]; 321 | } 322 | 323 | /* Manual unrolling is a significant win if -Msafeptr is used -KDU */ 324 | 325 | inline update_table(u64Int *data, u64Int *table, int number, int nlocalm1) 326 | { 327 | int i,dindex,index; 328 | int div_num = number / 8; 329 | int loop_total = div_num * 8; 330 | u64Int index0,index1,index2,index3,index4,index5,index6,index7; 331 | u64Int ltable0,ltable1,ltable2,ltable3,ltable4,ltable5,ltable6,ltable7; 332 | 333 | for (i = 0; i < div_num; i++) { 334 | dindex = i*8; 335 | 336 | index0 = data[dindex] & nlocalm1; 337 | index1 = data[dindex+1] & nlocalm1; 338 | index2 = data[dindex+2] & nlocalm1; 339 | index3 = data[dindex+3] & nlocalm1; 340 | index4 = data[dindex+4] & nlocalm1; 341 | index5 = data[dindex+5] & nlocalm1; 342 | index6 = data[dindex+6] & nlocalm1; 343 | index7 = data[dindex+7] & nlocalm1; 344 | ltable0 = table[index0]; 345 | ltable1 = table[index1]; 346 | ltable2 = table[index2]; 347 | ltable3 = table[index3]; 348 | ltable4 = table[index4]; 349 | ltable5 = table[index5]; 350 | ltable6 = table[index6]; 351 | ltable7 = table[index7]; 352 | 353 | table[index0] = ltable0 ^ data[dindex]; 354 | table[index1] = ltable1 ^ data[dindex+1]; 355 | table[index2] = ltable2 ^ data[dindex+2]; 356 | table[index3] = ltable3 ^ data[dindex+3]; 357 | table[index4] = ltable4 ^ data[dindex+4]; 358 | table[index5] = ltable5 ^ data[dindex+5]; 359 | table[index6] = ltable6 ^ data[dindex+6]; 360 | table[index7] = ltable7 ^ data[dindex+7]; 361 | } 362 | 363 | for (i = loop_total; i < number; i++) { 364 | u64Int datum = data[i]; 365 | index = datum & nlocalm1; 366 | table[index] ^= datum; 367 | } 368 | } 369 | 370 | static void 371 | Power2NodesMPIRandomAccessUpdate(u64Int logTableSize, 372 | u64Int TableSize, 373 | u64Int LocalTableSize, 374 | u64Int MinLocalTableSize, 375 | u64Int GlobalStartMyProc, 376 | u64Int 
Top, 377 | int logNumProcs, 378 | int NumProcs, 379 | int Remainder, 380 | int MyProc, 381 | s64Int ProcNumUpdates, 382 | MPI_Datatype INT64_DT) 383 | { 384 | int i,j,k; 385 | int logTableLocal,ipartner,iterate,niterate,iter_mod; 386 | int ndata,nkeep,nsend,nrecv,nlocalm1, nkept; 387 | u64Int ran,datum,procmask; 388 | u64Int *data,*send,*send1,*send2; 389 | u64Int *recv[PITER][MAXLOGPROCS]; 390 | MPI_Status status; 391 | MPI_Request request[PITER][MAXLOGPROCS]; 392 | MPI_Request srequest; 393 | 394 | /* setup: should not really be part of this timed routine */ 395 | 396 | data = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 397 | send1 = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 398 | send2 = (u64Int *) malloc(CHUNKBIG*sizeof(u64Int)); 399 | send = send1; 400 | 401 | for (j = 0; j < PITER; j++) 402 | for (i = 0; i < logNumProcs; i++) 403 | recv[j][i] = (u64Int *) malloc(sizeof(u64Int)*RCHUNK); 404 | 405 | for (i = 0; i < LocalTableSize; i++) 406 | HPCC_Table[i] = i + GlobalStartMyProc; 407 | 408 | ran = HPCC_starts(4*GlobalStartMyProc); 409 | 410 | niterate = ProcNumUpdates / CHUNK; 411 | logTableLocal = logTableSize - logNumProcs; 412 | nlocalm1 = LocalTableSize - 1; 413 | 414 | /* actual update loop: this is only section that should be timed */ 415 | 416 | for (iterate = 0; iterate < niterate; iterate++) { 417 | iter_mod = iterate % PITER; 418 | for (i = 0; i < CHUNK; i++) { 419 | ran = (ran << 1) ^ ((s64Int) ran < ZERO64B ? POLY : ZERO64B); 420 | data[i] = ran; 421 | } 422 | nkept = CHUNK; 423 | nrecv = 0; 424 | 425 | if (iter_mod == 0) 426 | for (k = 0; k < PITER; k++) 427 | for (j = 0; j < logNumProcs; j++) { 428 | ipartner = (1 << j) ^ MyProc; 429 | MPI_Irecv(recv[k][j],RCHUNK,INT64_DT,ipartner,0,MPI_COMM_WORLD, 430 | &request[k][j]); 431 | } 432 | 433 | for (j = 0; j < logNumProcs; j++) { 434 | nkeep = nsend = 0; 435 | send = (send == send1) ? 
send2 : send1; 436 | ipartner = (1 << j) ^ MyProc; 437 | procmask = ((u64Int) 1) << (logTableLocal + j); 438 | if (ipartner > MyProc) { 439 | sort_data(data,data,send,nkept,&nkeep,&nsend,logTableLocal+j); 440 | if (j > 0) { 441 | MPI_Wait(&request[iter_mod][j-1],&status); 442 | MPI_Get_count(&status,INT64_DT,&nrecv); 443 | sort_data(recv[iter_mod][j-1],data,send,nrecv,&nkeep, 444 | &nsend,logTableLocal+j); 445 | } 446 | } else { 447 | sort_data(data,send,data,nkept,&nsend,&nkeep,logTableLocal+j); 448 | if (j > 0) { 449 | MPI_Wait(&request[iter_mod][j-1],&status); 450 | MPI_Get_count(&status,INT64_DT,&nrecv); 451 | sort_data(recv[iter_mod][j-1],send,data,nrecv,&nsend, 452 | &nkeep,logTableLocal+j); 453 | } 454 | } 455 | if (j > 0) MPI_Wait(&srequest,&status); 456 | MPI_Isend(send,nsend,INT64_DT,ipartner,0,MPI_COMM_WORLD,&srequest); 457 | if (j == (logNumProcs - 1)) update_table(data,HPCC_Table,nkeep,nlocalm1); 458 | nkept = nkeep; 459 | } 460 | 461 | if (logNumProcs == 0) update_table(data,HPCC_Table,nkept,nlocalm1); 462 | else { 463 | MPI_Wait(&request[iter_mod][j-1],&status); 464 | MPI_Get_count(&status,INT64_DT,&nrecv); 465 | update_table(recv[iter_mod][j-1],HPCC_Table,nrecv,nlocalm1); 466 | MPI_Wait(&srequest,&status); 467 | } 468 | 469 | ndata = nkept + nrecv; 470 | } 471 | 472 | /* clean up: should not really be part of this timed routine */ 473 | 474 | for (j = 0; j < PITER; j++) 475 | for (i = 0; i < logNumProcs; i++) free(recv[j][i]); 476 | 477 | free(data); 478 | free(send1); 479 | free(send2); 480 | } 481 | 482 | int 483 | HPCC_MPIRandomAccess(HPCC_Params *params) { 484 | s64Int i; 485 | s64Int NumErrors, GlbNumErrors; 486 | 487 | int NumProcs, logNumProcs, MyProc; 488 | u64Int GlobalStartMyProc; 489 | int Remainder; /* Number of processors with (LocalTableSize + 1) entries */ 490 | u64Int Top; /* Number of table entries in top of Table */ 491 | u64Int LocalTableSize; /* Local table width */ 492 | u64Int MinLocalTableSize; /* Integer ratio TableSize/NumProcs */ 493 | u64Int logTableSize, TableSize; 494 | 495 | double CPUTime; /* CPU time to update table */ 496 | double RealTime; /* Real time to update table */ 497 | 498 | double TotalMem; 499 | int sAbort, rAbort; 500 | int PowerofTwo; 501 | 502 | double timeBound; /* OPTIONAL time bound for execution time */ 503 | u64Int NumUpdates_Default; /* Number of updates to table (suggested: 4x number of table entries) */ 504 | u64Int NumUpdates; /* actual number of updates to table - may be smaller than 505 | * NumUpdates_Default due to execution time bounds */ 506 | s64Int ProcNumUpdates; /* number of updates per processor */ 507 | s64Int GlbNumUpdates; /* for reduction */ 508 | 509 | FILE *outFile = NULL; 510 | MPI_Op sum64; 511 | double *GUPs; 512 | 513 | MPI_Datatype INT64_DT; 514 | 515 | #ifdef LONG_IS_64BITS 516 | INT64_DT = MPI_LONG; 517 | #else 518 | INT64_DT = MPI_LONG_LONG_INT; 519 | #endif 520 | 521 | GUPs = &params->MPIGUPs; 522 | 523 | MPI_Comm_size( MPI_COMM_WORLD, &NumProcs ); 524 | MPI_Comm_rank( MPI_COMM_WORLD, &MyProc ); 525 | 526 | if (0 == MyProc) { 527 | outFile = fopen( params->outFname, "a" ); 528 | if (!
outFile) outFile = stderr; 529 | } 530 | 531 | TotalMem = params->HPLMaxProcMem; /* max single node memory */ 532 | TotalMem *= NumProcs; /* max memory in NumProcs nodes */ 533 | TotalMem /= sizeof(u64Int); 534 | 535 | /* calculate TableSize --- the size of update array (must be a power of 2) */ 536 | for (TotalMem *= 0.5, logTableSize = 0, TableSize = 1; 537 | TotalMem >= 1.0; 538 | TotalMem *= 0.5, logTableSize++, TableSize <<= 1) 539 | ; /* EMPTY */ 540 | 541 | 542 | /* determine whether the number of processors is a power of 2 */ 543 | for (i = 1, logNumProcs = 0; ; logNumProcs++, i <<= 1) { 544 | if (i == NumProcs) { 545 | PowerofTwo = HPCC_TRUE; 546 | Remainder = 0; 547 | Top = 0; 548 | MinLocalTableSize = (TableSize / NumProcs); 549 | LocalTableSize = MinLocalTableSize; 550 | GlobalStartMyProc = (MinLocalTableSize * MyProc); 551 | break; 552 | 553 | /* number of processes is not a power 2 (too many shifts may introduce negative values or 0) */ 554 | 555 | } 556 | else if (i > NumProcs || i <= 0) { 557 | PowerofTwo = HPCC_FALSE; 558 | /* Minimum local table size --- some processors have an additional entry */ 559 | MinLocalTableSize = (TableSize / NumProcs); 560 | /* Number of processors with (LocalTableSize + 1) entries */ 561 | Remainder = TableSize - (MinLocalTableSize * NumProcs); 562 | /* Number of table entries in top of Table */ 563 | Top = (MinLocalTableSize + 1) * Remainder; 564 | /* Local table size */ 565 | if (MyProc < Remainder) { 566 | LocalTableSize = (MinLocalTableSize + 1); 567 | GlobalStartMyProc = ( (MinLocalTableSize + 1) * MyProc); 568 | } 569 | else { 570 | LocalTableSize = MinLocalTableSize; 571 | GlobalStartMyProc = ( (MinLocalTableSize * MyProc) + Remainder ); 572 | } 573 | break; 574 | 575 | } /* end else if */ 576 | } /* end for i */ 577 | 578 | 579 | HPCC_Table = XMALLOC( u64Int, LocalTableSize); 580 | sAbort = 0; if (! 
HPCC_Table) sAbort = 1; 581 | 582 | MPI_Allreduce( &sAbort, &rAbort, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); 583 | if (rAbort > 0) { 584 | if (MyProc == 0) fprintf(outFile, "Failed to allocate memory for the main table.\n"); 585 | goto failed_table; 586 | } 587 | 588 | params->MPIRandomAccess_N = (s64Int)TableSize; 589 | 590 | /* Default number of global updates to table: 4x number of table entries */ 591 | NumUpdates_Default = 4 * TableSize; 592 | 593 | #ifdef RA_TIME_BOUND 594 | /* estimate number of updates such that execution time does not exceed time bound */ 595 | /* time_bound should be a parameter */ 596 | /* max run time in seconds */ 597 | timeBound = Mmax( 0.25 * params->HPLrdata.time, (double)TIME_BOUND ); 598 | if (PowerofTwo) { 599 | HPCC_Power2NodesTime(logTableSize, TableSize, LocalTableSize, 600 | MinLocalTableSize, GlobalStartMyProc, Top, 601 | logNumProcs, NumProcs, Remainder, 602 | MyProc, INT64_DT, timeBound, (u64Int *)&ProcNumUpdates); 603 | 604 | } else { 605 | HPCC_AnyNodesTime(logTableSize, TableSize, LocalTableSize, 606 | MinLocalTableSize, GlobalStartMyProc, Top, 607 | logNumProcs, NumProcs, Remainder, 608 | MyProc, INT64_DT, timeBound, (u64Int *)&ProcNumUpdates); 609 | } 610 | /* be conservative: get the smallest number of updates among all procs */ 611 | MPI_Reduce( &ProcNumUpdates, &GlbNumUpdates, 1, INT64_DT, 612 | MPI_MIN, 0, MPI_COMM_WORLD ); 613 | /* distribute number of updates per proc to all procs */ 614 | MPI_Bcast( &GlbNumUpdates, 1, INT64_DT, 0, MPI_COMM_WORLD ); 615 | ProcNumUpdates = Mmin(GlbNumUpdates, (4*LocalTableSize)); 616 | /* works for both PowerofTwo and AnyNodes */ 617 | NumUpdates = Mmin((ProcNumUpdates*NumProcs), NumUpdates_Default); 618 | 619 | #else 620 | ProcNumUpdates = 4*LocalTableSize; 621 | NumUpdates = NumUpdates_Default; 622 | #endif 623 | 624 | if (MyProc == 0) { 625 | fprintf( outFile, "Running on %d processors%s\n", NumProcs, PowerofTwo ? 
" (PowerofTwo)" : ""); 626 | fprintf( outFile, "Total Main table size = 2^" FSTR64 " = " FSTR64 " words\n", 627 | logTableSize, TableSize ); 628 | if (PowerofTwo) 629 | fprintf( outFile, "PE Main table size = 2^" FSTR64 " = " FSTR64 " words/PE\n", 630 | (logTableSize - logNumProcs), TableSize/NumProcs ); 631 | else 632 | fprintf( outFile, "PE Main table size = (2^" FSTR64 ")/%d = " FSTR64 " words/PE MAX\n", 633 | logTableSize, NumProcs, LocalTableSize); 634 | 635 | fprintf( outFile, "Default number of updates (RECOMMENDED) = " FSTR64 "\n", NumUpdates_Default); 636 | #ifdef RA_TIME_BOUND 637 | fprintf( outFile, "Number of updates EXECUTED = " FSTR64 " (for a TIME BOUND of %.2f secs)\n", 638 | NumUpdates, timeBound); 639 | #endif 640 | params->MPIRandomAccess_ExeUpdates = NumUpdates; 641 | params->MPIRandomAccess_TimeBound = timeBound; 642 | } 643 | 644 | MPI_Barrier( MPI_COMM_WORLD ); 645 | 646 | CPUTime = -CPUSEC(); 647 | RealTime = -RTSEC(); 648 | 649 | if (PowerofTwo) { 650 | Power2NodesMPIRandomAccessUpdate(logTableSize, TableSize, LocalTableSize, 651 | MinLocalTableSize, GlobalStartMyProc, Top, 652 | logNumProcs, NumProcs, Remainder, 653 | MyProc, ProcNumUpdates, INT64_DT); 654 | } else { 655 | AnyNodesMPIRandomAccessUpdate(logTableSize, TableSize, LocalTableSize, 656 | MinLocalTableSize, GlobalStartMyProc, Top, 657 | logNumProcs, NumProcs, Remainder, 658 | MyProc, ProcNumUpdates, INT64_DT); 659 | } 660 | 661 | 662 | MPI_Barrier( MPI_COMM_WORLD ); 663 | 664 | /* End timed section */ 665 | CPUTime += CPUSEC(); 666 | RealTime += RTSEC(); 667 | 668 | /* Print timing results */ 669 | if (MyProc == 0){ 670 | params->MPIRandomAccess_time = RealTime; 671 | *GUPs = 1e-9*NumUpdates / RealTime; 672 | fprintf( outFile, "CPU time used = %.6f seconds\n", CPUTime ); 673 | fprintf( outFile, "Real time used = %.6f seconds\n", RealTime ); 674 | fprintf( outFile, "%.9f Billion(10^9) Updates per second [GUP/s]\n", 675 | *GUPs ); 676 | fprintf( outFile, "%.9f Billion(10^9) Updates/PE per second [GUP/s]\n", 677 | *GUPs / NumProcs ); 678 | /* No longer reporting per CPU number */ 679 | /* *GUPs /= NumProcs; */ 680 | } 681 | /* distribute result to all nodes */ 682 | MPI_Bcast( GUPs, 1, MPI_INT, 0, MPI_COMM_WORLD ); 683 | 684 | 685 | /* Verification phase */ 686 | 687 | /* Begin timing here */ 688 | CPUTime = -CPUSEC(); 689 | RealTime = -RTSEC(); 690 | 691 | if (PowerofTwo) { 692 | HPCC_Power2NodesMPIRandomAccessCheck(logTableSize, TableSize, LocalTableSize, 693 | GlobalStartMyProc, 694 | logNumProcs, NumProcs, 695 | MyProc, ProcNumUpdates, 696 | INT64_DT, &NumErrors); 697 | } 698 | else { 699 | HPCC_AnyNodesMPIRandomAccessCheck(logTableSize, TableSize, LocalTableSize, 700 | MinLocalTableSize, GlobalStartMyProc, Top, 701 | logNumProcs, NumProcs, Remainder, 702 | MyProc, ProcNumUpdates, 703 | INT64_DT, &NumErrors); 704 | } 705 | 706 | 707 | #ifdef LONG_IS_64BITS 708 | MPI_Reduce( &NumErrors, &GlbNumErrors, 1, MPI_LONG, MPI_SUM, 0, MPI_COMM_WORLD ); 709 | #else 710 | /* MPI 1.1 standard (obsolete at this point) doesn't define MPI_SUM 711 | to work on `long long': 712 | http://www.mpi-forum.org/docs/mpi-11-html/node78.html and 713 | therefore LAM 6.5.6 chooses not to implement it (even though there 714 | is code for it in LAM and for other reductions work OK, 715 | e.g. MPI_MAX). 
MPICH 1.2.5 doesn't complain about MPI_SUM but it 716 | doesn't have MPI_UNSIGNED_LONG_LONG (but has MPI_LONG_LONG_INT): 717 | http://www.mpi-forum.org/docs/mpi-20-html/node84.htm So I need to 718 | create a trivial summation operation. */ 719 | MPI_Op_create( Sum64, 1, &sum64 ); 720 | MPI_Reduce( &NumErrors, &GlbNumErrors, 1, INT64_DT, sum64, 0, MPI_COMM_WORLD ); 721 | MPI_Op_free( &sum64 ); 722 | #endif 723 | 724 | /* End timed section */ 725 | CPUTime += CPUSEC(); 726 | RealTime += RTSEC(); 727 | 728 | if(MyProc == 0){ 729 | params->MPIRandomAccess_CheckTime = RealTime; 730 | fprintf( outFile, "Verification: CPU time used = %.6f seconds\n", CPUTime); 731 | fprintf( outFile, "Verification: Real time used = %.6f seconds\n", RealTime); 732 | fprintf( outFile, "Found " FSTR64 " errors in " FSTR64 " locations (%s).\n", 733 | GlbNumErrors, TableSize, (GlbNumErrors <= 0.01*TableSize) ? 734 | "passed" : "failed"); 735 | if (GlbNumErrors > 0.01*TableSize) params->Failure = 1; 736 | params->MPIRandomAccess_Errors = (s64Int)GlbNumErrors; 737 | params->MPIRandomAccess_ErrorsFraction = (double)GlbNumErrors / (double)TableSize; 738 | } 739 | /* End verification phase */ 740 | 741 | 742 | /* Deallocate memory (in reverse order of allocation which should 743 | help fragmentation) */ 744 | 745 | free( HPCC_Table ); 746 | 747 | failed_table: 748 | 749 | if (0 == MyProc) if (outFile != stderr) fclose( outFile ); 750 | 751 | MPI_Barrier( MPI_COMM_WORLD ); 752 | 753 | return 0; 754 | } 755 | --------------------------------------------------------------------------------
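The update rule shared by all of the codes above can be summarized in a few lines of serial C. The sketch below is an illustration only and is not part of the distribution: it assumes POLY = 0x7 and ZERO64B = 0 as defined in RandomAccess.h, picks an arbitrary table size of 2^20 words, and starts the random stream at 1 instead of calling HPCC_starts(). Each 64-bit value from the shift-and-XOR recurrence is XORed into the table word selected by its low-order bits; the MPI versions above do exactly this, but first route each value to the processor that owns the addressed table word (bit-mask routing for a power-of-2 processor count, recursive halving otherwise). Because XOR is its own inverse, re-applying the identical stream restores the table to its initial contents, which is what the verification routines referenced above check.

/* Minimal serial sketch of the RandomAccess update rule used by the codes
   above.  NOT part of the HPCC distribution.  POLY and ZERO64B are assumed
   to match RandomAccess.h; the stream is started at 1 for simplicity
   instead of calling HPCC_starts(). */

#include <stdio.h>
#include <stdint.h>

#define POLY    0x0000000000000007ULL   /* assumed value from RandomAccess.h */
#define ZERO64B 0ULL

#define LOG_TABLE_SIZE 20
#define TABLE_SIZE (1ULL << LOG_TABLE_SIZE)

static uint64_t Table[TABLE_SIZE];

int main(void)
{
  uint64_t i, ran = 1;
  uint64_t NumUpdates = 4 * TABLE_SIZE;      /* recommended: 4x table entries */

  for (i = 0; i < TABLE_SIZE; i++)           /* same initialization as HPCC_Table */
    Table[i] = i;

  for (i = 0; i < NumUpdates; i++) {
    /* shift-and-XOR recurrence, same form as in the update loops above */
    ran = (ran << 1) ^ ((int64_t) ran < (int64_t) ZERO64B ? POLY : ZERO64B);
    /* read-modify-write: XOR the random value into the word it indexes */
    Table[ran & (TABLE_SIZE - 1)] ^= ran;
  }

  printf("performed %llu updates on a table of %llu words\n",
         (unsigned long long) NumUpdates, (unsigned long long) TABLE_SIZE);
  return 0;
}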