├── CMakeLists.txt
├── Makefile
├── README
├── TODO
├── lulesh-comm.cc
├── lulesh-init.cc
├── lulesh-util.cc
├── lulesh-viz.cc
├── lulesh.cc
├── lulesh.h
└── lulesh_tuple.h


/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # The <min>...<max> form keeps CMake 3.0 as the minimum while requesting
 2 | # policies up to 3.10, so newer CMake releases that reject policy versions
 3 | # older than 3.5 still accept this project.
 4 | cmake_minimum_required(VERSION 3.0...3.10)
 5 | 
 6 | project(LULESH CXX)
 7 | 
 8 | # Build-time feature switches (set with -D<NAME>=ON|OFF).
 9 | option(WITH_MPI    "Build LULESH with MPI"          TRUE)
10 | option(WITH_OPENMP "Build LULESH with OpenMP"       TRUE)
11 | option(WITH_SILO   "Build LULESH with silo support" FALSE)
12 | 
13 | set(LULESH_SOURCES
14 |   lulesh-comm.cc
15 |   lulesh-init.cc
16 |   lulesh-util.cc
17 |   lulesh-viz.cc
18 |   lulesh.cc)
19 | 
20 | set(LULESH_EXEC lulesh2.0)
21 | 
22 | # Declare the target first so each feature below attaches its include
23 | # paths, definitions and libraries to it rather than to the directory.
24 | add_executable(${LULESH_EXEC} ${LULESH_SOURCES})
25 | 
26 | if (WITH_MPI)
27 |   find_package(MPI REQUIRED)
28 |   target_include_directories(${LULESH_EXEC} PRIVATE
29 |     ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH})
30 |   target_compile_definitions(${LULESH_EXEC} PRIVATE USE_MPI=1)
31 |   target_link_libraries(${LULESH_EXEC} PRIVATE
32 |     ${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES})
33 | else()
34 |   # The sources test USE_MPI with #if, so it must always be defined.
35 |   target_compile_definitions(${LULESH_EXEC} PRIVATE USE_MPI=0)
36 | endif()
37 | 
38 | if (WITH_OPENMP)
39 |   find_package(OpenMP REQUIRED)
40 |   # Kept in CMAKE_CXX_FLAGS on purpose: the OpenMP flag is needed for both
41 |   # the compile and the link step, and this variable is passed to both.
42 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
43 | endif()
44 | 
45 | if (WITH_SILO)
46 |   # silo ships no CMake package, so locate the header and library by hand.
47 |   # SILO_DIR (optional) is used as a search hint.
48 |   find_path(SILO_INCLUDE_DIR silo.h
49 |     HINTS ${SILO_DIR}/include)
50 |   find_library(SILO_LIBRARY
51 |     NAMES siloh5
52 |     HINTS ${SILO_DIR}/lib)
53 | 
54 |   include(FindPackageHandleStandardArgs)
55 |   find_package_handle_standard_args(SILO DEFAULT_MSG
56 |     SILO_LIBRARY
57 |     SILO_INCLUDE_DIR)
58 | 
59 |   if (SILO_FOUND)
60 |     target_compile_definitions(${LULESH_EXEC} PRIVATE VIZ_MESH)
61 |     target_include_directories(${LULESH_EXEC} PRIVATE ${SILO_INCLUDE_DIR})
62 |     #   Note: silo needs to be built as a dynamic lib, otherwise
63 |     # there are additional dependencies (hdf5) which we don't know.
64 |     # This would be fixed by silo providing a CMake package.
65 |     target_link_libraries(${LULESH_EXEC} PRIVATE ${SILO_LIBRARY})
66 |   else()
67 |     # Make the downgrade visible instead of silently dropping VIZ_MESH.
68 |     message(WARNING
69 |       "WITH_SILO is ON but silo was not found (set SILO_DIR); "
70 |       "building without VIZ_MESH visualization support.")
71 |   endif()
72 | endif()
73 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | #default build suggestion of MPI + OPENMP with gcc on Livermore machines you might have to change the compiler name
 2 | 
 3 | SHELL = /bin/sh
 4 | .SUFFIXES: .cc .o
 5 | 
 6 | LULESH_EXEC = lulesh2.0
 7 | 
 8 | MPI_INC = /opt/local/include/openmpi
 9 | MPI_LIB = /opt/local/lib
10 | 
11 | SERCXX = g++ -DUSE_MPI=0
12 | MPICXX = mpig++ -DUSE_MPI=1
13 | CXX = $(MPICXX)
14 | 
15 | SOURCES2.0 = \
16 | 	lulesh.cc \
17 | 	lulesh-comm.cc \
18 | 	lulesh-viz.cc \
19 | 	lulesh-util.cc \
20 | 	lulesh-init.cc
21 | OBJECTS2.0 = $(SOURCES2.0:.cc=.o)
22 | 
23 | #Default build suggestions with OpenMP for g++
24 | CXXFLAGS = -g -O3 -fopenmp -I. -Wall
25 | LDFLAGS = -g -O3 -fopenmp
26 | 
27 | #Below are reasonable default flags for a serial build
28 | #CXXFLAGS = -g -O3 -I. -Wall
29 | #LDFLAGS = -g -O3 
30 | 
31 | #common places you might find silo on the Livermore machines.
32 | #SILO_INCDIR = /opt/local/include
33 | #SILO_LIBDIR = /opt/local/lib
34 | #SILO_INCDIR = ./silo/4.9/1.8.10.1/include
35 | #SILO_LIBDIR = ./silo/4.9/1.8.10.1/lib
36 | 
37 | #If you do not have silo and visit you can get them at:
38 | #silo:  https://wci.llnl.gov/codes/silo/downloads.html
39 | #visit: https://wci.llnl.gov/codes/visit/download.html
40 | 
41 | #below is an example of how to make with silo, hdf5 to get visualization; by default all this is turned off.  All paths are Livermore specific.
42 | #CXXFLAGS = -g -DVIZ_MESH -I${SILO_INCDIR} -Wall -Wno-pragmas
43 | #LDFLAGS = -g -L${SILO_LIBDIR} -Wl,-rpath -Wl,${SILO_LIBDIR} -lsiloh5 -lhdf5
44 | 
45 | #these targets name actions, not files
46 | .PHONY: all clean tar
47 | 
48 | #'all' is the first ordinary target, so it is the default goal
49 | all: $(LULESH_EXEC)
50 | 
51 | #a suffix rule must not list prerequisites (make would then treat ".cc.o"
52 | #as an ordinary target), so the header dependency is declared separately
53 | .cc.o:
54 | 	@echo "Building $<"
55 | 	$(CXX) -c $(CXXFLAGS) -o $@  $<
56 | 
57 | #rebuild every object when lulesh.h changes
58 | $(OBJECTS2.0): lulesh.h
59 | 
60 | $(LULESH_EXEC): $(OBJECTS2.0)
61 | 	@echo "Linking"
62 | 	$(CXX) $(OBJECTS2.0) $(LDFLAGS) -lm -o $@
63 | 
64 | clean:
65 | 	/bin/rm -f *.o *~ $(OBJECTS2.0) $(LULESH_EXEC)
66 | 	/bin/rm -rf *.dSYM
67 | 
68 | tar: clean
69 | 	cd .. ; tar cvf lulesh-2.0.tar LULESH-2.0 ; mv lulesh-2.0.tar LULESH-2.0
70 | 
71 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | This is the README for LULESH 2.0
 2 | 
 3 | More information including LULESH 1.0 can be found at https://codesign.llnl.gov/lulesh.php
 4 | 
 5 | If you have any questions or problems please contact:
 6 | 
 7 | Ian Karlin <karlin1@llnl.gov> or
 8 | Rob Neely <neely4@llnl.gov>
 9 | 
10 | Also please send any notable results to Ian Karlin <karlin1@llnl.gov> as we are still evaluating the performance of this code.
11 | 
12 | A Makefile and a CMake build system are provided.
13 | 
14 | *** Building with CMake ***
15 | 
16 | Create a build directory and run cmake. Example:
17 | 
18 |   $ mkdir build; cd build; cmake -DCMAKE_BUILD_TYPE=Release -DMPI_CXX_COMPILER=`which mpicxx` ..
19 | 
20 | CMake variables:
21 | 
22 |   CMAKE_BUILD_TYPE      "Debug", "Release", or "RelWithDebInfo"
23 | 
24 |   CMAKE_CXX_COMPILER    Path to the C++ compiler
25 |   MPI_CXX_COMPILER      Path to the MPI C++ compiler
26 | 
27 |   WITH_MPI=On|Off       Build with MPI (Default: On)
28 |   WITH_OPENMP=On|Off    Build with OpenMP support (Default: On)
29 |   WITH_SILO=On|Off      Build with support for SILO. (Default: Off).
30 |   
31 |   SILO_DIR              Path to SILO library (only needed when WITH_SILO is "On")
32 | 
33 | *** Notable changes in LULESH 2.0 ***
34 | 
35 | Split functionality into different files
36 | lulesh.cc - where most (all?) of the timed functionality lies
37 | lulesh-comm.cc - MPI functionality
38 | lulesh-init.cc - Setup code
39 | lulesh-viz.cc  - Support for visualization option
40 | lulesh-util.cc - Non-timed functions
41 | 
42 | The concept of "regions" was added, although every region is the same ideal gas material, and the same sedov blast wave problem is still the only problem it is hardcoded to solve. Regions allow two things important to making this proxy app more representative:
43 | 
44 | Four of the LULESH routines are now performed on a region-by-region basis, making the memory access patterns non-unit stride
45 | 
46 | Artificial load imbalances can be easily introduced that could impact parallelization strategies.  
47 |    * The load balance flag changes region assignment.  Region number is raised to the power entered for assignment probability.  The most likely region changes with MPI process id.
48 |    * The cost flag raises the cost of ~45% of the regions to evaluate EOS by the entered multiple.  The cost of 5% is 10x the entered
49 |  multiple.
50 | 
51 | MPI and OpenMP were added, and coalesced into a single version of the source that can support serial builds, MPI-only, OpenMP-only, and MPI+OpenMP
52 | 
53 | Added support to write plot files using "poor man's parallel I/O" when linked with the silo library, which in turn can be read by VisIt.
54 | 
55 | Enabled variable timestep calculation by default (courant condition), which results in an additional reduction.  Also, seeded the initial timestep based on analytical equation to allow scaling to arbitrary size.  Therefore steps to solution will differ from LULESH 1.0.
56 | 
57 | Default domain (mesh) size reduced from 45^3 to 30^3
58 | 
59 | Command line options to allow for numerous test cases without needing to recompile
60 | 
61 | Performance optimizations and code cleanup uncovered during study of LULESH 1.0
62 | 
63 | Added a "Figure of Merit" calculation (elements solved per microsecond) and output in support of using LULESH 2.0 for the 2017 CORAL procurement
64 | 
65 | *** Notable changes in LULESH 2.1 ***
66 | 
67 | Minor bug fixes.
68 | Code cleanup to add consistency to variable names, loop indexing, memory allocation/deallocation, etc.
69 | Destructor added to main class to clean up when code exits.
70 | 
71 | 
72 | Possible Future 2.0 minor updates (other changes possible as discovered)
73 | 
74 | * Different default parameters
75 | * Minor code performance changes and cleanups
76 | 
77 | TODO in future versions
78 | * Add reader for (truly) unstructured meshes, probably serial only
79 | 
80 | 
81 | 


--------------------------------------------------------------------------------
/TODO:
--------------------------------------------------------------------------------
1 | cmake build (Abhinav)
2 | 


--------------------------------------------------------------------------------
/lulesh-comm.cc:
--------------------------------------------------------------------------------
   1 | #include "lulesh.h"
   2 | 
   3 | // If no MPI, then this whole file is stubbed out
   4 | #if USE_MPI
   5 | 
   6 | #include <mpi.h>
   7 | #include <string.h>
   8 | 
   9 | /* Comm Routines */
  10 | 
  11 | #define ALLOW_UNPACKED_PLANE false
  12 | #define ALLOW_UNPACKED_ROW   false
  13 | #define ALLOW_UNPACKED_COL   false
  14 | 
  15 | /*
  16 |    There are coherence issues for packing and unpacking message
  17 |    buffers.  Ideally, you would like a lot of threads to 
  18 |    cooperate in the assembly/disassembly of each message.
  19 |    To do that, each thread should really be operating in a
  20 |    different coherence zone.
  21 | 
  22 |    Let's assume we have three fields, f1 through f3, defined on
  23 |    a 61x61x61 cube.  If we want to send the block boundary
  24 |    information for each field to each neighbor processor across
  25 |    each cube face, then we have three cases for the
  26 |    memory layout/coherence of data on each of the six cube
  27 |    boundaries:
  28 | 
  29 |       (a) Two of the faces will be in contiguous memory blocks
  30 |       (b) Two of the faces will be comprised of pencils of
  31 |           contiguous memory.
  32 |       (c) Two of the faces will have large strides between
  33 |           every value living on the face.
  34 | 
  35 |    How do you pack and unpack this data in buffers to
  36 |    simultaneously achieve the best memory efficiency and
  37 |    the most thread independence?
  38 | 
  39 |    Do you pack fields f1 through f3 tightly to reduce message
  40 |    size?  Do you align each field on a cache coherence boundary
  41 |    within the message so that threads can pack and unpack each
  42 |    field independently?  For case (b), do you align each
  43 |    boundary pencil of each field separately?  This increases
  44 |    the message size, but could improve cache coherence so
  45 |    each pencil could be processed independently by a separate
  46 |    thread with no conflicts.
  47 | 
  48 |    Also, memory access for case (c) would best be done without
  49 |    going through the cache (the stride is so large it just causes
  50 |    a lot of useless cache evictions).  Is it worth creating
  51 |    a special case version of the packing algorithm that uses
  52 |    non-coherent load/store opcodes?
  53 | */
  54 | 
  55 | /******************************************/
   56 | /* CommRecv: post non-blocking receives (MPI_Irecv) for messages tagged msgType from each existing neighbor rank into domain.commDataRecv, */
   57 | /* storing the requests in domain.recvRequest. Receive counts are xferFields times dx*dy, dx*dz or dy*dz (faces), dz, dx or dy (edges), or 1 (corners). */
   58 | /* doRecv flag only works with regular block structure */
   59 | void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields,
   60 |               Index_t dx, Index_t dy, Index_t dz, bool doRecv, bool planeOnly) {
   61 |    /* nothing is posted when there is only a single rank */
   62 |    if (domain.numRanks() == 1)
   63 |       return ;
   64 | 
   65 |    /* post receive buffers for all incoming messages */
   66 |    int myRank ;
   67 |    Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
   68 |    Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
   69 |    Index_t pmsg = 0 ; /* plane comm msg */
   70 |    Index_t emsg = 0 ; /* edge comm msg */
   71 |    Index_t cmsg = 0 ; /* corner comm msg */
   72 |    MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE) ;
   73 |    bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
   74 | 
   75 |    /* assume communication to 6 neighbors by default */
   76 |    rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
   77 | 
   78 |    if (domain.rowLoc() == 0) {
   79 |       rowMin = false ;
   80 |    }
   81 |    if (domain.rowLoc() == (domain.tp()-1)) {
   82 |       rowMax = false ;
   83 |    }
   84 |    if (domain.colLoc() == 0) {
   85 |       colMin = false ;
   86 |    }
   87 |    if (domain.colLoc() == (domain.tp()-1)) {
   88 |       colMax = false ;
   89 |    }
   90 |    if (domain.planeLoc() == 0) {
   91 |       planeMin = false ;
   92 |    }
   93 |    if (domain.planeLoc() == (domain.tp()-1)) {
   94 |       planeMax = false ;
   95 |    }
   96 |    /* mark all 26 request slots unused; slots not posted below stay MPI_REQUEST_NULL */
   97 |    for (Index_t i=0; i<26; ++i) {
   98 |       domain.recvRequest[i] = MPI_REQUEST_NULL ;
   99 |    }
  100 | 
  101 |    MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
  102 | 
  103 |    /* post receives */
  104 |    /* each face receive uses the next maxPlaneComm-sized slot of commDataRecv and request slot recvRequest[pmsg] */
  105 |    /* receive data from neighboring domain faces */
  106 |    if (planeMin && doRecv) {
  107 |       /* contiguous memory */
  108 |       int fromRank = myRank - domain.tp()*domain.tp() ;
  109 |       int recvCount = dx * dy * xferFields ;
  110 |       MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
  111 |                 recvCount, baseType, fromRank, msgType,
  112 |                 MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
  113 |       ++pmsg ;
  114 |    }
  115 |    if (planeMax) {
  116 |       /* contiguous memory */
  117 |       int fromRank = myRank + domain.tp()*domain.tp() ;
  118 |       int recvCount = dx * dy * xferFields ;
  119 |       MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
  120 |                 recvCount, baseType, fromRank, msgType,
  121 |                 MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
  122 |       ++pmsg ;
  123 |    }
  124 |    if (rowMin && doRecv) {
  125 |       /* semi-contiguous memory */
  126 |       int fromRank = myRank - domain.tp() ;
  127 |       int recvCount = dx * dz * xferFields ;
  128 |       MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
  129 |                 recvCount, baseType, fromRank, msgType,
  130 |                 MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
  131 |       ++pmsg ;
  132 |    }
  133 |    if (rowMax) {
  134 |       /* semi-contiguous memory */
  135 |       int fromRank = myRank + domain.tp() ;
  136 |       int recvCount = dx * dz * xferFields ;
  137 |       MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
  138 |                 recvCount, baseType, fromRank, msgType,
  139 |                 MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
  140 |       ++pmsg ;
  141 |    }
  142 |    if (colMin && doRecv) {
  143 |       /* scattered memory */
  144 |       int fromRank = myRank - 1 ;
  145 |       int recvCount = dy * dz * xferFields ;
  146 |       MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
  147 |                 recvCount, baseType, fromRank, msgType,
  148 |                 MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
  149 |       ++pmsg ;
  150 |    }
  151 |    if (colMax) {
  152 |       /* scattered memory */
  153 |       int fromRank = myRank + 1 ;
  154 |       int recvCount = dy * dz * xferFields ;
  155 |       MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
  156 |                 recvCount, baseType, fromRank, msgType,
  157 |                 MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
  158 |       ++pmsg ;
  159 |    }
  160 |    /* edge and corner receives are posted only when planeOnly is false */
  161 |    if (!planeOnly) {
  162 |       /* receive data from domains connected only by an edge; edge slots follow the pmsg face slots at stride maxEdgeComm */
  163 |       if (rowMin && colMin && doRecv) {
  164 |          int fromRank = myRank - domain.tp() - 1 ;
  165 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  166 |                                          emsg * maxEdgeComm],
  167 |                    dz * xferFields, baseType, fromRank, msgType,
  168 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
  169 |          ++emsg ;
  170 |       }
  171 | 
  172 |       if (rowMin && planeMin && doRecv) {
  173 |          int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() ;
  174 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  175 |                                          emsg * maxEdgeComm],
  176 |                    dx * xferFields, baseType, fromRank, msgType,
  177 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
  178 |          ++emsg ;
  179 |       }
  180 | 
  181 |       if (colMin && planeMin && doRecv) {
  182 |          int fromRank = myRank - domain.tp()*domain.tp() - 1 ;
  183 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  184 |                                          emsg * maxEdgeComm],
  185 |                    dy * xferFields, baseType, fromRank, msgType,
  186 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
  187 |          ++emsg ;
  188 |       }
  189 | 
  190 |       if (rowMax && colMax) {
  191 |          int fromRank = myRank + domain.tp() + 1 ;
  192 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  193 |                                          emsg * maxEdgeComm],
  194 |                    dz * xferFields, baseType, fromRank, msgType,
  195 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
  196 |          ++emsg ;
  197 |       }
  198 | 
  199 |       if (rowMax && planeMax) {
  200 |          int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() ;
  201 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  202 |                                          emsg * maxEdgeComm],
  203 |                    dx * xferFields, baseType, fromRank, msgType,
  204 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
  205 |          ++emsg ;
  206 |       }
  207 | 
  208 |       if (colMax && planeMax) {
  209 |          int fromRank = myRank + domain.tp()*domain.tp() + 1 ;
  210 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  211 |                                          emsg * maxEdgeComm],
  212 |                    dy * xferFields, baseType, fromRank, msgType,
  213 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
  214 |          ++emsg ;
  215 |       }
  216 | 
  217 |       if (rowMax && colMin) {
  218 |          int fromRank = myRank + domain.tp() - 1 ;
  219 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  220 |                                          emsg * maxEdgeComm],
  221 |                    dz * xferFields, baseType, fromRank, msgType,
  222 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
  223 |          ++emsg ;
  224 |       }
  225 | 
  226 |       if (rowMin && planeMax) {
  227 |          int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() ;
  228 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  229 |                                          emsg * maxEdgeComm],
  230 |                    dx * xferFields, baseType, fromRank, msgType,
  231 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
  232 |          ++emsg ;
  233 |       }
  234 | 
  235 |       if (colMin && planeMax) {
  236 |          int fromRank = myRank + domain.tp()*domain.tp() - 1 ;
  237 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  238 |                                          emsg * maxEdgeComm],
  239 |                    dy * xferFields, baseType, fromRank, msgType,
  240 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
  241 |          ++emsg ;
  242 |       }
  243 | 
  244 |       if (rowMin && colMax && doRecv) {
  245 |          int fromRank = myRank - domain.tp() + 1 ;
  246 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  247 |                                          emsg * maxEdgeComm],
  248 |                    dz * xferFields, baseType, fromRank, msgType,
  249 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
  250 |          ++emsg ;
  251 |       }
  252 | 
  253 |       if (rowMax && planeMin && doRecv) {
  254 |          int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() ;
  255 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  256 |                                          emsg * maxEdgeComm],
  257 |                    dx * xferFields, baseType, fromRank, msgType,
  258 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
  259 |          ++emsg ;
  260 |       }
  261 | 
  262 |       if (colMax && planeMin && doRecv) {
  263 |          int fromRank = myRank - domain.tp()*domain.tp() + 1 ;
  264 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  265 |                                          emsg * maxEdgeComm],
  266 |                    dy * xferFields, baseType, fromRank, msgType,
  267 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
  268 |          ++emsg ;
  269 |       }
  270 |       /* NOTE(review): corner slots are CACHE_COHERENCE_PAD_REAL apart but each receives xferFields values -- presumably xferFields <= CACHE_COHERENCE_PAD_REAL; confirm */
  271 |       /* receive data from domains connected only by a corner */
  272 |       if (rowMin && colMin && planeMin && doRecv) {
  273 |          /* corner at domain logical coord (0, 0, 0) */
  274 |          int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() - 1 ;
  275 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  276 |                                          emsg * maxEdgeComm +
  277 |                                          cmsg * CACHE_COHERENCE_PAD_REAL],
  278 |                    xferFields, baseType, fromRank, msgType,
  279 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
  280 |          ++cmsg ;
  281 |       }
  282 |       if (rowMin && colMin && planeMax) {
  283 |          /* corner at domain logical coord (0, 0, 1) */
  284 |          int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() - 1 ;
  285 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  286 |                                          emsg * maxEdgeComm +
  287 |                                          cmsg * CACHE_COHERENCE_PAD_REAL],
  288 |                    xferFields, baseType, fromRank, msgType,
  289 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
  290 |          ++cmsg ;
  291 |       }
  292 |       if (rowMin && colMax && planeMin && doRecv) {
  293 |          /* corner at domain logical coord (1, 0, 0) */
  294 |          int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() + 1 ;
  295 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  296 |                                          emsg * maxEdgeComm +
  297 |                                          cmsg * CACHE_COHERENCE_PAD_REAL],
  298 |                    xferFields, baseType, fromRank, msgType,
  299 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
  300 |          ++cmsg ;
  301 |       }
  302 |       if (rowMin && colMax && planeMax) {
  303 |          /* corner at domain logical coord (1, 0, 1) */
  304 |          int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() + 1 ;
  305 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  306 |                                          emsg * maxEdgeComm +
  307 |                                          cmsg * CACHE_COHERENCE_PAD_REAL],
  308 |                    xferFields, baseType, fromRank, msgType,
  309 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
  310 |          ++cmsg ;
  311 |       }
  312 |       if (rowMax && colMin && planeMin && doRecv) {
  313 |          /* corner at domain logical coord (0, 1, 0) */
  314 |          int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() - 1 ;
  315 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  316 |                                          emsg * maxEdgeComm +
  317 |                                          cmsg * CACHE_COHERENCE_PAD_REAL],
  318 |                    xferFields, baseType, fromRank, msgType,
  319 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
  320 |          ++cmsg ;
  321 |       }
  322 |       if (rowMax && colMin && planeMax) {
  323 |          /* corner at domain logical coord (0, 1, 1) */
  324 |          int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() - 1 ;
  325 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  326 |                                          emsg * maxEdgeComm +
  327 |                                          cmsg * CACHE_COHERENCE_PAD_REAL],
  328 |                    xferFields, baseType, fromRank, msgType,
  329 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
  330 |          ++cmsg ;
  331 |       }
  332 |       if (rowMax && colMax && planeMin && doRecv) {
  333 |          /* corner at domain logical coord (1, 1, 0) */
  334 |          int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() + 1 ;
  335 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  336 |                                          emsg * maxEdgeComm +
  337 |                                          cmsg * CACHE_COHERENCE_PAD_REAL],
  338 |                    xferFields, baseType, fromRank, msgType,
  339 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
  340 |          ++cmsg ;
  341 |       }
  342 |       if (rowMax && colMax && planeMax) {
  343 |          /* corner at domain logical coord (1, 1, 1) */
  344 |          int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() + 1 ;
  345 |          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
  346 |                                          emsg * maxEdgeComm +
  347 |                                          cmsg * CACHE_COHERENCE_PAD_REAL],
  348 |                    xferFields, baseType, fromRank, msgType,
  349 |                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
  350 |          ++cmsg ;
  351 |       }
  352 |    }
  353 | }
 354 | 
 355 | /******************************************/
 356 | 
 357 | void CommSend(Domain& domain, Int_t msgType,
 358 |               Index_t xferFields, Domain_member *fieldData,
 359 |               Index_t dx, Index_t dy, Index_t dz, bool doSend, bool planeOnly)
 360 | {
 361 | 
 362 |    if (domain.numRanks() == 1)
 363 |       return ;
 364 | 
 365 |    /* post recieve buffers for all incoming messages */
 366 |    int myRank ;
 367 |    Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
 368 |    Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
 369 |    Index_t pmsg = 0 ; /* plane comm msg */
 370 |    Index_t emsg = 0 ; /* edge comm msg */
 371 |    Index_t cmsg = 0 ; /* corner comm msg */
 372 |    MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE) ;
 373 |    MPI_Status status[26] ;
 374 |    Real_t *destAddr ;
 375 |    bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
 376 |    /* assume communication to 6 neighbors by default */
 377 |    rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
 378 |    if (domain.rowLoc() == 0) {
 379 |       rowMin = false ;
 380 |    }
 381 |    if (domain.rowLoc() == (domain.tp()-1)) {
 382 |       rowMax = false ;
 383 |    }
 384 |    if (domain.colLoc() == 0) {
 385 |       colMin = false ;
 386 |    }
 387 |    if (domain.colLoc() == (domain.tp()-1)) {
 388 |       colMax = false ;
 389 |    }
 390 |    if (domain.planeLoc() == 0) {
 391 |       planeMin = false ;
 392 |    }
 393 |    if (domain.planeLoc() == (domain.tp()-1)) {
 394 |       planeMax = false ;
 395 |    }
 396 | 
 397 |    for (Index_t i=0; i<26; ++i) {
 398 |       domain.sendRequest[i] = MPI_REQUEST_NULL ;
 399 |    }
 400 | 
 401 |    MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
 402 | 
 403 |    /* post sends */
 404 | 
 405 |    if (planeMin | planeMax) {
 406 |       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
 407 |       int sendCount = dx * dy ;
 408 | 
 409 |       if (planeMin) {
 410 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
 411 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
 412 |             Domain_member src = fieldData[fi] ;
 413 |             for (Index_t i=0; i<sendCount; ++i) {
 414 |                destAddr[i] = (domain.*src)(i) ;
 415 |             }
 416 |             destAddr += sendCount ;
 417 |          }
 418 |          destAddr -= xferFields*sendCount ;
 419 | 
 420 |          MPI_Isend(destAddr, xferFields*sendCount, baseType,
 421 |                    myRank - domain.tp()*domain.tp(), msgType,
 422 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
 423 |          ++pmsg ;
 424 |       }
 425 |       if (planeMax && doSend) {
 426 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
 427 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
 428 |             Domain_member src = fieldData[fi] ;
 429 |             for (Index_t i=0; i<sendCount; ++i) {
 430 |                destAddr[i] = (domain.*src)(dx*dy*(dz - 1) + i) ;
 431 |             }
 432 |             destAddr += sendCount ;
 433 |          }
 434 |          destAddr -= xferFields*sendCount ;
 435 | 
 436 |          MPI_Isend(destAddr, xferFields*sendCount, baseType,
 437 |                    myRank + domain.tp()*domain.tp(), msgType,
 438 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
 439 |          ++pmsg ;
 440 |       }
 441 |    }
 442 |    if (rowMin | rowMax) {
 443 |       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
 444 |       int sendCount = dx * dz ;
 445 | 
 446 |       if (rowMin) {
 447 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
 448 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 449 |             Domain_member src = fieldData[fi] ;
 450 |             for (Index_t i=0; i<dz; ++i) {
 451 |                for (Index_t j=0; j<dx; ++j) {
 452 |                   destAddr[i*dx+j] = (domain.*src)(i*dx*dy + j) ;
 453 |                }
 454 |             }
 455 |             destAddr += sendCount ;
 456 |          }
 457 |          destAddr -= xferFields*sendCount ;
 458 | 
 459 |          MPI_Isend(destAddr, xferFields*sendCount, baseType,
 460 |                    myRank - domain.tp(), msgType,
 461 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
 462 |          ++pmsg ;
 463 |       }
 464 |       if (rowMax && doSend) {
 465 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
 466 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 467 |             Domain_member src = fieldData[fi] ;
 468 |             for (Index_t i=0; i<dz; ++i) {
 469 |                for (Index_t j=0; j<dx; ++j) {
 470 |                   destAddr[i*dx+j] = (domain.*src)(dx*(dy - 1) + i*dx*dy + j) ;
 471 |                }
 472 |             }
 473 |             destAddr += sendCount ;
 474 |          }
 475 |          destAddr -= xferFields*sendCount ;
 476 | 
 477 |          MPI_Isend(destAddr, xferFields*sendCount, baseType,
 478 |                    myRank + domain.tp(), msgType,
 479 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
 480 |          ++pmsg ;
 481 |       }
 482 |    }
 483 |    if (colMin | colMax) {
 484 |       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
 485 |       int sendCount = dy * dz ;
 486 | 
 487 |       if (colMin) {
 488 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
 489 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 490 |             Domain_member src = fieldData[fi] ;
 491 |             for (Index_t i=0; i<dz; ++i) {
 492 |                for (Index_t j=0; j<dy; ++j) {
 493 |                   destAddr[i*dy + j] = (domain.*src)(i*dx*dy + j*dx) ;
 494 |                }
 495 |             }
 496 |             destAddr += sendCount ;
 497 |          }
 498 |          destAddr -= xferFields*sendCount ;
 499 | 
 500 |          MPI_Isend(destAddr, xferFields*sendCount, baseType,
 501 |                    myRank - 1, msgType,
 502 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
 503 |          ++pmsg ;
 504 |       }
 505 |       if (colMax && doSend) {
 506 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
 507 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 508 |             Domain_member src = fieldData[fi] ;
 509 |             for (Index_t i=0; i<dz; ++i) {
 510 |                for (Index_t j=0; j<dy; ++j) {
 511 |                   destAddr[i*dy + j] = (domain.*src)(dx - 1 + i*dx*dy + j*dx) ;
 512 |                }
 513 |             }
 514 |             destAddr += sendCount ;
 515 |          }
 516 |          destAddr -= xferFields*sendCount ;
 517 | 
 518 |          MPI_Isend(destAddr, xferFields*sendCount, baseType,
 519 |                    myRank + 1, msgType,
 520 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
 521 |          ++pmsg ;
 522 |       }
 523 |    }
 524 | 
 525 |    if (!planeOnly) {
 526 |       if (rowMin && colMin) {
 527 |          int toRank = myRank - domain.tp() - 1 ;
 528 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
 529 |                                           emsg * maxEdgeComm] ;
 530 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 531 |             Domain_member src = fieldData[fi] ;
 532 |             for (Index_t i=0; i<dz; ++i) {
 533 |                destAddr[i] = (domain.*src)(i*dx*dy) ;
 534 |             }
 535 |             destAddr += dz ;
 536 |          }
 537 |          destAddr -= xferFields*dz ;
 538 |          MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
 539 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
 540 |          ++emsg ;
 541 |       }
 542 | 
 543 |       if (rowMin && planeMin) {
 544 |          int toRank = myRank - domain.tp()*domain.tp() - domain.tp() ;
 545 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
 546 |                                           emsg * maxEdgeComm] ;
 547 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 548 |             Domain_member src = fieldData[fi] ;
 549 |             for (Index_t i=0; i<dx; ++i) {
 550 |                destAddr[i] = (domain.*src)(i) ;
 551 |             }
 552 |             destAddr += dx ;
 553 |          }
 554 |          destAddr -= xferFields*dx ;
 555 |          MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
 556 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
 557 |          ++emsg ;
 558 |       }
 559 | 
 560 |       if (colMin && planeMin) {
 561 |          int toRank = myRank - domain.tp()*domain.tp() - 1 ;
 562 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
 563 |                                           emsg * maxEdgeComm] ;
 564 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 565 |             Domain_member src = fieldData[fi] ;
 566 |             for (Index_t i=0; i<dy; ++i) {
 567 |                destAddr[i] = (domain.*src)(i*dx) ;
 568 |             }
 569 |             destAddr += dy ;
 570 |          }
 571 |          destAddr -= xferFields*dy ;
 572 |          MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
 573 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
 574 |          ++emsg ;
 575 |       }
 576 | 
 577 |       if (rowMax && colMax && doSend) {
 578 |          int toRank = myRank + domain.tp() + 1 ;
 579 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
 580 |                                           emsg * maxEdgeComm] ;
 581 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 582 |             Domain_member src = fieldData[fi] ;
 583 |             for (Index_t i=0; i<dz; ++i) {
 584 |                destAddr[i] = (domain.*src)(dx*dy - 1 + i*dx*dy) ;
 585 |             }
 586 |             destAddr += dz ;
 587 |          }
 588 |          destAddr -= xferFields*dz ;
 589 |          MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
 590 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
 591 |          ++emsg ;
 592 |       }
 593 | 
 594 |       if (rowMax && planeMax && doSend) {
 595 |          int toRank = myRank + domain.tp()*domain.tp() + domain.tp() ;
 596 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
 597 |                                           emsg * maxEdgeComm] ;
 598 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 599 |             Domain_member src = fieldData[fi] ;
 600 |             for (Index_t i=0; i<dx; ++i) {
 601 |               destAddr[i] = (domain.*src)(dx*(dy-1) + dx*dy*(dz-1) + i) ;
 602 |             }
 603 |             destAddr += dx ;
 604 |          }
 605 |          destAddr -= xferFields*dx ;
 606 |          MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
 607 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
 608 |          ++emsg ;
 609 |       }
 610 | 
 611 |       if (colMax && planeMax && doSend) {
 612 |          int toRank = myRank + domain.tp()*domain.tp() + 1 ;
 613 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
 614 |                                           emsg * maxEdgeComm] ;
 615 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 616 |             Domain_member src = fieldData[fi] ;
 617 |             for (Index_t i=0; i<dy; ++i) {
 618 |                destAddr[i] = (domain.*src)(dx*dy*(dz-1) + dx - 1 + i*dx) ;
 619 |             }
 620 |             destAddr += dy ;
 621 |          }
 622 |          destAddr -= xferFields*dy ;
 623 |          MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
 624 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
 625 |          ++emsg ;
 626 |       }
 627 | 
 628 |       if (rowMax && colMin && doSend) {
 629 |          int toRank = myRank + domain.tp() - 1 ;
 630 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
 631 |                                           emsg * maxEdgeComm] ;
 632 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 633 |             Domain_member src = fieldData[fi] ;
 634 |             for (Index_t i=0; i<dz; ++i) {
 635 |                destAddr[i] = (domain.*src)(dx*(dy-1) + i*dx*dy) ;
 636 |             }
 637 |             destAddr += dz ;
 638 |          }
 639 |          destAddr -= xferFields*dz ;
 640 |          MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
 641 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
 642 |          ++emsg ;
 643 |       }
 644 | 
 645 |       if (rowMin && planeMax && doSend) {
 646 |          int toRank = myRank + domain.tp()*domain.tp() - domain.tp() ;
 647 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
 648 |                                           emsg * maxEdgeComm] ;
 649 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 650 |             Domain_member src = fieldData[fi] ;
 651 |             for (Index_t i=0; i<dx; ++i) {
 652 |                destAddr[i] = (domain.*src)(dx*dy*(dz-1) + i) ;
 653 |             }
 654 |             destAddr += dx ;
 655 |          }
 656 |          destAddr -= xferFields*dx ;
 657 |          MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
 658 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
 659 |          ++emsg ;
 660 |       }
 661 | 
 662 |       if (colMin && planeMax && doSend) {
 663 |          int toRank = myRank + domain.tp()*domain.tp() - 1 ;
 664 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
 665 |                                           emsg * maxEdgeComm] ;
 666 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 667 |             Domain_member src = fieldData[fi] ;
 668 |             for (Index_t i=0; i<dy; ++i) {
 669 |                destAddr[i] = (domain.*src)(dx*dy*(dz-1) + i*dx) ;
 670 |             }
 671 |             destAddr += dy ;
 672 |          }
 673 |          destAddr -= xferFields*dy ;
 674 |          MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
 675 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
 676 |          ++emsg ;
 677 |       }
 678 | 
 679 |       if (rowMin && colMax) {
 680 |          int toRank = myRank - domain.tp() + 1 ;
 681 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
 682 |                                           emsg * maxEdgeComm] ;
 683 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 684 |             Domain_member src = fieldData[fi] ;
 685 |             for (Index_t i=0; i<dz; ++i) {
 686 |                destAddr[i] = (domain.*src)(dx - 1 + i*dx*dy) ;
 687 |             }
 688 |             destAddr += dz ;
 689 |          }
 690 |          destAddr -= xferFields*dz ;
 691 |          MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
 692 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
 693 |          ++emsg ;
 694 |       }
 695 | 
 696 |       if (rowMax && planeMin) {
 697 |          int toRank = myRank - domain.tp()*domain.tp() + domain.tp() ;
 698 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
 699 |                                           emsg * maxEdgeComm] ;
 700 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 701 |             Domain_member src = fieldData[fi] ;
 702 |             for (Index_t i=0; i<dx; ++i) {
 703 |                destAddr[i] = (domain.*src)(dx*(dy - 1) + i) ;
 704 |             }
 705 |             destAddr += dx ;
 706 |          }
 707 |          destAddr -= xferFields*dx ;
 708 |          MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
 709 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
 710 |          ++emsg ;
 711 |       }
 712 | 
 713 |       if (colMax && planeMin) {
 714 |          int toRank = myRank - domain.tp()*domain.tp() + 1 ;
 715 |          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
 716 |                                           emsg * maxEdgeComm] ;
 717 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 718 |             Domain_member src = fieldData[fi] ;
 719 |             for (Index_t i=0; i<dy; ++i) {
 720 |                destAddr[i] = (domain.*src)(dx - 1 + i*dx) ;
 721 |             }
 722 |             destAddr += dy ;
 723 |          }
 724 |          destAddr -= xferFields*dy ;
 725 |          MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
 726 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
 727 |          ++emsg ;
 728 |       }
 729 | 
 730 |       if (rowMin && colMin && planeMin) {
 731 |          /* corner at domain logical coord (0, 0, 0) */
 732 |          int toRank = myRank - domain.tp()*domain.tp() - domain.tp() - 1 ;
 733 |          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
 734 |                                                 emsg * maxEdgeComm +
 735 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
 736 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 737 |             comBuf[fi] = (domain.*fieldData[fi])(0) ;
 738 |          }
 739 |          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
 740 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
 741 |          ++cmsg ;
 742 |       }
 743 |       if (rowMin && colMin && planeMax && doSend) {
 744 |          /* corner at domain logical coord (0, 0, 1) */
 745 |          int toRank = myRank + domain.tp()*domain.tp() - domain.tp() - 1 ;
 746 |          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
 747 |                                                 emsg * maxEdgeComm +
 748 |                                          cmsg * CACHE_COHERENCE_PAD_REAL] ;
 749 |          Index_t idx = dx*dy*(dz - 1) ;
 750 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 751 |             comBuf[fi] = (domain.*fieldData[fi])(idx) ;
 752 |          }
 753 |          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
 754 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
 755 |          ++cmsg ;
 756 |       }
 757 |       if (rowMin && colMax && planeMin) {
 758 |          /* corner at domain logical coord (1, 0, 0) */
 759 |          int toRank = myRank - domain.tp()*domain.tp() - domain.tp() + 1 ;
 760 |          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
 761 |                                                 emsg * maxEdgeComm +
 762 |                                          cmsg * CACHE_COHERENCE_PAD_REAL] ;
 763 |          Index_t idx = dx - 1 ;
 764 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 765 |             comBuf[fi] = (domain.*fieldData[fi])(idx) ;
 766 |          }
 767 |          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
 768 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
 769 |          ++cmsg ;
 770 |       }
 771 |       if (rowMin && colMax && planeMax && doSend) {
 772 |          /* corner at domain logical coord (1, 0, 1) */
 773 |          int toRank = myRank + domain.tp()*domain.tp() - domain.tp() + 1 ;
 774 |          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
 775 |                                                 emsg * maxEdgeComm +
 776 |                                          cmsg * CACHE_COHERENCE_PAD_REAL] ;
 777 |          Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
 778 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 779 |             comBuf[fi] = (domain.*fieldData[fi])(idx) ;
 780 |          }
 781 |          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
 782 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
 783 |          ++cmsg ;
 784 |       }
 785 |       if (rowMax && colMin && planeMin) {
 786 |          /* corner at domain logical coord (0, 1, 0) */
 787 |          int toRank = myRank - domain.tp()*domain.tp() + domain.tp() - 1 ;
 788 |          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
 789 |                                                 emsg * maxEdgeComm +
 790 |                                          cmsg * CACHE_COHERENCE_PAD_REAL] ;
 791 |          Index_t idx = dx*(dy - 1) ;
 792 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 793 |             comBuf[fi] = (domain.*fieldData[fi])(idx) ;
 794 |          }
 795 |          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
 796 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
 797 |          ++cmsg ;
 798 |       }
 799 |       if (rowMax && colMin && planeMax && doSend) {
 800 |          /* corner at domain logical coord (0, 1, 1) */
 801 |          int toRank = myRank + domain.tp()*domain.tp() + domain.tp() - 1 ;
 802 |          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
 803 |                                                 emsg * maxEdgeComm +
 804 |                                          cmsg * CACHE_COHERENCE_PAD_REAL] ;
 805 |          Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
 806 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 807 |             comBuf[fi] = (domain.*fieldData[fi])(idx) ;
 808 |          }
 809 |          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
 810 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
 811 |          ++cmsg ;
 812 |       }
 813 |       if (rowMax && colMax && planeMin) {
 814 |          /* corner at domain logical coord (1, 1, 0) */
 815 |          int toRank = myRank - domain.tp()*domain.tp() + domain.tp() + 1 ;
 816 |          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
 817 |                                                 emsg * maxEdgeComm +
 818 |                                          cmsg * CACHE_COHERENCE_PAD_REAL] ;
 819 |          Index_t idx = dx*dy - 1 ;
 820 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 821 |             comBuf[fi] = (domain.*fieldData[fi])(idx) ;
 822 |          }
 823 |          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
 824 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
 825 |          ++cmsg ;
 826 |       }
 827 |       if (rowMax && colMax && planeMax && doSend) {
 828 |          /* corner at domain logical coord (1, 1, 1) */
 829 |          int toRank = myRank + domain.tp()*domain.tp() + domain.tp() + 1 ;
 830 |          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
 831 |                                                 emsg * maxEdgeComm +
 832 |                                          cmsg * CACHE_COHERENCE_PAD_REAL] ;
 833 |          Index_t idx = dx*dy*dz - 1 ;
 834 |          for (Index_t fi=0; fi<xferFields; ++fi) {
 835 |             comBuf[fi] = (domain.*fieldData[fi])(idx) ;
 836 |          }
 837 |          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
 838 |                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
 839 |          ++cmsg ;
 840 |       }
 841 |    }
 842 | 
 843 |    MPI_Waitall(26, domain.sendRequest, status) ;
 844 | }
 845 | 
 846 | /******************************************/
 847 | 
 848 | void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData) {
 849 | 
 850 |    if (domain.numRanks() == 1)
 851 |       return ;
 852 | 
 853 |    /* summation order should be from smallest value to largest */
 854 |    /* or we could try out Kahan (compensated) summation! */
 855 | 
 856 |    int myRank ;
 857 |    Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
 858 |    Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
 859 |    Index_t pmsg = 0 ; /* plane comm msg */
 860 |    Index_t emsg = 0 ; /* edge comm msg */
 861 |    Index_t cmsg = 0 ; /* corner comm msg */
 862 |    Index_t dx = domain.sizeX() + 1 ;
 863 |    Index_t dy = domain.sizeY() + 1 ;
 864 |    Index_t dz = domain.sizeZ() + 1 ;
 865 |    MPI_Status status ;
 866 |    Real_t *srcAddr ;
 867 |    Index_t rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
 868 |    /* assume communication to 6 neighbors by default */
 869 |    rowMin = rowMax = colMin = colMax = planeMin = planeMax = 1 ;
 870 |    if (domain.rowLoc() == 0) {
 871 |       rowMin = 0 ;
 872 |    }
 873 |    if (domain.rowLoc() == (domain.tp()-1)) {
 874 |       rowMax = 0 ;
 875 |    }
 876 |    if (domain.colLoc() == 0) {
 877 |       colMin = 0 ;
 878 |    }
 879 |    if (domain.colLoc() == (domain.tp()-1)) {
 880 |       colMax = 0 ;
 881 |    }
 882 |    if (domain.planeLoc() == 0) {
 883 |       planeMin = 0 ;
 884 |    }
 885 |    if (domain.planeLoc() == (domain.tp()-1)) {
 886 |       planeMax = 0 ;
 887 |    }
 888 | 
 889 |    MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
 890 | 
 891 |    if (planeMin | planeMax) {
 892 |       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
 893 |       Index_t opCount = dx * dy ;
 894 | 
 895 |       if (planeMin) {
 896 |          /* contiguous memory */
 897 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
 898 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
 899 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
 900 |             Domain_member dest = fieldData[fi] ;
 901 |             for (Index_t i=0; i<opCount; ++i) {
 902 |                (domain.*dest)(i) += srcAddr[i] ;
 903 |             }
 904 |             srcAddr += opCount ;
 905 |          }
 906 |          ++pmsg ;
 907 |       }
 908 |       if (planeMax) {
 909 |          /* contiguous memory */
 910 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
 911 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
 912 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
 913 |             Domain_member dest = fieldData[fi] ;
 914 |             for (Index_t i=0; i<opCount; ++i) {
 915 |                (domain.*dest)(dx*dy*(dz - 1) + i) += srcAddr[i] ;
 916 |             }
 917 |             srcAddr += opCount ;
 918 |          }
 919 |          ++pmsg ;
 920 |       }
 921 |    }
 922 | 
 923 |    if (rowMin | rowMax) {
 924 |       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
 925 |       Index_t opCount = dx * dz ;
 926 | 
 927 |       if (rowMin) {
 928 |          /* recv buffer is packed; domain side is runs of dx nodes spaced dx*dy apart */
 929 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
 930 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
 931 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
 932 |             Domain_member dest = fieldData[fi] ;
 933 |             for (Index_t i=0; i<dz; ++i) {
 934 |                for (Index_t j=0; j<dx; ++j) {
 935 |                   (domain.*dest)(i*dx*dy + j) += srcAddr[i*dx + j] ;
 936 |                }
 937 |             }
 938 |             srcAddr += opCount ;
 939 |          }
 940 |          ++pmsg ;
 941 |       }
 942 |       if (rowMax) {
 943 |          /* recv buffer is packed; domain side is runs of dx nodes spaced dx*dy apart */
 944 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
 945 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
 946 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
 947 |             Domain_member dest = fieldData[fi] ;
 948 |             for (Index_t i=0; i<dz; ++i) {
 949 |                for (Index_t j=0; j<dx; ++j) {
 950 |                   (domain.*dest)(dx*(dy - 1) + i*dx*dy + j) += srcAddr[i*dx + j] ;
 951 |                }
 952 |             }
 953 |             srcAddr += opCount ;
 954 |          }
 955 |          ++pmsg ;
 956 |       }
 957 |    }
 958 |    if (colMin | colMax) {
 959 |       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
 960 |       Index_t opCount = dy * dz ;
 961 | 
 962 |       if (colMin) {
 963 |          /* recv buffer is packed; domain side is strided by dx (not contiguous) */
 964 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
 965 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
 966 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
 967 |             Domain_member dest = fieldData[fi] ;
 968 |             for (Index_t i=0; i<dz; ++i) {
 969 |                for (Index_t j=0; j<dy; ++j) {
 970 |                   (domain.*dest)(i*dx*dy + j*dx) += srcAddr[i*dy + j] ;
 971 |                }
 972 |             }
 973 |             srcAddr += opCount ;
 974 |          }
 975 |          ++pmsg ;
 976 |       }
 977 |       if (colMax) {
 978 |          /* recv buffer is packed; domain side is strided by dx (not contiguous) */
 979 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
 980 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
 981 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
 982 |             Domain_member dest = fieldData[fi] ;
 983 |             for (Index_t i=0; i<dz; ++i) {
 984 |                for (Index_t j=0; j<dy; ++j) {
 985 |                   (domain.*dest)(dx - 1 + i*dx*dy + j*dx) += srcAddr[i*dy + j] ;
 986 |                }
 987 |             }
 988 |             srcAddr += opCount ;
 989 |          }
 990 |          ++pmsg ;
 991 |       }
 992 |    }
 993 | 
 994 |    if (rowMin & colMin) {
 995 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
 996 |                                        emsg * maxEdgeComm] ;
 997 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
 998 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
 999 |          Domain_member dest = fieldData[fi] ;
1000 |          for (Index_t i=0; i<dz; ++i) {
1001 |             (domain.*dest)(i*dx*dy) += srcAddr[i] ;
1002 |          }
1003 |          srcAddr += dz ;
1004 |       }
1005 |       ++emsg ;
1006 |    }
1007 | 
1008 |    if (rowMin & planeMin) {
1009 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1010 |                                        emsg * maxEdgeComm] ;
1011 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1012 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1013 |          Domain_member dest = fieldData[fi] ;
1014 |          for (Index_t i=0; i<dx; ++i) {
1015 |             (domain.*dest)(i) += srcAddr[i] ;
1016 |          }
1017 |          srcAddr += dx ;
1018 |       }
1019 |       ++emsg ;
1020 |    }
1021 | 
1022 |    if (colMin & planeMin) {
1023 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1024 |                                        emsg * maxEdgeComm] ;
1025 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1026 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1027 |          Domain_member dest = fieldData[fi] ;
1028 |          for (Index_t i=0; i<dy; ++i) {
1029 |             (domain.*dest)(i*dx) += srcAddr[i] ;
1030 |          }
1031 |          srcAddr += dy ;
1032 |       }
1033 |       ++emsg ;
1034 |    }
1035 | 
1036 |    if (rowMax & colMax) {
1037 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1038 |                                        emsg * maxEdgeComm] ;
1039 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1040 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1041 |          Domain_member dest = fieldData[fi] ;
1042 |          for (Index_t i=0; i<dz; ++i) {
1043 |             (domain.*dest)(dx*dy - 1 + i*dx*dy) += srcAddr[i] ;
1044 |          }
1045 |          srcAddr += dz ;
1046 |       }
1047 |       ++emsg ;
1048 |    }
1049 | 
1050 |    if (rowMax & planeMax) {
1051 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1052 |                                        emsg * maxEdgeComm] ;
1053 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1054 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1055 |          Domain_member dest = fieldData[fi] ;
1056 |          for (Index_t i=0; i<dx; ++i) {
1057 |             (domain.*dest)(dx*(dy-1) + dx*dy*(dz-1) + i) += srcAddr[i] ;
1058 |          }
1059 |          srcAddr += dx ;
1060 |       }
1061 |       ++emsg ;
1062 |    }
1063 | 
1064 |    if (colMax & planeMax) {
1065 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1066 |                                        emsg * maxEdgeComm] ;
1067 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1068 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1069 |          Domain_member dest = fieldData[fi] ;
1070 |          for (Index_t i=0; i<dy; ++i) {
1071 |             (domain.*dest)(dx*dy*(dz-1) + dx - 1 + i*dx) += srcAddr[i] ;
1072 |          }
1073 |          srcAddr += dy ;
1074 |       }
1075 |       ++emsg ;
1076 |    }
1077 | 
1078 |    if (rowMax & colMin) {
1079 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1080 |                                        emsg * maxEdgeComm] ;
1081 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1082 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1083 |          Domain_member dest = fieldData[fi] ;
1084 |          for (Index_t i=0; i<dz; ++i) {
1085 |             (domain.*dest)(dx*(dy-1) + i*dx*dy) += srcAddr[i] ;
1086 |          }
1087 |          srcAddr += dz ;
1088 |       }
1089 |       ++emsg ;
1090 |    }
1091 | 
1092 |    if (rowMin & planeMax) {
1093 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1094 |                                        emsg * maxEdgeComm] ;
1095 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1096 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1097 |          Domain_member dest = fieldData[fi] ;
1098 |          for (Index_t i=0; i<dx; ++i) {
1099 |             (domain.*dest)(dx*dy*(dz-1) + i) += srcAddr[i] ;
1100 |          }
1101 |          srcAddr += dx ;
1102 |       }
1103 |       ++emsg ;
1104 |    }
1105 | 
1106 |    if (colMin & planeMax) {
1107 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1108 |                                        emsg * maxEdgeComm] ;
1109 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1110 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1111 |          Domain_member dest = fieldData[fi] ;
1112 |          for (Index_t i=0; i<dy; ++i) {
1113 |             (domain.*dest)(dx*dy*(dz-1) + i*dx) += srcAddr[i] ;
1114 |          }
1115 |          srcAddr += dy ;
1116 |       }
1117 |       ++emsg ;
1118 |    }
1119 | 
1120 |    if (rowMin & colMax) {
1121 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1122 |                                        emsg * maxEdgeComm] ;
1123 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1124 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1125 |          Domain_member dest = fieldData[fi] ;
1126 |          for (Index_t i=0; i<dz; ++i) {
1127 |             (domain.*dest)(dx - 1 + i*dx*dy) += srcAddr[i] ;
1128 |          }
1129 |          srcAddr += dz ;
1130 |       }
1131 |       ++emsg ;
1132 |    }
1133 | 
1134 |    if (rowMax & planeMin) {
1135 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1136 |                                        emsg * maxEdgeComm] ;
1137 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1138 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1139 |          Domain_member dest = fieldData[fi] ;
1140 |          for (Index_t i=0; i<dx; ++i) {
1141 |             (domain.*dest)(dx*(dy - 1) + i) += srcAddr[i] ;
1142 |          }
1143 |          srcAddr += dx ;
1144 |       }
1145 |       ++emsg ;
1146 |    }
1147 | 
1148 |    if (colMax & planeMin) {
1149 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1150 |                                        emsg * maxEdgeComm] ;
1151 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1152 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1153 |          Domain_member dest = fieldData[fi] ;
1154 |          for (Index_t i=0; i<dy; ++i) {
1155 |             (domain.*dest)(dx - 1 + i*dx) += srcAddr[i] ;
1156 |          }
1157 |          srcAddr += dy ;
1158 |       }
1159 |       ++emsg ;
1160 |    }
1161 | 
1162 |    if (rowMin & colMin & planeMin) {
1163 |       /* corner at domain logical coord (0, 0, 0) */
1164 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1165 |                                              emsg * maxEdgeComm +
1166 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1167 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1168 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1169 |          (domain.*fieldData[fi])(0) += comBuf[fi] ;
1170 |       }
1171 |       ++cmsg ;
1172 |    }
1173 |    if (rowMin & colMin & planeMax) {
1174 |       /* corner at domain logical coord (0, 0, 1) */
1175 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1176 |                                              emsg * maxEdgeComm +
1177 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1178 |       Index_t idx = dx*dy*(dz - 1) ;
1179 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1180 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1181 |          (domain.*fieldData[fi])(idx) += comBuf[fi] ;
1182 |       }
1183 |       ++cmsg ;
1184 |    }
1185 |    if (rowMin & colMax & planeMin) {
1186 |       /* corner at domain logical coord (1, 0, 0) */
1187 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1188 |                                              emsg * maxEdgeComm +
1189 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1190 |       Index_t idx = dx - 1 ;
1191 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1192 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1193 |          (domain.*fieldData[fi])(idx) += comBuf[fi] ;
1194 |       }
1195 |       ++cmsg ;
1196 |    }
1197 |    if (rowMin & colMax & planeMax) {
1198 |       /* corner at domain logical coord (1, 0, 1) */
1199 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1200 |                                              emsg * maxEdgeComm +
1201 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1202 |       Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
1203 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1204 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1205 |          (domain.*fieldData[fi])(idx) += comBuf[fi] ;
1206 |       }
1207 |       ++cmsg ;
1208 |    }
1209 |    if (rowMax & colMin & planeMin) {
1210 |       /* corner at domain logical coord (0, 1, 0) */
1211 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1212 |                                              emsg * maxEdgeComm +
1213 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1214 |       Index_t idx = dx*(dy - 1) ;
1215 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1216 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1217 |          (domain.*fieldData[fi])(idx) += comBuf[fi] ;
1218 |       }
1219 |       ++cmsg ;
1220 |    }
1221 |    if (rowMax & colMin & planeMax) {
1222 |       /* corner at domain logical coord (0, 1, 1) */
1223 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1224 |                                              emsg * maxEdgeComm +
1225 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1226 |       Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
1227 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1228 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1229 |          (domain.*fieldData[fi])(idx) += comBuf[fi] ;
1230 |       }
1231 |       ++cmsg ;
1232 |    }
1233 |    if (rowMax & colMax & planeMin) {
1234 |       /* corner at domain logical coord (1, 1, 0) */
1235 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1236 |                                              emsg * maxEdgeComm +
1237 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1238 |       Index_t idx = dx*dy - 1 ;
1239 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1240 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1241 |          (domain.*fieldData[fi])(idx) += comBuf[fi] ;
1242 |       }
1243 |       ++cmsg ;
1244 |    }
1245 |    if (rowMax & colMax & planeMax) {
1246 |       /* corner at domain logical coord (1, 1, 1) */
1247 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1248 |                                              emsg * maxEdgeComm +
1249 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1250 |       Index_t idx = dx*dy*dz - 1 ;
1251 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1252 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1253 |          (domain.*fieldData[fi])(idx) += comBuf[fi] ;
1254 |       }
1255 |       ++cmsg ;
1256 |    }
1257 | }
1258 | 
1259 | /******************************************/
1260 | 
1261 | void CommSyncPosVel(Domain& domain) {
1262 |    /* Waits on the posted receive requests and overwrites boundary-node x, y, z, xd, yd, zd with the values unpacked from domain.commDataRecv. */
1263 |    if (domain.numRanks() == 1)
1264 |       return ;
1265 |    /* Messages are consumed in a fixed order: faces (pmsg), then edges (emsg), then corners (cmsg). */
1266 |    int myRank ; /* filled by MPI_Comm_rank below; not read anywhere else in this function */
1267 |    bool doRecv = false ; /* never changed below, so every branch and'ed with doRecv is skipped */
1268 |    Index_t xferFields = 6 ; /* x, y, z, xd, yd, zd */
1269 |    Domain_member fieldData[6] ;
1270 |    Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ; /* buffer stride between face messages */
1271 |    Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ; /* buffer stride between edge messages */
1272 |    Index_t pmsg = 0 ; /* plane comm msg */
1273 |    Index_t emsg = 0 ; /* edge comm msg */
1274 |    Index_t cmsg = 0 ; /* corner comm msg */
1275 |    Index_t dx = domain.sizeX() + 1 ; /* size + 1: presumably nodes per edge (elements + 1) -- confirm against Domain */
1276 |    Index_t dy = domain.sizeY() + 1 ;
1277 |    Index_t dz = domain.sizeZ() + 1 ;
1278 |    MPI_Status status ;
1279 |    Real_t *srcAddr ;
1280 |    bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
1281 | 
1282 |    /* assume communication to 6 neighbors by default */
1283 |    rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
1284 |    if (domain.rowLoc() == 0) {
1285 |       rowMin = false ;
1286 |    }
1287 |    if (domain.rowLoc() == (domain.tp()-1)) {
1288 |       rowMax = false ;
1289 |    }
1290 |    if (domain.colLoc() == 0) {
1291 |       colMin = false ;
1292 |    }
1293 |    if (domain.colLoc() == (domain.tp()-1)) {
1294 |       colMax = false ;
1295 |    }
1296 |    if (domain.planeLoc() == 0) {
1297 |       planeMin = false ;
1298 |    }
1299 |    if (domain.planeLoc() == (domain.tp()-1)) {
1300 |       planeMax = false ;
1301 |    }
1302 |    /* member pointers, listed in the same order the fields are unpacked from each message */
1303 |    fieldData[0] = &Domain::x ;
1304 |    fieldData[1] = &Domain::y ;
1305 |    fieldData[2] = &Domain::z ;
1306 |    fieldData[3] = &Domain::xd ;
1307 |    fieldData[4] = &Domain::yd ;
1308 |    fieldData[5] = &Domain::zd ;
1309 | 
1310 |    MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
1311 |    /* Node index used below: plane*dx*dy + row*dx + col. */
1312 |    if (planeMin | planeMax) {
1313 |       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
1314 |       Index_t opCount = dx * dy ;
1315 | 
1316 |       if (planeMin && doRecv) {
1317 |          /* contiguous memory */
1318 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1319 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1320 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1321 |             Domain_member dest = fieldData[fi] ;
1322 |             for (Index_t i=0; i<opCount; ++i) {
1323 |                (domain.*dest)(i) = srcAddr[i] ;
1324 |             }
1325 |             srcAddr += opCount ;
1326 |          }
1327 |          ++pmsg ;
1328 |       }
1329 |       if (planeMax) {
1330 |          /* contiguous memory */
1331 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1332 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1333 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1334 |             Domain_member dest = fieldData[fi] ;
1335 |             for (Index_t i=0; i<opCount; ++i) {
1336 |                (domain.*dest)(dx*dy*(dz - 1) + i) = srcAddr[i] ;
1337 |             }
1338 |             srcAddr += opCount ;
1339 |          }
1340 |          ++pmsg ;
1341 |       }
1342 |    }
1343 | 
1344 |    if (rowMin | rowMax) {
1345 |       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
1346 |       Index_t opCount = dx * dz ;
1347 | 
1348 |       if (rowMin && doRecv) {
1349 |          /* contiguous in the buffer; scattered to one dx-long row per plane */
1350 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1351 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1352 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1353 |             Domain_member dest = fieldData[fi] ;
1354 |             for (Index_t i=0; i<dz; ++i) {
1355 |                for (Index_t j=0; j<dx; ++j) {
1356 |                   (domain.*dest)(i*dx*dy + j) = srcAddr[i*dx + j] ;
1357 |                }
1358 |             }
1359 |             srcAddr += opCount ;
1360 |          }
1361 |          ++pmsg ;
1362 |       }
1363 |       if (rowMax) {
1364 |          /* contiguous in the buffer; scattered to one dx-long row per plane */
1365 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1366 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1367 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1368 |             Domain_member dest = fieldData[fi] ;
1369 |             for (Index_t i=0; i<dz; ++i) {
1370 |                for (Index_t j=0; j<dx; ++j) {
1371 |                   (domain.*dest)(dx*(dy - 1) + i*dx*dy + j) = srcAddr[i*dx + j] ;
1372 |                }
1373 |             }
1374 |             srcAddr += opCount ;
1375 |          }
1376 |          ++pmsg ;
1377 |       }
1378 |    }
1379 | 
1380 |    if (colMin | colMax) {
1381 |       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
1382 |       Index_t opCount = dy * dz ;
1383 | 
1384 |       if (colMin && doRecv) {
1385 |          /* contiguous in the buffer; scattered with stride dx within each plane */
1386 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1387 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1388 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1389 |             Domain_member dest = fieldData[fi] ;
1390 |             for (Index_t i=0; i<dz; ++i) {
1391 |                for (Index_t j=0; j<dy; ++j) {
1392 |                   (domain.*dest)(i*dx*dy + j*dx) = srcAddr[i*dy + j] ;
1393 |                }
1394 |             }
1395 |             srcAddr += opCount ;
1396 |          }
1397 |          ++pmsg ;
1398 |       }
1399 |       if (colMax) {
1400 |          /* contiguous in the buffer; scattered with stride dx within each plane */
1401 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1402 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1403 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1404 |             Domain_member dest = fieldData[fi] ;
1405 |             for (Index_t i=0; i<dz; ++i) {
1406 |                for (Index_t j=0; j<dy; ++j) {
1407 |                   (domain.*dest)(dx - 1 + i*dx*dy + j*dx) = srcAddr[i*dy + j] ;
1408 |                }
1409 |             }
1410 |             srcAddr += opCount ;
1411 |          }
1412 |          ++pmsg ;
1413 |       }
1414 |    }
1415 |    /* Edges: buffer offset and request index both skip past the pmsg face messages consumed above. */
1416 |    if (rowMin && colMin && doRecv) {
1417 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1418 |                                        emsg * maxEdgeComm] ;
1419 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1420 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1421 |          Domain_member dest = fieldData[fi] ;
1422 |          for (Index_t i=0; i<dz; ++i) {
1423 |             (domain.*dest)(i*dx*dy) = srcAddr[i] ;
1424 |          }
1425 |          srcAddr += dz ;
1426 |       }
1427 |       ++emsg ;
1428 |    }
1429 | 
1430 |    if (rowMin && planeMin && doRecv) {
1431 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1432 |                                        emsg * maxEdgeComm] ;
1433 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1434 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1435 |          Domain_member dest = fieldData[fi] ;
1436 |          for (Index_t i=0; i<dx; ++i) {
1437 |             (domain.*dest)(i) = srcAddr[i] ;
1438 |          }
1439 |          srcAddr += dx ;
1440 |       }
1441 |       ++emsg ;
1442 |    }
1443 | 
1444 |    if (colMin && planeMin && doRecv) {
1445 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1446 |                                        emsg * maxEdgeComm] ;
1447 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1448 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1449 |          Domain_member dest = fieldData[fi] ;
1450 |          for (Index_t i=0; i<dy; ++i) {
1451 |             (domain.*dest)(i*dx) = srcAddr[i] ;
1452 |          }
1453 |          srcAddr += dy ;
1454 |       }
1455 |       ++emsg ;
1456 |    }
1457 | 
1458 |    if (rowMax && colMax) {
1459 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1460 |                                        emsg * maxEdgeComm] ;
1461 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1462 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1463 |          Domain_member dest = fieldData[fi] ;
1464 |          for (Index_t i=0; i<dz; ++i) {
1465 |             (domain.*dest)(dx*dy - 1 + i*dx*dy) = srcAddr[i] ;
1466 |          }
1467 |          srcAddr += dz ;
1468 |       }
1469 |       ++emsg ;
1470 |    }
1471 | 
1472 |    if (rowMax && planeMax) {
1473 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1474 |                                        emsg * maxEdgeComm] ;
1475 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1476 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1477 |          Domain_member dest = fieldData[fi] ;
1478 |          for (Index_t i=0; i<dx; ++i) {
1479 |             (domain.*dest)(dx*(dy-1) + dx*dy*(dz-1) + i) = srcAddr[i] ;
1480 |          }
1481 |          srcAddr += dx ;
1482 |       }
1483 |       ++emsg ;
1484 |    }
1485 | 
1486 |    if (colMax && planeMax) {
1487 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1488 |                                        emsg * maxEdgeComm] ;
1489 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1490 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1491 |          Domain_member dest = fieldData[fi] ;
1492 |          for (Index_t i=0; i<dy; ++i) {
1493 |             (domain.*dest)(dx*dy*(dz-1) + dx - 1 + i*dx) = srcAddr[i] ;
1494 |          }
1495 |          srcAddr += dy ;
1496 |       }
1497 |       ++emsg ;
1498 |    }
1499 | 
1500 |    if (rowMax && colMin) {
1501 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1502 |                                        emsg * maxEdgeComm] ;
1503 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1504 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1505 |          Domain_member dest = fieldData[fi] ;
1506 |          for (Index_t i=0; i<dz; ++i) {
1507 |             (domain.*dest)(dx*(dy-1) + i*dx*dy) = srcAddr[i] ;
1508 |          }
1509 |          srcAddr += dz ;
1510 |       }
1511 |       ++emsg ;
1512 |    }
1513 | 
1514 |    if (rowMin && planeMax) {
1515 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1516 |                                        emsg * maxEdgeComm] ;
1517 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1518 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1519 |          Domain_member dest = fieldData[fi] ;
1520 |          for (Index_t i=0; i<dx; ++i) {
1521 |             (domain.*dest)(dx*dy*(dz-1) + i) = srcAddr[i] ;
1522 |          }
1523 |          srcAddr += dx ;
1524 |       }
1525 |       ++emsg ;
1526 |    }
1527 | 
1528 |    if (colMin && planeMax) {
1529 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1530 |                                        emsg * maxEdgeComm] ;
1531 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1532 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1533 |          Domain_member dest = fieldData[fi] ;
1534 |          for (Index_t i=0; i<dy; ++i) {
1535 |             (domain.*dest)(dx*dy*(dz-1) + i*dx) = srcAddr[i] ;
1536 |          }
1537 |          srcAddr += dy ;
1538 |       }
1539 |       ++emsg ;
1540 |    }
1541 | 
1542 |    if (rowMin && colMax && doRecv) {
1543 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1544 |                                        emsg * maxEdgeComm] ;
1545 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1546 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1547 |          Domain_member dest = fieldData[fi] ;
1548 |          for (Index_t i=0; i<dz; ++i) {
1549 |             (domain.*dest)(dx - 1 + i*dx*dy) = srcAddr[i] ;
1550 |          }
1551 |          srcAddr += dz ;
1552 |       }
1553 |       ++emsg ;
1554 |    }
1555 | 
1556 |    if (rowMax && planeMin && doRecv) {
1557 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1558 |                                        emsg * maxEdgeComm] ;
1559 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1560 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1561 |          Domain_member dest = fieldData[fi] ;
1562 |          for (Index_t i=0; i<dx; ++i) {
1563 |             (domain.*dest)(dx*(dy - 1) + i) = srcAddr[i] ;
1564 |          }
1565 |          srcAddr += dx ;
1566 |       }
1567 |       ++emsg ;
1568 |    }
1569 | 
1570 |    if (colMax && planeMin && doRecv) {
1571 |       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1572 |                                        emsg * maxEdgeComm] ;
1573 |       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1574 |       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1575 |          Domain_member dest = fieldData[fi] ;
1576 |          for (Index_t i=0; i<dy; ++i) {
1577 |             (domain.*dest)(dx - 1 + i*dx) = srcAddr[i] ;
1578 |          }
1579 |          srcAddr += dy ;
1580 |       }
1581 |       ++emsg ;
1582 |    }
1583 | 
1584 |    /* Corners: one value per field; corner slots are CACHE_COHERENCE_PAD_REAL apart, after all face and edge messages. */
1585 |    if (rowMin && colMin && planeMin && doRecv) {
1586 |       /* corner at domain logical coord (0, 0, 0) */
1587 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1588 |                                              emsg * maxEdgeComm +
1589 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1590 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1591 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1592 |          (domain.*fieldData[fi])(0) = comBuf[fi] ;
1593 |       }
1594 |       ++cmsg ;
1595 |    }
1596 |    if (rowMin && colMin && planeMax) {
1597 |       /* corner at domain logical coord (0, 0, 1) */
1598 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1599 |                                              emsg * maxEdgeComm +
1600 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1601 |       Index_t idx = dx*dy*(dz - 1) ;
1602 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1603 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1604 |          (domain.*fieldData[fi])(idx) = comBuf[fi] ;
1605 |       }
1606 |       ++cmsg ;
1607 |    }
1608 |    if (rowMin && colMax && planeMin && doRecv) {
1609 |       /* corner at domain logical coord (1, 0, 0) */
1610 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1611 |                                              emsg * maxEdgeComm +
1612 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1613 |       Index_t idx = dx - 1 ;
1614 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1615 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1616 |          (domain.*fieldData[fi])(idx) = comBuf[fi] ;
1617 |       }
1618 |       ++cmsg ;
1619 |    }
1620 |    if (rowMin && colMax && planeMax) {
1621 |       /* corner at domain logical coord (1, 0, 1) */
1622 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1623 |                                              emsg * maxEdgeComm +
1624 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1625 |       Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
1626 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1627 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1628 |          (domain.*fieldData[fi])(idx) = comBuf[fi] ;
1629 |       }
1630 |       ++cmsg ;
1631 |    }
1632 |    if (rowMax && colMin && planeMin && doRecv) {
1633 |       /* corner at domain logical coord (0, 1, 0) */
1634 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1635 |                                              emsg * maxEdgeComm +
1636 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1637 |       Index_t idx = dx*(dy - 1) ;
1638 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1639 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1640 |          (domain.*fieldData[fi])(idx) = comBuf[fi] ;
1641 |       }
1642 |       ++cmsg ;
1643 |    }
1644 |    if (rowMax && colMin && planeMax) {
1645 |       /* corner at domain logical coord (0, 1, 1) */
1646 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1647 |                                              emsg * maxEdgeComm +
1648 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1649 |       Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
1650 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1651 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1652 |          (domain.*fieldData[fi])(idx) = comBuf[fi] ;
1653 |       }
1654 |       ++cmsg ;
1655 |    }
1656 |    if (rowMax && colMax && planeMin && doRecv) {
1657 |       /* corner at domain logical coord (1, 1, 0) */
1658 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1659 |                                              emsg * maxEdgeComm +
1660 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1661 |       Index_t idx = dx*dy - 1 ;
1662 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1663 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1664 |          (domain.*fieldData[fi])(idx) = comBuf[fi] ;
1665 |       }
1666 |       ++cmsg ;
1667 |    }
1668 |    if (rowMax && colMax && planeMax) {
1669 |       /* corner at domain logical coord (1, 1, 1) */
1670 |       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1671 |                                              emsg * maxEdgeComm +
1672 |                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1673 |       Index_t idx = dx*dy*dz - 1 ;
1674 |       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1675 |       for (Index_t fi=0; fi<xferFields; ++fi) {
1676 |          (domain.*fieldData[fi])(idx) = comBuf[fi] ;
1677 |       }
1678 |       ++cmsg ;
1679 |    }
1680 | }
1681 | 
1682 | /******************************************/
1683 | 
1684 | void CommMonoQ(Domain& domain)
1685 | { /* Waits on the face receive requests and copies delv_xi, delv_eta, delv_zeta from domain.commDataRecv into slots starting at index numElem(). */
1686 |    if (domain.numRanks() == 1)
1687 |       return ;
1688 |    /* Only face (plane) messages are handled here; there are no edge or corner counters in this function. */
1689 |    int myRank ; /* filled by MPI_Comm_rank below; not read anywhere else in this function */
1690 |    Index_t xferFields = 3 ; /* delv_xi, delv_eta, delv_zeta */
1691 |    Domain_member fieldData[3] ;
1692 |    Index_t fieldOffset[3] ; /* per-field write position; advanced by opCount after each face */
1693 |    Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ; /* buffer stride between face messages */
1694 |    Index_t pmsg = 0 ; /* plane comm msg */
1695 |    Index_t dx = domain.sizeX() ; /* sizes used as-is (no +1), unlike the nodal dx/dy/dz in CommSyncPosVel */
1696 |    Index_t dy = domain.sizeY() ;
1697 |    Index_t dz = domain.sizeZ() ;
1698 |    MPI_Status status ;
1699 |    Real_t *srcAddr ;
1700 |    bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
1701 |    /* assume communication to 6 neighbors by default */
1702 |    rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
1703 |    if (domain.rowLoc() == 0) {
1704 |       rowMin = false ;
1705 |    }
1706 |    if (domain.rowLoc() == (domain.tp()-1)) {
1707 |       rowMax = false ;
1708 |    }
1709 |    if (domain.colLoc() == 0) {
1710 |       colMin = false ;
1711 |    }
1712 |    if (domain.colLoc() == (domain.tp()-1)) {
1713 |       colMax = false ;
1714 |    }
1715 |    if (domain.planeLoc() == 0) {
1716 |       planeMin = false ;
1717 |    }
1718 |    if (domain.planeLoc() == (domain.tp()-1)) {
1719 |       planeMax = false ;
1720 |    }
1721 | 
1722 |    /* point into ghost data area */
1723 |    // fieldData[0] = &(domain.delv_xi(domain.numElem())) ;
1724 |    // fieldData[1] = &(domain.delv_eta(domain.numElem())) ;
1725 |    // fieldData[2] = &(domain.delv_zeta(domain.numElem())) ;
1726 |    fieldData[0] = &Domain::delv_xi ;
1727 |    fieldData[1] = &Domain::delv_eta ;
1728 |    fieldData[2] = &Domain::delv_zeta ;
1729 |    fieldOffset[0] = domain.numElem() ; /* first slot past the numElem() local entries */
1730 |    fieldOffset[1] = domain.numElem() ;
1731 |    fieldOffset[2] = domain.numElem() ;
1732 | 
1733 | 
1734 |    MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
1735 |    /* Each face below appends opCount values per field at fieldOffset[fi], in the order planeMin, planeMax, rowMin, rowMax, colMin, colMax. */
1736 |    if (planeMin | planeMax) {
1737 |       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
1738 |       Index_t opCount = dx * dy ;
1739 | 
1740 |       if (planeMin) {
1741 |          /* contiguous memory */
1742 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1743 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1744 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1745 |             Domain_member dest = fieldData[fi] ;
1746 |             for (Index_t i=0; i<opCount; ++i) {
1747 |                (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
1748 |             }
1749 |             srcAddr += opCount ;
1750 |             fieldOffset[fi] += opCount ;
1751 |          }
1752 |          ++pmsg ;
1753 |       }
1754 |       if (planeMax) {
1755 |          /* contiguous memory */
1756 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1757 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1758 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1759 |             Domain_member dest = fieldData[fi] ;
1760 |             for (Index_t i=0; i<opCount; ++i) {
1761 |                (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
1762 |             }
1763 |             srcAddr += opCount ;
1764 |             fieldOffset[fi] += opCount ;
1765 |          }
1766 |          ++pmsg ;
1767 |       }
1768 |    }
1769 | 
1770 |    if (rowMin | rowMax) {
1771 |       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
1772 |       Index_t opCount = dx * dz ;
1773 | 
1774 |       if (rowMin) {
1775 |          /* contiguous memory */
1776 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1777 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1778 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1779 |             Domain_member dest = fieldData[fi] ;
1780 |             for (Index_t i=0; i<opCount; ++i) {
1781 |                (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
1782 |             }
1783 |             srcAddr += opCount ;
1784 |             fieldOffset[fi] += opCount ;
1785 |          }
1786 |          ++pmsg ;
1787 |       }
1788 |       if (rowMax) {
1789 |          /* contiguous memory */
1790 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1791 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1792 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1793 |             Domain_member dest = fieldData[fi] ;
1794 |             for (Index_t i=0; i<opCount; ++i) {
1795 |                (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
1796 |             }
1797 |             srcAddr += opCount ;
1798 |             fieldOffset[fi] += opCount ;
1799 |          }
1800 |          ++pmsg ;
1801 |       }
1802 |    }
1803 |    if (colMin | colMax) {
1804 |       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
1805 |       Index_t opCount = dy * dz ;
1806 | 
1807 |       if (colMin) {
1808 |          /* contiguous memory */
1809 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1810 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1811 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1812 |             Domain_member dest = fieldData[fi] ;
1813 |             for (Index_t i=0; i<opCount; ++i) {
1814 |                (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
1815 |             }
1816 |             srcAddr += opCount ;
1817 |             fieldOffset[fi] += opCount ;
1818 |          }
1819 |          ++pmsg ;
1820 |       }
1821 |       if (colMax) {
1822 |          /* contiguous memory */
1823 |          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1824 |          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1825 |          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1826 |             Domain_member dest = fieldData[fi] ;
1827 |             for (Index_t i=0; i<opCount; ++i) {
1828 |                (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
1829 |             }
1830 |             srcAddr += opCount ; /* fieldOffset is not advanced here: this is the last face, nothing reads it afterwards */
1831 |          }
1832 |          ++pmsg ;
1833 |       }
1834 |    }
1835 | }
1836 | 
1837 | #endif
1838 | 


--------------------------------------------------------------------------------
/lulesh-init.cc:
--------------------------------------------------------------------------------
  1 | #include <math.h>
  2 | #if USE_MPI
  3 | # include <mpi.h>
  4 | #endif
  5 | #if _OPENMP
  6 | #include <omp.h>
  7 | #endif
  8 | #include <stdio.h>
  9 | #include <stdlib.h>
 10 | #include <string.h>
 11 | #include <limits.h>
 12 | #include <cstdlib>
 13 | #include "lulesh.h"
 14 | 
 15 | /////////////////////////////////////////////////////////////////////
 16 | Domain::Domain(Int_t numRanks, Index_t colLoc,
 17 |                Index_t rowLoc, Index_t planeLoc,
 18 |                Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost)
 19 |    :
 20 |    m_e_cut(Real_t(1.0e-7)),
 21 |    m_p_cut(Real_t(1.0e-7)),
 22 |    m_q_cut(Real_t(1.0e-7)),
 23 |    m_v_cut(Real_t(1.0e-10)),
 24 |    m_u_cut(Real_t(1.0e-7)),
 25 |    m_hgcoef(Real_t(3.0)),
 26 |    m_ss4o3(Real_t(4.0)/Real_t(3.0)),
 27 |    m_qstop(Real_t(1.0e+12)),
 28 |    m_monoq_max_slope(Real_t(1.0)),
 29 |    m_monoq_limiter_mult(Real_t(2.0)),
 30 |    m_qlc_monoq(Real_t(0.5)),
 31 |    m_qqc_monoq(Real_t(2.0)/Real_t(3.0)),
 32 |    m_qqc(Real_t(2.0)),
 33 |    m_eosvmax(Real_t(1.0e+9)),
 34 |    m_eosvmin(Real_t(1.0e-9)),
 35 |    m_pmin(Real_t(0.)),
 36 |    m_emin(Real_t(-1.0e+15)),
 37 |    m_dvovmax(Real_t(0.1)),
 38 |    m_refdens(Real_t(1.0)),
 39 | //
 40 | // set pointers to (potentially) "new'd" arrays to null to 
 41 | // simplify deallocation.
 42 | //
 43 |    m_regNumList(0),
 44 |    m_nodeElemStart(0),
 45 |    m_nodeElemCornerList(0),
 46 |    m_regElemSize(0),
 47 |    m_regElemlist(0)
 48 | #if USE_MPI
 49 |    , 
 50 |    commDataSend(0),
 51 |    commDataRecv(0)
 52 | #endif
 53 | {
 54 |    // Builds this rank's cubic sub-domain: nx elements per edge at block location (colLoc, rowLoc, planeLoc) in a tp-wide decomposition.
 55 |    Index_t edgeElems = nx ;
 56 |    Index_t edgeNodes = edgeElems+1 ;
 57 |    this->cost() = cost; // this-> is required: the parameter 'cost' hides the cost() accessor
 58 | 
 59 |    m_tp       = tp ;
 60 |    m_numRanks = numRanks ;
 61 | 
 62 |    ///////////////////////////////
 63 |    //   Initialize Sedov Mesh
 64 |    ///////////////////////////////
 65 | 
 66 |    // construct a uniform box for this processor
 67 | 
 68 |    m_colLoc   =   colLoc ;
 69 |    m_rowLoc   =   rowLoc ;
 70 |    m_planeLoc = planeLoc ;
 71 |    
 72 |    m_sizeX = edgeElems ;
 73 |    m_sizeY = edgeElems ;
 74 |    m_sizeZ = edgeElems ;
 75 |    m_numElem = edgeElems*edgeElems*edgeElems ;
 76 | 
 77 |    m_numNode = edgeNodes*edgeNodes*edgeNodes ;
 78 | 
 79 |    m_regNumList = new Index_t[numElem()] ;  // material indexset
 80 | 
 81 |    // Elem-centered 
 82 |    AllocateElemPersistent(numElem()) ;
 83 | 
 84 |    // Node-centered 
 85 |    AllocateNodePersistent(numNode()) ;
 86 | 
 87 |    SetupCommBuffers(edgeNodes);
 88 | 
 89 |    // Basic Field Initialization 
 90 |    for (Index_t i=0; i<numElem(); ++i) {
 91 |       e(i) =  Real_t(0.0) ;
 92 |       p(i) =  Real_t(0.0) ;
 93 |       q(i) =  Real_t(0.0) ;
 94 |       ss(i) = Real_t(0.0) ;
 95 |    }
 96 | 
 97 |    // Note - v initializes to 1.0, not 0.0!
 98 |    for (Index_t i=0; i<numElem(); ++i) {
 99 |       v(i) = Real_t(1.0) ;
100 |    }
101 | 
102 |    for (Index_t i=0; i<numNode(); ++i) {
103 |       xd(i) = Real_t(0.0) ;
104 |       yd(i) = Real_t(0.0) ;
105 |       zd(i) = Real_t(0.0) ;
106 |    }
107 | 
108 |    for (Index_t i=0; i<numNode(); ++i) {
109 |       xdd(i) = Real_t(0.0) ;
110 |       ydd(i) = Real_t(0.0) ;
111 |       zdd(i) = Real_t(0.0) ;
112 |    }
113 | 
114 |    for (Index_t i=0; i<numNode(); ++i) {
115 |       nodalMass(i) = Real_t(0.0) ; // zeroed here because the volume loop below accumulates into it with +=
116 |    }
117 | 
118 |    BuildMesh(nx, edgeNodes, edgeElems); // initializes nodal coordinates, which the volume loop below reads via x/y/z
119 | 
120 | #if _OPENMP
121 |    SetupThreadSupportStructures();
122 | #endif
123 | 
124 |    // Setup region index sets. For now, these are constant sized
125 |    // throughout the run, but could be changed every cycle to 
126 |    // simulate effects of ALE on the lagrange solver
127 |    CreateRegionIndexSets(nr, balance);
128 | 
129 |    // Setup symmetry nodesets
130 |    SetupSymmetryPlanes(edgeNodes);
131 | 
132 |    // Setup element connectivities
133 |    SetupElementConnectivities(edgeElems);
134 | 
135 |    // Setup symmetry planes and free surface boundary arrays
136 |    SetupBoundaryConditions(edgeElems);
137 | 
138 | 
139 |    // Setup defaults
140 | 
141 |    // These can be changed (requires recompile) if you want to run
142 |    // with a fixed timestep, or to a different end time, but it's
143 |    // probably easier/better to just run a fixed number of timesteps
144 |    // using the -i flag in 2.x
145 | 
146 |    dtfixed() = Real_t(-1.0e-6) ; // Negative means use courant condition
147 |    stoptime()  = Real_t(1.0e-2); // *Real_t(edgeElems*tp/45.0) ;
148 | 
149 |    // Initial conditions
150 |    deltatimemultlb() = Real_t(1.1) ;
151 |    deltatimemultub() = Real_t(1.2) ;
152 |    dtcourant() = Real_t(1.0e+20) ;
153 |    dthydro()   = Real_t(1.0e+20) ;
154 |    dtmax()     = Real_t(1.0e-2) ;
155 |    time()    = Real_t(0.) ;
156 |    cycle()   = Int_t(0) ;
157 | 
158 |    // initialize field data 
159 |    for (Index_t i=0; i<numElem(); ++i) {
160 |       Real_t x_local[8], y_local[8], z_local[8] ;
161 |       Index_t *elemToNode = nodelist(i) ;
162 |       for( Index_t lnode=0 ; lnode<8 ; ++lnode )
163 |       {
164 |         Index_t gnode = elemToNode[lnode];
165 |         x_local[lnode] = x(gnode);
166 |         y_local[lnode] = y(gnode);
167 |         z_local[lnode] = z(gnode);
168 |       }
169 | 
170 |       // volume calculations
171 |       Real_t volume = CalcElemVolume(x_local, y_local, z_local );
172 |       volo(i) = volume ;
173 |       elemMass(i) = volume ; // mass is set equal to volume; m_refdens (1.0) is not applied here
174 |       for (Index_t j=0; j<8; ++j) {
175 |          Index_t idx = elemToNode[j] ;
176 |          nodalMass(idx) += volume / Real_t(8.0) ; // each of the 8 element nodes receives an equal share
177 |       }
178 |    }
179 | 
180 |    // deposit initial energy
181 |    // An energy of 3.948746e+7 is correct for a problem with
182 |    // 45 zones along a side - we need to scale it
183 |    const Real_t ebase = Real_t(3.948746e+7);
184 |    Real_t scale = (nx*m_tp)/Real_t(45.0); // nx*m_tp = elements along one edge of the whole mesh
185 |    Real_t einit = ebase*scale*scale*scale;
186 |    if (m_rowLoc + m_colLoc + m_planeLoc == 0) { // sum is zero only at block (0,0,0), assuming locations are non-negative
187 |       // Dump into the first zone (which we know is in the corner)
188 |       // of the domain that sits at the origin
189 |       e(0) = einit;
190 |    }
191 |    //set initial deltatime base on analytic CFL calculation
192 |    deltatime() = (Real_t(.5)*cbrt(volo(0)))/sqrt(Real_t(2.0)*einit); // uses einit (not e(0)), so it is evaluated on every rank
193 | 
194 | } // End constructor
195 | 
196 | 
197 | ////////////////////////////////////////////////////////////////////////////////
198 | Domain::~Domain()
199 | {
200 |    delete [] m_regNumList;
201 |    delete [] m_nodeElemStart;
202 |    delete [] m_nodeElemCornerList;
203 |    delete [] m_regElemSize;
204 |    for (Index_t i=0 ; i<numReg() ; ++i) {
205 |      delete [] m_regElemlist[i];
206 |    }
207 |    delete [] m_regElemlist;
208 |    
209 | #if USE_MPI
210 |    delete [] commDataSend;
211 |    delete [] commDataRecv;
212 | #endif
213 | } // End destructor
214 | 
215 | 
216 | ////////////////////////////////////////////////////////////////////////////////
217 | void
218 | Domain::BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems)
219 | {
220 |   Index_t meshEdgeElems = m_tp*nx ;
221 | 
222 |   // initialize nodal coordinates 
223 |   Index_t nidx = 0 ;
224 |   Real_t tz = Real_t(1.125)*Real_t(m_planeLoc*nx)/Real_t(meshEdgeElems) ;
225 |   for (Index_t plane=0; plane<edgeNodes; ++plane) {
226 |     Real_t ty = Real_t(1.125)*Real_t(m_rowLoc*nx)/Real_t(meshEdgeElems) ;
227 |     for (Index_t row=0; row<edgeNodes; ++row) {
228 |       Real_t tx = Real_t(1.125)*Real_t(m_colLoc*nx)/Real_t(meshEdgeElems) ;
229 |       for (Index_t col=0; col<edgeNodes; ++col) {
230 | 	x(nidx) = tx ;
231 | 	y(nidx) = ty ;
232 | 	z(nidx) = tz ;
233 | 	++nidx ;
234 | 	// tx += ds ; // may accumulate roundoff... 
235 | 	tx = Real_t(1.125)*Real_t(m_colLoc*nx+col+1)/Real_t(meshEdgeElems) ;
236 |       }
237 |       // ty += ds ;  // may accumulate roundoff... 
238 |       ty = Real_t(1.125)*Real_t(m_rowLoc*nx+row+1)/Real_t(meshEdgeElems) ;
239 |     }
240 |     // tz += ds ;  // may accumulate roundoff... 
241 |     tz = Real_t(1.125)*Real_t(m_planeLoc*nx+plane+1)/Real_t(meshEdgeElems) ;
242 |   }
243 | 
244 | 
245 |   // embed hexehedral elements in nodal point lattice 
246 |   Index_t zidx = 0 ;
247 |   nidx = 0 ;
248 |   for (Index_t plane=0; plane<edgeElems; ++plane) {
249 |     for (Index_t row=0; row<edgeElems; ++row) {
250 |       for (Index_t col=0; col<edgeElems; ++col) {
251 | 	Index_t *localNode = nodelist(zidx) ;
252 | 	localNode[0] = nidx                                       ;
253 | 	localNode[1] = nidx                                   + 1 ;
254 | 	localNode[2] = nidx                       + edgeNodes + 1 ;
255 | 	localNode[3] = nidx                       + edgeNodes     ;
256 | 	localNode[4] = nidx + edgeNodes*edgeNodes                 ;
257 | 	localNode[5] = nidx + edgeNodes*edgeNodes             + 1 ;
258 | 	localNode[6] = nidx + edgeNodes*edgeNodes + edgeNodes + 1 ;
259 | 	localNode[7] = nidx + edgeNodes*edgeNodes + edgeNodes     ;
260 | 	++zidx ;
261 | 	++nidx ;
262 |       }
263 |       ++nidx ;
264 |     }
265 |     nidx += edgeNodes ;
266 |   }
267 | }
268 | 
269 | 
270 | ////////////////////////////////////////////////////////////////////////////////
271 | void
272 | Domain::SetupThreadSupportStructures()
273 | {
274 | #if _OPENMP
275 |    Index_t numthreads = omp_get_max_threads();
276 | #else
277 |    Index_t numthreads = 1;
278 | #endif
279 | 
280 |   if (numthreads > 1) {
281 |     // set up node-centered indexing of elements 
282 |     Index_t *nodeElemCount = new Index_t[numNode()] ;
283 | 
284 |     for (Index_t i=0; i<numNode(); ++i) {
285 |       nodeElemCount[i] = 0 ;
286 |     }
287 | 
288 |     for (Index_t i=0; i<numElem(); ++i) {
289 |       Index_t *nl = nodelist(i) ;
290 |       for (Index_t j=0; j < 8; ++j) {
291 | 	++(nodeElemCount[nl[j]] );
292 |       }
293 |     }
294 | 
295 |     m_nodeElemStart = new Index_t[numNode()+1] ;
296 | 
297 |     m_nodeElemStart[0] = 0;
298 | 
299 |     for (Index_t i=1; i <= numNode(); ++i) {
300 |       m_nodeElemStart[i] =
301 | 	m_nodeElemStart[i-1] + nodeElemCount[i-1] ;
302 |     }
303 |        
304 |     m_nodeElemCornerList = new Index_t[m_nodeElemStart[numNode()]];
305 | 
306 |     for (Index_t i=0; i < numNode(); ++i) {
307 |       nodeElemCount[i] = 0;
308 |     }
309 | 
310 |     for (Index_t i=0; i < numElem(); ++i) {
311 |       Index_t *nl = nodelist(i) ;
312 |       for (Index_t j=0; j < 8; ++j) {
313 | 	Index_t m = nl[j];
314 | 	Index_t k = i*8 + j ;
315 | 	Index_t offset = m_nodeElemStart[m] + nodeElemCount[m] ;
316 | 	m_nodeElemCornerList[offset] = k;
317 | 	++(nodeElemCount[m]) ;
318 |       }
319 |     }
320 | 
321 |     Index_t clSize = m_nodeElemStart[numNode()] ;
322 |     for (Index_t i=0; i < clSize; ++i) {
323 |       Index_t clv = m_nodeElemCornerList[i] ;
324 |       if ((clv < 0) || (clv > numElem()*8)) {
325 | 	fprintf(stderr,
326 | 		"AllocateNodeElemIndexes(): nodeElemCornerList entry out of range!\n");
327 | #if USE_MPI
328 | 	MPI_Abort(MPI_COMM_WORLD, -1);
329 | #else
330 | 	exit(-1);
331 | #endif
332 |       }
333 |     }
334 | 
335 |     delete [] nodeElemCount ;
336 |   }
337 | }
338 | 
339 | 
340 | ////////////////////////////////////////////////////////////////////////////////
341 | void
342 | Domain::SetupCommBuffers(Int_t edgeNodes)
343 | {
344 |   // allocate a buffer large enough for nodal ghost data 
345 |   Index_t maxEdgeSize = MAX(this->sizeX(), MAX(this->sizeY(), this->sizeZ()))+1 ;
346 |   m_maxPlaneSize = CACHE_ALIGN_REAL(maxEdgeSize*maxEdgeSize) ;
347 |   m_maxEdgeSize = CACHE_ALIGN_REAL(maxEdgeSize) ;
348 | 
349 |   // assume communication to 6 neighbors by default 
350 |   m_rowMin = (m_rowLoc == 0)        ? 0 : 1;
351 |   m_rowMax = (m_rowLoc == m_tp-1)     ? 0 : 1;
352 |   m_colMin = (m_colLoc == 0)        ? 0 : 1;
353 |   m_colMax = (m_colLoc == m_tp-1)     ? 0 : 1;
354 |   m_planeMin = (m_planeLoc == 0)    ? 0 : 1;
355 |   m_planeMax = (m_planeLoc == m_tp-1) ? 0 : 1;
356 | 
357 | #if USE_MPI   
358 |   // account for face communication 
359 |   Index_t comBufSize =
360 |     (m_rowMin + m_rowMax + m_colMin + m_colMax + m_planeMin + m_planeMax) *
361 |     m_maxPlaneSize * MAX_FIELDS_PER_MPI_COMM ;
362 | 
363 |   // account for edge communication 
364 |   comBufSize +=
365 |     ((m_rowMin & m_colMin) + (m_rowMin & m_planeMin) + (m_colMin & m_planeMin) +
366 |      (m_rowMax & m_colMax) + (m_rowMax & m_planeMax) + (m_colMax & m_planeMax) +
367 |      (m_rowMax & m_colMin) + (m_rowMin & m_planeMax) + (m_colMin & m_planeMax) +
368 |      (m_rowMin & m_colMax) + (m_rowMax & m_planeMin) + (m_colMax & m_planeMin)) *
369 |     m_maxEdgeSize * MAX_FIELDS_PER_MPI_COMM ;
370 | 
371 |   // account for corner communication 
372 |   // factor of 16 is so each buffer has its own cache line 
373 |   comBufSize += ((m_rowMin & m_colMin & m_planeMin) +
374 | 		 (m_rowMin & m_colMin & m_planeMax) +
375 | 		 (m_rowMin & m_colMax & m_planeMin) +
376 | 		 (m_rowMin & m_colMax & m_planeMax) +
377 | 		 (m_rowMax & m_colMin & m_planeMin) +
378 | 		 (m_rowMax & m_colMin & m_planeMax) +
379 | 		 (m_rowMax & m_colMax & m_planeMin) +
380 | 		 (m_rowMax & m_colMax & m_planeMax)) * CACHE_COHERENCE_PAD_REAL ;
381 | 
382 |   this->commDataSend = new Real_t[comBufSize] ;
383 |   this->commDataRecv = new Real_t[comBufSize] ;
384 |   // prevent floating point exceptions 
385 |   memset(this->commDataSend, 0, comBufSize*sizeof(Real_t)) ;
386 |   memset(this->commDataRecv, 0, comBufSize*sizeof(Real_t)) ;
387 | #endif   
388 | 
389 |   // Boundary nodesets
390 |   if (m_colLoc == 0)
391 |     m_symmX.resize(edgeNodes*edgeNodes);
392 |   if (m_rowLoc == 0)
393 |     m_symmY.resize(edgeNodes*edgeNodes);
394 |   if (m_planeLoc == 0)
395 |     m_symmZ.resize(edgeNodes*edgeNodes);
396 | }
397 | 
398 | 
399 | ////////////////////////////////////////////////////////////////////////////////
400 | void
401 | Domain::CreateRegionIndexSets(Int_t nr, Int_t balance)
402 | {
403 | #if USE_MPI   
404 |    int myRank;
405 |    MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
406 |    srand(myRank);
407 | #else
408 |    srand(0);
409 |    Index_t myRank = 0;
410 | #endif
411 |    this->numReg() = nr;
412 |    m_regElemSize = new Index_t[numReg()];
413 |    m_regElemlist = new Index_t*[numReg()];
414 |    Index_t nextIndex = 0;
415 |    //if we only have one region just fill it
416 |    // Fill out the regNumList with material numbers, which are always
417 |    // the region index plus one 
418 |    if(numReg() == 1) {
419 |       while (nextIndex < numElem()) {
420 | 	 this->regNumList(nextIndex) = 1;
421 |          nextIndex++;
422 |       }
423 |       regElemSize(0) = 0;
424 |    }
425 |    //If we have more than one region distribute the elements.
426 |    else {
427 |       Int_t regionNum;
428 |       Int_t regionVar;
429 |       Int_t lastReg = -1;
430 |       Int_t binSize;
431 |       Index_t elements;
432 |       Index_t runto = 0;
433 |       Int_t costDenominator = 0;
434 |       Int_t* regBinEnd = new Int_t[numReg()];
435 |       //Determine the relative weights of all the regions.  This is based off the -b flag.  Balance is the value passed into b.  
436 |       for (Index_t i=0 ; i<numReg() ; ++i) {
437 |          regElemSize(i) = 0;
438 | 	 costDenominator += pow((i+1), balance);  //Total sum of all regions weights
439 | 	 regBinEnd[i] = costDenominator;  //Chance of hitting a given region is (regBinEnd[i] - regBinEdn[i-1])/costDenominator
440 |       }
441 |       //Until all elements are assigned
442 |       while (nextIndex < numElem()) {
443 | 	 //pick the region
444 | 	 regionVar = rand() % costDenominator;
445 | 	 Index_t i = 0;
446 |          while(regionVar >= regBinEnd[i])
447 | 	    i++;
448 |          //rotate the regions based on MPI rank.  Rotation is Rank % NumRegions this makes each domain have a different region with 
449 |          //the highest representation
450 | 	 regionNum = ((i + myRank) % numReg()) + 1;
451 | 	 // make sure we don't pick the same region twice in a row
452 |          while(regionNum == lastReg) {
453 | 	    regionVar = rand() % costDenominator;
454 | 	    i = 0;
455 |             while(regionVar >= regBinEnd[i])
456 | 	       i++;
457 | 	    regionNum = ((i + myRank) % numReg()) + 1;
458 |          }
459 | 	 //Pick the bin size of the region and determine the number of elements.
460 |          binSize = rand() % 1000;
461 | 	 if(binSize < 773) {
462 | 	   elements = rand() % 15 + 1;
463 | 	 }
464 | 	 else if(binSize < 937) {
465 | 	   elements = rand() % 16 + 16;
466 | 	 }
467 | 	 else if(binSize < 970) {
468 | 	   elements = rand() % 32 + 32;
469 | 	 }
470 | 	 else if(binSize < 974) {
471 | 	   elements = rand() % 64 + 64;
472 | 	 } 
473 | 	 else if(binSize < 978) {
474 | 	   elements = rand() % 128 + 128;
475 | 	 }
476 | 	 else if(binSize < 981) {
477 | 	   elements = rand() % 256 + 256;
478 | 	 }
479 | 	 else
480 | 	    elements = rand() % 1537 + 512;
481 | 	 runto = elements + nextIndex;
482 | 	 //Store the elements.  If we hit the end before we run out of elements then just stop.
483 |          while (nextIndex < runto && nextIndex < numElem()) {
484 | 	    this->regNumList(nextIndex) = regionNum;
485 | 	    nextIndex++;
486 | 	 }
487 | 	 lastReg = regionNum;
488 |       }
489 | 
490 |       delete [] regBinEnd; 
491 |    }
492 |    // Convert regNumList to region index sets
493 |    // First, count size of each region 
494 |    for (Index_t i=0 ; i<numElem() ; ++i) {
495 |       int r = this->regNumList(i)-1; // region index == regnum-1
496 |       regElemSize(r)++;
497 |    }
498 |    // Second, allocate each region index set
499 |    for (Index_t i=0 ; i<numReg() ; ++i) {
500 |       m_regElemlist[i] = new Index_t[regElemSize(i)];
501 |       regElemSize(i) = 0;
502 |    }
503 |    // Third, fill index sets
504 |    for (Index_t i=0 ; i<numElem() ; ++i) {
505 |       Index_t r = regNumList(i)-1;       // region index == regnum-1
506 |       Index_t regndx = regElemSize(r)++; // Note increment
507 |       regElemlist(r,regndx) = i;
508 |    }
509 |    
510 | }
511 | 
512 | /////////////////////////////////////////////////////////////
513 | void 
514 | Domain::SetupSymmetryPlanes(Int_t edgeNodes)
515 | {
516 |   Index_t nidx = 0 ;
517 |   for (Index_t i=0; i<edgeNodes; ++i) {
518 |     Index_t planeInc = i*edgeNodes*edgeNodes ;
519 |     Index_t rowInc   = i*edgeNodes ;
520 |     for (Index_t j=0; j<edgeNodes; ++j) {
521 |       if (m_planeLoc == 0) {
522 | 	m_symmZ[nidx] = rowInc   + j ;
523 |       }
524 |       if (m_rowLoc == 0) {
525 | 	m_symmY[nidx] = planeInc + j ;
526 |       }
527 |       if (m_colLoc == 0) {
528 | 	m_symmX[nidx] = planeInc + j*edgeNodes ;
529 |       }
530 |       ++nidx ;
531 |     }
532 |   }
533 | }
534 | 
535 | 
536 | 
537 | /////////////////////////////////////////////////////////////
538 | void
539 | Domain::SetupElementConnectivities(Int_t edgeElems)
540 | {
541 |    lxim(0) = 0 ;
542 |    for (Index_t i=1; i<numElem(); ++i) {
543 |       lxim(i)   = i-1 ;
544 |       lxip(i-1) = i ;
545 |    }
546 |    lxip(numElem()-1) = numElem()-1 ;
547 | 
548 |    for (Index_t i=0; i<edgeElems; ++i) {
549 |       letam(i) = i ; 
550 |       letap(numElem()-edgeElems+i) = numElem()-edgeElems+i ;
551 |    }
552 |    for (Index_t i=edgeElems; i<numElem(); ++i) {
553 |       letam(i) = i-edgeElems ;
554 |       letap(i-edgeElems) = i ;
555 |    }
556 | 
557 |    for (Index_t i=0; i<edgeElems*edgeElems; ++i) {
558 |       lzetam(i) = i ;
559 |       lzetap(numElem()-edgeElems*edgeElems+i) = numElem()-edgeElems*edgeElems+i ;
560 |    }
561 |    for (Index_t i=edgeElems*edgeElems; i<numElem(); ++i) {
562 |       lzetam(i) = i - edgeElems*edgeElems ;
563 |       lzetap(i-edgeElems*edgeElems) = i ;
564 |    }
565 | }
566 | 
567 | /////////////////////////////////////////////////////////////
568 | void
569 | Domain::SetupBoundaryConditions(Int_t edgeElems) 
570 | {
571 |   Index_t ghostIdx[6] ;  // offsets to ghost locations
572 | 
573 |   // set up boundary condition information
574 |   for (Index_t i=0; i<numElem(); ++i) {
575 |      elemBC(i) = Int_t(0) ;
576 |   }
577 | 
578 |   for (Index_t i=0; i<6; ++i) {
579 |     ghostIdx[i] = INT_MIN ;
580 |   }
581 | 
582 |   Int_t pidx = numElem() ;
583 |   if (m_planeMin != 0) {
584 |     ghostIdx[0] = pidx ;
585 |     pidx += sizeX()*sizeY() ;
586 |   }
587 | 
588 |   if (m_planeMax != 0) {
589 |     ghostIdx[1] = pidx ;
590 |     pidx += sizeX()*sizeY() ;
591 |   }
592 | 
593 |   if (m_rowMin != 0) {
594 |     ghostIdx[2] = pidx ;
595 |     pidx += sizeX()*sizeZ() ;
596 |   }
597 | 
598 |   if (m_rowMax != 0) {
599 |     ghostIdx[3] = pidx ;
600 |     pidx += sizeX()*sizeZ() ;
601 |   }
602 | 
603 |   if (m_colMin != 0) {
604 |     ghostIdx[4] = pidx ;
605 |     pidx += sizeY()*sizeZ() ;
606 |   }
607 | 
608 |   if (m_colMax != 0) {
609 |     ghostIdx[5] = pidx ;
610 |   }
611 | 
612 |   // symmetry plane or free surface BCs 
613 |   for (Index_t i=0; i<edgeElems; ++i) {
614 |     Index_t planeInc = i*edgeElems*edgeElems ;
615 |     Index_t rowInc   = i*edgeElems ;
616 |     for (Index_t j=0; j<edgeElems; ++j) {
617 |       if (m_planeLoc == 0) {
618 | 	elemBC(rowInc+j) |= ZETA_M_SYMM ;
619 |       }
620 |       else {
621 | 	elemBC(rowInc+j) |= ZETA_M_COMM ;
622 | 	lzetam(rowInc+j) = ghostIdx[0] + rowInc + j ;
623 |       }
624 | 
625 |       if (m_planeLoc == m_tp-1) {
626 | 	elemBC(rowInc+j+numElem()-edgeElems*edgeElems) |=
627 | 	  ZETA_P_FREE;
628 |       }
629 |       else {
630 | 	elemBC(rowInc+j+numElem()-edgeElems*edgeElems) |=
631 | 	  ZETA_P_COMM ;
632 | 	lzetap(rowInc+j+numElem()-edgeElems*edgeElems) =
633 | 	  ghostIdx[1] + rowInc + j ;
634 |       }
635 | 
636 |       if (m_rowLoc == 0) {
637 | 	elemBC(planeInc+j) |= ETA_M_SYMM ;
638 |       }
639 |       else {
640 | 	elemBC(planeInc+j) |= ETA_M_COMM ;
641 | 	letam(planeInc+j) = ghostIdx[2] + rowInc + j ;
642 |       }
643 | 
644 |       if (m_rowLoc == m_tp-1) {
645 | 	elemBC(planeInc+j+edgeElems*edgeElems-edgeElems) |= 
646 | 	  ETA_P_FREE ;
647 |       }
648 |       else {
649 | 	elemBC(planeInc+j+edgeElems*edgeElems-edgeElems) |= 
650 | 	  ETA_P_COMM ;
651 | 	letap(planeInc+j+edgeElems*edgeElems-edgeElems) =
652 | 	  ghostIdx[3] +  rowInc + j ;
653 |       }
654 | 
655 |       if (m_colLoc == 0) {
656 | 	elemBC(planeInc+j*edgeElems) |= XI_M_SYMM ;
657 |       }
658 |       else {
659 | 	elemBC(planeInc+j*edgeElems) |= XI_M_COMM ;
660 | 	lxim(planeInc+j*edgeElems) = ghostIdx[4] + rowInc + j ;
661 |       }
662 | 
663 |       if (m_colLoc == m_tp-1) {
664 | 	elemBC(planeInc+j*edgeElems+edgeElems-1) |= XI_P_FREE ;
665 |       }
666 |       else {
667 | 	elemBC(planeInc+j*edgeElems+edgeElems-1) |= XI_P_COMM ;
668 | 	lxip(planeInc+j*edgeElems+edgeElems-1) =
669 | 	  ghostIdx[5] + rowInc + j ;
670 |       }
671 |     }
672 |   }
673 | }
674 | 
675 | ///////////////////////////////////////////////////////////////////////////
676 | void InitMeshDecomp(Int_t numRanks, Int_t myRank,
677 |                     Int_t *col, Int_t *row, Int_t *plane, Int_t *side)
678 | {
679 |    Int_t testProcs;
680 |    Int_t dx, dy, dz;
681 |    Int_t myDom;
682 |    
683 |    // Assume cube processor layout for now 
684 |    testProcs = Int_t(cbrt(Real_t(numRanks))+0.5) ;
685 |    if (testProcs*testProcs*testProcs != numRanks) {
686 |       printf("Num processors must be a cube of an integer (1, 8, 27, ...)\n") ;
687 | #if USE_MPI      
688 |       MPI_Abort(MPI_COMM_WORLD, -1) ;
689 | #else
690 |       exit(-1);
691 | #endif
692 |    }
693 |    if (sizeof(Real_t) != 4 && sizeof(Real_t) != 8) {
694 |       printf("MPI operations only support float and double right now...\n");
695 | #if USE_MPI      
696 |       MPI_Abort(MPI_COMM_WORLD, -1) ;
697 | #else
698 |       exit(-1);
699 | #endif
700 |    }
701 |    if (MAX_FIELDS_PER_MPI_COMM > CACHE_COHERENCE_PAD_REAL) {
702 |       printf("corner element comm buffers too small.  Fix code.\n") ;
703 | #if USE_MPI      
704 |       MPI_Abort(MPI_COMM_WORLD, -1) ;
705 | #else
706 |       exit(-1);
707 | #endif
708 |    }
709 | 
710 |    dx = testProcs ;
711 |    dy = testProcs ;
712 |    dz = testProcs ;
713 | 
714 |    // temporary test
715 |    if (dx*dy*dz != numRanks) {
716 |       printf("error -- must have as many domains as procs\n") ;
717 | #if USE_MPI      
718 |       MPI_Abort(MPI_COMM_WORLD, -1) ;
719 | #else
720 |       exit(-1);
721 | #endif
722 |    }
723 |    Int_t remainder = dx*dy*dz % numRanks ;
724 |    if (myRank < remainder) {
725 |       myDom = myRank*( 1+ (dx*dy*dz / numRanks)) ;
726 |    }
727 |    else {
728 |       myDom = remainder*( 1+ (dx*dy*dz / numRanks)) +
729 |          (myRank - remainder)*(dx*dy*dz/numRanks) ;
730 |    }
731 | 
732 |    *col = myDom % dx ;
733 |    *row = (myDom / dx) % dy ;
734 |    *plane = myDom / (dx*dy) ;
735 |    *side = testProcs;
736 | 
737 |    return;
738 | }
739 | 
740 | 


--------------------------------------------------------------------------------
/lulesh-util.cc:
--------------------------------------------------------------------------------
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <ctype.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>
#if USE_MPI
#include <mpi.h>
#endif
#include "lulesh.h"
 11 | 
 12 | /* Helper function for converting strings to ints, with error checking */
 13 | template<typename IntT>
 14 | int StrToInt(const char *token, IntT *retVal)
 15 | {
 16 |    const char *c ;
 17 |    char *endptr ;
 18 |    const int decimal_base = 10 ;
 19 | 
 20 |    if (token == NULL)
 21 |       return 0 ;
 22 |    
 23 |    c = token ;
 24 |    *retVal = strtol(c, &endptr, decimal_base) ;
 25 |    if((endptr != c) && ((*endptr == ' ') || (*endptr == '\0')))
 26 |       return 1 ;
 27 |    else
 28 |       return 0 ;
 29 | }
 30 | 
 31 | static void PrintCommandLineOptions(char *execname, int myRank)
 32 | {
 33 |    if (myRank == 0) {
 34 | 
 35 |       printf("Usage: %s [opts]\n", execname);
 36 |       printf(" where [opts] is one or more of:\n");
 37 |       printf(" -q              : quiet mode - suppress all stdout\n");
 38 |       printf(" -i <iterations> : number of cycles to run\n");
 39 |       printf(" -s <size>       : length of cube mesh along side\n");
 40 |       printf(" -r <numregions> : Number of distinct regions (def: 11)\n");
 41 |       printf(" -b <balance>    : Load balance between regions of a domain (def: 1)\n");
 42 |       printf(" -c <cost>       : Extra cost of more expensive regions (def: 1)\n");
 43 |       printf(" -f <numfiles>   : Number of files to split viz dump into (def: (np+10)/9)\n");
 44 |       printf(" -p              : Print out progress\n");
 45 |       printf(" -v              : Output viz file (requires compiling with -DVIZ_MESH\n");
 46 |       printf(" -h              : This message\n");
 47 |       printf("\n\n");
 48 |    }
 49 | }
 50 | 
 51 | static void ParseError(const char *message, int myRank)
 52 | {
 53 |    if (myRank == 0) {
 54 |       printf("%s\n", message);
 55 | #if USE_MPI      
 56 |       MPI_Abort(MPI_COMM_WORLD, -1);
 57 | #else
 58 |       exit(-1);
 59 | #endif
 60 |    }
 61 | }
 62 | 
 63 | void ParseCommandLineOptions(int argc, char *argv[],
 64 |                              Int_t myRank, struct cmdLineOpts *opts)
 65 | {
 66 |    if(argc > 1) {
 67 |       int i = 1;
 68 | 
 69 |       while(i < argc) {
 70 |          int ok;
 71 |          /* -i <iterations> */
 72 |          if(strcmp(argv[i], "-i") == 0) {
 73 |             if (i+1 >= argc) {
 74 |                ParseError("Missing integer argument to -i", myRank);
 75 |             }
 76 |             ok = StrToInt(argv[i+1], &(opts->its));
 77 |             if(!ok) {
 78 |                ParseError("Parse Error on option -i integer value required after argument\n", myRank);
 79 |             }
 80 |             i+=2;
 81 |          }
 82 |          /* -s <size, sidelength> */
 83 |          else if(strcmp(argv[i], "-s") == 0) {
 84 |             if (i+1 >= argc) {
 85 |                ParseError("Missing integer argument to -s\n", myRank);
 86 |             }
 87 |             ok = StrToInt(argv[i+1], &(opts->nx));
 88 |             if(!ok) {
 89 |                ParseError("Parse Error on option -s integer value required after argument\n", myRank);
 90 |             }
 91 |             i+=2;
 92 |          }
 93 | 	 /* -r <numregions> */
 94 |          else if (strcmp(argv[i], "-r") == 0) {
 95 |             if (i+1 >= argc) {
 96 |                ParseError("Missing integer argument to -r\n", myRank);
 97 |             }
 98 |             ok = StrToInt(argv[i+1], &(opts->numReg));
 99 |             if (!ok) {
100 |                ParseError("Parse Error on option -r integer value required after argument\n", myRank);
101 |             }
102 |             i+=2;
103 |          }
104 | 	 /* -f <numfilepieces> */
105 |          else if (strcmp(argv[i], "-f") == 0) {
106 |             if (i+1 >= argc) {
107 |                ParseError("Missing integer argument to -f\n", myRank);
108 |             }
109 |             ok = StrToInt(argv[i+1], &(opts->numFiles));
110 |             if (!ok) {
111 |                ParseError("Parse Error on option -f integer value required after argument\n", myRank);
112 |             }
113 |             i+=2;
114 |          }
115 |          /* -p */
116 |          else if (strcmp(argv[i], "-p") == 0) {
117 |             opts->showProg = 1;
118 |             i++;
119 |          }
120 |          /* -q */
121 |          else if (strcmp(argv[i], "-q") == 0) {
122 |             opts->quiet = 1;
123 |             i++;
124 |          }
125 |          else if (strcmp(argv[i], "-b") == 0) {
126 |             if (i+1 >= argc) {
127 |                ParseError("Missing integer argument to -b\n", myRank);
128 |             }
129 |             ok = StrToInt(argv[i+1], &(opts->balance));
130 |             if (!ok) {
131 |                ParseError("Parse Error on option -b integer value required after argument\n", myRank);
132 |             }
133 |             i+=2;
134 |          }
135 |          else if (strcmp(argv[i], "-c") == 0) {
136 |             if (i+1 >= argc) {
137 |                ParseError("Missing integer argument to -c\n", myRank);
138 |             }
139 |             ok = StrToInt(argv[i+1], &(opts->cost));
140 |             if (!ok) {
141 |                ParseError("Parse Error on option -c integer value required after argument\n", myRank);
142 |             }
143 |             i+=2;
144 |          }
145 |          /* -v */
146 |          else if (strcmp(argv[i], "-v") == 0) {
147 | #if VIZ_MESH            
148 |             opts->viz = 1;
149 | #else
150 |             ParseError("Use of -v requires compiling with -DVIZ_MESH\n", myRank);
151 | #endif
152 |             i++;
153 |          }
154 |          /* -h */
155 |          else if (strcmp(argv[i], "-h") == 0) {
156 |             PrintCommandLineOptions(argv[0], myRank);
157 | #if USE_MPI            
158 |             MPI_Abort(MPI_COMM_WORLD, 0);
159 | #else
160 |             exit(0);
161 | #endif
162 |          }
163 |          else {
164 |             char msg[80];
165 |             PrintCommandLineOptions(argv[0], myRank);
166 |             sprintf(msg, "ERROR: Unknown command line argument: %s\n", argv[i]);
167 |             ParseError(msg, myRank);
168 |          }
169 |       }
170 |    }
171 | }
172 | 
173 | /////////////////////////////////////////////////////////////////////
174 | 
175 | void VerifyAndWriteFinalOutput(Real_t elapsed_time,
176 |                                Domain& locDom,
177 |                                Int_t nx,
178 |                                Int_t numRanks)
179 | {
180 |    // GrindTime1 only takes a single domain into account, and is thus a good way to measure
181 |    // processor speed indepdendent of MPI parallelism.
182 |    // GrindTime2 takes into account speedups from MPI parallelism.
183 |    // Cast to 64-bit integer to avoid overflows.
184 |    Int8_t nx8 = nx;
185 |    Real_t grindTime1 = ((elapsed_time*1e6)/locDom.cycle())/(nx8*nx8*nx8);
186 |    Real_t grindTime2 = ((elapsed_time*1e6)/locDom.cycle())/(nx8*nx8*nx8*numRanks);
187 | 
188 |    Index_t ElemId = 0;
189 |    std::cout << "Run completed:\n";
190 |    std::cout << "   Problem size        =  " << nx       << "\n";
191 |    std::cout << "   MPI tasks           =  " << numRanks << "\n";
192 |    std::cout << "   Iteration count     =  " << locDom.cycle() << "\n";
193 |    std::cout << "   Final Origin Energy =  ";
194 |    std::cout << std::scientific << std::setprecision(6);
195 |    std::cout << std::setw(12) << locDom.e(ElemId) << "\n";
196 | 
197 |    Real_t   MaxAbsDiff = Real_t(0.0);
198 |    Real_t TotalAbsDiff = Real_t(0.0);
199 |    Real_t   MaxRelDiff = Real_t(0.0);
200 | 
201 |    for (Index_t j=0; j<nx; ++j) {
202 |       for (Index_t k=j+1; k<nx; ++k) {
203 |          Real_t AbsDiff = FABS(locDom.e(j*nx+k)-locDom.e(k*nx+j));
204 |          TotalAbsDiff  += AbsDiff;
205 | 
206 |          if (MaxAbsDiff <AbsDiff) MaxAbsDiff = AbsDiff;
207 | 
208 |          Real_t RelDiff = AbsDiff / locDom.e(k*nx+j);
209 | 
210 |          if (MaxRelDiff <RelDiff)  MaxRelDiff = RelDiff;
211 |       }
212 |    }
213 | 
214 |    // Quick symmetry check
215 |    std::cout << "   Testing Plane 0 of Energy Array on rank 0:\n";
216 |    std::cout << "        MaxAbsDiff   = " << std::setw(12) << MaxAbsDiff   << "\n";
217 |    std::cout << "        TotalAbsDiff = " << std::setw(12) << TotalAbsDiff << "\n";
218 |    std::cout << "        MaxRelDiff   = " << std::setw(12) << MaxRelDiff   << "\n";
219 | 
220 |    // Timing information
221 |    std::cout.unsetf(std::ios_base::floatfield);
222 |    std::cout << std::setprecision(2);
223 |    std::cout << "\nElapsed time         = " << std::setw(10) << elapsed_time << " (s)\n";
224 |    std::cout << std::setprecision(8);
225 |    std::cout << "Grind time (us/z/c)  = "  << std::setw(10) << grindTime1 << " (per dom)  ("
226 |              << std::setw(10) << elapsed_time << " overall)\n";
227 |    std::cout << "FOM                  = " << std::setw(10) << 1000.0/grindTime2 << " (z/s)\n\n";
228 | 
229 |    return ;
230 | }
231 | 


--------------------------------------------------------------------------------
/lulesh-viz.cc:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <string.h>
  3 | #include <stdlib.h>
  4 | #include <math.h>
  5 | #include "lulesh.h"
  6 | 
  7 | #ifdef VIZ_MESH
  8 | 
  9 | #ifdef __cplusplus
 10 |   extern "C" {
 11 | #endif
 12 | #include "silo.h"
 13 | #if USE_MPI
 14 | # include "pmpio.h"
 15 | #endif
 16 | #ifdef __cplusplus
 17 |   }
 18 | #endif
 19 | 
// Function prototypes
// Writes one domain's mesh data into an open Silo file.
static void 
DumpDomainToVisit(DBfile *db, Domain& domain, int myRank);
// NOTE(review): the bare `static` below has no declarator of its own. After
// preprocessing it attaches to whichever DumpMultiblockObjects prototype the
// USE_MPI test selects, which appears to contradict the g++ remark in the
// MPI branch -- confirm the intended linkage against the definition.
static


#if USE_MPI
// For some reason, earlier versions of g++ (e.g. 4.2) won't let me
// put the 'static' qualifier on this prototype, even if it's done
// consistently in the prototype and definition
void
DumpMultiblockObjects(DBfile *db, PMPIO_baton_t *bat, 
                      char basename[], int numRanks);

// Callback prototypes for PMPIO interface (only useful if we're
// running parallel)
// These three are handed to PMPIO_Init() in DumpToVisit().
static void *
LULESH_PMPIO_Create(const char *fname,
		     const char *dname,
		     void *udata);
static void *
LULESH_PMPIO_Open(const char *fname,
		   const char *dname,
		   PMPIO_iomode_t ioMode,
		   void *udata);
static void
LULESH_PMPIO_Close(void *file, void *udata);

#else
// Serial build: same job as the MPI variant, without the PMPIO baton.
void
DumpMultiblockObjects(DBfile *db, char basename[], int numRanks);
#endif
 52 | 
 53 | /**********************************************************************/
 54 | // Writes the Silo plot dump "lulesh_plot_c<cycle>" for the current cycle;
 55 | // this rank's data is stored under the directory "data_<myRank>".  MPI
 56 | // builds spread the ranks over numFiles files using PMPIO baton passing.
 57 | void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks) 
 58 | {
 59 |   char subdirName[32];
 60 |   char basename[32];
 61 |   DBfile *db;
 62 | 
 63 |   snprintf(basename, sizeof(basename), "lulesh_plot_c%d", domain.cycle());
 64 |   snprintf(subdirName, sizeof(subdirName), "data_%d", myRank);
 65 | 
 66 | #if USE_MPI
 67 |   PMPIO_baton_t *bat = PMPIO_Init(numFiles,
 68 |                                   PMPIO_WRITE,
 69 |                                   MPI_COMM_WORLD,
 70 |                                   10101,
 71 |                                   LULESH_PMPIO_Create,
 72 |                                   LULESH_PMPIO_Open,
 73 |                                   LULESH_PMPIO_Close,
 74 |                                   NULL);
 75 |   int myiorank = PMPIO_GroupRank(bat, myRank);
 76 | 
 77 |   // I/O group 0 writes the base file; other groups add a ".NNN" suffix
 78 |   char fileName[64];
 79 |   if (myiorank == 0)
 80 |     snprintf(fileName, sizeof(fileName), "%s", basename);
 81 |   else
 82 |     snprintf(fileName, sizeof(fileName), "%s.%03d", basename, myiorank);
 83 | 
 84 |   // NULL means the create/open callback could not provide a file
 85 |   db = (DBfile*)PMPIO_WaitForBaton(bat, fileName, subdirName);
 86 |   if (db) {
 87 |     DumpDomainToVisit(db, domain, myRank);
 88 |     // Processor 0 writes out bit of extra data to its file that
 89 |     // describes how to stitch all the pieces together
 90 |     if (myRank == 0) {
 91 |       DumpMultiblockObjects(db, bat, basename, numRanks);
 92 |     }
 93 |   }
 94 |   else {
 95 |     printf("Error writing out viz file - rank %d\n", myRank);
 96 |   }
 97 | 
 98 |   // Hand the baton on even after a failure so later ranks are not left waiting
 99 |   PMPIO_HandOffBaton(bat, db);
100 |   PMPIO_Finish(bat);
101 | #else
102 |   db = (DBfile*)DBCreate(basename, DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5X);
103 |   if (db) {
104 |      DBMkDir(db, subdirName);
105 |      DBSetDir(db, subdirName);
106 |      DumpDomainToVisit(db, domain, myRank);
107 |      DumpMultiblockObjects(db, basename, numRanks);
108 |      DBClose(db);
109 |   }
110 |   else {
111 |      printf("Error writing out viz file - rank %d\n", myRank);
112 |   }
113 | #endif
114 | }
115 | 
116 | 
117 | 
118 | /**********************************************************************/
119 | // Writes this domain's mesh, region material and field variables to db under relative names.
120 | static void 
121 | DumpDomainToVisit(DBfile *db, Domain& domain, int myRank)
122 | {
123 |    int ok = 0;   // running sum of Silo return codes, checked at the end
124 |    
125 |    /* Create an option list that will give some hints to VisIt for
126 |     * printing out the cycle and time in the annotations */
127 |    DBoptlist *optlist;
128 | 
129 | 
130 |    /* Write out the mesh connectivity in fully unstructured format */
131 |    int shapetype[1] = {DB_ZONETYPE_HEX};
132 |    int shapesize[1] = {8};
133 |    int shapecnt[1] = {domain.numElem()};
134 |    int *conn = new int[domain.numElem()*8] ;
135 |    int ci = 0 ;
136 |    for (int ei=0; ei < domain.numElem(); ++ei) {
137 |       Index_t *elemToNode = domain.nodelist(ei) ;
138 |       for (int ni=0; ni < 8; ++ni) {
139 |          conn[ci++] = elemToNode[ni] ;
140 |       }
141 |    }
142 |    ok += DBPutZonelist2(db, "connectivity", domain.numElem(), 3,
143 |                         conn, domain.numElem()*8,
144 |                         0,0,0, /* Not carrying ghost zones */
145 |                         shapetype, shapesize, shapecnt,
146 |                         1, NULL);
147 |    delete [] conn ;
148 | 
149 |    /* Write out the mesh coordinates associated with the mesh */
150 |    const char* coordnames[3] = {"X", "Y", "Z"};
151 |    float *coords[3] ;   // float copies of the node coordinates
152 |    coords[0] = new float[domain.numNode()] ;
153 |    coords[1] = new float[domain.numNode()] ;
154 |    coords[2] = new float[domain.numNode()] ;
155 |    for (int ni=0; ni < domain.numNode() ; ++ni) {
156 |       coords[0][ni] = float(domain.x(ni)) ;
157 |       coords[1][ni] = float(domain.y(ni)) ;
158 |       coords[2][ni] = float(domain.z(ni)) ;
159 |    }
160 |    optlist = DBMakeOptlist(2);
161 |    ok += DBAddOption(optlist, DBOPT_DTIME, &domain.time());
162 |    ok += DBAddOption(optlist, DBOPT_CYCLE, &domain.cycle());
163 |    ok += DBPutUcdmesh(db, "mesh", 3, (char**)&coordnames[0], (float**)coords,
164 |                       domain.numNode(), domain.numElem(), "connectivity",
165 |                       0, DB_FLOAT, optlist);
166 |    ok += DBFreeOptlist(optlist);
167 |    delete [] coords[2] ;
168 |    delete [] coords[1] ;
169 |    delete [] coords[0] ;
170 | 
171 |    /* Write out the materials */
172 |    int *matnums = new int[domain.numReg()];
173 |    int dims[1] = {domain.numElem()}; // No mixed elements
174 |    for(int i=0 ; i<domain.numReg() ; ++i)
175 |       matnums[i] = i+1;   // material numbers are 1..numReg
176 |    
177 |    ok += DBPutMaterial(db, "regions", "mesh", domain.numReg(),
178 |                        matnums, domain.regNumList(), dims, 1,
179 |                        NULL, NULL, NULL, NULL, 0, DB_FLOAT, NULL);
180 |    delete [] matnums;
181 | 
182 |    /* Write out pressure, energy, relvol, q */
183 |    // Each zone-centered field is copied into a temporary float array first
184 |    float *e = new float[domain.numElem()] ; 
185 |    for (int ei=0; ei < domain.numElem(); ++ei) {
186 |       e[ei] = float(domain.e(ei)) ;
187 |    }
188 |    ok += DBPutUcdvar1(db, "e", "mesh", e,
189 |                       domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
190 |                       NULL);
191 |    delete [] e ;
192 | 
193 | 
194 |    float *p = new float[domain.numElem()] ; 
195 |    for (int ei=0; ei < domain.numElem(); ++ei) {
196 |       p[ei] = float(domain.p(ei)) ;
197 |    }
198 |    ok += DBPutUcdvar1(db, "p", "mesh", p,
199 |                       domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
200 |                       NULL);
201 |    delete [] p ;
202 | 
203 |    float *v = new float[domain.numElem()] ; 
204 |    for (int ei=0; ei < domain.numElem(); ++ei) {
205 |       v[ei] = float(domain.v(ei)) ;
206 |    }
207 |    ok += DBPutUcdvar1(db, "v", "mesh", v,
208 |                       domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
209 |                       NULL);
210 |    delete [] v ;
211 | 
212 |    float *q = new float[domain.numElem()] ; 
213 |    for (int ei=0; ei < domain.numElem(); ++ei) {
214 |       q[ei] = float(domain.q(ei)) ;
215 |    }
216 |    ok += DBPutUcdvar1(db, "q", "mesh", q,
217 |                       domain.numElem(), NULL, 0, DB_FLOAT, DB_ZONECENT,
218 |                       NULL);
219 |    delete [] q ;
220 | 
221 |    /* Write out nodal speed, velocities */
222 |    float *zd    = new float[domain.numNode()];
223 |    float *yd    = new float[domain.numNode()];
224 |    float *xd    = new float[domain.numNode()];
225 |    float *speed = new float[domain.numNode()];
226 |    for(int ni=0 ; ni < domain.numNode() ; ++ni) {
227 |       xd[ni]    = float(domain.xd(ni));
228 |       yd[ni]    = float(domain.yd(ni));
229 |       zd[ni]    = float(domain.zd(ni));
230 |       speed[ni] = float(sqrt((xd[ni]*xd[ni])+(yd[ni]*yd[ni])+(zd[ni]*zd[ni])));
231 |    }
232 |    // speed is the velocity magnitude, computed from the float copies
233 |    ok += DBPutUcdvar1(db, "speed", "mesh", speed,
234 |                       domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
235 |                       NULL);
236 |    delete [] speed;
237 | 
238 | 
239 |    ok += DBPutUcdvar1(db, "xd", "mesh", xd,
240 |                       domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
241 |                       NULL);
242 |    delete [] xd ;
243 | 
244 |    ok += DBPutUcdvar1(db, "yd", "mesh", yd,
245 |                       domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
246 |                       NULL);
247 |    delete [] yd ;
248 | 
249 |    ok += DBPutUcdvar1(db, "zd", "mesh", zd,
250 |                       domain.numNode(), NULL, 0, DB_FLOAT, DB_NODECENT,
251 |                       NULL);
252 |    delete [] zd ;
253 | 
254 |    // Any non-zero total means at least one Silo call reported a failure
255 |    if (ok != 0) {
256 |       printf("Error writing out viz file - rank %d\n", myRank);
257 |    }
258 | }
259 | 
260 | /**********************************************************************/
261 | // Writes the multimesh, multimat and multivar objects into the root
262 | // directory of db.  Each entry names one rank's "/data_<rank>" block and is
263 | // prefixed with "<basename>.NNN:" when that block is in another group's file.
264 | #if USE_MPI     
265 | void
266 |    DumpMultiblockObjects(DBfile *db, PMPIO_baton_t *bat, 
267 |                          char basename[], int numRanks)
268 | #else
269 | void
270 |   DumpMultiblockObjects(DBfile *db, char basename[], int numRanks)
271 | #endif
272 | {
273 |    /* MULTIBLOCK objects to tie together multiple files */
274 |   char **multimeshObjs;
275 |   char **multimatObjs;
276 |   char ***multivarObjs;
277 |   int *blockTypes;
278 |   int *varTypes;
279 |   int ok = 0;
280 |   // Make sure this list matches what's written out above
281 |   char vars[][10] = {"p","e","v","q", "speed", "xd", "yd", "zd"};
282 |   int numvars = sizeof(vars)/sizeof(vars[0]);
283 | 
284 |   // Reset to the root directory of the silo file
285 |   DBSetDir(db, "/");
286 | 
287 |   // Allocate a bunch of space for building up the string names
288 |   multimeshObjs = new char*[numRanks];
289 |   multimatObjs = new char*[numRanks];
290 |   multivarObjs = new char**[numvars];
291 |   blockTypes = new int[numRanks];
292 |   varTypes = new int[numRanks];
293 | 
294 |   for(int v=0 ; v<numvars ; ++v) {
295 |      multivarObjs[v] = new char*[numRanks];
296 |   }
297 | 
298 |   for(int i=0 ; i<numRanks ; ++i) {
299 |      multimeshObjs[i] = new char[64];
300 |      multimatObjs[i] = new char[64];
301 |      for(int v=0 ; v<numvars ; ++v) {
302 |         multivarObjs[v][i] = new char[64];
303 |      }
304 |      blockTypes[i] = DB_UCDMESH;
305 |      varTypes[i] = DB_UCDVAR;
306 |   }
307 | 
308 |   // Build up the multiobject names
309 |   for(int i=0 ; i<numRanks ; ++i) {
310 | #if USE_MPI     
311 |     int iorank = PMPIO_GroupRank(bat, i);
312 | #else
313 |     int iorank = 0;
314 | #endif
315 | 
316 |     if (iorank == 0) {
317 |       snprintf(multimeshObjs[i], 64, "/data_%d/mesh", i);
318 |       snprintf(multimatObjs[i], 64, "/data_%d/regions",i);
319 |       for(int v=0 ; v<numvars ; ++v) {
320 |         snprintf(multivarObjs[v][i], 64, "/data_%d/%s", i, vars[v]);
321 |       }
322 |     }
323 |     else {
324 |       snprintf(multimeshObjs[i], 64, "%s.%03d:/data_%d/mesh",
325 |                basename, iorank, i);
326 |       snprintf(multimatObjs[i], 64, "%s.%03d:/data_%d/regions", 
327 |                basename, iorank, i);
328 |       for(int v=0 ; v<numvars ; ++v) {
329 |          snprintf(multivarObjs[v][i], 64, "%s.%03d:/data_%d/%s", 
330 |                   basename, iorank, i, vars[v]);
331 |       }
332 |     }
333 |   }
334 | 
335 |   // Now write out the objects
336 |   ok += DBPutMultimesh(db, "mesh", numRanks,
337 |                        (char**)multimeshObjs, blockTypes, NULL);
338 |   ok += DBPutMultimat(db, "regions", numRanks,
339 |                       (char**)multimatObjs, NULL);
340 |   for(int v=0 ; v<numvars ; ++v) {
341 |      ok += DBPutMultivar(db, vars[v], numRanks,
342 |                          (char**)multivarObjs[v], varTypes, NULL);
343 |   }
344 | 
345 |   // Clean up: every buffer came from new[], so it must be released with delete[]
346 |   for(int v=0; v < numvars; ++v) {
347 |     for(int i = 0; i < numRanks; i++) {
348 |       delete [] multivarObjs[v][i];
349 |     }
350 |     delete [] multivarObjs[v];
351 |   }
352 | 
353 |   for(int i=0 ; i<numRanks ; i++) {
354 |     delete [] multimeshObjs[i];
355 |     delete [] multimatObjs[i];
356 |   }
357 |   delete [] multimeshObjs;
358 |   delete [] multimatObjs;
359 |   delete [] multivarObjs;
360 |   delete [] blockTypes;
361 |   delete [] varTypes;
362 | 
363 |   if (ok != 0) {
364 |     printf("Error writing out multiXXX objs to viz file - rank 0\n");
365 |   }
366 | }
367 | 
368 | # if USE_MPI
369 | 
370 | /**********************************************************************/
371 | // PMPIO "create" callback: makes the Silo file and enters directory dname.
372 | static void *
373 | LULESH_PMPIO_Create(const char *fname,
374 |                     const char *dname,
375 |                     void *udata)
376 | {
377 |    DBfile *siloFile = DBCreate(fname, DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5X);
378 |    if (siloFile == NULL) {
379 |      return NULL;   // the file could not be created; nothing to set up
380 |    }
381 | 
382 |    /* Put the data in a subdirectory, so VisIt only sees the multimesh
383 |     * objects we write out in the base file */
384 |    DBMkDir(siloFile, dname);
385 |    DBSetDir(siloFile, dname);
386 |    return siloFile;
387 | }
388 | 
389 |    
390 | /**********************************************************************/
391 | // PMPIO "open" callback: reopens an existing Silo file and enters dname.
392 | static void *
393 | LULESH_PMPIO_Open(const char *fname,
394 |                   const char *dname,
395 |                   PMPIO_iomode_t ioMode,
396 |                   void *udata)
397 | {
398 |    /* Append when writing, read-only otherwise; NULL if the open fails */
399 |    const int siloMode = (ioMode == PMPIO_WRITE) ? DB_APPEND : DB_READ;
400 |    DBfile* db = DBOpen(fname, DB_UNKNOWN, siloMode);
401 |    /* Put the data in a subdirectory, so VisIt only sees the multimesh
402 |     * objects we write out in the base file */
403 |    if (db) {
404 |      if (ioMode == PMPIO_WRITE) DBMkDir(db, dname);  // create only when writing
405 |      DBSetDir(db, dname);
406 |    }
407 |    return (void*)db;
408 | }
409 | 
410 |    
411 | /**********************************************************************/
412 | // PMPIO "close" callback: closes the Silo file when a handle was supplied.
413 | static void
414 | LULESH_PMPIO_Close(void *file, void *udata)
415 | {
416 |   if (file == NULL)
417 |     return;   // no file handle, so there is nothing to close
418 |   DBClose(static_cast<DBfile*>(file));
419 | }
420 | # endif
421 | 
422 |    
423 | #else
424 | // Stub for builds without the viz code: rank 0 prints how to enable it.
425 | void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks)
426 | {
427 |    // Every other rank stays silent so the message appears once
428 |    if (myRank != 0) return ;
429 |    printf("Must enable -DVIZ_MESH at compile time to call DumpDomain\n");
430 | }
431 | 
432 | #endif
433 | 
434 | 


--------------------------------------------------------------------------------
/lulesh.h:
--------------------------------------------------------------------------------
  1 | #if !defined(USE_MPI)
  2 | # error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line"
  3 | #endif
  4 | 
  5 | #if USE_MPI
  6 | #include <mpi.h>
  7 | 
  8 | /*
  9 |    define one of these three symbols:
 10 | 
 11 |    SEDOV_SYNC_POS_VEL_NONE
 12 |    SEDOV_SYNC_POS_VEL_EARLY
 13 |    SEDOV_SYNC_POS_VEL_LATE
 14 | */
 15 | 
 16 | #define SEDOV_SYNC_POS_VEL_EARLY 1
 17 | #endif
 18 | 
 19 | #include <math.h>
 20 | #include <stdlib.h>
 21 | #include <stdint.h>
 22 | #include <vector>
 23 | 
 24 | //**************************************************
 25 | // Allow flexibility for arithmetic representations 
 26 | //**************************************************
 27 | // MAX is a function-like macro: an argument may be evaluated twice
 28 | #define MAX(a, b) ( ((a) > (b)) ? (a) : (b))
 29 | 
 30 | 
 31 | // Precision specification
 32 | typedef float        real4 ;
 33 | typedef double       real8 ;
 34 | typedef long double  real10 ;  // 10 bytes on x86
 35 | 
 36 | typedef int32_t Int4_t ;
 37 | typedef int64_t Int8_t ;
 38 | typedef Int4_t  Index_t ; // array subscript and loop index
 39 | typedef real8   Real_t ;  // floating point representation
 40 | typedef Int4_t  Int_t ;   // integer representation
 41 | // NOTE(review): presumably error codes for failed volume / q checks -- confirm at the use sites
 42 | enum { VolumeError = -1, QStopError = -2 } ;
 43 | // Math wrappers overloaded on precision; each forwards to the matching C library call
 44 | inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
 45 | inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
 46 | inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
 47 | 
 48 | inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
 49 | inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
 50 | inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
 51 | 
 52 | inline real4  FABS(real4  arg) { return fabsf(arg) ; }
 53 | inline real8  FABS(real8  arg) { return fabs(arg) ; }
 54 | inline real10 FABS(real10 arg) { return fabsl(arg) ; }
 55 | 
 56 | 
 57 | // Stuff needed for boundary conditions (XI_M, XI_P, ... are per-face masks)
 58 | // 3 flag bits (SYMM, FREE, COMM) on each of 6 hexahedral faces (18 bits)
 59 | #define XI_M        0x00007
 60 | #define XI_M_SYMM   0x00001
 61 | #define XI_M_FREE   0x00002
 62 | #define XI_M_COMM   0x00004
 63 | 
 64 | #define XI_P        0x00038
 65 | #define XI_P_SYMM   0x00008
 66 | #define XI_P_FREE   0x00010
 67 | #define XI_P_COMM   0x00020
 68 | 
 69 | #define ETA_M       0x001c0
 70 | #define ETA_M_SYMM  0x00040
 71 | #define ETA_M_FREE  0x00080
 72 | #define ETA_M_COMM  0x00100
 73 | 
 74 | #define ETA_P       0x00e00
 75 | #define ETA_P_SYMM  0x00200
 76 | #define ETA_P_FREE  0x00400
 77 | #define ETA_P_COMM  0x00800
 78 | 
 79 | #define ZETA_M      0x07000
 80 | #define ZETA_M_SYMM 0x01000
 81 | #define ZETA_M_FREE 0x02000
 82 | #define ZETA_M_COMM 0x04000
 83 | 
 84 | #define ZETA_P      0x38000
 85 | #define ZETA_P_SYMM 0x08000
 86 | #define ZETA_P_FREE 0x10000
 87 | #define ZETA_P_COMM 0x20000
 88 | 
 89 | // MPI Message Tags
 90 | #define MSG_COMM_SBN      1024
 91 | #define MSG_SYNC_POS_VEL  2048
 92 | #define MSG_MONOQ         3072
 93 | 
 94 | #define MAX_FIELDS_PER_MPI_COMM 6
 95 | 
 96 | // Assume 128 byte coherence
 97 | // Assume Real_t is an "integral power of 2" bytes wide
 98 | #define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t))
 99 | // Rounds n up to the next multiple of CACHE_COHERENCE_PAD_REAL
100 | #define CACHE_ALIGN_REAL(n) \
101 |    (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1))
102 | 
103 | /*********************************/
104 | /* Data structure implementation */
105 | /*********************************/
106 | 
107 | /* might want to add access methods so that memory can be */
108 | /* better managed, as in luleshFT */
109 | // Returns malloc'd, uninitialized storage for `size` objects of type T
110 | // (the result of malloc is passed through unchecked).
111 | template <typename T> T *Allocate(size_t size)
112 | {
113 |    void *storage = malloc(size * sizeof(T)) ;
114 |    return static_cast<T *>(storage) ;
115 | }
116 | // Frees *ptr and leaves it NULL, so releasing the same pointer twice is
117 | // harmless (free(NULL) is a no-op, hence no explicit NULL test).
118 | template <typename T> void Release(T **ptr)
119 | {
120 |    T *&slot = *ptr ;
121 |    free(slot) ;
122 |    slot = NULL ;
123 | }
124 | 
125 | //////////////////////////////////////////////////////
126 | // Primary data structure
127 | //////////////////////////////////////////////////////
128 | 
129 | /*
130 |  * The implementation of the data abstraction used for lulesh
131 |  * resides entirely in the Domain class below.  You can change
132 |  * grouping and interleaving of fields here to maximize data layout
133 |  * efficiency for your underlying architecture or compiler.
134 |  *
135 |  * For example, fields can be implemented as STL objects or
136 |  * raw array pointers.  As another example, individual fields
137 |  * m_x, m_y, m_z could be bundled into
138 |  *
139 |  *    struct { Real_t x, y, z ; } *m_coord ;
140 |  *
141 |  * allowing accessor functions such as
142 |  *
143 |  *  "Real_t &x(Index_t idx) { return m_coord[idx].x ; }"
144 |  *  "Real_t &y(Index_t idx) { return m_coord[idx].y ; }"
145 |  *  "Real_t &z(Index_t idx) { return m_coord[idx].z ; }"
146 |  */
147 | 
148 | class Domain {
149 | 
150 |    public:
151 | 
152 |    // Constructor
153 |    Domain(Int_t numRanks, Index_t colLoc,
154 |           Index_t rowLoc, Index_t planeLoc,
155 |           Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost);
156 | 
157 |    // Destructor
158 |    ~Domain();
159 | 
160 |    //
161 |    // ALLOCATION
162 |    //
163 | 
164 |    void AllocateNodePersistent(Int_t numNode) // Node-centered
165 |    {
166 |       m_x.resize(numNode);  // coordinates
167 |       m_y.resize(numNode);
168 |       m_z.resize(numNode);
169 | 
170 |       m_xd.resize(numNode); // velocities
171 |       m_yd.resize(numNode);
172 |       m_zd.resize(numNode);
173 | 
174 |       m_xdd.resize(numNode); // accelerations
175 |       m_ydd.resize(numNode);
176 |       m_zdd.resize(numNode);
177 | 
178 |       m_fx.resize(numNode);  // forces
179 |       m_fy.resize(numNode);
180 |       m_fz.resize(numNode);
181 | 
182 |       m_nodalMass.resize(numNode);  // mass
183 |    }
184 | 
185 |    void AllocateElemPersistent(Int_t numElem) // Elem-centered
186 |    {
187 |       m_nodelist.resize(8*numElem);
188 | 
189 |       // elem connectivities through face
190 |       m_lxim.resize(numElem);
191 |       m_lxip.resize(numElem);
192 |       m_letam.resize(numElem);
193 |       m_letap.resize(numElem);
194 |       m_lzetam.resize(numElem);
195 |       m_lzetap.resize(numElem);
196 | 
197 |       m_elemBC.resize(numElem);
198 | 
199 |       m_e.resize(numElem);
200 |       m_p.resize(numElem);
201 | 
202 |       m_q.resize(numElem);
203 |       m_ql.resize(numElem);
204 |       m_qq.resize(numElem);
205 | 
206 |       m_v.resize(numElem);
207 | 
208 |       m_volo.resize(numElem);
209 |       m_delv.resize(numElem);
210 |       m_vdov.resize(numElem);
211 | 
212 |       m_arealg.resize(numElem);
213 | 
214 |       m_ss.resize(numElem);
215 | 
216 |       m_elemMass.resize(numElem);
217 | 
218 |       m_vnew.resize(numElem) ;
219 |    }
220 | 
221 |    void AllocateGradients(Int_t numElem, Int_t allElem)
222 |    {
223 |       // Position gradients
224 |       m_delx_xi   = Allocate<Real_t>(numElem) ;
225 |       m_delx_eta  = Allocate<Real_t>(numElem) ;
226 |       m_delx_zeta = Allocate<Real_t>(numElem) ;
227 | 
228 |       // Velocity gradients
229 |       m_delv_xi   = Allocate<Real_t>(allElem) ;
230 |       m_delv_eta  = Allocate<Real_t>(allElem);
231 |       m_delv_zeta = Allocate<Real_t>(allElem) ;
232 |    }
233 | 
234 |    void DeallocateGradients()
235 |    {
236 |       Release(&m_delx_zeta);
237 |       Release(&m_delx_eta) ;
238 |       Release(&m_delx_xi)  ;
239 | 
240 |       Release(&m_delv_zeta);
241 |       Release(&m_delv_eta) ;
242 |       Release(&m_delv_xi)  ;
243 |    }
244 | 
245 |    void AllocateStrains(Int_t numElem)
246 |    {
247 |       m_dxx = Allocate<Real_t>(numElem) ;
248 |       m_dyy = Allocate<Real_t>(numElem) ;
249 |       m_dzz = Allocate<Real_t>(numElem) ;
250 |    }
251 | 
252 |    void DeallocateStrains()
253 |    {
254 |       Release(&m_dzz) ;
255 |       Release(&m_dyy) ;
256 |       Release(&m_dxx) ;
257 |    }
258 |    
259 |    //
260 |    // ACCESSORS
261 |    //
262 | 
263 |    // Node-centered
264 | 
265 |    // Nodal coordinates
266 |    Real_t& x(Index_t idx)    { return m_x[idx] ; }
267 |    Real_t& y(Index_t idx)    { return m_y[idx] ; }
268 |    Real_t& z(Index_t idx)    { return m_z[idx] ; }
269 | 
270 |    // Nodal velocities
271 |    Real_t& xd(Index_t idx)   { return m_xd[idx] ; }
272 |    Real_t& yd(Index_t idx)   { return m_yd[idx] ; }
273 |    Real_t& zd(Index_t idx)   { return m_zd[idx] ; }
274 | 
275 |    // Nodal accelerations
276 |    Real_t& xdd(Index_t idx)  { return m_xdd[idx] ; }
277 |    Real_t& ydd(Index_t idx)  { return m_ydd[idx] ; }
278 |    Real_t& zdd(Index_t idx)  { return m_zdd[idx] ; }
279 | 
280 |    // Nodal forces
281 |    Real_t& fx(Index_t idx)   { return m_fx[idx] ; }
282 |    Real_t& fy(Index_t idx)   { return m_fy[idx] ; }
283 |    Real_t& fz(Index_t idx)   { return m_fz[idx] ; }
284 | 
285 |    // Nodal mass
286 |    Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
287 | 
288 |    // Nodes on symmertry planes
289 |    Index_t symmX(Index_t idx) { return m_symmX[idx] ; }
290 |    Index_t symmY(Index_t idx) { return m_symmY[idx] ; }
291 |    Index_t symmZ(Index_t idx) { return m_symmZ[idx] ; }
292 |    bool symmXempty()          { return m_symmX.empty(); }
293 |    bool symmYempty()          { return m_symmY.empty(); }
294 |    bool symmZempty()          { return m_symmZ.empty(); }
295 | 
296 |    //
297 |    // Element-centered
298 |    //
299 |    Index_t&  regElemSize(Index_t idx) { return m_regElemSize[idx] ; }
300 |    Index_t&  regNumList(Index_t idx) { return m_regNumList[idx] ; }
301 |    Index_t*  regNumList()            { return &m_regNumList[0] ; }
302 |    Index_t*  regElemlist(Int_t r)    { return m_regElemlist[r] ; }
303 |    Index_t&  regElemlist(Int_t r, Index_t idx) { return m_regElemlist[r][idx] ; }
304 | 
305 |    Index_t*  nodelist(Index_t idx)    { return &m_nodelist[Index_t(8)*idx] ; }
306 | 
307 |    // elem connectivities through face
308 |    Index_t&  lxim(Index_t idx) { return m_lxim[idx] ; }
309 |    Index_t&  lxip(Index_t idx) { return m_lxip[idx] ; }
310 |    Index_t&  letam(Index_t idx) { return m_letam[idx] ; }
311 |    Index_t&  letap(Index_t idx) { return m_letap[idx] ; }
312 |    Index_t&  lzetam(Index_t idx) { return m_lzetam[idx] ; }
313 |    Index_t&  lzetap(Index_t idx) { return m_lzetap[idx] ; }
314 | 
315 |    // elem face symm/free-surface flag
316 |    Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
317 | 
318 |    // Principal strains - temporary
319 |    Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
320 |    Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
321 |    Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
322 | 
323 |    // New relative volume - temporary
324 |    Real_t& vnew(Index_t idx)  { return m_vnew[idx] ; }
325 | 
326 |    // Velocity gradient - temporary
327 |    Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
328 |    Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
329 |    Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
330 | 
331 |    // Position gradient - temporary
332 |    Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
333 |    Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
334 |    Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
335 | 
336 |    // Energy
337 |    Real_t& e(Index_t idx)          { return m_e[idx] ; }
338 | 
339 |    // Pressure
340 |    Real_t& p(Index_t idx)          { return m_p[idx] ; }
341 | 
342 |    // Artificial viscosity
343 |    Real_t& q(Index_t idx)          { return m_q[idx] ; }
344 | 
345 |    // Linear term for q
346 |    Real_t& ql(Index_t idx)         { return m_ql[idx] ; }
347 |    // Quadratic term for q
348 |    Real_t& qq(Index_t idx)         { return m_qq[idx] ; }
349 | 
350 |    // Relative volume
351 |    Real_t& v(Index_t idx)          { return m_v[idx] ; }
352 |    Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
353 | 
354 |    // Reference volume
355 |    Real_t& volo(Index_t idx)       { return m_volo[idx] ; }
356 | 
357 |    // volume derivative over volume
358 |    Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
359 | 
360 |    // Element characteristic length
361 |    Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
362 | 
363 |    // Sound speed
364 |    Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
365 | 
366 |    // Element mass
367 |    Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
368 | 
369 |    Index_t nodeElemCount(Index_t idx)
370 |    { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; }
371 | 
372 |    Index_t *nodeElemCornerList(Index_t idx)
373 |    { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; }
374 | 
375 |    // Parameters 
376 | 
377 |    // Cutoffs
378 |    Real_t u_cut() const               { return m_u_cut ; }
379 |    Real_t e_cut() const               { return m_e_cut ; }
380 |    Real_t p_cut() const               { return m_p_cut ; }
381 |    Real_t q_cut() const               { return m_q_cut ; }
382 |    Real_t v_cut() const               { return m_v_cut ; }
383 | 
384 |    // Other constants (usually are settable via input file in real codes)
385 |    Real_t hgcoef() const              { return m_hgcoef ; }
386 |    Real_t qstop() const               { return m_qstop ; }
387 |    Real_t monoq_max_slope() const     { return m_monoq_max_slope ; }
388 |    Real_t monoq_limiter_mult() const  { return m_monoq_limiter_mult ; }
389 |    Real_t ss4o3() const               { return m_ss4o3 ; }
390 |    Real_t qlc_monoq() const           { return m_qlc_monoq ; }
391 |    Real_t qqc_monoq() const           { return m_qqc_monoq ; }
392 |    Real_t qqc() const                 { return m_qqc ; }
393 | 
394 |    Real_t eosvmax() const             { return m_eosvmax ; }
395 |    Real_t eosvmin() const             { return m_eosvmin ; }
396 |    Real_t pmin() const                { return m_pmin ; }
397 |    Real_t emin() const                { return m_emin ; }
398 |    Real_t dvovmax() const             { return m_dvovmax ; }
399 |    Real_t refdens() const             { return m_refdens ; }
400 | 
401 |    // Timestep controls, etc...
402 |    Real_t& time()                 { return m_time ; }
403 |    Real_t& deltatime()            { return m_deltatime ; }
404 |    Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
405 |    Real_t& deltatimemultub()      { return m_deltatimemultub ; }
406 |    Real_t& stoptime()             { return m_stoptime ; }
407 |    Real_t& dtcourant()            { return m_dtcourant ; }
408 |    Real_t& dthydro()              { return m_dthydro ; }
409 |    Real_t& dtmax()                { return m_dtmax ; }
410 |    Real_t& dtfixed()              { return m_dtfixed ; }
411 | 
412 |    Int_t&  cycle()                { return m_cycle ; }
413 |    Index_t&  numRanks()           { return m_numRanks ; }
414 | 
415 |    Index_t&  colLoc()             { return m_colLoc ; }
416 |    Index_t&  rowLoc()             { return m_rowLoc ; }
417 |    Index_t&  planeLoc()           { return m_planeLoc ; }
418 |    Index_t&  tp()                 { return m_tp ; }
419 | 
420 |    Index_t&  sizeX()              { return m_sizeX ; }
421 |    Index_t&  sizeY()              { return m_sizeY ; }
422 |    Index_t&  sizeZ()              { return m_sizeZ ; }
423 |    Index_t&  numReg()             { return m_numReg ; }
424 |    Int_t&  cost()             { return m_cost ; }
425 |    Index_t&  numElem()            { return m_numElem ; }
426 |    Index_t&  numNode()            { return m_numNode ; }
427 |    
428 |    Index_t&  maxPlaneSize()       { return m_maxPlaneSize ; }
429 |    Index_t&  maxEdgeSize()        { return m_maxEdgeSize ; }
430 |    
431 |    //
432 |    // MPI-Related additional data
433 |    //
434 | 
435 | #if USE_MPI   
436 |    // Communication Work space 
437 |    Real_t *commDataSend ;
438 |    Real_t *commDataRecv ;
439 |    
440 |    // Maximum number of block neighbors 
441 |    MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 
442 |    MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners 
443 | #endif
444 | 
445 |   private:
446 | 
447 |    void BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems);
448 |    void SetupThreadSupportStructures();
449 |    void CreateRegionIndexSets(Int_t nreg, Int_t balance);
450 |    void SetupCommBuffers(Int_t edgeNodes);
451 |    void SetupSymmetryPlanes(Int_t edgeNodes);
452 |    void SetupElementConnectivities(Int_t edgeElems);
453 |    void SetupBoundaryConditions(Int_t edgeElems);
454 | 
455 |    //
456 |    // IMPLEMENTATION
457 |    //
458 | 
459 |    /* Node-centered */
460 |    std::vector<Real_t> m_x ;  /* coordinates */
461 |    std::vector<Real_t> m_y ;
462 |    std::vector<Real_t> m_z ;
463 | 
464 |    std::vector<Real_t> m_xd ; /* velocities */
465 |    std::vector<Real_t> m_yd ;
466 |    std::vector<Real_t> m_zd ;
467 | 
468 |    std::vector<Real_t> m_xdd ; /* accelerations */
469 |    std::vector<Real_t> m_ydd ;
470 |    std::vector<Real_t> m_zdd ;
471 | 
472 |    std::vector<Real_t> m_fx ;  /* forces */
473 |    std::vector<Real_t> m_fy ;
474 |    std::vector<Real_t> m_fz ;
475 | 
476 |    std::vector<Real_t> m_nodalMass ;  /* mass */
477 | 
478 |    std::vector<Index_t> m_symmX ;  /* symmetry plane nodesets */
479 |    std::vector<Index_t> m_symmY ;
480 |    std::vector<Index_t> m_symmZ ;
481 | 
482 |    // Element-centered
483 | 
484 |    // Region information
485 |    Int_t    m_numReg ;
486 |    Int_t    m_cost; //imbalance cost
487 |    Index_t *m_regElemSize ;   // Size of region sets
488 |    Index_t *m_regNumList ;    // Region number per domain element
489 |    Index_t **m_regElemlist ;  // region indexset 
490 | 
491 |    std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */
492 | 
493 |    std::vector<Index_t>  m_lxim ;  /* element connectivity across each face */
494 |    std::vector<Index_t>  m_lxip ;
495 |    std::vector<Index_t>  m_letam ;
496 |    std::vector<Index_t>  m_letap ;
497 |    std::vector<Index_t>  m_lzetam ;
498 |    std::vector<Index_t>  m_lzetap ;
499 | 
500 |    std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
501 | 
502 |    Real_t             *m_dxx ;  /* principal strains -- temporary */
503 |    Real_t             *m_dyy ;
504 |    Real_t             *m_dzz ;
505 | 
506 |    Real_t             *m_delv_xi ;    /* velocity gradient -- temporary */
507 |    Real_t             *m_delv_eta ;
508 |    Real_t             *m_delv_zeta ;
509 | 
510 |    Real_t             *m_delx_xi ;    /* coordinate gradient -- temporary */
511 |    Real_t             *m_delx_eta ;
512 |    Real_t             *m_delx_zeta ;
513 |    
514 |    std::vector<Real_t> m_e ;   /* energy */
515 | 
516 |    std::vector<Real_t> m_p ;   /* pressure */
517 |    std::vector<Real_t> m_q ;   /* q */
518 |    std::vector<Real_t> m_ql ;  /* linear term for q */
519 |    std::vector<Real_t> m_qq ;  /* quadratic term for q */
520 | 
521 |    std::vector<Real_t> m_v ;     /* relative volume */
522 |    std::vector<Real_t> m_volo ;  /* reference volume */
523 |    std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */
524 |    std::vector<Real_t> m_delv ;  /* m_vnew - m_v */
525 |    std::vector<Real_t> m_vdov ;  /* volume derivative over volume */
526 | 
527 |    std::vector<Real_t> m_arealg ;  /* characteristic length of an element */
528 |    
529 |    std::vector<Real_t> m_ss ;      /* "sound speed" */
530 | 
531 |    std::vector<Real_t> m_elemMass ;  /* mass */
532 | 
533 |    // Cutoffs (treat as constants)
534 |    const Real_t  m_e_cut ;             // energy tolerance 
535 |    const Real_t  m_p_cut ;             // pressure tolerance 
536 |    const Real_t  m_q_cut ;             // q tolerance 
537 |    const Real_t  m_v_cut ;             // relative volume tolerance 
538 |    const Real_t  m_u_cut ;             // velocity tolerance 
539 | 
540 |    // Other constants (usually setable, but hardcoded in this proxy app)
541 | 
542 |    const Real_t  m_hgcoef ;            // hourglass control 
543 |    const Real_t  m_ss4o3 ;
544 |    const Real_t  m_qstop ;             // excessive q indicator 
545 |    const Real_t  m_monoq_max_slope ;
546 |    const Real_t  m_monoq_limiter_mult ;
547 |    const Real_t  m_qlc_monoq ;         // linear term coef for q 
548 |    const Real_t  m_qqc_monoq ;         // quadratic term coef for q 
549 |    const Real_t  m_qqc ;
550 |    const Real_t  m_eosvmax ;
551 |    const Real_t  m_eosvmin ;
552 |    const Real_t  m_pmin ;              // pressure floor 
553 |    const Real_t  m_emin ;              // energy floor 
554 |    const Real_t  m_dvovmax ;           // maximum allowable volume change 
555 |    const Real_t  m_refdens ;           // reference density 
556 | 
557 |    // Variables to keep track of timestep, simulation time, and cycle
558 |    Real_t  m_dtcourant ;         // courant constraint 
559 |    Real_t  m_dthydro ;           // volume change constraint 
560 |    Int_t   m_cycle ;             // iteration count for simulation 
561 |    Real_t  m_dtfixed ;           // fixed time increment 
562 |    Real_t  m_time ;              // current time 
563 |    Real_t  m_deltatime ;         // variable time increment 
564 |    Real_t  m_deltatimemultlb ;
565 |    Real_t  m_deltatimemultub ;
566 |    Real_t  m_dtmax ;             // maximum allowable time increment 
567 |    Real_t  m_stoptime ;          // end time for simulation 
568 | 
569 | 
570 |    Int_t   m_numRanks ;
571 | 
572 |    Index_t m_colLoc ;
573 |    Index_t m_rowLoc ;
574 |    Index_t m_planeLoc ;
575 |    Index_t m_tp ;
576 | 
577 |    Index_t m_sizeX ;
578 |    Index_t m_sizeY ;
579 |    Index_t m_sizeZ ;
580 |    Index_t m_numElem ;
581 |    Index_t m_numNode ;
582 | 
583 |    Index_t m_maxPlaneSize ;
584 |    Index_t m_maxEdgeSize ;
585 | 
586 |    // OMP hack 
587 |    Index_t *m_nodeElemStart ;
588 |    Index_t *m_nodeElemCornerList ;
589 | 
590 |    // Used in setup
591 |    Index_t m_rowMin, m_rowMax;
592 |    Index_t m_colMin, m_colMax;
593 |    Index_t m_planeMin, m_planeMax ;
594 | 
595 | } ;
596 | // Pointer to a Domain member function shaped like Real_t& f(Index_t); CommSend and CommSBN take arrays of these.
597 | typedef Real_t &(Domain::* Domain_member )(Index_t) ;
598 | // Run options, one field per command-line switch (the switch is noted beside each field).
599 | struct cmdLineOpts {
600 |    Int_t its; // -i (presumably the iteration count -- confirm in lulesh-util)
601 |    Int_t nx;  // -s (presumably the per-edge problem size -- confirm in lulesh-util)
602 |    Int_t numReg; // -r (number of regions, going by the name -- confirm)
603 |    Int_t numFiles; // -f
604 |    Int_t showProg; // -p
605 |    Int_t quiet; // -q
606 |    Int_t viz; // -v 
607 |    Int_t cost; // -c
608 |    Int_t balance; // -b
609 | };
610 | 
611 | 
612 | 
613 | // Function Prototypes
614 | // Declarations only. Each group comment names the source file expected to define the functions under it.
615 | // lulesh-par (no file of that name is in this source tree; presumably lulesh.cc -- confirm)
616 | Real_t CalcElemVolume( const Real_t x[8],   // three 8-entry coordinate arrays in, one Real_t out;
617 |                        const Real_t y[8],   // presumably the volume of the 8-node element they
618 |                        const Real_t z[8]);  // describe -- confirm against the definition
619 | 
620 | // lulesh-util
621 | void ParseCommandLineOptions(int argc, char *argv[],
622 |                              Int_t myRank, struct cmdLineOpts *opts);
623 | void VerifyAndWriteFinalOutput(Real_t elapsed_time,
624 |                                Domain& locDom,
625 |                                Int_t nx,
626 |                                Int_t numRanks);
627 | 
628 | // lulesh-viz
629 | void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks);
630 | // Note the asymmetry below: CommSend takes the field accessors (fieldData), CommRecv does not.
631 | // lulesh-comm
632 | void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields,
633 |               Index_t dx, Index_t dy, Index_t dz,
634 |               bool doRecv, bool planeOnly);
635 | void CommSend(Domain& domain, Int_t msgType,
636 |               Index_t xferFields, Domain_member *fieldData,
637 |               Index_t dx, Index_t dy, Index_t dz,
638 |               bool doSend, bool planeOnly);
639 | void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData);
640 | void CommSyncPosVel(Domain& domain);
641 | void CommMonoQ(Domain& domain);
642 | // The four Int_t* arguments below look like outputs (pointers to non-const) -- confirm in lulesh-init.
643 | // lulesh-init
644 | void InitMeshDecomp(Int_t numRanks, Int_t myRank,
645 |                     Int_t *col, Int_t *row, Int_t *plane, Int_t *side);
646 | 


--------------------------------------------------------------------------------
/lulesh_tuple.h:
--------------------------------------------------------------------------------
  1 | #if !defined(USE_MPI)
  2 | # error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line"
  3 | #endif
  4 | 
  5 | #if USE_MPI
  6 | #include <mpi.h>
  7 | 
  8 | /*
  9 |    define one of these three symbols:
 10 | 
 11 |    SEDOV_SYNC_POS_VEL_NONE
 12 |    SEDOV_SYNC_POS_VEL_EARLY
 13 |    SEDOV_SYNC_POS_VEL_LATE
 14 | */
 15 | 
 16 | #define SEDOV_SYNC_POS_VEL_EARLY 1
 17 | #endif
 18 | 
 19 | #include <math.h>
 20 | #include <vector>
 21 | 
 22 | //**************************************************
 23 | // Allow flexibility for arithmetic representations 
 24 | //**************************************************
 25 | // MAX expands one of its arguments twice (once in the test, once as the result), so arguments must be free of side effects.
 26 | #define MAX(a, b) ( ((a) > (b)) ? (a) : (b))
 27 | 
 28 | 
 29 | // Precision specification
 30 | typedef float        real4 ;
 31 | typedef double       real8 ;
 32 | typedef long double  real10 ;  // 10 bytes on x86
 33 | // Working types used by the rest of this header; Real_t is currently double, Index_t and Int_t are both int.
 34 | typedef int    Index_t ; // array subscript and loop index
 35 | typedef real8  Real_t ;  // floating point representation
 36 | typedef int    Int_t ;   // integer representation
 37 | // Negative error codes; the code that reports them is outside this header.
 38 | enum { VolumeError = -1, QStopError = -2 } ;
 39 | // Math wrappers overloaded on precision: the argument type selects the f / plain / l variant of the C routine.
 40 | inline real4  SQRT(real4  arg) { return sqrtf(arg) ; }
 41 | inline real8  SQRT(real8  arg) { return sqrt(arg) ; }
 42 | inline real10 SQRT(real10 arg) { return sqrtl(arg) ; }
 43 | // Cube root, same overload scheme.
 44 | inline real4  CBRT(real4  arg) { return cbrtf(arg) ; }
 45 | inline real8  CBRT(real8  arg) { return cbrt(arg) ; }
 46 | inline real10 CBRT(real10 arg) { return cbrtl(arg) ; }
 47 | // Absolute value, same overload scheme.
 48 | inline real4  FABS(real4  arg) { return fabsf(arg) ; }
 49 | inline real8  FABS(real8  arg) { return fabs(arg) ; }
 50 | inline real10 FABS(real10 arg) { return fabsl(arg) ; }
 51 | 
 52 | 
 53 | // Boundary-condition bit flags, stored per element (see Domain::elemBC)
 54 | // 3 flags (SYMM, FREE, COMM) on each of 6 hexahedral faces (18 bits); the unsuffixed name is that face's 3-bit mask
 55 | #define XI_M        0x00007
 56 | #define XI_M_SYMM   0x00001
 57 | #define XI_M_FREE   0x00002
 58 | #define XI_M_COMM   0x00004
 59 | 
 60 | #define XI_P        0x00038
 61 | #define XI_P_SYMM   0x00008
 62 | #define XI_P_FREE   0x00010
 63 | #define XI_P_COMM   0x00020
 64 | 
 65 | #define ETA_M       0x001c0
 66 | #define ETA_M_SYMM  0x00040
 67 | #define ETA_M_FREE  0x00080
 68 | #define ETA_M_COMM  0x00100
 69 | 
 70 | #define ETA_P       0x00e00
 71 | #define ETA_P_SYMM  0x00200
 72 | #define ETA_P_FREE  0x00400
 73 | #define ETA_P_COMM  0x00800
 74 | 
 75 | #define ZETA_M      0x07000
 76 | #define ZETA_M_SYMM 0x01000
 77 | #define ZETA_M_FREE 0x02000
 78 | #define ZETA_M_COMM 0x04000
 79 | 
 80 | #define ZETA_P      0x38000
 81 | #define ZETA_P_SYMM 0x08000
 82 | #define ZETA_P_FREE 0x10000
 83 | #define ZETA_P_COMM 0x20000
 84 | 
 85 | // MPI Message Tags (values are spaced 1024 apart)
 86 | #define MSG_COMM_SBN      1024
 87 | #define MSG_SYNC_POS_VEL  2048
 88 | #define MSG_MONOQ         3072
 89 | // NOTE(review): presumably the cap on xferFields for one CommSend/CommRecv call -- confirm in lulesh-comm.
 90 | #define MAX_FIELDS_PER_MPI_COMM 6
 91 | 
 92 | // Assume 128 byte coherence
 93 | // Assume Real_t is an "integral power of 2" bytes wide
 94 | #define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t))  // number of Real_t values in 128 bytes
 95 | // Rounds n up to a multiple of CACHE_COHERENCE_PAD_REAL; the mask form is only valid when that pad is a power of 2.
 96 | #define CACHE_ALIGN_REAL(n) \
 97 |    (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1))
 98 | 
 99 | //////////////////////////////////////////////////////
100 | // Primary data structure
101 | //////////////////////////////////////////////////////
102 | 
103 | /*
104 |  * The implementation of the data abstraction used for lulesh
105 |  * resides entirely in the Domain class below.  You can change
106 |  * grouping and interleaving of fields here to maximize data layout
107 |  * efficiency for your underlying architecture or compiler.
108 |  *
109 |  * For example, fields can be implemented as STL objects or
110 |  * raw array pointers.  As another example, individual fields
111 |  * m_x, m_y, m_z could be bundled into
112 |  *
113 |  *    struct { Real_t x, y, z ; } *m_coord ;
114 |  *
115 |  * allowing accessor functions such as
116 |  *
117 |  *  "Real_t &x(Index_t idx) { return m_coord[idx].x ; }"
118 |  *  "Real_t &y(Index_t idx) { return m_coord[idx].y ; }"
119 |  *  "Real_t &z(Index_t idx) { return m_coord[idx].z ; }"
120 |  */
121 | 
122 | class Domain {
123 |    // Holds one domain's node-centered and element-centered fields, run constants, timestep state and decomposition info; all access goes through the inline accessors below.
124 |    public:
125 | 
126 |    // Constructor
127 |    Domain(Int_t numRanks, Index_t colLoc,
128 |           Index_t rowLoc, Index_t planeLoc,
129 |           Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost);
130 |    
131 |    // Destructor
132 |    ~Domain();
133 | 
134 |    //
135 |    // ALLOCATION
136 |    //
137 |    // Each Allocate* resizes the std::vector fields of its group; resize() value-initializes any elements it adds.
138 |    void AllocateNodePersistent(Int_t numNode) // Node-centered
139 |    {
140 |       m_coord.resize(numNode);  // coordinates
141 | 
142 |       m_vel.resize(numNode); // velocities
143 | 
144 |       m_acc.resize(numNode); // accelerations
145 | 
146 |       m_force.resize(numNode);  // forces
147 | 
148 |       m_nodalMass.resize(numNode);  // mass
149 |    }
150 | 
151 |    void AllocateElemPersistent(Int_t numElem) // Elem-centered
152 |    {
153 |       m_nodelist.resize(8*numElem);  // 8 node indices per element
154 | 
155 |       // elem connectivities through face
156 |       m_faceToElem.resize(numElem);
157 | 
158 |       m_elemBC.resize(numElem);
159 | 
160 |       m_e.resize(numElem);
161 | 
162 |       m_pq.resize(numElem);
163 | 
164 |       m_qlqq.resize(numElem);
165 | 
166 |       m_vol.resize(numElem);
167 | 
168 |       m_delv.resize(numElem);
169 |       m_vdov.resize(numElem);
170 | 
171 |       m_arealg.resize(numElem);
172 | 
173 |       m_ss.resize(numElem);
174 | 
175 |       m_elemMass.resize(numElem);
176 | 
177 |       m_vnew.resize(numElem) ;
178 |    }
179 |    // numElem sizes the position gradients, allElem the velocity gradients.
180 |    void AllocateGradients(Int_t numElem, Int_t allElem)
181 |    {
182 |       // Position gradients
183 |       m_delx_xi.resize(numElem) ;
184 |       m_delx_eta.resize(numElem) ;
185 |       m_delx_zeta.resize(numElem) ;
186 | 
187 |       // Velocity gradients
188 |       m_delv_xi.resize(allElem) ;
189 |       m_delv_eta.resize(allElem);
190 |       m_delv_zeta.resize(allElem) ;
191 |    }
192 |    // clear() empties the vectors but leaves their capacity unchanged, so no storage is returned here.
193 |    void DeallocateGradients()
194 |    {
195 |       m_delx_zeta.clear() ;
196 |       m_delx_eta.clear() ;
197 |       m_delx_xi.clear() ;
198 | 
199 |       m_delv_zeta.clear() ;
200 |       m_delv_eta.clear() ;
201 |       m_delv_xi.clear() ;
202 |    }
203 | 
204 |    void AllocateStrains(Int_t numElem)
205 |    {
206 |       m_dxx.resize(numElem) ;
207 |       m_dyy.resize(numElem) ;
208 |       m_dzz.resize(numElem) ;
209 |    }
210 |    // As with the gradients, clear() drops the elements and keeps the capacity.
211 |    void DeallocateStrains()
212 |    {
213 |       m_dzz.clear() ;
214 |       m_dyy.clear() ;
215 |       m_dxx.clear() ;
216 |    }
217 |    
218 |    //
219 |    // ACCESSORS
220 |    //
221 |    // Accessors that return a reference serve as both getter and setter; indexing uses operator[] with no bounds check.
222 |    // Node-centered
223 | 
224 |    // Nodal coordinates
225 |    Real_t& x(Index_t idx)    { return m_coord[idx].x ; }
226 |    Real_t& y(Index_t idx)    { return m_coord[idx].y ; }
227 |    Real_t& z(Index_t idx)    { return m_coord[idx].z ; }
228 | 
229 |    // Nodal velocities
230 |    Real_t& xd(Index_t idx)   { return m_vel[idx].x ; }
231 |    Real_t& yd(Index_t idx)   { return m_vel[idx].y ; }
232 |    Real_t& zd(Index_t idx)   { return m_vel[idx].z ; }
233 | 
234 |    // Nodal accelerations
235 |    Real_t& xdd(Index_t idx)  { return m_acc[idx].x ; }
236 |    Real_t& ydd(Index_t idx)  { return m_acc[idx].y ; }
237 |    Real_t& zdd(Index_t idx)  { return m_acc[idx].z ; }
238 | 
239 |    // Nodal forces
240 |    Real_t& fx(Index_t idx)   { return m_force[idx].x ; }
241 |    Real_t& fy(Index_t idx)   { return m_force[idx].y ; }
242 |    Real_t& fz(Index_t idx)   { return m_force[idx].z ; }
243 | 
244 |    // Nodal mass
245 |    Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }
246 | 
247 |    // Nodes on symmetry planes (returned by value: read-only)
248 |    Index_t symmX(Index_t idx) { return m_symmX[idx] ; }
249 |    Index_t symmY(Index_t idx) { return m_symmY[idx] ; }
250 |    Index_t symmZ(Index_t idx) { return m_symmZ[idx] ; }
251 |    bool symmXempty()          { return m_symmX.empty(); }
252 |    bool symmYempty()          { return m_symmY.empty(); }
253 |    bool symmZempty()          { return m_symmZ.empty(); }
254 | 
255 |    //
256 |    // Element-centered
257 |    //
258 |    Index_t&  regElemSize(Index_t idx) { return m_regElemSize[idx] ; }
259 |    Index_t&  regNumList(Index_t idx) { return m_regNumList[idx] ; }
260 |    Index_t*  regNumList()            { return &m_regNumList[0] ; }
261 |    Index_t*  regElemlist(Int_t r)    { return m_regElemlist[r] ; }
262 |    Index_t&  regElemlist(Int_t r, Index_t idx) { return m_regElemlist[r][idx] ; }
263 | 
264 |    Index_t*  nodelist(Index_t idx)    { return &m_nodelist[Index_t(8)*idx] ; }  // start of element idx's 8 node indices
265 | 
266 |    // elem connectivities through face
267 |    Index_t&  lxim(Index_t idx) { return m_faceToElem[idx].lxim ; }
268 |    Index_t&  lxip(Index_t idx) { return m_faceToElem[idx].lxip ; }
269 |    Index_t&  letam(Index_t idx) { return m_faceToElem[idx].letam ; }
270 |    Index_t&  letap(Index_t idx) { return m_faceToElem[idx].letap ; }
271 |    Index_t&  lzetam(Index_t idx) { return m_faceToElem[idx].lzetam ; }
272 |    Index_t&  lzetap(Index_t idx) { return m_faceToElem[idx].lzetap ; }
273 | 
274 |    // elem face symm/free-surface flag
275 |    Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }
276 | 
277 |    // Principal strains - temporary
278 |    Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }
279 |    Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }
280 |    Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }
281 | 
282 |    // New relative volume - temporary
283 |    Real_t& vnew(Index_t idx)  { return m_vnew[idx] ; }
284 | 
285 |    // Velocity gradient - temporary
286 |    Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }
287 |    Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }
288 |    Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }
289 | 
290 |    // Position gradient - temporary
291 |    Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }
292 |    Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }
293 |    Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }
294 | 
295 |    // Energy
296 |    Real_t& e(Index_t idx)          { return m_e[idx] ; }
297 | 
298 |    // Pressure
299 |    Real_t& p(Index_t idx)          { return m_pq[idx].p ; }
300 | 
301 |    // Artificial viscosity
302 |    Real_t& q(Index_t idx)          { return m_pq[idx].q ; }
303 | 
304 |    // Linear term for q
305 |    Real_t& ql(Index_t idx)         { return m_qlqq[idx].ql ; }
306 |    // Quadratic term for q
307 |    Real_t& qq(Index_t idx)         { return m_qlqq[idx].qq ; }
308 | 
309 |    Real_t& delv(Index_t idx)       { return m_delv[idx] ; }
310 | 
311 |    // Relative volume
312 |    Real_t& v(Index_t idx)          { return m_vol[idx].v ; }
313 |    // Reference volume
314 |    Real_t& volo(Index_t idx)       { return m_vol[idx].volo ; }
315 | 
316 |    // volume derivative over volume
317 |    Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }
318 | 
319 |    // Element characteristic length
320 |    Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }
321 | 
322 |    // Sound speed
323 |    Real_t& ss(Index_t idx)         { return m_ss[idx] ; }
324 | 
325 |    // Element mass
326 |    Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }
327 |    // m_nodeElemStart is an offset table: node idx owns entries [start[idx], start[idx+1]) of m_nodeElemCornerList.
328 |    Index_t nodeElemCount(Index_t idx)
329 |    { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; }
330 | 
331 |    Index_t *nodeElemCornerList(Index_t idx)
332 |    { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; }
333 | 
334 |    // Parameters 
335 | 
336 |    // Cutoffs
337 |    Real_t u_cut() const               { return m_u_cut ; }
338 |    Real_t e_cut() const               { return m_e_cut ; }
339 |    Real_t p_cut() const               { return m_p_cut ; }
340 |    Real_t q_cut() const               { return m_q_cut ; }
341 |    Real_t v_cut() const               { return m_v_cut ; }
342 | 
343 |    // Other constants (usually are settable via input file in real codes)
344 |    Real_t hgcoef() const              { return m_hgcoef ; }
345 |    Real_t qstop() const               { return m_qstop ; }
346 |    Real_t monoq_max_slope() const     { return m_monoq_max_slope ; }
347 |    Real_t monoq_limiter_mult() const  { return m_monoq_limiter_mult ; }
348 |    Real_t ss4o3() const               { return m_ss4o3 ; }
349 |    Real_t qlc_monoq() const           { return m_qlc_monoq ; }
350 |    Real_t qqc_monoq() const           { return m_qqc_monoq ; }
351 |    Real_t qqc() const                 { return m_qqc ; }
352 | 
353 |    Real_t eosvmax() const             { return m_eosvmax ; }
354 |    Real_t eosvmin() const             { return m_eosvmin ; }
355 |    Real_t pmin() const                { return m_pmin ; }
356 |    Real_t emin() const                { return m_emin ; }
357 |    Real_t dvovmax() const             { return m_dvovmax ; }
358 |    Real_t refdens() const             { return m_refdens ; }
359 | 
360 |    // Timestep controls, etc...
361 |    Real_t& time()                 { return m_time ; }
362 |    Real_t& deltatime()            { return m_deltatime ; }
363 |    Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }
364 |    Real_t& deltatimemultub()      { return m_deltatimemultub ; }
365 |    Real_t& stoptime()             { return m_stoptime ; }
366 |    Real_t& dtcourant()            { return m_dtcourant ; }
367 |    Real_t& dthydro()              { return m_dthydro ; }
368 |    Real_t& dtmax()                { return m_dtmax ; }
369 |    Real_t& dtfixed()              { return m_dtfixed ; }
370 | 
371 |    Int_t&  cycle()                { return m_cycle ; }
372 |    Int_t&    numRanks()           { return m_numRanks ; }  // Int_t& to match m_numRanks (keeps compiling if Index_t and Int_t ever differ)
373 | 
374 |    Index_t&  colLoc()             { return m_colLoc ; }
375 |    Index_t&  rowLoc()             { return m_rowLoc ; }
376 |    Index_t&  planeLoc()           { return m_planeLoc ; }
377 |    Index_t&  tp()                 { return m_tp ; }
378 | 
379 |    Index_t&  sizeX()              { return m_sizeX ; }
380 |    Index_t&  sizeY()              { return m_sizeY ; }
381 |    Index_t&  sizeZ()              { return m_sizeZ ; }
382 |    Int_t&    numReg()             { return m_numReg ; }  // Int_t& to match m_numReg
383 |    Int_t&  cost()             { return m_cost ; }
384 |    Index_t&  numElem()            { return m_numElem ; }
385 |    Index_t&  numNode()            { return m_numNode ; }
386 |    
387 |    Index_t&  maxPlaneSize()       { return m_maxPlaneSize ; }
388 |    Index_t&  maxEdgeSize()        { return m_maxEdgeSize ; }
389 |    
390 |    //
391 |    // MPI-Related additional data
392 |    //
393 | 
394 | #if USE_MPI   
395 |    // Communication Work space 
396 |    Real_t *commDataSend ;
397 |    Real_t *commDataRecv ;
398 |    
399 |    // Maximum number of block neighbors 
400 |    MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 
401 |    MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners 
402 | #endif
403 | 
404 |   private:
405 |    // Setup helpers; declared here, defined outside this header.
406 |    void BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems);
407 |    void SetupThreadSupportStructures();
408 |    void CreateRegionIndexSets(Int_t nreg, Int_t balance);
409 |    void SetupCommBuffers(Int_t edgeNodes);
410 |    void SetupSymmetryPlanes(Int_t edgeNodes);
411 |    void SetupElementConnectivities(Int_t edgeElems);
412 |    void SetupBoundaryConditions(Int_t edgeElems);
413 | 
414 |    //
415 |    // IMPLEMENTATION
416 |    //
417 | 
418 |    /* Node-centered */
419 | 
420 |    struct Tuple3 {
421 |       Real_t x, y, z ;
422 |    } ;
423 | 
424 |    std::vector<Tuple3> m_coord ;  /* coordinates */
425 | 
426 |    std::vector<Tuple3> m_vel ; /* velocities */
427 | 
428 |    std::vector<Tuple3> m_acc ; /* accelerations */
429 | 
430 |    std::vector<Tuple3> m_force ;  /* forces */
431 | 
432 |    std::vector<Real_t> m_nodalMass ;  /* mass */
433 | 
434 |    std::vector<Index_t> m_symmX ;  /* symmetry plane nodesets */
435 |    std::vector<Index_t> m_symmY ;
436 |    std::vector<Index_t> m_symmZ ;
437 | 
438 |    // Element-centered
439 | 
440 |    // Region information
441 |    Int_t    m_numReg ;
442 |    Int_t    m_cost; //imbalance cost
443 |    Index_t *m_regElemSize ;   // Size of region sets
444 |    Index_t *m_regNumList ;    // Region number per domain element
445 |    Index_t **m_regElemlist ;  // region indexset 
446 | 
447 |    std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */
448 | 
449 |    struct FaceElemConn {
450 |       Index_t lxim, lxip, letam, letap, lzetam, lzetap ;
451 |    } ;
452 | 
453 |    std::vector<FaceElemConn> m_faceToElem ; /* element conn across faces */
454 | 
455 |    std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */
456 | 
457 |    std::vector<Real_t> m_dxx ;  /* principal strains -- temporary */
458 |    std::vector<Real_t> m_dyy ;
459 |    std::vector<Real_t> m_dzz ;
460 | 
461 |    std::vector<Real_t> m_delv_xi ;    /* velocity gradient -- temporary */
462 |    std::vector<Real_t> m_delv_eta ;
463 |    std::vector<Real_t> m_delv_zeta ;
464 | 
465 |    std::vector<Real_t> m_delx_xi ;    /* coordinate gradient -- temporary */
466 |    std::vector<Real_t> m_delx_eta ;
467 |    std::vector<Real_t> m_delx_zeta ;
468 |    
469 |    std::vector<Real_t> m_e ;   /* energy */
470 | 
471 |    struct Pcomponents {
472 |       Real_t p, q ;
473 |    } ;
474 | 
475 |    std::vector<Pcomponents> m_pq ;   /* pressure and artificial viscosity */
476 | 
477 |    struct Qcomponents {
478 |       Real_t ql, qq ;
479 |    } ;
480 | 
481 |    std::vector<Qcomponents> m_qlqq ;  /* linear and quadratic terms for q */
482 | 
483 |    struct Volume {
484 |       Real_t v, volo ;
485 |    } ;
486 | 
487 |    std::vector<Volume> m_vol ;     /* relative and reference volume */
488 | 
489 |    std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */
490 |    std::vector<Real_t> m_delv ;  /* m_vnew - m_v */
491 |    std::vector<Real_t> m_vdov ;  /* volume derivative over volume */
492 | 
493 |    std::vector<Real_t> m_arealg ;  /* characteristic length of an element */
494 |    
495 |    std::vector<Real_t> m_ss ;      /* "sound speed" */
496 | 
497 |    std::vector<Real_t> m_elemMass ;  /* mass */
498 | 
499 |    // Cutoffs (treat as constants)
500 |    const Real_t  m_e_cut ;             // energy tolerance 
501 |    const Real_t  m_p_cut ;             // pressure tolerance 
502 |    const Real_t  m_q_cut ;             // q tolerance 
503 |    const Real_t  m_v_cut ;             // relative volume tolerance 
504 |    const Real_t  m_u_cut ;             // velocity tolerance 
505 | 
506 |    // Other constants (usually settable, but hardcoded in this proxy app)
507 | 
508 |    const Real_t  m_hgcoef ;            // hourglass control 
509 |    const Real_t  m_ss4o3 ;
510 |    const Real_t  m_qstop ;             // excessive q indicator 
511 |    const Real_t  m_monoq_max_slope ;
512 |    const Real_t  m_monoq_limiter_mult ;
513 |    const Real_t  m_qlc_monoq ;         // linear term coef for q 
514 |    const Real_t  m_qqc_monoq ;         // quadratic term coef for q 
515 |    const Real_t  m_qqc ;
516 |    const Real_t  m_eosvmax ;
517 |    const Real_t  m_eosvmin ;
518 |    const Real_t  m_pmin ;              // pressure floor 
519 |    const Real_t  m_emin ;              // energy floor 
520 |    const Real_t  m_dvovmax ;           // maximum allowable volume change 
521 |    const Real_t  m_refdens ;           // reference density 
522 | 
523 |    // Variables to keep track of timestep, simulation time, and cycle
524 |    Real_t  m_dtcourant ;         // courant constraint 
525 |    Real_t  m_dthydro ;           // volume change constraint 
526 |    Int_t   m_cycle ;             // iteration count for simulation 
527 |    Real_t  m_dtfixed ;           // fixed time increment 
528 |    Real_t  m_time ;              // current time 
529 |    Real_t  m_deltatime ;         // variable time increment 
530 |    Real_t  m_deltatimemultlb ;
531 |    Real_t  m_deltatimemultub ;
532 |    Real_t  m_dtmax ;             // maximum allowable time increment 
533 |    Real_t  m_stoptime ;          // end time for simulation 
534 | 
535 | 
536 |    Int_t   m_numRanks ;
537 | 
538 |    Index_t m_colLoc ;
539 |    Index_t m_rowLoc ;
540 |    Index_t m_planeLoc ;
541 |    Index_t m_tp ;
542 | 
543 |    Index_t m_sizeX ;
544 |    Index_t m_sizeY ;
545 |    Index_t m_sizeZ ;
546 |    Index_t m_numElem ;
547 |    Index_t m_numNode ;
548 | 
549 |    Index_t m_maxPlaneSize ;
550 |    Index_t m_maxEdgeSize ;
551 | 
552 |    // OMP hack 
553 |    Index_t *m_nodeElemStart ;
554 |    Index_t *m_nodeElemCornerList ;
555 | 
556 |    // Used in setup
557 |    Index_t m_rowMin, m_rowMax;
558 |    Index_t m_colMin, m_colMax;
559 |    Index_t m_planeMin, m_planeMax ;
560 | 
561 | } ;
562 | // Pointer to a Domain member function shaped like Real_t& f(Index_t); CommSend and CommSBN take arrays of these.
563 | typedef Real_t &(Domain::* Domain_member )(Index_t) ;
564 | // Run options, one field per command-line switch (the switch is noted beside each field).
565 | struct cmdLineOpts {
566 |    Int_t its; // -i (presumably the iteration count -- confirm in lulesh-util)
567 |    Int_t nx;  // -s (presumably the per-edge problem size -- confirm in lulesh-util)
568 |    Int_t numReg; // -r (number of regions, going by the name -- confirm)
569 |    Int_t numFiles; // -f
570 |    Int_t showProg; // -p
571 |    Int_t quiet; // -q
572 |    Int_t viz; // -v 
573 |    Int_t cost; // -c
574 |    Int_t balance; // -b
575 | };
576 | 
577 | 
578 | 
579 | // Function Prototypes
580 | // Declarations only. Each group comment names the source file expected to define the functions under it.
581 | // lulesh-par (no file of that name is in this source tree; presumably lulesh.cc -- confirm)
582 | Real_t CalcElemVolume( const Real_t x[8],   // three 8-entry coordinate arrays in, one Real_t out;
583 |                        const Real_t y[8],   // presumably the volume of the 8-node element they
584 |                        const Real_t z[8]);  // describe -- confirm against the definition
585 | 
586 | // lulesh-util
587 | void ParseCommandLineOptions(int argc, char *argv[],
588 |                              Int_t myRank, struct cmdLineOpts *opts);
589 | void VerifyAndWriteFinalOutput(Real_t elapsed_time,
590 |                                Domain& locDom,
591 |                                Int_t nx,
592 |                                Int_t numRanks);
593 | 
594 | // lulesh-viz
595 | void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks);
596 | // Note the asymmetry below: CommSend takes the field accessors (fieldData), CommRecv does not.
597 | // lulesh-comm
598 | void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields,
599 |               Index_t dx, Index_t dy, Index_t dz,
600 |               bool doRecv, bool planeOnly);
601 | void CommSend(Domain& domain, Int_t msgType,
602 |               Index_t xferFields, Domain_member *fieldData,
603 |               Index_t dx, Index_t dy, Index_t dz,
604 |               bool doSend, bool planeOnly);
605 | void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData);
606 | void CommSyncPosVel(Domain& domain);
607 | void CommMonoQ(Domain& domain);
608 | // The four Int_t* arguments below look like outputs (pointers to non-const) -- confirm in lulesh-init.
609 | // lulesh-init
610 | void InitMeshDecomp(Int_t numRanks, Int_t myRank,
611 |                     Int_t *col, Int_t *row, Int_t *plane, Int_t *side);
612 | 


--------------------------------------------------------------------------------