├── CMakeLists.txt
├── Makefile
├── README
├── TODO
├── lulesh-comm.cc
├── lulesh-init.cc
├── lulesh-util.cc
├── lulesh-viz.cc
├── lulesh.cc
├── lulesh.h
└── lulesh_tuple.h

/CMakeLists.txt:
--------------------------------------------------------------------------------
# Build description for the LULESH 2.0 proxy application.
#
# 3.9 is the first CMake release that provides the MPI::MPI_CXX and
# OpenMP::OpenMP_CXX imported targets used below.  The upper bound opts in to
# current policy behaviour, so recent CMake releases (which no longer accept
# policy versions older than 3.5) can still configure this project.
cmake_minimum_required(VERSION 3.9...3.29)

project(LULESH LANGUAGES CXX)

# User-facing switches (see README).
option(WITH_MPI    "Build LULESH with MPI"          TRUE)
option(WITH_OPENMP "Build LULESH with OpenMP"       TRUE)
option(WITH_SILO   "Build LULESH with silo support" FALSE)

set(LULESH_EXEC lulesh2.0)

# The executable is declared first so that every option below can attach its
# requirements to the target instead of to the whole directory.
add_executable(${LULESH_EXEC}
  lulesh-comm.cc
  lulesh-init.cc
  lulesh-util.cc
  lulesh-viz.cc
  lulesh.cc)

# USE_MPI is always defined, as 0 or 1, because the sources test it with #if.
if(WITH_MPI)
  find_package(MPI REQUIRED)
  target_compile_definitions(${LULESH_EXEC} PRIVATE USE_MPI=1)
  # Only CXX is enabled for this project, so only the CXX binding is linked.
  target_link_libraries(${LULESH_EXEC} PRIVATE MPI::MPI_CXX)
else()
  target_compile_definitions(${LULESH_EXEC} PRIVATE USE_MPI=0)
endif()

if(WITH_OPENMP)
  find_package(OpenMP REQUIRED)
  # The imported target carries the OpenMP flag for both compiling and linking.
  target_link_libraries(${LULESH_EXEC} PRIVATE OpenMP::OpenMP_CXX)
endif()

if(WITH_SILO)
  find_path(SILO_INCLUDE_DIR silo.h
    HINTS "${SILO_DIR}/include")
  find_library(SILO_LIBRARY
    NAMES siloh5
    HINTS "${SILO_DIR}/lib")

  include(FindPackageHandleStandardArgs)
  find_package_handle_standard_args(SILO DEFAULT_MSG
    SILO_LIBRARY
    SILO_INCLUDE_DIR)

  if(SILO_FOUND)
    target_compile_definitions(${LULESH_EXEC} PRIVATE VIZ_MESH)
    target_include_directories(${LULESH_EXEC} PRIVATE "${SILO_INCLUDE_DIR}")
    #   Note: silo needs to be built as a dynamic lib, otherwise
    # there are additional dependencies (hdf5) which we don't know.
    # This would be fixed by silo providing a CMake package.
    target_link_libraries(${LULESH_EXEC} PRIVATE "${SILO_LIBRARY}")
  else()
    # Keep building (as before), but make the missing feature visible.
    message(WARNING
      "WITH_SILO is enabled but silo was not found; building without "
      "visualization support. Set SILO_DIR to the silo installation prefix.")
  endif()
endif()

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
#default build suggestion of MPI + OPENMP with gcc on Livermore machines you might have to change the compiler name

SHELL = /bin/sh
.SUFFIXES: .cc .o

LULESH_EXEC = lulesh2.0

MPI_INC = /opt/local/include/openmpi
MPI_LIB = /opt/local/lib

SERCXX = g++ -DUSE_MPI=0
MPICXX = mpig++ -DUSE_MPI=1
CXX = $(MPICXX)

SOURCES2.0 = \
	lulesh.cc \
	lulesh-comm.cc \
	lulesh-viz.cc \
	lulesh-util.cc \
	lulesh-init.cc
OBJECTS2.0 = $(SOURCES2.0:.cc=.o)

#Default build suggestions with OpenMP for g++
CXXFLAGS = -g -O3 -fopenmp -I. -Wall
LDFLAGS = -g -O3 -fopenmp

#Below are reasonable default flags for a serial build
#CXXFLAGS = -g -O3 -I. -Wall
#LDFLAGS = -g -O3

#common places you might find silo on the Livermore machines.
#SILO_INCDIR = /opt/local/include
#SILO_LIBDIR = /opt/local/lib
#SILO_INCDIR = ./silo/4.9/1.8.10.1/include
#SILO_LIBDIR = ./silo/4.9/1.8.10.1/lib

#If you do not have silo and visit you can get them at:
#silo: https://wci.llnl.gov/codes/silo/downloads.html
#visit: https://wci.llnl.gov/codes/visit/download.html

#below is and example of how to make with silo, hdf5 to get vizulization by default all this is turned off. All paths are Livermore specific.
#CXXFLAGS = -g -DVIZ_MESH -I${SILO_INCDIR} -Wall -Wno-pragmas
#LDFLAGS = -g -L${SILO_LIBDIR} -Wl,-rpath -Wl,${SILO_LIBDIR} -lsiloh5 -lhdf5

# These targets are commands, not files produced by their recipes.
.PHONY: all clean tar

# Suffix rule: compile one .cc file into the matching .o file.
.cc.o:
	@echo "Building $<"
	$(CXX) -c $(CXXFLAGS) -o $@ $<

# Default goal.
all: $(LULESH_EXEC)

# A suffix rule cannot carry prerequisites of its own, so the dependency of
# every object file on the shared header is declared as a separate rule.
$(OBJECTS2.0): lulesh.h

$(LULESH_EXEC): $(OBJECTS2.0)
	@echo "Linking"
	$(CXX) $(OBJECTS2.0) $(LDFLAGS) -lm -o $@

clean:
	/bin/rm -f *.o *~ $(OBJECTS2.0) $(LULESH_EXEC)
	/bin/rm -rf *.dSYM

tar: clean
	cd .. ; tar cvf lulesh-2.0.tar LULESH-2.0 ; mv lulesh-2.0.tar LULESH-2.0

--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
This is the README for LULESH 2.0

More information including LULESH 1.0 can be found at https://codesign.llnl.gov/lulesh.php

If you have any questions or problems please contact:

Ian Karlin or
Rob Neely

Also please send any notable results to Ian Karlin as we are still evaluating the performance of this code.

A Makefile and a CMake build system are provided.

*** Building with CMake ***

Create a build directory and run cmake. Example:

  $ mkdir build; cd build; cmake -DCMAKE_BUILD_TYPE=Release -DMPI_CXX_COMPILER=`which mpicxx` ..

CMake variables:

  CMAKE_BUILD_TYPE      "Debug", "Release", or "RelWithDebInfo"

  CMAKE_CXX_COMPILER    Path to the C++ compiler
  MPI_CXX_COMPILER      Path to the MPI C++ compiler

  WITH_MPI=On|Off       Build with MPI (Default: On)
  WITH_OPENMP=On|Off    Build with OpenMP support (Default: On)
  WITH_SILO=On|Off      Build with support for SILO. (Default: Off).

  SILO_DIR              Path to SILO library (only needed when WITH_SILO is "On")

*** Notable changes in LULESH 2.0 ***

Split functionality into different files
lulesh.cc - where most (all?)
of the timed functionality lies 37 | lulesh-comm.cc - MPI functionality 38 | lulesh-init.cc - Setup code 39 | lulesh-viz.cc - Support for visualization option 40 | lulesh-util.cc - Non-timed functions 41 | 42 | The concept of "regions" was added, although every region is the same ideal gas material, and the same Sedov blast wave problem is still the only problem it's hardcoded to solve. Regions allow two things important to making this proxy app more representative: 43 | 44 | Four of the LULESH routines are now performed on a region-by-region basis, making the memory access patterns non-unit stride 45 | 46 | Artificial load imbalances can be easily introduced that could impact parallelization strategies. 47 | * The load balance flag changes region assignment. Region number is raised to the power entered for assignment probability. The most likely region changes with MPI process id. 48 | * The cost flag raises the cost of ~45% of the regions to evaluate EOS by the entered multiple. The cost of 5% is 10x the entered 49 | multiple. 50 | 51 | MPI and OpenMP were added, and coalesced into a single version of the source that can support serial builds, MPI-only, OpenMP-only, and MPI+OpenMP 52 | 53 | Added support to write plot files using "poor man's parallel I/O" when linked with the silo library, which in turn can be read by VisIt. 54 | 55 | Enabled variable timestep calculation by default (Courant condition), which results in an additional reduction. Also, seeded the initial timestep based on an analytical equation to allow scaling to arbitrary size. Therefore steps to solution will differ from LULESH 1.0. 
56 | 57 | Default domain (mesh) size reduced from 45^3 to 30^3 58 | 59 | Command line options to allow for numerous test cases without needing to recompile 60 | 61 | Performance optimizations and code cleanup uncovered during study of LULESH 1.0 62 | 63 | Added a "Figure of Merit" calculation (elements solved per microsecond) and output in support of using LULESH 2.0 for the 2017 CORAL procurement 64 | 65 | *** Notable changes in LULESH 2.1 *** 66 | 67 | Minor bug fixes. 68 | Code cleanup to add consistency to variable names, loop indexing, memory allocation/deallocation, etc. 69 | Destructor added to main class to clean up when code exits. 70 | 71 | 72 | Possible Future 2.0 minor updates (other changes possible as discovered) 73 | 74 | * Different default parameters 75 | * Minor code performance changes and cleanups 76 | 77 | TODO in future versions 78 | * Add reader for (truly) unstructured meshes, probably serial only 79 | 80 | 81 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | cmake build (Abhinav) 2 | -------------------------------------------------------------------------------- /lulesh-comm.cc: -------------------------------------------------------------------------------- 1 | #include "lulesh.h" 2 | 3 | // If no MPI, then this whole file is stubbed out 4 | #if USE_MPI 5 | 6 | #include 7 | #include 8 | 9 | /* Comm Routines */ 10 | 11 | #define ALLOW_UNPACKED_PLANE false 12 | #define ALLOW_UNPACKED_ROW false 13 | #define ALLOW_UNPACKED_COL false 14 | 15 | /* 16 | There are coherence issues for packing and unpacking message 17 | buffers. Ideally, you would like a lot of threads to 18 | cooperate in the assembly/disassembly of each message. 19 | To do that, each thread should really be operating in a 20 | different coherence zone. 21 | 22 | Let's assume we have three fields, f1 through f3, defined on 23 | a 61x61x61 cube. 
If we want to send the block boundary 24 | information for each field to each neighbor processor across 25 | each cube face, then we have three cases for the 26 | memory layout/coherence of data on each of the six cube 27 | boundaries: 28 | 29 | (a) Two of the faces will be in contiguous memory blocks 30 | (b) Two of the faces will be comprised of pencils of 31 | contiguous memory. 32 | (c) Two of the faces will have large strides between 33 | every value living on the face. 34 | 35 | How do you pack and unpack this data in buffers to 36 | simultaneously achieve the best memory efficiency and 37 | the most thread independence? 38 | 39 | Do you pack fields f1 through f3 tightly to reduce message 40 | size? Do you align each field on a cache coherence boundary 41 | within the message so that threads can pack and unpack each 42 | field independently? For case (b), do you align each 43 | boundary pencil of each field separately? This increases 44 | the message size, but could improve cache coherence so 45 | each pencil could be processed independently by a separate 46 | thread with no conflicts. 47 | 48 | Also, memory access for case (c) would best be done without 49 | going through the cache (the stride is so large it just causes 50 | a lot of useless cache evictions). Is it worth creating 51 | a special case version of the packing algorithm that uses 52 | non-coherent load/store opcodes? 
53 | */ 54 | 55 | /******************************************/ 56 | 57 | 58 | /* doRecv flag only works with regular block structure */ 59 | void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields, 60 | Index_t dx, Index_t dy, Index_t dz, bool doRecv, bool planeOnly) { 61 | 62 | if (domain.numRanks() == 1) 63 | return ; 64 | 65 | /* post recieve buffers for all incoming messages */ 66 | int myRank ; 67 | Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ; 68 | Index_t maxEdgeComm = xferFields * domain.maxEdgeSize() ; 69 | Index_t pmsg = 0 ; /* plane comm msg */ 70 | Index_t emsg = 0 ; /* edge comm msg */ 71 | Index_t cmsg = 0 ; /* corner comm msg */ 72 | MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE) ; 73 | bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ; 74 | 75 | /* assume communication to 6 neighbors by default */ 76 | rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ; 77 | 78 | if (domain.rowLoc() == 0) { 79 | rowMin = false ; 80 | } 81 | if (domain.rowLoc() == (domain.tp()-1)) { 82 | rowMax = false ; 83 | } 84 | if (domain.colLoc() == 0) { 85 | colMin = false ; 86 | } 87 | if (domain.colLoc() == (domain.tp()-1)) { 88 | colMax = false ; 89 | } 90 | if (domain.planeLoc() == 0) { 91 | planeMin = false ; 92 | } 93 | if (domain.planeLoc() == (domain.tp()-1)) { 94 | planeMax = false ; 95 | } 96 | 97 | for (Index_t i=0; i<26; ++i) { 98 | domain.recvRequest[i] = MPI_REQUEST_NULL ; 99 | } 100 | 101 | MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ; 102 | 103 | /* post receives */ 104 | 105 | /* receive data from neighboring domain faces */ 106 | if (planeMin && doRecv) { 107 | /* contiguous memory */ 108 | int fromRank = myRank - domain.tp()*domain.tp() ; 109 | int recvCount = dx * dy * xferFields ; 110 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], 111 | recvCount, baseType, fromRank, msgType, 112 | MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ; 113 | ++pmsg ; 114 | } 115 | if (planeMax) { 116 | /* 
contiguous memory */ 117 | int fromRank = myRank + domain.tp()*domain.tp() ; 118 | int recvCount = dx * dy * xferFields ; 119 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], 120 | recvCount, baseType, fromRank, msgType, 121 | MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ; 122 | ++pmsg ; 123 | } 124 | if (rowMin && doRecv) { 125 | /* semi-contiguous memory */ 126 | int fromRank = myRank - domain.tp() ; 127 | int recvCount = dx * dz * xferFields ; 128 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], 129 | recvCount, baseType, fromRank, msgType, 130 | MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ; 131 | ++pmsg ; 132 | } 133 | if (rowMax) { 134 | /* semi-contiguous memory */ 135 | int fromRank = myRank + domain.tp() ; 136 | int recvCount = dx * dz * xferFields ; 137 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], 138 | recvCount, baseType, fromRank, msgType, 139 | MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ; 140 | ++pmsg ; 141 | } 142 | if (colMin && doRecv) { 143 | /* scattered memory */ 144 | int fromRank = myRank - 1 ; 145 | int recvCount = dy * dz * xferFields ; 146 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], 147 | recvCount, baseType, fromRank, msgType, 148 | MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ; 149 | ++pmsg ; 150 | } 151 | if (colMax) { 152 | /* scattered memory */ 153 | int fromRank = myRank + 1 ; 154 | int recvCount = dy * dz * xferFields ; 155 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm], 156 | recvCount, baseType, fromRank, msgType, 157 | MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ; 158 | ++pmsg ; 159 | } 160 | 161 | if (!planeOnly) { 162 | /* receive data from domains connected only by an edge */ 163 | if (rowMin && colMin && doRecv) { 164 | int fromRank = myRank - domain.tp() - 1 ; 165 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 166 | emsg * maxEdgeComm], 167 | dz * xferFields, baseType, fromRank, msgType, 168 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; 169 | ++emsg ; 170 | } 171 | 172 | if 
(rowMin && planeMin && doRecv) { 173 | int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() ; 174 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 175 | emsg * maxEdgeComm], 176 | dx * xferFields, baseType, fromRank, msgType, 177 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; 178 | ++emsg ; 179 | } 180 | 181 | if (colMin && planeMin && doRecv) { 182 | int fromRank = myRank - domain.tp()*domain.tp() - 1 ; 183 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 184 | emsg * maxEdgeComm], 185 | dy * xferFields, baseType, fromRank, msgType, 186 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; 187 | ++emsg ; 188 | } 189 | 190 | if (rowMax && colMax) { 191 | int fromRank = myRank + domain.tp() + 1 ; 192 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 193 | emsg * maxEdgeComm], 194 | dz * xferFields, baseType, fromRank, msgType, 195 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; 196 | ++emsg ; 197 | } 198 | 199 | if (rowMax && planeMax) { 200 | int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() ; 201 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 202 | emsg * maxEdgeComm], 203 | dx * xferFields, baseType, fromRank, msgType, 204 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; 205 | ++emsg ; 206 | } 207 | 208 | if (colMax && planeMax) { 209 | int fromRank = myRank + domain.tp()*domain.tp() + 1 ; 210 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 211 | emsg * maxEdgeComm], 212 | dy * xferFields, baseType, fromRank, msgType, 213 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; 214 | ++emsg ; 215 | } 216 | 217 | if (rowMax && colMin) { 218 | int fromRank = myRank + domain.tp() - 1 ; 219 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 220 | emsg * maxEdgeComm], 221 | dz * xferFields, baseType, fromRank, msgType, 222 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; 223 | ++emsg ; 224 | } 225 | 226 | if (rowMin && planeMax) { 227 | int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() ; 228 
| MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 229 | emsg * maxEdgeComm], 230 | dx * xferFields, baseType, fromRank, msgType, 231 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; 232 | ++emsg ; 233 | } 234 | 235 | if (colMin && planeMax) { 236 | int fromRank = myRank + domain.tp()*domain.tp() - 1 ; 237 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 238 | emsg * maxEdgeComm], 239 | dy * xferFields, baseType, fromRank, msgType, 240 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; 241 | ++emsg ; 242 | } 243 | 244 | if (rowMin && colMax && doRecv) { 245 | int fromRank = myRank - domain.tp() + 1 ; 246 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 247 | emsg * maxEdgeComm], 248 | dz * xferFields, baseType, fromRank, msgType, 249 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; 250 | ++emsg ; 251 | } 252 | 253 | if (rowMax && planeMin && doRecv) { 254 | int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() ; 255 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 256 | emsg * maxEdgeComm], 257 | dx * xferFields, baseType, fromRank, msgType, 258 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; 259 | ++emsg ; 260 | } 261 | 262 | if (colMax && planeMin && doRecv) { 263 | int fromRank = myRank - domain.tp()*domain.tp() + 1 ; 264 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 265 | emsg * maxEdgeComm], 266 | dy * xferFields, baseType, fromRank, msgType, 267 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ; 268 | ++emsg ; 269 | } 270 | 271 | /* receive data from domains connected only by a corner */ 272 | if (rowMin && colMin && planeMin && doRecv) { 273 | /* corner at domain logical coord (0, 0, 0) */ 274 | int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() - 1 ; 275 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 276 | emsg * maxEdgeComm + 277 | cmsg * CACHE_COHERENCE_PAD_REAL], 278 | xferFields, baseType, fromRank, msgType, 279 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; 280 | ++cmsg ; 
281 | } 282 | if (rowMin && colMin && planeMax) { 283 | /* corner at domain logical coord (0, 0, 1) */ 284 | int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() - 1 ; 285 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 286 | emsg * maxEdgeComm + 287 | cmsg * CACHE_COHERENCE_PAD_REAL], 288 | xferFields, baseType, fromRank, msgType, 289 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; 290 | ++cmsg ; 291 | } 292 | if (rowMin && colMax && planeMin && doRecv) { 293 | /* corner at domain logical coord (1, 0, 0) */ 294 | int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() + 1 ; 295 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 296 | emsg * maxEdgeComm + 297 | cmsg * CACHE_COHERENCE_PAD_REAL], 298 | xferFields, baseType, fromRank, msgType, 299 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; 300 | ++cmsg ; 301 | } 302 | if (rowMin && colMax && planeMax) { 303 | /* corner at domain logical coord (1, 0, 1) */ 304 | int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() + 1 ; 305 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 306 | emsg * maxEdgeComm + 307 | cmsg * CACHE_COHERENCE_PAD_REAL], 308 | xferFields, baseType, fromRank, msgType, 309 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; 310 | ++cmsg ; 311 | } 312 | if (rowMax && colMin && planeMin && doRecv) { 313 | /* corner at domain logical coord (0, 1, 0) */ 314 | int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() - 1 ; 315 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 316 | emsg * maxEdgeComm + 317 | cmsg * CACHE_COHERENCE_PAD_REAL], 318 | xferFields, baseType, fromRank, msgType, 319 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; 320 | ++cmsg ; 321 | } 322 | if (rowMax && colMin && planeMax) { 323 | /* corner at domain logical coord (0, 1, 1) */ 324 | int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() - 1 ; 325 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 326 | emsg * maxEdgeComm + 327 | cmsg * 
CACHE_COHERENCE_PAD_REAL], 328 | xferFields, baseType, fromRank, msgType, 329 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; 330 | ++cmsg ; 331 | } 332 | if (rowMax && colMax && planeMin && doRecv) { 333 | /* corner at domain logical coord (1, 1, 0) */ 334 | int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() + 1 ; 335 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 336 | emsg * maxEdgeComm + 337 | cmsg * CACHE_COHERENCE_PAD_REAL], 338 | xferFields, baseType, fromRank, msgType, 339 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; 340 | ++cmsg ; 341 | } 342 | if (rowMax && colMax && planeMax) { 343 | /* corner at domain logical coord (1, 1, 1) */ 344 | int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() + 1 ; 345 | MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm + 346 | emsg * maxEdgeComm + 347 | cmsg * CACHE_COHERENCE_PAD_REAL], 348 | xferFields, baseType, fromRank, msgType, 349 | MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ; 350 | ++cmsg ; 351 | } 352 | } 353 | } 354 | 355 | /******************************************/ 356 | 357 | void CommSend(Domain& domain, Int_t msgType, 358 | Index_t xferFields, Domain_member *fieldData, 359 | Index_t dx, Index_t dy, Index_t dz, bool doSend, bool planeOnly) 360 | { 361 | 362 | if (domain.numRanks() == 1) 363 | return ; 364 | 365 | /* post recieve buffers for all incoming messages */ 366 | int myRank ; 367 | Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ; 368 | Index_t maxEdgeComm = xferFields * domain.maxEdgeSize() ; 369 | Index_t pmsg = 0 ; /* plane comm msg */ 370 | Index_t emsg = 0 ; /* edge comm msg */ 371 | Index_t cmsg = 0 ; /* corner comm msg */ 372 | MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? 
MPI_FLOAT : MPI_DOUBLE) ; 373 | MPI_Status status[26] ; 374 | Real_t *destAddr ; 375 | bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ; 376 | /* assume communication to 6 neighbors by default */ 377 | rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ; 378 | if (domain.rowLoc() == 0) { 379 | rowMin = false ; 380 | } 381 | if (domain.rowLoc() == (domain.tp()-1)) { 382 | rowMax = false ; 383 | } 384 | if (domain.colLoc() == 0) { 385 | colMin = false ; 386 | } 387 | if (domain.colLoc() == (domain.tp()-1)) { 388 | colMax = false ; 389 | } 390 | if (domain.planeLoc() == 0) { 391 | planeMin = false ; 392 | } 393 | if (domain.planeLoc() == (domain.tp()-1)) { 394 | planeMax = false ; 395 | } 396 | 397 | for (Index_t i=0; i<26; ++i) { 398 | domain.sendRequest[i] = MPI_REQUEST_NULL ; 399 | } 400 | 401 | MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ; 402 | 403 | /* post sends */ 404 | 405 | if (planeMin | planeMax) { 406 | /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */ 407 | int sendCount = dx * dy ; 408 | 409 | if (planeMin) { 410 | destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ; 411 | for (Index_t fi=0 ; fi 2 | #if USE_MPI 3 | # include 4 | #endif 5 | #if _OPENMP 6 | #include 7 | #endif 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "lulesh.h" 14 | 15 | ///////////////////////////////////////////////////////////////////// 16 | Domain::Domain(Int_t numRanks, Index_t colLoc, 17 | Index_t rowLoc, Index_t planeLoc, 18 | Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost) 19 | : 20 | m_e_cut(Real_t(1.0e-7)), 21 | m_p_cut(Real_t(1.0e-7)), 22 | m_q_cut(Real_t(1.0e-7)), 23 | m_v_cut(Real_t(1.0e-10)), 24 | m_u_cut(Real_t(1.0e-7)), 25 | m_hgcoef(Real_t(3.0)), 26 | m_ss4o3(Real_t(4.0)/Real_t(3.0)), 27 | m_qstop(Real_t(1.0e+12)), 28 | m_monoq_max_slope(Real_t(1.0)), 29 | m_monoq_limiter_mult(Real_t(2.0)), 30 | m_qlc_monoq(Real_t(0.5)), 31 | m_qqc_monoq(Real_t(2.0)/Real_t(3.0)), 32 | m_qqc(Real_t(2.0)), 33 | 
m_eosvmax(Real_t(1.0e+9)), 34 | m_eosvmin(Real_t(1.0e-9)), 35 | m_pmin(Real_t(0.)), 36 | m_emin(Real_t(-1.0e+15)), 37 | m_dvovmax(Real_t(0.1)), 38 | m_refdens(Real_t(1.0)), 39 | // 40 | // set pointers to (potentially) "new'd" arrays to null to 41 | // simplify deallocation. 42 | // 43 | m_regNumList(0), 44 | m_nodeElemStart(0), 45 | m_nodeElemCornerList(0), 46 | m_regElemSize(0), 47 | m_regElemlist(0) 48 | #if USE_MPI 49 | , 50 | commDataSend(0), 51 | commDataRecv(0) 52 | #endif 53 | { 54 | 55 | Index_t edgeElems = nx ; 56 | Index_t edgeNodes = edgeElems+1 ; 57 | this->cost() = cost; 58 | 59 | m_tp = tp ; 60 | m_numRanks = numRanks ; 61 | 62 | /////////////////////////////// 63 | // Initialize Sedov Mesh 64 | /////////////////////////////// 65 | 66 | // construct a uniform box for this processor 67 | 68 | m_colLoc = colLoc ; 69 | m_rowLoc = rowLoc ; 70 | m_planeLoc = planeLoc ; 71 | 72 | m_sizeX = edgeElems ; 73 | m_sizeY = edgeElems ; 74 | m_sizeZ = edgeElems ; 75 | m_numElem = edgeElems*edgeElems*edgeElems ; 76 | 77 | m_numNode = edgeNodes*edgeNodes*edgeNodes ; 78 | 79 | m_regNumList = new Index_t[numElem()] ; // material indexset 80 | 81 | // Elem-centered 82 | AllocateElemPersistent(numElem()) ; 83 | 84 | // Node-centered 85 | AllocateNodePersistent(numNode()) ; 86 | 87 | SetupCommBuffers(edgeNodes); 88 | 89 | // Basic Field Initialization 90 | for (Index_t i=0; i 1) { 281 | // set up node-centered indexing of elements 282 | Index_t *nodeElemCount = new Index_t[numNode()] ; 283 | 284 | for (Index_t i=0; i numElem()*8)) { 325 | fprintf(stderr, 326 | "AllocateNodeElemIndexes(): nodeElemCornerList entry out of range!\n"); 327 | #if USE_MPI 328 | MPI_Abort(MPI_COMM_WORLD, -1); 329 | #else 330 | exit(-1); 331 | #endif 332 | } 333 | } 334 | 335 | delete [] nodeElemCount ; 336 | } 337 | } 338 | 339 | 340 | //////////////////////////////////////////////////////////////////////////////// 341 | void 342 | Domain::SetupCommBuffers(Int_t edgeNodes) 343 | { 344 | // 
allocate a buffer large enough for nodal ghost data 345 | Index_t maxEdgeSize = MAX(this->sizeX(), MAX(this->sizeY(), this->sizeZ()))+1 ; 346 | m_maxPlaneSize = CACHE_ALIGN_REAL(maxEdgeSize*maxEdgeSize) ; 347 | m_maxEdgeSize = CACHE_ALIGN_REAL(maxEdgeSize) ; 348 | 349 | // assume communication to 6 neighbors by default 350 | m_rowMin = (m_rowLoc == 0) ? 0 : 1; 351 | m_rowMax = (m_rowLoc == m_tp-1) ? 0 : 1; 352 | m_colMin = (m_colLoc == 0) ? 0 : 1; 353 | m_colMax = (m_colLoc == m_tp-1) ? 0 : 1; 354 | m_planeMin = (m_planeLoc == 0) ? 0 : 1; 355 | m_planeMax = (m_planeLoc == m_tp-1) ? 0 : 1; 356 | 357 | #if USE_MPI 358 | // account for face communication 359 | Index_t comBufSize = 360 | (m_rowMin + m_rowMax + m_colMin + m_colMax + m_planeMin + m_planeMax) * 361 | m_maxPlaneSize * MAX_FIELDS_PER_MPI_COMM ; 362 | 363 | // account for edge communication 364 | comBufSize += 365 | ((m_rowMin & m_colMin) + (m_rowMin & m_planeMin) + (m_colMin & m_planeMin) + 366 | (m_rowMax & m_colMax) + (m_rowMax & m_planeMax) + (m_colMax & m_planeMax) + 367 | (m_rowMax & m_colMin) + (m_rowMin & m_planeMax) + (m_colMin & m_planeMax) + 368 | (m_rowMin & m_colMax) + (m_rowMax & m_planeMin) + (m_colMax & m_planeMin)) * 369 | m_maxEdgeSize * MAX_FIELDS_PER_MPI_COMM ; 370 | 371 | // account for corner communication 372 | // factor of 16 is so each buffer has its own cache line 373 | comBufSize += ((m_rowMin & m_colMin & m_planeMin) + 374 | (m_rowMin & m_colMin & m_planeMax) + 375 | (m_rowMin & m_colMax & m_planeMin) + 376 | (m_rowMin & m_colMax & m_planeMax) + 377 | (m_rowMax & m_colMin & m_planeMin) + 378 | (m_rowMax & m_colMin & m_planeMax) + 379 | (m_rowMax & m_colMax & m_planeMin) + 380 | (m_rowMax & m_colMax & m_planeMax)) * CACHE_COHERENCE_PAD_REAL ; 381 | 382 | this->commDataSend = new Real_t[comBufSize] ; 383 | this->commDataRecv = new Real_t[comBufSize] ; 384 | // prevent floating point exceptions 385 | memset(this->commDataSend, 0, comBufSize*sizeof(Real_t)) ; 386 | 
memset(this->commDataRecv, 0, comBufSize*sizeof(Real_t)) ; 387 | #endif 388 | 389 | // Boundary nodesets 390 | if (m_colLoc == 0) 391 | m_symmX.resize(edgeNodes*edgeNodes); 392 | if (m_rowLoc == 0) 393 | m_symmY.resize(edgeNodes*edgeNodes); 394 | if (m_planeLoc == 0) 395 | m_symmZ.resize(edgeNodes*edgeNodes); 396 | } 397 | 398 | 399 | //////////////////////////////////////////////////////////////////////////////// 400 | void 401 | Domain::CreateRegionIndexSets(Int_t nr, Int_t balance) 402 | { 403 | #if USE_MPI 404 | int myRank; 405 | MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ; 406 | srand(myRank); 407 | #else 408 | srand(0); 409 | Index_t myRank = 0; 410 | #endif 411 | this->numReg() = nr; 412 | m_regElemSize = new Index_t[numReg()]; 413 | m_regElemlist = new Index_t*[numReg()]; 414 | Index_t nextIndex = 0; 415 | //if we only have one region just fill it 416 | // Fill out the regNumList with material numbers, which are always 417 | // the region index plus one 418 | if(numReg() == 1) { 419 | while (nextIndex < numElem()) { 420 | this->regNumList(nextIndex) = 1; 421 | nextIndex++; 422 | } 423 | regElemSize(0) = 0; 424 | } 425 | //If we have more than one region distribute the elements. 426 | else { 427 | Int_t regionNum; 428 | Int_t regionVar; 429 | Int_t lastReg = -1; 430 | Int_t binSize; 431 | Index_t elements; 432 | Index_t runto = 0; 433 | Int_t costDenominator = 0; 434 | Int_t* regBinEnd = new Int_t[numReg()]; 435 | //Determine the relative weights of all the regions. This is based off the -b flag. Balance is the value passed into b. 436 | for (Index_t i=0 ; i= regBinEnd[i]) 447 | i++; 448 | //rotate the regions based on MPI rank. 
Rotation is Rank % NumRegions this makes each domain have a different region with 449 | //the highest representation 450 | regionNum = ((i + myRank) % numReg()) + 1; 451 | // make sure we don't pick the same region twice in a row 452 | while(regionNum == lastReg) { 453 | regionVar = rand() % costDenominator; 454 | i = 0; 455 | while(regionVar >= regBinEnd[i]) 456 | i++; 457 | regionNum = ((i + myRank) % numReg()) + 1; 458 | } 459 | //Pick the bin size of the region and determine the number of elements. 460 | binSize = rand() % 1000; 461 | if(binSize < 773) { 462 | elements = rand() % 15 + 1; 463 | } 464 | else if(binSize < 937) { 465 | elements = rand() % 16 + 16; 466 | } 467 | else if(binSize < 970) { 468 | elements = rand() % 32 + 32; 469 | } 470 | else if(binSize < 974) { 471 | elements = rand() % 64 + 64; 472 | } 473 | else if(binSize < 978) { 474 | elements = rand() % 128 + 128; 475 | } 476 | else if(binSize < 981) { 477 | elements = rand() % 256 + 256; 478 | } 479 | else 480 | elements = rand() % 1537 + 512; 481 | runto = elements + nextIndex; 482 | //Store the elements. If we hit the end before we run out of elements then just stop. 483 | while (nextIndex < runto && nextIndex < numElem()) { 484 | this->regNumList(nextIndex) = regionNum; 485 | nextIndex++; 486 | } 487 | lastReg = regionNum; 488 | } 489 | 490 | delete [] regBinEnd; 491 | } 492 | // Convert regNumList to region index sets 493 | // First, count size of each region 494 | for (Index_t i=0 ; iregNumList(i)-1; // region index == regnum-1 496 | regElemSize(r)++; 497 | } 498 | // Second, allocate each region index set 499 | for (Index_t i=0 ; i CACHE_COHERENCE_PAD_REAL) { 702 | printf("corner element comm buffers too small. 
Fix code.\n") ; 703 | #if USE_MPI 704 | MPI_Abort(MPI_COMM_WORLD, -1) ; 705 | #else 706 | exit(-1); 707 | #endif 708 | } 709 | 710 | dx = testProcs ; 711 | dy = testProcs ; 712 | dz = testProcs ; 713 | 714 | // temporary test 715 | if (dx*dy*dz != numRanks) { 716 | printf("error -- must have as many domains as procs\n") ; 717 | #if USE_MPI 718 | MPI_Abort(MPI_COMM_WORLD, -1) ; 719 | #else 720 | exit(-1); 721 | #endif 722 | } 723 | Int_t remainder = dx*dy*dz % numRanks ; 724 | if (myRank < remainder) { 725 | myDom = myRank*( 1+ (dx*dy*dz / numRanks)) ; 726 | } 727 | else { 728 | myDom = remainder*( 1+ (dx*dy*dz / numRanks)) + 729 | (myRank - remainder)*(dx*dy*dz/numRanks) ; 730 | } 731 | 732 | *col = myDom % dx ; 733 | *row = (myDom / dx) % dy ; 734 | *plane = myDom / (dx*dy) ; 735 | *side = testProcs; 736 | 737 | return; 738 | } 739 | 740 | -------------------------------------------------------------------------------- /lulesh-util.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #if USE_MPI 8 | #include 9 | #endif 10 | #include "lulesh.h" 11 | 12 | /* Helper function for converting strings to ints, with error checking */ 13 | template 14 | int StrToInt(const char *token, IntT *retVal) 15 | { 16 | const char *c ; 17 | char *endptr ; 18 | const int decimal_base = 10 ; 19 | 20 | if (token == NULL) 21 | return 0 ; 22 | 23 | c = token ; 24 | *retVal = strtol(c, &endptr, decimal_base) ; 25 | if((endptr != c) && ((*endptr == ' ') || (*endptr == '\0'))) 26 | return 1 ; 27 | else 28 | return 0 ; 29 | } 30 | 31 | static void PrintCommandLineOptions(char *execname, int myRank) 32 | { 33 | if (myRank == 0) { 34 | 35 | printf("Usage: %s [opts]\n", execname); 36 | printf(" where [opts] is one or more of:\n"); 37 | printf(" -q : quiet mode - suppress all stdout\n"); 38 | printf(" -i : number of cycles to run\n"); 39 | printf(" -s : length of cube mesh along 
side\n"); 40 | printf(" -r : Number of distinct regions (def: 11)\n"); 41 | printf(" -b : Load balance between regions of a domain (def: 1)\n"); 42 | printf(" -c : Extra cost of more expensive regions (def: 1)\n"); 43 | printf(" -f : Number of files to split viz dump into (def: (np+10)/9)\n"); 44 | printf(" -p : Print out progress\n"); 45 | printf(" -v : Output viz file (requires compiling with -DVIZ_MESH\n"); 46 | printf(" -h : This message\n"); 47 | printf("\n\n"); 48 | } 49 | } 50 | 51 | static void ParseError(const char *message, int myRank) 52 | { 53 | if (myRank == 0) { 54 | printf("%s\n", message); 55 | #if USE_MPI 56 | MPI_Abort(MPI_COMM_WORLD, -1); 57 | #else 58 | exit(-1); 59 | #endif 60 | } 61 | } 62 | 63 | void ParseCommandLineOptions(int argc, char *argv[], 64 | Int_t myRank, struct cmdLineOpts *opts) 65 | { 66 | if(argc > 1) { 67 | int i = 1; 68 | 69 | while(i < argc) { 70 | int ok; 71 | /* -i */ 72 | if(strcmp(argv[i], "-i") == 0) { 73 | if (i+1 >= argc) { 74 | ParseError("Missing integer argument to -i", myRank); 75 | } 76 | ok = StrToInt(argv[i+1], &(opts->its)); 77 | if(!ok) { 78 | ParseError("Parse Error on option -i integer value required after argument\n", myRank); 79 | } 80 | i+=2; 81 | } 82 | /* -s */ 83 | else if(strcmp(argv[i], "-s") == 0) { 84 | if (i+1 >= argc) { 85 | ParseError("Missing integer argument to -s\n", myRank); 86 | } 87 | ok = StrToInt(argv[i+1], &(opts->nx)); 88 | if(!ok) { 89 | ParseError("Parse Error on option -s integer value required after argument\n", myRank); 90 | } 91 | i+=2; 92 | } 93 | /* -r */ 94 | else if (strcmp(argv[i], "-r") == 0) { 95 | if (i+1 >= argc) { 96 | ParseError("Missing integer argument to -r\n", myRank); 97 | } 98 | ok = StrToInt(argv[i+1], &(opts->numReg)); 99 | if (!ok) { 100 | ParseError("Parse Error on option -r integer value required after argument\n", myRank); 101 | } 102 | i+=2; 103 | } 104 | /* -f */ 105 | else if (strcmp(argv[i], "-f") == 0) { 106 | if (i+1 >= argc) { 107 | 
ParseError("Missing integer argument to -f\n", myRank); 108 | } 109 | ok = StrToInt(argv[i+1], &(opts->numFiles)); 110 | if (!ok) { 111 | ParseError("Parse Error on option -f integer value required after argument\n", myRank); 112 | } 113 | i+=2; 114 | } 115 | /* -p */ 116 | else if (strcmp(argv[i], "-p") == 0) { 117 | opts->showProg = 1; 118 | i++; 119 | } 120 | /* -q */ 121 | else if (strcmp(argv[i], "-q") == 0) { 122 | opts->quiet = 1; 123 | i++; 124 | } 125 | else if (strcmp(argv[i], "-b") == 0) { 126 | if (i+1 >= argc) { 127 | ParseError("Missing integer argument to -b\n", myRank); 128 | } 129 | ok = StrToInt(argv[i+1], &(opts->balance)); 130 | if (!ok) { 131 | ParseError("Parse Error on option -b integer value required after argument\n", myRank); 132 | } 133 | i+=2; 134 | } 135 | else if (strcmp(argv[i], "-c") == 0) { 136 | if (i+1 >= argc) { 137 | ParseError("Missing integer argument to -c\n", myRank); 138 | } 139 | ok = StrToInt(argv[i+1], &(opts->cost)); 140 | if (!ok) { 141 | ParseError("Parse Error on option -c integer value required after argument\n", myRank); 142 | } 143 | i+=2; 144 | } 145 | /* -v */ 146 | else if (strcmp(argv[i], "-v") == 0) { 147 | #if VIZ_MESH 148 | opts->viz = 1; 149 | #else 150 | ParseError("Use of -v requires compiling with -DVIZ_MESH\n", myRank); 151 | #endif 152 | i++; 153 | } 154 | /* -h */ 155 | else if (strcmp(argv[i], "-h") == 0) { 156 | PrintCommandLineOptions(argv[0], myRank); 157 | #if USE_MPI 158 | MPI_Abort(MPI_COMM_WORLD, 0); 159 | #else 160 | exit(0); 161 | #endif 162 | } 163 | else { 164 | char msg[80]; 165 | PrintCommandLineOptions(argv[0], myRank); 166 | sprintf(msg, "ERROR: Unknown command line argument: %s\n", argv[i]); 167 | ParseError(msg, myRank); 168 | } 169 | } 170 | } 171 | } 172 | 173 | ///////////////////////////////////////////////////////////////////// 174 | 175 | void VerifyAndWriteFinalOutput(Real_t elapsed_time, 176 | Domain& locDom, 177 | Int_t nx, 178 | Int_t numRanks) 179 | { 180 | // GrindTime1 
only takes a single domain into account, and is thus a good way to measure 181 | // processor speed indepdendent of MPI parallelism. 182 | // GrindTime2 takes into account speedups from MPI parallelism. 183 | // Cast to 64-bit integer to avoid overflows. 184 | Int8_t nx8 = nx; 185 | Real_t grindTime1 = ((elapsed_time*1e6)/locDom.cycle())/(nx8*nx8*nx8); 186 | Real_t grindTime2 = ((elapsed_time*1e6)/locDom.cycle())/(nx8*nx8*nx8*numRanks); 187 | 188 | Index_t ElemId = 0; 189 | std::cout << "Run completed:\n"; 190 | std::cout << " Problem size = " << nx << "\n"; 191 | std::cout << " MPI tasks = " << numRanks << "\n"; 192 | std::cout << " Iteration count = " << locDom.cycle() << "\n"; 193 | std::cout << " Final Origin Energy = "; 194 | std::cout << std::scientific << std::setprecision(6); 195 | std::cout << std::setw(12) << locDom.e(ElemId) << "\n"; 196 | 197 | Real_t MaxAbsDiff = Real_t(0.0); 198 | Real_t TotalAbsDiff = Real_t(0.0); 199 | Real_t MaxRelDiff = Real_t(0.0); 200 | 201 | for (Index_t j=0; j 2 | #include 3 | #include 4 | #include 5 | #include "lulesh.h" 6 | 7 | #ifdef VIZ_MESH 8 | 9 | #ifdef __cplusplus 10 | extern "C" { 11 | #endif 12 | #include "silo.h" 13 | #if USE_MPI 14 | # include "pmpio.h" 15 | #endif 16 | #ifdef __cplusplus 17 | } 18 | #endif 19 | 20 | // Function prototypes 21 | static void 22 | DumpDomainToVisit(DBfile *db, Domain& domain, int myRank); 23 | static 24 | 25 | 26 | #if USE_MPI 27 | // For some reason, earlier versions of g++ (e.g. 
4.2) won't let me 28 | // put the 'static' qualifier on this prototype, even if it's done 29 | // consistently in the prototype and definition 30 | void 31 | DumpMultiblockObjects(DBfile *db, PMPIO_baton_t *bat, 32 | char basename[], int numRanks); 33 | 34 | // Callback prototypes for PMPIO interface (only useful if we're 35 | // running parallel) 36 | static void * 37 | LULESH_PMPIO_Create(const char *fname, 38 | const char *dname, 39 | void *udata); 40 | static void * 41 | LULESH_PMPIO_Open(const char *fname, 42 | const char *dname, 43 | PMPIO_iomode_t ioMode, 44 | void *udata); 45 | static void 46 | LULESH_PMPIO_Close(void *file, void *udata); 47 | 48 | #else 49 | void 50 | DumpMultiblockObjects(DBfile *db, char basename[], int numRanks); 51 | #endif 52 | 53 | 54 | /**********************************************************************/ 55 | void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks) 56 | { 57 | char subdirName[32]; 58 | char basename[32]; 59 | DBfile *db; 60 | 61 | 62 | sprintf(basename, "lulesh_plot_c%d", domain.cycle()); 63 | sprintf(subdirName, "data_%d", myRank); 64 | 65 | #if USE_MPI 66 | 67 | PMPIO_baton_t *bat = PMPIO_Init(numFiles, 68 | PMPIO_WRITE, 69 | MPI_COMM_WORLD, 70 | 10101, 71 | LULESH_PMPIO_Create, 72 | LULESH_PMPIO_Open, 73 | LULESH_PMPIO_Close, 74 | NULL); 75 | 76 | int myiorank = PMPIO_GroupRank(bat, myRank); 77 | 78 | char fileName[64]; 79 | 80 | if (myiorank == 0) 81 | strcpy(fileName, basename); 82 | else 83 | sprintf(fileName, "%s.%03d", basename, myiorank); 84 | 85 | db = (DBfile*)PMPIO_WaitForBaton(bat, fileName, subdirName); 86 | 87 | DumpDomainToVisit(db, domain, myRank); 88 | 89 | // Processor 0 writes out bit of extra data to its file that 90 | // describes how to stitch all the pieces together 91 | if (myRank == 0) { 92 | DumpMultiblockObjects(db, bat, basename, numRanks); 93 | } 94 | 95 | PMPIO_HandOffBaton(bat, db); 96 | 97 | PMPIO_Finish(bat); 98 | #else 99 | 100 | db = (DBfile*)DBCreate(basename, 
DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5X); 101 | 102 | if (db) { 103 | DBMkDir(db, subdirName); 104 | DBSetDir(db, subdirName); 105 | DumpDomainToVisit(db, domain, myRank); 106 | DumpMultiblockObjects(db, basename, numRanks); 107 | DBClose(db); 108 | } 109 | else { 110 | printf("Error writing out viz file - rank %d\n", myRank); 111 | } 112 | 113 | #endif 114 | } 115 | 116 | 117 | 118 | /**********************************************************************/ 119 | 120 | static void 121 | DumpDomainToVisit(DBfile *db, Domain& domain, int myRank) 122 | { 123 | int ok = 0; 124 | 125 | /* Create an option list that will give some hints to VisIt for 126 | * printing out the cycle and time in the annotations */ 127 | DBoptlist *optlist; 128 | 129 | 130 | /* Write out the mesh connectivity in fully unstructured format */ 131 | int shapetype[1] = {DB_ZONETYPE_HEX}; 132 | int shapesize[1] = {8}; 133 | int shapecnt[1] = {domain.numElem()}; 134 | int *conn = new int[domain.numElem()*8] ; 135 | int ci = 0 ; 136 | for (int ei=0; ei < domain.numElem(); ++ei) { 137 | Index_t *elemToNode = domain.nodelist(ei) ; 138 | for (int ni=0; ni < 8; ++ni) { 139 | conn[ci++] = elemToNode[ni] ; 140 | } 141 | } 142 | ok += DBPutZonelist2(db, "connectivity", domain.numElem(), 3, 143 | conn, domain.numElem()*8, 144 | 0,0,0, /* Not carrying ghost zones */ 145 | shapetype, shapesize, shapecnt, 146 | 1, NULL); 147 | delete [] conn ; 148 | 149 | /* Write out the mesh coordinates associated with the mesh */ 150 | const char* coordnames[3] = {"X", "Y", "Z"}; 151 | float *coords[3] ; 152 | coords[0] = new float[domain.numNode()] ; 153 | coords[1] = new float[domain.numNode()] ; 154 | coords[2] = new float[domain.numNode()] ; 155 | for (int ni=0; ni < domain.numNode() ; ++ni) { 156 | coords[0][ni] = float(domain.x(ni)) ; 157 | coords[1][ni] = float(domain.y(ni)) ; 158 | coords[2][ni] = float(domain.z(ni)) ; 159 | } 160 | optlist = DBMakeOptlist(2); 161 | ok += DBAddOption(optlist, DBOPT_DTIME, 
&domain.time()); 162 | ok += DBAddOption(optlist, DBOPT_CYCLE, &domain.cycle()); 163 | ok += DBPutUcdmesh(db, "mesh", 3, (char**)&coordnames[0], (float**)coords, 164 | domain.numNode(), domain.numElem(), "connectivity", 165 | 0, DB_FLOAT, optlist); 166 | ok += DBFreeOptlist(optlist); 167 | delete [] coords[2] ; 168 | delete [] coords[1] ; 169 | delete [] coords[0] ; 170 | 171 | /* Write out the materials */ 172 | int *matnums = new int[domain.numReg()]; 173 | int dims[1] = {domain.numElem()}; // No mixed elements 174 | for(int i=0 ; i 7 | 8 | /* 9 | define one of these three symbols: 10 | 11 | SEDOV_SYNC_POS_VEL_NONE 12 | SEDOV_SYNC_POS_VEL_EARLY 13 | SEDOV_SYNC_POS_VEL_LATE 14 | */ 15 | 16 | #define SEDOV_SYNC_POS_VEL_EARLY 1 17 | #endif 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | //************************************************** 25 | // Allow flexibility for arithmetic representations 26 | //************************************************** 27 | 28 | #define MAX(a, b) ( ((a) > (b)) ? 
(a) : (b)) 29 | 30 | 31 | // Precision specification 32 | typedef float real4 ; 33 | typedef double real8 ; 34 | typedef long double real10 ; // 10 bytes on x86 35 | 36 | typedef int32_t Int4_t ; 37 | typedef int64_t Int8_t ; 38 | typedef Int4_t Index_t ; // array subscript and loop index 39 | typedef real8 Real_t ; // floating point representation 40 | typedef Int4_t Int_t ; // integer representation 41 | 42 | enum { VolumeError = -1, QStopError = -2 } ; 43 | 44 | inline real4 SQRT(real4 arg) { return sqrtf(arg) ; } 45 | inline real8 SQRT(real8 arg) { return sqrt(arg) ; } 46 | inline real10 SQRT(real10 arg) { return sqrtl(arg) ; } 47 | 48 | inline real4 CBRT(real4 arg) { return cbrtf(arg) ; } 49 | inline real8 CBRT(real8 arg) { return cbrt(arg) ; } 50 | inline real10 CBRT(real10 arg) { return cbrtl(arg) ; } 51 | 52 | inline real4 FABS(real4 arg) { return fabsf(arg) ; } 53 | inline real8 FABS(real8 arg) { return fabs(arg) ; } 54 | inline real10 FABS(real10 arg) { return fabsl(arg) ; } 55 | 56 | 57 | // Stuff needed for boundary conditions 58 | // 3 flags (SYMM, FREE, COMM) on each of 6 hexahedral faces (18 bits); each face mask is the OR of its 3 flags 59 | #define XI_M 0x00007 60 | #define XI_M_SYMM 0x00001 61 | #define XI_M_FREE 0x00002 62 | #define XI_M_COMM 0x00004 63 | 64 | #define XI_P 0x00038 65 | #define XI_P_SYMM 0x00008 66 | #define XI_P_FREE 0x00010 67 | #define XI_P_COMM 0x00020 68 | 69 | #define ETA_M 0x001c0 70 | #define ETA_M_SYMM 0x00040 71 | #define ETA_M_FREE 0x00080 72 | #define ETA_M_COMM 0x00100 73 | 74 | #define ETA_P 0x00e00 75 | #define ETA_P_SYMM 0x00200 76 | #define ETA_P_FREE 0x00400 77 | #define ETA_P_COMM 0x00800 78 | 79 | #define ZETA_M 0x07000 80 | #define ZETA_M_SYMM 0x01000 81 | #define ZETA_M_FREE 0x02000 82 | #define ZETA_M_COMM 0x04000 83 | 84 | #define ZETA_P 0x38000 85 | #define ZETA_P_SYMM 0x08000 86 | #define ZETA_P_FREE 0x10000 87 | #define ZETA_P_COMM 0x20000 88 | 89 | // MPI Message Tags 90 | #define MSG_COMM_SBN 1024 91 | #define MSG_SYNC_POS_VEL 2048 92 | #define MSG_MONOQ 3072 93
| 94 | #define MAX_FIELDS_PER_MPI_COMM 6 95 | 96 | // Assume 128 byte coherence 97 | // Assume Real_t is an "integral power of 2" bytes wide 98 | #define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t)) 99 | 100 | #define CACHE_ALIGN_REAL(n) \ 101 | (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1)) 102 | 103 | /*********************************/ 104 | /* Data structure implementation */ 105 | /*********************************/ 106 | 107 | /* might want to add access methods so that memory can be */ 108 | /* better managed, as in luleshFT */ 109 | 110 | template 111 | T *Allocate(size_t size) 112 | { 113 | return static_cast(malloc(sizeof(T)*size)) ; 114 | } 115 | 116 | template 117 | void Release(T **ptr) 118 | { 119 | if (*ptr != NULL) { 120 | free(*ptr) ; 121 | *ptr = NULL ; 122 | } 123 | } 124 | 125 | ////////////////////////////////////////////////////// 126 | // Primary data structure 127 | ////////////////////////////////////////////////////// 128 | 129 | /* 130 | * The implementation of the data abstraction used for lulesh 131 | * resides entirely in the Domain class below. You can change 132 | * grouping and interleaving of fields here to maximize data layout 133 | * efficiency for your underlying architecture or compiler. 134 | * 135 | * For example, fields can be implemented as STL objects or 136 | * raw array pointers. 
As another example, individual fields 137 | * m_x, m_y, m_z could be budled into 138 | * 139 | * struct { Real_t x, y, z ; } *m_coord ; 140 | * 141 | * allowing accessor functions such as 142 | * 143 | * "Real_t &x(Index_t idx) { return m_coord[idx].x ; }" 144 | * "Real_t &y(Index_t idx) { return m_coord[idx].y ; }" 145 | * "Real_t &z(Index_t idx) { return m_coord[idx].z ; }" 146 | */ 147 | 148 | class Domain { 149 | 150 | public: 151 | 152 | // Constructor 153 | Domain(Int_t numRanks, Index_t colLoc, 154 | Index_t rowLoc, Index_t planeLoc, 155 | Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost); 156 | 157 | // Destructor 158 | ~Domain(); 159 | 160 | // 161 | // ALLOCATION 162 | // 163 | 164 | void AllocateNodePersistent(Int_t numNode) // Node-centered 165 | { 166 | m_x.resize(numNode); // coordinates 167 | m_y.resize(numNode); 168 | m_z.resize(numNode); 169 | 170 | m_xd.resize(numNode); // velocities 171 | m_yd.resize(numNode); 172 | m_zd.resize(numNode); 173 | 174 | m_xdd.resize(numNode); // accelerations 175 | m_ydd.resize(numNode); 176 | m_zdd.resize(numNode); 177 | 178 | m_fx.resize(numNode); // forces 179 | m_fy.resize(numNode); 180 | m_fz.resize(numNode); 181 | 182 | m_nodalMass.resize(numNode); // mass 183 | } 184 | 185 | void AllocateElemPersistent(Int_t numElem) // Elem-centered 186 | { 187 | m_nodelist.resize(8*numElem); 188 | 189 | // elem connectivities through face 190 | m_lxim.resize(numElem); 191 | m_lxip.resize(numElem); 192 | m_letam.resize(numElem); 193 | m_letap.resize(numElem); 194 | m_lzetam.resize(numElem); 195 | m_lzetap.resize(numElem); 196 | 197 | m_elemBC.resize(numElem); 198 | 199 | m_e.resize(numElem); 200 | m_p.resize(numElem); 201 | 202 | m_q.resize(numElem); 203 | m_ql.resize(numElem); 204 | m_qq.resize(numElem); 205 | 206 | m_v.resize(numElem); 207 | 208 | m_volo.resize(numElem); 209 | m_delv.resize(numElem); 210 | m_vdov.resize(numElem); 211 | 212 | m_arealg.resize(numElem); 213 | 214 | m_ss.resize(numElem); 215 | 216 | 
m_elemMass.resize(numElem); 217 | 218 | m_vnew.resize(numElem) ; 219 | } 220 | 221 | void AllocateGradients(Int_t numElem, Int_t allElem) 222 | { 223 | // Position gradients 224 | m_delx_xi = Allocate(numElem) ; 225 | m_delx_eta = Allocate(numElem) ; 226 | m_delx_zeta = Allocate(numElem) ; 227 | 228 | // Velocity gradients 229 | m_delv_xi = Allocate(allElem) ; 230 | m_delv_eta = Allocate(allElem); 231 | m_delv_zeta = Allocate(allElem) ; 232 | } 233 | 234 | void DeallocateGradients() 235 | { 236 | Release(&m_delx_zeta); 237 | Release(&m_delx_eta) ; 238 | Release(&m_delx_xi) ; 239 | 240 | Release(&m_delv_zeta); 241 | Release(&m_delv_eta) ; 242 | Release(&m_delv_xi) ; 243 | } 244 | 245 | void AllocateStrains(Int_t numElem) 246 | { 247 | m_dxx = Allocate(numElem) ; 248 | m_dyy = Allocate(numElem) ; 249 | m_dzz = Allocate(numElem) ; 250 | } 251 | 252 | void DeallocateStrains() 253 | { 254 | Release(&m_dzz) ; 255 | Release(&m_dyy) ; 256 | Release(&m_dxx) ; 257 | } 258 | 259 | // 260 | // ACCESSORS 261 | // 262 | 263 | // Node-centered 264 | 265 | // Nodal coordinates 266 | Real_t& x(Index_t idx) { return m_x[idx] ; } 267 | Real_t& y(Index_t idx) { return m_y[idx] ; } 268 | Real_t& z(Index_t idx) { return m_z[idx] ; } 269 | 270 | // Nodal velocities 271 | Real_t& xd(Index_t idx) { return m_xd[idx] ; } 272 | Real_t& yd(Index_t idx) { return m_yd[idx] ; } 273 | Real_t& zd(Index_t idx) { return m_zd[idx] ; } 274 | 275 | // Nodal accelerations 276 | Real_t& xdd(Index_t idx) { return m_xdd[idx] ; } 277 | Real_t& ydd(Index_t idx) { return m_ydd[idx] ; } 278 | Real_t& zdd(Index_t idx) { return m_zdd[idx] ; } 279 | 280 | // Nodal forces 281 | Real_t& fx(Index_t idx) { return m_fx[idx] ; } 282 | Real_t& fy(Index_t idx) { return m_fy[idx] ; } 283 | Real_t& fz(Index_t idx) { return m_fz[idx] ; } 284 | 285 | // Nodal mass 286 | Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; } 287 | 288 | // Nodes on symmertry planes 289 | Index_t symmX(Index_t idx) { return m_symmX[idx] ; 
} 290 | Index_t symmY(Index_t idx) { return m_symmY[idx] ; } 291 | Index_t symmZ(Index_t idx) { return m_symmZ[idx] ; } 292 | bool symmXempty() { return m_symmX.empty(); } 293 | bool symmYempty() { return m_symmY.empty(); } 294 | bool symmZempty() { return m_symmZ.empty(); } 295 | 296 | // 297 | // Element-centered 298 | // 299 | Index_t& regElemSize(Index_t idx) { return m_regElemSize[idx] ; } 300 | Index_t& regNumList(Index_t idx) { return m_regNumList[idx] ; } 301 | Index_t* regNumList() { return &m_regNumList[0] ; } 302 | Index_t* regElemlist(Int_t r) { return m_regElemlist[r] ; } 303 | Index_t& regElemlist(Int_t r, Index_t idx) { return m_regElemlist[r][idx] ; } 304 | 305 | Index_t* nodelist(Index_t idx) { return &m_nodelist[Index_t(8)*idx] ; } 306 | 307 | // elem connectivities through face 308 | Index_t& lxim(Index_t idx) { return m_lxim[idx] ; } 309 | Index_t& lxip(Index_t idx) { return m_lxip[idx] ; } 310 | Index_t& letam(Index_t idx) { return m_letam[idx] ; } 311 | Index_t& letap(Index_t idx) { return m_letap[idx] ; } 312 | Index_t& lzetam(Index_t idx) { return m_lzetam[idx] ; } 313 | Index_t& lzetap(Index_t idx) { return m_lzetap[idx] ; } 314 | 315 | // elem face symm/free-surface flag 316 | Int_t& elemBC(Index_t idx) { return m_elemBC[idx] ; } 317 | 318 | // Principal strains - temporary 319 | Real_t& dxx(Index_t idx) { return m_dxx[idx] ; } 320 | Real_t& dyy(Index_t idx) { return m_dyy[idx] ; } 321 | Real_t& dzz(Index_t idx) { return m_dzz[idx] ; } 322 | 323 | // New relative volume - temporary 324 | Real_t& vnew(Index_t idx) { return m_vnew[idx] ; } 325 | 326 | // Velocity gradient - temporary 327 | Real_t& delv_xi(Index_t idx) { return m_delv_xi[idx] ; } 328 | Real_t& delv_eta(Index_t idx) { return m_delv_eta[idx] ; } 329 | Real_t& delv_zeta(Index_t idx) { return m_delv_zeta[idx] ; } 330 | 331 | // Position gradient - temporary 332 | Real_t& delx_xi(Index_t idx) { return m_delx_xi[idx] ; } 333 | Real_t& delx_eta(Index_t idx) { return m_delx_eta[idx] 
; } 334 | Real_t& delx_zeta(Index_t idx) { return m_delx_zeta[idx] ; } 335 | 336 | // Energy 337 | Real_t& e(Index_t idx) { return m_e[idx] ; } 338 | 339 | // Pressure 340 | Real_t& p(Index_t idx) { return m_p[idx] ; } 341 | 342 | // Artificial viscosity 343 | Real_t& q(Index_t idx) { return m_q[idx] ; } 344 | 345 | // Linear term for q 346 | Real_t& ql(Index_t idx) { return m_ql[idx] ; } 347 | // Quadratic term for q 348 | Real_t& qq(Index_t idx) { return m_qq[idx] ; } 349 | 350 | // Relative volume 351 | Real_t& v(Index_t idx) { return m_v[idx] ; } 352 | Real_t& delv(Index_t idx) { return m_delv[idx] ; } 353 | 354 | // Reference volume 355 | Real_t& volo(Index_t idx) { return m_volo[idx] ; } 356 | 357 | // volume derivative over volume 358 | Real_t& vdov(Index_t idx) { return m_vdov[idx] ; } 359 | 360 | // Element characteristic length 361 | Real_t& arealg(Index_t idx) { return m_arealg[idx] ; } 362 | 363 | // Sound speed 364 | Real_t& ss(Index_t idx) { return m_ss[idx] ; } 365 | 366 | // Element mass 367 | Real_t& elemMass(Index_t idx) { return m_elemMass[idx] ; } 368 | 369 | Index_t nodeElemCount(Index_t idx) 370 | { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; } 371 | 372 | Index_t *nodeElemCornerList(Index_t idx) 373 | { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; } 374 | 375 | // Parameters 376 | 377 | // Cutoffs 378 | Real_t u_cut() const { return m_u_cut ; } 379 | Real_t e_cut() const { return m_e_cut ; } 380 | Real_t p_cut() const { return m_p_cut ; } 381 | Real_t q_cut() const { return m_q_cut ; } 382 | Real_t v_cut() const { return m_v_cut ; } 383 | 384 | // Other constants (usually are settable via input file in real codes) 385 | Real_t hgcoef() const { return m_hgcoef ; } 386 | Real_t qstop() const { return m_qstop ; } 387 | Real_t monoq_max_slope() const { return m_monoq_max_slope ; } 388 | Real_t monoq_limiter_mult() const { return m_monoq_limiter_mult ; } 389 | Real_t ss4o3() const { return m_ss4o3 ; } 390 | Real_t qlc_monoq() 
const { return m_qlc_monoq ; } 391 | Real_t qqc_monoq() const { return m_qqc_monoq ; } 392 | Real_t qqc() const { return m_qqc ; } 393 | 394 | Real_t eosvmax() const { return m_eosvmax ; } 395 | Real_t eosvmin() const { return m_eosvmin ; } 396 | Real_t pmin() const { return m_pmin ; } 397 | Real_t emin() const { return m_emin ; } 398 | Real_t dvovmax() const { return m_dvovmax ; } 399 | Real_t refdens() const { return m_refdens ; } 400 | 401 | // Timestep controls, etc... 402 | Real_t& time() { return m_time ; } 403 | Real_t& deltatime() { return m_deltatime ; } 404 | Real_t& deltatimemultlb() { return m_deltatimemultlb ; } 405 | Real_t& deltatimemultub() { return m_deltatimemultub ; } 406 | Real_t& stoptime() { return m_stoptime ; } 407 | Real_t& dtcourant() { return m_dtcourant ; } 408 | Real_t& dthydro() { return m_dthydro ; } 409 | Real_t& dtmax() { return m_dtmax ; } 410 | Real_t& dtfixed() { return m_dtfixed ; } 411 | 412 | Int_t& cycle() { return m_cycle ; } 413 | Index_t& numRanks() { return m_numRanks ; } 414 | 415 | Index_t& colLoc() { return m_colLoc ; } 416 | Index_t& rowLoc() { return m_rowLoc ; } 417 | Index_t& planeLoc() { return m_planeLoc ; } 418 | Index_t& tp() { return m_tp ; } 419 | 420 | Index_t& sizeX() { return m_sizeX ; } 421 | Index_t& sizeY() { return m_sizeY ; } 422 | Index_t& sizeZ() { return m_sizeZ ; } 423 | Index_t& numReg() { return m_numReg ; } 424 | Int_t& cost() { return m_cost ; } 425 | Index_t& numElem() { return m_numElem ; } 426 | Index_t& numNode() { return m_numNode ; } 427 | 428 | Index_t& maxPlaneSize() { return m_maxPlaneSize ; } 429 | Index_t& maxEdgeSize() { return m_maxEdgeSize ; } 430 | 431 | // 432 | // MPI-Related additional data 433 | // 434 | 435 | #if USE_MPI 436 | // Communication Work space 437 | Real_t *commDataSend ; 438 | Real_t *commDataRecv ; 439 | 440 | // Maximum number of block neighbors 441 | MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 442 | MPI_Request sendRequest[26] ; // 6 faces 
+ 12 edges + 8 corners 443 | #endif 444 | 445 | private: 446 | 447 | void BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems); 448 | void SetupThreadSupportStructures(); 449 | void CreateRegionIndexSets(Int_t nreg, Int_t balance); 450 | void SetupCommBuffers(Int_t edgeNodes); 451 | void SetupSymmetryPlanes(Int_t edgeNodes); 452 | void SetupElementConnectivities(Int_t edgeElems); 453 | void SetupBoundaryConditions(Int_t edgeElems); 454 | 455 | // 456 | // IMPLEMENTATION 457 | // 458 | 459 | /* Node-centered */ 460 | std::vector m_x ; /* coordinates */ 461 | std::vector m_y ; 462 | std::vector m_z ; 463 | 464 | std::vector m_xd ; /* velocities */ 465 | std::vector m_yd ; 466 | std::vector m_zd ; 467 | 468 | std::vector m_xdd ; /* accelerations */ 469 | std::vector m_ydd ; 470 | std::vector m_zdd ; 471 | 472 | std::vector m_fx ; /* forces */ 473 | std::vector m_fy ; 474 | std::vector m_fz ; 475 | 476 | std::vector m_nodalMass ; /* mass */ 477 | 478 | std::vector m_symmX ; /* symmetry plane nodesets */ 479 | std::vector m_symmY ; 480 | std::vector m_symmZ ; 481 | 482 | // Element-centered 483 | 484 | // Region information 485 | Int_t m_numReg ; 486 | Int_t m_cost; //imbalance cost 487 | Index_t *m_regElemSize ; // Size of region sets 488 | Index_t *m_regNumList ; // Region number per domain element 489 | Index_t **m_regElemlist ; // region indexset 490 | 491 | std::vector m_nodelist ; /* elemToNode connectivity */ 492 | 493 | std::vector m_lxim ; /* element connectivity across each face */ 494 | std::vector m_lxip ; 495 | std::vector m_letam ; 496 | std::vector m_letap ; 497 | std::vector m_lzetam ; 498 | std::vector m_lzetap ; 499 | 500 | std::vector m_elemBC ; /* symmetry/free-surface flags for each elem face */ 501 | 502 | Real_t *m_dxx ; /* principal strains -- temporary */ 503 | Real_t *m_dyy ; 504 | Real_t *m_dzz ; 505 | 506 | Real_t *m_delv_xi ; /* velocity gradient -- temporary */ 507 | Real_t *m_delv_eta ; 508 | Real_t *m_delv_zeta ; 509 | 510 | Real_t 
*m_delx_xi ; /* coordinate gradient -- temporary */ 511 | Real_t *m_delx_eta ; 512 | Real_t *m_delx_zeta ; 513 | 514 | std::vector m_e ; /* energy */ 515 | 516 | std::vector m_p ; /* pressure */ 517 | std::vector m_q ; /* q */ 518 | std::vector m_ql ; /* linear term for q */ 519 | std::vector m_qq ; /* quadratic term for q */ 520 | 521 | std::vector m_v ; /* relative volume */ 522 | std::vector m_volo ; /* reference volume */ 523 | std::vector m_vnew ; /* new relative volume -- temporary */ 524 | std::vector m_delv ; /* m_vnew - m_v */ 525 | std::vector m_vdov ; /* volume derivative over volume */ 526 | 527 | std::vector m_arealg ; /* characteristic length of an element */ 528 | 529 | std::vector m_ss ; /* "sound speed" */ 530 | 531 | std::vector m_elemMass ; /* mass */ 532 | 533 | // Cutoffs (treat as constants) 534 | const Real_t m_e_cut ; // energy tolerance 535 | const Real_t m_p_cut ; // pressure tolerance 536 | const Real_t m_q_cut ; // q tolerance 537 | const Real_t m_v_cut ; // relative volume tolerance 538 | const Real_t m_u_cut ; // velocity tolerance 539 | 540 | // Other constants (usually setable, but hardcoded in this proxy app) 541 | 542 | const Real_t m_hgcoef ; // hourglass control 543 | const Real_t m_ss4o3 ; 544 | const Real_t m_qstop ; // excessive q indicator 545 | const Real_t m_monoq_max_slope ; 546 | const Real_t m_monoq_limiter_mult ; 547 | const Real_t m_qlc_monoq ; // linear term coef for q 548 | const Real_t m_qqc_monoq ; // quadratic term coef for q 549 | const Real_t m_qqc ; 550 | const Real_t m_eosvmax ; 551 | const Real_t m_eosvmin ; 552 | const Real_t m_pmin ; // pressure floor 553 | const Real_t m_emin ; // energy floor 554 | const Real_t m_dvovmax ; // maximum allowable volume change 555 | const Real_t m_refdens ; // reference density 556 | 557 | // Variables to keep track of timestep, simulation time, and cycle 558 | Real_t m_dtcourant ; // courant constraint 559 | Real_t m_dthydro ; // volume change constraint 560 | Int_t m_cycle 
; // iteration count for simulation 561 | Real_t m_dtfixed ; // fixed time increment 562 | Real_t m_time ; // current time 563 | Real_t m_deltatime ; // variable time increment 564 | Real_t m_deltatimemultlb ; 565 | Real_t m_deltatimemultub ; 566 | Real_t m_dtmax ; // maximum allowable time increment 567 | Real_t m_stoptime ; // end time for simulation 568 | 569 | 570 | Int_t m_numRanks ; 571 | 572 | Index_t m_colLoc ; 573 | Index_t m_rowLoc ; 574 | Index_t m_planeLoc ; 575 | Index_t m_tp ; 576 | 577 | Index_t m_sizeX ; 578 | Index_t m_sizeY ; 579 | Index_t m_sizeZ ; 580 | Index_t m_numElem ; 581 | Index_t m_numNode ; 582 | 583 | Index_t m_maxPlaneSize ; 584 | Index_t m_maxEdgeSize ; 585 | 586 | // OMP hack 587 | Index_t *m_nodeElemStart ; 588 | Index_t *m_nodeElemCornerList ; 589 | 590 | // Used in setup 591 | Index_t m_rowMin, m_rowMax; 592 | Index_t m_colMin, m_colMax; 593 | Index_t m_planeMin, m_planeMax ; 594 | 595 | } ; 596 | 597 | typedef Real_t &(Domain::* Domain_member )(Index_t) ; 598 | 599 | struct cmdLineOpts { 600 | Int_t its; // -i 601 | Int_t nx; // -s 602 | Int_t numReg; // -r 603 | Int_t numFiles; // -f 604 | Int_t showProg; // -p 605 | Int_t quiet; // -q 606 | Int_t viz; // -v 607 | Int_t cost; // -c 608 | Int_t balance; // -b 609 | }; 610 | 611 | 612 | 613 | // Function Prototypes 614 | 615 | // lulesh-par 616 | Real_t CalcElemVolume( const Real_t x[8], 617 | const Real_t y[8], 618 | const Real_t z[8]); 619 | 620 | // lulesh-util 621 | void ParseCommandLineOptions(int argc, char *argv[], 622 | Int_t myRank, struct cmdLineOpts *opts); 623 | void VerifyAndWriteFinalOutput(Real_t elapsed_time, 624 | Domain& locDom, 625 | Int_t nx, 626 | Int_t numRanks); 627 | 628 | // lulesh-viz 629 | void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks); 630 | 631 | // lulesh-comm 632 | void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields, 633 | Index_t dx, Index_t dy, Index_t dz, 634 | bool doRecv, bool planeOnly); 635 | void 
CommSend(Domain& domain, Int_t msgType, 636 | Index_t xferFields, Domain_member *fieldData, 637 | Index_t dx, Index_t dy, Index_t dz, 638 | bool doSend, bool planeOnly); 639 | void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData); 640 | void CommSyncPosVel(Domain& domain); 641 | void CommMonoQ(Domain& domain); 642 | 643 | // lulesh-init 644 | void InitMeshDecomp(Int_t numRanks, Int_t myRank, 645 | Int_t *col, Int_t *row, Int_t *plane, Int_t *side); 646 | -------------------------------------------------------------------------------- /lulesh_tuple.h: -------------------------------------------------------------------------------- 1 | #if !defined(USE_MPI) 2 | # error "You should specify USE_MPI=0 or USE_MPI=1 on the compile line" 3 | #endif 4 | 5 | #if USE_MPI 6 | #include 7 | 8 | /* 9 | define one of these three symbols: 10 | 11 | SEDOV_SYNC_POS_VEL_NONE 12 | SEDOV_SYNC_POS_VEL_EARLY 13 | SEDOV_SYNC_POS_VEL_LATE 14 | */ 15 | 16 | #define SEDOV_SYNC_POS_VEL_EARLY 1 17 | #endif 18 | 19 | #include 20 | #include 21 | 22 | //************************************************** 23 | // Allow flexibility for arithmetic representations 24 | //************************************************** 25 | 26 | #define MAX(a, b) ( ((a) > (b)) ? 
(a) : (b)) 27 | 28 | 29 | // Precision specification 30 | typedef float real4 ; 31 | typedef double real8 ; 32 | typedef long double real10 ; // 10 bytes on x86 33 | 34 | typedef int Index_t ; // array subscript and loop index 35 | typedef real8 Real_t ; // floating point representation 36 | typedef int Int_t ; // integer representation 37 | 38 | enum { VolumeError = -1, QStopError = -2 } ; 39 | 40 | inline real4 SQRT(real4 arg) { return sqrtf(arg) ; } 41 | inline real8 SQRT(real8 arg) { return sqrt(arg) ; } 42 | inline real10 SQRT(real10 arg) { return sqrtl(arg) ; } 43 | 44 | inline real4 CBRT(real4 arg) { return cbrtf(arg) ; } 45 | inline real8 CBRT(real8 arg) { return cbrt(arg) ; } 46 | inline real10 CBRT(real10 arg) { return cbrtl(arg) ; } 47 | 48 | inline real4 FABS(real4 arg) { return fabsf(arg) ; } 49 | inline real8 FABS(real8 arg) { return fabs(arg) ; } 50 | inline real10 FABS(real10 arg) { return fabsl(arg) ; } 51 | 52 | 53 | // Stuff needed for boundary conditions 54 | // 3 flags (SYMM, FREE, COMM) on each of 6 hexahedral faces (18 bits); each face mask is the OR of its 3 flags 55 | #define XI_M 0x00007 56 | #define XI_M_SYMM 0x00001 57 | #define XI_M_FREE 0x00002 58 | #define XI_M_COMM 0x00004 59 | 60 | #define XI_P 0x00038 61 | #define XI_P_SYMM 0x00008 62 | #define XI_P_FREE 0x00010 63 | #define XI_P_COMM 0x00020 64 | 65 | #define ETA_M 0x001c0 66 | #define ETA_M_SYMM 0x00040 67 | #define ETA_M_FREE 0x00080 68 | #define ETA_M_COMM 0x00100 69 | 70 | #define ETA_P 0x00e00 71 | #define ETA_P_SYMM 0x00200 72 | #define ETA_P_FREE 0x00400 73 | #define ETA_P_COMM 0x00800 74 | 75 | #define ZETA_M 0x07000 76 | #define ZETA_M_SYMM 0x01000 77 | #define ZETA_M_FREE 0x02000 78 | #define ZETA_M_COMM 0x04000 79 | 80 | #define ZETA_P 0x38000 81 | #define ZETA_P_SYMM 0x08000 82 | #define ZETA_P_FREE 0x10000 83 | #define ZETA_P_COMM 0x20000 84 | 85 | // MPI Message Tags 86 | #define MSG_COMM_SBN 1024 87 | #define MSG_SYNC_POS_VEL 2048 88 | #define MSG_MONOQ 3072 89 | 90 | #define MAX_FIELDS_PER_MPI_COMM 6 91 | 92 | // Assume 128
byte coherence 93 | // Assume Real_t is an "integral power of 2" bytes wide 94 | #define CACHE_COHERENCE_PAD_REAL (128 / sizeof(Real_t)) 95 | 96 | #define CACHE_ALIGN_REAL(n) \ 97 | (((n) + (CACHE_COHERENCE_PAD_REAL - 1)) & ~(CACHE_COHERENCE_PAD_REAL-1)) 98 | 99 | ////////////////////////////////////////////////////// 100 | // Primary data structure 101 | ////////////////////////////////////////////////////// 102 | 103 | /* 104 | * The implementation of the data abstraction used for lulesh 105 | * resides entirely in the Domain class below. You can change 106 | * grouping and interleaving of fields here to maximize data layout 107 | * efficiency for your underlying architecture or compiler. 108 | * 109 | * For example, fields can be implemented as STL objects or 110 | * raw array pointers. As another example, individual fields 111 | * m_x, m_y, m_z could be budled into 112 | * 113 | * struct { Real_t x, y, z ; } *m_coord ; 114 | * 115 | * allowing accessor functions such as 116 | * 117 | * "Real_t &x(Index_t idx) { return m_coord[idx].x ; }" 118 | * "Real_t &y(Index_t idx) { return m_coord[idx].y ; }" 119 | * "Real_t &z(Index_t idx) { return m_coord[idx].z ; }" 120 | */ 121 | 122 | class Domain { 123 | 124 | public: 125 | 126 | // Constructor 127 | Domain(Int_t numRanks, Index_t colLoc, 128 | Index_t rowLoc, Index_t planeLoc, 129 | Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost); 130 | 131 | // Destructor 132 | ~Domain(); 133 | 134 | // 135 | // ALLOCATION 136 | // 137 | 138 | void AllocateNodePersistent(Int_t numNode) // Node-centered 139 | { 140 | m_coord.resize(numNode); // coordinates 141 | 142 | m_vel.resize(numNode); // velocities 143 | 144 | m_acc.resize(numNode); // accelerations 145 | 146 | m_force.resize(numNode); // forces 147 | 148 | m_nodalMass.resize(numNode); // mass 149 | } 150 | 151 | void AllocateElemPersistent(Int_t numElem) // Elem-centered 152 | { 153 | m_nodelist.resize(8*numElem); 154 | 155 | // elem connectivities through face 156 
| m_faceToElem.resize(numElem); 157 | 158 | m_elemBC.resize(numElem); 159 | 160 | m_e.resize(numElem); 161 | 162 | m_pq.resize(numElem); 163 | 164 | m_qlqq.resize(numElem); 165 | 166 | m_vol.resize(numElem); 167 | 168 | m_delv.resize(numElem); 169 | m_vdov.resize(numElem); 170 | 171 | m_arealg.resize(numElem); 172 | 173 | m_ss.resize(numElem); 174 | 175 | m_elemMass.resize(numElem); 176 | 177 | m_vnew.resize(numElem) ; 178 | } 179 | 180 | void AllocateGradients(Int_t numElem, Int_t allElem) 181 | { 182 | // Position gradients 183 | m_delx_xi.resize(numElem) ; 184 | m_delx_eta.resize(numElem) ; 185 | m_delx_zeta.resize(numElem) ; 186 | 187 | // Velocity gradients 188 | m_delv_xi.resize(allElem) ; 189 | m_delv_eta.resize(allElem); 190 | m_delv_zeta.resize(allElem) ; 191 | } 192 | 193 | void DeallocateGradients() 194 | { 195 | m_delx_zeta.clear() ; 196 | m_delx_eta.clear() ; 197 | m_delx_xi.clear() ; 198 | 199 | m_delv_zeta.clear() ; 200 | m_delv_eta.clear() ; 201 | m_delv_xi.clear() ; 202 | } 203 | 204 | void AllocateStrains(Int_t numElem) 205 | { 206 | m_dxx.resize(numElem) ; 207 | m_dyy.resize(numElem) ; 208 | m_dzz.resize(numElem) ; 209 | } 210 | 211 | void DeallocateStrains() 212 | { 213 | m_dzz.clear() ; 214 | m_dyy.clear() ; 215 | m_dxx.clear() ; 216 | } 217 | 218 | // 219 | // ACCESSORS 220 | // 221 | 222 | // Node-centered 223 | 224 | // Nodal coordinates 225 | Real_t& x(Index_t idx) { return m_coord[idx].x ; } 226 | Real_t& y(Index_t idx) { return m_coord[idx].y ; } 227 | Real_t& z(Index_t idx) { return m_coord[idx].z ; } 228 | 229 | // Nodal velocities 230 | Real_t& xd(Index_t idx) { return m_vel[idx].x ; } 231 | Real_t& yd(Index_t idx) { return m_vel[idx].y ; } 232 | Real_t& zd(Index_t idx) { return m_vel[idx].z ; } 233 | 234 | // Nodal accelerations 235 | Real_t& xdd(Index_t idx) { return m_acc[idx].x ; } 236 | Real_t& ydd(Index_t idx) { return m_acc[idx].y ; } 237 | Real_t& zdd(Index_t idx) { return m_acc[idx].z ; } 238 | 239 | // Nodal forces 240 | 
Real_t& fx(Index_t idx) { return m_force[idx].x ; } 241 | Real_t& fy(Index_t idx) { return m_force[idx].y ; } 242 | Real_t& fz(Index_t idx) { return m_force[idx].z ; } 243 | 244 | // Nodal mass 245 | Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; } 246 | 247 | // Nodes on symmertry planes 248 | Index_t symmX(Index_t idx) { return m_symmX[idx] ; } 249 | Index_t symmY(Index_t idx) { return m_symmY[idx] ; } 250 | Index_t symmZ(Index_t idx) { return m_symmZ[idx] ; } 251 | bool symmXempty() { return m_symmX.empty(); } 252 | bool symmYempty() { return m_symmY.empty(); } 253 | bool symmZempty() { return m_symmZ.empty(); } 254 | 255 | // 256 | // Element-centered 257 | // 258 | Index_t& regElemSize(Index_t idx) { return m_regElemSize[idx] ; } 259 | Index_t& regNumList(Index_t idx) { return m_regNumList[idx] ; } 260 | Index_t* regNumList() { return &m_regNumList[0] ; } 261 | Index_t* regElemlist(Int_t r) { return m_regElemlist[r] ; } 262 | Index_t& regElemlist(Int_t r, Index_t idx) { return m_regElemlist[r][idx] ; } 263 | 264 | Index_t* nodelist(Index_t idx) { return &m_nodelist[Index_t(8)*idx] ; } 265 | 266 | // elem connectivities through face 267 | Index_t& lxim(Index_t idx) { return m_faceToElem[idx].lxim ; } 268 | Index_t& lxip(Index_t idx) { return m_faceToElem[idx].lxip ; } 269 | Index_t& letam(Index_t idx) { return m_faceToElem[idx].letam ; } 270 | Index_t& letap(Index_t idx) { return m_faceToElem[idx].letap ; } 271 | Index_t& lzetam(Index_t idx) { return m_faceToElem[idx].lzetam ; } 272 | Index_t& lzetap(Index_t idx) { return m_faceToElem[idx].lzetap ; } 273 | 274 | // elem face symm/free-surface flag 275 | Int_t& elemBC(Index_t idx) { return m_elemBC[idx] ; } 276 | 277 | // Principal strains - temporary 278 | Real_t& dxx(Index_t idx) { return m_dxx[idx] ; } 279 | Real_t& dyy(Index_t idx) { return m_dyy[idx] ; } 280 | Real_t& dzz(Index_t idx) { return m_dzz[idx] ; } 281 | 282 | // New relative volume - temporary 283 | Real_t& vnew(Index_t idx) { return 
m_vnew[idx] ; } 284 | 285 | // Velocity gradient - temporary 286 | Real_t& delv_xi(Index_t idx) { return m_delv_xi[idx] ; } 287 | Real_t& delv_eta(Index_t idx) { return m_delv_eta[idx] ; } 288 | Real_t& delv_zeta(Index_t idx) { return m_delv_zeta[idx] ; } 289 | 290 | // Position gradient - temporary 291 | Real_t& delx_xi(Index_t idx) { return m_delx_xi[idx] ; } 292 | Real_t& delx_eta(Index_t idx) { return m_delx_eta[idx] ; } 293 | Real_t& delx_zeta(Index_t idx) { return m_delx_zeta[idx] ; } 294 | 295 | // Energy 296 | Real_t& e(Index_t idx) { return m_e[idx] ; } 297 | 298 | // Pressure 299 | Real_t& p(Index_t idx) { return m_pq[idx].p ; } 300 | 301 | // Artificial viscosity 302 | Real_t& q(Index_t idx) { return m_pq[idx].q ; } 303 | 304 | // Linear term for q 305 | Real_t& ql(Index_t idx) { return m_qlqq[idx].ql ; } 306 | // Quadratic term for q 307 | Real_t& qq(Index_t idx) { return m_qlqq[idx].qq ; } 308 | 309 | Real_t& delv(Index_t idx) { return m_delv[idx] ; } 310 | 311 | // Relative volume 312 | Real_t& v(Index_t idx) { return m_vol[idx].v ; } 313 | // Reference volume 314 | Real_t& volo(Index_t idx) { return m_vol[idx].volo ; } 315 | 316 | // volume derivative over volume 317 | Real_t& vdov(Index_t idx) { return m_vdov[idx] ; } 318 | 319 | // Element characteristic length 320 | Real_t& arealg(Index_t idx) { return m_arealg[idx] ; } 321 | 322 | // Sound speed 323 | Real_t& ss(Index_t idx) { return m_ss[idx] ; } 324 | 325 | // Element mass 326 | Real_t& elemMass(Index_t idx) { return m_elemMass[idx] ; } 327 | 328 | Index_t nodeElemCount(Index_t idx) 329 | { return m_nodeElemStart[idx+1] - m_nodeElemStart[idx] ; } 330 | 331 | Index_t *nodeElemCornerList(Index_t idx) 332 | { return &m_nodeElemCornerList[m_nodeElemStart[idx]] ; } 333 | 334 | // Parameters 335 | 336 | // Cutoffs 337 | Real_t u_cut() const { return m_u_cut ; } 338 | Real_t e_cut() const { return m_e_cut ; } 339 | Real_t p_cut() const { return m_p_cut ; } 340 | Real_t q_cut() const { return m_q_cut ; 
} 341 | Real_t v_cut() const { return m_v_cut ; } 342 | 343 | // Other constants (usually are settable via input file in real codes) 344 | Real_t hgcoef() const { return m_hgcoef ; } 345 | Real_t qstop() const { return m_qstop ; } 346 | Real_t monoq_max_slope() const { return m_monoq_max_slope ; } 347 | Real_t monoq_limiter_mult() const { return m_monoq_limiter_mult ; } 348 | Real_t ss4o3() const { return m_ss4o3 ; } 349 | Real_t qlc_monoq() const { return m_qlc_monoq ; } 350 | Real_t qqc_monoq() const { return m_qqc_monoq ; } 351 | Real_t qqc() const { return m_qqc ; } 352 | 353 | Real_t eosvmax() const { return m_eosvmax ; } 354 | Real_t eosvmin() const { return m_eosvmin ; } 355 | Real_t pmin() const { return m_pmin ; } 356 | Real_t emin() const { return m_emin ; } 357 | Real_t dvovmax() const { return m_dvovmax ; } 358 | Real_t refdens() const { return m_refdens ; } 359 | 360 | // Timestep controls, etc... 361 | Real_t& time() { return m_time ; } 362 | Real_t& deltatime() { return m_deltatime ; } 363 | Real_t& deltatimemultlb() { return m_deltatimemultlb ; } 364 | Real_t& deltatimemultub() { return m_deltatimemultub ; } 365 | Real_t& stoptime() { return m_stoptime ; } 366 | Real_t& dtcourant() { return m_dtcourant ; } 367 | Real_t& dthydro() { return m_dthydro ; } 368 | Real_t& dtmax() { return m_dtmax ; } 369 | Real_t& dtfixed() { return m_dtfixed ; } 370 | 371 | Int_t& cycle() { return m_cycle ; } 372 | Index_t& numRanks() { return m_numRanks ; } 373 | 374 | Index_t& colLoc() { return m_colLoc ; } 375 | Index_t& rowLoc() { return m_rowLoc ; } 376 | Index_t& planeLoc() { return m_planeLoc ; } 377 | Index_t& tp() { return m_tp ; } 378 | 379 | Index_t& sizeX() { return m_sizeX ; } 380 | Index_t& sizeY() { return m_sizeY ; } 381 | Index_t& sizeZ() { return m_sizeZ ; } 382 | Index_t& numReg() { return m_numReg ; } 383 | Int_t& cost() { return m_cost ; } 384 | Index_t& numElem() { return m_numElem ; } 385 | Index_t& numNode() { return m_numNode ; } 386 | 387 | 
Index_t& maxPlaneSize() { return m_maxPlaneSize ; } 388 | Index_t& maxEdgeSize() { return m_maxEdgeSize ; } 389 | 390 | // 391 | // MPI-Related additional data 392 | // 393 | 394 | #if USE_MPI 395 | // Communication Work space 396 | Real_t *commDataSend ; 397 | Real_t *commDataRecv ; 398 | 399 | // Maximum number of block neighbors 400 | MPI_Request recvRequest[26] ; // 6 faces + 12 edges + 8 corners 401 | MPI_Request sendRequest[26] ; // 6 faces + 12 edges + 8 corners 402 | #endif 403 | 404 | private: 405 | 406 | void BuildMesh(Int_t nx, Int_t edgeNodes, Int_t edgeElems); 407 | void SetupThreadSupportStructures(); 408 | void CreateRegionIndexSets(Int_t nreg, Int_t balance); 409 | void SetupCommBuffers(Int_t edgeNodes); 410 | void SetupSymmetryPlanes(Int_t edgeNodes); 411 | void SetupElementConnectivities(Int_t edgeElems); 412 | void SetupBoundaryConditions(Int_t edgeElems); 413 | 414 | // 415 | // IMPLEMENTATION 416 | // 417 | 418 | /* Node-centered */ 419 | 420 | struct Tuple3 { 421 | Real_t x, y, z ; 422 | } ; 423 | 424 | std::vector m_coord ; /* coordinates */ 425 | 426 | std::vector m_vel ; /* velocities */ 427 | 428 | std::vector m_acc ; /* accelerations */ 429 | 430 | std::vector m_force ; /* forces */ 431 | 432 | std::vector m_nodalMass ; /* mass */ 433 | 434 | std::vector m_symmX ; /* symmetry plane nodesets */ 435 | std::vector m_symmY ; 436 | std::vector m_symmZ ; 437 | 438 | // Element-centered 439 | 440 | // Region information 441 | Int_t m_numReg ; 442 | Int_t m_cost; //imbalance cost 443 | Index_t *m_regElemSize ; // Size of region sets 444 | Index_t *m_regNumList ; // Region number per domain element 445 | Index_t **m_regElemlist ; // region indexset 446 | 447 | std::vector m_nodelist ; /* elemToNode connectivity */ 448 | 449 | struct FaceElemConn { 450 | Index_t lxim, lxip, letam, letap, lzetam, lzetap ; 451 | } ; 452 | 453 | std::vector m_faceToElem ; /* element conn across faces */ 454 | 455 | std::vector m_elemBC ; /* symmetry/free-surface flags 
for each elem face */ 456 | 457 | std::vector m_dxx ; /* principal strains -- temporary */ 458 | std::vector m_dyy ; 459 | std::vector m_dzz ; 460 | 461 | std::vector m_delv_xi ; /* velocity gradient -- temporary */ 462 | std::vector m_delv_eta ; 463 | std::vector m_delv_zeta ; 464 | 465 | std::vector m_delx_xi ; /* coordinate gradient -- temporary */ 466 | std::vector m_delx_eta ; 467 | std::vector m_delx_zeta ; 468 | 469 | std::vector m_e ; /* energy */ 470 | 471 | struct Pcomponents { 472 | Real_t p, q ; 473 | } ; 474 | 475 | std::vector m_pq ; /* pressure and artificial viscosity */ 476 | 477 | struct Qcomponents { 478 | Real_t ql, qq ; 479 | } ; 480 | 481 | std::vector m_qlqq ; /* linear and quadratic terms for q */ 482 | 483 | struct Volume { 484 | Real_t v, volo ; 485 | } ; 486 | 487 | std::vector m_vol ; /* relative and reference volume */ 488 | 489 | std::vector m_vnew ; /* new relative volume -- temporary */ 490 | std::vector m_delv ; /* m_vnew - m_v */ 491 | std::vector m_vdov ; /* volume derivative over volume */ 492 | 493 | std::vector m_arealg ; /* characteristic length of an element */ 494 | 495 | std::vector m_ss ; /* "sound speed" */ 496 | 497 | std::vector m_elemMass ; /* mass */ 498 | 499 | // Cutoffs (treat as constants) 500 | const Real_t m_e_cut ; // energy tolerance 501 | const Real_t m_p_cut ; // pressure tolerance 502 | const Real_t m_q_cut ; // q tolerance 503 | const Real_t m_v_cut ; // relative volume tolerance 504 | const Real_t m_u_cut ; // velocity tolerance 505 | 506 | // Other constants (usually setable, but hardcoded in this proxy app) 507 | 508 | const Real_t m_hgcoef ; // hourglass control 509 | const Real_t m_ss4o3 ; 510 | const Real_t m_qstop ; // excessive q indicator 511 | const Real_t m_monoq_max_slope ; 512 | const Real_t m_monoq_limiter_mult ; 513 | const Real_t m_qlc_monoq ; // linear term coef for q 514 | const Real_t m_qqc_monoq ; // quadratic term coef for q 515 | const Real_t m_qqc ; 516 | const Real_t m_eosvmax ; 517 
| const Real_t m_eosvmin ; 518 | const Real_t m_pmin ; // pressure floor 519 | const Real_t m_emin ; // energy floor 520 | const Real_t m_dvovmax ; // maximum allowable volume change 521 | const Real_t m_refdens ; // reference density 522 | 523 | // Variables to keep track of timestep, simulation time, and cycle 524 | Real_t m_dtcourant ; // courant constraint 525 | Real_t m_dthydro ; // volume change constraint 526 | Int_t m_cycle ; // iteration count for simulation 527 | Real_t m_dtfixed ; // fixed time increment 528 | Real_t m_time ; // current time 529 | Real_t m_deltatime ; // variable time increment 530 | Real_t m_deltatimemultlb ; 531 | Real_t m_deltatimemultub ; 532 | Real_t m_dtmax ; // maximum allowable time increment 533 | Real_t m_stoptime ; // end time for simulation 534 | 535 | 536 | Int_t m_numRanks ; 537 | 538 | Index_t m_colLoc ; 539 | Index_t m_rowLoc ; 540 | Index_t m_planeLoc ; 541 | Index_t m_tp ; 542 | 543 | Index_t m_sizeX ; 544 | Index_t m_sizeY ; 545 | Index_t m_sizeZ ; 546 | Index_t m_numElem ; 547 | Index_t m_numNode ; 548 | 549 | Index_t m_maxPlaneSize ; 550 | Index_t m_maxEdgeSize ; 551 | 552 | // OMP hack 553 | Index_t *m_nodeElemStart ; 554 | Index_t *m_nodeElemCornerList ; 555 | 556 | // Used in setup 557 | Index_t m_rowMin, m_rowMax; 558 | Index_t m_colMin, m_colMax; 559 | Index_t m_planeMin, m_planeMax ; 560 | 561 | } ; 562 | 563 | typedef Real_t &(Domain::* Domain_member )(Index_t) ; 564 | 565 | struct cmdLineOpts { 566 | Int_t its; // -i 567 | Int_t nx; // -s 568 | Int_t numReg; // -r 569 | Int_t numFiles; // -f 570 | Int_t showProg; // -p 571 | Int_t quiet; // -q 572 | Int_t viz; // -v 573 | Int_t cost; // -c 574 | Int_t balance; // -b 575 | }; 576 | 577 | 578 | 579 | // Function Prototypes 580 | 581 | // lulesh-par 582 | Real_t CalcElemVolume( const Real_t x[8], 583 | const Real_t y[8], 584 | const Real_t z[8]); 585 | 586 | // lulesh-util 587 | void ParseCommandLineOptions(int argc, char *argv[], 588 | Int_t myRank, struct 
cmdLineOpts *opts); 589 | void VerifyAndWriteFinalOutput(Real_t elapsed_time, 590 | Domain& locDom, 591 | Int_t nx, 592 | Int_t numRanks); 593 | 594 | // lulesh-viz 595 | void DumpToVisit(Domain& domain, int numFiles, int myRank, int numRanks); 596 | 597 | // lulesh-comm 598 | void CommRecv(Domain& domain, Int_t msgType, Index_t xferFields, 599 | Index_t dx, Index_t dy, Index_t dz, 600 | bool doRecv, bool planeOnly); 601 | void CommSend(Domain& domain, Int_t msgType, 602 | Index_t xferFields, Domain_member *fieldData, 603 | Index_t dx, Index_t dy, Index_t dz, 604 | bool doSend, bool planeOnly); 605 | void CommSBN(Domain& domain, Int_t xferFields, Domain_member *fieldData); 606 | void CommSyncPosVel(Domain& domain); 607 | void CommMonoQ(Domain& domain); 608 | 609 | // lulesh-init 610 | void InitMeshDecomp(Int_t numRanks, Int_t myRank, 611 | Int_t *col, Int_t *row, Int_t *plane, Int_t *side); 612 | --------------------------------------------------------------------------------