├── AcroTensor.hpp ├── LICENSE ├── README.md ├── config └── defaults.mk ├── exec ├── CPUInterpretedExecutor.cpp ├── CPUInterpretedExecutor.hpp ├── CudaExecutor.cpp ├── CudaExecutor.hpp ├── Executor.hpp ├── KernelExecutor.cpp └── KernelExecutor.hpp ├── kernel ├── DimensionedKernel.cpp ├── DimensionedKernel.hpp ├── DimensionedMultiKernel.cpp ├── DimensionedMultiKernel.hpp ├── TensorEngine.cpp ├── TensorEngine.hpp ├── TensorKernel.cpp └── TensorKernel.hpp ├── makefile ├── ops ├── CudaGPUOps.cpp ├── CudaGPUOps.hpp ├── NativeCPUOps.cpp ├── NativeCPUOps.hpp ├── NonContractionOps.hpp └── Ops.hpp ├── tensor ├── IndexMapping.cpp ├── IndexMapping.hpp ├── IndexVector.cpp ├── IndexVector.hpp ├── SliceTensor.cpp ├── SliceTensor.hpp ├── Tensor.cpp └── Tensor.hpp ├── unittest ├── LICENSE_1_0.txt ├── catch.hpp ├── kernel │ ├── test_DimensionedKernel.cpp │ ├── test_TensorEngine.cpp │ └── test_TensorKernel.cpp ├── makefile ├── tensor │ ├── test_SliceTensor.cpp │ └── test_Tensor.cpp └── unit_test_main.cpp └── util ├── CudaUtil.cpp ├── CudaUtil.hpp ├── Error.hpp ├── StringUtil.hpp └── Util.hpp /AcroTensor.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #include "TensorEngine.hpp" 7 | #include "Executor.hpp" 8 | #include "SliceTensor.hpp" 9 | #include "Ops.hpp" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Lawrence Livermore National Laboratory 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Acrotensor 2 | 3 | Acrotensor is a C++/CUDA library for computing arbitrary tensor contractions both on CPUs and GPUs. The tensors are dynamically sized allowing for a high degree of flexibility, and the tensor contractions are defined with a natural mathematical notation for maximum usability. 
In order to maintain good performance, contraction code is dynamically generated with fixed sizes and unrolled loops and then Just-In-Time (JIT) compiled to produce better-optimized execution. 4 | 5 | ## Getting started 6 | 7 | Acrotensor depends on a C++11 compiler and requires the nvcc CUDA wrapper on the compiler in order to handle the mix of C++ and CUDA. To get the build started, enter the acrotensor directory and run: 8 | ``` 9 | make config 10 | ``` 11 | 12 | This will generate a `config/config.mk` file with a set of defaults that you may need to change for your environment. Once you have edited your `config.mk`, simply enter the acrotensor directory and run: 13 | ``` 14 | make 15 | ``` 16 | 17 | This will build both static and dynamic libraries that you can link against in the `lib` folder and generate an `inc` directory with all of the header files that you will need. 18 | 19 | If you would like to perform some sanity checks on Acrotensor before moving forward, you can build and run the unit test suite by entering the acrotensor directory and running: 20 | ``` 21 | make unittest 22 | ``` 23 | 24 | ## Usage 25 | 26 | To gain access to the Acrotensor objects, be sure to include `AcroTensor.hpp` and link against either the static or dynamic library. The two user-facing objects needed to utilize Acrotensor are `acro::Tensor` and `acro::TensorEngine`. The `Tensor` objects can be constructed on the CPU with dimensions provided by a list of numbers or an `std::vector`: 27 | ``` 28 | //Start of an example contraction that will add 1000 random matrices together on the GPU 29 | std::vector<int> dims {1000, 3, 3}; 30 | acro::Tensor A(dims); //1000x3x3 entry tensor 31 | acro::Tensor B(1000); //1000 entry tensor 32 | acro::Tensor S(3,3); //3x3 tensor 33 | ``` 34 | 35 | Once the tensors are created they can be accessed on the CPU with tensor indexing using the `()` operator and linear indexing using the `[]` operator. The data in the tensors is laid out linearly with the most significant index on the left. There are also utility methods such as `Set()` and `Print()`: 36 | ``` 37 | for (int linear = 0; linear < 1000*3*3; ++linear) 38 | A[linear] = (double)rand() / RAND_MAX; 39 | 40 | B.Set(1.0); 41 | for (int i = 0; i < 3; ++i) 42 | for (int j = 0; j < 3; ++j) 43 | S(i, j) = 0.0; 44 | ``` 45 | 46 | Memory motion between the CPU and GPU can be accomplished by using the following `Tensor` methods: 47 | ``` 48 | A.MapToGPU(); //Allocate memory on the GPU 49 | B.MapToGPU(); 50 | S.MapToGPU(); 51 | 52 | A.MoveToGPU(); //Copy the data to the GPU and indicate that the GPU has the fresh copy 53 | B.MoveToGPU(); 54 | 55 | S.SwitchToGPU(); //Indicate that the GPU has the fresh copy without copying the data (good for outputs) 56 | ``` 57 | 58 | Tensor contractions can now be handled through a `TensorEngine` object. Tensor engines can be initialized with different execution policies that handle contractions on the CPU or GPU with different approaches. The contraction string in the `[]` operator defines how the tensors will be indexed, multiplied, and added. The dimensions of the contraction operation are set by the dimensions of the tensors that are passed in via the `()` operator. Any index that does not appear on the left-hand side is summed over and contracted away in the output tensor. 
59 | ``` 60 | acro::TensorEngine TE("Cuda"); //Initialize the engine with the Cuda exec policy 61 | TE("S_i_j = A_n_i_j B_n", S, A, B); //Contract on n and sum the 1000 matrices into 1 62 | TE("S_i_j = A_n_i_j", S, A); //Same result as before since n is still contracted 63 | 64 | S.MoveFromGPU(); //Get the results back from the GPU 65 | S.Print(); //Display the results of the contraction 66 | ``` 67 | -------------------------------------------------------------------------------- /config/defaults.mk: -------------------------------------------------------------------------------- 1 | #Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | #Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | #All rights reserved. 4 | #This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #Default values for utilizing nvcc+gcc on a P100 system 7 | DEBUG = NO 8 | CUDADIR = /usr/local/cuda 9 | CXX = $(CUDADIR)/bin/nvcc 10 | UTILCXX = $(CXX) 11 | CXX_OPT = -O3 -g -arch compute_60 -x cu --std=c++11 -DACRO_HAVE_CUDA --compiler-options="-fPIC" 12 | CXX_DEBUG = -G -g -arch compute_60 -x cu --std=c++11 -DACRO_HAVE_CUDA --compiler-options="-fPIC" 13 | UNITTEST_LDFLAGS = -O0 -G -arch compute_60 --std=c++11 -lnvrtc -lcuda -L$(CUDADIR)/lib64 -------------------------------------------------------------------------------- /exec/CPUInterpretedExecutor.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
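// CPUInterpretedExecutor runs a kernel on the CPU by interpreting its loop nest rather than
// generating code. ExecuteSingle() zeroes the output when the kernel uses the "=" operator and
// then accumulates with += over every loop index combination, dispatching nests of 1 to 12 loops
// to the fixed-depth ExecuteNLoops() methods below and anything deeper to ExecuteArbitraryLoops().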
5 | 6 | #include "CPUInterpretedExecutor.hpp" 7 | #include 8 | #include 9 | 10 | namespace acro 11 | { 12 | 13 | 14 | CPUInterpretedExecutor::CPUInterpretedExecutor(DimensionedMultiKernel *multi_kernel) : KernelExecutor(multi_kernel) 15 | { 16 | NumLoops = FirstKernel->GetNumIndices(); 17 | NumInVars = FirstKernel->GetNumInputVars(); 18 | N = FirstKernel->GetLoopDims(); 19 | 20 | OutputRank = FirstKernel->GetVarRank(-1); 21 | OutputLoopNums = &(FirstKernel->OutputVar.LoopNums[0]); 22 | OutputStrides = new int[OutputRank]; 23 | for (int di = 0; di < OutputRank; ++di) 24 | { 25 | OutputStrides[di] = FirstKernel->GetVarDimStride(-1, di); 26 | } 27 | 28 | 29 | InputRanks = new int[NumInVars]; 30 | InputLoopNums = new int*[NumInVars]; 31 | InputStrides = new int*[NumInVars]; 32 | InputVars = new double*[NumInVars]; 33 | for (int vari = 0; vari < NumInVars; ++vari) 34 | { 35 | InputRanks[vari] = FirstKernel->GetVarRank(vari); 36 | InputLoopNums[vari] = &(FirstKernel->InputVars[vari].LoopNums[0]); 37 | InputStrides[vari] = new int[InputRanks[vari]]; 38 | for (int di = 0; di < InputRanks[vari]; ++di) 39 | { 40 | InputStrides[vari][di] = FirstKernel->GetVarDimStride(vari, di); 41 | } 42 | } 43 | } 44 | 45 | CPUInterpretedExecutor::~CPUInterpretedExecutor() 46 | { 47 | delete [] OutputStrides; 48 | delete [] InputRanks; 49 | delete [] InputLoopNums; 50 | delete [] InputVars; 51 | for (int vari = 0; vari < NumInVars; ++vari) 52 | { 53 | delete [] InputStrides[vari]; 54 | } 55 | delete [] InputStrides; 56 | } 57 | 58 | 59 | void CPUInterpretedExecutor::ExecuteSingle(Tensor *output, std::vector &inputs) 60 | { 61 | MoveTensorsFromGPU(output, inputs); 62 | 63 | //Since we are using += or -= into the output 64 | if (FirstKernel->EqOperator == "=") 65 | { 66 | output->Set(0.0); 67 | } 68 | 69 | OutputVar = output->GetData(); 70 | for (int vari = 0; vari < NumInVars; ++vari) 71 | { 72 | InputVars[vari] = inputs[vari]->GetData(); 73 | } 74 | 75 | switch (NumLoops) 76 | { 77 | case 1: Execute1Loops(); break; 78 | case 2: Execute2Loops(); break; 79 | case 3: Execute3Loops(); break; 80 | case 4: Execute4Loops(); break; 81 | case 5: Execute5Loops(); break; 82 | case 6: Execute6Loops(); break; 83 | case 7: Execute7Loops(); break; 84 | case 8: Execute8Loops(); break; 85 | case 9: Execute9Loops(); break; 86 | case 10: Execute10Loops(); break; 87 | case 11: Execute11Loops(); break; 88 | case 12: Execute12Loops(); break; 89 | default: ExecuteArbitraryLoops(); 90 | } 91 | } 92 | 93 | std::string CPUInterpretedExecutor::GetImplementation() 94 | { 95 | return "Interpreted\n"; 96 | } 97 | 98 | 99 | void CPUInterpretedExecutor::Execute1Loops() 100 | { 101 | int I[1]; 102 | int &i0 = I[0]; 103 | 104 | for (i0 = 0; i0 < N[0]; ++i0) 105 | { 106 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 107 | } 108 | } 109 | 110 | 111 | void CPUInterpretedExecutor::Execute2Loops() 112 | { 113 | int I[2]; 114 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 115 | { 116 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 117 | { 118 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 119 | } 120 | } 121 | } 122 | 123 | 124 | void CPUInterpretedExecutor::Execute3Loops() 125 | { 126 | int I[3]; 127 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 128 | { 129 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 130 | { 131 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 132 | { 133 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 134 | } 135 | } 136 | } 137 | } 138 | 139 | 140 | 
void CPUInterpretedExecutor::Execute4Loops() 141 | { 142 | int I[4]; 143 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 144 | { 145 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 146 | { 147 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 148 | { 149 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 150 | { 151 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 152 | } 153 | } 154 | } 155 | } 156 | } 157 | 158 | 159 | void CPUInterpretedExecutor::Execute5Loops() 160 | { 161 | int I[5]; 162 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 163 | { 164 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 165 | { 166 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 167 | { 168 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 169 | { 170 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 171 | { 172 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 173 | } 174 | } 175 | } 176 | } 177 | } 178 | } 179 | 180 | 181 | void CPUInterpretedExecutor::Execute6Loops() 182 | { 183 | int I[6]; 184 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 185 | { 186 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 187 | { 188 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 189 | { 190 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 191 | { 192 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 193 | { 194 | for (I[5] = 0; I[5] < N[5]; ++I[5]) 195 | { 196 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 197 | } 198 | } 199 | } 200 | } 201 | } 202 | } 203 | } 204 | 205 | 206 | void CPUInterpretedExecutor::Execute7Loops() 207 | { 208 | int I[7]; 209 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 210 | { 211 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 212 | { 213 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 214 | { 215 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 216 | { 217 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 218 | { 219 | for (I[5] = 0; I[5] < N[5]; ++I[5]) 220 | { 221 | for (I[6] = 0; I[6] < N[6]; ++I[6]) 222 | { 223 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 224 | } 225 | } 226 | } 227 | } 228 | } 229 | } 230 | } 231 | } 232 | 233 | 234 | void CPUInterpretedExecutor::Execute8Loops() 235 | { 236 | int I[8]; 237 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 238 | { 239 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 240 | { 241 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 242 | { 243 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 244 | { 245 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 246 | { 247 | for (I[5] = 0; I[5] < N[5]; ++I[5]) 248 | { 249 | for (I[6] = 0; I[6] < N[6]; ++I[6]) 250 | { 251 | for (I[7] = 0; I[7] < N[7]; ++I[7]) 252 | { 253 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 254 | } 255 | } 256 | } 257 | } 258 | } 259 | } 260 | } 261 | } 262 | } 263 | 264 | 265 | void CPUInterpretedExecutor::Execute9Loops() 266 | { 267 | int I[9]; 268 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 269 | { 270 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 271 | { 272 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 273 | { 274 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 275 | { 276 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 277 | { 278 | for (I[5] = 0; I[5] < N[5]; ++I[5]) 279 | { 280 | for (I[6] = 0; I[6] < N[6]; ++I[6]) 281 | { 282 | for (I[7] = 0; I[7] < N[7]; ++I[7]) 283 | { 284 | for (I[8] = 0; I[8] < N[8]; ++I[8]) 285 | { 286 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 287 | } 288 | } 289 | } 290 | } 291 | } 292 | } 293 | } 294 | } 295 | } 296 | } 297 | 298 | 299 | void CPUInterpretedExecutor::Execute10Loops() 300 | { 301 | int I[10]; 302 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 303 | { 304 | for (I[1] = 0; I[1] < N[1]; 
++I[1]) 305 | { 306 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 307 | { 308 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 309 | { 310 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 311 | { 312 | for (I[5] = 0; I[5] < N[5]; ++I[5]) 313 | { 314 | for (I[6] = 0; I[6] < N[6]; ++I[6]) 315 | { 316 | for (I[7] = 0; I[7] < N[7]; ++I[7]) 317 | { 318 | for (I[8] = 0; I[8] < N[8]; ++I[8]) 319 | { 320 | for (I[9] = 0; I[9] < N[9]; ++I[9]) 321 | { 322 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 323 | } 324 | } 325 | } 326 | } 327 | } 328 | } 329 | } 330 | } 331 | } 332 | } 333 | } 334 | 335 | 336 | void CPUInterpretedExecutor::Execute11Loops() 337 | { 338 | int I[11]; 339 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 340 | { 341 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 342 | { 343 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 344 | { 345 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 346 | { 347 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 348 | { 349 | for (I[5] = 0; I[5] < N[5]; ++I[5]) 350 | { 351 | for (I[6] = 0; I[6] < N[6]; ++I[6]) 352 | { 353 | for (I[7] = 0; I[7] < N[7]; ++I[7]) 354 | { 355 | for (I[8] = 0; I[8] < N[8]; ++I[8]) 356 | { 357 | for (I[9] = 0; I[9] < N[9]; ++I[9]) 358 | { 359 | for (I[10] = 0; I[10] < N[10]; ++I[10]) 360 | { 361 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 362 | } 363 | } 364 | } 365 | } 366 | } 367 | } 368 | } 369 | } 370 | } 371 | } 372 | } 373 | } 374 | 375 | 376 | void CPUInterpretedExecutor::Execute12Loops() 377 | { 378 | int I[12]; 379 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 380 | { 381 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 382 | { 383 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 384 | { 385 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 386 | { 387 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 388 | { 389 | for (I[5] = 0; I[5] < N[5]; ++I[5]) 390 | { 391 | for (I[6] = 0; I[6] < N[6]; ++I[6]) 392 | { 393 | for (I[7] = 0; I[7] < N[7]; ++I[7]) 394 | { 395 | for (I[8] = 0; I[8] < N[8]; ++I[8]) 396 | { 397 | for (I[9] = 0; I[9] < N[9]; ++I[9]) 398 | { 399 | for (I[10] = 0; I[10] < N[10]; ++I[10]) 400 | { 401 | for (I[11] = 0; I[11] < N[11]; ++I[11]) 402 | { 403 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 404 | } 405 | } 406 | } 407 | } 408 | } 409 | } 410 | } 411 | } 412 | } 413 | } 414 | } 415 | } 416 | } 417 | 418 | 419 | void CPUInterpretedExecutor::ExecuteArbitraryLoops() 420 | { 421 | std::vector I(FirstKernel->GetNumIndices(), 0); //Loop indices 422 | std::vector W(FirstKernel->GetNumIndices()); //Loop strides 423 | W[W.size()-1] = 1; 424 | for (int d = W.size() - 2; d >= 0; --d) 425 | { 426 | W[d] = W[d+1]*N[d+1]; 427 | } 428 | 429 | int flatidx_size = 1; 430 | for (int d = 0; d < W.size(); ++d) 431 | { 432 | flatidx_size *= N[d]; 433 | } 434 | 435 | for (int flatidx = 0; flatidx < flatidx_size; ++flatidx) 436 | { 437 | //Compute the unflattened indices 438 | for (int loopd = 0; loopd < I.size(); ++loopd) 439 | { 440 | I[loopd] = (flatidx / W[loopd]) % N[loopd]; 441 | } 442 | OutputVar[ComputeRawIdx(I.data(), OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I.data()); 443 | } 444 | } 445 | 446 | } -------------------------------------------------------------------------------- /exec/CPUInterpretedExecutor.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 
3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_CPUINTERPRETED_EXECUTOR_HPP 7 | #define ACROBATIC_CPUINTERPRETED_EXECUTOR_HPP 8 | 9 | #include "KernelExecutor.hpp" 10 | #include 11 | 12 | namespace acro 13 | { 14 | 15 | class CPUInterpretedExecutor : public KernelExecutor 16 | { 17 | public: 18 | CPUInterpretedExecutor(DimensionedMultiKernel *multi_kernel); 19 | ~CPUInterpretedExecutor(); 20 | virtual void ExecuteSingle(Tensor *output, std::vector &inputs); 21 | virtual std::string GetImplementation(); 22 | virtual std::string GetExecType() {return "CPUInterpreted";} 23 | 24 | private: 25 | void Execute1Loops(); 26 | void Execute2Loops(); 27 | void Execute3Loops(); 28 | void Execute4Loops(); 29 | void Execute5Loops(); 30 | void Execute6Loops(); 31 | void Execute7Loops(); 32 | void Execute8Loops(); 33 | void Execute9Loops(); 34 | void Execute10Loops(); 35 | void Execute11Loops(); 36 | void Execute12Loops(); 37 | void ExecuteArbitraryLoops(); 38 | 39 | inline double ComputeRHS(const int *RESTRICT I); 40 | inline int ComputeRawIdx(const int *RESTRICT I, const int *loop_nums, const int *var_stride, int rank); 41 | 42 | int NumInVars; 43 | int NumLoops; 44 | std::vector N; 45 | 46 | int OutputRank; 47 | double *OutputVar; 48 | int *OutputLoopNums; 49 | int *OutputStrides; 50 | 51 | int *InputRanks; 52 | double **InputVars; 53 | int **InputLoopNums; 54 | int **InputStrides; 55 | }; 56 | 57 | 58 | inline double CPUInterpretedExecutor::ComputeRHS(const int *RESTRICT I) 59 | { 60 | double rhs_val = InputVars[0][ComputeRawIdx(I, InputLoopNums[0], InputStrides[0], InputRanks[0])]; 61 | for (int vari = 1; vari < NumInVars; ++vari) 62 | { 63 | rhs_val *= InputVars[vari][ComputeRawIdx(I, InputLoopNums[vari], InputStrides[vari], InputRanks[vari])]; 64 | } 65 | return rhs_val; 66 | } 67 | 68 | 69 | inline int CPUInterpretedExecutor::ComputeRawIdx(const int *RESTRICT I, const int *loop_nums, const int *var_stride, int rank) 70 | { 71 | int raw_idx = I[loop_nums[0]]*var_stride[0]; 72 | for (int d = 1; d < rank; ++d) 73 | { 74 | raw_idx += I[loop_nums[d]]*var_stride[d]; 75 | } 76 | return raw_idx; 77 | } 78 | 79 | } 80 | 81 | #endif //ACROBATIC_CPUINTERPRETED_EXECUTOR_HPP -------------------------------------------------------------------------------- /exec/CudaExecutor.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
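// CudaExecutor builds its GPU code at runtime: GenerateCudaKernel() fills a CUDA source template
// with the block-index setup (GenInitIndices), optional shared-memory preloads of small tensors
// (GenSharedMemPreload), and the per-subkernel contraction loops (GenSubKernelLoops), then hands
// the finished string to CudaKernel::GenerateFunction() for compilation. ExecuteSingle() and
// ExecuteMulti() gather a device pointer for each unique variable into KernelParams and launch
// the compiled kernel.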
5 | 6 | #ifdef ACRO_HAVE_CUDA 7 | 8 | #include "CudaExecutor.hpp" 9 | #include 10 | #include 11 | #include 12 | 13 | namespace acro 14 | { 15 | 16 | 17 | CudaExecutor::CudaExecutor(DimensionedMultiKernel *multi_kernel) : KernelExecutor(multi_kernel) 18 | { 19 | HDeviceTensors = nullptr; 20 | SharedMemAllocated = 0; 21 | cudaGetDeviceProperties(&CudaDeviceProp, 0); 22 | GenerateCudaKernel(); 23 | } 24 | 25 | CudaExecutor::~CudaExecutor() 26 | { 27 | if (HDeviceTensors != nullptr) 28 | { 29 | delete HDeviceTensors; 30 | } 31 | acroCudaErrorCheck(cuModuleUnload(TheCudaKernel->Module)); 32 | delete TheCudaKernel; 33 | } 34 | 35 | 36 | void CudaExecutor::ExecuteSingle(Tensor *output, std::vector &inputs) 37 | { 38 | MoveTensorsToGPU(output, inputs); 39 | 40 | int numuvars = MultiKernel->GetNumUVars(); 41 | if (KernelParams.size() == 0) 42 | { 43 | HDeviceTensors = new double*[numuvars]; 44 | KernelParams.resize(numuvars); 45 | } 46 | 47 | for (int uvari = 0; uvari < numuvars; ++uvari) 48 | { 49 | auto ki_vari = MultiKernel->GetFirstKiVariForUVari(uvari); 50 | int vari = ki_vari.second; 51 | double *dtensor; 52 | if (vari == -1) 53 | { 54 | dtensor = output->GetDeviceData(); 55 | } 56 | else 57 | { 58 | dtensor = inputs[vari]->GetDeviceData(); 59 | } 60 | HDeviceTensors[uvari] = dtensor; 61 | KernelParams[uvari] = &(HDeviceTensors[uvari]); 62 | } 63 | 64 | TheCudaKernel->Launch(KernelParams); 65 | //cudaDeviceSynchronize(); 66 | } 67 | 68 | 69 | void CudaExecutor::ExecuteMulti(std::vector &outputs, std::vector > &inputs) 70 | { 71 | for (int ki = 0; ki < MultiKernel->GetNumKernels(); ++ki) 72 | { 73 | MoveTensorsToGPU(outputs[ki], inputs[ki]); 74 | } 75 | 76 | int numuvars = MultiKernel->GetNumUVars(); 77 | if (KernelParams.size() == 0) 78 | { 79 | HDeviceTensors = new double*[numuvars]; 80 | KernelParams.resize(numuvars); 81 | } 82 | 83 | for (int uvari = 0; uvari < numuvars; ++uvari) 84 | { 85 | auto ki_vari = MultiKernel->GetFirstKiVariForUVari(uvari); 86 | int ki = ki_vari.first; 87 | int vari = ki_vari.second; 88 | double *dtensor; 89 | if (vari == -1) 90 | { 91 | dtensor = outputs[ki]->GetDeviceData(); 92 | } 93 | else 94 | { 95 | dtensor = inputs[ki][vari]->GetDeviceData(); 96 | } 97 | HDeviceTensors[uvari] = dtensor; 98 | KernelParams[uvari] = &(HDeviceTensors[uvari]); 99 | } 100 | TheCudaKernel->Launch(KernelParams); 101 | //cudaDeviceSynchronize(); 102 | } 103 | 104 | 105 | std::string CudaExecutor::GetImplementation() 106 | { 107 | return TheCudaKernel->Code; 108 | } 109 | 110 | 111 | void CudaExecutor::GenerateCudaKernel() 112 | { 113 | TheCudaKernel = new CudaKernel; 114 | TheCudaKernel->Code = 115 | "extern \"C\" \n" 116 | "__global__\n" 117 | "__launch_bounds__()\n" 118 | "void ()\n" 119 | "{\n" 120 | " double sum;\n" 121 | " const unsigned int outidx = blockIdx.x;\n" 122 | "\n" 123 | "" 124 | "\n" 125 | "" 126 | "\n" 127 | " __syncthreads();\n" 128 | "" 129 | "\n" 130 | "" 131 | 132 | "}\n"; 133 | 134 | ACROBATIC_ASSERT(MultiKernel->GetNumOuterIndices() > 0, "CudaExecutor needs at least 1 non-contraction index."); 135 | 136 | NumBlockLoops = GetNumBlockLoops(); 137 | 138 | 139 | int outidx_size = MultiKernel->GetIdxSizeForFirstNumLoops(NumBlockLoops); 140 | TheCudaKernel->FunctionName = "Kernel"; 141 | TheCudaKernel->ThreadsPerBlock = GetNumThreadsPerBlock(NumBlockLoops); 142 | TheCudaKernel->NumBlocks = outidx_size; 143 | 144 | //Generate the params list 145 | std::string params_str; 146 | for (int uvari = 0; uvari < MultiKernel->GetNumUVars(); ++uvari) 147 | { 148 | if 
(MultiKernel->IsOutputUVar(uvari)) 149 | { 150 | params_str += "double * const T" + std::to_string(uvari); 151 | } 152 | else 153 | { 154 | params_str += "double const * const T" + std::to_string(uvari); 155 | } 156 | 157 | if (uvari < MultiKernel->GetNumUVars()-1) 158 | { 159 | params_str += ", "; 160 | } 161 | } 162 | 163 | GetSharedMemUvars(); 164 | std::string preload_sm_str = GenSharedMemPreload(); 165 | GetSharedMemWRKernels(); 166 | std::string alloc_smwr_buffer_str = GenSharedMemWRBuffer(); 167 | 168 | //Generate the indices outside the contraction loop 169 | std::string init_indices_str = GenInitIndices(); 170 | 171 | 172 | //Generate the subkernel loops 173 | std::string subkernel_loops_str = GenSubKernelLoops(); 174 | 175 | str_replace_all(TheCudaKernel->Code, "", TheCudaKernel->ThreadsPerBlock); 176 | str_replace_all(TheCudaKernel->Code, "", 4096 / TheCudaKernel->ThreadsPerBlock); 177 | str_replace_all(TheCudaKernel->Code, "", TheCudaKernel->FunctionName); 178 | str_replace_all(TheCudaKernel->Code, "", params_str); 179 | str_replace_all(TheCudaKernel->Code, "", MultiKernel->GetNumUVars()); 180 | str_replace_all(TheCudaKernel->Code, "", outidx_size); 181 | str_replace_all(TheCudaKernel->Code, "", alloc_smwr_buffer_str); 182 | str_replace_all(TheCudaKernel->Code, "", preload_sm_str); 183 | str_replace_all(TheCudaKernel->Code, "", init_indices_str); 184 | str_replace_all(TheCudaKernel->Code, "", subkernel_loops_str); 185 | 186 | //std::cout << TheCudaKernel->Code << std::endl; 187 | //std::cout << MultiKernel->GetDimensionedNameString() << std::endl; 188 | //TheCudaKernel->WriteCodeToFile("kernel.cu"); 189 | TheCudaKernel->GenerateFunction(); 190 | } 191 | 192 | 193 | int CudaExecutor::GetNumBlockLoops() 194 | { 195 | int loopi; 196 | for (loopi = 0; loopi < MultiKernel->GetNumOuterIndices(); ++loopi) 197 | { 198 | if (MultiKernel->GetIdxSizeForFirstNumLoops(loopi) >= 4096 || GetMinMidIdxSize(loopi) < 128) 199 | { 200 | break; 201 | } 202 | } 203 | return loopi; 204 | } 205 | 206 | 207 | int CudaExecutor::GetMinMidIdxSize(int num_block_loops) 208 | { 209 | int numloops = MultiKernel->GetNumIndices(); 210 | int min_idx_size = std::numeric_limits::max(); 211 | for (int ki = 0; ki < MultiKernel->GetNumKernels(); ++ki) 212 | { 213 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 214 | std::vector mid_loops; 215 | for (int loopi = num_block_loops; loopi < numloops; ++loopi) 216 | { 217 | if (kernel->IsDependentOnLoop(loopi) && !kernel->IsContractionLoop(loopi)) 218 | { 219 | mid_loops.push_back(loopi); 220 | } 221 | } 222 | min_idx_size = std::min(min_idx_size, kernel->GetLoopsIdxSize(mid_loops)); 223 | } 224 | return min_idx_size; 225 | } 226 | 227 | 228 | int CudaExecutor::GetMaxMidIdxSize(int num_block_loops) 229 | { 230 | int numloops = MultiKernel->GetNumIndices(); 231 | int max_idx_size = -1; 232 | for (int ki = 0; ki < MultiKernel->GetNumKernels(); ++ki) 233 | { 234 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 235 | std::vector mid_loops; 236 | for (int loopi = num_block_loops; loopi < numloops; ++loopi) 237 | { 238 | if (kernel->IsDependentOnLoop(loopi) && !kernel->IsContractionLoop(loopi)) 239 | { 240 | mid_loops.push_back(loopi); 241 | } 242 | } 243 | max_idx_size = std::max(max_idx_size, kernel->GetLoopsIdxSize(mid_loops)); 244 | } 245 | return max_idx_size; 246 | } 247 | 248 | 249 | int CudaExecutor::GetNumThreadsPerBlock(int num_block_loops) 250 | { 251 | int min = GetMinMidIdxSize(num_block_loops); 252 | int max = GetMaxMidIdxSize(num_block_loops); 253 | 
int block_size; 254 | for (block_size = 64; block_size < 512; block_size *= 2) 255 | { 256 | if (block_size > max || block_size > int(1.3*float(min))) 257 | { 258 | break; 259 | } 260 | } 261 | //std::cout << block_size << std::endl; 262 | return block_size; 263 | } 264 | 265 | void CudaExecutor::GetSharedMemUvars() 266 | { 267 | int numuvars = MultiKernel->GetNumUVars(); 268 | SharedMemUvars.resize(numuvars); 269 | int num_blocks_per_full_sm = CudaDeviceProp.maxThreadsPerMultiProcessor / TheCudaKernel->ThreadsPerBlock; 270 | int shared_mem_size = (CudaDeviceProp.sharedMemPerMultiprocessor / num_blocks_per_full_sm); 271 | for (int uvari = 0; uvari < numuvars; ++uvari) 272 | { 273 | SharedMemUvars[uvari] = false; 274 | if (!MultiKernel->IsOutputUVar(uvari)) 275 | { 276 | int ivar_bytesize = MultiKernel->GetVarSize(uvari)*8; 277 | if (ivar_bytesize + SharedMemAllocated < shared_mem_size) 278 | { 279 | SharedMemUvars[uvari] = true; 280 | SharedMemAllocated += ivar_bytesize; 281 | } 282 | } 283 | } 284 | } 285 | 286 | void CudaExecutor::GetSharedMemWRKernels() 287 | { 288 | int num_blocks_per_full_sm = CudaDeviceProp.maxThreadsPerMultiProcessor / TheCudaKernel->ThreadsPerBlock; 289 | int shared_mem_size = (CudaDeviceProp.sharedMemPerMultiprocessor / num_blocks_per_full_sm); 290 | SharedMemWRKernels.resize(MultiKernel->GetNumKernels(), false); 291 | SMWRBufferSize = 0; 292 | for (int ki = 0; ki < MultiKernel->GetNumKernels() - 1; ++ki) 293 | { 294 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 295 | DimensionedKernel *next_kernel = MultiKernel->Kernels[ki + 1]; 296 | if (ki == 0 || SharedMemWRKernels[ki-1] == false) //Avoid collision on the buffer 297 | { 298 | int outuvari = MultiKernel->GetUVari(ki, -1); 299 | for (int vari = 0; vari < next_kernel->GetNumInputVars(); ++vari) 300 | { 301 | if (MultiKernel->GetUVari(ki+1, vari) == outuvari) 302 | { 303 | int onblock_var_idxsize = 8; //Bytes/double 304 | for (int di = 0; di < MultiKernel->GetVarRank(ki, vari); ++di) 305 | { 306 | int loopi = MultiKernel->GetVarDimLoopNum(ki, vari, di); 307 | if (loopi >= NumBlockLoops) 308 | { 309 | onblock_var_idxsize *= MultiKernel->GetLoopDim(loopi); 310 | } 311 | } 312 | 313 | if (onblock_var_idxsize + SharedMemAllocated < shared_mem_size) 314 | { 315 | SMWRBufferSize = std::max(SMWRBufferSize, onblock_var_idxsize); 316 | SharedMemWRKernels[ki] = true; 317 | } 318 | } 319 | } 320 | } 321 | } 322 | SharedMemAllocated += SMWRBufferSize; 323 | } 324 | 325 | 326 | std::vector CudaExecutor::GetMidloopsOrder(int ki) 327 | { 328 | DimensionedKernel* kernel = MultiKernel->Kernels[ki]; 329 | int numloops = MultiKernel->GetNumIndices(); 330 | int numinvars = kernel->GetNumInputVars(); 331 | 332 | //Generate the mid loops 333 | std::set mid_loops_set; 334 | for (int loopi = NumBlockLoops; loopi < numloops; ++loopi) 335 | { 336 | if (kernel->IsDependentOnLoop(loopi) && !kernel->IsContractionLoop(loopi)) 337 | { 338 | mid_loops_set.insert(loopi); 339 | } 340 | } 341 | 342 | int max_ivar_rank = 0; 343 | for (int vari = -1; vari < numinvars; ++vari) 344 | { 345 | max_ivar_rank = std::max(max_ivar_rank, kernel->GetVarRank(vari)); 346 | } 347 | 348 | //Collect of the loop dimensions from all the variables in lowest stride order 349 | std::vector mid_loops; 350 | for (int rankoff = 0; rankoff < max_ivar_rank; ++rankoff) 351 | { 352 | for (int vari = numinvars-1; vari >= -1; --vari) 353 | { 354 | int uvari = MultiKernel->GetUVari(ki, vari); 355 | int vidxi = kernel->GetVarRank(vari) - 1 - rankoff; 356 | int loopi = 
vidxi >= 0 ? kernel->GetVarDimLoopNum(vari, vidxi) : -1; 357 | auto it = mid_loops_set.find(loopi); 358 | if (!SharedMemUvars[uvari] && it != mid_loops_set.end()) 359 | { 360 | mid_loops.push_back(loopi); 361 | mid_loops_set.erase(it); 362 | } 363 | } 364 | } 365 | 366 | //Tack on the rest of the indices 367 | for (auto it = mid_loops_set.rbegin(); it != mid_loops_set.rend(); ++it) 368 | { 369 | mid_loops.push_back(*it); 370 | } 371 | 372 | //We want the lowest strides to be in the inner most loops 373 | std::reverse(mid_loops.begin(), mid_loops.end()); 374 | return mid_loops; 375 | } 376 | 377 | 378 | std::vector CudaExecutor::GetMidloopsStrides(DimensionedKernel *kernel, std::vector &mid_loops) 379 | { 380 | //Generate the mid loops 381 | int nummidloops = mid_loops.size(); 382 | std::vector strides(nummidloops); 383 | int stride = 1; 384 | for (int mloopi = nummidloops - 1; mloopi >= 0; --mloopi) 385 | { 386 | int loopi = mid_loops[mloopi]; 387 | strides[mloopi] = stride; 388 | stride *= kernel->GetLoopDim(loopi); 389 | } 390 | 391 | return strides; 392 | } 393 | 394 | 395 | std::string CudaExecutor::GenSharedMemWRBuffer() 396 | { 397 | std::string smwr_str; 398 | if (SMWRBufferSize > 0) 399 | { 400 | smwr_str += " __shared__ double SMWR[" + std::to_string(SMWRBufferSize / 8) + "];\n"; 401 | } 402 | return smwr_str; 403 | } 404 | 405 | 406 | std::string CudaExecutor::GenSharedMemPreload() 407 | { 408 | //If applicable Generate the SM preload code for small tensors 409 | std::string preload_sm_str; 410 | for (int uvari = 0; uvari < MultiKernel->GetNumUVars(); ++uvari) 411 | { 412 | if (SharedMemUvars[uvari]) 413 | { 414 | preload_sm_str += " __shared__ double sT" + std::to_string(uvari); 415 | preload_sm_str += "[" + std::to_string(MultiKernel->GetVarSize(uvari)) + "];\n"; 416 | } 417 | } 418 | for (int uvari = 0; uvari < MultiKernel->GetNumUVars(); ++uvari) 419 | { 420 | if (SharedMemUvars[uvari]) 421 | { 422 | std::string temp = 423 | " for (int idx = threadIdx.x; idx < ; idx += blockDim.x)\n" 424 | " {\n" 425 | " sT[idx] = " + GenTensor(uvari) + "[idx];\n" 426 | " }\n\n"; 427 | str_replace_all(temp, "", uvari); 428 | str_replace_all(temp, "", MultiKernel->GetVarSize(uvari)); 429 | 430 | preload_sm_str += temp; 431 | } 432 | } 433 | return preload_sm_str; 434 | } 435 | 436 | 437 | std::string CudaExecutor::GenInitIndices() 438 | { 439 | const std::vector N = MultiKernel->GetLoopDims(); 440 | int numloops = MultiKernel->GetNumIndices(); 441 | std::vector Wout(NumBlockLoops); //Outer loop strides 442 | if (NumBlockLoops > 0) 443 | { 444 | Wout[NumBlockLoops-1] = 1; 445 | } 446 | for (int d = NumBlockLoops - 2; d >= 0; --d) 447 | { 448 | Wout[d] = Wout[d+1]*N[d+1]; 449 | } 450 | 451 | std::string init_indices_str; 452 | for (int loopd = 0; loopd < NumBlockLoops; ++loopd) 453 | { 454 | //I[loopd] = (outidx / (Wout[loopd]) % N[loopd]; 455 | init_indices_str += " unsigned int I"; 456 | init_indices_str += std::to_string(loopd) + " = "; 457 | if (Wout[loopd] == 1) 458 | { 459 | init_indices_str += "outidx"; 460 | } 461 | else 462 | { 463 | init_indices_str += "(outidx / " + std::to_string(Wout[loopd]) + ")"; 464 | TheCudaKernel->IntOpsPerIndex += 1; 465 | } 466 | if (loopd > 0) 467 | { 468 | init_indices_str += " % " + std::to_string(N[loopd]); 469 | } 470 | init_indices_str += "; // " + MultiKernel->GetLoopIndex(loopd) + "\n"; 471 | 472 | } 473 | return init_indices_str; 474 | } 475 | 476 | 477 | 478 | std::string CudaExecutor::GenSubKernelLoops() 479 | { 480 | std::string kernel_loops_str; 
481 | int numloops = MultiKernel->GetNumIndices(); 482 | std::vector hoisted; 483 | std::vector loop_strides(numloops); 484 | 485 | for (int ki = 0; ki < MultiKernel->GetNumKernels(); ++ki) 486 | { 487 | std::string loop_str; 488 | 489 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 490 | int numinvars = kernel->GetNumInputVars(); 491 | int numcontloops = kernel->GetNumContractionIndices(); 492 | std::vector mid_loops = GetMidloopsOrder(ki); 493 | std::vector mid_loop_strides = GetMidloopsStrides(kernel, mid_loops); 494 | int mid_loops_idx_size = kernel->GetLoopsIdxSize(mid_loops); 495 | int blockdim = TheCudaKernel->ThreadsPerBlock; 496 | int numblocki = mid_loops_idx_size / blockdim; 497 | int blocki_rem = mid_loops_idx_size % blockdim; 498 | if (blocki_rem != 0) 499 | { 500 | numblocki ++; 501 | } 502 | 503 | loop_str += " //" + kernel->KernelStr + "\n"; 504 | loop_str += " {\n"; 505 | 506 | for (int mloopi = 0; mloopi < mid_loops.size(); ++mloopi) 507 | { 508 | int loopi = mid_loops[mloopi]; 509 | loop_str += " ushort2 " + GenLoopIndex(ki, loopi) + ";\n"; 510 | } 511 | loop_str += GenMidLoopIndices(ki, mid_loops, mid_loop_strides, 0); 512 | for (int blocki = 0; blocki < numblocki; ++blocki) 513 | { 514 | std::string temp; 515 | if (blocki == numblocki - 1 && blocki_rem != 0) 516 | { 517 | temp += " if (threadIdx.x < )\n"; 518 | } 519 | temp += " {\n"; 520 | temp += " sum = 0.0;\n"; 521 | temp += ""; 522 | temp += " sum += ;\n"; 523 | temp += ""; 524 | if (blocki < numblocki -1) 525 | { 526 | temp += GenMidLoopIndices(ki, mid_loops, mid_loop_strides, blocki+1); 527 | } 528 | if (SharedMemWRKernels[ki]) 529 | { 530 | if (kernel->EqOperator != "=") 531 | { 532 | temp += " SMWR[] = " + GenTensor(ki,-1) + "[];\n"; 533 | } 534 | temp += " SMWR[] sum;\n"; 535 | } 536 | temp += " " + GenTensor(ki,-1) + "[] sum;\n"; 537 | temp += " }\n"; 538 | str_replace_all(temp, "", blocki); 539 | str_replace_all(temp, "", blocki_rem); 540 | 541 | loop_str += temp; 542 | 543 | //Generate the contraction loops 544 | std::string cont_loops_str; 545 | std::vector hoisted(numinvars, false); 546 | for (int loopi = NumBlockLoops; loopi < numloops; ++loopi) 547 | { 548 | if (kernel->IsContractionLoop(loopi)) 549 | { 550 | std::string temp; 551 | for (int ivari = 0; ivari < numinvars; ++ ivari) 552 | { 553 | int uvari = MultiKernel->GetUVari(ki, ivari); 554 | if (kernel->GetVarLoopDepth(ivari) < loopi && !SharedMemUvars[uvari] && 555 | !(ki > 0 && uvari == MultiKernel->GetUVari(ki-1, -1)) && 556 | !hoisted[ivari]) 557 | { 558 | std::string ivaristr = std::to_string(ivari); 559 | std::string uvaristr = std::to_string(uvari); 560 | std::string varidxstr = GenVarIndex(ki, ivari, blocki); 561 | temp += " double hIN" + ivaristr + " = __ldg(&" + GenTensor(uvari) + "[" + varidxstr + "]);\n"; 562 | hoisted[ivari] = true; 563 | } 564 | } 565 | 566 | temp += " #pragma unroll\n"; 567 | temp += " for (unsigned int = 0; < ; ++) {"; 568 | temp += " // " + MultiKernel->GetLoopIndex(loopi) + "\n"; 569 | str_replace_all(temp, "", GenLoopIndex(ki, loopi)); 570 | str_replace_all(temp, "", kernel->GetLoopDim(loopi)); 571 | cont_loops_str += temp; 572 | } 573 | } 574 | std::string end_cont_loops_str = " " + std::string(numcontloops, '}') + "\n"; 575 | 576 | //Generate the RHS computation inside the contraction loops 577 | std::string rhs_str; 578 | for (int ivari = 0; ivari < numinvars; ++ ivari) 579 | { 580 | int uvari = MultiKernel->GetUVari(ki, ivari); 581 | std::string var_str; 582 | if (SharedMemUvars[uvari]) 583 | { 584 | 
var_str = "sT" + std::to_string(uvari) + "[" + GenVarIndex(ki, ivari, blocki) + "]"; 585 | } 586 | else if (hoisted[ivari]) 587 | { 588 | var_str = "hIN" + std::to_string(ivari); 589 | } 590 | else if (ki > 0 && SharedMemWRKernels[ki-1] && uvari == MultiKernel->GetUVari(ki-1, -1)) 591 | { 592 | var_str = "SMWR[" + GenVarIndex(ki, ivari, blocki, false) + "]"; 593 | } 594 | else 595 | { 596 | var_str = GenTensor(uvari) + "[" + GenVarIndex(ki, ivari, blocki) + "]"; 597 | } 598 | 599 | rhs_str += var_str; 600 | if (ivari < numinvars-1) 601 | { 602 | rhs_str += "*"; 603 | } 604 | } 605 | 606 | str_replace_all(loop_str, "", cont_loops_str); 607 | str_replace_all(loop_str, "", rhs_str); 608 | str_replace_all(loop_str, "", end_cont_loops_str); 609 | str_replace_all(loop_str, "", kernel->EqOperator); 610 | str_replace_all(loop_str, "", GenVarIndex(ki, -1, blocki)); 611 | str_replace_all(loop_str, "", GenVarIndex(ki, -1, blocki, false)); 612 | 613 | } 614 | loop_str += " }\n"; 615 | loop_str += " __syncthreads();\n\n"; 616 | kernel_loops_str += loop_str; 617 | } 618 | return kernel_loops_str; 619 | } 620 | 621 | std::string CudaExecutor::GenMidLoopIndices(int ki, std::vector &mid_loops, std::vector &mid_loop_strides, int blocki) 622 | { 623 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 624 | std::string indices; 625 | for (int mloopi = 0; mloopi < mid_loops.size(); ++mloopi) 626 | { 627 | int loopi = mid_loops[mloopi]; 628 | std::string temp = " " + GenLoopIndex(ki, loopi, blocki); 629 | temp += " = ((threadIdx.x + ) / )"; 630 | if (mloopi > 0) 631 | { 632 | temp += " % ; // "+ MultiKernel->GetLoopIndex(loopi) + "\n"; 633 | } 634 | else 635 | { 636 | temp += "; // " + MultiKernel->GetLoopIndex(loopi) + "\n"; 637 | } 638 | str_replace_all(temp, "", blocki*TheCudaKernel->ThreadsPerBlock); 639 | str_replace_all(temp, "", mid_loop_strides[mloopi]); 640 | str_replace_all(temp, "", kernel->GetLoopDim(loopi)); 641 | indices += temp; 642 | } 643 | return indices; 644 | } 645 | 646 | 647 | std::string CudaExecutor::GenTensor(int ki, int vari) 648 | { 649 | return GenTensor(MultiKernel->GetUVari(ki, vari)); 650 | } 651 | 652 | 653 | std::string CudaExecutor::GenTensor(int uvari) 654 | { 655 | std::string tensor = "T" + std::to_string(uvari); 656 | return tensor; 657 | } 658 | 659 | std::string CudaExecutor::GenVarIndex(int ki, int vari, int blocki, bool blockdims) 660 | { 661 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 662 | std::string index_str; 663 | bool first = true; 664 | for (int di = 0; di < kernel->GetVarRank(vari); ++di) 665 | { 666 | int loopi = kernel->GetVarDimLoopNum(vari, di); 667 | if (blockdims || loopi >= NumBlockLoops) 668 | { 669 | if (!first) 670 | { 671 | index_str += " + "; 672 | } 673 | 674 | std::string loopidx = GenVarSubIndex(ki, vari, di, blocki); 675 | std::string stride = std::to_string(kernel->GetVarDimStride(vari, di)); 676 | //index_str += "__umul24(" + loopidx + "," + stride + ")"; 677 | index_str += loopidx + "*" + stride; 678 | first = false; 679 | } 680 | } 681 | return index_str; 682 | } 683 | 684 | 685 | std::string CudaExecutor::GenVarSubIndex(int ki, int vari, int dimi, int blocki) 686 | { 687 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 688 | return GenLoopIndex(ki, kernel->GetVarDimLoopNum(vari, dimi), blocki); 689 | } 690 | 691 | 692 | std::string CudaExecutor::GenLoopIndex(int ki, int loopi, int blocki) 693 | { 694 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 695 | std::string loopidx = "I" + std::to_string(loopi); 696 | 
if (blocki > -1 && loopi >= NumBlockLoops && !kernel->IsContractionLoop(loopi)) 697 | { 698 | loopidx += (blocki%2 == 0) ? ".x" : ".y"; 699 | } 700 | 701 | return loopidx; 702 | } 703 | 704 | } 705 | 706 | #endif -------------------------------------------------------------------------------- /exec/CudaExecutor.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_CUDA_EXECUTOR_HPP 7 | #define ACROBATIC_CUDA_EXECUTOR_HPP 8 | 9 | #ifdef ACRO_HAVE_CUDA 10 | #include "KernelExecutor.hpp" 11 | #include 12 | #include 13 | #include 14 | 15 | namespace acro 16 | { 17 | 18 | class CudaExecutor : public KernelExecutor 19 | { 20 | public: 21 | CudaExecutor(DimensionedMultiKernel *multi_kernel); 22 | ~CudaExecutor(); 23 | virtual void ExecuteSingle(Tensor *output, std::vector &inputs); 24 | virtual void ExecuteMulti(std::vector &output, std::vector > &inputs); 25 | virtual std::string GetImplementation(); 26 | virtual std::string GetExecType() {return "Cuda";} 27 | 28 | private: 29 | void GenerateCudaKernel(); 30 | void ReorderIndices(std::vector &mk_outer_indices); 31 | int GetNumBlockLoops(); 32 | int GetMinMidIdxSize(int num_block_loops); 33 | int GetMaxMidIdxSize(int num_block_loops); 34 | int GetNumThreadsPerBlock(int num_block_loops); 35 | void GetSharedMemUvars(); 36 | void GetSharedMemWRKernels(); 37 | std::vector GetMidloopsOrder(int ki); 38 | std::vector GetMidloopsStrides(DimensionedKernel *kernel, std::vector &mid_loops); 39 | 40 | std::string GenSharedMemPreload(); 41 | std::string GenSharedMemWRBuffer(); 42 | std::string GenInitIndices(); 43 | std::string GenSubKernelLoops(); 44 | std::string GenTensor(int ki, int vari); 45 | std::string GenTensor(int uvari); 46 | std::string GenMidLoopIndices(int ki, std::vector &mid_loops, std::vector &mid_loop_strides, int blocki = -1); 47 | std::string GenVarIndex(int ki, int vari, int blocki = -1, bool blockdims=true); 48 | std::string GenVarSubIndex(int ki, int vari, int dimi, int blocki = -1); 49 | std::string GenLoopIndex(int ki, int loopi, int blocki = -1); 50 | 51 | cudaDeviceProp CudaDeviceProp; 52 | CudaKernel *TheCudaKernel; 53 | 54 | int NumBlockLoops; 55 | double **HDeviceTensors; 56 | 57 | int SharedMemAllocated; 58 | int SMWRBufferSize; 59 | std::vector SharedMemUvars; 60 | std::vector SharedMemWRKernels; 61 | 62 | std::vector KernelParams; 63 | }; 64 | 65 | } 66 | 67 | #endif 68 | 69 | #endif //ACROBATIC_ONEOUTPERTHREAD_EXECUTOR_HPP -------------------------------------------------------------------------------- /exec/Executor.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
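// Convenience header: pulls in the KernelExecutor base class together with the
// CPUInterpretedExecutor and CudaExecutor implementations so callers only need one include.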
5 | 6 | #include "KernelExecutor.hpp" 7 | #include "CPUInterpretedExecutor.hpp" 8 | #include "CudaExecutor.hpp" -------------------------------------------------------------------------------- /exec/KernelExecutor.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #include "Executor.hpp" 7 | #include "TensorKernel.hpp" 8 | 9 | namespace acro 10 | { 11 | 12 | 13 | KernelExecutor::KernelExecutor(DimensionedMultiKernel *multi_kernel) 14 | { 15 | MultiKernel = multi_kernel; 16 | if (MultiKernel->Kernels.size() > 0) 17 | { 18 | FirstKernel = MultiKernel->Kernels[0]; 19 | } 20 | else 21 | { 22 | FirstKernel = NULL; 23 | } 24 | 25 | #ifdef ACRO_HAVE_CUDA 26 | TheCudaStream = NULL; 27 | #endif 28 | } 29 | 30 | 31 | void KernelExecutor::MoveTensorsFromGPU(Tensor *output, std::vector &inputs) 32 | { 33 | if (output->IsOnGPU()) 34 | { 35 | output->MoveFromGPU(); 36 | } 37 | 38 | for (int i = 0; i < inputs.size(); ++i) 39 | { 40 | if (inputs[i]->IsOnGPU()) 41 | { 42 | inputs[i]->MoveFromGPU(); 43 | } 44 | } 45 | } 46 | 47 | 48 | void KernelExecutor::MoveTensorsToGPU(Tensor *output, std::vector &inputs) 49 | { 50 | if (!output->IsOnGPU()) 51 | { 52 | if (!output->IsMappedToGPU()) 53 | { 54 | output->MapToGPU(); 55 | } 56 | output->MoveToGPU(); 57 | } 58 | 59 | for (int i = 0; i < inputs.size(); ++i) 60 | { 61 | if (!inputs[i]->IsOnGPU()) 62 | { 63 | if (!inputs[i]->IsMappedToGPU()) 64 | { 65 | inputs[i]->MapToGPU(); 66 | } 67 | inputs[i]->MoveToGPU(); 68 | } 69 | } 70 | } 71 | 72 | 73 | void KernelExecutor::MoveTensorsToOutputLocation(Tensor *output, std::vector &inputs) 74 | { 75 | if (output->IsOnGPU()) 76 | { 77 | MoveTensorsToGPU(output, inputs); 78 | } 79 | else 80 | { 81 | MoveTensorsFromGPU(output, inputs); 82 | } 83 | } 84 | 85 | 86 | void KernelExecutor::ExecuteMulti(std::vector &output, std::vector > &inputs) 87 | { 88 | if (SubExecutors.size() != MultiKernel->Kernels.size()) 89 | { 90 | SubKernels.resize(MultiKernel->Kernels.size()); 91 | SubExecutors.resize(MultiKernel->Kernels.size()); 92 | for (int ki = 0; ki < MultiKernel->Kernels.size(); ++ki) 93 | { 94 | SubKernels[ki] = new DimensionedMultiKernel(MultiKernel->Kernels[ki]); 95 | SubExecutors[ki] = KernelExecutor::Create(GetExecType(), SubKernels[ki]); 96 | } 97 | } 98 | 99 | for (int ki = 0; ki < MultiKernel->Kernels.size(); ++ki) 100 | { 101 | SubExecutors[ki]->ExecuteSingle(output[ki], inputs[ki]); 102 | } 103 | } 104 | 105 | 106 | KernelExecutor *KernelExecutor::Create(std::string exec_type, DimensionedMultiKernel *multi_kernel) 107 | { 108 | if (exec_type == "CPUInterpreted") 109 | { 110 | return new CPUInterpretedExecutor(multi_kernel); 111 | } 112 | #ifdef ACRO_HAVE_CUDA 113 | if (exec_type == "Cuda") 114 | { 115 | return new CudaExecutor(multi_kernel); 116 | } 117 | #endif 118 | 119 | ACROBATIC_ASSERT(false, "Executor type does not exist: " + exec_type); 120 | return NULL; 121 | } 122 | 123 | 124 | KernelExecutor::~KernelExecutor() 125 | { 126 | for (int ki = 0; ki < SubExecutors.size(); ++ki) 127 | { 128 | delete SubKernels[ki]; 129 | delete SubExecutors[ki]; 130 | } 131 | } 132 | 133 | } -------------------------------------------------------------------------------- 
/exec/KernelExecutor.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_KERNEL_EXECUTOR_HPP 7 | #define ACROBATIC_KERNEL_EXECUTOR_HPP 8 | 9 | #include 10 | #include "Tensor.hpp" 11 | #include "DimensionedMultiKernel.hpp" 12 | 13 | namespace acro 14 | { 15 | 16 | class KernelExecutor 17 | { 18 | public: 19 | KernelExecutor(DimensionedMultiKernel *multi_kernel); 20 | static KernelExecutor *Create(std::string exec_type, DimensionedMultiKernel *multi_kernel); 21 | virtual ~KernelExecutor(); 22 | 23 | virtual std::string GetImplementation() = 0; 24 | virtual std::string GetExecType() = 0; 25 | virtual void ExecuteSingle(Tensor *output, std::vector &inputs) = 0; 26 | virtual void ExecuteMulti(std::vector &output, std::vector > &inputs); 27 | 28 | #ifdef ACRO_HAVE_CUDA 29 | inline void SetCudaStream(cudaStream_t cuda_stream) {TheCudaStream = cuda_stream;} 30 | #endif 31 | 32 | protected: 33 | 34 | void MoveTensorsFromGPU(Tensor *output, std::vector &inputs); 35 | void MoveTensorsToGPU(Tensor *output, std::vector &inputs); 36 | void MoveTensorsToOutputLocation(Tensor *output, std::vector &inputs); 37 | DimensionedMultiKernel *MultiKernel; 38 | DimensionedKernel *FirstKernel; 39 | std::vector SubKernels; 40 | std::vector SubExecutors; 41 | 42 | #ifdef ACRO_HAVE_CUDA 43 | cudaStream_t TheCudaStream; 44 | #endif 45 | }; 46 | 47 | } 48 | 49 | 50 | #endif //ACROBATIC_KERNEL_EXECUTOR_HPP 51 | -------------------------------------------------------------------------------- /kernel/DimensionedKernel.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
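// DimensionedKernel attaches concrete tensor dimensions to a parsed TensorKernel: the constructor
// copies the kernel description and derives LoopDims/LoopStrides from the attached output and
// input tensors, and the query methods below (flat/outer/contraction index sizes, variable strides,
// and storage requirements) all read from those two arrays.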
5 | #include "DimensionedKernel.hpp" 6 | #include 7 | 8 | namespace acro 9 | { 10 | 11 | 12 | DimensionedKernel::DimensionedKernel(TensorKernel *kernel, Tensor *output, std::vector &inputs) 13 | { 14 | //Copy these from the original kernel 15 | KernelStr = kernel->KernelStr; 16 | OutputVar = kernel->OutputVar; 17 | EqOperator = kernel->EqOperator; 18 | InputVars = kernel->InputVars; 19 | AllIndexNames = kernel->AllIndexNames; 20 | ContractionIndexNames = kernel->ContractionIndexNames; 21 | LoopIndices = kernel->LoopIndices; 22 | 23 | LoopDims = kernel->GetLoopIdxSizes(output, inputs); 24 | LoopStrides.resize(LoopDims.size()); 25 | LoopStrides[LoopDims.size() - 1] = 1; 26 | for (int loopd = LoopDims.size() - 2; loopd >= 0; --loopd) 27 | { 28 | LoopStrides[loopd] = LoopStrides[loopd+1]*LoopDims[loopd+1]; 29 | } 30 | } 31 | 32 | 33 | void DimensionedKernel::SetLoopIndices(std::vector &idx_list) 34 | { 35 | //Update the loop dims before we change all the LoopIndex info 36 | std::vector NewLoopDims(idx_list.size(), 1); 37 | for (int idxi = 0; idxi < NewLoopDims.size(); ++idxi) 38 | { 39 | auto it = std::find(LoopIndices.begin(), LoopIndices.end(), idx_list[idxi]); 40 | if (it != LoopIndices.end()) 41 | { 42 | NewLoopDims[idxi] = LoopDims[std::distance(LoopIndices.begin(), it)]; 43 | } 44 | else 45 | { 46 | NewLoopDims[idxi] = 1; 47 | } 48 | } 49 | LoopDims = NewLoopDims; 50 | 51 | //Update the loop strides 52 | LoopStrides.resize(LoopDims.size()); 53 | LoopStrides[LoopDims.size() - 1] = 1; 54 | for (int loopd = LoopDims.size() - 2; loopd >= 0; --loopd) 55 | { 56 | LoopStrides[loopd] = LoopStrides[loopd+1]*LoopDims[loopd+1]; 57 | } 58 | 59 | //update all the indices and underlying variable objects 60 | TensorKernel::SetLoopIndices(idx_list); 61 | } 62 | 63 | 64 | std::string DimensionedKernel::GetLoopDimsString() 65 | { 66 | std::string name = "__dim"; 67 | for (auto idx : AllIndexNames) 68 | { 69 | name += "_" + std::to_string(GetLoopDim(idx)); 70 | } 71 | 72 | return name; 73 | } 74 | 75 | 76 | 77 | int DimensionedKernel::GetFlatIdxSize() 78 | { 79 | int flatidx_size = 1; 80 | for (int d = 0; d < GetNumIndices(); ++d) 81 | { 82 | flatidx_size *= LoopDims[d]; 83 | } 84 | return flatidx_size; 85 | } 86 | 87 | 88 | int DimensionedKernel::GetOutIdxSize() 89 | { 90 | int outidx_size = 1; 91 | for (int d = 0; d < GetNumIndices() - GetNumContractionIndices(); ++d) 92 | { 93 | outidx_size *= LoopDims[d]; 94 | } 95 | return outidx_size; 96 | } 97 | 98 | 99 | int DimensionedKernel::GetContIdxSize() 100 | { 101 | int contidx_size = 1; 102 | for (int d = GetNumIndices() - GetNumContractionIndices(); d < GetNumIndices(); ++d) 103 | { 104 | contidx_size *= LoopDims[d]; 105 | } 106 | return contidx_size; 107 | } 108 | 109 | 110 | int DimensionedKernel::GetLoopsIdxSize(std::vector loops) 111 | { 112 | int idx_size = 1; 113 | for (auto loopi : loops) 114 | { 115 | idx_size *= LoopDims[loopi]; 116 | } 117 | return idx_size; 118 | } 119 | 120 | 121 | int DimensionedKernel::GetIdxSizeForFirstNumLoops(int num_loops) 122 | { 123 | ACROBATIC_ASSERT(num_loops <= GetNumIndices()); 124 | int idx_size = 1; 125 | for (int d = 0; d < num_loops; ++d) 126 | { 127 | idx_size *= LoopDims[d]; 128 | } 129 | return idx_size; 130 | } 131 | 132 | 133 | int DimensionedKernel::GetVarDimStride(int vari, int dim) 134 | { 135 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 136 | 137 | int trank = GetVarRank(vari); 138 | int stride = 1; 139 | for (int d = trank-2; d >= dim; --d) 140 | { 141 | stride *= 
LoopDims[GetVarDimLoopNum(vari, d+1)]; 142 | } 143 | 144 | return stride; 145 | } 146 | 147 | 148 | int DimensionedKernel::GetVarSize(int vari) 149 | { 150 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 151 | 152 | int rank = GetVarRank(vari); 153 | int size = 1; 154 | for (int d = 0; d < rank; ++d) 155 | { 156 | size *= LoopDims[GetVarDimLoopNum(vari, d)]; 157 | } 158 | return size; 159 | } 160 | 161 | 162 | int DimensionedKernel::GetVarStorageReqForInnerLoops(int vari, int num_loops) 163 | { 164 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 165 | ACROBATIC_ASSERT(num_loops >= 0 && num_loops <= GetNumIndices()); 166 | 167 | int num_var_entries = 1; 168 | for (int loop_num = GetNumIndices() - 1; loop_num >= GetNumIndices() - num_loops; --loop_num) 169 | { 170 | if (IsVarDependentOnLoop(vari, loop_num)) 171 | { 172 | num_var_entries *= LoopDims[loop_num]; 173 | } 174 | } 175 | return num_var_entries; 176 | } 177 | 178 | 179 | int DimensionedKernel::GetInputStorageReqForInnerLoops(int num_loops) 180 | { 181 | ACROBATIC_ASSERT(num_loops >= 0 && num_loops <= GetNumIndices()); 182 | 183 | int num_entries = 0; 184 | for (int vari = 0; vari < GetNumInputVars(); ++vari) { 185 | num_entries += GetVarStorageReqForInnerLoops(vari, num_loops); 186 | } 187 | 188 | return num_entries; 189 | } 190 | 191 | 192 | int DimensionedKernel::GetOutputStorageReqForInnerLoops(int num_loops) 193 | { 194 | ACROBATIC_ASSERT(num_loops >= 0 && num_loops <= GetNumIndices()); 195 | 196 | return GetVarStorageReqForInnerLoops(-1, num_loops); 197 | } 198 | 199 | 200 | int DimensionedKernel::GetTotalStorageReqForInnerLoops(int num_loops) 201 | { 202 | return GetInputStorageReqForInnerLoops(num_loops) + 203 | GetOutputStorageReqForInnerLoops(num_loops); 204 | } 205 | 206 | 207 | int DimensionedKernel::GetIndexSpaceSizeForInnerLoops(int num_loops) 208 | { 209 | int size = 1; 210 | for (int loop = GetNumIndices() - 1; loop >= GetNumIndices() - num_loops; --loop) 211 | { 212 | size *= LoopDims[loop]; 213 | } 214 | return size; 215 | } 216 | 217 | 218 | void DimensionedKernel::GetVarIndexOffsetsForInnerLoops(int vari, int num_inner_loops, 219 | std::vector &var_off, std::vector &loop_off) 220 | { 221 | int num_loops = GetNumIndices(); 222 | int num_outer_loops = num_loops - num_inner_loops; 223 | int loadidx_size = 1; 224 | for (int loopd = num_loops - num_inner_loops; loopd < num_loops; ++loopd) 225 | { 226 | if (IsVarDependentOnLoop(vari, loopd)) 227 | { 228 | loadidx_size *= LoopDims[loopd]; 229 | } 230 | } 231 | 232 | std::vector inner_loop_strides(GetVarRank(vari), 1); 233 | var_off.resize(loadidx_size); 234 | loop_off.resize(loadidx_size); 235 | for (int loadidx = 0; loadidx < loadidx_size; ++loadidx) 236 | { 237 | //Compute the strides for the indices in the inner_loops 238 | int stride = 1; 239 | for (int d = GetVarRank(vari) - 1; d >= 0; --d) 240 | { 241 | int loopd = GetVarDimLoopNum(vari,d); 242 | if (loopd >= num_outer_loops) 243 | { 244 | inner_loop_strides[d] = stride; 245 | stride *= GetVarDimSize(vari,d); 246 | } 247 | } 248 | 249 | //Compute the unflattened var indices 250 | int varidx = 0; 251 | int loopidx = 0; 252 | for (int d = 0; d < GetVarRank(vari); ++d) 253 | { 254 | int loopd = GetVarDimLoopNum(vari,d); 255 | if (loopd >= num_outer_loops) 256 | { 257 | int I = (loadidx / inner_loop_strides[d]) % GetVarDimSize(vari,d); 258 | varidx += I*GetVarDimStride(vari, d); 259 | loopidx += I*LoopStrides[loopd]; 260 | } 261 | } 262 | var_off[loadidx] = varidx; 263 | loop_off[loadidx] = 
loopidx; 264 | } 265 | } 266 | 267 | } -------------------------------------------------------------------------------- /kernel/DimensionedKernel.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_DIMENSIONED_KERNEL_HPP 7 | #define ACROBATIC_DIMENSIONED_KERNEL_HPP 8 | 9 | #include "TensorKernel.hpp" 10 | #include 11 | #include 12 | 13 | namespace acro 14 | { 15 | 16 | 17 | class DimensionedKernel : public TensorKernel 18 | { 19 | public: 20 | DimensionedKernel(TensorKernel *kernel, Tensor *output, std::vector &inputs); 21 | 22 | //The dimensions of all the loops now that we have attached tensors 23 | const std::vector &GetLoopDims() {return LoopDims;} 24 | const std::vector &GetLoopStrides() {return LoopStrides;} 25 | int GetLoopDim(std::string &idx) {return LoopDims[GetLoopNum(idx)];} 26 | int GetLoopStride(std::string &idx) {return LoopStrides[GetLoopNum(idx)];} 27 | int GetLoopDim(int i) {return LoopDims[i];} 28 | int GetLoopStride(int i) {return LoopStrides[i];} 29 | virtual void SetLoopIndices(std::vector &idx_list); 30 | 31 | //Get a string with all of the loop dimensions 32 | std::string GetLoopDimsString(); 33 | std::string GetDimensionedNameString() {return GetNameString() + GetLoopDimsString();} 34 | std::string GetDimensionedNameString(Tensor *output, std::vector &inputs) {return GetDimensionedNameString();} 35 | 36 | //The the number of index combinations for all the loops (the product of the loop dims) 37 | int GetFlatIdxSize(); 38 | 39 | //The the number of index combinations for just the outer non-contraction loops 40 | int GetOutIdxSize(); 41 | 42 | //The the number of index combinations for the inner contraction loops 43 | int GetContIdxSize(); 44 | 45 | //Get the number of indices in the first num_loops 46 | int GetIdxSizeForFirstNumLoops(int num_loops); 47 | 48 | //Get the number of indices in the list of loops 49 | int GetLoopsIdxSize(std::vector loops); 50 | 51 | //The size of the vari tensor's dim 52 | int GetVarDimSize(int vari, int dim) {return LoopDims[GetVarDimLoopNum(vari, dim)];} 53 | 54 | //The stride in flattened index space of a given variable/dimension in the kernel (vari=-1 for output) 55 | int GetVarDimStride(int vari, int dim); 56 | 57 | //The number of index combinations in a given variable in the kernel (vari=-1 for output) 58 | int GetVarSize(int vari); 59 | 60 | //Information for the inner loops 61 | int GetVarStorageReqForInnerLoops(int vari, int num_loops); 62 | int GetInputStorageReqForInnerLoops(int num_loops); 63 | int GetOutputStorageReqForInnerLoops(int num_loops); 64 | int GetTotalStorageReqForInnerLoops(int num_loops); 65 | int GetIndexSpaceSizeForInnerLoops(int num_loops); 66 | void GetVarIndexOffsetsForInnerLoops(int vari, int num_inner_loops, 67 | std::vector &var_off, std::vector &loop_off); 68 | 69 | private: 70 | //The dimensions of the kernel loops computed to match the attached tensors 71 | std::vector LoopDims; 72 | std::vector LoopStrides; 73 | }; 74 | 75 | } 76 | 77 | #endif //ACROBATIC_DIMENSIONED_KERNEL_HPP -------------------------------------------------------------------------------- /kernel/DimensionedMultiKernel.cpp: 
-------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | #include "DimensionedMultiKernel.hpp" 6 | #include 7 | #include 8 | 9 | namespace acro 10 | { 11 | 12 | DimensionedMultiKernel::DimensionedMultiKernel(std::vector &kernels) 13 | { 14 | Kernels = kernels; 15 | InitMKLVars(); 16 | } 17 | 18 | DimensionedMultiKernel::DimensionedMultiKernel(DimensionedKernel* kernel) 19 | { 20 | Kernels.push_back(kernel); 21 | InitMKLVars(); 22 | } 23 | 24 | 25 | void DimensionedMultiKernel::InitMKLVars() 26 | { 27 | int uvari = 0; 28 | std::vector added_vars; 29 | for (int ki = 0; ki < Kernels.size(); ++ki) 30 | { 31 | DimensionedKernel *kernel = Kernels[ki]; 32 | for (int indi = 0; indi < kernel->AllIndexNames.size(); ++indi) 33 | { 34 | auto it = std::find(AllIndexNames.begin(), AllIndexNames.end(), kernel->AllIndexNames[indi]); 35 | if (it == AllIndexNames.end()) 36 | { 37 | AllIndexNames.push_back(kernel->AllIndexNames[indi]); 38 | } 39 | } 40 | 41 | for (int indi = 0; indi < kernel->ContractionIndexNames.size(); ++indi) 42 | { 43 | auto it = std::find(ContractionIndexNames.begin(), ContractionIndexNames.end(), kernel->ContractionIndexNames[indi]); 44 | if (it == ContractionIndexNames.end()) 45 | { 46 | ContractionIndexNames.push_back(kernel->ContractionIndexNames[indi]); 47 | } 48 | } 49 | 50 | for (int vari = -1; vari < kernel->GetNumInputVars(); ++vari) 51 | { 52 | auto it = std::find(added_vars.begin(), added_vars.end(), kernel->GetVarName(vari)); 53 | if (it == added_vars.end()) 54 | { 55 | added_vars.push_back(kernel->GetVarName(vari)); 56 | UVariToFirstKiVari.push_back(std::make_pair(ki, vari)); 57 | KiVariToUVari[std::make_pair(ki, vari)] = uvari; 58 | ++uvari; 59 | } 60 | else 61 | { 62 | KiVariToUVari[std::make_pair(ki, vari)] = std::distance(added_vars.begin(), it); 63 | } 64 | } 65 | } 66 | 67 | //Find all the ouder indices that are shared by all subkernels 68 | std::vector remove_list; 69 | SharedOuterIndexNames = AllIndexNames; 70 | for (int ki = 0; ki < Kernels.size(); ++ki) 71 | { 72 | DimensionedKernel *kernel = Kernels[ki]; 73 | remove_list.resize(0); 74 | for (int idxi = 0; idxi < SharedOuterIndexNames.size(); ++idxi) 75 | { 76 | if (!kernel->IsDependentOnIndex(SharedOuterIndexNames[idxi]) || 77 | kernel->IsContractionIndex(SharedOuterIndexNames[idxi])) 78 | { 79 | remove_list.push_back(SharedOuterIndexNames[idxi]); 80 | } 81 | } 82 | 83 | for (int ri = 0; ri < remove_list.size(); ++ri) 84 | { 85 | SharedOuterIndexNames.erase(std::remove(SharedOuterIndexNames.begin(), 86 | SharedOuterIndexNames.end(), remove_list[ri]), 87 | SharedOuterIndexNames.end()); 88 | } 89 | } 90 | 91 | //Reorder the indices to put shared outer indices first 92 | std::vector reordered_indices = SharedOuterIndexNames; 93 | for (int idxi = 0; idxi < AllIndexNames.size(); ++idxi) 94 | { 95 | std::string idx = AllIndexNames[idxi]; 96 | auto it = std::find(reordered_indices.begin(), reordered_indices.end(), idx); 97 | if (it == reordered_indices.end()) 98 | { 99 | reordered_indices.push_back(idx); 100 | } 101 | } 102 | SetLoopIndices(reordered_indices); 103 | 104 | //Finally Reorder the Shared outer indices by size (largest first) 105 | reordered_indices.clear(); 106 | 
reordered_indices.resize(SharedOuterIndexNames.size()); 107 | std::set set_indices(SharedOuterIndexNames.begin(), SharedOuterIndexNames.end()); 108 | for (int i = 0; i < reordered_indices.size(); ++i) 109 | { 110 | int biggest_loop_size = -1; 111 | std::string biggest_idx; 112 | for (auto idx : set_indices) 113 | { 114 | int loop_size = GetLoopDim(idx); 115 | if (loop_size > biggest_loop_size) 116 | { 117 | biggest_loop_size = loop_size; 118 | biggest_idx = idx; 119 | } 120 | } 121 | set_indices.erase(biggest_idx); 122 | reordered_indices[i] = biggest_idx; 123 | } 124 | SharedOuterIndexNames = reordered_indices; 125 | for (int idxi = 0; idxi < AllIndexNames.size(); ++idxi) 126 | { 127 | std::string idx = AllIndexNames[idxi]; 128 | auto it = std::find(reordered_indices.begin(), reordered_indices.end(), idx); 129 | if (it == reordered_indices.end()) 130 | { 131 | reordered_indices.push_back(idx); 132 | } 133 | } 134 | SetLoopIndices(reordered_indices); 135 | } 136 | 137 | 138 | 139 | int DimensionedMultiKernel::GetNumVars() 140 | { 141 | return GetNumInputVars()+GetNumOutputVars(); 142 | } 143 | 144 | 145 | int DimensionedMultiKernel::GetNumInputVars() 146 | { 147 | int numvars = 0; 148 | for (int ki = 0; ki < Kernels.size(); ++ki) 149 | { 150 | numvars += Kernels[ki]->GetNumInputVars(); 151 | } 152 | return numvars; 153 | } 154 | 155 | 156 | int DimensionedMultiKernel::GetNumOutputVars() 157 | { 158 | return Kernels.size(); 159 | } 160 | 161 | 162 | void DimensionedMultiKernel::SetLoopIndices(std::vector &idx_list) 163 | { 164 | //Set the loop orders of the subkernels and the LoopDims 165 | LoopDims.clear(); 166 | LoopDims.resize(idx_list.size(), 1); 167 | for (int ki = 0; ki < Kernels.size(); ++ki) 168 | { 169 | Kernels[ki]->SetLoopIndices(idx_list); 170 | for (int loopi = 0; loopi < idx_list.size(); ++loopi) 171 | { 172 | LoopDims[loopi] = std::max(LoopDims[loopi], Kernels[ki]->GetLoopDim(loopi)); 173 | } 174 | } 175 | 176 | //Set the loop strides 177 | LoopStrides.clear(); 178 | LoopStrides.resize(idx_list.size()); 179 | LoopStrides[LoopDims.size() - 1] = 1; 180 | for (int loopd = LoopDims.size() - 2; loopd >= 0; --loopd) 181 | { 182 | LoopStrides[loopd] = LoopStrides[loopd+1]*LoopDims[loopd+1]; 183 | } 184 | 185 | LoopIndices = idx_list; 186 | } 187 | 188 | 189 | int DimensionedMultiKernel::GetIndexLoopNum(std::string &idx) 190 | { 191 | auto it = std::find(LoopIndices.begin(), LoopIndices.end(), idx); 192 | if (it == LoopIndices.end()) 193 | { 194 | return -1; 195 | } 196 | return std::distance(LoopIndices.begin(), it); 197 | } 198 | 199 | 200 | int DimensionedMultiKernel::GetVarRank(int ki, int vari) 201 | { 202 | return Kernels[ki]->GetVarRank(vari); 203 | } 204 | 205 | 206 | int DimensionedMultiKernel::GetVarDimLoopNum(int ki, int vari, int dim) 207 | { 208 | return Kernels[ki]->GetVarDimLoopNum(vari, dim); 209 | } 210 | 211 | 212 | int DimensionedMultiKernel::GetLoopNumVarDim(int loop_num, int ki, int vari) 213 | { 214 | return Kernels[ki]->GetVarDimLoopNum(loop_num, vari); 215 | } 216 | 217 | 218 | std::string DimensionedMultiKernel::GetDimensionedNameString() 219 | { 220 | std::string dimensioned_name; 221 | for (auto kernel : Kernels) 222 | { 223 | dimensioned_name += kernel->GetDimensionedNameString() + ";"; 224 | } 225 | return dimensioned_name; 226 | } 227 | 228 | bool DimensionedMultiKernel::IsVarDependentOnLoop(int ki, int vari, int loop_num) 229 | { 230 | return Kernels[ki]->IsVarDependentOnLoop(vari, loop_num); 231 | } 232 | 233 | 234 | bool 
DimensionedMultiKernel::IsContractionLoop(int loop_num) 235 | { 236 | std::string idxstr = LoopIndices[loop_num]; 237 | return std::find(ContractionIndexNames.begin(),ContractionIndexNames.end(), idxstr) 238 | != ContractionIndexNames.end(); 239 | } 240 | 241 | 242 | bool DimensionedMultiKernel::IsSharedOuterLoop(int loop_num) 243 | { 244 | std::string idxstr = LoopIndices[loop_num]; 245 | return std::find(SharedOuterIndexNames.begin(),SharedOuterIndexNames.end(), idxstr) 246 | != SharedOuterIndexNames.end(); 247 | } 248 | 249 | 250 | bool DimensionedMultiKernel::IsOutputUVar(int uvari) 251 | { 252 | for (int ki = 0; ki < Kernels.size(); ++ki) 253 | { 254 | if (KiVariToUVari[std::make_pair(ki,-1)] == uvari) 255 | { 256 | return true; 257 | } 258 | } 259 | return false; 260 | } 261 | 262 | 263 | bool DimensionedMultiKernel::IsInputUVar(int uvari) 264 | { 265 | for (int ki = 0; ki < Kernels.size(); ++ki) 266 | { 267 | for (int vari = 0; vari < Kernels[ki]->GetNumInputVars(); ++vari) 268 | { 269 | if (KiVariToUVari[std::make_pair(ki,vari)] == uvari) 270 | { 271 | return true; 272 | } 273 | } 274 | } 275 | return false; 276 | } 277 | 278 | 279 | int DimensionedMultiKernel::GetFlatIdxSize() 280 | { 281 | int flatidx_size = 1; 282 | for (int d = 0; d < GetNumIndices(); ++d) 283 | { 284 | flatidx_size *= LoopDims[d]; 285 | } 286 | return flatidx_size; 287 | } 288 | 289 | 290 | int DimensionedMultiKernel::GetSharedOuterIdxSize() 291 | { 292 | int outidx_size = 1; 293 | for (int d = 0; d < GetNumIndices(); ++d) 294 | { 295 | if (IsSharedOuterLoop(d)) 296 | { 297 | outidx_size *= LoopDims[d]; 298 | } 299 | } 300 | return outidx_size; 301 | } 302 | 303 | 304 | int DimensionedMultiKernel::GetIdxSizeForFirstNumLoops(int num_loops) 305 | { 306 | int idx_size = 1; 307 | for (int d = 0; d < num_loops; ++d) 308 | { 309 | idx_size *= LoopDims[d]; 310 | } 311 | return idx_size; 312 | } 313 | 314 | 315 | int DimensionedMultiKernel::GetVarDimStride(int ki, int vari, int dim) 316 | { 317 | return Kernels[ki]->GetVarDimStride(vari, dim); 318 | } 319 | 320 | 321 | int DimensionedMultiKernel::GetVarSize(int ki, int vari) 322 | { 323 | return Kernels[ki]->GetVarSize(vari); 324 | } 325 | 326 | 327 | int DimensionedMultiKernel::GetVarSize(int uvari) 328 | { 329 | auto ki_vari = UVariToFirstKiVari[uvari]; 330 | return GetVarSize(ki_vari.first, ki_vari.second); 331 | } 332 | 333 | 334 | int DimensionedMultiKernel::GetVarLoopDepth(int ki, int vari) 335 | { 336 | return Kernels[ki]->GetVarLoopDepth(vari); 337 | } 338 | 339 | 340 | int DimensionedMultiKernel::GetVarStorageReqForInnerLoops(int ki, int vari, int num_loops) 341 | { 342 | return Kernels[ki]->GetVarStorageReqForInnerLoops(vari, num_loops); 343 | } 344 | 345 | 346 | int DimensionedMultiKernel::GetInputStorageReqForInnerLoops(int num_loops) 347 | { 348 | int storage = 0; 349 | for (int ki = 0; ki < Kernels.size(); ++ki) 350 | { 351 | storage += Kernels[ki]->GetInputStorageReqForInnerLoops(num_loops); 352 | } 353 | return storage; 354 | } 355 | 356 | 357 | int DimensionedMultiKernel::GetOutputStorageReqForInnerLoops(int num_loops) 358 | { 359 | int storage = 0; 360 | for (int ki = 0; ki < Kernels.size(); ++ki) 361 | { 362 | storage += Kernels[ki]->GetOutputStorageReqForInnerLoops(num_loops); 363 | } 364 | return storage; 365 | } 366 | 367 | 368 | int DimensionedMultiKernel::GetTotalStorageReqForInnerLoops(int num_loops) 369 | { 370 | return GetInputStorageReqForInnerLoops(num_loops) + GetOutputStorageReqForInnerLoops(num_loops); 371 | } 372 | 373 | 374 | int 
DimensionedMultiKernel::GetIndexSpaceSizeForInnerLoops(int num_loops) 375 | { 376 | int size = 1; 377 | for (int loop = GetNumIndices() - 1; loop >= GetNumIndices() - num_loops; --loop) 378 | { 379 | size *= LoopDims[loop]; 380 | } 381 | return size; 382 | } 383 | 384 | 385 | void DimensionedMultiKernel::GetVarIndexOffsetsForInnerLoops(int ki, int vari, int num_inner_loops, 386 | std::vector &var_off, std::vector &loop_off) 387 | { 388 | Kernels[ki]->GetVarIndexOffsetsForInnerLoops(vari, num_inner_loops, var_off, loop_off); 389 | } 390 | 391 | } -------------------------------------------------------------------------------- /kernel/DimensionedMultiKernel.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_DIMENSIONED_MULTI_KERNEL_HPP 7 | #define ACROBATIC_DIMENSIONED_MULTI_KERNEL_HPP 8 | 9 | #include "DimensionedKernel.hpp" 10 | #include 11 | #include 12 | 13 | namespace acro 14 | { 15 | 16 | 17 | class DimensionedMultiKernel 18 | { 19 | public: 20 | DimensionedMultiKernel(std::vector &kernels); 21 | DimensionedMultiKernel(DimensionedKernel* kernel); 22 | 23 | int GetNumKernels() {return Kernels.size();} 24 | 25 | //The total number of loops required to execute the kernel 26 | int GetNumIndices() {return AllIndexNames.size();} 27 | 28 | //The number of outer loops in the multi kernel 29 | int GetNumOuterIndices() {return SharedOuterIndexNames.size();} 30 | 31 | //The number of inner contraction loops in the kernel 32 | int GetNumContractionIndices() {return ContractionIndexNames.size();} 33 | 34 | //The number of variables referenced in the kernel (including the output tensors) 35 | int GetNumVars(); 36 | 37 | //The number of unique vars with the duplicates removed 38 | int GetNumUVars() {return UVariToFirstKiVari.size();} 39 | 40 | //The number of input variables referenced in the kernel 41 | int GetNumInputVars(); 42 | 43 | //The number of input variables referenced in the kernel 44 | int GetNumOutputVars(); 45 | 46 | std::string GetDimensionedNameString(); 47 | 48 | std::string GetLoopIndex(int loopi) {return LoopIndices[loopi];} 49 | int GetIndexLoopNum(std::string &idx); 50 | 51 | //Change the order of the loops which will affect the following loop_num functions and the values of Var->LoopNums 52 | void SetLoopIndices(std::vector &idx_list); 53 | 54 | //The rank of the given variable (mvari = -1..-n for output vars) 55 | int GetVarRank(int ki, int vari); 56 | 57 | //The loop number for the given mvariable/dimension (mvari = -1..-n for output) 58 | int GetVarDimLoopNum(int ki, int vari, int dim); 59 | 60 | //The input var dim given the loop num and the input mvari (mvari = -1..-n for output) 61 | //returns (-1 if input var is invariant to that loop) 62 | int GetLoopNumVarDim(int loop_num, int ki, int vari); 63 | 64 | //Does the input var have an index matching this loop num (mvari = -1..-n for outputs) 65 | bool IsVarDependentOnLoop(int ki, int vari, int loop_num); 66 | 67 | //Is this loop a contraction loop 68 | bool IsContractionLoop(int loop_num); 69 | 70 | //Is this loop bound to a shared non-contraction index for all the kernels 71 | bool IsSharedOuterLoop(int loop_num); 72 | 73 | //Is the UVar an output or input 
var (or both from different kernels) 74 | bool IsOutputUVar(int uvari); 75 | bool IsInputUVar(int uvari); 76 | 77 | //The dimensions of all the loops now that we have attached tensors 78 | const std::vector &GetLoopDims() {return LoopDims;} 79 | int GetLoopDim(int i) {return LoopDims[i];} 80 | int GetLoopDim(std::string &idx) {return GetLoopDim(GetIndexLoopNum(idx));} 81 | int GetLoopStride(int i) {return LoopStrides[i];} 82 | 83 | //The the number of index combinations for all the loops (the product of the loop dims) 84 | int GetFlatIdxSize(); 85 | 86 | //The the number of index combinations for just the outer non-contraction loops 87 | int GetSharedOuterIdxSize(); 88 | 89 | //Get the number of indices in the first num_loops 90 | int GetIdxSizeForFirstNumLoops(int num_loops); 91 | 92 | //The stride in flattened index space of a given variable/dimension in the kernel 93 | int GetVarDimStride(int ki, int vari, int dim); 94 | 95 | //The number of index combinations in a given variable in the kernel 96 | int GetVarSize(int ki, int vari); 97 | int GetVarSize(int uvari); 98 | 99 | //The highest loop number that the var varies by 100 | int GetVarLoopDepth(int ki, int vari); 101 | 102 | //The unique vars will be listed starting from 0..n for the unique outputs 103 | //followed by n+1..m for the unique inputs. Duplicated will not be counted! 104 | int GetUVari(int ki, int vari) {return KiVariToUVari[std::make_pair(ki,vari)];} 105 | std::pair GetFirstKiVariForUVari(int uvari) {return UVariToFirstKiVari[uvari];} 106 | 107 | //Information for the inner loops 108 | int GetVarStorageReqForInnerLoops(int ki, int vari, int num_loops); 109 | int GetInputStorageReqForInnerLoops(int num_loops); 110 | int GetOutputStorageReqForInnerLoops(int num_loops); 111 | int GetTotalStorageReqForInnerLoops(int num_loops); 112 | int GetIndexSpaceSizeForInnerLoops(int num_loops); 113 | void GetVarIndexOffsetsForInnerLoops(int ki, int vari, int num_inner_loops, 114 | std::vector &var_off, std::vector &loop_off); 115 | 116 | std::vector Kernels; 117 | std::vector AllIndexNames; 118 | std::vector ContractionIndexNames; 119 | std::vector SharedOuterIndexNames; 120 | std::vector LoopIndices; 121 | 122 | private: 123 | void InitMKLVars(); 124 | 125 | //Maps between the multikernel tensor numbering and the underlying kernel tensor numbering 126 | std::map, int> KiVariToUVari; 127 | std::vector> UVariToFirstKiVari; 128 | 129 | //The dimensions of the kernel loops computed to match the attached tensors 130 | std::vector LoopDims; 131 | std::vector LoopStrides; 132 | }; 133 | 134 | } 135 | 136 | #endif //ACROBATIC_DIMENSIONED_MULTI_KERNEL_HPP -------------------------------------------------------------------------------- /kernel/TensorEngine.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
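
Both `DimensionedKernel::SetLoopIndices` and `DimensionedMultiKernel::SetLoopIndices` above build `LoopStrides` with the same row-major recurrence: the last loop gets stride 1 and each stride to its left is the product of the dimensions to its right, so a flat index is the dot product of the loop indices with `LoopStrides`, and `GetFlatIdxSize()` is the product of all `LoopDims` (equivalently `GetOutIdxSize()*GetContIdxSize()` for a single kernel). A standalone sketch of that recurrence (illustrative only, not library code; the function name here is made up):
```
//Illustrative sketch of the row-major stride recurrence used by
//DimensionedKernel::SetLoopIndices and DimensionedMultiKernel::SetLoopIndices:
//the rightmost loop varies fastest, so its stride is 1.
#include <vector>
#include <cassert>

static std::vector<int> ComputeLoopStrides(const std::vector<int> &loop_dims)
{
    std::vector<int> strides(loop_dims.size());
    strides[loop_dims.size() - 1] = 1;
    for (int d = (int)loop_dims.size() - 2; d >= 0; --d)
    {
        strides[d] = strides[d+1]*loop_dims[d+1];
    }
    return strides;
}

int main()
{
    //LoopDims {4, 3, 2} -> LoopStrides {6, 2, 1}, so loop indices (i, j, k)
    //flatten to i*6 + j*2 + k, and the flat index space has 4*3*2 = 24 entries,
    //which is what GetFlatIdxSize() would report for these dims.
    std::vector<int> dims = {4, 3, 2};
    std::vector<int> strides = ComputeLoopStrides(dims);
    assert(strides[0] == 6 && strides[1] == 2 && strides[2] == 1);
    return 0;
}
```
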
5 | 6 | #ifndef ACROBATIC_TENSOR_ENGINE_HPP 7 | #define ACROBATIC_TENSOR_ENGINE_HPP 8 | 9 | #include 10 | #include 11 | #include "DimensionedMultiKernel.hpp" 12 | #include "Executor.hpp" 13 | #include "IndexMapping.hpp" 14 | 15 | #ifdef ACRO_HAVE_CUDA 16 | #include 17 | #endif 18 | 19 | 20 | namespace acro 21 | { 22 | 23 | class TensorKernel; 24 | class NonContractionOps; 25 | 26 | class TensorEngine 27 | { 28 | public: 29 | TensorEngine(); 30 | TensorEngine(const char *bare_exec_type); 31 | TensorEngine(std::string &exec_type); 32 | ~TensorEngine(); 33 | void SetExecutorType(const char *bare_exec_type); 34 | void SetExecutorType(std::string &exec_type); 35 | std::string GetExecType() {return ExecutorType;} 36 | 37 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1); 38 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2); 39 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3); 40 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4); 41 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5); 42 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6); 43 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7); 44 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7, Tensor &in8); 45 | void operator()(const char *bare_kernel_str, Tensor *out, std::vector &inputs); 46 | 47 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1); 48 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2); 49 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3); 50 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4); 51 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5); 52 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6); 53 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7); 54 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7, Tensor &in8); 55 | void operator()(std::string &kernel_str, Tensor *out, std::vector &inputs); 56 | 57 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1); 58 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2); 59 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3); 60 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4); 61 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5); 62 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, 
Tensor &in6); 63 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7); 64 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7, Tensor &in8); 65 | std::string GetImplementation(const char *bare_kernel_str, Tensor *out, std::vector &inputs); 66 | 67 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1); 68 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2); 69 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3); 70 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4); 71 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5); 72 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6); 73 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7); 74 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7, Tensor &in8); 75 | std::string GetImplementation(std::string &kernel_str, Tensor *out, std::vector &inputs); 76 | 77 | void BatchMatrixInverse(Tensor &Ainv, Tensor &A); 78 | void BatchMatrixDet(Tensor &Adet, Tensor &A); 79 | void BatchMatrixInvDet(Tensor &Ainv, Tensor &Adet, Tensor &A); 80 | void FlatIndexedScatter(Tensor &Aout, Tensor &Ait, IndexMapping &M); 81 | void FlatIndexedSumGather(Tensor &Aout, Tensor &Ait, IndexMapping &M); 82 | 83 | void Clear(); 84 | bool IsGPUAvailable() {return isCudaReady();} 85 | void BeginMultiKernelLaunch(); 86 | void EndMultiKernelLaunch(); 87 | 88 | private: 89 | TensorKernel *GetAddTensorKernel(std::string &kernel_str); 90 | DimensionedKernel *GetAddDimensionedKernel(TensorKernel *kernel, Tensor *output, std::vector &inputs); 91 | KernelExecutor *GetAddKernelExecutor(); 92 | void MoveToComputeLocation(Tensor &T); 93 | void SwitchToComputeLocation(Tensor &T); 94 | void MoveToComputeLocation(IndexMapping &M); 95 | void SwitchToComputeLocation(IndexMapping &M); 96 | 97 | std::string ExecutorType; 98 | std::unordered_map KernelMap; 99 | std::unordered_map DimensionedKernelMap; 100 | std::unordered_map ExecutorMap; 101 | std::string ComputeLocation; 102 | NonContractionOps *Ops; 103 | 104 | bool IsMultiKernelLaunch; 105 | std::vector MKLKernels; 106 | std::vector MKLOutputT; 107 | std::vector > MKLInputT; 108 | 109 | #ifdef ACRO_HAVE_CUDA 110 | cudaStream_t TheCudaStream; 111 | #endif 112 | }; 113 | 114 | 115 | } 116 | 117 | #endif //ACROBATIC_TENSOR_ENGINE_HPP -------------------------------------------------------------------------------- /kernel/TensorKernel.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
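
TensorKernel.cpp below implements a small recursive-descent parser for kernel strings: one output variable, an assignment operator (`=`, `+=`, or `-=`), and one or more input variables, where each variable is an uppercase-led name followed by one or more `_index` suffixes made of lowercase letters and digits. Whitespace is stripped before parsing, and any index that appears only on the input side is recorded as a contraction index. The sketch below queries the parsed structure for an illustrative matrix-multiply-style string; the string and the expected values are inferred from the parser that follows, not taken from the library's documentation, and the program assumes a link against the built acrotensor library:
```
//Illustrative sketch: querying the structure that TensorKernel::ParseKernel (below)
//extracts from a kernel string. "C_i_k += A_i_j B_j_k" is an assumed example.
#include <iostream>
#include "TensorKernel.hpp"

int main()
{
    acro::TensorKernel kernel("C_i_k += A_i_j B_j_k");
    std::cout << kernel.GetNumIndices()            << std::endl;  //3 loops: i, k, j
    std::cout << kernel.GetNumContractionIndices() << std::endl;  //1: j appears only on the inputs
    std::cout << kernel.GetNumInputVars()          << std::endl;  //2: A and B
    std::cout << kernel.GetNameString()            << std::endl;  //"C_i_kpeA_i_jB_j_k"
    return 0;
}
```
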
5 | 6 | #include "TensorKernel.hpp" 7 | #include "SliceTensor.hpp" 8 | #include 9 | #include 10 | 11 | namespace acro 12 | { 13 | 14 | TensorKernel::TensorKernel(const char *kernel) 15 | { 16 | KernelStr = kernel; 17 | ParseKernel(); 18 | } 19 | 20 | TensorKernel::TensorKernel(std::string &kernel) 21 | { 22 | KernelStr = kernel; 23 | ParseKernel(); 24 | } 25 | 26 | // Recursive decent parse of tensor kernel (+ means 1 or more of in succession): 27 | // := + 28 | void TensorKernel::ParseKernel() 29 | { 30 | std::string ParseStr = KernelStr; 31 | ParseStr.erase(remove_if(ParseStr.begin(), ParseStr.end(), isspace),ParseStr.end()); 32 | std::string::iterator it = ParseStr.begin(); 33 | ParseKernelVar(it, OutputVar); 34 | 35 | ParseEqOperator(it, EqOperator); 36 | 37 | InputVars.push_back(KernelVar()); 38 | ParseKernelVar(it, InputVars.back()); 39 | 40 | while(it != ParseStr.end()) { 41 | InputVars.push_back(KernelVar()); 42 | ParseKernelVar(it, InputVars.back()); 43 | } 44 | 45 | //Gather up the IndexNames and LoopNums associated with the OutputTensor 46 | for (int d = 0; d < OutputVar.IndexNames.size(); ++d) 47 | { 48 | AllIndexNames.push_back(OutputVar.IndexNames[d]); 49 | } 50 | 51 | //Now gather up the IndexNames and LoopNums associated with the contraction indices 52 | for (int vari = 0; vari < InputVars.size(); ++vari) { 53 | for (int indi = 0; indi < InputVars[vari].IndexNames.size(); ++indi) { 54 | auto acit = std::find(AllIndexNames.begin(), 55 | AllIndexNames.end(), 56 | InputVars[vari].IndexNames[indi]); 57 | if (acit == AllIndexNames.end()) { 58 | //The IndexName is not on the list yet so add it to everything 59 | ContractionIndexNames.push_back(InputVars[vari].IndexNames[indi]); 60 | AllIndexNames.push_back(InputVars[vari].IndexNames[indi]); 61 | } 62 | } 63 | } 64 | 65 | LoopIndices = AllIndexNames; 66 | SetVarLoopNums(); 67 | } 68 | 69 | // := + 70 | // where is an uppercase letter 71 | void TensorKernel::ParseKernelVar(std::string::iterator &it, KernelVar &var) 72 | { 73 | ParseVarName(it, var); 74 | ParseIndexVar(it, var); 75 | while (*it == '_') { 76 | ParseIndexVar(it, var); 77 | } 78 | var.LoopNums.resize(var.IndexNames.size()); 79 | } 80 | 81 | 82 | // := * 83 | // where is an uppercase letter and is any letter or digit 84 | void TensorKernel::ParseVarName(std::string::iterator &it, KernelVar &var) 85 | { 86 | // 87 | ACROBATIC_ASSERT(isupper(*it)); 88 | var.Name += *it; 89 | it ++; 90 | 91 | //* 92 | while (isupper(*it) || islower(*it) || isdigit(*it)) { 93 | var.Name += *it; 94 | it ++; 95 | } 96 | } 97 | 98 | 99 | // := _+ 100 | // is a lowercase letter or a digit 101 | void TensorKernel::ParseIndexVar(std::string::iterator &it, KernelVar &var) 102 | { 103 | //_ 104 | ACROBATIC_ASSERT(*it == '_'); 105 | it ++; 106 | 107 | //+ 108 | ACROBATIC_ASSERT(islower(*it) || isdigit(*it)); 109 | var.IndexNames.push_back(""); 110 | var.IndexNames[var.IndexNames.size() - 1] += *it; 111 | it ++; 112 | while(islower(*it) || isdigit(*it)) 113 | { 114 | var.IndexNames[var.IndexNames.size() - 1] += *it; 115 | it ++; 116 | } 117 | } 118 | 119 | 120 | // := ("=" | "+=" | "-=") 121 | void TensorKernel::ParseEqOperator(std::string::iterator &it, std::string &eqoper) 122 | { 123 | if (*it == '=') { 124 | it ++; 125 | eqoper = "="; 126 | } else if (*it == '+') { 127 | it ++; 128 | ACROBATIC_ASSERT(*it == '='); 129 | it ++; 130 | eqoper = "+="; 131 | } else if (*it == '-') { 132 | it ++; 133 | ACROBATIC_ASSERT(*it == '='); 134 | it ++; 135 | eqoper = "-="; 136 | } else { 137 | 
ACROBATIC_ASSERT(false); 138 | } 139 | } 140 | 141 | 142 | void TensorKernel::SetVarLoopNums() 143 | { 144 | OutputVar.LoopNums.resize(OutputVar.IndexNames.size()); 145 | for (int idxi = 0; idxi < OutputVar.IndexNames.size(); ++idxi) 146 | { 147 | auto loopit = std::find(LoopIndices.begin(), LoopIndices.end(), OutputVar.IndexNames[idxi]); 148 | OutputVar.LoopNums[idxi] = std::distance(LoopIndices.begin(), loopit); 149 | } 150 | 151 | for (int ivari = 0; ivari < InputVars.size(); ++ivari) 152 | { 153 | InputVars[ivari].LoopNums.resize(InputVars[ivari].IndexNames.size()); 154 | for (int idxi = 0; idxi < InputVars[ivari].IndexNames.size(); ++idxi) 155 | { 156 | auto loopit = std::find(LoopIndices.begin(), LoopIndices.end(), InputVars[ivari].IndexNames[idxi]); 157 | InputVars[ivari].LoopNums[idxi] = std::distance(LoopIndices.begin(), loopit); 158 | } 159 | } 160 | } 161 | 162 | 163 | int TensorKernel::GetVarRank(int vari) 164 | { 165 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 166 | 167 | if (vari == -1) 168 | { 169 | return OutputVar.IndexNames.size(); 170 | } 171 | 172 | return InputVars[vari].IndexNames.size(); 173 | } 174 | 175 | 176 | int TensorKernel::GetLoopDepth() 177 | { 178 | int depth = -1; //Invariant to all loops 179 | for (int vari = -1; vari < GetNumInputVars(); ++vari) 180 | { 181 | depth = std::max(depth, GetVarLoopDepth(vari)); 182 | } 183 | return depth; 184 | } 185 | 186 | 187 | int TensorKernel::GetVarLoopDepth(int vari) 188 | { 189 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 190 | 191 | int depth = -1; //Invariant to all loops 192 | for (int loopd = 0; loopd < LoopIndices.size(); ++ loopd) 193 | { 194 | if (IsVarDependentOnLoop(vari, loopd)) 195 | { 196 | depth = loopd; 197 | } 198 | } 199 | return depth; 200 | } 201 | 202 | 203 | void TensorKernel::SetLoopIndices(std::vector &idx_list) 204 | { 205 | LoopIndices = idx_list; 206 | 207 | //Update the LoopNums with the new permuted order 208 | SetVarLoopNums(); 209 | } 210 | 211 | 212 | int TensorKernel::GetLoopNum(std::string &idx) 213 | { 214 | auto it = std::find(LoopIndices.begin(), LoopIndices.end(), idx); 215 | ACROBATIC_ASSERT(it != LoopIndices.end(), "Loop index (" + idx + ") not found in kernel:\n" 216 | + KernelStr + "\n"); 217 | return std::distance(LoopIndices.begin(), it); 218 | } 219 | 220 | int TensorKernel::GetVarDimLoopNum(int vari, int dim) 221 | { 222 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 223 | ACROBATIC_ASSERT(dim >= 0 && dim < GetVarRank(vari)); 224 | 225 | if (vari == -1) 226 | { 227 | return OutputVar.LoopNums[dim]; 228 | } 229 | 230 | return InputVars[vari].LoopNums[dim]; 231 | } 232 | 233 | 234 | int TensorKernel::GetLoopNumVarDim(int loop_num, int vari) 235 | { 236 | ACROBATIC_ASSERT(loop_num >= 0 && loop_num < LoopIndices.size()); 237 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 238 | 239 | std::string loop_index_name = GetLoopIndex(loop_num); 240 | 241 | for (int d = 0; d < GetVarRank(vari); ++d) 242 | { 243 | if (vari == -1) 244 | { 245 | if (OutputVar.IndexNames[d] == loop_index_name) 246 | { 247 | return d; 248 | } 249 | } 250 | else 251 | { 252 | if (InputVars[vari].IndexNames[d] == loop_index_name) 253 | { 254 | return d; 255 | } 256 | } 257 | } 258 | return -1; 259 | } 260 | 261 | 262 | bool TensorKernel::IsVarDependentOnLoop(int vari, int loop_num) 263 | { 264 | return GetLoopNumVarDim(loop_num, vari) > -1; 265 | } 266 | 267 | 268 | bool TensorKernel::IsDependentOnIndex(std::string &idx) 269 | { 270 | return 
std::find(AllIndexNames.begin(), AllIndexNames.end(), idx) != AllIndexNames.end(); 271 | } 272 | 273 | 274 | bool TensorKernel::IsDependentOnLoop(int loop_num) 275 | { 276 | std::string idxstr = LoopIndices[loop_num]; 277 | return IsDependentOnIndex(LoopIndices[loop_num]); 278 | } 279 | 280 | 281 | bool TensorKernel::IsContractionIndex(std::string &idx) 282 | { 283 | return std::find(ContractionIndexNames.begin(), ContractionIndexNames.end(), idx) != ContractionIndexNames.end(); 284 | } 285 | 286 | 287 | bool TensorKernel::IsContractionLoop(int loop_num) 288 | { 289 | return IsContractionIndex(LoopIndices[loop_num]); 290 | } 291 | 292 | 293 | bool TensorKernel::IsContractionVar(int vari) 294 | { 295 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 296 | 297 | if (vari == -1) 298 | { 299 | return false; 300 | } 301 | 302 | for (auto idx : InputVars[vari].IndexNames) 303 | { 304 | if (IsContractionIndex(idx)) 305 | { 306 | return true; 307 | } 308 | } 309 | 310 | return false; 311 | } 312 | 313 | 314 | std::string &TensorKernel::GetVarName(int vari) 315 | { 316 | if (vari == -1) 317 | { 318 | return OutputVar.Name; 319 | } 320 | else 321 | { 322 | return InputVars[vari].Name; 323 | } 324 | } 325 | 326 | 327 | std::string TensorKernel::GetNameString() 328 | { 329 | std::string name = OutputVar.Name; 330 | for (int d = 0; d < OutputVar.IndexNames.size(); ++d) 331 | { 332 | name += "_" + OutputVar.IndexNames[d]; 333 | } 334 | 335 | if (EqOperator == "=") 336 | { 337 | name += "eq"; 338 | } 339 | else if (EqOperator == "+=") 340 | { 341 | name += "pe"; 342 | } 343 | else if (EqOperator == "-=") 344 | { 345 | name += "me"; 346 | } 347 | 348 | for (int ivari = 0; ivari < InputVars.size(); ++ivari) 349 | { 350 | name += InputVars[ivari].Name; 351 | for (int d = 0; d < InputVars[ivari].IndexNames.size(); ++d) 352 | { 353 | name += "_" + InputVars[ivari].IndexNames[d]; 354 | } 355 | } 356 | return name; 357 | } 358 | 359 | 360 | std::string TensorKernel::GetDimensionedNameString(Tensor *output, std::vector &inputs) 361 | { 362 | std::string name = GetNameString(); 363 | std::vector idx_sizes = GetLoopIdxSizes(output, inputs); 364 | 365 | name += "_"; 366 | for (int idxi = 0; idxi < idx_sizes.size(); ++idxi) 367 | { 368 | name += "_" + std::to_string(idx_sizes[idxi]); 369 | } 370 | 371 | return name; 372 | } 373 | 374 | 375 | std::vector TensorKernel::GetLoopIdxSizes(Tensor *output, std::vector &inputs) 376 | { 377 | std::vector idx_sizes(LoopIndices.size(), 1); //Set loop indices not in this kernel to dim=1 378 | for (int idxi = 0; idxi < output->GetRank(); ++idxi) 379 | { 380 | ACROBATIC_ASSERT(GetVarDimLoopNum(-1, idxi) >= 0 && GetVarDimLoopNum(-1, idxi) < idx_sizes.size()); 381 | idx_sizes[GetVarDimLoopNum(-1, idxi)] = output->GetDim(idxi); 382 | } 383 | 384 | for (int vari = 0; vari < InputVars.size(); ++vari) 385 | { 386 | for (int idxi = 0; idxi < inputs[vari]->GetRank(); ++idxi) 387 | { 388 | ACROBATIC_ASSERT(GetVarDimLoopNum(vari, idxi) >= 0 && GetVarDimLoopNum(vari, idxi) < idx_sizes.size()); 389 | idx_sizes[GetVarDimLoopNum(vari, idxi)] = inputs[vari]->GetDim(idxi); 390 | } 391 | } 392 | 393 | //Check to make sure that the dimensions of the tensors are compatible with the kernel 394 | for (int vari = 0; vari < InputVars.size(); ++vari) 395 | { 396 | for (int idxi = 0; idxi < InputVars[vari].LoopNums.size(); ++idxi) 397 | { 398 | ACROBATIC_ASSERT(idx_sizes[InputVars[vari].LoopNums[idxi]] == inputs[vari]->GetDim(idxi), 399 | "Incompatible tensor dimensions for kernel: " + 
KernelStr); 400 | } 401 | } 402 | return idx_sizes; 403 | } 404 | 405 | } -------------------------------------------------------------------------------- /kernel/TensorKernel.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_TENSOR_KERNEL_HPP 7 | #define ACROBATIC_TENSOR_KERNEL_HPP 8 | 9 | #include "Tensor.hpp" 10 | #include 11 | 12 | namespace acro 13 | { 14 | 15 | class KernelVar 16 | { 17 | public: 18 | KernelVar() {} 19 | std::string Name; 20 | std::vector IndexNames; 21 | std::vector LoopNums; 22 | }; 23 | 24 | 25 | class TensorKernel 26 | { 27 | protected: 28 | TensorKernel() {} 29 | public: 30 | TensorKernel(const char* kernel); 31 | TensorKernel(std::string &kernel); 32 | 33 | //The total number of loops required to execute the kernel 34 | int GetNumIndices() {return AllIndexNames.size();} 35 | 36 | //The number of outer loops in the kernel 37 | int GetNumOuterIndices() {return AllIndexNames.size() - ContractionIndexNames.size();} 38 | 39 | //The number of inner contraction loops in the kernel 40 | int GetNumContractionIndices() {return ContractionIndexNames.size();} 41 | 42 | //The number of variables referenced in the kernel (including the output tensor) 43 | int GetNumVars() {return InputVars.size()+1;} 44 | 45 | //The number of input variables referenced in the kernel 46 | int GetNumInputVars() {return InputVars.size();} 47 | 48 | //The rank of the given variable (vari = -1 for output) 49 | int GetVarRank(int vari); 50 | 51 | //Change the order of the loops which will affect the following loop_num functions and the values of Var->LoopNums 52 | std::string GetLoopIndex(int loopi) {return LoopIndices[loopi];} 53 | int GetLoopNum(std::string &idx); 54 | virtual void SetLoopIndices(std::vector &idx_list); 55 | 56 | //The loop number for the given variable/dimension (vari = -1 for output) 57 | int GetVarDimLoopNum(int vari, int dim); 58 | 59 | //The input var dim given the loop num and the input vari (vari = -1 for output) 60 | //returns (-1 if input var is invariant to that loop) 61 | int GetLoopNumVarDim(int loop_num, int vari); 62 | 63 | //Does the input var have an index matching this loop num (vari = -1 for output) 64 | bool IsVarDependentOnLoop(int vari, int loop_num); 65 | 66 | //Does the Kernel have dependence on this index 67 | bool IsDependentOnIndex(std::string &idx); 68 | 69 | //Does the Kernel have dependence on this index 70 | bool IsDependentOnLoop(int loop_num); 71 | 72 | //Does this a contraction index 73 | bool IsContractionIndex(std::string &idx); 74 | 75 | //Is this loop a contraction loop 76 | bool IsContractionLoop(int loop_num); 77 | 78 | bool IsContractionVar(int vari); 79 | 80 | //Get the highest loop number that the entire kernel depends on 81 | int GetLoopDepth(); 82 | 83 | //The highest loop number that the var varies by (vari=-1 for output) 84 | int GetVarLoopDepth(int vari); 85 | 86 | //The the name of the variable (vari=-1 for output) 87 | std::string &GetVarName(int vari); 88 | 89 | //This returns the post parsed name string 90 | std::string GetNameString(); 91 | 92 | //This returns a modified kernel string with the dimensions compatible with the tensors 93 | virtual std::string 
GetDimensionedNameString() {ACROBATIC_ASSERT(false); return "";} 94 | virtual std::string GetDimensionedNameString(Tensor *output, std::vector &inputs); 95 | 96 | std::vector GetLoopIdxSizes(Tensor *output, std::vector &inputs); 97 | 98 | std::string KernelStr; //The user provided kernel string 99 | KernelVar OutputVar; //The output var extracted from the kernel string 100 | std::string EqOperator; //The assignement operator extracted from the kernel string (=, +=) 101 | std::vector InputVars; //The input vars extracted from the kernel string 102 | std::vector AllIndexNames; //The names of all the indices extracted from the kernel string 103 | std::vector ContractionIndexNames; //The names of the contraction indices extracted from the kernel string 104 | std::vector LoopIndices; 105 | 106 | private: 107 | void ParseKernel(); 108 | void ParseKernelVar(std::string::iterator &it, KernelVar &var); 109 | void ParseVarName(std::string::iterator &it, KernelVar &var); 110 | void ParseIndexVar(std::string::iterator &it, KernelVar &var); 111 | void ParseEqOperator(std::string::iterator &it, std::string &op); 112 | void SetVarLoopNums(); 113 | }; 114 | 115 | } 116 | 117 | #endif //ACROBATIC_TENSOR_KERNEL_HPP -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | #Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | #Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | #All rights reserved. 4 | #This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #If there is no config.mk copy over the default version 7 | JUNK := $(shell if [ ! 
-f "config/config.mk" ];then cp config/defaults.mk config/config.mk; fi) 8 | include config/config.mk 9 | 10 | DIRS = exec tensor util ops kernel 11 | SOURCE_FILES = $(foreach dir,$(DIRS),$(wildcard $(dir)/*.cpp)) 12 | INCLUDE_FILES = $(foreach dir,$(DIRS),$(wildcard $(dir)/*.hpp)) AcroTensor.hpp 13 | OBJECT_FILES = $(SOURCE_FILES:.cpp=.o) 14 | INCLUDES = $(foreach dir,$(DIRS),-I../$(dir)) 15 | 16 | ifeq ($(DEBUG),YES) 17 | CXX_FLAGS = $(INCLUDES) $(CXX_DEBUG) 18 | else 19 | CXX_FLAGS = $(INCLUDES) $(CXX_OPT) 20 | endif 21 | 22 | 23 | .SUFFIXES: .cpp .o 24 | .cpp.o: 25 | cd $(>>(Ainv_ptr, A_ptr, num_batch); 24 | } 25 | else if (mdim == 2) 26 | { 27 | CudaInv2x2<<>>(Ainv_ptr, A_ptr, num_batch); 28 | } 29 | else if (mdim == 3) 30 | { 31 | CudaInv3x3<<>>(Ainv_ptr, A_ptr, num_batch); 32 | } 33 | } 34 | 35 | 36 | void CudaGPUOps::BatchMatrixDet(Tensor &Adet, Tensor &A) 37 | { 38 | int rank = A.GetRank(); 39 | int mdim = A.GetDim(rank-1); 40 | int stride = mdim*mdim; 41 | int num_batch = A.GetSize() / stride; 42 | double *A_ptr = A.GetDeviceData(); 43 | double *Adet_ptr = Adet.GetDeviceData(); 44 | if (mdim == 1) 45 | { 46 | CudaDet1x1<<>>(Adet_ptr, A_ptr, num_batch); 47 | } 48 | else if (mdim == 2) 49 | { 50 | CudaDet2x2<<>>(Adet_ptr, A_ptr, num_batch); 51 | } 52 | else if (mdim == 3) 53 | { 54 | CudaDet3x3<<>>(Adet_ptr, A_ptr, num_batch); 55 | } 56 | } 57 | 58 | 59 | void CudaGPUOps::BatchMatrixInvDet(Tensor &Ainv, Tensor &Adet, Tensor &A) 60 | { 61 | int rank = A.GetRank(); 62 | int mdim = A.GetDim(rank-1); 63 | int stride = mdim*mdim; 64 | int num_batch = A.GetSize() / stride; 65 | double *A_ptr = A.GetDeviceData(); 66 | double *Ainv_ptr = Ainv.GetDeviceData(); 67 | double *Adet_ptr = Adet.GetDeviceData(); 68 | if (mdim == 1) 69 | { 70 | CudaInvDet1x1<<>>(Ainv_ptr, Adet_ptr, A_ptr, num_batch); 71 | } 72 | else if (mdim == 2) 73 | { 74 | CudaInvDet2x2<<>>(Ainv_ptr, Adet_ptr, A_ptr, num_batch); 75 | } 76 | else if (mdim == 3) 77 | { 78 | CudaInvDet3x3<<>>(Ainv_ptr, Adet_ptr, A_ptr, num_batch); 79 | } 80 | } 81 | 82 | 83 | void CudaGPUOps::FlatIndexedScatter(Tensor &Aout, Tensor &Ain, IndexMapping &M) 84 | { 85 | double *Aout_ptr = Aout.GetDeviceData(); 86 | double *Ain_ptr = Ain.GetDeviceData(); 87 | int *M_ptr = M.GetMap().GetDeviceData(); 88 | int *InvM_ptr = M.GetInvMap().GetDeviceData(); 89 | int *InvMOff_ptr = M.GetInvMapOffsets().GetDeviceData(); 90 | int N = M.GetRangeSize(); 91 | CudaScatter<<>>(Aout_ptr, Ain_ptr, M_ptr, InvM_ptr, InvMOff_ptr, N); 92 | } 93 | 94 | 95 | void CudaGPUOps::FlatIndexedSumGather(Tensor &Aout, Tensor &Ain, IndexMapping &M) 96 | { 97 | double *Aout_ptr = Aout.GetDeviceData(); 98 | double *Ain_ptr = Ain.GetDeviceData(); 99 | int *M_ptr = M.GetMap().GetDeviceData(); 100 | int *InvM_ptr = M.GetInvMap().GetDeviceData(); 101 | int *InvMOff_ptr = M.GetInvMapOffsets().GetDeviceData(); 102 | int N = M.GetDomainSize(); 103 | 104 | CudaSumGather<<>>(Aout_ptr, Ain_ptr, M_ptr, InvM_ptr, InvMOff_ptr, N); 105 | } 106 | 107 | 108 | __global__ void CudaInv1x1(double *Ainv, double *A, int N) 109 | { 110 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 111 | if (idx < N) 112 | { 113 | Ainv[idx] = 1.0 / A[idx]; 114 | } 115 | } 116 | 117 | 118 | __global__ void CudaInv2x2(double *Ainv, double *A, int N) 119 | { 120 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 121 | if (idx < N) 122 | { 123 | int b = idx*4; 124 | double A0 = A[b]; 125 | double A1 = A[b+1]; 126 | double A2 = A[b+2]; 127 | double A3 = A[b+3]; 128 | double invdet = 1.0 / (A0*A3 - A1*A2); 129 | Ainv[b+0] = 
invdet*A3; 130 | Ainv[b+1] = -invdet*A1; 131 | Ainv[b+2] = -invdet*A2; 132 | Ainv[b+3] = invdet*A0; 133 | } 134 | } 135 | 136 | 137 | __global__ void CudaInv3x3(double *Ainv, double *A, int N) 138 | { 139 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 140 | if (idx < N) 141 | { 142 | int b = idx*9; 143 | double A0 = A[b]; 144 | double A1 = A[b+1]; 145 | double A2 = A[b+2]; 146 | double A3 = A[b+3]; 147 | double A4 = A[b+4]; 148 | double A5 = A[b+5]; 149 | double A6 = A[b+6]; 150 | double A7 = A[b+7]; 151 | double A8 = A[b+8]; 152 | double invdet = 1.0 / (A0*A4*A8 + A1*A5*A6 + A2*A3*A7 153 | - A6*A4*A2 - A7*A5*A0 - A8*A3*A1); 154 | Ainv[b+0] = invdet*(A4*A8 - A5*A7); 155 | Ainv[b+1] = invdet*(A5*A6 - A3*A8); 156 | Ainv[b+2] = invdet*(A3*A7 - A4*A6); 157 | Ainv[b+3] = invdet*(A2*A7 - A1*A8); 158 | Ainv[b+4] = invdet*(A0*A8 - A2*A6); 159 | Ainv[b+5] = invdet*(A1*A6 - A0*A7); 160 | Ainv[b+6] = invdet*(A1*A5 - A2*A4); 161 | Ainv[b+7] = invdet*(A2*A3 - A0*A5); 162 | Ainv[b+8] = invdet*(A0*A4 - A1*A3); 163 | } 164 | } 165 | 166 | 167 | __global__ void CudaDet1x1(double *Adet, double *A, int N) 168 | { 169 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 170 | if (idx < N) 171 | { 172 | Adet[idx] = A[idx]; 173 | } 174 | } 175 | 176 | 177 | __global__ void CudaDet2x2(double *Adet, double *A, int N) 178 | { 179 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 180 | if (idx < N) 181 | { 182 | int b = idx*4; 183 | double A0 = A[b]; 184 | double A1 = A[b+1]; 185 | double A2 = A[b+2]; 186 | double A3 = A[b+3]; 187 | Adet[idx] = (A0*A3 - A1*A2); 188 | } 189 | } 190 | 191 | 192 | __global__ void CudaDet3x3(double *Adet, double *A, int N) 193 | { 194 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 195 | if (idx < N) 196 | { 197 | int b = idx*9; 198 | double A0 = A[b]; 199 | double A1 = A[b+1]; 200 | double A2 = A[b+2]; 201 | double A3 = A[b+3]; 202 | double A4 = A[b+4]; 203 | double A5 = A[b+5]; 204 | double A6 = A[b+6]; 205 | double A7 = A[b+7]; 206 | double A8 = A[b+8]; 207 | Adet[idx] = (A0*A4*A8 + A1*A5*A6 + A2*A3*A7 208 | - A6*A4*A2 - A7*A5*A0 - A8*A3*A1); 209 | } 210 | } 211 | 212 | 213 | __global__ void CudaInvDet1x1(double *Ainv, double *Adet, double *A, int N) 214 | { 215 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 216 | if (idx < N) 217 | { 218 | double det = A[idx]; 219 | Adet[idx] = det; 220 | Ainv[idx] = 1.0 / det; 221 | } 222 | } 223 | 224 | 225 | __global__ void CudaInvDet2x2(double *Ainv, double *Adet, double *A, int N) 226 | { 227 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 228 | if (idx < N) 229 | { 230 | int b = idx*4; 231 | double A0 = A[b]; 232 | double A1 = A[b+1]; 233 | double A2 = A[b+2]; 234 | double A3 = A[b+3]; 235 | double det = (A0*A3 - A1*A2); 236 | Adet[idx] = det; 237 | double invdet = 1.0 / det; 238 | Ainv[b+0] = invdet*A3; 239 | Ainv[b+1] = -invdet*A1; 240 | Ainv[b+2] = -invdet*A2; 241 | Ainv[b+3] = invdet*A0; 242 | } 243 | } 244 | 245 | 246 | __global__ void CudaInvDet3x3(double *Ainv, double *Adet, double *A, int N) 247 | { 248 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 249 | if (idx < N) 250 | { 251 | int b = idx*9; 252 | double A0 = A[b]; 253 | double A1 = A[b+1]; 254 | double A2 = A[b+2]; 255 | double A3 = A[b+3]; 256 | double A4 = A[b+4]; 257 | double A5 = A[b+5]; 258 | double A6 = A[b+6]; 259 | double A7 = A[b+7]; 260 | double A8 = A[b+8]; 261 | double det = (A0*A4*A8 + A1*A5*A6 + A2*A3*A7 262 | - A6*A4*A2 - A7*A5*A0 - A8*A3*A1); 263 | Adet[idx] = det; 264 | double invdet = 1.0 / det; 265 | Ainv[b+0] = invdet*(A4*A8 - A5*A7); 266 | Ainv[b+1] = 
invdet*(A5*A6 - A3*A8); 267 | Ainv[b+2] = invdet*(A3*A7 - A4*A6); 268 | Ainv[b+3] = invdet*(A2*A7 - A1*A8); 269 | Ainv[b+4] = invdet*(A0*A8 - A2*A6); 270 | Ainv[b+5] = invdet*(A1*A6 - A0*A7); 271 | Ainv[b+6] = invdet*(A1*A5 - A2*A4); 272 | Ainv[b+7] = invdet*(A2*A3 - A0*A5); 273 | Ainv[b+8] = invdet*(A0*A4 - A1*A3); 274 | } 275 | } 276 | 277 | 278 | __global__ void CudaScatter(double *Aout, double *Ain, int *M, int *invM, int *invMOff, int N) 279 | { 280 | int i = blockIdx.x*blockDim.x + threadIdx.x; 281 | if (i < N) 282 | { 283 | Aout[i] = Ain[M[i]]; 284 | } 285 | } 286 | 287 | 288 | __global__ void CudaSumGather(double *Aout, double *Ain, int *M, int *invM, int *invMOff, int N) 289 | { 290 | int iout = blockIdx.x*blockDim.x + threadIdx.x; 291 | if (iout < N) 292 | { 293 | int in_beg = invMOff[iout]; 294 | int in_end = invMOff[iout + 1]; 295 | double sum = 0.0; 296 | for (int iin = in_beg; iin < in_end; ++iin) 297 | { 298 | sum += Ain[invM[iin]]; 299 | } 300 | Aout[iout] = sum; 301 | } 302 | } 303 | 304 | 305 | } 306 | 307 | #endif -------------------------------------------------------------------------------- /ops/CudaGPUOps.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_CUDA_GPU_OPS_HPP 7 | #define ACROBATIC_CUDA_GPU_OPS_HPP 8 | 9 | #ifdef ACRO_HAVE_CUDA 10 | 11 | #include "NonContractionOps.hpp" 12 | #include "Tensor.hpp" 13 | 14 | namespace acro 15 | { 16 | 17 | 18 | //Internal CPU operations on tensors that are exposed properly by the kernel executors. 19 | //Use of this class directly is not recommended. 20 | class CudaGPUOps : public NonContractionOps 21 | { 22 | public: 23 | void BatchMatrixInverse(Tensor &out, Tensor &in); 24 | void BatchMatrixDet(Tensor &Adet, Tensor &A); 25 | void BatchMatrixInvDet(Tensor &Ainv, Tensor &Adet, Tensor &A); 26 | 27 | void FlatIndexedScatter(Tensor &Aout, Tensor &Ain, IndexMapping &M); 28 | void FlatIndexedSumGather(Tensor &Aout, Tensor &Ain, IndexMapping &M); 29 | 30 | }; 31 | 32 | 33 | __global__ void CudaInv1x1(double *Ainv, double *A, int N); 34 | __global__ void CudaInv2x2(double *Ainv, double *A, int N); 35 | __global__ void CudaInv3x3(double *Ainv, double *A, int N); 36 | __global__ void CudaDet1x1(double *Adet, double *A, int N); 37 | __global__ void CudaDet2x2(double *Adet, double *A, int N); 38 | __global__ void CudaDet3x3(double *Adet, double *A, int N); 39 | __global__ void CudaInvDet1x1(double *Ainv, double *Adet, double *A, int N); 40 | __global__ void CudaInvDet2x2(double *Ainv, double *Adet, double *A, int N); 41 | __global__ void CudaInvDet3x3(double *Ainv, double *Adet, double *A, int N); 42 | __global__ void CudaScatter(double *Aout, double *Ain, int *M, int *invM, int *invMOff, int N); 43 | __global__ void CudaSumGather(double *Aout, double *Ain, int *M, int *invM, int *invMOff, int N); 44 | 45 | } 46 | 47 | #endif 48 | #endif //ACROBATIC_CUDA_GPU_OPS_HPP -------------------------------------------------------------------------------- /ops/NativeCPUOps.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. 
Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #include "NativeCPUOps.hpp" 7 | 8 | namespace acro 9 | { 10 | 11 | void NativeCPUOps::BatchMatrixInverse(Tensor &Ainv, Tensor &A) 12 | { 13 | int rank = A.GetRank(); 14 | int mdim = A.GetDim(rank-1); 15 | int stride = mdim*mdim; 16 | int num_batch = A.GetSize() / stride; 17 | double *A_ptr = A.GetData(); 18 | double *Ainv_ptr = Ainv.GetData(); 19 | if (mdim == 1) 20 | { 21 | for (int i = 0; i < num_batch; ++i) 22 | { 23 | Inv1x1(Ainv_ptr, A_ptr, Det1x1(A_ptr)); 24 | Ainv_ptr += stride; 25 | A_ptr += stride; 26 | } 27 | } 28 | else if (mdim == 2) 29 | { 30 | for (int i = 0; i < num_batch; ++i) 31 | { 32 | Inv2x2(Ainv_ptr, A_ptr, Det2x2(A_ptr)); 33 | Ainv_ptr += stride; 34 | A_ptr += stride; 35 | } 36 | } 37 | else if (mdim == 3) 38 | { 39 | for (int i = 0; i < num_batch; ++i) 40 | { 41 | Inv3x3(Ainv_ptr, A_ptr, Det3x3(A_ptr)); 42 | Ainv_ptr += stride; 43 | A_ptr += stride; 44 | } 45 | } 46 | } 47 | 48 | 49 | void NativeCPUOps::BatchMatrixDet(Tensor &Adet, Tensor &A) 50 | { 51 | int rank = A.GetRank(); 52 | int mdim = A.GetDim(rank-1); 53 | int stride = mdim*mdim; 54 | int num_batch = A.GetSize() / stride; 55 | double *A_ptr = A.GetData(); 56 | double *Adet_ptr = Adet.GetData(); 57 | if (mdim == 1) 58 | { 59 | for (int i = 0; i < num_batch; ++i) 60 | { 61 | Adet_ptr[i] = Det1x1(A_ptr); 62 | A_ptr += stride; 63 | } 64 | } 65 | else if (mdim == 2) 66 | { 67 | for (int i = 0; i < num_batch; ++i) 68 | { 69 | Adet_ptr[i] = Det2x2(A_ptr); 70 | A_ptr += stride; 71 | } 72 | } 73 | else if (mdim == 3) 74 | { 75 | for (int i = 0; i < num_batch; ++i) 76 | { 77 | Adet_ptr[i] = Det3x3(A_ptr); 78 | A_ptr += stride; 79 | } 80 | } 81 | } 82 | 83 | 84 | void NativeCPUOps::BatchMatrixInvDet(Tensor &Ainv, Tensor &Adet, Tensor &A) 85 | { 86 | int rank = A.GetRank(); 87 | int mdim = A.GetDim(rank-1); 88 | int stride = mdim*mdim; 89 | int num_batch = A.GetSize() / stride; 90 | double *A_ptr = A.GetData(); 91 | double *Ainv_ptr = Ainv.GetData(); 92 | double *Adet_ptr = Adet.GetData(); 93 | if (mdim == 1) 94 | { 95 | for (int i = 0; i < num_batch; ++i) 96 | { 97 | Adet_ptr[i] = Det1x1(A_ptr); 98 | Inv1x1(Ainv_ptr, A_ptr, Adet_ptr[i]); 99 | A_ptr += stride; 100 | Ainv_ptr += stride; 101 | } 102 | } 103 | else if (mdim == 2) 104 | { 105 | for (int i = 0; i < num_batch; ++i) 106 | { 107 | Adet_ptr[i] = Det2x2(A_ptr); 108 | Inv2x2(Ainv_ptr, A_ptr, Adet_ptr[i]); 109 | A_ptr += stride; 110 | Ainv_ptr += stride; 111 | } 112 | } 113 | else if (mdim == 3) 114 | { 115 | for (int i = 0; i < num_batch; ++i) 116 | { 117 | Adet_ptr[i] = Det3x3(A_ptr); 118 | Inv3x3(Ainv_ptr, A_ptr, Adet_ptr[i]); 119 | A_ptr += stride; 120 | Ainv_ptr += stride; 121 | } 122 | } 123 | } 124 | 125 | 126 | void NativeCPUOps::FlatIndexedScatter(Tensor &Aout, Tensor &Ain, IndexMapping &M) 127 | { 128 | IndexVector &I = M.GetMap(); 129 | for (int i = 0; i < I.GetSize(); i++) 130 | { 131 | Aout[i] = Ain[I[i]]; 132 | } 133 | } 134 | 135 | 136 | void NativeCPUOps::FlatIndexedSumGather(Tensor &Aout, Tensor &Ain, IndexMapping &M) 137 | { 138 | IndexVector &I = M.GetMap(); 139 | Aout.Set(0.0); 140 | for (int i = 0; i < I.GetSize(); i++) 141 | { 142 | Aout[I[i]] += Ain[i]; 143 | } 144 | } 145 | 146 | 147 | 148 | 149 | } -------------------------------------------------------------------------------- 
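
The three batched routines above share one layout convention (also spelled out in NonContractionOps.hpp further below): the matrix occupies the last two indices, so each batch entry is a contiguous block of `stride = mdim*mdim` doubles and `num_batch = GetSize()/stride` blocks are processed back to back. A minimal sketch of that pointer-walking pattern on a raw buffer, using the same closed form as `Det2x2` (illustrative only, not library code):
```
//Illustrative sketch of the batching pattern used by NativeCPUOps::BatchMatrixDet:
//each 2x2 matrix is a contiguous block of stride = mdim*mdim doubles, and the
//batch loop walks the buffer block by block.
#include <vector>
#include <cassert>

static void BatchDet2x2(double *Adet, const double *A, int num_batch)
{
    const int stride = 2*2;
    for (int i = 0; i < num_batch; ++i)
    {
        Adet[i] = A[0]*A[3] - A[1]*A[2];   //same closed form as Det2x2
        A += stride;                       //advance to the next matrix in the batch
    }
}

int main()
{
    //Two row-major 2x2 matrices: the identity (det 1) and [[1,2],[3,4]] (det -2).
    std::vector<double> A = {1,0,0,1,  1,2,3,4};
    std::vector<double> Adet(2);
    BatchDet2x2(Adet.data(), A.data(), 2);
    assert(Adet[0] == 1.0 && Adet[1] == -2.0);
    return 0;
}
```
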
/ops/NativeCPUOps.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_NATIVE_CPU_OPS_HPP 7 | #define ACROBATIC_NATIVE_CPU_OPS_HPP 8 | 9 | #include "NonContractionOps.hpp" 10 | #include "Tensor.hpp" 11 | 12 | namespace acro 13 | { 14 | 15 | 16 | //Internal CPU operations on tensors that are exposed properly by the kernel executors. 17 | //Use of this class directly is not recommended. 18 | class NativeCPUOps : public NonContractionOps 19 | { 20 | public: 21 | void BatchMatrixInverse(Tensor &Ainv, Tensor &A); 22 | void BatchMatrixDet(Tensor &Adet, Tensor &A); 23 | void BatchMatrixInvDet(Tensor &Ainv, Tensor &Adet, Tensor &A); 24 | 25 | void FlatIndexedScatter(Tensor &Aout, Tensor &Ain, IndexMapping &M); 26 | void FlatIndexedSumGather(Tensor &Aout, Tensor &Ain, IndexMapping &M); 27 | 28 | private: 29 | inline void Inv1x1(double *Ainv, double *A, double det); 30 | inline void Inv2x2(double *Ainv, double *A, double det); 31 | inline void Inv3x3(double *Ainv, double *A, double det); 32 | inline double Det1x1(double *A); 33 | inline double Det2x2(double *A); 34 | inline double Det3x3(double *A); 35 | }; 36 | 37 | 38 | inline void NativeCPUOps::Inv1x1(double *Ainv, double *A, double det) 39 | { 40 | Ainv[0] = 1.0 / det; 41 | } 42 | 43 | 44 | inline void NativeCPUOps::Inv2x2(double *Ainv, double *A, double det) 45 | { 46 | double invdet = 1.0 / det; 47 | Ainv[0] = invdet*A[3]; 48 | Ainv[1] = -invdet*A[1]; 49 | Ainv[2] = -invdet*A[2]; 50 | Ainv[3] = invdet*A[0]; 51 | 52 | } 53 | 54 | 55 | inline void NativeCPUOps::Inv3x3(double *Ainv, double *A, double det) 56 | { 57 | double invdet = 1.0 / det; 58 | Ainv[0] = invdet*(A[4]*A[8] - A[5]*A[7]); 59 | Ainv[1] = invdet*(A[5]*A[6] - A[3]*A[8]); 60 | Ainv[2] = invdet*(A[3]*A[7] - A[4]*A[6]); 61 | Ainv[3] = invdet*(A[2]*A[7] - A[1]*A[8]); 62 | Ainv[4] = invdet*(A[0]*A[8] - A[2]*A[6]); 63 | Ainv[5] = invdet*(A[1]*A[6] - A[0]*A[7]); 64 | Ainv[6] = invdet*(A[1]*A[5] - A[2]*A[4]); 65 | Ainv[7] = invdet*(A[2]*A[3] - A[0]*A[5]); 66 | Ainv[8] = invdet*(A[0]*A[4] - A[1]*A[3]); 67 | } 68 | 69 | 70 | inline double NativeCPUOps::Det1x1(double *A) 71 | { 72 | return A[0]; 73 | } 74 | 75 | 76 | inline double NativeCPUOps::Det2x2(double *A) 77 | { 78 | return (A[0]*A[3] - A[1]*A[2]); 79 | } 80 | 81 | 82 | inline double NativeCPUOps::Det3x3(double *A) 83 | { 84 | return (A[0]*A[4]*A[8] + A[1]*A[5]*A[6] + A[2]*A[3]*A[7] 85 | - A[6]*A[4]*A[2] - A[7]*A[5]*A[0] - A[8]*A[3]*A[1]); 86 | } 87 | 88 | 89 | 90 | } 91 | 92 | 93 | #endif //ACROBATIC_NATIVE_CPU_OPS_HPP -------------------------------------------------------------------------------- /ops/NonContractionOps.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
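//NonContractionOps declares the abstract interface for the non-contraction operations
//(batched small-matrix inverses/determinants and flat-indexed scatter/gather) that the
//kernel executors expose; NativeCPUOps and CudaGPUOps provide the CPU and CUDA implementations.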
5 | 6 | #ifndef ACROBATIC_NON_CONTRACTION_OPS_HPP 7 | #define ACROBATIC_NON_CONTRACTION_OPS_HPP 8 | 9 | #include 10 | #include "Tensor.hpp" 11 | #include "IndexMapping.hpp" 12 | 13 | namespace acro 14 | { 15 | 16 | 17 | class NonContractionOps 18 | { 19 | public: 20 | //Batched 1x1, 2x2, and 3x3 matrix inverses and determinents 21 | //The last 2 indices are for the matrices and the rests are batched over 22 | virtual void BatchMatrixInverse(Tensor &Ainv, Tensor &A) = 0; 23 | virtual void BatchMatrixDet(Tensor &Adet, Tensor &A) = 0; 24 | virtual void BatchMatrixInvDet(Tensor &Ainv, Tensor &Adet, Tensor &A) = 0; 25 | 26 | //Aout[i] = Ain[I[i]] 27 | virtual void FlatIndexedScatter(Tensor &Aout, Tensor &Ain, IndexMapping &M) = 0; 28 | 29 | //Aout[:] = 0.0 30 | //Aout[I[i]] += Ain[i] 31 | virtual void FlatIndexedSumGather(Tensor &Aout, Tensor &Ain, IndexMapping &M) = 0; 32 | }; 33 | 34 | } 35 | 36 | 37 | #endif //ACROBATIC_NON_CONTRACTION_OPS_HPP -------------------------------------------------------------------------------- /ops/Ops.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #include "NativeCPUOps.hpp" 7 | #include "CudaGPUOps.hpp" -------------------------------------------------------------------------------- /tensor/IndexMapping.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
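//IndexMapping stores a map M from a range index space (size RangeSize) onto a domain
//index space (size DomainSize). ComputeInverse() stably sorts the range indices by their
//M values into InvM and records, in InvMOff (length DomainSize+1), where each domain
//index's block of entries begins in InvM.
//Illustrative example: with DomainSize=3, RangeSize=4, and M = {2, 0, 2, 1},
//ComputeInverse() yields InvM = {1, 3, 0, 2} and InvMOff = {0, 1, 2, 4}.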
5 | 6 | #include "IndexMapping.hpp" 7 | #include 8 | #include 9 | 10 | namespace acro 11 | { 12 | 13 | 14 | IndexMapping::IndexMapping(int domain_size, int range_size) : 15 | DomainSize(domain_size), 16 | RangeSize(range_size), 17 | InverseComputed(false), 18 | M(range_size), 19 | InvM(range_size), 20 | InvMOff(domain_size+1) 21 | { 22 | 23 | } 24 | 25 | 26 | 27 | void IndexMapping::ComputeInverse() 28 | { 29 | ACROBATIC_ASSERT(!InverseComputed,"Can't compute the inverse mapping twice."); 30 | 31 | std::iota(&InvM[0], &InvM[RangeSize], 0); 32 | std::stable_sort(&InvM[0], &InvM[RangeSize], 33 | [this](size_t i1, size_t i2) {return M[i1] < M[i2];}); 34 | 35 | int off = 0; 36 | for (int i = 0; i < DomainSize + 1; ++i) 37 | { 38 | InvMOff[i] = off; 39 | if (off < RangeSize) 40 | { 41 | int m = M[InvM[off]]; 42 | while (off < RangeSize && M[InvM[off]] == m) 43 | { 44 | off ++; 45 | } 46 | } 47 | else 48 | { 49 | off = RangeSize; //Handle the last one 50 | } 51 | } 52 | 53 | InverseComputed = true; 54 | 55 | if (OnGPU) 56 | { 57 | InvM.SwitchFromGPU(); 58 | InvM.MoveToGPU(); 59 | InvMOff.SwitchFromGPU(); 60 | InvMOff.MoveToGPU(); 61 | } 62 | } 63 | 64 | 65 | void IndexMapping::MapToGPU() 66 | { 67 | M.MapToGPU(); 68 | if (InverseComputed) 69 | { 70 | InvM.MapToGPU(); 71 | InvMOff.MapToGPU(); 72 | } 73 | MappedToGPU = true; 74 | } 75 | 76 | 77 | void IndexMapping::MoveToGPU() 78 | { 79 | M.MoveToGPU(); 80 | if (InverseComputed) 81 | { 82 | InvM.MoveToGPU(); 83 | InvMOff.MoveToGPU(); 84 | } 85 | OnGPU = true; 86 | } 87 | 88 | 89 | void IndexMapping::SwitchToGPU() 90 | { 91 | M.SwitchToGPU(); 92 | if (InverseComputed) 93 | { 94 | InvM.SwitchToGPU(); 95 | InvMOff.SwitchToGPU(); 96 | } 97 | OnGPU = true; 98 | } 99 | 100 | 101 | void IndexMapping::UnmapFromGPU() 102 | { 103 | M.UnmapFromGPU(); 104 | if (InverseComputed) 105 | { 106 | InvM.UnmapFromGPU(); 107 | InvMOff.UnmapFromGPU(); 108 | } 109 | MappedToGPU = false; 110 | OnGPU = false; 111 | } 112 | 113 | 114 | void IndexMapping::MoveFromGPU() 115 | { 116 | M.MoveFromGPU(); 117 | if (InverseComputed) 118 | { 119 | InvM.MoveFromGPU(); 120 | InvMOff.MoveFromGPU(); 121 | } 122 | OnGPU = false; 123 | } 124 | 125 | 126 | void IndexMapping::SwitchFromGPU() 127 | { 128 | M.SwitchFromGPU(); 129 | if (InverseComputed) 130 | { 131 | InvM.SwitchFromGPU(); 132 | InvMOff.SwitchFromGPU(); 133 | } 134 | OnGPU = false; 135 | } 136 | 137 | 138 | 139 | 140 | } 141 | -------------------------------------------------------------------------------- /tensor/IndexMapping.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
5 | 6 | #ifndef ACROBATIC_INDEXMAPPING_HPP 7 | #define ACROBATIC_INDEXMAPPING_HPP 8 | 9 | #include "IndexVector.hpp" 10 | 11 | namespace acro 12 | { 13 | 14 | class IndexMapping 15 | { 16 | public: 17 | IndexMapping(int domain_size, int range_size); 18 | 19 | int GetDomainSize() {return DomainSize;} 20 | int GetRangeSize() {return RangeSize;} 21 | bool IsInverseComputed() {return InverseComputed;} 22 | 23 | int &operator[](int raw_index); 24 | void ComputeInverse(); 25 | 26 | IndexVector &GetMap(); 27 | IndexVector &GetInvMap(); 28 | IndexVector &GetInvMapOffsets(); 29 | 30 | void MapToGPU(); //Allocate memory for the data on the GPU 31 | void MoveToGPU(); //Copy the data to the GPU and flag the data as currently on the GPU 32 | void SwitchToGPU(); //Flag the data as currently onGPU 33 | void UnmapFromGPU(); //Deallocate memory on the GPU 34 | void MoveFromGPU(); //Copy the data back from the GPU and flag the data as currently on the CPU 35 | void SwitchFromGPU(); //Flag the data as currently on the CPU 36 | bool IsMappedToGPU() const {return MappedToGPU;} 37 | bool IsOnGPU() const {return OnGPU;} 38 | 39 | private: 40 | bool InverseComputed; 41 | bool MappedToGPU; 42 | bool OnGPU; 43 | int DomainSize; 44 | int RangeSize; 45 | 46 | IndexVector M; 47 | IndexVector InvM, InvMOff; 48 | }; 49 | 50 | 51 | inline int &IndexMapping::operator[](int raw_index) 52 | { 53 | return M[raw_index]; 54 | } 55 | 56 | 57 | inline IndexVector &IndexMapping::GetMap() 58 | { 59 | return M; 60 | } 61 | 62 | 63 | inline IndexVector &IndexMapping::GetInvMap() 64 | { 65 | ACROBATIC_ASSERT(InverseComputed, "Trying to access inverse mapping before the inverse is computed."); 66 | return InvM; 67 | } 68 | 69 | 70 | inline IndexVector &IndexMapping::GetInvMapOffsets() 71 | { 72 | ACROBATIC_ASSERT(InverseComputed, "Trying to access inverse mapping offsets before the inverse is computed."); 73 | return InvMOff; 74 | } 75 | 76 | } 77 | 78 | #endif -------------------------------------------------------------------------------- /tensor/IndexVector.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
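//IndexVector is a flat integer array with the same host/device movement model as Tensor:
//MapToGPU() allocates the device buffer, MoveToGPU()/MoveFromGPU() copy the data and update
//the freshness flag, and SwitchToGPU()/SwitchFromGPU() only flip the flag without copying.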
5 | 6 | #include "IndexVector.hpp" 7 | #include "Util.hpp" 8 | #include "CudaUtil.hpp" 9 | #include 10 | 11 | namespace acro 12 | { 13 | 14 | 15 | IndexVector::IndexVector() 16 | { 17 | Data = nullptr; 18 | DeviceData = nullptr; 19 | OwnsData = false; 20 | MappedToGPU = false; 21 | Initialized = false; 22 | } 23 | 24 | 25 | IndexVector::IndexVector(int dim, int *hdata, int *ddata, bool ongpu) 26 | { 27 | Initialized = false; 28 | Init(dim, hdata, ddata, ongpu); 29 | } 30 | 31 | 32 | void IndexVector::Init(int dim, int *hdata, int *ddata, bool ongpu) 33 | { 34 | ACROBATIC_ASSERT(!IsInitialized(), "Can't initilize a vector a second time.") 35 | ACROBATIC_ASSERT(dim > 0, "Cant initilize vector with dim <= 0."); 36 | Size = dim; 37 | ByteSize = dim*sizeof(int); 38 | 39 | if (hdata == nullptr) 40 | { 41 | Data = new int[Size]; 42 | OwnsData = true; 43 | } 44 | else 45 | { 46 | Data = hdata; 47 | OwnsData = false; 48 | } 49 | 50 | MappedToGPU = false; 51 | DeviceData = ddata; 52 | if (ddata != nullptr) 53 | { 54 | ACROBATIC_ASSERT(hdata != nullptr, 55 | "Acrotensor does not currently support GPU only tensors."); 56 | MappedToGPU = true; 57 | } 58 | 59 | ACROBATIC_ASSERT(ddata != nullptr || !ongpu, 60 | "Acrotensor cannot mark external data as on the GPU if no GPU pointer is provided."); 61 | 62 | OnGPU = ongpu; 63 | Initialized = true; 64 | } 65 | 66 | 67 | IndexVector::~IndexVector() 68 | { 69 | if (OwnsData) 70 | { 71 | delete [] Data; 72 | if (IsMappedToGPU()) 73 | { 74 | UnmapFromGPU(); 75 | } 76 | } 77 | } 78 | 79 | 80 | void IndexVector::Retarget(int *hdata, int *ddata) 81 | { 82 | ACROBATIC_ASSERT(!OwnsData); 83 | Data = hdata; 84 | DeviceData = ddata; 85 | } 86 | 87 | 88 | void IndexVector::MapToGPU() 89 | { 90 | #ifdef ACRO_HAVE_CUDA 91 | ACROBATIC_ASSERT(!IsMappedToGPU(), "Trying to map data to the GPU a second time."); 92 | ensureCudaContext(); 93 | acroCudaErrorCheck(cudaMalloc((void**)&DeviceData, ByteSize)); 94 | MappedToGPU = true; 95 | #endif 96 | } 97 | 98 | void IndexVector::MoveToGPU() 99 | { 100 | #ifdef ACRO_HAVE_CUDA 101 | if (!IsMappedToGPU()) 102 | { 103 | MapToGPU(); 104 | } 105 | if (!IsOnGPU()) 106 | { 107 | ensureCudaContext(); 108 | acroCudaErrorCheck(cudaMemcpy(DeviceData, Data, ByteSize, cudaMemcpyHostToDevice)); 109 | OnGPU = true; 110 | } 111 | #endif 112 | } 113 | 114 | void IndexVector::SwitchToGPU() 115 | { 116 | #ifdef ACRO_HAVE_CUDA 117 | if (!IsMappedToGPU()) 118 | { 119 | MapToGPU(); 120 | } 121 | OnGPU = true; 122 | #endif 123 | } 124 | 125 | void IndexVector::UnmapFromGPU() 126 | { 127 | #ifdef ACRO_HAVE_CUDA 128 | ACROBATIC_ASSERT(IsMappedToGPU(), "Can't unmap data that is not mapped to the GPU."); 129 | ensureCudaContext(); 130 | acroCudaErrorCheck(cudaFree(DeviceData)); 131 | MappedToGPU = false; 132 | OnGPU = false; 133 | #endif 134 | } 135 | 136 | void IndexVector::MoveFromGPU() 137 | { 138 | #ifdef ACRO_HAVE_CUDA 139 | if (IsOnGPU()) 140 | { 141 | ensureCudaContext(); 142 | acroCudaErrorCheck(cudaMemcpy(Data, DeviceData, ByteSize, cudaMemcpyDeviceToHost)); 143 | OnGPU = false; 144 | } 145 | #endif 146 | } 147 | 148 | 149 | void IndexVector::SwitchFromGPU() 150 | { 151 | #ifdef ACRO_HAVE_CUDA 152 | OnGPU = false; 153 | #endif 154 | } 155 | 156 | 157 | void IndexVector::Print() 158 | { 159 | for (int i = 0; i < GetSize(); ++i) 160 | { 161 | std::cout << Data[i] << std::endl; 162 | } 163 | std::cout << std::endl; 164 | } 165 | 166 | } 167 | -------------------------------------------------------------------------------- /tensor/IndexVector.hpp: 
-------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_INDEXVECTOR_HPP 7 | #define ACROBATIC_INDEXVECTOR_HPP 8 | 9 | #include 10 | #include "Util.hpp" 11 | 12 | namespace acro 13 | { 14 | 15 | class IndexVector 16 | { 17 | public: 18 | IndexVector(); 19 | IndexVector(int dim, int *hdata=nullptr, int *ddata=nullptr, bool ongpu=false); 20 | ~IndexVector(); 21 | void Init(int dim, int *hdata=nullptr, int *ddata=nullptr, bool ongpu=false); 22 | 23 | int GetSize() const; 24 | int *GetData() const; 25 | int *GetDeviceData() const; 26 | int *GetCurrentData() const; 27 | int &operator[](int raw_index); 28 | 29 | void Retarget(int *hdata, int *ddata); 30 | 31 | void MapToGPU(); //Allocate memory for the data on the GPU 32 | void MoveToGPU(); //Copy the data to the GPU and flag the data as currently on the GPU 33 | void SwitchToGPU(); //Flag the data as currently onGPU 34 | void UnmapFromGPU(); //Deallocate memory on the GPU 35 | void MoveFromGPU(); //Copy the data back from the GPU and flag the data as currently on the CPU 36 | void SwitchFromGPU(); //Flag the data as currently on the CPU 37 | bool IsMappedToGPU() const {return MappedToGPU;} 38 | bool IsOnGPU() const {return OnGPU;} 39 | bool IsInitialized() const {return Initialized;} 40 | 41 | void Print(); 42 | 43 | private: 44 | int Size; 45 | int ByteSize; 46 | 47 | bool Initialized; 48 | bool OwnsData; 49 | bool MappedToGPU; 50 | bool OnGPU; 51 | int *Data; 52 | int *DeviceData; 53 | }; 54 | 55 | 56 | inline int IndexVector::GetSize() const 57 | { 58 | return Size; 59 | } 60 | 61 | 62 | inline int *IndexVector::GetData() const 63 | { 64 | return Data; 65 | } 66 | 67 | 68 | inline int *IndexVector::GetDeviceData() const 69 | { 70 | return DeviceData; 71 | } 72 | 73 | 74 | inline int *IndexVector::GetCurrentData() const 75 | { 76 | return (IsOnGPU()) ? DeviceData : Data; 77 | } 78 | 79 | 80 | inline int &IndexVector::operator[](int raw_index) 81 | { 82 | #if DEBUG 83 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 84 | #endif 85 | return Data[raw_index]; 86 | } 87 | 88 | 89 | } 90 | 91 | #endif -------------------------------------------------------------------------------- /tensor/SliceTensor.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
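//A SliceTensor fixes the leading indices of an existing Tensor and aliases that tensor's
//data at the resulting offset; the remaining trailing dimensions become the slice's dims.
//Illustrative sketch (hypothetical names):
//
//  acro::Tensor T(2, 3, 4);
//  acro::SliceTensor S1(T, 1);  //3x4 view of the T(1,:,:) block sharing T's memory
//  S1(0, 0) = 5.0;              //writes T(1, 0, 0)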
5 | 6 | #include "SliceTensor.hpp" 7 | 8 | namespace acro 9 | { 10 | 11 | SliceTensor::SliceTensor(Tensor &T, std::vector &sind) 12 | { 13 | SliceInit(T, sind); 14 | } 15 | 16 | 17 | SliceTensor::SliceTensor(Tensor &T, int d0) 18 | { 19 | std::vector sind = {d0}; 20 | SliceInit(T, sind); 21 | } 22 | 23 | 24 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1) 25 | { 26 | std::vector sind = {d0, d1}; 27 | SliceInit(T, sind); 28 | } 29 | 30 | 31 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1, int d2) 32 | { 33 | std::vector sind = {d0, d1, d2}; 34 | SliceInit(T, sind); 35 | } 36 | 37 | 38 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1, int d2, int d3) 39 | { 40 | std::vector sind = {d0, d1, d2, d3}; 41 | SliceInit(T, sind); 42 | } 43 | 44 | 45 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4) 46 | { 47 | std::vector sind = {d0, d1, d2, d3, d4}; 48 | SliceInit(T, sind); 49 | } 50 | 51 | 52 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5) 53 | { 54 | std::vector sind = {d0, d1, d2, d3, d4, d5}; 55 | SliceInit(T, sind); 56 | } 57 | 58 | 59 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6) 60 | { 61 | std::vector sind = {d0, d1, d2, d3, d4, d5, d6}; 62 | SliceInit(T, sind); 63 | } 64 | 65 | 66 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7) 67 | { 68 | std::vector sind = {d0, d1, d2, d3, d4, d5, d6, d7}; 69 | SliceInit(T, sind); 70 | } 71 | 72 | 73 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8) 74 | { 75 | std::vector sind = {d0, d1, d2, d3, d4, d5, d6, d7}; 76 | SliceInit(T, sind); 77 | } 78 | 79 | void SliceTensor::SliceInit(Tensor &T, int d0) 80 | { 81 | std::vector sind = {d0}; 82 | SliceInit(T, sind); 83 | } 84 | 85 | 86 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1) 87 | { 88 | std::vector sind = {d0,d1}; 89 | SliceInit(T, sind); 90 | } 91 | 92 | 93 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1, int d2) 94 | { 95 | std::vector sind = {d0,d1,d2}; 96 | SliceInit(T, sind); 97 | } 98 | 99 | 100 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1, int d2, int d3) 101 | { 102 | std::vector sind = {d0,d1,d2,d3}; 103 | SliceInit(T, sind); 104 | } 105 | 106 | 107 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4) 108 | { 109 | std::vector sind = {d0,d1,d2,d3,d4}; 110 | SliceInit(T, sind); 111 | } 112 | 113 | 114 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5) 115 | { 116 | std::vector sind = {d0,d1,d2,d3,d4,d5}; 117 | SliceInit(T, sind); 118 | } 119 | 120 | 121 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6) 122 | { 123 | std::vector sind = {d0,d1,d2,d3,d4,d5,d6}; 124 | SliceInit(T, sind); 125 | } 126 | 127 | 128 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7) 129 | { 130 | std::vector sind = {d0,d1,d2,d3,d4,d5,d6,d7}; 131 | SliceInit(T, sind); 132 | } 133 | 134 | 135 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8) 136 | { 137 | std::vector sind = {d0,d1,d2,d3,d4,d5,d6,d7,d8}; 138 | SliceInit(T, sind); 139 | } 140 | 141 | 142 | void SliceTensor::SliceInit(Tensor &T, std::vector &sind) 143 | { 144 | FullT = &T; 145 | ACROBATIC_ASSERT(T.IsInitialized(), "Can't slice an uninitilized tensor."); 146 | 
ACROBATIC_ASSERT(T.GetRank() > sind.size(), "Can't slice more dimensions than the tensor rank."); 147 | std::vector dims(T.GetRank() - sind.size()); 148 | for (int d = sind.size(); d < T.GetRank(); ++d) 149 | { 150 | dims[d - sind.size()] = T.GetDim(d); 151 | } 152 | 153 | Offset = T.GetRawIndex(sind); 154 | double *hdata = T.GetData(); 155 | double *ddata = T.GetDeviceData(); 156 | if (hdata) 157 | { 158 | hdata += Offset; 159 | } 160 | 161 | if (ddata) 162 | { 163 | ddata += Offset; 164 | } 165 | 166 | Initialized = false; 167 | Init(dims, hdata, ddata, T.IsOnGPU()); 168 | } 169 | 170 | 171 | double* SliceTensor::GetData() const 172 | { 173 | return FullT->GetData() + Offset; 174 | } 175 | 176 | 177 | double* SliceTensor::GetDeviceData() const 178 | { 179 | return FullT->GetDeviceData() + Offset; 180 | } 181 | 182 | 183 | void SliceTensor::MapToGPU() 184 | { 185 | FullT->MapToGPU(); 186 | DeviceData = FullT->GetDeviceData() + Offset; 187 | } 188 | 189 | 190 | void SliceTensor::MoveToGPU() 191 | { 192 | FullT->MoveToGPU(); //May Trigger a MapToGPU() 193 | DeviceData = FullT->GetDeviceData() + Offset; 194 | } 195 | 196 | 197 | void SliceTensor::SwitchToGPU() 198 | { 199 | FullT->SwitchToGPU(); //May Trigger a MapToGPU() 200 | DeviceData = FullT->GetDeviceData() + Offset; 201 | } 202 | 203 | 204 | void SliceTensor::MoveFromGPU() 205 | { 206 | FullT->MoveFromGPU(); 207 | 208 | } 209 | 210 | 211 | void SliceTensor::SwitchFromGPU() 212 | { 213 | FullT->SwitchFromGPU(); 214 | } 215 | 216 | 217 | bool SliceTensor::IsMappedToGPU() const 218 | { 219 | return FullT->IsMappedToGPU(); 220 | } 221 | 222 | 223 | bool SliceTensor::IsOnGPU() const 224 | { 225 | return FullT->IsOnGPU(); 226 | } 227 | 228 | 229 | void SliceTensor::UnmapFromGPU() 230 | { 231 | 232 | } 233 | 234 | 235 | } 236 | -------------------------------------------------------------------------------- /tensor/SliceTensor.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
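//SliceTensor is a non-owning view into a parent Tensor: GPU mapping, data movement, and
//freshness queries are delegated to the parent, and Retarget() is intentionally unsupported.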
5 | 6 | #ifndef ACROBATIC_SLICETENSOR_HPP 7 | #define ACROBATIC_SLICETENSOR_HPP 8 | 9 | #include "Tensor.hpp" 10 | 11 | namespace acro 12 | { 13 | 14 | class SliceTensor : public Tensor 15 | { 16 | public: 17 | SliceTensor() {}; 18 | SliceTensor(Tensor &T, std::vector &sind); 19 | SliceTensor(Tensor &T, int d0); 20 | SliceTensor(Tensor &T, int d0, int d1); 21 | SliceTensor(Tensor &T, int d0, int d1, int d2); 22 | SliceTensor(Tensor &T, int d0, int d1, int d2, int d3); 23 | SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4); 24 | SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5); 25 | SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6); 26 | SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7); 27 | SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8); 28 | void SliceInit(Tensor &T, std::vector &sind); 29 | void SliceInit(Tensor &T, int d0); 30 | void SliceInit(Tensor &T, int d0, int d1); 31 | void SliceInit(Tensor &T, int d0, int d1, int d2); 32 | void SliceInit(Tensor &T, int d0, int d1, int d2, int d3); 33 | void SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4); 34 | void SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5); 35 | void SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6); 36 | void SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7); 37 | void SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8); 38 | ~SliceTensor() {} 39 | 40 | virtual void Retarget(double *hdata, double*ddata=nullptr) {ACROBATIC_ASSERT(false, "Retarget not supported on SliceTensors");} 41 | 42 | //Routines for Data on the GPU 43 | virtual double* GetData() const; 44 | virtual double* GetDeviceData() const; 45 | virtual void MapToGPU(); 46 | virtual void MoveToGPU(); 47 | virtual void SwitchToGPU(); 48 | virtual void UnmapFromGPU(); 49 | virtual void MoveFromGPU(); 50 | virtual void SwitchFromGPU(); 51 | virtual bool IsMappedToGPU() const; 52 | virtual bool IsOnGPU() const; 53 | 54 | private: 55 | Tensor *FullT; 56 | int Offset; 57 | }; 58 | 59 | } 60 | 61 | #endif //ACROBATIC_SLICETENSOR_HPP 62 | -------------------------------------------------------------------------------- /tensor/Tensor.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
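//Tensor data is stored contiguously with strides computed right-to-left, so the right-most
//index varies fastest. For example, Dims = {2, 3, 4} gives Strides = {12, 4, 1} and the
//entry (i, j, k) lives at raw index 12*i + 4*j + k.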
5 | 6 | #include "Tensor.hpp" 7 | #include "Util.hpp" 8 | #include "CudaUtil.hpp" 9 | #include 10 | 11 | namespace acro 12 | { 13 | 14 | 15 | Tensor::Tensor() 16 | { 17 | Data = nullptr; 18 | DeviceData = nullptr; 19 | OwnsData = false; 20 | MappedToGPU = false; 21 | Initialized = false; 22 | } 23 | 24 | 25 | Tensor::Tensor(std::vector &dims, double *hdata, double *ddata, bool ongpu) 26 | { 27 | Initialized = false; 28 | Init(dims, hdata, ddata, ongpu); 29 | } 30 | 31 | 32 | Tensor::Tensor(int d0, double *hdata, double *ddata, bool ongpu) 33 | { 34 | Initialized = false; 35 | std::vector dims = {d0}; 36 | Init(dims, hdata, ddata, ongpu); 37 | } 38 | 39 | 40 | Tensor::Tensor(int d0, int d1, double *hdata, double *ddata, bool ongpu) 41 | { 42 | Initialized = false; 43 | std::vector dims = {d0, d1}; 44 | Init(dims, hdata, ddata, ongpu); 45 | } 46 | 47 | 48 | Tensor::Tensor(int d0, int d1, int d2, double *hdata, double *ddata, bool ongpu) 49 | { 50 | Initialized = false; 51 | std::vector dims = {d0, d1, d2}; 52 | Init(dims, hdata, ddata, ongpu); 53 | } 54 | 55 | 56 | Tensor::Tensor(int d0, int d1, int d2, int d3, double *hdata, double *ddata, bool ongpu) 57 | { 58 | Initialized = false; 59 | std::vector dims = {d0, d1, d2, d3}; 60 | Init(dims, hdata, ddata, ongpu); 61 | } 62 | 63 | 64 | Tensor::Tensor(int d0, int d1, int d2, int d3, int d4, double *hdata, double *ddata, bool ongpu) 65 | { 66 | Initialized = false; 67 | std::vector dims = {d0, d1, d2, d3, d4}; 68 | Init(dims, hdata, ddata, ongpu); 69 | } 70 | 71 | 72 | Tensor::Tensor(int d0, int d1, int d2, int d3, int d4, int d5, double *hdata, double *ddata, bool ongpu) 73 | { 74 | Initialized = false; 75 | std::vector dims = {d0, d1, d2, d3, d4, d5}; 76 | Init(dims, hdata, ddata, ongpu); 77 | } 78 | 79 | 80 | Tensor::Tensor(int d0, int d1, int d2, int d3, int d4, int d5, int d6, double *hdata, double *ddata, bool ongpu) 81 | { 82 | Initialized = false; 83 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6}; 84 | Init(dims, hdata, ddata, ongpu); 85 | } 86 | 87 | 88 | Tensor::Tensor(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, double *hdata, double *ddata, bool ongpu) 89 | { 90 | Initialized = false; 91 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6, d7}; 92 | Init(dims, hdata, ddata, ongpu); 93 | } 94 | 95 | 96 | Tensor::Tensor(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8, double *hdata, double *ddata, bool ongpu) 97 | { 98 | Initialized = false; 99 | Init(d0, d1, d2, d3, d4, d5, d6, d7, d8, hdata, ddata, ongpu); 100 | } 101 | 102 | 103 | void Tensor::Init(int d0, double *hdata, double *ddata, bool ongpu) 104 | { 105 | std::vector dims = {d0}; 106 | Init(dims, hdata, ddata, ongpu); 107 | } 108 | 109 | 110 | void Tensor::Init(int d0, int d1, double *hdata, double *ddata, bool ongpu) 111 | { 112 | std::vector dims = {d0, d1}; 113 | Init(dims, hdata, ddata, ongpu); 114 | } 115 | 116 | 117 | void Tensor::Init(int d0, int d1, int d2, double *hdata, double *ddata, bool ongpu) 118 | { 119 | std::vector dims = {d0, d1, d2}; 120 | Init(dims, hdata, ddata, ongpu); 121 | } 122 | 123 | 124 | void Tensor::Init(int d0, int d1, int d2, int d3, double *hdata, double *ddata, bool ongpu) 125 | { 126 | std::vector dims = {d0, d1, d2, d3}; 127 | Init(dims, hdata, ddata, ongpu); 128 | } 129 | 130 | 131 | void Tensor::Init(int d0, int d1, int d2, int d3, int d4, double *hdata, double *ddata, bool ongpu) 132 | { 133 | std::vector dims = {d0, d1, d2, d3, d4}; 134 | Init(dims, hdata, ddata, ongpu); 135 | } 136 | 137 | 138 
| void Tensor::Init(int d0, int d1, int d2, int d3, int d4, int d5, double *hdata, double *ddata, bool ongpu) 139 | { 140 | std::vector dims = {d0, d1, d2, d3, d4, d5}; 141 | Init(dims, hdata, ddata, ongpu); 142 | } 143 | 144 | 145 | void Tensor::Init(int d0, int d1, int d2, int d3, int d4, int d5, int d6, double *hdata, double *ddata, bool ongpu) 146 | { 147 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6}; 148 | Init(dims, hdata, ddata, ongpu); 149 | } 150 | 151 | 152 | void Tensor::Init(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, double *hdata, double *ddata, bool ongpu) 153 | { 154 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6, d7}; 155 | Init(dims, hdata, ddata, ongpu); 156 | } 157 | 158 | 159 | void Tensor::Init(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8, double *hdata, double *ddata, bool ongpu) 160 | { 161 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6, d7, d8}; 162 | Init(dims, hdata, ddata, ongpu); 163 | } 164 | 165 | 166 | void Tensor::Init(std::vector &dims, double *hdata, double *ddata, bool ongpu) 167 | { 168 | ACROBATIC_ASSERT(!IsInitialized(), "Can't initilize a tensor a second time.") 169 | ACROBATIC_ASSERT(dims.size() > 0, "Cant initilize tensor without any dimensions."); 170 | for (int d = 0; d < dims.size(); ++d) 171 | { 172 | ACROBATIC_ASSERT(dims[d] > 0, "Can't initilize tensor with non-positive dimensions."); 173 | } 174 | Dims = dims; 175 | UpdateStrides(); 176 | ComputeSize(); 177 | if (hdata == nullptr) 178 | { 179 | Data = new double[Size]; 180 | OwnsData = true; 181 | } 182 | else 183 | { 184 | Data = hdata; 185 | OwnsData = false; 186 | } 187 | 188 | MappedToGPU = false; 189 | DeviceData = ddata; 190 | if (ddata != nullptr) 191 | { 192 | ACROBATIC_ASSERT(hdata != nullptr, 193 | "Acrotensor does not currently support GPU only tensors."); 194 | MappedToGPU = true; 195 | } 196 | 197 | ACROBATIC_ASSERT(ddata != nullptr || !ongpu, 198 | "Acrotensor cannot mark external data as on the GPU if no GPU pointer is provided."); 199 | 200 | OnGPU = ongpu; 201 | Initialized = true; 202 | } 203 | 204 | 205 | Tensor::~Tensor() 206 | { 207 | if (OwnsData) 208 | { 209 | delete [] Data; 210 | if (IsMappedToGPU()) 211 | { 212 | UnmapFromGPU(); 213 | } 214 | } 215 | } 216 | 217 | void Tensor::Reshape(std::vector &dims) 218 | { 219 | ACROBATIC_ASSERT(dims.size() > 0); 220 | for (int d = 0; d < dims.size(); ++d) 221 | { 222 | ACROBATIC_ASSERT(dims[d] > 0); 223 | } 224 | 225 | int new_size = 1; 226 | for (int d = 0; d < dims.size(); ++d) 227 | { 228 | new_size *= dims[d]; 229 | } 230 | ACROBATIC_ASSERT(new_size == Size); 231 | 232 | Dims = dims; 233 | UpdateStrides(); 234 | } 235 | 236 | 237 | void Tensor::Reshape(int d0) 238 | { 239 | std::vector dims = {d0}; 240 | Reshape(dims); 241 | } 242 | 243 | 244 | void Tensor::Reshape(int d0, int d1) 245 | { 246 | std::vector dims = {d0, d1}; 247 | Reshape(dims); 248 | } 249 | 250 | 251 | void Tensor::Reshape(int d0, int d1, int d2) 252 | { 253 | std::vector dims = {d0, d1, d2}; 254 | Reshape(dims); 255 | } 256 | 257 | 258 | void Tensor::Reshape(int d0, int d1, int d2, int d3) 259 | { 260 | std::vector dims = {d0, d1, d2, d3}; 261 | Reshape(dims); 262 | } 263 | 264 | 265 | void Tensor::Reshape(int d0, int d1, int d2, int d3, int d4) 266 | { 267 | std::vector dims = {d0, d1, d2, d3, d4}; 268 | Reshape(dims); 269 | } 270 | 271 | 272 | void Tensor::Reshape(int d0, int d1, int d2, int d3, int d4, int d5) 273 | { 274 | std::vector dims = {d0, d1, d2, d3, d4, d5}; 275 | Reshape(dims); 276 | } 
277 | 278 | 279 | void Tensor::Reshape(int d0, int d1, int d2, int d3, int d4, int d5, int d6) 280 | { 281 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6}; 282 | Reshape(dims); 283 | } 284 | 285 | 286 | void Tensor::Reshape(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7) 287 | { 288 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6, d7}; 289 | Reshape(dims); 290 | } 291 | 292 | 293 | void Tensor::Reshape(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8) 294 | { 295 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6, d7, d8}; 296 | Reshape(dims); 297 | } 298 | 299 | 300 | void Tensor::Retarget(double *hdata, double *ddata) 301 | { 302 | ACROBATIC_ASSERT(!OwnsData); 303 | Data = hdata; 304 | DeviceData = ddata; 305 | } 306 | 307 | 308 | void Tensor::UpdateStrides() 309 | { 310 | Strides.resize(Dims.size()); 311 | int stride = 1; 312 | for (int d = Dims.size() - 1; d >= 0; --d) 313 | { 314 | Strides[d] = stride; 315 | stride *= Dims[d]; 316 | } 317 | } 318 | 319 | 320 | void Tensor::ComputeSize() 321 | { 322 | Size = 1; 323 | for (int d = 0; d < GetRank(); ++d) 324 | { 325 | Size *= Dims[d]; 326 | } 327 | ByteSize = Size*sizeof(double); 328 | } 329 | 330 | void Tensor::Set(double val) 331 | { 332 | if (!IsOnGPU()) 333 | { 334 | for (int i = 0; i < GetSize(); ++i) 335 | { 336 | Data[i] = val; 337 | } 338 | } 339 | else 340 | { 341 | #ifdef ACRO_HAVE_CUDA 342 | ensureCudaContext(); 343 | CudaSet<<>>(DeviceData, val, GetSize()); 344 | acroCudaErrorCheck(cudaPeekAtLastError()); 345 | #endif 346 | } 347 | } 348 | 349 | 350 | void Tensor::Mult(double c) 351 | { 352 | if (!IsOnGPU()) 353 | { 354 | for (int i = 0; i < GetSize(); ++i) 355 | { 356 | Data[i] *= c; 357 | } 358 | } 359 | else 360 | { 361 | #ifdef ACRO_HAVE_CUDA 362 | ensureCudaContext(); 363 | CudaMult<<>>(DeviceData, c, GetSize()); 364 | acroCudaErrorCheck(cudaPeekAtLastError()); 365 | #endif 366 | } 367 | } 368 | 369 | 370 | void Tensor::MapToGPU() 371 | { 372 | #ifdef ACRO_HAVE_CUDA 373 | ACROBATIC_ASSERT(!IsMappedToGPU(), "Trying to map data to the GPU a second time."); 374 | ensureCudaContext(); 375 | acroCudaErrorCheck(cudaMalloc((void**)&DeviceData, ByteSize)); 376 | MappedToGPU = true; 377 | #endif 378 | } 379 | 380 | void Tensor::MoveToGPU() 381 | { 382 | #ifdef ACRO_HAVE_CUDA 383 | if (!IsMappedToGPU()) 384 | { 385 | MapToGPU(); 386 | } 387 | if (!IsOnGPU()) 388 | { 389 | ensureCudaContext(); 390 | acroCudaErrorCheck(cudaMemcpy(DeviceData, Data, ByteSize, cudaMemcpyHostToDevice)); 391 | OnGPU = true; 392 | } 393 | #endif 394 | } 395 | 396 | void Tensor::SwitchToGPU() 397 | { 398 | #ifdef ACRO_HAVE_CUDA 399 | if (!IsMappedToGPU()) 400 | { 401 | MapToGPU(); 402 | } 403 | OnGPU = true; 404 | #endif 405 | } 406 | 407 | void Tensor::UnmapFromGPU() 408 | { 409 | #ifdef ACRO_HAVE_CUDA 410 | ACROBATIC_ASSERT(IsMappedToGPU(), "Can't unmap data that is not mapped to the GPU."); 411 | ensureCudaContext(); 412 | acroCudaErrorCheck(cudaFree(DeviceData)); 413 | MappedToGPU = false; 414 | OnGPU = false; 415 | #endif 416 | } 417 | 418 | void Tensor::MoveFromGPU() 419 | { 420 | #ifdef ACRO_HAVE_CUDA 421 | if (IsOnGPU()) 422 | { 423 | ensureCudaContext(); 424 | acroCudaErrorCheck(cudaMemcpy(Data, DeviceData, ByteSize, cudaMemcpyDeviceToHost)); 425 | OnGPU = false; 426 | } 427 | #endif 428 | } 429 | 430 | 431 | void Tensor::SwitchFromGPU() 432 | { 433 | #ifdef ACRO_HAVE_CUDA 434 | OnGPU = false; 435 | #endif 436 | } 437 | 438 | 439 | void Tensor::Print() 440 | { 441 | std::cout << "Dims: "; 442 | for (int d = 
0; d < Dims.size(); ++d) 443 | { 444 | std::cout << Dims[d] << " "; 445 | } 446 | std::cout << std::endl; 447 | 448 | std::cout << "Strides: "; 449 | for (int d = 0; d < Dims.size(); ++d) 450 | { 451 | std::cout << Strides[d] << " "; 452 | } 453 | std::cout << std::endl; 454 | 455 | for (int i = 0; i < GetSize(); ++i) 456 | { 457 | std::cout << Data[i] << std::endl; 458 | } 459 | std::cout << std::endl; 460 | } 461 | 462 | } 463 | -------------------------------------------------------------------------------- /tensor/Tensor.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_TENSOR_HPP 7 | #define ACROBATIC_TENSOR_HPP 8 | 9 | #include 10 | #include "Util.hpp" 11 | 12 | namespace acro 13 | { 14 | 15 | class Tensor 16 | { 17 | public: 18 | //Construct and empty tensor to be initilized later 19 | Tensor(); 20 | 21 | //Construct a tensor with the proper dimensions 22 | Tensor(std::vector &dims, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 23 | Tensor(int d0, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 24 | Tensor(int d0, int d1, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 25 | Tensor(int d0, int d1, int d2, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 26 | Tensor(int d0, int d1, int d2, int d3, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 27 | Tensor(int d0, int d1, int d2, int d3, int d4, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 28 | Tensor(int d0, int d1, int d2, int d3, int d4, int d5, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 29 | Tensor(int d0, int d1, int d2, int d3, int d4, int d5, int d6, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 30 | Tensor(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 31 | Tensor(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 32 | 33 | void Init(std::vector &dims, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 34 | void Init(int d0, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 35 | void Init(int d0, int d1, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 36 | void Init(int d0, int d1, int d2, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 37 | void Init(int d0, int d1, int d2, int d3, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 38 | void Init(int d0, int d1, int d2, int d3, int d4, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 39 | void Init(int d0, int d1, int d2, int d3, int d4, int d5, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 40 | void Init(int d0, int d1, int d2, int d3, int d4, int d5, int d6, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 41 | void Init(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 42 | void Init(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8, double *hdata=nullptr, double 
*ddata=nullptr, bool ongpu=false); 43 | 44 | ~Tensor(); 45 | 46 | //Simple index into data 47 | inline double &operator[](int raw_index); 48 | 49 | //Get the simple raw linear index from the tensor indices 50 | inline int GetRawIndex(const std::vector &indices); 51 | inline int GetRawIndex(int i0); 52 | inline int GetRawIndex(int i0, int i1); 53 | inline int GetRawIndex(int i0, int i1, int i2); 54 | inline int GetRawIndex(int i0, int i1, int i2, int i3); 55 | inline int GetRawIndex(int i0, int i1, int i2, int i3, int i4); 56 | inline int GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5); 57 | inline int GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5, int i6); 58 | inline int GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7); 59 | inline int GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8); 60 | 61 | //Tensor index into the data 62 | inline double &operator()(std::vector &indices); 63 | inline double &operator()(int i0); 64 | inline double &operator()(int i0, int i1); 65 | inline double &operator()(int i0, int i1, int i2); 66 | inline double &operator()(int i0, int i1, int i2, int i3); 67 | inline double &operator()(int i0, int i1, int i2, int i3, int i4); 68 | inline double &operator()(int i0, int i1, int i2, int i3, int i4, int i5); 69 | inline double &operator()(int i0, int i1, int i2, int i3, int i4, int i5, int i6); 70 | inline double &operator()(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7); 71 | inline double &operator()(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8); 72 | 73 | //Change the dimensions of the tensor without reorganizing the data representation 74 | void Reshape(std::vector &dims); 75 | void Reshape(int d0); 76 | void Reshape(int d0, int d1); 77 | void Reshape(int d0, int d1, int d2); 78 | void Reshape(int d0, int d1, int d2, int d3); 79 | void Reshape(int d0, int d1, int d2, int d3, int d4); 80 | void Reshape(int d0, int d1, int d2, int d3, int d4, int d5); 81 | void Reshape(int d0, int d1, int d2, int d3, int d4, int d5, int d6); 82 | void Reshape(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7); 83 | void Reshape(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int i8); 84 | 85 | inline int GetRank() const; 86 | inline int GetSize() const ; 87 | inline int GetDim(int d) const ; 88 | inline int GetStride(int d) const; 89 | virtual double *GetData() const; 90 | virtual double *GetDeviceData() const; 91 | inline double *GetCurrentData() const; 92 | 93 | //Change where externally owned data is pointing 94 | virtual void Retarget(double *hdata, double*ddata=nullptr); 95 | 96 | //Routines for Data on the GPU 97 | virtual void MapToGPU(); //Allocate memory for the data on the GPU 98 | virtual void MoveToGPU(); //Copy the data to the GPU and flag the data as currently on the GPU 99 | virtual void SwitchToGPU(); //Flag the data as currently onGPU 100 | virtual void UnmapFromGPU(); //Deallocate memory on the GPU 101 | virtual void MoveFromGPU(); //Copy the data back from the GPU and flag the data as currently on the CPU 102 | virtual void SwitchFromGPU(); //Flag the data as currently on the CPU 103 | virtual bool IsMappedToGPU() const {return MappedToGPU;} 104 | virtual bool IsOnGPU() const {return OnGPU;} 105 | virtual bool IsInitialized() const {return Initialized;} 106 | 107 | void Set(double val); //Sets all values in the tensor to a constant 108 | void Mult(double c); //Multiply all values by a constant 109 | 110 | 
void Print(); 111 | 112 | protected: 113 | void UpdateStrides(); 114 | void ComputeSize(); 115 | 116 | std::vector Dims; 117 | std::vector Strides; 118 | int Size; 119 | int ByteSize; 120 | 121 | bool Initialized; 122 | bool OwnsData; 123 | bool MappedToGPU; 124 | bool OnGPU; 125 | double *Data; 126 | double *DeviceData; 127 | }; 128 | 129 | 130 | inline int Tensor::GetRank() const 131 | { 132 | return Dims.size(); 133 | } 134 | 135 | 136 | inline int Tensor::GetSize() const 137 | { 138 | return Size; 139 | } 140 | 141 | 142 | inline int Tensor::GetDim(int d) const 143 | { 144 | return Dims[d]; 145 | } 146 | 147 | 148 | inline int Tensor::GetStride(int d) const 149 | { 150 | return Strides[d]; 151 | } 152 | 153 | 154 | inline double *Tensor::GetData() const 155 | { 156 | return Data; 157 | } 158 | 159 | 160 | inline double *Tensor::GetDeviceData() const 161 | { 162 | return DeviceData; 163 | } 164 | 165 | 166 | inline double *Tensor::GetCurrentData() const 167 | { 168 | return (IsOnGPU()) ? DeviceData : Data; 169 | } 170 | 171 | 172 | 173 | inline int Tensor::GetRawIndex(const std::vector &indices) 174 | { 175 | int index = 0; 176 | for (unsigned int d = 0; d < indices.size(); ++d) 177 | { 178 | index += Strides[d] * indices[d]; 179 | } 180 | return index; 181 | } 182 | 183 | 184 | inline int Tensor::GetRawIndex(int i0) 185 | { 186 | return Strides[0]*i0; 187 | } 188 | 189 | 190 | inline int Tensor::GetRawIndex(int i0, int i1) 191 | { 192 | return Strides[0]*i0 + Strides[1]*i1; 193 | } 194 | 195 | 196 | inline int Tensor::GetRawIndex(int i0, int i1, int i2) 197 | { 198 | return Strides[0]*i0 + Strides[1]*i1 + Strides[2]*i2; 199 | } 200 | 201 | 202 | inline int Tensor::GetRawIndex(int i0, int i1, int i2, int i3) 203 | { 204 | return Strides[0]*i0 + Strides[1]*i1 + Strides[2]*i2 + Strides[3]*i3; 205 | } 206 | 207 | 208 | inline int Tensor::GetRawIndex(int i0, int i1, int i2, int i3, int i4) 209 | { 210 | return Strides[0]*i0 + Strides[1]*i1 + Strides[2]*i2 + Strides[3]*i3 + 211 | Strides[4]*i4; 212 | } 213 | 214 | 215 | inline int Tensor::GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5) 216 | { 217 | return Strides[0]*i0 + Strides[1]*i1 + Strides[2]*i2 + Strides[3]*i3 + 218 | Strides[4]*i4 + Strides[5]*i5; 219 | } 220 | 221 | 222 | inline int Tensor::GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5, int i6) 223 | { 224 | return Strides[0]*i0 + Strides[1]*i1 + Strides[2]*i2 + Strides[3]*i3 + 225 | Strides[4]*i4 + Strides[5]*i5 + Strides[6]*i6; 226 | } 227 | 228 | 229 | inline int Tensor::GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7) 230 | { 231 | return Strides[0]*i0 + Strides[1]*i1 + Strides[2]*i2 + Strides[3]*i3 + 232 | Strides[4]*i4 + Strides[5]*i5 + Strides[6]*i6 + Strides[7]*i7; 233 | } 234 | 235 | 236 | inline int Tensor::GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8) 237 | { 238 | return Strides[0]*i0 + Strides[1]*i1 + Strides[2]*i2 + Strides[3]*i3 + 239 | Strides[4]*i4 + Strides[5]*i5 + Strides[6]*i6 + Strides[7]*i7 + 240 | Strides[8]*i8; 241 | } 242 | 243 | 244 | inline double &Tensor::operator()(std::vector &indices) 245 | { 246 | #if DEBUG 247 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 248 | #endif 249 | return Data[GetRawIndex(indices)]; 250 | } 251 | 252 | inline double &Tensor::operator()(int i0) 253 | { 254 | #if DEBUG 255 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 256 | 
#endif 257 | return Data[GetRawIndex(i0)]; 258 | } 259 | 260 | 261 | inline double &Tensor::operator()(int i0, int i1) 262 | { 263 | #if DEBUG 264 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 265 | #endif 266 | return Data[GetRawIndex(i0, i1)]; 267 | } 268 | 269 | 270 | inline double &Tensor::operator[](int raw_index) 271 | { 272 | #if DEBUG 273 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 274 | #endif 275 | return Data[raw_index]; 276 | } 277 | 278 | inline double &Tensor::operator()(int i0, int i1, int i2) 279 | { 280 | #if DEBUG 281 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 282 | #endif 283 | return Data[GetRawIndex(i0, i1, i2)]; 284 | } 285 | 286 | 287 | inline double &Tensor::operator()(int i0, int i1, int i2, int i3) 288 | { 289 | #if DEBUG 290 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 291 | #endif 292 | return Data[GetRawIndex(i0, i1, i2, i3)]; 293 | } 294 | 295 | 296 | inline double &Tensor::operator()(int i0, int i1, int i2, int i3, int i4) 297 | { 298 | #if DEBUG 299 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 300 | #endif 301 | return Data[GetRawIndex(i0, i1, i2, i3, i4)]; 302 | } 303 | 304 | 305 | inline double &Tensor::operator()(int i0, int i1, int i2, int i3, int i4, int i5) 306 | { 307 | #if DEBUG 308 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 309 | #endif 310 | return Data[GetRawIndex(i0, i1, i2, i3, i4, i5)]; 311 | } 312 | 313 | 314 | inline double &Tensor::operator()(int i0, int i1, int i2, int i3, int i4, int i5, int i6) 315 | { 316 | #if DEBUG 317 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 318 | #endif 319 | return Data[GetRawIndex(i0, i1, i2, i3, i4, i5, i6)]; 320 | } 321 | 322 | 323 | inline double &Tensor::operator()(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7) 324 | { 325 | #if DEBUG 326 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 327 | #endif 328 | return Data[GetRawIndex(i0, i1, i2, i3, i4, i5, i6, i7)]; 329 | } 330 | 331 | 332 | inline double &Tensor::operator()(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8) 333 | { 334 | #if DEBUG 335 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 336 | #endif 337 | return Data[GetRawIndex(i0, i1, i2, i3, i4, i5, i6, i7, i8)]; 338 | } 339 | 340 | } 341 | 342 | #endif //ACROBATIC_TENSOR_HPP 343 | -------------------------------------------------------------------------------- /unittest/kernel/test_DimensionedKernel.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
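//These tests check DimensionedKernel's loop-extent bookkeeping: GetFlatIdxSize() is the
//product of all index extents and factors as GetOutIdxSize()*GetContIdxSize()
//(e.g. 9 = 3*3 for "A_i=B_s_iC_i_s" on 3x3 inputs, and 175781250 = 156250*1125 for the
//large multi-index kernel below).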
5 | 6 | #include "catch.hpp" 7 | #include "DimensionedKernel.hpp" 8 | 9 | using namespace acro; 10 | 11 | 12 | TEST_CASE("DimensionedKernel operations", "[DimensionedKernel]") 13 | { 14 | Tensor T1out_3(3), T2out_3_3(3, 3), T1_3(3), T1_2(2), T2_3_3(3,3); 15 | 16 | 17 | SECTION("A_i=B_iC_i") 18 | { 19 | TensorKernel Kernel("A_i=B_iC_i"); 20 | std::vector inputs; 21 | inputs.push_back(&T1_3); 22 | inputs.push_back(&T1_3); 23 | DimensionedKernel DKernel(&Kernel, &T1out_3, inputs); 24 | REQUIRE(DKernel.GetFlatIdxSize() == 3); 25 | REQUIRE(DKernel.GetOutIdxSize() == 3); 26 | REQUIRE(DKernel.GetContIdxSize() == 1); 27 | REQUIRE(DKernel.GetOutputStorageReqForInnerLoops(1) == 3); 28 | REQUIRE(DKernel.GetInputStorageReqForInnerLoops(1) == 6); 29 | } 30 | 31 | SECTION("A_i=B_s_iC_i_s") 32 | { 33 | TensorKernel Kernel("A_i=B_s_iC_i_s"); 34 | std::vector inputs; 35 | inputs.push_back(&T2_3_3); 36 | inputs.push_back(&T2_3_3); 37 | DimensionedKernel DKernel(&Kernel, &T1out_3, inputs); 38 | REQUIRE(DKernel.GetFlatIdxSize() == 9); 39 | REQUIRE(DKernel.GetOutIdxSize() == 3); 40 | REQUIRE(DKernel.GetContIdxSize() == 3); 41 | REQUIRE(DKernel.GetOutputStorageReqForInnerLoops(1) == 1); 42 | REQUIRE(DKernel.GetOutputStorageReqForInnerLoops(2) == 3); 43 | REQUIRE(DKernel.GetInputStorageReqForInnerLoops(1) == 6); 44 | REQUIRE(DKernel.GetInputStorageReqForInnerLoops(2) == 18); 45 | } 46 | 47 | SECTION("A_i=B_s_iC_i") 48 | { 49 | TensorKernel Kernel("A_i=B_s_iC_i"); 50 | std::vector inputs; 51 | inputs.push_back(&T2_3_3); 52 | inputs.push_back(&T1_3); 53 | DimensionedKernel DKernel(&Kernel, &T1out_3, inputs); 54 | REQUIRE(DKernel.GetFlatIdxSize() == 9); 55 | REQUIRE(DKernel.GetOutIdxSize() == 3); 56 | REQUIRE(DKernel.GetContIdxSize() == 3); 57 | REQUIRE(DKernel.GetOutputStorageReqForInnerLoops(1) == 1); 58 | REQUIRE(DKernel.GetOutputStorageReqForInnerLoops(2) == 3); 59 | REQUIRE(DKernel.GetInputStorageReqForInnerLoops(1) == 4); 60 | REQUIRE(DKernel.GetInputStorageReqForInnerLoops(2) == 12); 61 | } 62 | 63 | SECTION("S_e_i1_i2_i3_j1_j2_j3=B_i1_j1_k1_m_nB_i2_j2_k2_m_nB_i3_j3_k3_m_nD_e_k1_k2_k3_m_n") 64 | { 65 | std::string kernel_str = "S_e_i1_i2_i3_j1_j2_j3 =B_i1_j1_k1_m_nB_i2_j2_k2_m_nB_i3_j3_k3_m_n D_e_k1_k2_k3_m_n"; 66 | TensorKernel Kernel(kernel_str); 67 | Tensor S(10, 5, 5, 5, 5, 5, 5); 68 | Tensor Btilde1(5, 5, 5, 3, 3); 69 | Tensor Btilde2(5, 5, 5, 3, 3); 70 | Tensor Btilde3(5, 5, 5, 3, 3); 71 | Tensor D(10, 5, 5, 5, 3, 3); 72 | std::vector inputs = {&Btilde1, &Btilde2, &Btilde3, &D}; 73 | DimensionedKernel DKernel(&Kernel, &S, inputs); 74 | REQUIRE(DKernel.GetFlatIdxSize() == 175781250); 75 | REQUIRE(DKernel.GetOutIdxSize() == 156250); 76 | REQUIRE(DKernel.GetContIdxSize() == 1125); 77 | } 78 | } -------------------------------------------------------------------------------- /unittest/kernel/test_TensorKernel.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
5 | 6 | #include "catch.hpp" 7 | #include "TensorKernel.hpp" 8 | 9 | using namespace acro; 10 | 11 | std::string reconstruct_kernel_str(TensorKernel &Kernel); 12 | 13 | TEST_CASE("TensorKernel operations", "[TensorKernel]") 14 | { 15 | SECTION("Assert Parsable String") 16 | { 17 | REQUIRE_THROWS(new TensorKernel("Blah")); 18 | REQUIRE_THROWS(new TensorKernel("Blah=")); 19 | REQUIRE_THROWS(new TensorKernel("Blah=Alah")); 20 | REQUIRE_THROWS(new TensorKernel("BLA1h=Alah")); 21 | REQUIRE_NOTHROW(new TensorKernel("B_lah=A_lah")); 22 | REQUIRE_THROWS(new TensorKernel("B_lah=A_lah_")); 23 | REQUIRE_THROWS(new TensorKernel("B_lah_=A_lah")); 24 | REQUIRE_THROWS(new TensorKernel("a_lah_=A_lah")); 25 | } 26 | 27 | SECTION("Can Reconstruct Various Kernels") 28 | { 29 | REQUIRE(reconstruct_kernel_str(*(new TensorKernel("BA_i=A_j"))) == "BA_i=A_j"); 30 | REQUIRE(reconstruct_kernel_str(*(new TensorKernel("B1_i=CB_jBr_j"))) == "B1_i=CB_jBr_j"); 31 | } 32 | 33 | SECTION("A_i=B_iC_i") 34 | { 35 | TensorKernel Kernel("A_i=B_iC_i"); 36 | 37 | SECTION("Basic Parsing") 38 | { 39 | REQUIRE(reconstruct_kernel_str(Kernel) == "A_i=B_iC_i"); 40 | REQUIRE(Kernel.AllIndexNames.size() == 1); 41 | REQUIRE(Kernel.AllIndexNames[0] == "i"); 42 | REQUIRE(Kernel.ContractionIndexNames.size() == 0); 43 | 44 | REQUIRE(Kernel.GetNumIndices() == 1); 45 | REQUIRE(Kernel.GetNumContractionIndices() == 0); 46 | REQUIRE(Kernel.GetNumVars() == 3); 47 | REQUIRE(Kernel.GetNumInputVars() == 2); 48 | REQUIRE(Kernel.GetVarDimLoopNum(0, 0) == 0); 49 | REQUIRE(Kernel.GetVarDimLoopNum(1, 0) == 0); 50 | REQUIRE(Kernel.IsVarDependentOnLoop(-1, 0)); 51 | REQUIRE(Kernel.IsVarDependentOnLoop(0, 0)); 52 | REQUIRE(Kernel.IsVarDependentOnLoop(1, 0)); 53 | } 54 | } 55 | 56 | SECTION("A_i=B_s_iC_i_s") 57 | { 58 | TensorKernel Kernel("A_i=B_s_iC_i_s"); 59 | SECTION("Basic Parsing") 60 | { 61 | REQUIRE(reconstruct_kernel_str(Kernel) == "A_i=B_s_iC_i_s"); 62 | REQUIRE(Kernel.AllIndexNames.size() == 2); 63 | REQUIRE(Kernel.AllIndexNames[0] == "i"); 64 | REQUIRE(Kernel.AllIndexNames[1] == "s"); 65 | REQUIRE(Kernel.ContractionIndexNames.size() == 1); 66 | REQUIRE(Kernel.ContractionIndexNames[0] == "s"); 67 | 68 | REQUIRE(Kernel.GetNumIndices() == 2); 69 | REQUIRE(Kernel.GetNumContractionIndices() == 1); 70 | REQUIRE(Kernel.GetNumVars() == 3); 71 | REQUIRE(Kernel.GetNumInputVars() == 2); 72 | REQUIRE(Kernel.GetVarDimLoopNum(0, 0) == 1); 73 | REQUIRE(Kernel.GetVarDimLoopNum(0, 1) == 0); 74 | REQUIRE(Kernel.GetVarDimLoopNum(1, 0) == 0); 75 | REQUIRE(Kernel.GetVarDimLoopNum(1, 1) == 1); 76 | REQUIRE(Kernel.IsVarDependentOnLoop(-1, 0)); 77 | REQUIRE(!Kernel.IsVarDependentOnLoop(-1, 1)); 78 | REQUIRE(Kernel.IsVarDependentOnLoop(0, 0)); 79 | REQUIRE(Kernel.IsVarDependentOnLoop(0, 1)); 80 | REQUIRE(Kernel.IsVarDependentOnLoop(1, 0)); 81 | REQUIRE(Kernel.IsVarDependentOnLoop(1, 1)); 82 | } 83 | } 84 | 85 | SECTION("A_i=B_s_iC_i") 86 | { 87 | TensorKernel Kernel("A_i=B_s_iC_i"); 88 | SECTION("Basic Parsing") 89 | { 90 | REQUIRE(reconstruct_kernel_str(Kernel) == "A_i=B_s_iC_i"); 91 | REQUIRE(Kernel.AllIndexNames.size() == 2); 92 | REQUIRE(Kernel.AllIndexNames[0] == "i"); 93 | REQUIRE(Kernel.AllIndexNames[1] == "s"); 94 | REQUIRE(Kernel.ContractionIndexNames.size() == 1); 95 | REQUIRE(Kernel.ContractionIndexNames[0] == "s"); 96 | 97 | REQUIRE(Kernel.GetNumIndices() == 2); 98 | REQUIRE(Kernel.GetNumContractionIndices() == 1); 99 | REQUIRE(Kernel.GetNumVars() == 3); 100 | REQUIRE(Kernel.GetNumInputVars() == 2); 101 | REQUIRE(Kernel.GetVarDimLoopNum(0, 0) == 1); 102 | 
REQUIRE(Kernel.GetVarDimLoopNum(0, 1) == 0); 103 | REQUIRE(Kernel.GetVarDimLoopNum(1, 0) == 0); 104 | REQUIRE(Kernel.IsVarDependentOnLoop(-1, 0)); 105 | REQUIRE(!Kernel.IsVarDependentOnLoop(-1, 1)); 106 | REQUIRE(Kernel.IsVarDependentOnLoop(0, 0)); 107 | REQUIRE(Kernel.IsVarDependentOnLoop(0, 1)); 108 | REQUIRE(Kernel.IsVarDependentOnLoop(1, 0)); 109 | REQUIRE(!Kernel.IsVarDependentOnLoop(1, 1)); 110 | } 111 | } 112 | 113 | SECTION("A_i1_i2=B_i1_i2_sum1") 114 | { 115 | TensorKernel Kernel("A_i1_i2=B_i1_i2_sum1"); 116 | SECTION("Basic Parsing") 117 | { 118 | REQUIRE(reconstruct_kernel_str(Kernel) == "A_i1_i2=B_i1_i2_sum1"); 119 | REQUIRE(Kernel.AllIndexNames.size() == 3); 120 | REQUIRE(Kernel.AllIndexNames[0] == "i1"); 121 | REQUIRE(Kernel.AllIndexNames[1] == "i2"); 122 | REQUIRE(Kernel.AllIndexNames[2] == "sum1"); 123 | REQUIRE(Kernel.ContractionIndexNames.size() == 1); 124 | REQUIRE(Kernel.ContractionIndexNames[0] == "sum1"); 125 | } 126 | } 127 | 128 | SECTION("S_e_i1_i2_i3_j1_j2_j3=B_i1_j1_k1_m_nB_i2_j2_k2_m_nB_i3_j3_k3_m_nD_e_k1_k2_k3_m_n") 129 | { 130 | std::string kernel_str = "S_e_i1_i2_i3_j1_j2_j3 =B_i1_j1_k1_m_nB_i2_j2_k2_m_nB_i3_j3_k3_m_n D_e_k1_k2_k3_m_n"; 131 | TensorKernel Kernel(kernel_str); 132 | SECTION("Basic Parsing") 133 | { 134 | REQUIRE(Kernel.AllIndexNames.size() == 12); 135 | REQUIRE(Kernel.AllIndexNames[0] == "e"); 136 | REQUIRE(Kernel.AllIndexNames[1] == "i1"); 137 | REQUIRE(Kernel.AllIndexNames[2] == "i2"); 138 | REQUIRE(Kernel.AllIndexNames[3] == "i3"); 139 | REQUIRE(Kernel.AllIndexNames[4] == "j1"); 140 | REQUIRE(Kernel.AllIndexNames[5] == "j2"); 141 | REQUIRE(Kernel.AllIndexNames[6] == "j3"); 142 | REQUIRE(Kernel.AllIndexNames[7] == "k1"); 143 | REQUIRE(Kernel.AllIndexNames[8] == "m"); 144 | REQUIRE(Kernel.AllIndexNames[9] == "n"); 145 | REQUIRE(Kernel.AllIndexNames[10] == "k2"); 146 | REQUIRE(Kernel.AllIndexNames[11] == "k3"); 147 | 148 | REQUIRE(Kernel.ContractionIndexNames.size() == 5); 149 | REQUIRE(Kernel.ContractionIndexNames[0] == "k1"); 150 | REQUIRE(Kernel.ContractionIndexNames[1] == "m"); 151 | REQUIRE(Kernel.ContractionIndexNames[2] == "n"); 152 | REQUIRE(Kernel.ContractionIndexNames[3] == "k2"); 153 | REQUIRE(Kernel.ContractionIndexNames[4] == "k3"); 154 | 155 | REQUIRE(Kernel.GetNumIndices() == 12); 156 | REQUIRE(Kernel.GetNumContractionIndices() == 5); 157 | REQUIRE(Kernel.GetNumVars() == 5); 158 | REQUIRE(Kernel.GetNumInputVars() == 4); 159 | } 160 | } 161 | } 162 | 163 | std::string reconstruct_kernel_str(TensorKernel &Kernel) 164 | { 165 | std::string str; 166 | str += Kernel.OutputVar.Name; 167 | for (int d = 0; d < Kernel.OutputVar.IndexNames.size(); ++d) 168 | { 169 | str += "_"; 170 | str += Kernel.OutputVar.IndexNames[d]; 171 | } 172 | 173 | str += Kernel.EqOperator; 174 | 175 | for (int vari = 0; vari < Kernel.InputVars.size(); ++vari) 176 | { 177 | str += Kernel.InputVars[vari].Name; 178 | for (int d = 0; d < Kernel.InputVars[vari].IndexNames.size(); ++d) 179 | { 180 | str += "_"; 181 | str += Kernel.InputVars[vari].IndexNames[d]; 182 | } 183 | } 184 | return str; 185 | } 186 | -------------------------------------------------------------------------------- /unittest/makefile: -------------------------------------------------------------------------------- 1 | #Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | #Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | #All rights reserved. 4 | #This file is part of Acrotensor. 
For details, see https://github.com/LLNL/acrotensor. 5 | 6 | # Serial compiler 7 | ACRO_BASEDIR = .. 8 | 9 | CXX = clang++ 10 | CXX_FLAGS = -O3 -fopenmp=libomp -std=c++11 -stdlib=libc++ 11 | CCC = $(CXX) $(CXX_FLAGS) 12 | LD_FLAGS = -Wl,-rpath,$(ACRO_BASEDIR) 13 | 14 | DIRS = exec tensor util kernel 15 | SRCDIRS = $(foreach dir,$(DIRS),$(ACRO_BASEDIR)/$(dir)) 16 | INCLUDES = -I. $(foreach dir,$(SRCDIRS),-I$(dir)) -I../inc 17 | 18 | SOURCE_FILES = unit_test_main.cpp $(sort $(wildcard ./*/*.cpp)) 19 | HEADER_FILES = catch.hpp 20 | OBJECT_FILES = $(SOURCE_FILES:.cpp=.o) 21 | ACRO_LIB = $(ACRO_BASEDIR)/lib/shared/libacrotensor.so 22 | 23 | .SUFFIXES: .cpp .o 24 | .cpp.o: 25 | $(CCC) -c $( 9 | using namespace acro; 10 | 11 | 12 | TEST_CASE("Basic SliceTensor unit tests", "[SliceTensor]") 13 | { 14 | SECTION("Prefixed sliced indexing works") 15 | { 16 | Tensor T(2, 3, 4); 17 | SliceTensor S0(T, 0); 18 | SliceTensor S1(T, 1); 19 | SliceTensor S00(T, 0, 0); 20 | SliceTensor S01(T, 0, 1); 21 | SliceTensor S02(T, 0, 2); 22 | SliceTensor S10(T, 1, 0); 23 | SliceTensor S11(T, 1, 1); 24 | SliceTensor S12(T, 1, 2); 25 | SliceTensor S20(T, 2, 0); 26 | SliceTensor S21(T, 2, 1); 27 | SliceTensor S22(T, 2, 2); 28 | 29 | REQUIRE(S0.GetRank() == 2); 30 | REQUIRE(S1.GetRank() == 2); 31 | REQUIRE(S00.GetRank() == 1); 32 | REQUIRE(S01.GetRank() == 1); 33 | REQUIRE(S10.GetRank() == 1); 34 | REQUIRE(S11.GetRank() == 1); 35 | 36 | REQUIRE(S0.GetSize() == 12); 37 | REQUIRE(S1.GetSize() == 12); 38 | REQUIRE(S00.GetSize() == 4); 39 | REQUIRE(S01.GetSize() == 4); 40 | REQUIRE(S10.GetSize() == 4); 41 | REQUIRE(S11.GetSize() == 4); 42 | 43 | REQUIRE(S0.GetDim(0) == 3); 44 | REQUIRE(S1.GetDim(0) == 3); 45 | REQUIRE(S0.GetDim(1) == 4); 46 | REQUIRE(S1.GetDim(1) == 4); 47 | REQUIRE(S00.GetDim(0) == 4); 48 | REQUIRE(S01.GetDim(0) == 4); 49 | REQUIRE(S10.GetDim(0) == 4); 50 | REQUIRE(S11.GetDim(0) == 4); 51 | 52 | REQUIRE(S0.GetStride(0) == 4); 53 | REQUIRE(S1.GetStride(0) == 4); 54 | REQUIRE(S0.GetStride(1) == 1); 55 | REQUIRE(S1.GetStride(1) == 1); 56 | REQUIRE(S00.GetStride(0) == 1); 57 | REQUIRE(S01.GetStride(0) == 1); 58 | REQUIRE(S10.GetStride(0) == 1); 59 | REQUIRE(S11.GetStride(0) == 1); 60 | 61 | for (int idx = 0; idx < 24; ++idx) 62 | { 63 | T[idx] = idx; 64 | } 65 | 66 | for (int k = 0; k < 4; ++k) 67 | { 68 | for (int j = 0; j < 3; ++j) 69 | { 70 | REQUIRE(S0(j, k) == T(0, j, k)); 71 | REQUIRE(S1(j, k) == T(1, j, k)); 72 | } 73 | REQUIRE(S00(k) == T(0,0,k)); 74 | REQUIRE(S01(k) == T(0,1,k)); 75 | REQUIRE(S02(k) == T(0,2,k)); 76 | REQUIRE(S10(k) == T(1,0,k)); 77 | REQUIRE(S11(k) == T(1,1,k)); 78 | REQUIRE(S12(k) == T(1,2,k)); 79 | REQUIRE(S20(k) == T(2,0,k)); 80 | REQUIRE(S21(k) == T(2,1,k)); 81 | REQUIRE(S22(k) == T(2,2,k)); 82 | } 83 | } 84 | 85 | SECTION("Prefixed sliced Set Method") 86 | { 87 | Tensor T(2, 3, 4); 88 | SliceTensor S0(T, 0); 89 | SliceTensor S1(T, 1); 90 | 91 | S0.Set(1.0); 92 | S1.Set(2.0); 93 | for (int j = 0; j < 3; ++ j) 94 | { 95 | for (int k = 0; k < 4; ++ k) 96 | { 97 | REQUIRE(T(0,j,k) == Approx(1.0)); 98 | REQUIRE(T(1,j,k) == Approx(2.0)); 99 | REQUIRE(S0(j,k) == Approx(1.0)); 100 | REQUIRE(S1(j,k) == Approx(2.0)); 101 | } 102 | } 103 | } 104 | 105 | SECTION("Prefixed sliced tensor Set Method on GPU") 106 | { 107 | if (isCudaReady()) 108 | { 109 | Tensor T(2, 10, 4, 4); 110 | T.SwitchToGPU(); 111 | SliceTensor S0(T, 0); 112 | SliceTensor S1(T, 1); 113 | 114 | S0.Set(1.0); 115 | S1.Set(2.0); 116 | T.MoveFromGPU(); 117 | for (int i = 0; i < 10; ++ i) 118 | { 119 | for (int j = 0; j < 4; ++ 
j) 120 | { 121 | for (int k = 0; k < 4; ++ k) 122 | { 123 | REQUIRE(T(0,i,j,k) == Approx(1.0)); 124 | REQUIRE(T(1,i,j,k) == Approx(2.0)); 125 | REQUIRE(S0(i,j,k) == Approx(1.0)); 126 | REQUIRE(S1(i,j,k) == Approx(2.0)); 127 | } 128 | } 129 | } 130 | } 131 | } 132 | 133 | SECTION("GPU Move Semantics") 134 | { 135 | if (isCudaReady()) 136 | { 137 | Tensor T(2, 3); 138 | T.MapToGPU(); 139 | double *t_cpu = T.GetData(); 140 | double *t_gpu = T.GetDeviceData(); 141 | CHECK(t_cpu != t_gpu); 142 | CHECK(T.GetCurrentData() == t_cpu); 143 | 144 | SliceTensor S(T, 0); 145 | double *s_cpu = S.GetData(); 146 | double *s_gpu = S.GetDeviceData(); 147 | CHECK(s_cpu != s_gpu); 148 | CHECK(S.GetCurrentData() == s_cpu); 149 | CHECK(S.IsMappedToGPU()); 150 | 151 | S.MoveToGPU(); 152 | CHECK(T.IsOnGPU()); 153 | CHECK(S.IsOnGPU()); 154 | CHECK(T.GetCurrentData() == t_gpu); 155 | CHECK(S.GetCurrentData() == s_gpu); 156 | 157 | S.Set(2.0); 158 | T.MoveFromGPU(); 159 | CHECK(!T.IsOnGPU()); 160 | CHECK(!S.IsOnGPU()); 161 | CHECK(T.GetCurrentData() == t_cpu); 162 | CHECK(S.GetCurrentData() == s_cpu); 163 | for (int i = 0; i < S.GetSize(); ++i) 164 | { 165 | CHECK(S[i] == Approx(2.0)); 166 | } 167 | } 168 | } 169 | } -------------------------------------------------------------------------------- /unittest/tensor/test_Tensor.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #include "catch.hpp" 7 | #include "AcroTensor.hpp" 8 | #include 9 | using namespace acro; 10 | 11 | 12 | TEST_CASE("Basic Tensor unit tests", "[Tensor]") 13 | { 14 | Tensor T1a(10), T2a(5, 6), T3a(4, 3, 1), T4a(5, 1, 3, 2), T5a(1, 1, 1, 2, 3); 15 | std::vector dims = {1, 4, 3, 2}; 16 | Tensor Tdims(dims); 17 | 18 | SECTION("Assert dims > 0") 19 | { 20 | REQUIRE_NOTHROW(new Tensor(1)); 21 | REQUIRE_THROWS(new Tensor(-1)); 22 | REQUIRE_THROWS(new Tensor(10, 0)); 23 | REQUIRE_THROWS(new Tensor(-1, 10)); 24 | REQUIRE_THROWS(new Tensor(10, -1, 10)); 25 | 26 | std::vector empty_dims; 27 | REQUIRE_THROWS(new Tensor(empty_dims)); 28 | 29 | std::vector bogus_dims = {1, 2, 3, 4, 5, 6, 7, -100}; 30 | REQUIRE_THROWS(new Tensor(bogus_dims)); 31 | 32 | REQUIRE_NOTHROW(new Tensor(dims)); 33 | } 34 | 35 | SECTION("Dimensions set properly") 36 | { 37 | SECTION("Ranks") 38 | { 39 | REQUIRE(T1a.GetRank() == 1); 40 | REQUIRE(T2a.GetRank() == 2); 41 | REQUIRE(T3a.GetRank() == 3); 42 | REQUIRE(T4a.GetRank() == 4); 43 | REQUIRE(T5a.GetRank() == 5); 44 | REQUIRE(Tdims.GetRank() == 4); 45 | } 46 | 47 | SECTION("Dims") 48 | { 49 | REQUIRE(T1a.GetDim(0) == 10); 50 | REQUIRE(T2a.GetDim(0) == 5); 51 | REQUIRE(T2a.GetDim(1) == 6); 52 | REQUIRE(T3a.GetDim(0) == 4); 53 | REQUIRE(T3a.GetDim(1) == 3); 54 | REQUIRE(T3a.GetDim(2) == 1); 55 | REQUIRE(T4a.GetDim(0) == 5); 56 | REQUIRE(T4a.GetDim(1) == 1); 57 | REQUIRE(T4a.GetDim(2) == 3); 58 | REQUIRE(T4a.GetDim(3) == 2); 59 | REQUIRE(T5a.GetDim(0) == 1); 60 | REQUIRE(T5a.GetDim(1) == 1); 61 | REQUIRE(T5a.GetDim(2) == 1); 62 | REQUIRE(T5a.GetDim(3) == 2); 63 | REQUIRE(T5a.GetDim(4) == 3); 64 | REQUIRE(Tdims.GetDim(0) == 1); 65 | REQUIRE(Tdims.GetDim(1) == 4); 66 | REQUIRE(Tdims.GetDim(2) == 3); 67 | REQUIRE(Tdims.GetDim(3) == 2); 68 | } 69 | 70 | SECTION("Sizes") 71 | { 72 | 
REQUIRE(T1a.GetSize() == 10); 73 | REQUIRE(T2a.GetSize() == 30); 74 | REQUIRE(T3a.GetSize() == 12); 75 | REQUIRE(T4a.GetSize() == 30); 76 | REQUIRE(T5a.GetSize() == 6); 77 | REQUIRE(Tdims.GetSize() == 24); 78 | } 79 | } 80 | 81 | SECTION("Index Space Covered") 82 | { 83 | std::vector covered(T4a.GetSize(), false); 84 | for (int i = 0; i < T4a.GetDim(0); ++i) 85 | { 86 | for (int j = 0; j < T4a.GetDim(1); ++j) 87 | { 88 | for (int k = 0; k < T4a.GetDim(2); ++k) 89 | { 90 | for (int l = 0; l < T4a.GetDim(3); ++l) 91 | { 92 | int raw_index = T4a.GetRawIndex(i,j,k,l); 93 | REQUIRE(raw_index >= 0); 94 | REQUIRE(raw_index < T4a.GetSize()); 95 | covered[raw_index] = true; 96 | } 97 | } 98 | } 99 | } 100 | 101 | for (int raw_index = 0; raw_index < T4a.GetSize(); ++raw_index) 102 | { 103 | REQUIRE(covered[raw_index]); 104 | } 105 | } 106 | 107 | SECTION("Accessing the Data") 108 | { 109 | T1a.Set(0.0); 110 | T2a.Set(0.0); 111 | T3a.Set(0.0); 112 | T4a.Set(0.0); 113 | T5a.Set(0.0); 114 | 115 | T1a(3) = 4.0; 116 | REQUIRE(T1a(3) == Approx(4.0)); 117 | REQUIRE(T1a[3] == Approx(4.0)); 118 | 119 | T2a(2,1) = 3.0; 120 | REQUIRE(T2a(2,1) == Approx(3.0)); 121 | REQUIRE(T2a[T2a.GetRawIndex(2,1)] == Approx(3.0)); 122 | } 123 | 124 | SECTION("Reshaping") 125 | { 126 | Tensor T(6); 127 | for (int flatidx = 0; flatidx < T.GetSize(); ++flatidx) 128 | { 129 | T[flatidx] = double(flatidx); 130 | } 131 | 132 | T.Reshape(3, 2); 133 | REQUIRE_NOTHROW(T(1,0)); 134 | REQUIRE(T(1,0) == Approx(2.0)); 135 | REQUIRE_THROWS(T.Reshape(3,4)); 136 | } 137 | 138 | SECTION("Tensor on existing data") 139 | { 140 | double data[6] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}; 141 | Tensor T(2, 3, data); 142 | REQUIRE(T(1,1) == Approx(4.0)); 143 | } 144 | 145 | SECTION("Defered initialization") 146 | { 147 | Tensor T; 148 | REQUIRE(!T.IsInitialized()); 149 | REQUIRE_NOTHROW(T.Init(2, 2)); 150 | REQUIRE(T.IsInitialized()); 151 | REQUIRE_NOTHROW(T(0,0) = 2.0); 152 | REQUIRE(T(0,0) == 2.0); 153 | } 154 | 155 | SECTION("Basic CUDA tests") 156 | { 157 | if (isCudaReady()) 158 | { 159 | Tensor T(2); 160 | T.Set(3.0); 161 | REQUIRE(T(0) == Approx(3.0)); 162 | REQUIRE(T(1) == Approx(3.0)); 163 | 164 | T.MapToGPU(); 165 | REQUIRE(T.IsMappedToGPU()); 166 | REQUIRE(!T.IsOnGPU()); 167 | 168 | T.SwitchToGPU(); 169 | REQUIRE(T.IsOnGPU()); 170 | 171 | T.Set(9.0); 172 | REQUIRE(T(0) == Approx(3.0)); //Not moved back from GPU yet 173 | REQUIRE(T(1) == Approx(3.0)); 174 | 175 | T.MoveFromGPU(); 176 | REQUIRE(!T.IsOnGPU()); 177 | REQUIRE(T(0) == Approx(9.0)); 178 | REQUIRE(T(1) == Approx(9.0)); 179 | } 180 | else 181 | { 182 | std::cout << "No GPU found. Ignoring CUDA tests." << std::endl; 183 | } 184 | } 185 | } -------------------------------------------------------------------------------- /unittest/unit_test_main.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
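The "Index Space Covered" section above only verifies that GetRawIndex maps every multi-index to a unique slot in [0, GetSize()). The sketch below shows the usual row-major linearization that satisfies this, assuming the left-most index is the most significant; `raw_index` is a hypothetical stand-in, not the library's GetRawIndex.
```
// Sketch (not library code): row-major linearization with the left-most
// index most significant, checked against the T4a(5, 1, 3, 2) example above.
#include <cassert>
#include <vector>

static int raw_index(const std::vector<int> &dims, const std::vector<int> &idx)
{
   int raw = 0;
   for (size_t d = 0; d < dims.size(); ++d)
   {
      raw = raw * dims[d] + idx[d];   // fold in one dimension at a time
   }
   return raw;
}

int main()
{
   std::vector<int> dims = {5, 1, 3, 2};              // 5*1*3*2 == 30 entries
   assert(raw_index(dims, {0, 0, 0, 0}) == 0);
   assert(raw_index(dims, {4, 0, 2, 1}) == 29);       // last entry
   assert(raw_index(dims, {2, 0, 1, 1}) == 2*(1*3*2) + 0*(3*2) + 1*2 + 1);
   return 0;
}
```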
5 | 6 | #define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do this in one cpp file 7 | #include "catch.hpp" 8 | -------------------------------------------------------------------------------- /util/CudaUtil.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #include "CudaUtil.hpp" 7 | #include 8 | #include 9 | 10 | #ifdef ACRO_HAVE_CUDA 11 | namespace acro 12 | { 13 | CUcontext theCudaContext = NULL; 14 | 15 | CudaKernel::CudaKernel() : 16 | IntOpsPerIndex(0), 17 | FloatOpsPerIndex(0), 18 | MemReadsPerIndex(0), 19 | NumBlocks(0), 20 | ThreadsPerBlock(0), 21 | MaxRegCount(-1), 22 | IsMultipleBlockPerOutput(true) 23 | { 24 | 25 | } 26 | 27 | 28 | CudaKernel::~CudaKernel() 29 | { 30 | for (auto it = Textures.begin(); it != Textures.end(); ++it) 31 | { 32 | cudaDestroyTextureObject(it->second); 33 | } 34 | } 35 | 36 | 37 | cudaTextureObject_t CudaKernel::GetTextureObject(int id) 38 | { 39 | return Textures[id]; 40 | } 41 | 42 | 43 | void CudaKernel::GenerateFunction() 44 | { 45 | ensureCudaContext(); 46 | nvrtcProgram prog; 47 | acroCudaErrorCheck(nvrtcCreateProgram(&prog, // prog 48 | Code.c_str(), // buffer 49 | NULL, // name 50 | 0, // numHeaders 51 | NULL, // headers 52 | NULL)); // includeNames 53 | 54 | std::string regstr = "--maxrregcount=" + std::to_string(MaxRegCount); 55 | const char *opts[5] = {"--restrict","--use_fast_math","--gpu-architecture=compute_60","-lineinfo",regstr.c_str()}; 56 | int num_options = (MaxRegCount > 0) ? 5 : 4; 57 | nvrtcResult rcode = nvrtcCompileProgram(prog, // prog 58 | num_options, // numOptions 59 | opts); // options 60 | if (rcode != NVRTC_SUCCESS) 61 | { 62 | std::cout << "NVRTC Compilation error found in:" << std::endl; 63 | std::cout << Code << std::endl; 64 | size_t log_size; 65 | nvrtcGetProgramLogSize(prog, &log_size); 66 | char *compile_log = new char[log_size]; 67 | nvrtcGetProgramLog(prog, compile_log); 68 | std::cout << compile_log << std::endl; 69 | delete[] compile_log; 70 | throw_error("Encountered in CudaKernel::GenerateFunction()"); 71 | } 72 | 73 | 74 | size_t ptxSize; 75 | acroCudaErrorCheck(nvrtcGetPTXSize(prog, &ptxSize)); 76 | char *ptx = new char[ptxSize]; 77 | acroCudaErrorCheck(nvrtcGetPTX(prog, ptx)); 78 | // Load the generated PTX and get a handle to the kernel. 
79 | acroCudaErrorCheck(cuModuleLoadDataEx(&Module, ptx, 0, 0, 0)); 80 | acroCudaErrorCheck(cuModuleGetFunction(&Function, Module, FunctionName.c_str())); 81 | acroCudaErrorCheck(nvrtcDestroyProgram(&prog)); 82 | 83 | delete [] ptx; 84 | } 85 | 86 | 87 | void CudaKernel::SetGlobalArray(std::string &name, std::vector &arr) 88 | { 89 | CUdeviceptr device_arr; 90 | int bytesize = sizeof(int)*arr.size(); 91 | acroCudaErrorCheck(cuModuleGetGlobal(&device_arr, NULL, Module, name.c_str())); 92 | acroCudaErrorCheck(cudaMemcpy((void*)device_arr, &arr[0], bytesize, cudaMemcpyHostToDevice)); 93 | } 94 | 95 | void CudaKernel::Launch(std::vector &kernel_params, cudaStream_t cuda_stream) 96 | { 97 | ensureCudaContext(); 98 | acroCudaErrorCheck(cuLaunchKernel(Function, 99 | NumBlocks, 1, 1, // grid dim 100 | ThreadsPerBlock, 1, 1, // threads per block 101 | 0, cuda_stream, // shared mem and stream 102 | &kernel_params[0], 0)); // arguments 103 | } 104 | 105 | 106 | void CudaKernel::WriteCodeToFile(const char *fname) 107 | { 108 | std::string fname_str(fname); 109 | WriteCodeToFile(fname_str); 110 | } 111 | 112 | 113 | void CudaKernel::WriteCodeToFile(std::string &fname) 114 | { 115 | std::ofstream file; 116 | file.open(fname); 117 | file << Code; 118 | file.close(); 119 | } 120 | 121 | 122 | 123 | __global__ void CudaSet(double *d, double val, int N) 124 | { 125 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 126 | if (idx < N) 127 | { 128 | d[idx] = val; 129 | } 130 | } 131 | 132 | 133 | __global__ void CudaMult(double *d, double c, int N) 134 | { 135 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 136 | if (idx < N) 137 | { 138 | d[idx] *= c; 139 | } 140 | } 141 | 142 | 143 | __device__ int2 CudaWarpSort(int2 val) 144 | { 145 | int2 val2 = val; 146 | // const int lanei = threadIdx.x % 32; 147 | // const bool odd = threadIdx.x % 2 == 1; 148 | // const bool even = !odd; 149 | // bool comp_less; 150 | // int2 comp_val; 151 | // for (int pass = 0; pass < 32; ++pass) 152 | // { 153 | // //Even pass 154 | // comp_val.x = __shfl_sync(0xFFFF, val, lanei + even - odd); 155 | // comp_val.y = __shfl_sync(0xFFFF, val, lanei + even - odd); 156 | // comp_less = (comp_val.x < val.x) || ((comp_val.x == val.x) && (comp_val.y < val.y)); 157 | // val.x = int(even && (comp_less) || odd && (!comp_less)) * comp_val.x + 158 | // int(even && (!comp_less) || odd && (comp_less)) * val.x; 159 | // val.y = int(even && (comp_less) || odd && (!comp_less)) * comp_val.y + 160 | // int(even && (!comp_less) || odd && (comp_less)) * val.y; 161 | 162 | // //Odd pass 163 | // comp_val.x = __shfl_sync(0xFFFF, val, min(max(lanei - even + odd, 0), 31)); 164 | // comp_val.y = __shfl_sync(0xFFFF, val, min(max(lanei - even + odd, 0), 31)); 165 | // comp_less = (comp_val.x < val.x) || (comp_val.x == val.x) && (comp_val.y < val.y); 166 | // val.x = int(odd && (comp_less) || even && (!comp_less)) * comp_val.x + 167 | // int(odd && (!comp_less) || even && (comp_less)) * val.x; 168 | // val.y = int(odd && (comp_less) || even && (!comp_less)) * comp_val.y + 169 | // int(odd && (!comp_less) || even && (comp_less)) * val.y; 170 | // } 171 | return val2; 172 | } 173 | 174 | 175 | __device__ int2 shfl_sync_int2(unsigned mask, int2 val, int srcLane, int width) 176 | { 177 | int2 retval; 178 | retval.x = __shfl_sync(mask, val.x, srcLane, width); 179 | retval.y = __shfl_sync(mask, val.y, srcLane, width); 180 | return retval; 181 | } 182 | 183 | } 184 | 185 | #endif -------------------------------------------------------------------------------- 
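For orientation, here is a rough sketch of how the CudaKernel pieces defined above fit together. This is an assumption about driving the class directly (in the library it is presumably driven by the executors); the kernel source, function name, and launch shape are invented for illustration.
```
// Hypothetical sketch only: drives CudaKernel by hand with an invented kernel.
#ifdef ACRO_HAVE_CUDA
#include <vector>
#include "CudaUtil.hpp"

void example_scale(double *d_buf, double c, int n)     // d_buf is a device pointer
{
   acro::CudaKernel kernel;
   kernel.FunctionName = "scale";                      // must match the __global__ name
   kernel.Code =
      "extern \"C\" __global__ void scale(double *d, double c, int n)\n"
      "{\n"
      "   int idx = blockIdx.x*blockDim.x + threadIdx.x;\n"
      "   if (idx < n) d[idx] *= c;\n"
      "}\n";
   kernel.GenerateFunction();                          // NVRTC compile + module load

   kernel.ThreadsPerBlock = 256;
   kernel.NumBlocks = (n + 255) / 256;

   std::vector<void*> params = {&d_buf, &c, &n};       // pointers to each argument
   kernel.Launch(params);                              // default stream
}
#endif
```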
/util/CudaUtil.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_CUDA_UTIL_HPP 7 | #define ACROBATIC_CUDA_UTIL_HPP 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "Error.hpp" 15 | #include 16 | 17 | #ifdef ACRO_HAVE_CUDA 18 | #include "cuda.h" 19 | #include "nvrtc.h" 20 | #include "cuda_runtime.h" 21 | #endif 22 | 23 | namespace acro 24 | { 25 | 26 | #define RESTRICT __restrict__ 27 | 28 | #ifdef ACRO_HAVE_CUDA 29 | class CudaKernel 30 | { 31 | public: 32 | CudaKernel(); 33 | ~CudaKernel(); 34 | void GenerateFunction(); 35 | void SetGlobalArray(std::string &ame, std::vector &arr); 36 | void WriteCodeToFile(const char *fname); 37 | void WriteCodeToFile(std::string &fname); 38 | template 39 | inline void AddTextureData(int id, std::vector &data); 40 | cudaTextureObject_t GetTextureObject(int id); 41 | void Launch(std::vector &kernel_params, cudaStream_t cuda_stream = NULL); 42 | 43 | std::string FunctionName; 44 | std::string Code; 45 | CUmodule Module; 46 | CUfunction Function; 47 | int IntOpsPerIndex; 48 | int FloatOpsPerIndex; 49 | int MemReadsPerIndex; 50 | int NumBlocks; 51 | int ThreadsPerBlock; 52 | int MaxRegCount; 53 | bool IsMultipleBlockPerOutput; 54 | 55 | private: 56 | std::map Textures; 57 | }; 58 | 59 | #define acroCudaErrorCheck(ans) acroCudaAssert((ans), __FILE__, __LINE__); 60 | inline void acroCudaAssert(cudaError_t code, const char *file, int line) 61 | { 62 | if (code != cudaSuccess) 63 | { 64 | fprintf(stderr,"CUDA Error: %s\n", cudaGetErrorString(code)); 65 | throw_error(std::string("Encountered at: ") + std::string(file) + ": " + std::to_string(line)); 66 | } 67 | } 68 | 69 | 70 | inline void acroCudaAssert(nvrtcResult code, const char *file, int line) 71 | { 72 | if (code != NVRTC_SUCCESS) 73 | { 74 | fprintf(stderr,"NVRTC Error: %s\n", nvrtcGetErrorString(code)); 75 | throw_error(std::string("Encountered at: ") + std::string(file) + ": " + std::to_string(line)); 76 | } 77 | } 78 | 79 | 80 | inline void acroCudaAssert(CUresult code, const char *file, int line) 81 | { 82 | if (code != CUDA_SUCCESS) 83 | { 84 | const char *msg; 85 | cuGetErrorName(code, &msg); 86 | fprintf(stderr,"CUDA Error: %s\n", msg); 87 | throw_error(std::string("Encountered at: ") + std::string(file) + ": " + std::to_string(line)); 88 | } 89 | } 90 | 91 | 92 | extern CUcontext theCudaContext; 93 | inline void setCudaContext(void *ctx) 94 | { 95 | 96 | theCudaContext = (CUcontext) ctx; 97 | 98 | } 99 | 100 | 101 | inline void ensureCudaContext() 102 | { 103 | if (!theCudaContext) 104 | { 105 | acroCudaErrorCheck(cuCtxCreate(&theCudaContext, 0, 0)); 106 | } 107 | acroCudaErrorCheck(cuCtxSetCurrent(theCudaContext)); 108 | } 109 | 110 | 111 | template 112 | inline void CudaKernel::AddTextureData(int id, std::vector &data) 113 | { 114 | int Tsize = sizeof(T); 115 | int bitT = Tsize * 8; 116 | int bitTo2 = bitT / 2; 117 | int bitTo4 = bitT / 4; 118 | int arr_bytesize = Tsize*data.size(); 119 | T *buffer; 120 | acroCudaErrorCheck(cudaMalloc(&buffer, arr_bytesize)); 121 | acroCudaErrorCheck(cudaMemcpy((void*)buffer, &data[0], arr_bytesize, cudaMemcpyHostToDevice)); 122 | 123 | // create texture 
object 124 | cudaResourceDesc resDesc; 125 | std::memset(&resDesc, 0, sizeof(resDesc)); 126 | resDesc.resType = cudaResourceTypeLinear; 127 | resDesc.res.linear.devPtr = buffer; 128 | resDesc.res.linear.sizeInBytes = arr_bytesize; 129 | if (std::is_same::value || std::is_same::value || std::is_same::value) 130 | { 131 | resDesc.res.linear.desc = cudaCreateChannelDesc( bitTo2, bitTo2, 0, 0, cudaChannelFormatKindUnsigned); 132 | } 133 | else if (std::is_same::value || std::is_same::value || std::is_same::value) 134 | { 135 | resDesc.res.linear.desc = cudaCreateChannelDesc( bitTo2, bitTo2, 0, 0, cudaChannelFormatKindSigned); 136 | } 137 | else if (std::is_same::value) 138 | { 139 | resDesc.res.linear.desc = cudaCreateChannelDesc( bitTo2, bitTo2, 0, 0, cudaChannelFormatKindFloat); 140 | } 141 | else if (std::is_same::value || std::is_same::value || std::is_same::value) 142 | { 143 | resDesc.res.linear.desc = cudaCreateChannelDesc( bitTo4, bitTo4, bitTo4, bitTo4, cudaChannelFormatKindUnsigned); 144 | } 145 | else if (std::is_same::value || std::is_same::value || std::is_same::value) 146 | { 147 | resDesc.res.linear.desc = cudaCreateChannelDesc( bitTo4, bitTo4, bitTo4, bitTo4, cudaChannelFormatKindSigned); 148 | } 149 | else if (std::is_same::value) 150 | { 151 | resDesc.res.linear.desc = cudaCreateChannelDesc( bitTo4, bitTo4, bitTo4, bitTo4, cudaChannelFormatKindFloat); 152 | } 153 | else 154 | { 155 | resDesc.res.linear.desc = cudaCreateChannelDesc( bitTo2, bitTo2, 0, 0, cudaChannelFormatKindUnsigned); 156 | } 157 | 158 | 159 | cudaTextureDesc texDesc; 160 | std::memset(&texDesc, 0, sizeof(texDesc)); 161 | texDesc.readMode = cudaReadModeElementType; 162 | 163 | Textures[id] = 0; 164 | cudaCreateTextureObject(&Textures[id], &resDesc, &texDesc, NULL); 165 | } 166 | 167 | __global__ void CudaSet(double *d, double val, int N); 168 | __global__ void CudaMult(double *d, double c, int N); 169 | 170 | __device__ int2 CudaWarpSort(int2 val); 171 | __device__ int2 shfl_sync_int2(unsigned mask, int2 var, int srcLane, int width=32); 172 | 173 | #endif 174 | 175 | inline bool isCudaReady() 176 | { 177 | #ifndef ACRO_HAVE_CUDA 178 | return false; 179 | #else 180 | int cuda_device_count = -1; 181 | cudaGetDeviceCount(&cuda_device_count); 182 | return (cuda_device_count > 0); 183 | #endif 184 | } 185 | 186 | 187 | 188 | } 189 | 190 | #endif //ACROBATIC_CUDA_UTIL_HPP -------------------------------------------------------------------------------- /util/Error.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_ERROR_HPP 7 | #define ACROBATIC_ERROR_HPP 8 | 9 | #include 10 | #include 11 | 12 | namespace acro 13 | { 14 | 15 | #define GET_MACRO(_1,_2,NAME,...) NAME 16 | #define ACROBATIC_ASSERT(...) 
GET_MACRO(__VA_ARGS__, ACROBATIC_ASSERT2, ACROBATIC_ASSERT1)(__VA_ARGS__) 17 | #define ACROBATIC_ASSERT1(EX) if (!(EX)) throw_error(std::string(__FILE__) + ": " + std::to_string(__LINE__)); 18 | #define ACROBATIC_ASSERT2(EX, STR) if (!(EX)) throw_error(std::string(__FILE__) + ": " + std::to_string(__LINE__) + " " + STR); 19 | 20 | inline void throw_error(std::string error) 21 | { 22 | throw std::runtime_error(error); 23 | } 24 | 25 | } 26 | 27 | #endif //ACROBATIC_ERROR_HPP -------------------------------------------------------------------------------- /util/StringUtil.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_STRING_UTIL_HPP 7 | #define ACROBATIC_STRING_UTIL_HPP 8 | 9 | #include <string> 10 | 11 | namespace acro 12 | { 13 | 14 | inline void str_replace_all(std::string &instr, const std::string &keystr, const std::string &repstr) 15 | { 16 | std::size_t instr_pos = instr.find(keystr); 17 | while (instr_pos != std::string::npos) 18 | { 19 | instr.replace(instr_pos, keystr.length(), repstr); 20 | instr_pos = instr.find(keystr); 21 | } 22 | } 23 | 24 | inline void str_replace_all(std::string &instr, const std::string &keystr, const int repint) 25 | { 26 | str_replace_all(instr, keystr, std::to_string(repint)); 27 | } 28 | 29 | } 30 | 31 | #endif //ACROBATIC_STRING_UTIL_HPP -------------------------------------------------------------------------------- /util/Util.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_UTIL_HPP 7 | #define ACROBATIC_UTIL_HPP 8 | 9 | #include "Error.hpp" 10 | #include "CudaUtil.hpp" 11 | #include "StringUtil.hpp" 12 | 13 | #endif --------------------------------------------------------------------------------
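The str_replace_all helpers in StringUtil.hpp substitute every occurrence of a key in place; note that the search restarts from the beginning of the string after each replacement, so the replacement text should not re-introduce the key. A small usage sketch with illustrative values, presumably in the spirit of how generated kernel source gets specialized to fixed sizes:
```
// Usage sketch for str_replace_all (illustrative values only).
#include <cassert>
#include <string>
#include "StringUtil.hpp"

int main()
{
   std::string code = "for (int i = 0; i < N; ++i) { out[i] = in[i]; }";
   acro::str_replace_all(code, "N", 16);   // int overload routes through std::to_string
   assert(code == "for (int i = 0; i < 16; ++i) { out[i] = in[i]; }");
   return 0;
}
```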