├── AcroTensor.hpp ├── LICENSE ├── README.md ├── config └── defaults.mk ├── exec ├── CPUInterpretedExecutor.cpp ├── CPUInterpretedExecutor.hpp ├── CudaExecutor.cpp ├── CudaExecutor.hpp ├── Executor.hpp ├── KernelExecutor.cpp └── KernelExecutor.hpp ├── kernel ├── DimensionedKernel.cpp ├── DimensionedKernel.hpp ├── DimensionedMultiKernel.cpp ├── DimensionedMultiKernel.hpp ├── TensorEngine.cpp ├── TensorEngine.hpp ├── TensorKernel.cpp └── TensorKernel.hpp ├── makefile ├── ops ├── CudaGPUOps.cpp ├── CudaGPUOps.hpp ├── NativeCPUOps.cpp ├── NativeCPUOps.hpp ├── NonContractionOps.hpp └── Ops.hpp ├── tensor ├── IndexMapping.cpp ├── IndexMapping.hpp ├── IndexVector.cpp ├── IndexVector.hpp ├── SliceTensor.cpp ├── SliceTensor.hpp ├── Tensor.cpp └── Tensor.hpp ├── unittest ├── LICENSE_1_0.txt ├── catch.hpp ├── kernel │ ├── test_DimensionedKernel.cpp │ ├── test_TensorEngine.cpp │ └── test_TensorKernel.cpp ├── makefile ├── tensor │ ├── test_SliceTensor.cpp │ └── test_Tensor.cpp └── unit_test_main.cpp └── util ├── CudaUtil.cpp ├── CudaUtil.hpp ├── Error.hpp ├── StringUtil.hpp └── Util.hpp /AcroTensor.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #include "TensorEngine.hpp" 7 | #include "Executor.hpp" 8 | #include "SliceTensor.hpp" 9 | #include "Ops.hpp" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Lawrence Livermore National Laboratory 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Acrotensor 2 | 3 | Acrotensor is a C++/CUDA library for computing arbitrary tensor contractions both on CPUs and GPUs. The tensors are dynamically sized allowing for a high degree of flexibility, and the tensor contractions are defined with a natural mathematical notation for maximum usability. 
In order to maintain good performance, contraction code is dynamically generated with fixed sizes and unrolled loops and then Just-In-Time (JIT) compiled to produce better-optimized execution. 4 | 5 | ## Getting started 6 | 7 | Acrotensor depends on a C++11 compiler and requires the nvcc CUDA wrapper on the compiler in order to handle the mix of C++ and CUDA. To get the build started, enter the acrotensor directory and run: 8 | ``` 9 | make config 10 | ``` 11 | 12 | This will generate a `config/config.mk` file with a set of defaults that you may need to change for your environment. Once you have edited your `config.mk`, simply enter the acrotensor directory and run: 13 | ``` 14 | make 15 | ``` 16 | 17 | This will build both static and dynamic libraries that you can link against in the `lib` folder and generate an `inc` directory with all of the header files that you will need. 18 | 19 | If you would like to perform some sanity checks on Acrotensor before moving forward, you can build and run the unit test suite by entering the acrotensor directory and running: 20 | ``` 21 | make unittest 22 | ``` 23 | 24 | ## Usage 25 | 26 | To gain access to the Acrotensor objects, be sure to include `AcroTensor.hpp` and link against either the static or dynamic library. The two user-facing objects needed to utilize Acrotensor are `acro::Tensor` and `acro::TensorEngine`. The `Tensor` objects can be constructed on the CPU with dimensions provided by a list of numbers or an `std::vector`: 27 | ``` 28 | //Start of an example contraction that will add 1000 random matrices together on the GPU 29 | std::vector<int> dims {1000, 3, 3}; 30 | acro::Tensor A(dims); //1000x3x3 entry tensor 31 | acro::Tensor B(1000); //1000 entry tensor 32 | acro::Tensor S(3,3); //3x3 tensor 33 | ``` 34 | 35 | Once the tensors are created they can be accessed on the CPU with tensor indexing using the `()` operator and linear indexing using the `[]` operator. The data in the tensors is laid out linearly with the most significant index on the left. There are also utility methods such as `Set()` and `Print()`: 36 | ``` 37 | for (int linear = 0; linear < 1000*3*3; ++linear) 38 | A[linear] = (double)rand() / RAND_MAX; 39 | 40 | B.Set(1.0); 41 | for (int i = 0; i < 3; ++i) 42 | for (int j = 0; j < 3; ++j) 43 | S(i, j) = 0.0; 44 | ``` 45 | 46 | Memory motion between the CPU and GPU can be accomplished by using the following `Tensor` methods: 47 | ``` 48 | A.MapToGPU(); //Allocate memory on the GPU 49 | B.MapToGPU(); 50 | S.MapToGPU(); 51 | 52 | A.MoveToGPU(); //Copy the data to the GPU and indicate that the GPU has the fresh copy 53 | B.MoveToGPU(); 54 | 55 | S.SwitchToGPU(); //Indicate that the GPU has the fresh copy without copying the data (good for outputs) 56 | ``` 57 | 58 | Tensor contractions can now be handled through a `TensorEngine` object. Tensor engines can be initialized with different execution policies that handle contractions on the CPU or GPU with different approaches. The contraction string in the `[]` operator defines how the tensors will be indexed, multiplied, and added. The dimensions of the contraction operation are set by the dimensions of the tensors that are passed in via the `()` operator. Any index that does not appear on the left-hand side is summed over and contracted away in the output tensor. 
59 | ``` 60 | acro::TensorEngine TE("Cuda"); //Initialize the engine with the Cuda exec policy 61 | TE("S_i_j = A_n_i_j B_n", S, A, B); //Contract on n and sum the 1000 matrices into 1 62 | TE("S_i_j = A_n_i_j", S, A); //Same result as before since n is still contracted 63 | 64 | S.MoveFromGPU(); //Get the results back from the GPU 65 | S.Print(); //Display the results of the contraction 66 | ``` 67 | -------------------------------------------------------------------------------- /config/defaults.mk: -------------------------------------------------------------------------------- 1 | #Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | #Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | #All rights reserved. 4 | #This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #Default values for utilizing nvcc+gcc on a P100 system 7 | DEBUG = NO 8 | CUDADIR = /usr/local/cuda 9 | CXX = $(CUDADIR)/bin/nvcc 10 | UTILCXX = $(CXX) 11 | CXX_OPT = -O3 -g -arch compute_60 -x cu --std=c++11 -DACRO_HAVE_CUDA --compiler-options="-fPIC" 12 | CXX_DEBUG = -G -g -arch compute_60 -x cu --std=c++11 -DACRO_HAVE_CUDA --compiler-options="-fPIC" 13 | UNITTEST_LDFLAGS = -O0 -G -arch compute_60 --std=c++11 -lnvrtc -lcuda -L$(CUDADIR)/lib64 -------------------------------------------------------------------------------- /exec/CPUInterpretedExecutor.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
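// CPUInterpretedExecutor runs a kernel on the CPU by interpreting its loop nest rather than
// generating code. ExecuteSingle() zeroes the output when the kernel uses the "=" operator and
// then accumulates with += over every loop index combination, dispatching nests of 1 to 12 loops
// to the fixed-depth ExecuteNLoops() methods below and anything deeper to ExecuteArbitraryLoops().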
5 | 6 | #include "CPUInterpretedExecutor.hpp" 7 | #include 8 | #include 9 | 10 | namespace acro 11 | { 12 | 13 | 14 | CPUInterpretedExecutor::CPUInterpretedExecutor(DimensionedMultiKernel *multi_kernel) : KernelExecutor(multi_kernel) 15 | { 16 | NumLoops = FirstKernel->GetNumIndices(); 17 | NumInVars = FirstKernel->GetNumInputVars(); 18 | N = FirstKernel->GetLoopDims(); 19 | 20 | OutputRank = FirstKernel->GetVarRank(-1); 21 | OutputLoopNums = &(FirstKernel->OutputVar.LoopNums[0]); 22 | OutputStrides = new int[OutputRank]; 23 | for (int di = 0; di < OutputRank; ++di) 24 | { 25 | OutputStrides[di] = FirstKernel->GetVarDimStride(-1, di); 26 | } 27 | 28 | 29 | InputRanks = new int[NumInVars]; 30 | InputLoopNums = new int*[NumInVars]; 31 | InputStrides = new int*[NumInVars]; 32 | InputVars = new double*[NumInVars]; 33 | for (int vari = 0; vari < NumInVars; ++vari) 34 | { 35 | InputRanks[vari] = FirstKernel->GetVarRank(vari); 36 | InputLoopNums[vari] = &(FirstKernel->InputVars[vari].LoopNums[0]); 37 | InputStrides[vari] = new int[InputRanks[vari]]; 38 | for (int di = 0; di < InputRanks[vari]; ++di) 39 | { 40 | InputStrides[vari][di] = FirstKernel->GetVarDimStride(vari, di); 41 | } 42 | } 43 | } 44 | 45 | CPUInterpretedExecutor::~CPUInterpretedExecutor() 46 | { 47 | delete [] OutputStrides; 48 | delete [] InputRanks; 49 | delete [] InputLoopNums; 50 | delete [] InputVars; 51 | for (int vari = 0; vari < NumInVars; ++vari) 52 | { 53 | delete [] InputStrides[vari]; 54 | } 55 | delete [] InputStrides; 56 | } 57 | 58 | 59 | void CPUInterpretedExecutor::ExecuteSingle(Tensor *output, std::vector &inputs) 60 | { 61 | MoveTensorsFromGPU(output, inputs); 62 | 63 | //Since we are using += or -= into the output 64 | if (FirstKernel->EqOperator == "=") 65 | { 66 | output->Set(0.0); 67 | } 68 | 69 | OutputVar = output->GetData(); 70 | for (int vari = 0; vari < NumInVars; ++vari) 71 | { 72 | InputVars[vari] = inputs[vari]->GetData(); 73 | } 74 | 75 | switch (NumLoops) 76 | { 77 | case 1: Execute1Loops(); break; 78 | case 2: Execute2Loops(); break; 79 | case 3: Execute3Loops(); break; 80 | case 4: Execute4Loops(); break; 81 | case 5: Execute5Loops(); break; 82 | case 6: Execute6Loops(); break; 83 | case 7: Execute7Loops(); break; 84 | case 8: Execute8Loops(); break; 85 | case 9: Execute9Loops(); break; 86 | case 10: Execute10Loops(); break; 87 | case 11: Execute11Loops(); break; 88 | case 12: Execute12Loops(); break; 89 | default: ExecuteArbitraryLoops(); 90 | } 91 | } 92 | 93 | std::string CPUInterpretedExecutor::GetImplementation() 94 | { 95 | return "Interpreted\n"; 96 | } 97 | 98 | 99 | void CPUInterpretedExecutor::Execute1Loops() 100 | { 101 | int I[1]; 102 | int &i0 = I[0]; 103 | 104 | for (i0 = 0; i0 < N[0]; ++i0) 105 | { 106 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 107 | } 108 | } 109 | 110 | 111 | void CPUInterpretedExecutor::Execute2Loops() 112 | { 113 | int I[2]; 114 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 115 | { 116 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 117 | { 118 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 119 | } 120 | } 121 | } 122 | 123 | 124 | void CPUInterpretedExecutor::Execute3Loops() 125 | { 126 | int I[3]; 127 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 128 | { 129 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 130 | { 131 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 132 | { 133 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 134 | } 135 | } 136 | } 137 | } 138 | 139 | 140 | 
void CPUInterpretedExecutor::Execute4Loops() 141 | { 142 | int I[4]; 143 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 144 | { 145 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 146 | { 147 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 148 | { 149 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 150 | { 151 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 152 | } 153 | } 154 | } 155 | } 156 | } 157 | 158 | 159 | void CPUInterpretedExecutor::Execute5Loops() 160 | { 161 | int I[5]; 162 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 163 | { 164 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 165 | { 166 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 167 | { 168 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 169 | { 170 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 171 | { 172 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 173 | } 174 | } 175 | } 176 | } 177 | } 178 | } 179 | 180 | 181 | void CPUInterpretedExecutor::Execute6Loops() 182 | { 183 | int I[6]; 184 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 185 | { 186 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 187 | { 188 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 189 | { 190 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 191 | { 192 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 193 | { 194 | for (I[5] = 0; I[5] < N[5]; ++I[5]) 195 | { 196 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 197 | } 198 | } 199 | } 200 | } 201 | } 202 | } 203 | } 204 | 205 | 206 | void CPUInterpretedExecutor::Execute7Loops() 207 | { 208 | int I[7]; 209 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 210 | { 211 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 212 | { 213 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 214 | { 215 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 216 | { 217 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 218 | { 219 | for (I[5] = 0; I[5] < N[5]; ++I[5]) 220 | { 221 | for (I[6] = 0; I[6] < N[6]; ++I[6]) 222 | { 223 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 224 | } 225 | } 226 | } 227 | } 228 | } 229 | } 230 | } 231 | } 232 | 233 | 234 | void CPUInterpretedExecutor::Execute8Loops() 235 | { 236 | int I[8]; 237 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 238 | { 239 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 240 | { 241 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 242 | { 243 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 244 | { 245 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 246 | { 247 | for (I[5] = 0; I[5] < N[5]; ++I[5]) 248 | { 249 | for (I[6] = 0; I[6] < N[6]; ++I[6]) 250 | { 251 | for (I[7] = 0; I[7] < N[7]; ++I[7]) 252 | { 253 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 254 | } 255 | } 256 | } 257 | } 258 | } 259 | } 260 | } 261 | } 262 | } 263 | 264 | 265 | void CPUInterpretedExecutor::Execute9Loops() 266 | { 267 | int I[9]; 268 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 269 | { 270 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 271 | { 272 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 273 | { 274 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 275 | { 276 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 277 | { 278 | for (I[5] = 0; I[5] < N[5]; ++I[5]) 279 | { 280 | for (I[6] = 0; I[6] < N[6]; ++I[6]) 281 | { 282 | for (I[7] = 0; I[7] < N[7]; ++I[7]) 283 | { 284 | for (I[8] = 0; I[8] < N[8]; ++I[8]) 285 | { 286 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 287 | } 288 | } 289 | } 290 | } 291 | } 292 | } 293 | } 294 | } 295 | } 296 | } 297 | 298 | 299 | void CPUInterpretedExecutor::Execute10Loops() 300 | { 301 | int I[10]; 302 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 303 | { 304 | for (I[1] = 0; I[1] < N[1]; 
++I[1]) 305 | { 306 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 307 | { 308 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 309 | { 310 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 311 | { 312 | for (I[5] = 0; I[5] < N[5]; ++I[5]) 313 | { 314 | for (I[6] = 0; I[6] < N[6]; ++I[6]) 315 | { 316 | for (I[7] = 0; I[7] < N[7]; ++I[7]) 317 | { 318 | for (I[8] = 0; I[8] < N[8]; ++I[8]) 319 | { 320 | for (I[9] = 0; I[9] < N[9]; ++I[9]) 321 | { 322 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 323 | } 324 | } 325 | } 326 | } 327 | } 328 | } 329 | } 330 | } 331 | } 332 | } 333 | } 334 | 335 | 336 | void CPUInterpretedExecutor::Execute11Loops() 337 | { 338 | int I[11]; 339 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 340 | { 341 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 342 | { 343 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 344 | { 345 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 346 | { 347 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 348 | { 349 | for (I[5] = 0; I[5] < N[5]; ++I[5]) 350 | { 351 | for (I[6] = 0; I[6] < N[6]; ++I[6]) 352 | { 353 | for (I[7] = 0; I[7] < N[7]; ++I[7]) 354 | { 355 | for (I[8] = 0; I[8] < N[8]; ++I[8]) 356 | { 357 | for (I[9] = 0; I[9] < N[9]; ++I[9]) 358 | { 359 | for (I[10] = 0; I[10] < N[10]; ++I[10]) 360 | { 361 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 362 | } 363 | } 364 | } 365 | } 366 | } 367 | } 368 | } 369 | } 370 | } 371 | } 372 | } 373 | } 374 | 375 | 376 | void CPUInterpretedExecutor::Execute12Loops() 377 | { 378 | int I[12]; 379 | for (I[0] = 0; I[0] < N[0]; ++I[0]) 380 | { 381 | for (I[1] = 0; I[1] < N[1]; ++I[1]) 382 | { 383 | for (I[2] = 0; I[2] < N[2]; ++I[2]) 384 | { 385 | for (I[3] = 0; I[3] < N[3]; ++I[3]) 386 | { 387 | for (I[4] = 0; I[4] < N[4]; ++I[4]) 388 | { 389 | for (I[5] = 0; I[5] < N[5]; ++I[5]) 390 | { 391 | for (I[6] = 0; I[6] < N[6]; ++I[6]) 392 | { 393 | for (I[7] = 0; I[7] < N[7]; ++I[7]) 394 | { 395 | for (I[8] = 0; I[8] < N[8]; ++I[8]) 396 | { 397 | for (I[9] = 0; I[9] < N[9]; ++I[9]) 398 | { 399 | for (I[10] = 0; I[10] < N[10]; ++I[10]) 400 | { 401 | for (I[11] = 0; I[11] < N[11]; ++I[11]) 402 | { 403 | OutputVar[ComputeRawIdx(I, OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I); 404 | } 405 | } 406 | } 407 | } 408 | } 409 | } 410 | } 411 | } 412 | } 413 | } 414 | } 415 | } 416 | } 417 | 418 | 419 | void CPUInterpretedExecutor::ExecuteArbitraryLoops() 420 | { 421 | std::vector I(FirstKernel->GetNumIndices(), 0); //Loop indices 422 | std::vector W(FirstKernel->GetNumIndices()); //Loop strides 423 | W[W.size()-1] = 1; 424 | for (int d = W.size() - 2; d >= 0; --d) 425 | { 426 | W[d] = W[d+1]*N[d+1]; 427 | } 428 | 429 | int flatidx_size = 1; 430 | for (int d = 0; d < W.size(); ++d) 431 | { 432 | flatidx_size *= N[d]; 433 | } 434 | 435 | for (int flatidx = 0; flatidx < flatidx_size; ++flatidx) 436 | { 437 | //Compute the unflattened indices 438 | for (int loopd = 0; loopd < I.size(); ++loopd) 439 | { 440 | I[loopd] = (flatidx / W[loopd]) % N[loopd]; 441 | } 442 | OutputVar[ComputeRawIdx(I.data(), OutputLoopNums,OutputStrides, OutputRank)] += ComputeRHS(I.data()); 443 | } 444 | } 445 | 446 | } -------------------------------------------------------------------------------- /exec/CPUInterpretedExecutor.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 
3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_CPUINTERPRETED_EXECUTOR_HPP 7 | #define ACROBATIC_CPUINTERPRETED_EXECUTOR_HPP 8 | 9 | #include "KernelExecutor.hpp" 10 | #include 11 | 12 | namespace acro 13 | { 14 | 15 | class CPUInterpretedExecutor : public KernelExecutor 16 | { 17 | public: 18 | CPUInterpretedExecutor(DimensionedMultiKernel *multi_kernel); 19 | ~CPUInterpretedExecutor(); 20 | virtual void ExecuteSingle(Tensor *output, std::vector &inputs); 21 | virtual std::string GetImplementation(); 22 | virtual std::string GetExecType() {return "CPUInterpreted";} 23 | 24 | private: 25 | void Execute1Loops(); 26 | void Execute2Loops(); 27 | void Execute3Loops(); 28 | void Execute4Loops(); 29 | void Execute5Loops(); 30 | void Execute6Loops(); 31 | void Execute7Loops(); 32 | void Execute8Loops(); 33 | void Execute9Loops(); 34 | void Execute10Loops(); 35 | void Execute11Loops(); 36 | void Execute12Loops(); 37 | void ExecuteArbitraryLoops(); 38 | 39 | inline double ComputeRHS(const int *RESTRICT I); 40 | inline int ComputeRawIdx(const int *RESTRICT I, const int *loop_nums, const int *var_stride, int rank); 41 | 42 | int NumInVars; 43 | int NumLoops; 44 | std::vector N; 45 | 46 | int OutputRank; 47 | double *OutputVar; 48 | int *OutputLoopNums; 49 | int *OutputStrides; 50 | 51 | int *InputRanks; 52 | double **InputVars; 53 | int **InputLoopNums; 54 | int **InputStrides; 55 | }; 56 | 57 | 58 | inline double CPUInterpretedExecutor::ComputeRHS(const int *RESTRICT I) 59 | { 60 | double rhs_val = InputVars[0][ComputeRawIdx(I, InputLoopNums[0], InputStrides[0], InputRanks[0])]; 61 | for (int vari = 1; vari < NumInVars; ++vari) 62 | { 63 | rhs_val *= InputVars[vari][ComputeRawIdx(I, InputLoopNums[vari], InputStrides[vari], InputRanks[vari])]; 64 | } 65 | return rhs_val; 66 | } 67 | 68 | 69 | inline int CPUInterpretedExecutor::ComputeRawIdx(const int *RESTRICT I, const int *loop_nums, const int *var_stride, int rank) 70 | { 71 | int raw_idx = I[loop_nums[0]]*var_stride[0]; 72 | for (int d = 1; d < rank; ++d) 73 | { 74 | raw_idx += I[loop_nums[d]]*var_stride[d]; 75 | } 76 | return raw_idx; 77 | } 78 | 79 | } 80 | 81 | #endif //ACROBATIC_CPUINTERPRETED_EXECUTOR_HPP -------------------------------------------------------------------------------- /exec/CudaExecutor.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
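// CudaExecutor builds its GPU code at runtime: GenerateCudaKernel() fills a CUDA source template
// with the block-index setup (GenInitIndices), optional shared-memory preloads of small tensors
// (GenSharedMemPreload), and the per-subkernel contraction loops (GenSubKernelLoops), then hands
// the finished string to CudaKernel::GenerateFunction() for compilation. ExecuteSingle() and
// ExecuteMulti() gather a device pointer for each unique variable into KernelParams and launch
// the compiled kernel.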
5 | 6 | #ifdef ACRO_HAVE_CUDA 7 | 8 | #include "CudaExecutor.hpp" 9 | #include 10 | #include 11 | #include 12 | 13 | namespace acro 14 | { 15 | 16 | 17 | CudaExecutor::CudaExecutor(DimensionedMultiKernel *multi_kernel) : KernelExecutor(multi_kernel) 18 | { 19 | HDeviceTensors = nullptr; 20 | SharedMemAllocated = 0; 21 | cudaGetDeviceProperties(&CudaDeviceProp, 0); 22 | GenerateCudaKernel(); 23 | } 24 | 25 | CudaExecutor::~CudaExecutor() 26 | { 27 | if (HDeviceTensors != nullptr) 28 | { 29 | delete HDeviceTensors; 30 | } 31 | acroCudaErrorCheck(cuModuleUnload(TheCudaKernel->Module)); 32 | delete TheCudaKernel; 33 | } 34 | 35 | 36 | void CudaExecutor::ExecuteSingle(Tensor *output, std::vector &inputs) 37 | { 38 | MoveTensorsToGPU(output, inputs); 39 | 40 | int numuvars = MultiKernel->GetNumUVars(); 41 | if (KernelParams.size() == 0) 42 | { 43 | HDeviceTensors = new double*[numuvars]; 44 | KernelParams.resize(numuvars); 45 | } 46 | 47 | for (int uvari = 0; uvari < numuvars; ++uvari) 48 | { 49 | auto ki_vari = MultiKernel->GetFirstKiVariForUVari(uvari); 50 | int vari = ki_vari.second; 51 | double *dtensor; 52 | if (vari == -1) 53 | { 54 | dtensor = output->GetDeviceData(); 55 | } 56 | else 57 | { 58 | dtensor = inputs[vari]->GetDeviceData(); 59 | } 60 | HDeviceTensors[uvari] = dtensor; 61 | KernelParams[uvari] = &(HDeviceTensors[uvari]); 62 | } 63 | 64 | TheCudaKernel->Launch(KernelParams); 65 | //cudaDeviceSynchronize(); 66 | } 67 | 68 | 69 | void CudaExecutor::ExecuteMulti(std::vector &outputs, std::vector > &inputs) 70 | { 71 | for (int ki = 0; ki < MultiKernel->GetNumKernels(); ++ki) 72 | { 73 | MoveTensorsToGPU(outputs[ki], inputs[ki]); 74 | } 75 | 76 | int numuvars = MultiKernel->GetNumUVars(); 77 | if (KernelParams.size() == 0) 78 | { 79 | HDeviceTensors = new double*[numuvars]; 80 | KernelParams.resize(numuvars); 81 | } 82 | 83 | for (int uvari = 0; uvari < numuvars; ++uvari) 84 | { 85 | auto ki_vari = MultiKernel->GetFirstKiVariForUVari(uvari); 86 | int ki = ki_vari.first; 87 | int vari = ki_vari.second; 88 | double *dtensor; 89 | if (vari == -1) 90 | { 91 | dtensor = outputs[ki]->GetDeviceData(); 92 | } 93 | else 94 | { 95 | dtensor = inputs[ki][vari]->GetDeviceData(); 96 | } 97 | HDeviceTensors[uvari] = dtensor; 98 | KernelParams[uvari] = &(HDeviceTensors[uvari]); 99 | } 100 | TheCudaKernel->Launch(KernelParams); 101 | //cudaDeviceSynchronize(); 102 | } 103 | 104 | 105 | std::string CudaExecutor::GetImplementation() 106 | { 107 | return TheCudaKernel->Code; 108 | } 109 | 110 | 111 | void CudaExecutor::GenerateCudaKernel() 112 | { 113 | TheCudaKernel = new CudaKernel; 114 | TheCudaKernel->Code = 115 | "extern \"C\" \n" 116 | "__global__\n" 117 | "__launch_bounds__()\n" 118 | "void ()\n" 119 | "{\n" 120 | " double sum;\n" 121 | " const unsigned int outidx = blockIdx.x;\n" 122 | "\n" 123 | "" 124 | "\n" 125 | "" 126 | "\n" 127 | " __syncthreads();\n" 128 | "" 129 | "\n" 130 | "" 131 | 132 | "}\n"; 133 | 134 | ACROBATIC_ASSERT(MultiKernel->GetNumOuterIndices() > 0, "CudaExecutor needs at least 1 non-contraction index."); 135 | 136 | NumBlockLoops = GetNumBlockLoops(); 137 | 138 | 139 | int outidx_size = MultiKernel->GetIdxSizeForFirstNumLoops(NumBlockLoops); 140 | TheCudaKernel->FunctionName = "Kernel"; 141 | TheCudaKernel->ThreadsPerBlock = GetNumThreadsPerBlock(NumBlockLoops); 142 | TheCudaKernel->NumBlocks = outidx_size; 143 | 144 | //Generate the params list 145 | std::string params_str; 146 | for (int uvari = 0; uvari < MultiKernel->GetNumUVars(); ++uvari) 147 | { 148 | if 
(MultiKernel->IsOutputUVar(uvari)) 149 | { 150 | params_str += "double * const T" + std::to_string(uvari); 151 | } 152 | else 153 | { 154 | params_str += "double const * const T" + std::to_string(uvari); 155 | } 156 | 157 | if (uvari < MultiKernel->GetNumUVars()-1) 158 | { 159 | params_str += ", "; 160 | } 161 | } 162 | 163 | GetSharedMemUvars(); 164 | std::string preload_sm_str = GenSharedMemPreload(); 165 | GetSharedMemWRKernels(); 166 | std::string alloc_smwr_buffer_str = GenSharedMemWRBuffer(); 167 | 168 | //Generate the indices outside the contraction loop 169 | std::string init_indices_str = GenInitIndices(); 170 | 171 | 172 | //Generate the subkernel loops 173 | std::string subkernel_loops_str = GenSubKernelLoops(); 174 | 175 | str_replace_all(TheCudaKernel->Code, "", TheCudaKernel->ThreadsPerBlock); 176 | str_replace_all(TheCudaKernel->Code, "", 4096 / TheCudaKernel->ThreadsPerBlock); 177 | str_replace_all(TheCudaKernel->Code, "", TheCudaKernel->FunctionName); 178 | str_replace_all(TheCudaKernel->Code, "", params_str); 179 | str_replace_all(TheCudaKernel->Code, "", MultiKernel->GetNumUVars()); 180 | str_replace_all(TheCudaKernel->Code, "", outidx_size); 181 | str_replace_all(TheCudaKernel->Code, "", alloc_smwr_buffer_str); 182 | str_replace_all(TheCudaKernel->Code, "", preload_sm_str); 183 | str_replace_all(TheCudaKernel->Code, "", init_indices_str); 184 | str_replace_all(TheCudaKernel->Code, "", subkernel_loops_str); 185 | 186 | //std::cout << TheCudaKernel->Code << std::endl; 187 | //std::cout << MultiKernel->GetDimensionedNameString() << std::endl; 188 | //TheCudaKernel->WriteCodeToFile("kernel.cu"); 189 | TheCudaKernel->GenerateFunction(); 190 | } 191 | 192 | 193 | int CudaExecutor::GetNumBlockLoops() 194 | { 195 | int loopi; 196 | for (loopi = 0; loopi < MultiKernel->GetNumOuterIndices(); ++loopi) 197 | { 198 | if (MultiKernel->GetIdxSizeForFirstNumLoops(loopi) >= 4096 || GetMinMidIdxSize(loopi) < 128) 199 | { 200 | break; 201 | } 202 | } 203 | return loopi; 204 | } 205 | 206 | 207 | int CudaExecutor::GetMinMidIdxSize(int num_block_loops) 208 | { 209 | int numloops = MultiKernel->GetNumIndices(); 210 | int min_idx_size = std::numeric_limits::max(); 211 | for (int ki = 0; ki < MultiKernel->GetNumKernels(); ++ki) 212 | { 213 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 214 | std::vector mid_loops; 215 | for (int loopi = num_block_loops; loopi < numloops; ++loopi) 216 | { 217 | if (kernel->IsDependentOnLoop(loopi) && !kernel->IsContractionLoop(loopi)) 218 | { 219 | mid_loops.push_back(loopi); 220 | } 221 | } 222 | min_idx_size = std::min(min_idx_size, kernel->GetLoopsIdxSize(mid_loops)); 223 | } 224 | return min_idx_size; 225 | } 226 | 227 | 228 | int CudaExecutor::GetMaxMidIdxSize(int num_block_loops) 229 | { 230 | int numloops = MultiKernel->GetNumIndices(); 231 | int max_idx_size = -1; 232 | for (int ki = 0; ki < MultiKernel->GetNumKernels(); ++ki) 233 | { 234 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 235 | std::vector mid_loops; 236 | for (int loopi = num_block_loops; loopi < numloops; ++loopi) 237 | { 238 | if (kernel->IsDependentOnLoop(loopi) && !kernel->IsContractionLoop(loopi)) 239 | { 240 | mid_loops.push_back(loopi); 241 | } 242 | } 243 | max_idx_size = std::max(max_idx_size, kernel->GetLoopsIdxSize(mid_loops)); 244 | } 245 | return max_idx_size; 246 | } 247 | 248 | 249 | int CudaExecutor::GetNumThreadsPerBlock(int num_block_loops) 250 | { 251 | int min = GetMinMidIdxSize(num_block_loops); 252 | int max = GetMaxMidIdxSize(num_block_loops); 253 | 
int block_size; 254 | for (block_size = 64; block_size < 512; block_size *= 2) 255 | { 256 | if (block_size > max || block_size > int(1.3*float(min))) 257 | { 258 | break; 259 | } 260 | } 261 | //std::cout << block_size << std::endl; 262 | return block_size; 263 | } 264 | 265 | void CudaExecutor::GetSharedMemUvars() 266 | { 267 | int numuvars = MultiKernel->GetNumUVars(); 268 | SharedMemUvars.resize(numuvars); 269 | int num_blocks_per_full_sm = CudaDeviceProp.maxThreadsPerMultiProcessor / TheCudaKernel->ThreadsPerBlock; 270 | int shared_mem_size = (CudaDeviceProp.sharedMemPerMultiprocessor / num_blocks_per_full_sm); 271 | for (int uvari = 0; uvari < numuvars; ++uvari) 272 | { 273 | SharedMemUvars[uvari] = false; 274 | if (!MultiKernel->IsOutputUVar(uvari)) 275 | { 276 | int ivar_bytesize = MultiKernel->GetVarSize(uvari)*8; 277 | if (ivar_bytesize + SharedMemAllocated < shared_mem_size) 278 | { 279 | SharedMemUvars[uvari] = true; 280 | SharedMemAllocated += ivar_bytesize; 281 | } 282 | } 283 | } 284 | } 285 | 286 | void CudaExecutor::GetSharedMemWRKernels() 287 | { 288 | int num_blocks_per_full_sm = CudaDeviceProp.maxThreadsPerMultiProcessor / TheCudaKernel->ThreadsPerBlock; 289 | int shared_mem_size = (CudaDeviceProp.sharedMemPerMultiprocessor / num_blocks_per_full_sm); 290 | SharedMemWRKernels.resize(MultiKernel->GetNumKernels(), false); 291 | SMWRBufferSize = 0; 292 | for (int ki = 0; ki < MultiKernel->GetNumKernels() - 1; ++ki) 293 | { 294 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 295 | DimensionedKernel *next_kernel = MultiKernel->Kernels[ki + 1]; 296 | if (ki == 0 || SharedMemWRKernels[ki-1] == false) //Avoid collision on the buffer 297 | { 298 | int outuvari = MultiKernel->GetUVari(ki, -1); 299 | for (int vari = 0; vari < next_kernel->GetNumInputVars(); ++vari) 300 | { 301 | if (MultiKernel->GetUVari(ki+1, vari) == outuvari) 302 | { 303 | int onblock_var_idxsize = 8; //Bytes/double 304 | for (int di = 0; di < MultiKernel->GetVarRank(ki, vari); ++di) 305 | { 306 | int loopi = MultiKernel->GetVarDimLoopNum(ki, vari, di); 307 | if (loopi >= NumBlockLoops) 308 | { 309 | onblock_var_idxsize *= MultiKernel->GetLoopDim(loopi); 310 | } 311 | } 312 | 313 | if (onblock_var_idxsize + SharedMemAllocated < shared_mem_size) 314 | { 315 | SMWRBufferSize = std::max(SMWRBufferSize, onblock_var_idxsize); 316 | SharedMemWRKernels[ki] = true; 317 | } 318 | } 319 | } 320 | } 321 | } 322 | SharedMemAllocated += SMWRBufferSize; 323 | } 324 | 325 | 326 | std::vector CudaExecutor::GetMidloopsOrder(int ki) 327 | { 328 | DimensionedKernel* kernel = MultiKernel->Kernels[ki]; 329 | int numloops = MultiKernel->GetNumIndices(); 330 | int numinvars = kernel->GetNumInputVars(); 331 | 332 | //Generate the mid loops 333 | std::set mid_loops_set; 334 | for (int loopi = NumBlockLoops; loopi < numloops; ++loopi) 335 | { 336 | if (kernel->IsDependentOnLoop(loopi) && !kernel->IsContractionLoop(loopi)) 337 | { 338 | mid_loops_set.insert(loopi); 339 | } 340 | } 341 | 342 | int max_ivar_rank = 0; 343 | for (int vari = -1; vari < numinvars; ++vari) 344 | { 345 | max_ivar_rank = std::max(max_ivar_rank, kernel->GetVarRank(vari)); 346 | } 347 | 348 | //Collect of the loop dimensions from all the variables in lowest stride order 349 | std::vector mid_loops; 350 | for (int rankoff = 0; rankoff < max_ivar_rank; ++rankoff) 351 | { 352 | for (int vari = numinvars-1; vari >= -1; --vari) 353 | { 354 | int uvari = MultiKernel->GetUVari(ki, vari); 355 | int vidxi = kernel->GetVarRank(vari) - 1 - rankoff; 356 | int loopi = 
vidxi >= 0 ? kernel->GetVarDimLoopNum(vari, vidxi) : -1; 357 | auto it = mid_loops_set.find(loopi); 358 | if (!SharedMemUvars[uvari] && it != mid_loops_set.end()) 359 | { 360 | mid_loops.push_back(loopi); 361 | mid_loops_set.erase(it); 362 | } 363 | } 364 | } 365 | 366 | //Tack on the rest of the indices 367 | for (auto it = mid_loops_set.rbegin(); it != mid_loops_set.rend(); ++it) 368 | { 369 | mid_loops.push_back(*it); 370 | } 371 | 372 | //We want the lowest strides to be in the inner most loops 373 | std::reverse(mid_loops.begin(), mid_loops.end()); 374 | return mid_loops; 375 | } 376 | 377 | 378 | std::vector CudaExecutor::GetMidloopsStrides(DimensionedKernel *kernel, std::vector &mid_loops) 379 | { 380 | //Generate the mid loops 381 | int nummidloops = mid_loops.size(); 382 | std::vector strides(nummidloops); 383 | int stride = 1; 384 | for (int mloopi = nummidloops - 1; mloopi >= 0; --mloopi) 385 | { 386 | int loopi = mid_loops[mloopi]; 387 | strides[mloopi] = stride; 388 | stride *= kernel->GetLoopDim(loopi); 389 | } 390 | 391 | return strides; 392 | } 393 | 394 | 395 | std::string CudaExecutor::GenSharedMemWRBuffer() 396 | { 397 | std::string smwr_str; 398 | if (SMWRBufferSize > 0) 399 | { 400 | smwr_str += " __shared__ double SMWR[" + std::to_string(SMWRBufferSize / 8) + "];\n"; 401 | } 402 | return smwr_str; 403 | } 404 | 405 | 406 | std::string CudaExecutor::GenSharedMemPreload() 407 | { 408 | //If applicable Generate the SM preload code for small tensors 409 | std::string preload_sm_str; 410 | for (int uvari = 0; uvari < MultiKernel->GetNumUVars(); ++uvari) 411 | { 412 | if (SharedMemUvars[uvari]) 413 | { 414 | preload_sm_str += " __shared__ double sT" + std::to_string(uvari); 415 | preload_sm_str += "[" + std::to_string(MultiKernel->GetVarSize(uvari)) + "];\n"; 416 | } 417 | } 418 | for (int uvari = 0; uvari < MultiKernel->GetNumUVars(); ++uvari) 419 | { 420 | if (SharedMemUvars[uvari]) 421 | { 422 | std::string temp = 423 | " for (int idx = threadIdx.x; idx < ; idx += blockDim.x)\n" 424 | " {\n" 425 | " sT[idx] = " + GenTensor(uvari) + "[idx];\n" 426 | " }\n\n"; 427 | str_replace_all(temp, "", uvari); 428 | str_replace_all(temp, "", MultiKernel->GetVarSize(uvari)); 429 | 430 | preload_sm_str += temp; 431 | } 432 | } 433 | return preload_sm_str; 434 | } 435 | 436 | 437 | std::string CudaExecutor::GenInitIndices() 438 | { 439 | const std::vector N = MultiKernel->GetLoopDims(); 440 | int numloops = MultiKernel->GetNumIndices(); 441 | std::vector Wout(NumBlockLoops); //Outer loop strides 442 | if (NumBlockLoops > 0) 443 | { 444 | Wout[NumBlockLoops-1] = 1; 445 | } 446 | for (int d = NumBlockLoops - 2; d >= 0; --d) 447 | { 448 | Wout[d] = Wout[d+1]*N[d+1]; 449 | } 450 | 451 | std::string init_indices_str; 452 | for (int loopd = 0; loopd < NumBlockLoops; ++loopd) 453 | { 454 | //I[loopd] = (outidx / (Wout[loopd]) % N[loopd]; 455 | init_indices_str += " unsigned int I"; 456 | init_indices_str += std::to_string(loopd) + " = "; 457 | if (Wout[loopd] == 1) 458 | { 459 | init_indices_str += "outidx"; 460 | } 461 | else 462 | { 463 | init_indices_str += "(outidx / " + std::to_string(Wout[loopd]) + ")"; 464 | TheCudaKernel->IntOpsPerIndex += 1; 465 | } 466 | if (loopd > 0) 467 | { 468 | init_indices_str += " % " + std::to_string(N[loopd]); 469 | } 470 | init_indices_str += "; // " + MultiKernel->GetLoopIndex(loopd) + "\n"; 471 | 472 | } 473 | return init_indices_str; 474 | } 475 | 476 | 477 | 478 | std::string CudaExecutor::GenSubKernelLoops() 479 | { 480 | std::string kernel_loops_str; 
481 | int numloops = MultiKernel->GetNumIndices(); 482 | std::vector hoisted; 483 | std::vector loop_strides(numloops); 484 | 485 | for (int ki = 0; ki < MultiKernel->GetNumKernels(); ++ki) 486 | { 487 | std::string loop_str; 488 | 489 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 490 | int numinvars = kernel->GetNumInputVars(); 491 | int numcontloops = kernel->GetNumContractionIndices(); 492 | std::vector mid_loops = GetMidloopsOrder(ki); 493 | std::vector mid_loop_strides = GetMidloopsStrides(kernel, mid_loops); 494 | int mid_loops_idx_size = kernel->GetLoopsIdxSize(mid_loops); 495 | int blockdim = TheCudaKernel->ThreadsPerBlock; 496 | int numblocki = mid_loops_idx_size / blockdim; 497 | int blocki_rem = mid_loops_idx_size % blockdim; 498 | if (blocki_rem != 0) 499 | { 500 | numblocki ++; 501 | } 502 | 503 | loop_str += " //" + kernel->KernelStr + "\n"; 504 | loop_str += " {\n"; 505 | 506 | for (int mloopi = 0; mloopi < mid_loops.size(); ++mloopi) 507 | { 508 | int loopi = mid_loops[mloopi]; 509 | loop_str += " ushort2 " + GenLoopIndex(ki, loopi) + ";\n"; 510 | } 511 | loop_str += GenMidLoopIndices(ki, mid_loops, mid_loop_strides, 0); 512 | for (int blocki = 0; blocki < numblocki; ++blocki) 513 | { 514 | std::string temp; 515 | if (blocki == numblocki - 1 && blocki_rem != 0) 516 | { 517 | temp += " if (threadIdx.x < )\n"; 518 | } 519 | temp += " {\n"; 520 | temp += " sum = 0.0;\n"; 521 | temp += ""; 522 | temp += " sum += ;\n"; 523 | temp += ""; 524 | if (blocki < numblocki -1) 525 | { 526 | temp += GenMidLoopIndices(ki, mid_loops, mid_loop_strides, blocki+1); 527 | } 528 | if (SharedMemWRKernels[ki]) 529 | { 530 | if (kernel->EqOperator != "=") 531 | { 532 | temp += " SMWR[] = " + GenTensor(ki,-1) + "[];\n"; 533 | } 534 | temp += " SMWR[] sum;\n"; 535 | } 536 | temp += " " + GenTensor(ki,-1) + "[] sum;\n"; 537 | temp += " }\n"; 538 | str_replace_all(temp, "", blocki); 539 | str_replace_all(temp, "", blocki_rem); 540 | 541 | loop_str += temp; 542 | 543 | //Generate the contraction loops 544 | std::string cont_loops_str; 545 | std::vector hoisted(numinvars, false); 546 | for (int loopi = NumBlockLoops; loopi < numloops; ++loopi) 547 | { 548 | if (kernel->IsContractionLoop(loopi)) 549 | { 550 | std::string temp; 551 | for (int ivari = 0; ivari < numinvars; ++ ivari) 552 | { 553 | int uvari = MultiKernel->GetUVari(ki, ivari); 554 | if (kernel->GetVarLoopDepth(ivari) < loopi && !SharedMemUvars[uvari] && 555 | !(ki > 0 && uvari == MultiKernel->GetUVari(ki-1, -1)) && 556 | !hoisted[ivari]) 557 | { 558 | std::string ivaristr = std::to_string(ivari); 559 | std::string uvaristr = std::to_string(uvari); 560 | std::string varidxstr = GenVarIndex(ki, ivari, blocki); 561 | temp += " double hIN" + ivaristr + " = __ldg(&" + GenTensor(uvari) + "[" + varidxstr + "]);\n"; 562 | hoisted[ivari] = true; 563 | } 564 | } 565 | 566 | temp += " #pragma unroll\n"; 567 | temp += " for (unsigned int = 0; < ; ++) {"; 568 | temp += " // " + MultiKernel->GetLoopIndex(loopi) + "\n"; 569 | str_replace_all(temp, "", GenLoopIndex(ki, loopi)); 570 | str_replace_all(temp, "", kernel->GetLoopDim(loopi)); 571 | cont_loops_str += temp; 572 | } 573 | } 574 | std::string end_cont_loops_str = " " + std::string(numcontloops, '}') + "\n"; 575 | 576 | //Generate the RHS computation inside the contraction loops 577 | std::string rhs_str; 578 | for (int ivari = 0; ivari < numinvars; ++ ivari) 579 | { 580 | int uvari = MultiKernel->GetUVari(ki, ivari); 581 | std::string var_str; 582 | if (SharedMemUvars[uvari]) 583 | { 584 | 
var_str = "sT" + std::to_string(uvari) + "[" + GenVarIndex(ki, ivari, blocki) + "]"; 585 | } 586 | else if (hoisted[ivari]) 587 | { 588 | var_str = "hIN" + std::to_string(ivari); 589 | } 590 | else if (ki > 0 && SharedMemWRKernels[ki-1] && uvari == MultiKernel->GetUVari(ki-1, -1)) 591 | { 592 | var_str = "SMWR[" + GenVarIndex(ki, ivari, blocki, false) + "]"; 593 | } 594 | else 595 | { 596 | var_str = GenTensor(uvari) + "[" + GenVarIndex(ki, ivari, blocki) + "]"; 597 | } 598 | 599 | rhs_str += var_str; 600 | if (ivari < numinvars-1) 601 | { 602 | rhs_str += "*"; 603 | } 604 | } 605 | 606 | str_replace_all(loop_str, "", cont_loops_str); 607 | str_replace_all(loop_str, "", rhs_str); 608 | str_replace_all(loop_str, "", end_cont_loops_str); 609 | str_replace_all(loop_str, "", kernel->EqOperator); 610 | str_replace_all(loop_str, "", GenVarIndex(ki, -1, blocki)); 611 | str_replace_all(loop_str, "", GenVarIndex(ki, -1, blocki, false)); 612 | 613 | } 614 | loop_str += " }\n"; 615 | loop_str += " __syncthreads();\n\n"; 616 | kernel_loops_str += loop_str; 617 | } 618 | return kernel_loops_str; 619 | } 620 | 621 | std::string CudaExecutor::GenMidLoopIndices(int ki, std::vector &mid_loops, std::vector &mid_loop_strides, int blocki) 622 | { 623 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 624 | std::string indices; 625 | for (int mloopi = 0; mloopi < mid_loops.size(); ++mloopi) 626 | { 627 | int loopi = mid_loops[mloopi]; 628 | std::string temp = " " + GenLoopIndex(ki, loopi, blocki); 629 | temp += " = ((threadIdx.x + ) / )"; 630 | if (mloopi > 0) 631 | { 632 | temp += " % ; // "+ MultiKernel->GetLoopIndex(loopi) + "\n"; 633 | } 634 | else 635 | { 636 | temp += "; // " + MultiKernel->GetLoopIndex(loopi) + "\n"; 637 | } 638 | str_replace_all(temp, "", blocki*TheCudaKernel->ThreadsPerBlock); 639 | str_replace_all(temp, "", mid_loop_strides[mloopi]); 640 | str_replace_all(temp, "", kernel->GetLoopDim(loopi)); 641 | indices += temp; 642 | } 643 | return indices; 644 | } 645 | 646 | 647 | std::string CudaExecutor::GenTensor(int ki, int vari) 648 | { 649 | return GenTensor(MultiKernel->GetUVari(ki, vari)); 650 | } 651 | 652 | 653 | std::string CudaExecutor::GenTensor(int uvari) 654 | { 655 | std::string tensor = "T" + std::to_string(uvari); 656 | return tensor; 657 | } 658 | 659 | std::string CudaExecutor::GenVarIndex(int ki, int vari, int blocki, bool blockdims) 660 | { 661 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 662 | std::string index_str; 663 | bool first = true; 664 | for (int di = 0; di < kernel->GetVarRank(vari); ++di) 665 | { 666 | int loopi = kernel->GetVarDimLoopNum(vari, di); 667 | if (blockdims || loopi >= NumBlockLoops) 668 | { 669 | if (!first) 670 | { 671 | index_str += " + "; 672 | } 673 | 674 | std::string loopidx = GenVarSubIndex(ki, vari, di, blocki); 675 | std::string stride = std::to_string(kernel->GetVarDimStride(vari, di)); 676 | //index_str += "__umul24(" + loopidx + "," + stride + ")"; 677 | index_str += loopidx + "*" + stride; 678 | first = false; 679 | } 680 | } 681 | return index_str; 682 | } 683 | 684 | 685 | std::string CudaExecutor::GenVarSubIndex(int ki, int vari, int dimi, int blocki) 686 | { 687 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 688 | return GenLoopIndex(ki, kernel->GetVarDimLoopNum(vari, dimi), blocki); 689 | } 690 | 691 | 692 | std::string CudaExecutor::GenLoopIndex(int ki, int loopi, int blocki) 693 | { 694 | DimensionedKernel *kernel = MultiKernel->Kernels[ki]; 695 | std::string loopidx = "I" + std::to_string(loopi); 696 | 
if (blocki > -1 && loopi >= NumBlockLoops && !kernel->IsContractionLoop(loopi)) 697 | { 698 | loopidx += (blocki%2 == 0) ? ".x" : ".y"; 699 | } 700 | 701 | return loopidx; 702 | } 703 | 704 | } 705 | 706 | #endif -------------------------------------------------------------------------------- /exec/CudaExecutor.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_CUDA_EXECUTOR_HPP 7 | #define ACROBATIC_CUDA_EXECUTOR_HPP 8 | 9 | #ifdef ACRO_HAVE_CUDA 10 | #include "KernelExecutor.hpp" 11 | #include 12 | #include 13 | #include 14 | 15 | namespace acro 16 | { 17 | 18 | class CudaExecutor : public KernelExecutor 19 | { 20 | public: 21 | CudaExecutor(DimensionedMultiKernel *multi_kernel); 22 | ~CudaExecutor(); 23 | virtual void ExecuteSingle(Tensor *output, std::vector &inputs); 24 | virtual void ExecuteMulti(std::vector &output, std::vector > &inputs); 25 | virtual std::string GetImplementation(); 26 | virtual std::string GetExecType() {return "Cuda";} 27 | 28 | private: 29 | void GenerateCudaKernel(); 30 | void ReorderIndices(std::vector &mk_outer_indices); 31 | int GetNumBlockLoops(); 32 | int GetMinMidIdxSize(int num_block_loops); 33 | int GetMaxMidIdxSize(int num_block_loops); 34 | int GetNumThreadsPerBlock(int num_block_loops); 35 | void GetSharedMemUvars(); 36 | void GetSharedMemWRKernels(); 37 | std::vector GetMidloopsOrder(int ki); 38 | std::vector GetMidloopsStrides(DimensionedKernel *kernel, std::vector &mid_loops); 39 | 40 | std::string GenSharedMemPreload(); 41 | std::string GenSharedMemWRBuffer(); 42 | std::string GenInitIndices(); 43 | std::string GenSubKernelLoops(); 44 | std::string GenTensor(int ki, int vari); 45 | std::string GenTensor(int uvari); 46 | std::string GenMidLoopIndices(int ki, std::vector &mid_loops, std::vector &mid_loop_strides, int blocki = -1); 47 | std::string GenVarIndex(int ki, int vari, int blocki = -1, bool blockdims=true); 48 | std::string GenVarSubIndex(int ki, int vari, int dimi, int blocki = -1); 49 | std::string GenLoopIndex(int ki, int loopi, int blocki = -1); 50 | 51 | cudaDeviceProp CudaDeviceProp; 52 | CudaKernel *TheCudaKernel; 53 | 54 | int NumBlockLoops; 55 | double **HDeviceTensors; 56 | 57 | int SharedMemAllocated; 58 | int SMWRBufferSize; 59 | std::vector SharedMemUvars; 60 | std::vector SharedMemWRKernels; 61 | 62 | std::vector KernelParams; 63 | }; 64 | 65 | } 66 | 67 | #endif 68 | 69 | #endif //ACROBATIC_ONEOUTPERTHREAD_EXECUTOR_HPP -------------------------------------------------------------------------------- /exec/Executor.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
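// Convenience header: pulls in the KernelExecutor base class together with the
// CPUInterpretedExecutor and CudaExecutor implementations so callers only need one include.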
5 | 6 | #include "KernelExecutor.hpp" 7 | #include "CPUInterpretedExecutor.hpp" 8 | #include "CudaExecutor.hpp" -------------------------------------------------------------------------------- /exec/KernelExecutor.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #include "Executor.hpp" 7 | #include "TensorKernel.hpp" 8 | 9 | namespace acro 10 | { 11 | 12 | 13 | KernelExecutor::KernelExecutor(DimensionedMultiKernel *multi_kernel) 14 | { 15 | MultiKernel = multi_kernel; 16 | if (MultiKernel->Kernels.size() > 0) 17 | { 18 | FirstKernel = MultiKernel->Kernels[0]; 19 | } 20 | else 21 | { 22 | FirstKernel = NULL; 23 | } 24 | 25 | #ifdef ACRO_HAVE_CUDA 26 | TheCudaStream = NULL; 27 | #endif 28 | } 29 | 30 | 31 | void KernelExecutor::MoveTensorsFromGPU(Tensor *output, std::vector &inputs) 32 | { 33 | if (output->IsOnGPU()) 34 | { 35 | output->MoveFromGPU(); 36 | } 37 | 38 | for (int i = 0; i < inputs.size(); ++i) 39 | { 40 | if (inputs[i]->IsOnGPU()) 41 | { 42 | inputs[i]->MoveFromGPU(); 43 | } 44 | } 45 | } 46 | 47 | 48 | void KernelExecutor::MoveTensorsToGPU(Tensor *output, std::vector &inputs) 49 | { 50 | if (!output->IsOnGPU()) 51 | { 52 | if (!output->IsMappedToGPU()) 53 | { 54 | output->MapToGPU(); 55 | } 56 | output->MoveToGPU(); 57 | } 58 | 59 | for (int i = 0; i < inputs.size(); ++i) 60 | { 61 | if (!inputs[i]->IsOnGPU()) 62 | { 63 | if (!inputs[i]->IsMappedToGPU()) 64 | { 65 | inputs[i]->MapToGPU(); 66 | } 67 | inputs[i]->MoveToGPU(); 68 | } 69 | } 70 | } 71 | 72 | 73 | void KernelExecutor::MoveTensorsToOutputLocation(Tensor *output, std::vector &inputs) 74 | { 75 | if (output->IsOnGPU()) 76 | { 77 | MoveTensorsToGPU(output, inputs); 78 | } 79 | else 80 | { 81 | MoveTensorsFromGPU(output, inputs); 82 | } 83 | } 84 | 85 | 86 | void KernelExecutor::ExecuteMulti(std::vector &output, std::vector > &inputs) 87 | { 88 | if (SubExecutors.size() != MultiKernel->Kernels.size()) 89 | { 90 | SubKernels.resize(MultiKernel->Kernels.size()); 91 | SubExecutors.resize(MultiKernel->Kernels.size()); 92 | for (int ki = 0; ki < MultiKernel->Kernels.size(); ++ki) 93 | { 94 | SubKernels[ki] = new DimensionedMultiKernel(MultiKernel->Kernels[ki]); 95 | SubExecutors[ki] = KernelExecutor::Create(GetExecType(), SubKernels[ki]); 96 | } 97 | } 98 | 99 | for (int ki = 0; ki < MultiKernel->Kernels.size(); ++ki) 100 | { 101 | SubExecutors[ki]->ExecuteSingle(output[ki], inputs[ki]); 102 | } 103 | } 104 | 105 | 106 | KernelExecutor *KernelExecutor::Create(std::string exec_type, DimensionedMultiKernel *multi_kernel) 107 | { 108 | if (exec_type == "CPUInterpreted") 109 | { 110 | return new CPUInterpretedExecutor(multi_kernel); 111 | } 112 | #ifdef ACRO_HAVE_CUDA 113 | if (exec_type == "Cuda") 114 | { 115 | return new CudaExecutor(multi_kernel); 116 | } 117 | #endif 118 | 119 | ACROBATIC_ASSERT(false, "Executor type does not exist: " + exec_type); 120 | return NULL; 121 | } 122 | 123 | 124 | KernelExecutor::~KernelExecutor() 125 | { 126 | for (int ki = 0; ki < SubExecutors.size(); ++ki) 127 | { 128 | delete SubKernels[ki]; 129 | delete SubExecutors[ki]; 130 | } 131 | } 132 | 133 | } -------------------------------------------------------------------------------- 
/exec/KernelExecutor.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_KERNEL_EXECUTOR_HPP 7 | #define ACROBATIC_KERNEL_EXECUTOR_HPP 8 | 9 | #include 10 | #include "Tensor.hpp" 11 | #include "DimensionedMultiKernel.hpp" 12 | 13 | namespace acro 14 | { 15 | 16 | class KernelExecutor 17 | { 18 | public: 19 | KernelExecutor(DimensionedMultiKernel *multi_kernel); 20 | static KernelExecutor *Create(std::string exec_type, DimensionedMultiKernel *multi_kernel); 21 | virtual ~KernelExecutor(); 22 | 23 | virtual std::string GetImplementation() = 0; 24 | virtual std::string GetExecType() = 0; 25 | virtual void ExecuteSingle(Tensor *output, std::vector &inputs) = 0; 26 | virtual void ExecuteMulti(std::vector &output, std::vector > &inputs); 27 | 28 | #ifdef ACRO_HAVE_CUDA 29 | inline void SetCudaStream(cudaStream_t cuda_stream) {TheCudaStream = cuda_stream;} 30 | #endif 31 | 32 | protected: 33 | 34 | void MoveTensorsFromGPU(Tensor *output, std::vector &inputs); 35 | void MoveTensorsToGPU(Tensor *output, std::vector &inputs); 36 | void MoveTensorsToOutputLocation(Tensor *output, std::vector &inputs); 37 | DimensionedMultiKernel *MultiKernel; 38 | DimensionedKernel *FirstKernel; 39 | std::vector SubKernels; 40 | std::vector SubExecutors; 41 | 42 | #ifdef ACRO_HAVE_CUDA 43 | cudaStream_t TheCudaStream; 44 | #endif 45 | }; 46 | 47 | } 48 | 49 | 50 | #endif //ACROBATIC_KERNEL_EXECUTOR_HPP 51 | -------------------------------------------------------------------------------- /kernel/DimensionedKernel.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
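// DimensionedKernel attaches concrete tensor dimensions to a parsed TensorKernel: the constructor
// copies the kernel description and derives LoopDims/LoopStrides from the attached output and
// input tensors, and the query methods below (flat/outer/contraction index sizes, variable strides,
// and storage requirements) all read from those two arrays.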
5 | #include "DimensionedKernel.hpp" 6 | #include 7 | 8 | namespace acro 9 | { 10 | 11 | 12 | DimensionedKernel::DimensionedKernel(TensorKernel *kernel, Tensor *output, std::vector &inputs) 13 | { 14 | //Copy these from the original kernel 15 | KernelStr = kernel->KernelStr; 16 | OutputVar = kernel->OutputVar; 17 | EqOperator = kernel->EqOperator; 18 | InputVars = kernel->InputVars; 19 | AllIndexNames = kernel->AllIndexNames; 20 | ContractionIndexNames = kernel->ContractionIndexNames; 21 | LoopIndices = kernel->LoopIndices; 22 | 23 | LoopDims = kernel->GetLoopIdxSizes(output, inputs); 24 | LoopStrides.resize(LoopDims.size()); 25 | LoopStrides[LoopDims.size() - 1] = 1; 26 | for (int loopd = LoopDims.size() - 2; loopd >= 0; --loopd) 27 | { 28 | LoopStrides[loopd] = LoopStrides[loopd+1]*LoopDims[loopd+1]; 29 | } 30 | } 31 | 32 | 33 | void DimensionedKernel::SetLoopIndices(std::vector &idx_list) 34 | { 35 | //Update the loop dims before we change all the LoopIndex info 36 | std::vector NewLoopDims(idx_list.size(), 1); 37 | for (int idxi = 0; idxi < NewLoopDims.size(); ++idxi) 38 | { 39 | auto it = std::find(LoopIndices.begin(), LoopIndices.end(), idx_list[idxi]); 40 | if (it != LoopIndices.end()) 41 | { 42 | NewLoopDims[idxi] = LoopDims[std::distance(LoopIndices.begin(), it)]; 43 | } 44 | else 45 | { 46 | NewLoopDims[idxi] = 1; 47 | } 48 | } 49 | LoopDims = NewLoopDims; 50 | 51 | //Update the loop strides 52 | LoopStrides.resize(LoopDims.size()); 53 | LoopStrides[LoopDims.size() - 1] = 1; 54 | for (int loopd = LoopDims.size() - 2; loopd >= 0; --loopd) 55 | { 56 | LoopStrides[loopd] = LoopStrides[loopd+1]*LoopDims[loopd+1]; 57 | } 58 | 59 | //update all the indices and underlying variable objects 60 | TensorKernel::SetLoopIndices(idx_list); 61 | } 62 | 63 | 64 | std::string DimensionedKernel::GetLoopDimsString() 65 | { 66 | std::string name = "__dim"; 67 | for (auto idx : AllIndexNames) 68 | { 69 | name += "_" + std::to_string(GetLoopDim(idx)); 70 | } 71 | 72 | return name; 73 | } 74 | 75 | 76 | 77 | int DimensionedKernel::GetFlatIdxSize() 78 | { 79 | int flatidx_size = 1; 80 | for (int d = 0; d < GetNumIndices(); ++d) 81 | { 82 | flatidx_size *= LoopDims[d]; 83 | } 84 | return flatidx_size; 85 | } 86 | 87 | 88 | int DimensionedKernel::GetOutIdxSize() 89 | { 90 | int outidx_size = 1; 91 | for (int d = 0; d < GetNumIndices() - GetNumContractionIndices(); ++d) 92 | { 93 | outidx_size *= LoopDims[d]; 94 | } 95 | return outidx_size; 96 | } 97 | 98 | 99 | int DimensionedKernel::GetContIdxSize() 100 | { 101 | int contidx_size = 1; 102 | for (int d = GetNumIndices() - GetNumContractionIndices(); d < GetNumIndices(); ++d) 103 | { 104 | contidx_size *= LoopDims[d]; 105 | } 106 | return contidx_size; 107 | } 108 | 109 | 110 | int DimensionedKernel::GetLoopsIdxSize(std::vector loops) 111 | { 112 | int idx_size = 1; 113 | for (auto loopi : loops) 114 | { 115 | idx_size *= LoopDims[loopi]; 116 | } 117 | return idx_size; 118 | } 119 | 120 | 121 | int DimensionedKernel::GetIdxSizeForFirstNumLoops(int num_loops) 122 | { 123 | ACROBATIC_ASSERT(num_loops <= GetNumIndices()); 124 | int idx_size = 1; 125 | for (int d = 0; d < num_loops; ++d) 126 | { 127 | idx_size *= LoopDims[d]; 128 | } 129 | return idx_size; 130 | } 131 | 132 | 133 | int DimensionedKernel::GetVarDimStride(int vari, int dim) 134 | { 135 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 136 | 137 | int trank = GetVarRank(vari); 138 | int stride = 1; 139 | for (int d = trank-2; d >= dim; --d) 140 | { 141 | stride *= 
LoopDims[GetVarDimLoopNum(vari, d+1)]; 142 | } 143 | 144 | return stride; 145 | } 146 | 147 | 148 | int DimensionedKernel::GetVarSize(int vari) 149 | { 150 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 151 | 152 | int rank = GetVarRank(vari); 153 | int size = 1; 154 | for (int d = 0; d < rank; ++d) 155 | { 156 | size *= LoopDims[GetVarDimLoopNum(vari, d)]; 157 | } 158 | return size; 159 | } 160 | 161 | 162 | int DimensionedKernel::GetVarStorageReqForInnerLoops(int vari, int num_loops) 163 | { 164 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 165 | ACROBATIC_ASSERT(num_loops >= 0 && num_loops <= GetNumIndices()); 166 | 167 | int num_var_entries = 1; 168 | for (int loop_num = GetNumIndices() - 1; loop_num >= GetNumIndices() - num_loops; --loop_num) 169 | { 170 | if (IsVarDependentOnLoop(vari, loop_num)) 171 | { 172 | num_var_entries *= LoopDims[loop_num]; 173 | } 174 | } 175 | return num_var_entries; 176 | } 177 | 178 | 179 | int DimensionedKernel::GetInputStorageReqForInnerLoops(int num_loops) 180 | { 181 | ACROBATIC_ASSERT(num_loops >= 0 && num_loops <= GetNumIndices()); 182 | 183 | int num_entries = 0; 184 | for (int vari = 0; vari < GetNumInputVars(); ++vari) { 185 | num_entries += GetVarStorageReqForInnerLoops(vari, num_loops); 186 | } 187 | 188 | return num_entries; 189 | } 190 | 191 | 192 | int DimensionedKernel::GetOutputStorageReqForInnerLoops(int num_loops) 193 | { 194 | ACROBATIC_ASSERT(num_loops >= 0 && num_loops <= GetNumIndices()); 195 | 196 | return GetVarStorageReqForInnerLoops(-1, num_loops); 197 | } 198 | 199 | 200 | int DimensionedKernel::GetTotalStorageReqForInnerLoops(int num_loops) 201 | { 202 | return GetInputStorageReqForInnerLoops(num_loops) + 203 | GetOutputStorageReqForInnerLoops(num_loops); 204 | } 205 | 206 | 207 | int DimensionedKernel::GetIndexSpaceSizeForInnerLoops(int num_loops) 208 | { 209 | int size = 1; 210 | for (int loop = GetNumIndices() - 1; loop >= GetNumIndices() - num_loops; --loop) 211 | { 212 | size *= LoopDims[loop]; 213 | } 214 | return size; 215 | } 216 | 217 | 218 | void DimensionedKernel::GetVarIndexOffsetsForInnerLoops(int vari, int num_inner_loops, 219 | std::vector &var_off, std::vector &loop_off) 220 | { 221 | int num_loops = GetNumIndices(); 222 | int num_outer_loops = num_loops - num_inner_loops; 223 | int loadidx_size = 1; 224 | for (int loopd = num_loops - num_inner_loops; loopd < num_loops; ++loopd) 225 | { 226 | if (IsVarDependentOnLoop(vari, loopd)) 227 | { 228 | loadidx_size *= LoopDims[loopd]; 229 | } 230 | } 231 | 232 | std::vector inner_loop_strides(GetVarRank(vari), 1); 233 | var_off.resize(loadidx_size); 234 | loop_off.resize(loadidx_size); 235 | for (int loadidx = 0; loadidx < loadidx_size; ++loadidx) 236 | { 237 | //Compute the strides for the indices in the inner_loops 238 | int stride = 1; 239 | for (int d = GetVarRank(vari) - 1; d >= 0; --d) 240 | { 241 | int loopd = GetVarDimLoopNum(vari,d); 242 | if (loopd >= num_outer_loops) 243 | { 244 | inner_loop_strides[d] = stride; 245 | stride *= GetVarDimSize(vari,d); 246 | } 247 | } 248 | 249 | //Compute the unflattened var indices 250 | int varidx = 0; 251 | int loopidx = 0; 252 | for (int d = 0; d < GetVarRank(vari); ++d) 253 | { 254 | int loopd = GetVarDimLoopNum(vari,d); 255 | if (loopd >= num_outer_loops) 256 | { 257 | int I = (loadidx / inner_loop_strides[d]) % GetVarDimSize(vari,d); 258 | varidx += I*GetVarDimStride(vari, d); 259 | loopidx += I*LoopStrides[loopd]; 260 | } 261 | } 262 | var_off[loadidx] = varidx; 263 | loop_off[loadidx] = 
loopidx; 264 | } 265 | } 266 | 267 | } -------------------------------------------------------------------------------- /kernel/DimensionedKernel.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_DIMENSIONED_KERNEL_HPP 7 | #define ACROBATIC_DIMENSIONED_KERNEL_HPP 8 | 9 | #include "TensorKernel.hpp" 10 | #include 11 | #include 12 | 13 | namespace acro 14 | { 15 | 16 | 17 | class DimensionedKernel : public TensorKernel 18 | { 19 | public: 20 | DimensionedKernel(TensorKernel *kernel, Tensor *output, std::vector &inputs); 21 | 22 | //The dimensions of all the loops now that we have attached tensors 23 | const std::vector &GetLoopDims() {return LoopDims;} 24 | const std::vector &GetLoopStrides() {return LoopStrides;} 25 | int GetLoopDim(std::string &idx) {return LoopDims[GetLoopNum(idx)];} 26 | int GetLoopStride(std::string &idx) {return LoopStrides[GetLoopNum(idx)];} 27 | int GetLoopDim(int i) {return LoopDims[i];} 28 | int GetLoopStride(int i) {return LoopStrides[i];} 29 | virtual void SetLoopIndices(std::vector &idx_list); 30 | 31 | //Get a string with all of the loop dimensions 32 | std::string GetLoopDimsString(); 33 | std::string GetDimensionedNameString() {return GetNameString() + GetLoopDimsString();} 34 | std::string GetDimensionedNameString(Tensor *output, std::vector &inputs) {return GetDimensionedNameString();} 35 | 36 | //The the number of index combinations for all the loops (the product of the loop dims) 37 | int GetFlatIdxSize(); 38 | 39 | //The the number of index combinations for just the outer non-contraction loops 40 | int GetOutIdxSize(); 41 | 42 | //The the number of index combinations for the inner contraction loops 43 | int GetContIdxSize(); 44 | 45 | //Get the number of indices in the first num_loops 46 | int GetIdxSizeForFirstNumLoops(int num_loops); 47 | 48 | //Get the number of indices in the list of loops 49 | int GetLoopsIdxSize(std::vector loops); 50 | 51 | //The size of the vari tensor's dim 52 | int GetVarDimSize(int vari, int dim) {return LoopDims[GetVarDimLoopNum(vari, dim)];} 53 | 54 | //The stride in flattened index space of a given variable/dimension in the kernel (vari=-1 for output) 55 | int GetVarDimStride(int vari, int dim); 56 | 57 | //The number of index combinations in a given variable in the kernel (vari=-1 for output) 58 | int GetVarSize(int vari); 59 | 60 | //Information for the inner loops 61 | int GetVarStorageReqForInnerLoops(int vari, int num_loops); 62 | int GetInputStorageReqForInnerLoops(int num_loops); 63 | int GetOutputStorageReqForInnerLoops(int num_loops); 64 | int GetTotalStorageReqForInnerLoops(int num_loops); 65 | int GetIndexSpaceSizeForInnerLoops(int num_loops); 66 | void GetVarIndexOffsetsForInnerLoops(int vari, int num_inner_loops, 67 | std::vector &var_off, std::vector &loop_off); 68 | 69 | private: 70 | //The dimensions of the kernel loops computed to match the attached tensors 71 | std::vector LoopDims; 72 | std::vector LoopStrides; 73 | }; 74 | 75 | } 76 | 77 | #endif //ACROBATIC_DIMENSIONED_KERNEL_HPP -------------------------------------------------------------------------------- /kernel/DimensionedMultiKernel.cpp: 
-------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | #include "DimensionedMultiKernel.hpp" 6 | #include 7 | #include 8 | 9 | namespace acro 10 | { 11 | 12 | DimensionedMultiKernel::DimensionedMultiKernel(std::vector &kernels) 13 | { 14 | Kernels = kernels; 15 | InitMKLVars(); 16 | } 17 | 18 | DimensionedMultiKernel::DimensionedMultiKernel(DimensionedKernel* kernel) 19 | { 20 | Kernels.push_back(kernel); 21 | InitMKLVars(); 22 | } 23 | 24 | 25 | void DimensionedMultiKernel::InitMKLVars() 26 | { 27 | int uvari = 0; 28 | std::vector added_vars; 29 | for (int ki = 0; ki < Kernels.size(); ++ki) 30 | { 31 | DimensionedKernel *kernel = Kernels[ki]; 32 | for (int indi = 0; indi < kernel->AllIndexNames.size(); ++indi) 33 | { 34 | auto it = std::find(AllIndexNames.begin(), AllIndexNames.end(), kernel->AllIndexNames[indi]); 35 | if (it == AllIndexNames.end()) 36 | { 37 | AllIndexNames.push_back(kernel->AllIndexNames[indi]); 38 | } 39 | } 40 | 41 | for (int indi = 0; indi < kernel->ContractionIndexNames.size(); ++indi) 42 | { 43 | auto it = std::find(ContractionIndexNames.begin(), ContractionIndexNames.end(), kernel->ContractionIndexNames[indi]); 44 | if (it == ContractionIndexNames.end()) 45 | { 46 | ContractionIndexNames.push_back(kernel->ContractionIndexNames[indi]); 47 | } 48 | } 49 | 50 | for (int vari = -1; vari < kernel->GetNumInputVars(); ++vari) 51 | { 52 | auto it = std::find(added_vars.begin(), added_vars.end(), kernel->GetVarName(vari)); 53 | if (it == added_vars.end()) 54 | { 55 | added_vars.push_back(kernel->GetVarName(vari)); 56 | UVariToFirstKiVari.push_back(std::make_pair(ki, vari)); 57 | KiVariToUVari[std::make_pair(ki, vari)] = uvari; 58 | ++uvari; 59 | } 60 | else 61 | { 62 | KiVariToUVari[std::make_pair(ki, vari)] = std::distance(added_vars.begin(), it); 63 | } 64 | } 65 | } 66 | 67 | //Find all the ouder indices that are shared by all subkernels 68 | std::vector remove_list; 69 | SharedOuterIndexNames = AllIndexNames; 70 | for (int ki = 0; ki < Kernels.size(); ++ki) 71 | { 72 | DimensionedKernel *kernel = Kernels[ki]; 73 | remove_list.resize(0); 74 | for (int idxi = 0; idxi < SharedOuterIndexNames.size(); ++idxi) 75 | { 76 | if (!kernel->IsDependentOnIndex(SharedOuterIndexNames[idxi]) || 77 | kernel->IsContractionIndex(SharedOuterIndexNames[idxi])) 78 | { 79 | remove_list.push_back(SharedOuterIndexNames[idxi]); 80 | } 81 | } 82 | 83 | for (int ri = 0; ri < remove_list.size(); ++ri) 84 | { 85 | SharedOuterIndexNames.erase(std::remove(SharedOuterIndexNames.begin(), 86 | SharedOuterIndexNames.end(), remove_list[ri]), 87 | SharedOuterIndexNames.end()); 88 | } 89 | } 90 | 91 | //Reorder the indices to put shared outer indices first 92 | std::vector reordered_indices = SharedOuterIndexNames; 93 | for (int idxi = 0; idxi < AllIndexNames.size(); ++idxi) 94 | { 95 | std::string idx = AllIndexNames[idxi]; 96 | auto it = std::find(reordered_indices.begin(), reordered_indices.end(), idx); 97 | if (it == reordered_indices.end()) 98 | { 99 | reordered_indices.push_back(idx); 100 | } 101 | } 102 | SetLoopIndices(reordered_indices); 103 | 104 | //Finally Reorder the Shared outer indices by size (largest first) 105 | reordered_indices.clear(); 106 | 
reordered_indices.resize(SharedOuterIndexNames.size()); 107 | std::set set_indices(SharedOuterIndexNames.begin(), SharedOuterIndexNames.end()); 108 | for (int i = 0; i < reordered_indices.size(); ++i) 109 | { 110 | int biggest_loop_size = -1; 111 | std::string biggest_idx; 112 | for (auto idx : set_indices) 113 | { 114 | int loop_size = GetLoopDim(idx); 115 | if (loop_size > biggest_loop_size) 116 | { 117 | biggest_loop_size = loop_size; 118 | biggest_idx = idx; 119 | } 120 | } 121 | set_indices.erase(biggest_idx); 122 | reordered_indices[i] = biggest_idx; 123 | } 124 | SharedOuterIndexNames = reordered_indices; 125 | for (int idxi = 0; idxi < AllIndexNames.size(); ++idxi) 126 | { 127 | std::string idx = AllIndexNames[idxi]; 128 | auto it = std::find(reordered_indices.begin(), reordered_indices.end(), idx); 129 | if (it == reordered_indices.end()) 130 | { 131 | reordered_indices.push_back(idx); 132 | } 133 | } 134 | SetLoopIndices(reordered_indices); 135 | } 136 | 137 | 138 | 139 | int DimensionedMultiKernel::GetNumVars() 140 | { 141 | return GetNumInputVars()+GetNumOutputVars(); 142 | } 143 | 144 | 145 | int DimensionedMultiKernel::GetNumInputVars() 146 | { 147 | int numvars = 0; 148 | for (int ki = 0; ki < Kernels.size(); ++ki) 149 | { 150 | numvars += Kernels[ki]->GetNumInputVars(); 151 | } 152 | return numvars; 153 | } 154 | 155 | 156 | int DimensionedMultiKernel::GetNumOutputVars() 157 | { 158 | return Kernels.size(); 159 | } 160 | 161 | 162 | void DimensionedMultiKernel::SetLoopIndices(std::vector &idx_list) 163 | { 164 | //Set the loop orders of the subkernels and the LoopDims 165 | LoopDims.clear(); 166 | LoopDims.resize(idx_list.size(), 1); 167 | for (int ki = 0; ki < Kernels.size(); ++ki) 168 | { 169 | Kernels[ki]->SetLoopIndices(idx_list); 170 | for (int loopi = 0; loopi < idx_list.size(); ++loopi) 171 | { 172 | LoopDims[loopi] = std::max(LoopDims[loopi], Kernels[ki]->GetLoopDim(loopi)); 173 | } 174 | } 175 | 176 | //Set the loop strides 177 | LoopStrides.clear(); 178 | LoopStrides.resize(idx_list.size()); 179 | LoopStrides[LoopDims.size() - 1] = 1; 180 | for (int loopd = LoopDims.size() - 2; loopd >= 0; --loopd) 181 | { 182 | LoopStrides[loopd] = LoopStrides[loopd+1]*LoopDims[loopd+1]; 183 | } 184 | 185 | LoopIndices = idx_list; 186 | } 187 | 188 | 189 | int DimensionedMultiKernel::GetIndexLoopNum(std::string &idx) 190 | { 191 | auto it = std::find(LoopIndices.begin(), LoopIndices.end(), idx); 192 | if (it == LoopIndices.end()) 193 | { 194 | return -1; 195 | } 196 | return std::distance(LoopIndices.begin(), it); 197 | } 198 | 199 | 200 | int DimensionedMultiKernel::GetVarRank(int ki, int vari) 201 | { 202 | return Kernels[ki]->GetVarRank(vari); 203 | } 204 | 205 | 206 | int DimensionedMultiKernel::GetVarDimLoopNum(int ki, int vari, int dim) 207 | { 208 | return Kernels[ki]->GetVarDimLoopNum(vari, dim); 209 | } 210 | 211 | 212 | int DimensionedMultiKernel::GetLoopNumVarDim(int loop_num, int ki, int vari) 213 | { 214 | return Kernels[ki]->GetVarDimLoopNum(loop_num, vari); 215 | } 216 | 217 | 218 | std::string DimensionedMultiKernel::GetDimensionedNameString() 219 | { 220 | std::string dimensioned_name; 221 | for (auto kernel : Kernels) 222 | { 223 | dimensioned_name += kernel->GetDimensionedNameString() + ";"; 224 | } 225 | return dimensioned_name; 226 | } 227 | 228 | bool DimensionedMultiKernel::IsVarDependentOnLoop(int ki, int vari, int loop_num) 229 | { 230 | return Kernels[ki]->IsVarDependentOnLoop(vari, loop_num); 231 | } 232 | 233 | 234 | bool 
DimensionedMultiKernel::IsContractionLoop(int loop_num) 235 | { 236 | std::string idxstr = LoopIndices[loop_num]; 237 | return std::find(ContractionIndexNames.begin(),ContractionIndexNames.end(), idxstr) 238 | != ContractionIndexNames.end(); 239 | } 240 | 241 | 242 | bool DimensionedMultiKernel::IsSharedOuterLoop(int loop_num) 243 | { 244 | std::string idxstr = LoopIndices[loop_num]; 245 | return std::find(SharedOuterIndexNames.begin(),SharedOuterIndexNames.end(), idxstr) 246 | != SharedOuterIndexNames.end(); 247 | } 248 | 249 | 250 | bool DimensionedMultiKernel::IsOutputUVar(int uvari) 251 | { 252 | for (int ki = 0; ki < Kernels.size(); ++ki) 253 | { 254 | if (KiVariToUVari[std::make_pair(ki,-1)] == uvari) 255 | { 256 | return true; 257 | } 258 | } 259 | return false; 260 | } 261 | 262 | 263 | bool DimensionedMultiKernel::IsInputUVar(int uvari) 264 | { 265 | for (int ki = 0; ki < Kernels.size(); ++ki) 266 | { 267 | for (int vari = 0; vari < Kernels[ki]->GetNumInputVars(); ++vari) 268 | { 269 | if (KiVariToUVari[std::make_pair(ki,vari)] == uvari) 270 | { 271 | return true; 272 | } 273 | } 274 | } 275 | return false; 276 | } 277 | 278 | 279 | int DimensionedMultiKernel::GetFlatIdxSize() 280 | { 281 | int flatidx_size = 1; 282 | for (int d = 0; d < GetNumIndices(); ++d) 283 | { 284 | flatidx_size *= LoopDims[d]; 285 | } 286 | return flatidx_size; 287 | } 288 | 289 | 290 | int DimensionedMultiKernel::GetSharedOuterIdxSize() 291 | { 292 | int outidx_size = 1; 293 | for (int d = 0; d < GetNumIndices(); ++d) 294 | { 295 | if (IsSharedOuterLoop(d)) 296 | { 297 | outidx_size *= LoopDims[d]; 298 | } 299 | } 300 | return outidx_size; 301 | } 302 | 303 | 304 | int DimensionedMultiKernel::GetIdxSizeForFirstNumLoops(int num_loops) 305 | { 306 | int idx_size = 1; 307 | for (int d = 0; d < num_loops; ++d) 308 | { 309 | idx_size *= LoopDims[d]; 310 | } 311 | return idx_size; 312 | } 313 | 314 | 315 | int DimensionedMultiKernel::GetVarDimStride(int ki, int vari, int dim) 316 | { 317 | return Kernels[ki]->GetVarDimStride(vari, dim); 318 | } 319 | 320 | 321 | int DimensionedMultiKernel::GetVarSize(int ki, int vari) 322 | { 323 | return Kernels[ki]->GetVarSize(vari); 324 | } 325 | 326 | 327 | int DimensionedMultiKernel::GetVarSize(int uvari) 328 | { 329 | auto ki_vari = UVariToFirstKiVari[uvari]; 330 | return GetVarSize(ki_vari.first, ki_vari.second); 331 | } 332 | 333 | 334 | int DimensionedMultiKernel::GetVarLoopDepth(int ki, int vari) 335 | { 336 | return Kernels[ki]->GetVarLoopDepth(vari); 337 | } 338 | 339 | 340 | int DimensionedMultiKernel::GetVarStorageReqForInnerLoops(int ki, int vari, int num_loops) 341 | { 342 | return Kernels[ki]->GetVarStorageReqForInnerLoops(vari, num_loops); 343 | } 344 | 345 | 346 | int DimensionedMultiKernel::GetInputStorageReqForInnerLoops(int num_loops) 347 | { 348 | int storage = 0; 349 | for (int ki = 0; ki < Kernels.size(); ++ki) 350 | { 351 | storage += Kernels[ki]->GetInputStorageReqForInnerLoops(num_loops); 352 | } 353 | return storage; 354 | } 355 | 356 | 357 | int DimensionedMultiKernel::GetOutputStorageReqForInnerLoops(int num_loops) 358 | { 359 | int storage = 0; 360 | for (int ki = 0; ki < Kernels.size(); ++ki) 361 | { 362 | storage += Kernels[ki]->GetOutputStorageReqForInnerLoops(num_loops); 363 | } 364 | return storage; 365 | } 366 | 367 | 368 | int DimensionedMultiKernel::GetTotalStorageReqForInnerLoops(int num_loops) 369 | { 370 | return GetInputStorageReqForInnerLoops(num_loops) + GetOutputStorageReqForInnerLoops(num_loops); 371 | } 372 | 373 | 374 | int 
DimensionedMultiKernel::GetIndexSpaceSizeForInnerLoops(int num_loops) 375 | { 376 | int size = 1; 377 | for (int loop = GetNumIndices() - 1; loop >= GetNumIndices() - num_loops; --loop) 378 | { 379 | size *= LoopDims[loop]; 380 | } 381 | return size; 382 | } 383 | 384 | 385 | void DimensionedMultiKernel::GetVarIndexOffsetsForInnerLoops(int ki, int vari, int num_inner_loops, 386 | std::vector &var_off, std::vector &loop_off) 387 | { 388 | Kernels[ki]->GetVarIndexOffsetsForInnerLoops(vari, num_inner_loops, var_off, loop_off); 389 | } 390 | 391 | } -------------------------------------------------------------------------------- /kernel/DimensionedMultiKernel.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_DIMENSIONED_MULTI_KERNEL_HPP 7 | #define ACROBATIC_DIMENSIONED_MULTI_KERNEL_HPP 8 | 9 | #include "DimensionedKernel.hpp" 10 | #include 11 | #include 12 | 13 | namespace acro 14 | { 15 | 16 | 17 | class DimensionedMultiKernel 18 | { 19 | public: 20 | DimensionedMultiKernel(std::vector &kernels); 21 | DimensionedMultiKernel(DimensionedKernel* kernel); 22 | 23 | int GetNumKernels() {return Kernels.size();} 24 | 25 | //The total number of loops required to execute the kernel 26 | int GetNumIndices() {return AllIndexNames.size();} 27 | 28 | //The number of outer loops in the multi kernel 29 | int GetNumOuterIndices() {return SharedOuterIndexNames.size();} 30 | 31 | //The number of inner contraction loops in the kernel 32 | int GetNumContractionIndices() {return ContractionIndexNames.size();} 33 | 34 | //The number of variables referenced in the kernel (including the output tensors) 35 | int GetNumVars(); 36 | 37 | //The number of unique vars with the duplicates removed 38 | int GetNumUVars() {return UVariToFirstKiVari.size();} 39 | 40 | //The number of input variables referenced in the kernel 41 | int GetNumInputVars(); 42 | 43 | //The number of input variables referenced in the kernel 44 | int GetNumOutputVars(); 45 | 46 | std::string GetDimensionedNameString(); 47 | 48 | std::string GetLoopIndex(int loopi) {return LoopIndices[loopi];} 49 | int GetIndexLoopNum(std::string &idx); 50 | 51 | //Change the order of the loops which will affect the following loop_num functions and the values of Var->LoopNums 52 | void SetLoopIndices(std::vector &idx_list); 53 | 54 | //The rank of the given variable (mvari = -1..-n for output vars) 55 | int GetVarRank(int ki, int vari); 56 | 57 | //The loop number for the given mvariable/dimension (mvari = -1..-n for output) 58 | int GetVarDimLoopNum(int ki, int vari, int dim); 59 | 60 | //The input var dim given the loop num and the input mvari (mvari = -1..-n for output) 61 | //returns (-1 if input var is invariant to that loop) 62 | int GetLoopNumVarDim(int loop_num, int ki, int vari); 63 | 64 | //Does the input var have an index matching this loop num (mvari = -1..-n for outputs) 65 | bool IsVarDependentOnLoop(int ki, int vari, int loop_num); 66 | 67 | //Is this loop a contraction loop 68 | bool IsContractionLoop(int loop_num); 69 | 70 | //Is this loop bound to a shared non-contraction index for all the kernels 71 | bool IsSharedOuterLoop(int loop_num); 72 | 73 | //Is the UVar an output or input 
var (or both from different kernels) 74 | bool IsOutputUVar(int uvari); 75 | bool IsInputUVar(int uvari); 76 | 77 | //The dimensions of all the loops now that we have attached tensors 78 | const std::vector &GetLoopDims() {return LoopDims;} 79 | int GetLoopDim(int i) {return LoopDims[i];} 80 | int GetLoopDim(std::string &idx) {return GetLoopDim(GetIndexLoopNum(idx));} 81 | int GetLoopStride(int i) {return LoopStrides[i];} 82 | 83 | //The the number of index combinations for all the loops (the product of the loop dims) 84 | int GetFlatIdxSize(); 85 | 86 | //The the number of index combinations for just the outer non-contraction loops 87 | int GetSharedOuterIdxSize(); 88 | 89 | //Get the number of indices in the first num_loops 90 | int GetIdxSizeForFirstNumLoops(int num_loops); 91 | 92 | //The stride in flattened index space of a given variable/dimension in the kernel 93 | int GetVarDimStride(int ki, int vari, int dim); 94 | 95 | //The number of index combinations in a given variable in the kernel 96 | int GetVarSize(int ki, int vari); 97 | int GetVarSize(int uvari); 98 | 99 | //The highest loop number that the var varies by 100 | int GetVarLoopDepth(int ki, int vari); 101 | 102 | //The unique vars will be listed starting from 0..n for the unique outputs 103 | //followed by n+1..m for the unique inputs. Duplicated will not be counted! 104 | int GetUVari(int ki, int vari) {return KiVariToUVari[std::make_pair(ki,vari)];} 105 | std::pair GetFirstKiVariForUVari(int uvari) {return UVariToFirstKiVari[uvari];} 106 | 107 | //Information for the inner loops 108 | int GetVarStorageReqForInnerLoops(int ki, int vari, int num_loops); 109 | int GetInputStorageReqForInnerLoops(int num_loops); 110 | int GetOutputStorageReqForInnerLoops(int num_loops); 111 | int GetTotalStorageReqForInnerLoops(int num_loops); 112 | int GetIndexSpaceSizeForInnerLoops(int num_loops); 113 | void GetVarIndexOffsetsForInnerLoops(int ki, int vari, int num_inner_loops, 114 | std::vector &var_off, std::vector &loop_off); 115 | 116 | std::vector Kernels; 117 | std::vector AllIndexNames; 118 | std::vector ContractionIndexNames; 119 | std::vector SharedOuterIndexNames; 120 | std::vector LoopIndices; 121 | 122 | private: 123 | void InitMKLVars(); 124 | 125 | //Maps between the multikernel tensor numbering and the underlying kernel tensor numbering 126 | std::map, int> KiVariToUVari; 127 | std::vector> UVariToFirstKiVari; 128 | 129 | //The dimensions of the kernel loops computed to match the attached tensors 130 | std::vector LoopDims; 131 | std::vector LoopStrides; 132 | }; 133 | 134 | } 135 | 136 | #endif //ACROBATIC_DIMENSIONED_MULTI_KERNEL_HPP -------------------------------------------------------------------------------- /kernel/TensorEngine.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
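
Both `DimensionedKernel::SetLoopIndices` and `DimensionedMultiKernel::SetLoopIndices` above build `LoopStrides` with the same row-major recurrence: the last loop gets stride 1 and each stride to its left is the product of the dimensions to its right, so a flat index is the dot product of the loop indices with `LoopStrides`, and `GetFlatIdxSize()` is the product of all `LoopDims` (equivalently `GetOutIdxSize()*GetContIdxSize()` for a single kernel). A standalone sketch of that recurrence (illustrative only, not library code; the function name here is made up):
```
//Illustrative sketch of the row-major stride recurrence used by
//DimensionedKernel::SetLoopIndices and DimensionedMultiKernel::SetLoopIndices:
//the rightmost loop varies fastest, so its stride is 1.
#include <vector>
#include <cassert>

static std::vector<int> ComputeLoopStrides(const std::vector<int> &loop_dims)
{
    std::vector<int> strides(loop_dims.size());
    strides[loop_dims.size() - 1] = 1;
    for (int d = (int)loop_dims.size() - 2; d >= 0; --d)
    {
        strides[d] = strides[d+1]*loop_dims[d+1];
    }
    return strides;
}

int main()
{
    //LoopDims {4, 3, 2} -> LoopStrides {6, 2, 1}, so loop indices (i, j, k)
    //flatten to i*6 + j*2 + k, and the flat index space has 4*3*2 = 24 entries,
    //which is what GetFlatIdxSize() would report for these dims.
    std::vector<int> dims = {4, 3, 2};
    std::vector<int> strides = ComputeLoopStrides(dims);
    assert(strides[0] == 6 && strides[1] == 2 && strides[2] == 1);
    return 0;
}
```
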
5 | 6 | #ifndef ACROBATIC_TENSOR_ENGINE_HPP 7 | #define ACROBATIC_TENSOR_ENGINE_HPP 8 | 9 | #include 10 | #include 11 | #include "DimensionedMultiKernel.hpp" 12 | #include "Executor.hpp" 13 | #include "IndexMapping.hpp" 14 | 15 | #ifdef ACRO_HAVE_CUDA 16 | #include 17 | #endif 18 | 19 | 20 | namespace acro 21 | { 22 | 23 | class TensorKernel; 24 | class NonContractionOps; 25 | 26 | class TensorEngine 27 | { 28 | public: 29 | TensorEngine(); 30 | TensorEngine(const char *bare_exec_type); 31 | TensorEngine(std::string &exec_type); 32 | ~TensorEngine(); 33 | void SetExecutorType(const char *bare_exec_type); 34 | void SetExecutorType(std::string &exec_type); 35 | std::string GetExecType() {return ExecutorType;} 36 | 37 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1); 38 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2); 39 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3); 40 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4); 41 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5); 42 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6); 43 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7); 44 | void operator()(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7, Tensor &in8); 45 | void operator()(const char *bare_kernel_str, Tensor *out, std::vector &inputs); 46 | 47 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1); 48 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2); 49 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3); 50 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4); 51 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5); 52 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6); 53 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7); 54 | void operator()(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7, Tensor &in8); 55 | void operator()(std::string &kernel_str, Tensor *out, std::vector &inputs); 56 | 57 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1); 58 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2); 59 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3); 60 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4); 61 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5); 62 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, 
Tensor &in6); 63 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7); 64 | std::string GetImplementation(const char *bare_kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7, Tensor &in8); 65 | std::string GetImplementation(const char *bare_kernel_str, Tensor *out, std::vector &inputs); 66 | 67 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1); 68 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2); 69 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3); 70 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4); 71 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5); 72 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6); 73 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7); 74 | std::string GetImplementation(std::string &kernel_str, Tensor &out, Tensor &in1, Tensor &in2, Tensor &in3, Tensor &in4, Tensor &in5, Tensor &in6, Tensor &in7, Tensor &in8); 75 | std::string GetImplementation(std::string &kernel_str, Tensor *out, std::vector &inputs); 76 | 77 | void BatchMatrixInverse(Tensor &Ainv, Tensor &A); 78 | void BatchMatrixDet(Tensor &Adet, Tensor &A); 79 | void BatchMatrixInvDet(Tensor &Ainv, Tensor &Adet, Tensor &A); 80 | void FlatIndexedScatter(Tensor &Aout, Tensor &Ait, IndexMapping &M); 81 | void FlatIndexedSumGather(Tensor &Aout, Tensor &Ait, IndexMapping &M); 82 | 83 | void Clear(); 84 | bool IsGPUAvailable() {return isCudaReady();} 85 | void BeginMultiKernelLaunch(); 86 | void EndMultiKernelLaunch(); 87 | 88 | private: 89 | TensorKernel *GetAddTensorKernel(std::string &kernel_str); 90 | DimensionedKernel *GetAddDimensionedKernel(TensorKernel *kernel, Tensor *output, std::vector &inputs); 91 | KernelExecutor *GetAddKernelExecutor(); 92 | void MoveToComputeLocation(Tensor &T); 93 | void SwitchToComputeLocation(Tensor &T); 94 | void MoveToComputeLocation(IndexMapping &M); 95 | void SwitchToComputeLocation(IndexMapping &M); 96 | 97 | std::string ExecutorType; 98 | std::unordered_map KernelMap; 99 | std::unordered_map DimensionedKernelMap; 100 | std::unordered_map ExecutorMap; 101 | std::string ComputeLocation; 102 | NonContractionOps *Ops; 103 | 104 | bool IsMultiKernelLaunch; 105 | std::vector MKLKernels; 106 | std::vector MKLOutputT; 107 | std::vector > MKLInputT; 108 | 109 | #ifdef ACRO_HAVE_CUDA 110 | cudaStream_t TheCudaStream; 111 | #endif 112 | }; 113 | 114 | 115 | } 116 | 117 | #endif //ACROBATIC_TENSOR_ENGINE_HPP -------------------------------------------------------------------------------- /kernel/TensorKernel.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
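
TensorKernel.cpp below implements a small recursive-descent parser for kernel strings: one output variable, an assignment operator (`=`, `+=`, or `-=`), and one or more input variables, where each variable is an uppercase-led name followed by one or more `_index` suffixes made of lowercase letters and digits. Whitespace is stripped before parsing, and any index that appears only on the input side is recorded as a contraction index. The sketch below queries the parsed structure for an illustrative matrix-multiply-style string; the string and the expected values are inferred from the parser that follows, not taken from the library's documentation, and the program assumes a link against the built acrotensor library:
```
//Illustrative sketch: querying the structure that TensorKernel::ParseKernel (below)
//extracts from a kernel string. "C_i_k += A_i_j B_j_k" is an assumed example.
#include <iostream>
#include "TensorKernel.hpp"

int main()
{
    acro::TensorKernel kernel("C_i_k += A_i_j B_j_k");
    std::cout << kernel.GetNumIndices()            << std::endl;  //3 loops: i, k, j
    std::cout << kernel.GetNumContractionIndices() << std::endl;  //1: j appears only on the inputs
    std::cout << kernel.GetNumInputVars()          << std::endl;  //2: A and B
    std::cout << kernel.GetNameString()            << std::endl;  //"C_i_kpeA_i_jB_j_k"
    return 0;
}
```
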
5 | 6 | #include "TensorKernel.hpp" 7 | #include "SliceTensor.hpp" 8 | #include 9 | #include 10 | 11 | namespace acro 12 | { 13 | 14 | TensorKernel::TensorKernel(const char *kernel) 15 | { 16 | KernelStr = kernel; 17 | ParseKernel(); 18 | } 19 | 20 | TensorKernel::TensorKernel(std::string &kernel) 21 | { 22 | KernelStr = kernel; 23 | ParseKernel(); 24 | } 25 | 26 | // Recursive decent parse of tensor kernel (+ means 1 or more of in succession): 27 | // := + 28 | void TensorKernel::ParseKernel() 29 | { 30 | std::string ParseStr = KernelStr; 31 | ParseStr.erase(remove_if(ParseStr.begin(), ParseStr.end(), isspace),ParseStr.end()); 32 | std::string::iterator it = ParseStr.begin(); 33 | ParseKernelVar(it, OutputVar); 34 | 35 | ParseEqOperator(it, EqOperator); 36 | 37 | InputVars.push_back(KernelVar()); 38 | ParseKernelVar(it, InputVars.back()); 39 | 40 | while(it != ParseStr.end()) { 41 | InputVars.push_back(KernelVar()); 42 | ParseKernelVar(it, InputVars.back()); 43 | } 44 | 45 | //Gather up the IndexNames and LoopNums associated with the OutputTensor 46 | for (int d = 0; d < OutputVar.IndexNames.size(); ++d) 47 | { 48 | AllIndexNames.push_back(OutputVar.IndexNames[d]); 49 | } 50 | 51 | //Now gather up the IndexNames and LoopNums associated with the contraction indices 52 | for (int vari = 0; vari < InputVars.size(); ++vari) { 53 | for (int indi = 0; indi < InputVars[vari].IndexNames.size(); ++indi) { 54 | auto acit = std::find(AllIndexNames.begin(), 55 | AllIndexNames.end(), 56 | InputVars[vari].IndexNames[indi]); 57 | if (acit == AllIndexNames.end()) { 58 | //The IndexName is not on the list yet so add it to everything 59 | ContractionIndexNames.push_back(InputVars[vari].IndexNames[indi]); 60 | AllIndexNames.push_back(InputVars[vari].IndexNames[indi]); 61 | } 62 | } 63 | } 64 | 65 | LoopIndices = AllIndexNames; 66 | SetVarLoopNums(); 67 | } 68 | 69 | // := + 70 | // where is an uppercase letter 71 | void TensorKernel::ParseKernelVar(std::string::iterator &it, KernelVar &var) 72 | { 73 | ParseVarName(it, var); 74 | ParseIndexVar(it, var); 75 | while (*it == '_') { 76 | ParseIndexVar(it, var); 77 | } 78 | var.LoopNums.resize(var.IndexNames.size()); 79 | } 80 | 81 | 82 | // := * 83 | // where is an uppercase letter and is any letter or digit 84 | void TensorKernel::ParseVarName(std::string::iterator &it, KernelVar &var) 85 | { 86 | // 87 | ACROBATIC_ASSERT(isupper(*it)); 88 | var.Name += *it; 89 | it ++; 90 | 91 | //* 92 | while (isupper(*it) || islower(*it) || isdigit(*it)) { 93 | var.Name += *it; 94 | it ++; 95 | } 96 | } 97 | 98 | 99 | // := _+ 100 | // is a lowercase letter or a digit 101 | void TensorKernel::ParseIndexVar(std::string::iterator &it, KernelVar &var) 102 | { 103 | //_ 104 | ACROBATIC_ASSERT(*it == '_'); 105 | it ++; 106 | 107 | //+ 108 | ACROBATIC_ASSERT(islower(*it) || isdigit(*it)); 109 | var.IndexNames.push_back(""); 110 | var.IndexNames[var.IndexNames.size() - 1] += *it; 111 | it ++; 112 | while(islower(*it) || isdigit(*it)) 113 | { 114 | var.IndexNames[var.IndexNames.size() - 1] += *it; 115 | it ++; 116 | } 117 | } 118 | 119 | 120 | // := ("=" | "+=" | "-=") 121 | void TensorKernel::ParseEqOperator(std::string::iterator &it, std::string &eqoper) 122 | { 123 | if (*it == '=') { 124 | it ++; 125 | eqoper = "="; 126 | } else if (*it == '+') { 127 | it ++; 128 | ACROBATIC_ASSERT(*it == '='); 129 | it ++; 130 | eqoper = "+="; 131 | } else if (*it == '-') { 132 | it ++; 133 | ACROBATIC_ASSERT(*it == '='); 134 | it ++; 135 | eqoper = "-="; 136 | } else { 137 | 
ACROBATIC_ASSERT(false); 138 | } 139 | } 140 | 141 | 142 | void TensorKernel::SetVarLoopNums() 143 | { 144 | OutputVar.LoopNums.resize(OutputVar.IndexNames.size()); 145 | for (int idxi = 0; idxi < OutputVar.IndexNames.size(); ++idxi) 146 | { 147 | auto loopit = std::find(LoopIndices.begin(), LoopIndices.end(), OutputVar.IndexNames[idxi]); 148 | OutputVar.LoopNums[idxi] = std::distance(LoopIndices.begin(), loopit); 149 | } 150 | 151 | for (int ivari = 0; ivari < InputVars.size(); ++ivari) 152 | { 153 | InputVars[ivari].LoopNums.resize(InputVars[ivari].IndexNames.size()); 154 | for (int idxi = 0; idxi < InputVars[ivari].IndexNames.size(); ++idxi) 155 | { 156 | auto loopit = std::find(LoopIndices.begin(), LoopIndices.end(), InputVars[ivari].IndexNames[idxi]); 157 | InputVars[ivari].LoopNums[idxi] = std::distance(LoopIndices.begin(), loopit); 158 | } 159 | } 160 | } 161 | 162 | 163 | int TensorKernel::GetVarRank(int vari) 164 | { 165 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 166 | 167 | if (vari == -1) 168 | { 169 | return OutputVar.IndexNames.size(); 170 | } 171 | 172 | return InputVars[vari].IndexNames.size(); 173 | } 174 | 175 | 176 | int TensorKernel::GetLoopDepth() 177 | { 178 | int depth = -1; //Invariant to all loops 179 | for (int vari = -1; vari < GetNumInputVars(); ++vari) 180 | { 181 | depth = std::max(depth, GetVarLoopDepth(vari)); 182 | } 183 | return depth; 184 | } 185 | 186 | 187 | int TensorKernel::GetVarLoopDepth(int vari) 188 | { 189 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 190 | 191 | int depth = -1; //Invariant to all loops 192 | for (int loopd = 0; loopd < LoopIndices.size(); ++ loopd) 193 | { 194 | if (IsVarDependentOnLoop(vari, loopd)) 195 | { 196 | depth = loopd; 197 | } 198 | } 199 | return depth; 200 | } 201 | 202 | 203 | void TensorKernel::SetLoopIndices(std::vector &idx_list) 204 | { 205 | LoopIndices = idx_list; 206 | 207 | //Update the LoopNums with the new permuted order 208 | SetVarLoopNums(); 209 | } 210 | 211 | 212 | int TensorKernel::GetLoopNum(std::string &idx) 213 | { 214 | auto it = std::find(LoopIndices.begin(), LoopIndices.end(), idx); 215 | ACROBATIC_ASSERT(it != LoopIndices.end(), "Loop index (" + idx + ") not found in kernel:\n" 216 | + KernelStr + "\n"); 217 | return std::distance(LoopIndices.begin(), it); 218 | } 219 | 220 | int TensorKernel::GetVarDimLoopNum(int vari, int dim) 221 | { 222 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 223 | ACROBATIC_ASSERT(dim >= 0 && dim < GetVarRank(vari)); 224 | 225 | if (vari == -1) 226 | { 227 | return OutputVar.LoopNums[dim]; 228 | } 229 | 230 | return InputVars[vari].LoopNums[dim]; 231 | } 232 | 233 | 234 | int TensorKernel::GetLoopNumVarDim(int loop_num, int vari) 235 | { 236 | ACROBATIC_ASSERT(loop_num >= 0 && loop_num < LoopIndices.size()); 237 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 238 | 239 | std::string loop_index_name = GetLoopIndex(loop_num); 240 | 241 | for (int d = 0; d < GetVarRank(vari); ++d) 242 | { 243 | if (vari == -1) 244 | { 245 | if (OutputVar.IndexNames[d] == loop_index_name) 246 | { 247 | return d; 248 | } 249 | } 250 | else 251 | { 252 | if (InputVars[vari].IndexNames[d] == loop_index_name) 253 | { 254 | return d; 255 | } 256 | } 257 | } 258 | return -1; 259 | } 260 | 261 | 262 | bool TensorKernel::IsVarDependentOnLoop(int vari, int loop_num) 263 | { 264 | return GetLoopNumVarDim(loop_num, vari) > -1; 265 | } 266 | 267 | 268 | bool TensorKernel::IsDependentOnIndex(std::string &idx) 269 | { 270 | return 
std::find(AllIndexNames.begin(), AllIndexNames.end(), idx) != AllIndexNames.end(); 271 | } 272 | 273 | 274 | bool TensorKernel::IsDependentOnLoop(int loop_num) 275 | { 276 | std::string idxstr = LoopIndices[loop_num]; 277 | return IsDependentOnIndex(LoopIndices[loop_num]); 278 | } 279 | 280 | 281 | bool TensorKernel::IsContractionIndex(std::string &idx) 282 | { 283 | return std::find(ContractionIndexNames.begin(), ContractionIndexNames.end(), idx) != ContractionIndexNames.end(); 284 | } 285 | 286 | 287 | bool TensorKernel::IsContractionLoop(int loop_num) 288 | { 289 | return IsContractionIndex(LoopIndices[loop_num]); 290 | } 291 | 292 | 293 | bool TensorKernel::IsContractionVar(int vari) 294 | { 295 | ACROBATIC_ASSERT(vari >= -1 && vari < GetNumInputVars()); 296 | 297 | if (vari == -1) 298 | { 299 | return false; 300 | } 301 | 302 | for (auto idx : InputVars[vari].IndexNames) 303 | { 304 | if (IsContractionIndex(idx)) 305 | { 306 | return true; 307 | } 308 | } 309 | 310 | return false; 311 | } 312 | 313 | 314 | std::string &TensorKernel::GetVarName(int vari) 315 | { 316 | if (vari == -1) 317 | { 318 | return OutputVar.Name; 319 | } 320 | else 321 | { 322 | return InputVars[vari].Name; 323 | } 324 | } 325 | 326 | 327 | std::string TensorKernel::GetNameString() 328 | { 329 | std::string name = OutputVar.Name; 330 | for (int d = 0; d < OutputVar.IndexNames.size(); ++d) 331 | { 332 | name += "_" + OutputVar.IndexNames[d]; 333 | } 334 | 335 | if (EqOperator == "=") 336 | { 337 | name += "eq"; 338 | } 339 | else if (EqOperator == "+=") 340 | { 341 | name += "pe"; 342 | } 343 | else if (EqOperator == "-=") 344 | { 345 | name += "me"; 346 | } 347 | 348 | for (int ivari = 0; ivari < InputVars.size(); ++ivari) 349 | { 350 | name += InputVars[ivari].Name; 351 | for (int d = 0; d < InputVars[ivari].IndexNames.size(); ++d) 352 | { 353 | name += "_" + InputVars[ivari].IndexNames[d]; 354 | } 355 | } 356 | return name; 357 | } 358 | 359 | 360 | std::string TensorKernel::GetDimensionedNameString(Tensor *output, std::vector &inputs) 361 | { 362 | std::string name = GetNameString(); 363 | std::vector idx_sizes = GetLoopIdxSizes(output, inputs); 364 | 365 | name += "_"; 366 | for (int idxi = 0; idxi < idx_sizes.size(); ++idxi) 367 | { 368 | name += "_" + std::to_string(idx_sizes[idxi]); 369 | } 370 | 371 | return name; 372 | } 373 | 374 | 375 | std::vector TensorKernel::GetLoopIdxSizes(Tensor *output, std::vector &inputs) 376 | { 377 | std::vector idx_sizes(LoopIndices.size(), 1); //Set loop indices not in this kernel to dim=1 378 | for (int idxi = 0; idxi < output->GetRank(); ++idxi) 379 | { 380 | ACROBATIC_ASSERT(GetVarDimLoopNum(-1, idxi) >= 0 && GetVarDimLoopNum(-1, idxi) < idx_sizes.size()); 381 | idx_sizes[GetVarDimLoopNum(-1, idxi)] = output->GetDim(idxi); 382 | } 383 | 384 | for (int vari = 0; vari < InputVars.size(); ++vari) 385 | { 386 | for (int idxi = 0; idxi < inputs[vari]->GetRank(); ++idxi) 387 | { 388 | ACROBATIC_ASSERT(GetVarDimLoopNum(vari, idxi) >= 0 && GetVarDimLoopNum(vari, idxi) < idx_sizes.size()); 389 | idx_sizes[GetVarDimLoopNum(vari, idxi)] = inputs[vari]->GetDim(idxi); 390 | } 391 | } 392 | 393 | //Check to make sure that the dimensions of the tensors are compatible with the kernel 394 | for (int vari = 0; vari < InputVars.size(); ++vari) 395 | { 396 | for (int idxi = 0; idxi < InputVars[vari].LoopNums.size(); ++idxi) 397 | { 398 | ACROBATIC_ASSERT(idx_sizes[InputVars[vari].LoopNums[idxi]] == inputs[vari]->GetDim(idxi), 399 | "Incompatible tensor dimensions for kernel: " + 
KernelStr); 400 | } 401 | } 402 | return idx_sizes; 403 | } 404 | 405 | } -------------------------------------------------------------------------------- /kernel/TensorKernel.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_TENSOR_KERNEL_HPP 7 | #define ACROBATIC_TENSOR_KERNEL_HPP 8 | 9 | #include "Tensor.hpp" 10 | #include 11 | 12 | namespace acro 13 | { 14 | 15 | class KernelVar 16 | { 17 | public: 18 | KernelVar() {} 19 | std::string Name; 20 | std::vector IndexNames; 21 | std::vector LoopNums; 22 | }; 23 | 24 | 25 | class TensorKernel 26 | { 27 | protected: 28 | TensorKernel() {} 29 | public: 30 | TensorKernel(const char* kernel); 31 | TensorKernel(std::string &kernel); 32 | 33 | //The total number of loops required to execute the kernel 34 | int GetNumIndices() {return AllIndexNames.size();} 35 | 36 | //The number of outer loops in the kernel 37 | int GetNumOuterIndices() {return AllIndexNames.size() - ContractionIndexNames.size();} 38 | 39 | //The number of inner contraction loops in the kernel 40 | int GetNumContractionIndices() {return ContractionIndexNames.size();} 41 | 42 | //The number of variables referenced in the kernel (including the output tensor) 43 | int GetNumVars() {return InputVars.size()+1;} 44 | 45 | //The number of input variables referenced in the kernel 46 | int GetNumInputVars() {return InputVars.size();} 47 | 48 | //The rank of the given variable (vari = -1 for output) 49 | int GetVarRank(int vari); 50 | 51 | //Change the order of the loops which will affect the following loop_num functions and the values of Var->LoopNums 52 | std::string GetLoopIndex(int loopi) {return LoopIndices[loopi];} 53 | int GetLoopNum(std::string &idx); 54 | virtual void SetLoopIndices(std::vector &idx_list); 55 | 56 | //The loop number for the given variable/dimension (vari = -1 for output) 57 | int GetVarDimLoopNum(int vari, int dim); 58 | 59 | //The input var dim given the loop num and the input vari (vari = -1 for output) 60 | //returns (-1 if input var is invariant to that loop) 61 | int GetLoopNumVarDim(int loop_num, int vari); 62 | 63 | //Does the input var have an index matching this loop num (vari = -1 for output) 64 | bool IsVarDependentOnLoop(int vari, int loop_num); 65 | 66 | //Does the Kernel have dependence on this index 67 | bool IsDependentOnIndex(std::string &idx); 68 | 69 | //Does the Kernel have dependence on this index 70 | bool IsDependentOnLoop(int loop_num); 71 | 72 | //Does this a contraction index 73 | bool IsContractionIndex(std::string &idx); 74 | 75 | //Is this loop a contraction loop 76 | bool IsContractionLoop(int loop_num); 77 | 78 | bool IsContractionVar(int vari); 79 | 80 | //Get the highest loop number that the entire kernel depends on 81 | int GetLoopDepth(); 82 | 83 | //The highest loop number that the var varies by (vari=-1 for output) 84 | int GetVarLoopDepth(int vari); 85 | 86 | //The the name of the variable (vari=-1 for output) 87 | std::string &GetVarName(int vari); 88 | 89 | //This returns the post parsed name string 90 | std::string GetNameString(); 91 | 92 | //This returns a modified kernel string with the dimensions compatible with the tensors 93 | virtual std::string 
GetDimensionedNameString() {ACROBATIC_ASSERT(false); return "";} 94 | virtual std::string GetDimensionedNameString(Tensor *output, std::vector &inputs); 95 | 96 | std::vector GetLoopIdxSizes(Tensor *output, std::vector &inputs); 97 | 98 | std::string KernelStr; //The user provided kernel string 99 | KernelVar OutputVar; //The output var extracted from the kernel string 100 | std::string EqOperator; //The assignement operator extracted from the kernel string (=, +=) 101 | std::vector InputVars; //The input vars extracted from the kernel string 102 | std::vector AllIndexNames; //The names of all the indices extracted from the kernel string 103 | std::vector ContractionIndexNames; //The names of the contraction indices extracted from the kernel string 104 | std::vector LoopIndices; 105 | 106 | private: 107 | void ParseKernel(); 108 | void ParseKernelVar(std::string::iterator &it, KernelVar &var); 109 | void ParseVarName(std::string::iterator &it, KernelVar &var); 110 | void ParseIndexVar(std::string::iterator &it, KernelVar &var); 111 | void ParseEqOperator(std::string::iterator &it, std::string &op); 112 | void SetVarLoopNums(); 113 | }; 114 | 115 | } 116 | 117 | #endif //ACROBATIC_TENSOR_KERNEL_HPP -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | #Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | #Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | #All rights reserved. 4 | #This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #If there is no config.mk copy over the default version 7 | JUNK := $(shell if [ ! 
-f "config/config.mk" ];then cp config/defaults.mk config/config.mk; fi) 8 | include config/config.mk 9 | 10 | DIRS = exec tensor util ops kernel 11 | SOURCE_FILES = $(foreach dir,$(DIRS),$(wildcard $(dir)/*.cpp)) 12 | INCLUDE_FILES = $(foreach dir,$(DIRS),$(wildcard $(dir)/*.hpp)) AcroTensor.hpp 13 | OBJECT_FILES = $(SOURCE_FILES:.cpp=.o) 14 | INCLUDES = $(foreach dir,$(DIRS),-I../$(dir)) 15 | 16 | ifeq ($(DEBUG),YES) 17 | CXX_FLAGS = $(INCLUDES) $(CXX_DEBUG) 18 | else 19 | CXX_FLAGS = $(INCLUDES) $(CXX_OPT) 20 | endif 21 | 22 | 23 | .SUFFIXES: .cpp .o 24 | .cpp.o: 25 | cd $(>>(Ainv_ptr, A_ptr, num_batch); 24 | } 25 | else if (mdim == 2) 26 | { 27 | CudaInv2x2<<>>(Ainv_ptr, A_ptr, num_batch); 28 | } 29 | else if (mdim == 3) 30 | { 31 | CudaInv3x3<<>>(Ainv_ptr, A_ptr, num_batch); 32 | } 33 | } 34 | 35 | 36 | void CudaGPUOps::BatchMatrixDet(Tensor &Adet, Tensor &A) 37 | { 38 | int rank = A.GetRank(); 39 | int mdim = A.GetDim(rank-1); 40 | int stride = mdim*mdim; 41 | int num_batch = A.GetSize() / stride; 42 | double *A_ptr = A.GetDeviceData(); 43 | double *Adet_ptr = Adet.GetDeviceData(); 44 | if (mdim == 1) 45 | { 46 | CudaDet1x1<<>>(Adet_ptr, A_ptr, num_batch); 47 | } 48 | else if (mdim == 2) 49 | { 50 | CudaDet2x2<<>>(Adet_ptr, A_ptr, num_batch); 51 | } 52 | else if (mdim == 3) 53 | { 54 | CudaDet3x3<<>>(Adet_ptr, A_ptr, num_batch); 55 | } 56 | } 57 | 58 | 59 | void CudaGPUOps::BatchMatrixInvDet(Tensor &Ainv, Tensor &Adet, Tensor &A) 60 | { 61 | int rank = A.GetRank(); 62 | int mdim = A.GetDim(rank-1); 63 | int stride = mdim*mdim; 64 | int num_batch = A.GetSize() / stride; 65 | double *A_ptr = A.GetDeviceData(); 66 | double *Ainv_ptr = Ainv.GetDeviceData(); 67 | double *Adet_ptr = Adet.GetDeviceData(); 68 | if (mdim == 1) 69 | { 70 | CudaInvDet1x1<<>>(Ainv_ptr, Adet_ptr, A_ptr, num_batch); 71 | } 72 | else if (mdim == 2) 73 | { 74 | CudaInvDet2x2<<>>(Ainv_ptr, Adet_ptr, A_ptr, num_batch); 75 | } 76 | else if (mdim == 3) 77 | { 78 | CudaInvDet3x3<<>>(Ainv_ptr, Adet_ptr, A_ptr, num_batch); 79 | } 80 | } 81 | 82 | 83 | void CudaGPUOps::FlatIndexedScatter(Tensor &Aout, Tensor &Ain, IndexMapping &M) 84 | { 85 | double *Aout_ptr = Aout.GetDeviceData(); 86 | double *Ain_ptr = Ain.GetDeviceData(); 87 | int *M_ptr = M.GetMap().GetDeviceData(); 88 | int *InvM_ptr = M.GetInvMap().GetDeviceData(); 89 | int *InvMOff_ptr = M.GetInvMapOffsets().GetDeviceData(); 90 | int N = M.GetRangeSize(); 91 | CudaScatter<<>>(Aout_ptr, Ain_ptr, M_ptr, InvM_ptr, InvMOff_ptr, N); 92 | } 93 | 94 | 95 | void CudaGPUOps::FlatIndexedSumGather(Tensor &Aout, Tensor &Ain, IndexMapping &M) 96 | { 97 | double *Aout_ptr = Aout.GetDeviceData(); 98 | double *Ain_ptr = Ain.GetDeviceData(); 99 | int *M_ptr = M.GetMap().GetDeviceData(); 100 | int *InvM_ptr = M.GetInvMap().GetDeviceData(); 101 | int *InvMOff_ptr = M.GetInvMapOffsets().GetDeviceData(); 102 | int N = M.GetDomainSize(); 103 | 104 | CudaSumGather<<>>(Aout_ptr, Ain_ptr, M_ptr, InvM_ptr, InvMOff_ptr, N); 105 | } 106 | 107 | 108 | __global__ void CudaInv1x1(double *Ainv, double *A, int N) 109 | { 110 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 111 | if (idx < N) 112 | { 113 | Ainv[idx] = 1.0 / A[idx]; 114 | } 115 | } 116 | 117 | 118 | __global__ void CudaInv2x2(double *Ainv, double *A, int N) 119 | { 120 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 121 | if (idx < N) 122 | { 123 | int b = idx*4; 124 | double A0 = A[b]; 125 | double A1 = A[b+1]; 126 | double A2 = A[b+2]; 127 | double A3 = A[b+3]; 128 | double invdet = 1.0 / (A0*A3 - A1*A2); 129 | Ainv[b+0] = 
invdet*A3; 130 | Ainv[b+1] = -invdet*A1; 131 | Ainv[b+2] = -invdet*A2; 132 | Ainv[b+3] = invdet*A0; 133 | } 134 | } 135 | 136 | 137 | __global__ void CudaInv3x3(double *Ainv, double *A, int N) 138 | { 139 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 140 | if (idx < N) 141 | { 142 | int b = idx*9; 143 | double A0 = A[b]; 144 | double A1 = A[b+1]; 145 | double A2 = A[b+2]; 146 | double A3 = A[b+3]; 147 | double A4 = A[b+4]; 148 | double A5 = A[b+5]; 149 | double A6 = A[b+6]; 150 | double A7 = A[b+7]; 151 | double A8 = A[b+8]; 152 | double invdet = 1.0 / (A0*A4*A8 + A1*A5*A6 + A2*A3*A7 153 | - A6*A4*A2 - A7*A5*A0 - A8*A3*A1); 154 | Ainv[b+0] = invdet*(A4*A8 - A5*A7); 155 | Ainv[b+1] = invdet*(A5*A6 - A3*A8); 156 | Ainv[b+2] = invdet*(A3*A7 - A4*A6); 157 | Ainv[b+3] = invdet*(A2*A7 - A1*A8); 158 | Ainv[b+4] = invdet*(A0*A8 - A2*A6); 159 | Ainv[b+5] = invdet*(A1*A6 - A0*A7); 160 | Ainv[b+6] = invdet*(A1*A5 - A2*A4); 161 | Ainv[b+7] = invdet*(A2*A3 - A0*A5); 162 | Ainv[b+8] = invdet*(A0*A4 - A1*A3); 163 | } 164 | } 165 | 166 | 167 | __global__ void CudaDet1x1(double *Adet, double *A, int N) 168 | { 169 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 170 | if (idx < N) 171 | { 172 | Adet[idx] = A[idx]; 173 | } 174 | } 175 | 176 | 177 | __global__ void CudaDet2x2(double *Adet, double *A, int N) 178 | { 179 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 180 | if (idx < N) 181 | { 182 | int b = idx*4; 183 | double A0 = A[b]; 184 | double A1 = A[b+1]; 185 | double A2 = A[b+2]; 186 | double A3 = A[b+3]; 187 | Adet[idx] = (A0*A3 - A1*A2); 188 | } 189 | } 190 | 191 | 192 | __global__ void CudaDet3x3(double *Adet, double *A, int N) 193 | { 194 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 195 | if (idx < N) 196 | { 197 | int b = idx*9; 198 | double A0 = A[b]; 199 | double A1 = A[b+1]; 200 | double A2 = A[b+2]; 201 | double A3 = A[b+3]; 202 | double A4 = A[b+4]; 203 | double A5 = A[b+5]; 204 | double A6 = A[b+6]; 205 | double A7 = A[b+7]; 206 | double A8 = A[b+8]; 207 | Adet[idx] = (A0*A4*A8 + A1*A5*A6 + A2*A3*A7 208 | - A6*A4*A2 - A7*A5*A0 - A8*A3*A1); 209 | } 210 | } 211 | 212 | 213 | __global__ void CudaInvDet1x1(double *Ainv, double *Adet, double *A, int N) 214 | { 215 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 216 | if (idx < N) 217 | { 218 | double det = A[idx]; 219 | Adet[idx] = det; 220 | Ainv[idx] = 1.0 / det; 221 | } 222 | } 223 | 224 | 225 | __global__ void CudaInvDet2x2(double *Ainv, double *Adet, double *A, int N) 226 | { 227 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 228 | if (idx < N) 229 | { 230 | int b = idx*4; 231 | double A0 = A[b]; 232 | double A1 = A[b+1]; 233 | double A2 = A[b+2]; 234 | double A3 = A[b+3]; 235 | double det = (A0*A3 - A1*A2); 236 | Adet[idx] = det; 237 | double invdet = 1.0 / det; 238 | Ainv[b+0] = invdet*A3; 239 | Ainv[b+1] = -invdet*A1; 240 | Ainv[b+2] = -invdet*A2; 241 | Ainv[b+3] = invdet*A0; 242 | } 243 | } 244 | 245 | 246 | __global__ void CudaInvDet3x3(double *Ainv, double *Adet, double *A, int N) 247 | { 248 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 249 | if (idx < N) 250 | { 251 | int b = idx*9; 252 | double A0 = A[b]; 253 | double A1 = A[b+1]; 254 | double A2 = A[b+2]; 255 | double A3 = A[b+3]; 256 | double A4 = A[b+4]; 257 | double A5 = A[b+5]; 258 | double A6 = A[b+6]; 259 | double A7 = A[b+7]; 260 | double A8 = A[b+8]; 261 | double det = (A0*A4*A8 + A1*A5*A6 + A2*A3*A7 262 | - A6*A4*A2 - A7*A5*A0 - A8*A3*A1); 263 | Adet[idx] = det; 264 | double invdet = 1.0 / det; 265 | Ainv[b+0] = invdet*(A4*A8 - A5*A7); 266 | Ainv[b+1] = 
invdet*(A5*A6 - A3*A8); 267 | Ainv[b+2] = invdet*(A3*A7 - A4*A6); 268 | Ainv[b+3] = invdet*(A2*A7 - A1*A8); 269 | Ainv[b+4] = invdet*(A0*A8 - A2*A6); 270 | Ainv[b+5] = invdet*(A1*A6 - A0*A7); 271 | Ainv[b+6] = invdet*(A1*A5 - A2*A4); 272 | Ainv[b+7] = invdet*(A2*A3 - A0*A5); 273 | Ainv[b+8] = invdet*(A0*A4 - A1*A3); 274 | } 275 | } 276 | 277 | 278 | __global__ void CudaScatter(double *Aout, double *Ain, int *M, int *invM, int *invMOff, int N) 279 | { 280 | int i = blockIdx.x*blockDim.x + threadIdx.x; 281 | if (i < N) 282 | { 283 | Aout[i] = Ain[M[i]]; 284 | } 285 | } 286 | 287 | 288 | __global__ void CudaSumGather(double *Aout, double *Ain, int *M, int *invM, int *invMOff, int N) 289 | { 290 | int iout = blockIdx.x*blockDim.x + threadIdx.x; 291 | if (iout < N) 292 | { 293 | int in_beg = invMOff[iout]; 294 | int in_end = invMOff[iout + 1]; 295 | double sum = 0.0; 296 | for (int iin = in_beg; iin < in_end; ++iin) 297 | { 298 | sum += Ain[invM[iin]]; 299 | } 300 | Aout[iout] = sum; 301 | } 302 | } 303 | 304 | 305 | } 306 | 307 | #endif -------------------------------------------------------------------------------- /ops/CudaGPUOps.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_CUDA_GPU_OPS_HPP 7 | #define ACROBATIC_CUDA_GPU_OPS_HPP 8 | 9 | #ifdef ACRO_HAVE_CUDA 10 | 11 | #include "NonContractionOps.hpp" 12 | #include "Tensor.hpp" 13 | 14 | namespace acro 15 | { 16 | 17 | 18 | //Internal CPU operations on tensors that are exposed properly by the kernel executors. 19 | //Use of this class directly is not recommended. 20 | class CudaGPUOps : public NonContractionOps 21 | { 22 | public: 23 | void BatchMatrixInverse(Tensor &out, Tensor &in); 24 | void BatchMatrixDet(Tensor &Adet, Tensor &A); 25 | void BatchMatrixInvDet(Tensor &Ainv, Tensor &Adet, Tensor &A); 26 | 27 | void FlatIndexedScatter(Tensor &Aout, Tensor &Ain, IndexMapping &M); 28 | void FlatIndexedSumGather(Tensor &Aout, Tensor &Ain, IndexMapping &M); 29 | 30 | }; 31 | 32 | 33 | __global__ void CudaInv1x1(double *Ainv, double *A, int N); 34 | __global__ void CudaInv2x2(double *Ainv, double *A, int N); 35 | __global__ void CudaInv3x3(double *Ainv, double *A, int N); 36 | __global__ void CudaDet1x1(double *Adet, double *A, int N); 37 | __global__ void CudaDet2x2(double *Adet, double *A, int N); 38 | __global__ void CudaDet3x3(double *Adet, double *A, int N); 39 | __global__ void CudaInvDet1x1(double *Ainv, double *Adet, double *A, int N); 40 | __global__ void CudaInvDet2x2(double *Ainv, double *Adet, double *A, int N); 41 | __global__ void CudaInvDet3x3(double *Ainv, double *Adet, double *A, int N); 42 | __global__ void CudaScatter(double *Aout, double *Ain, int *M, int *invM, int *invMOff, int N); 43 | __global__ void CudaSumGather(double *Aout, double *Ain, int *M, int *invM, int *invMOff, int N); 44 | 45 | } 46 | 47 | #endif 48 | #endif //ACROBATIC_CUDA_GPU_OPS_HPP -------------------------------------------------------------------------------- /ops/NativeCPUOps.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. 
Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #include "NativeCPUOps.hpp" 7 | 8 | namespace acro 9 | { 10 | 11 | void NativeCPUOps::BatchMatrixInverse(Tensor &Ainv, Tensor &A) 12 | { 13 | int rank = A.GetRank(); 14 | int mdim = A.GetDim(rank-1); 15 | int stride = mdim*mdim; 16 | int num_batch = A.GetSize() / stride; 17 | double *A_ptr = A.GetData(); 18 | double *Ainv_ptr = Ainv.GetData(); 19 | if (mdim == 1) 20 | { 21 | for (int i = 0; i < num_batch; ++i) 22 | { 23 | Inv1x1(Ainv_ptr, A_ptr, Det1x1(A_ptr)); 24 | Ainv_ptr += stride; 25 | A_ptr += stride; 26 | } 27 | } 28 | else if (mdim == 2) 29 | { 30 | for (int i = 0; i < num_batch; ++i) 31 | { 32 | Inv2x2(Ainv_ptr, A_ptr, Det2x2(A_ptr)); 33 | Ainv_ptr += stride; 34 | A_ptr += stride; 35 | } 36 | } 37 | else if (mdim == 3) 38 | { 39 | for (int i = 0; i < num_batch; ++i) 40 | { 41 | Inv3x3(Ainv_ptr, A_ptr, Det3x3(A_ptr)); 42 | Ainv_ptr += stride; 43 | A_ptr += stride; 44 | } 45 | } 46 | } 47 | 48 | 49 | void NativeCPUOps::BatchMatrixDet(Tensor &Adet, Tensor &A) 50 | { 51 | int rank = A.GetRank(); 52 | int mdim = A.GetDim(rank-1); 53 | int stride = mdim*mdim; 54 | int num_batch = A.GetSize() / stride; 55 | double *A_ptr = A.GetData(); 56 | double *Adet_ptr = Adet.GetData(); 57 | if (mdim == 1) 58 | { 59 | for (int i = 0; i < num_batch; ++i) 60 | { 61 | Adet_ptr[i] = Det1x1(A_ptr); 62 | A_ptr += stride; 63 | } 64 | } 65 | else if (mdim == 2) 66 | { 67 | for (int i = 0; i < num_batch; ++i) 68 | { 69 | Adet_ptr[i] = Det2x2(A_ptr); 70 | A_ptr += stride; 71 | } 72 | } 73 | else if (mdim == 3) 74 | { 75 | for (int i = 0; i < num_batch; ++i) 76 | { 77 | Adet_ptr[i] = Det3x3(A_ptr); 78 | A_ptr += stride; 79 | } 80 | } 81 | } 82 | 83 | 84 | void NativeCPUOps::BatchMatrixInvDet(Tensor &Ainv, Tensor &Adet, Tensor &A) 85 | { 86 | int rank = A.GetRank(); 87 | int mdim = A.GetDim(rank-1); 88 | int stride = mdim*mdim; 89 | int num_batch = A.GetSize() / stride; 90 | double *A_ptr = A.GetData(); 91 | double *Ainv_ptr = Ainv.GetData(); 92 | double *Adet_ptr = Adet.GetData(); 93 | if (mdim == 1) 94 | { 95 | for (int i = 0; i < num_batch; ++i) 96 | { 97 | Adet_ptr[i] = Det1x1(A_ptr); 98 | Inv1x1(Ainv_ptr, A_ptr, Adet_ptr[i]); 99 | A_ptr += stride; 100 | Ainv_ptr += stride; 101 | } 102 | } 103 | else if (mdim == 2) 104 | { 105 | for (int i = 0; i < num_batch; ++i) 106 | { 107 | Adet_ptr[i] = Det2x2(A_ptr); 108 | Inv2x2(Ainv_ptr, A_ptr, Adet_ptr[i]); 109 | A_ptr += stride; 110 | Ainv_ptr += stride; 111 | } 112 | } 113 | else if (mdim == 3) 114 | { 115 | for (int i = 0; i < num_batch; ++i) 116 | { 117 | Adet_ptr[i] = Det3x3(A_ptr); 118 | Inv3x3(Ainv_ptr, A_ptr, Adet_ptr[i]); 119 | A_ptr += stride; 120 | Ainv_ptr += stride; 121 | } 122 | } 123 | } 124 | 125 | 126 | void NativeCPUOps::FlatIndexedScatter(Tensor &Aout, Tensor &Ain, IndexMapping &M) 127 | { 128 | IndexVector &I = M.GetMap(); 129 | for (int i = 0; i < I.GetSize(); i++) 130 | { 131 | Aout[i] = Ain[I[i]]; 132 | } 133 | } 134 | 135 | 136 | void NativeCPUOps::FlatIndexedSumGather(Tensor &Aout, Tensor &Ain, IndexMapping &M) 137 | { 138 | IndexVector &I = M.GetMap(); 139 | Aout.Set(0.0); 140 | for (int i = 0; i < I.GetSize(); i++) 141 | { 142 | Aout[I[i]] += Ain[i]; 143 | } 144 | } 145 | 146 | 147 | 148 | 149 | } -------------------------------------------------------------------------------- 
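
The three batched routines above share one layout convention (also spelled out in NonContractionOps.hpp further below): the matrix occupies the last two indices, so each batch entry is a contiguous block of `stride = mdim*mdim` doubles and `num_batch = GetSize()/stride` blocks are processed back to back. A minimal sketch of that pointer-walking pattern on a raw buffer, using the same closed form as `Det2x2` (illustrative only, not library code):
```
//Illustrative sketch of the batching pattern used by NativeCPUOps::BatchMatrixDet:
//each 2x2 matrix is a contiguous block of stride = mdim*mdim doubles, and the
//batch loop walks the buffer block by block.
#include <vector>
#include <cassert>

static void BatchDet2x2(double *Adet, const double *A, int num_batch)
{
    const int stride = 2*2;
    for (int i = 0; i < num_batch; ++i)
    {
        Adet[i] = A[0]*A[3] - A[1]*A[2];   //same closed form as Det2x2
        A += stride;                       //advance to the next matrix in the batch
    }
}

int main()
{
    //Two row-major 2x2 matrices: the identity (det 1) and [[1,2],[3,4]] (det -2).
    std::vector<double> A = {1,0,0,1,  1,2,3,4};
    std::vector<double> Adet(2);
    BatchDet2x2(Adet.data(), A.data(), 2);
    assert(Adet[0] == 1.0 && Adet[1] == -2.0);
    return 0;
}
```
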
/ops/NativeCPUOps.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_NATIVE_CPU_OPS_HPP 7 | #define ACROBATIC_NATIVE_CPU_OPS_HPP 8 | 9 | #include "NonContractionOps.hpp" 10 | #include "Tensor.hpp" 11 | 12 | namespace acro 13 | { 14 | 15 | 16 | //Internal CPU operations on tensors that are exposed properly by the kernel executors. 17 | //Use of this class directly is not recommended. 18 | class NativeCPUOps : public NonContractionOps 19 | { 20 | public: 21 | void BatchMatrixInverse(Tensor &Ainv, Tensor &A); 22 | void BatchMatrixDet(Tensor &Adet, Tensor &A); 23 | void BatchMatrixInvDet(Tensor &Ainv, Tensor &Adet, Tensor &A); 24 | 25 | void FlatIndexedScatter(Tensor &Aout, Tensor &Ain, IndexMapping &M); 26 | void FlatIndexedSumGather(Tensor &Aout, Tensor &Ain, IndexMapping &M); 27 | 28 | private: 29 | inline void Inv1x1(double *Ainv, double *A, double det); 30 | inline void Inv2x2(double *Ainv, double *A, double det); 31 | inline void Inv3x3(double *Ainv, double *A, double det); 32 | inline double Det1x1(double *A); 33 | inline double Det2x2(double *A); 34 | inline double Det3x3(double *A); 35 | }; 36 | 37 | 38 | inline void NativeCPUOps::Inv1x1(double *Ainv, double *A, double det) 39 | { 40 | Ainv[0] = 1.0 / det; 41 | } 42 | 43 | 44 | inline void NativeCPUOps::Inv2x2(double *Ainv, double *A, double det) 45 | { 46 | double invdet = 1.0 / det; 47 | Ainv[0] = invdet*A[3]; 48 | Ainv[1] = -invdet*A[1]; 49 | Ainv[2] = -invdet*A[2]; 50 | Ainv[3] = invdet*A[0]; 51 | 52 | } 53 | 54 | 55 | inline void NativeCPUOps::Inv3x3(double *Ainv, double *A, double det) 56 | { 57 | double invdet = 1.0 / det; 58 | Ainv[0] = invdet*(A[4]*A[8] - A[5]*A[7]); 59 | Ainv[1] = invdet*(A[5]*A[6] - A[3]*A[8]); 60 | Ainv[2] = invdet*(A[3]*A[7] - A[4]*A[6]); 61 | Ainv[3] = invdet*(A[2]*A[7] - A[1]*A[8]); 62 | Ainv[4] = invdet*(A[0]*A[8] - A[2]*A[6]); 63 | Ainv[5] = invdet*(A[1]*A[6] - A[0]*A[7]); 64 | Ainv[6] = invdet*(A[1]*A[5] - A[2]*A[4]); 65 | Ainv[7] = invdet*(A[2]*A[3] - A[0]*A[5]); 66 | Ainv[8] = invdet*(A[0]*A[4] - A[1]*A[3]); 67 | } 68 | 69 | 70 | inline double NativeCPUOps::Det1x1(double *A) 71 | { 72 | return A[0]; 73 | } 74 | 75 | 76 | inline double NativeCPUOps::Det2x2(double *A) 77 | { 78 | return (A[0]*A[3] - A[1]*A[2]); 79 | } 80 | 81 | 82 | inline double NativeCPUOps::Det3x3(double *A) 83 | { 84 | return (A[0]*A[4]*A[8] + A[1]*A[5]*A[6] + A[2]*A[3]*A[7] 85 | - A[6]*A[4]*A[2] - A[7]*A[5]*A[0] - A[8]*A[3]*A[1]); 86 | } 87 | 88 | 89 | 90 | } 91 | 92 | 93 | #endif //ACROBATIC_NATIVE_CPU_OPS_HPP -------------------------------------------------------------------------------- /ops/NonContractionOps.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
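//NonContractionOps declares the abstract interface for the non-contraction operations
//(batched small-matrix inverses/determinants and flat-indexed scatter/gather) that the
//kernel executors expose; NativeCPUOps and CudaGPUOps provide the CPU and CUDA implementations.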
5 | 6 | #ifndef ACROBATIC_NON_CONTRACTION_OPS_HPP 7 | #define ACROBATIC_NON_CONTRACTION_OPS_HPP 8 | 9 | #include 10 | #include "Tensor.hpp" 11 | #include "IndexMapping.hpp" 12 | 13 | namespace acro 14 | { 15 | 16 | 17 | class NonContractionOps 18 | { 19 | public: 20 | //Batched 1x1, 2x2, and 3x3 matrix inverses and determinents 21 | //The last 2 indices are for the matrices and the rests are batched over 22 | virtual void BatchMatrixInverse(Tensor &Ainv, Tensor &A) = 0; 23 | virtual void BatchMatrixDet(Tensor &Adet, Tensor &A) = 0; 24 | virtual void BatchMatrixInvDet(Tensor &Ainv, Tensor &Adet, Tensor &A) = 0; 25 | 26 | //Aout[i] = Ain[I[i]] 27 | virtual void FlatIndexedScatter(Tensor &Aout, Tensor &Ain, IndexMapping &M) = 0; 28 | 29 | //Aout[:] = 0.0 30 | //Aout[I[i]] += Ain[i] 31 | virtual void FlatIndexedSumGather(Tensor &Aout, Tensor &Ain, IndexMapping &M) = 0; 32 | }; 33 | 34 | } 35 | 36 | 37 | #endif //ACROBATIC_NON_CONTRACTION_OPS_HPP -------------------------------------------------------------------------------- /ops/Ops.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #include "NativeCPUOps.hpp" 7 | #include "CudaGPUOps.hpp" -------------------------------------------------------------------------------- /tensor/IndexMapping.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
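//IndexMapping stores a map M from a range index space (size RangeSize) onto a domain
//index space (size DomainSize). ComputeInverse() stably sorts the range indices by their
//M values into InvM and records, in InvMOff (length DomainSize+1), where each domain
//index's block of entries begins in InvM.
//Illustrative example: with DomainSize=3, RangeSize=4, and M = {2, 0, 2, 1},
//ComputeInverse() yields InvM = {1, 3, 0, 2} and InvMOff = {0, 1, 2, 4}.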
5 | 6 | #include "IndexMapping.hpp" 7 | #include 8 | #include 9 | 10 | namespace acro 11 | { 12 | 13 | 14 | IndexMapping::IndexMapping(int domain_size, int range_size) : 15 | DomainSize(domain_size), 16 | RangeSize(range_size), 17 | InverseComputed(false), 18 | M(range_size), 19 | InvM(range_size), 20 | InvMOff(domain_size+1) 21 | { 22 | 23 | } 24 | 25 | 26 | 27 | void IndexMapping::ComputeInverse() 28 | { 29 | ACROBATIC_ASSERT(!InverseComputed,"Can't compute the inverse mapping twice."); 30 | 31 | std::iota(&InvM[0], &InvM[RangeSize], 0); 32 | std::stable_sort(&InvM[0], &InvM[RangeSize], 33 | [this](size_t i1, size_t i2) {return M[i1] < M[i2];}); 34 | 35 | int off = 0; 36 | for (int i = 0; i < DomainSize + 1; ++i) 37 | { 38 | InvMOff[i] = off; 39 | if (off < RangeSize) 40 | { 41 | int m = M[InvM[off]]; 42 | while (off < RangeSize && M[InvM[off]] == m) 43 | { 44 | off ++; 45 | } 46 | } 47 | else 48 | { 49 | off = RangeSize; //Handle the last one 50 | } 51 | } 52 | 53 | InverseComputed = true; 54 | 55 | if (OnGPU) 56 | { 57 | InvM.SwitchFromGPU(); 58 | InvM.MoveToGPU(); 59 | InvMOff.SwitchFromGPU(); 60 | InvMOff.MoveToGPU(); 61 | } 62 | } 63 | 64 | 65 | void IndexMapping::MapToGPU() 66 | { 67 | M.MapToGPU(); 68 | if (InverseComputed) 69 | { 70 | InvM.MapToGPU(); 71 | InvMOff.MapToGPU(); 72 | } 73 | MappedToGPU = true; 74 | } 75 | 76 | 77 | void IndexMapping::MoveToGPU() 78 | { 79 | M.MoveToGPU(); 80 | if (InverseComputed) 81 | { 82 | InvM.MoveToGPU(); 83 | InvMOff.MoveToGPU(); 84 | } 85 | OnGPU = true; 86 | } 87 | 88 | 89 | void IndexMapping::SwitchToGPU() 90 | { 91 | M.SwitchToGPU(); 92 | if (InverseComputed) 93 | { 94 | InvM.SwitchToGPU(); 95 | InvMOff.SwitchToGPU(); 96 | } 97 | OnGPU = true; 98 | } 99 | 100 | 101 | void IndexMapping::UnmapFromGPU() 102 | { 103 | M.UnmapFromGPU(); 104 | if (InverseComputed) 105 | { 106 | InvM.UnmapFromGPU(); 107 | InvMOff.UnmapFromGPU(); 108 | } 109 | MappedToGPU = false; 110 | OnGPU = false; 111 | } 112 | 113 | 114 | void IndexMapping::MoveFromGPU() 115 | { 116 | M.MoveFromGPU(); 117 | if (InverseComputed) 118 | { 119 | InvM.MoveFromGPU(); 120 | InvMOff.MoveFromGPU(); 121 | } 122 | OnGPU = false; 123 | } 124 | 125 | 126 | void IndexMapping::SwitchFromGPU() 127 | { 128 | M.SwitchFromGPU(); 129 | if (InverseComputed) 130 | { 131 | InvM.SwitchFromGPU(); 132 | InvMOff.SwitchFromGPU(); 133 | } 134 | OnGPU = false; 135 | } 136 | 137 | 138 | 139 | 140 | } 141 | -------------------------------------------------------------------------------- /tensor/IndexMapping.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
5 | 6 | #ifndef ACROBATIC_INDEXMAPPING_HPP 7 | #define ACROBATIC_INDEXMAPPING_HPP 8 | 9 | #include "IndexVector.hpp" 10 | 11 | namespace acro 12 | { 13 | 14 | class IndexMapping 15 | { 16 | public: 17 | IndexMapping(int domain_size, int range_size); 18 | 19 | int GetDomainSize() {return DomainSize;} 20 | int GetRangeSize() {return RangeSize;} 21 | bool IsInverseComputed() {return InverseComputed;} 22 | 23 | int &operator[](int raw_index); 24 | void ComputeInverse(); 25 | 26 | IndexVector &GetMap(); 27 | IndexVector &GetInvMap(); 28 | IndexVector &GetInvMapOffsets(); 29 | 30 | void MapToGPU(); //Allocate memory for the data on the GPU 31 | void MoveToGPU(); //Copy the data to the GPU and flag the data as currently on the GPU 32 | void SwitchToGPU(); //Flag the data as currently onGPU 33 | void UnmapFromGPU(); //Deallocate memory on the GPU 34 | void MoveFromGPU(); //Copy the data back from the GPU and flag the data as currently on the CPU 35 | void SwitchFromGPU(); //Flag the data as currently on the CPU 36 | bool IsMappedToGPU() const {return MappedToGPU;} 37 | bool IsOnGPU() const {return OnGPU;} 38 | 39 | private: 40 | bool InverseComputed; 41 | bool MappedToGPU; 42 | bool OnGPU; 43 | int DomainSize; 44 | int RangeSize; 45 | 46 | IndexVector M; 47 | IndexVector InvM, InvMOff; 48 | }; 49 | 50 | 51 | inline int &IndexMapping::operator[](int raw_index) 52 | { 53 | return M[raw_index]; 54 | } 55 | 56 | 57 | inline IndexVector &IndexMapping::GetMap() 58 | { 59 | return M; 60 | } 61 | 62 | 63 | inline IndexVector &IndexMapping::GetInvMap() 64 | { 65 | ACROBATIC_ASSERT(InverseComputed, "Trying to access inverse mapping before the inverse is computed."); 66 | return InvM; 67 | } 68 | 69 | 70 | inline IndexVector &IndexMapping::GetInvMapOffsets() 71 | { 72 | ACROBATIC_ASSERT(InverseComputed, "Trying to access inverse mapping offsets before the inverse is computed."); 73 | return InvMOff; 74 | } 75 | 76 | } 77 | 78 | #endif -------------------------------------------------------------------------------- /tensor/IndexVector.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
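//IndexVector is a flat integer array with the same host/device movement model as Tensor:
//MapToGPU() allocates the device buffer, MoveToGPU()/MoveFromGPU() copy the data and update
//the freshness flag, and SwitchToGPU()/SwitchFromGPU() only flip the flag without copying.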
5 | 6 | #include "IndexVector.hpp" 7 | #include "Util.hpp" 8 | #include "CudaUtil.hpp" 9 | #include 10 | 11 | namespace acro 12 | { 13 | 14 | 15 | IndexVector::IndexVector() 16 | { 17 | Data = nullptr; 18 | DeviceData = nullptr; 19 | OwnsData = false; 20 | MappedToGPU = false; 21 | Initialized = false; 22 | } 23 | 24 | 25 | IndexVector::IndexVector(int dim, int *hdata, int *ddata, bool ongpu) 26 | { 27 | Initialized = false; 28 | Init(dim, hdata, ddata, ongpu); 29 | } 30 | 31 | 32 | void IndexVector::Init(int dim, int *hdata, int *ddata, bool ongpu) 33 | { 34 | ACROBATIC_ASSERT(!IsInitialized(), "Can't initilize a vector a second time.") 35 | ACROBATIC_ASSERT(dim > 0, "Cant initilize vector with dim <= 0."); 36 | Size = dim; 37 | ByteSize = dim*sizeof(int); 38 | 39 | if (hdata == nullptr) 40 | { 41 | Data = new int[Size]; 42 | OwnsData = true; 43 | } 44 | else 45 | { 46 | Data = hdata; 47 | OwnsData = false; 48 | } 49 | 50 | MappedToGPU = false; 51 | DeviceData = ddata; 52 | if (ddata != nullptr) 53 | { 54 | ACROBATIC_ASSERT(hdata != nullptr, 55 | "Acrotensor does not currently support GPU only tensors."); 56 | MappedToGPU = true; 57 | } 58 | 59 | ACROBATIC_ASSERT(ddata != nullptr || !ongpu, 60 | "Acrotensor cannot mark external data as on the GPU if no GPU pointer is provided."); 61 | 62 | OnGPU = ongpu; 63 | Initialized = true; 64 | } 65 | 66 | 67 | IndexVector::~IndexVector() 68 | { 69 | if (OwnsData) 70 | { 71 | delete [] Data; 72 | if (IsMappedToGPU()) 73 | { 74 | UnmapFromGPU(); 75 | } 76 | } 77 | } 78 | 79 | 80 | void IndexVector::Retarget(int *hdata, int *ddata) 81 | { 82 | ACROBATIC_ASSERT(!OwnsData); 83 | Data = hdata; 84 | DeviceData = ddata; 85 | } 86 | 87 | 88 | void IndexVector::MapToGPU() 89 | { 90 | #ifdef ACRO_HAVE_CUDA 91 | ACROBATIC_ASSERT(!IsMappedToGPU(), "Trying to map data to the GPU a second time."); 92 | ensureCudaContext(); 93 | acroCudaErrorCheck(cudaMalloc((void**)&DeviceData, ByteSize)); 94 | MappedToGPU = true; 95 | #endif 96 | } 97 | 98 | void IndexVector::MoveToGPU() 99 | { 100 | #ifdef ACRO_HAVE_CUDA 101 | if (!IsMappedToGPU()) 102 | { 103 | MapToGPU(); 104 | } 105 | if (!IsOnGPU()) 106 | { 107 | ensureCudaContext(); 108 | acroCudaErrorCheck(cudaMemcpy(DeviceData, Data, ByteSize, cudaMemcpyHostToDevice)); 109 | OnGPU = true; 110 | } 111 | #endif 112 | } 113 | 114 | void IndexVector::SwitchToGPU() 115 | { 116 | #ifdef ACRO_HAVE_CUDA 117 | if (!IsMappedToGPU()) 118 | { 119 | MapToGPU(); 120 | } 121 | OnGPU = true; 122 | #endif 123 | } 124 | 125 | void IndexVector::UnmapFromGPU() 126 | { 127 | #ifdef ACRO_HAVE_CUDA 128 | ACROBATIC_ASSERT(IsMappedToGPU(), "Can't unmap data that is not mapped to the GPU."); 129 | ensureCudaContext(); 130 | acroCudaErrorCheck(cudaFree(DeviceData)); 131 | MappedToGPU = false; 132 | OnGPU = false; 133 | #endif 134 | } 135 | 136 | void IndexVector::MoveFromGPU() 137 | { 138 | #ifdef ACRO_HAVE_CUDA 139 | if (IsOnGPU()) 140 | { 141 | ensureCudaContext(); 142 | acroCudaErrorCheck(cudaMemcpy(Data, DeviceData, ByteSize, cudaMemcpyDeviceToHost)); 143 | OnGPU = false; 144 | } 145 | #endif 146 | } 147 | 148 | 149 | void IndexVector::SwitchFromGPU() 150 | { 151 | #ifdef ACRO_HAVE_CUDA 152 | OnGPU = false; 153 | #endif 154 | } 155 | 156 | 157 | void IndexVector::Print() 158 | { 159 | for (int i = 0; i < GetSize(); ++i) 160 | { 161 | std::cout << Data[i] << std::endl; 162 | } 163 | std::cout << std::endl; 164 | } 165 | 166 | } 167 | -------------------------------------------------------------------------------- /tensor/IndexVector.hpp: 
-------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_INDEXVECTOR_HPP 7 | #define ACROBATIC_INDEXVECTOR_HPP 8 | 9 | #include 10 | #include "Util.hpp" 11 | 12 | namespace acro 13 | { 14 | 15 | class IndexVector 16 | { 17 | public: 18 | IndexVector(); 19 | IndexVector(int dim, int *hdata=nullptr, int *ddata=nullptr, bool ongpu=false); 20 | ~IndexVector(); 21 | void Init(int dim, int *hdata=nullptr, int *ddata=nullptr, bool ongpu=false); 22 | 23 | int GetSize() const; 24 | int *GetData() const; 25 | int *GetDeviceData() const; 26 | int *GetCurrentData() const; 27 | int &operator[](int raw_index); 28 | 29 | void Retarget(int *hdata, int *ddata); 30 | 31 | void MapToGPU(); //Allocate memory for the data on the GPU 32 | void MoveToGPU(); //Copy the data to the GPU and flag the data as currently on the GPU 33 | void SwitchToGPU(); //Flag the data as currently onGPU 34 | void UnmapFromGPU(); //Deallocate memory on the GPU 35 | void MoveFromGPU(); //Copy the data back from the GPU and flag the data as currently on the CPU 36 | void SwitchFromGPU(); //Flag the data as currently on the CPU 37 | bool IsMappedToGPU() const {return MappedToGPU;} 38 | bool IsOnGPU() const {return OnGPU;} 39 | bool IsInitialized() const {return Initialized;} 40 | 41 | void Print(); 42 | 43 | private: 44 | int Size; 45 | int ByteSize; 46 | 47 | bool Initialized; 48 | bool OwnsData; 49 | bool MappedToGPU; 50 | bool OnGPU; 51 | int *Data; 52 | int *DeviceData; 53 | }; 54 | 55 | 56 | inline int IndexVector::GetSize() const 57 | { 58 | return Size; 59 | } 60 | 61 | 62 | inline int *IndexVector::GetData() const 63 | { 64 | return Data; 65 | } 66 | 67 | 68 | inline int *IndexVector::GetDeviceData() const 69 | { 70 | return DeviceData; 71 | } 72 | 73 | 74 | inline int *IndexVector::GetCurrentData() const 75 | { 76 | return (IsOnGPU()) ? DeviceData : Data; 77 | } 78 | 79 | 80 | inline int &IndexVector::operator[](int raw_index) 81 | { 82 | #if DEBUG 83 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 84 | #endif 85 | return Data[raw_index]; 86 | } 87 | 88 | 89 | } 90 | 91 | #endif -------------------------------------------------------------------------------- /tensor/SliceTensor.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
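//A SliceTensor fixes the leading indices of an existing Tensor and aliases that tensor's
//data at the resulting offset; the remaining trailing dimensions become the slice's dims.
//Illustrative sketch (hypothetical names):
//
//  acro::Tensor T(2, 3, 4);
//  acro::SliceTensor S1(T, 1);  //3x4 view of the T(1,:,:) block sharing T's memory
//  S1(0, 0) = 5.0;              //writes T(1, 0, 0)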
5 | 6 | #include "SliceTensor.hpp" 7 | 8 | namespace acro 9 | { 10 | 11 | SliceTensor::SliceTensor(Tensor &T, std::vector &sind) 12 | { 13 | SliceInit(T, sind); 14 | } 15 | 16 | 17 | SliceTensor::SliceTensor(Tensor &T, int d0) 18 | { 19 | std::vector sind = {d0}; 20 | SliceInit(T, sind); 21 | } 22 | 23 | 24 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1) 25 | { 26 | std::vector sind = {d0, d1}; 27 | SliceInit(T, sind); 28 | } 29 | 30 | 31 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1, int d2) 32 | { 33 | std::vector sind = {d0, d1, d2}; 34 | SliceInit(T, sind); 35 | } 36 | 37 | 38 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1, int d2, int d3) 39 | { 40 | std::vector sind = {d0, d1, d2, d3}; 41 | SliceInit(T, sind); 42 | } 43 | 44 | 45 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4) 46 | { 47 | std::vector sind = {d0, d1, d2, d3, d4}; 48 | SliceInit(T, sind); 49 | } 50 | 51 | 52 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5) 53 | { 54 | std::vector sind = {d0, d1, d2, d3, d4, d5}; 55 | SliceInit(T, sind); 56 | } 57 | 58 | 59 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6) 60 | { 61 | std::vector sind = {d0, d1, d2, d3, d4, d5, d6}; 62 | SliceInit(T, sind); 63 | } 64 | 65 | 66 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7) 67 | { 68 | std::vector sind = {d0, d1, d2, d3, d4, d5, d6, d7}; 69 | SliceInit(T, sind); 70 | } 71 | 72 | 73 | SliceTensor::SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8) 74 | { 75 | std::vector sind = {d0, d1, d2, d3, d4, d5, d6, d7}; 76 | SliceInit(T, sind); 77 | } 78 | 79 | void SliceTensor::SliceInit(Tensor &T, int d0) 80 | { 81 | std::vector sind = {d0}; 82 | SliceInit(T, sind); 83 | } 84 | 85 | 86 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1) 87 | { 88 | std::vector sind = {d0,d1}; 89 | SliceInit(T, sind); 90 | } 91 | 92 | 93 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1, int d2) 94 | { 95 | std::vector sind = {d0,d1,d2}; 96 | SliceInit(T, sind); 97 | } 98 | 99 | 100 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1, int d2, int d3) 101 | { 102 | std::vector sind = {d0,d1,d2,d3}; 103 | SliceInit(T, sind); 104 | } 105 | 106 | 107 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4) 108 | { 109 | std::vector sind = {d0,d1,d2,d3,d4}; 110 | SliceInit(T, sind); 111 | } 112 | 113 | 114 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5) 115 | { 116 | std::vector sind = {d0,d1,d2,d3,d4,d5}; 117 | SliceInit(T, sind); 118 | } 119 | 120 | 121 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6) 122 | { 123 | std::vector sind = {d0,d1,d2,d3,d4,d5,d6}; 124 | SliceInit(T, sind); 125 | } 126 | 127 | 128 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7) 129 | { 130 | std::vector sind = {d0,d1,d2,d3,d4,d5,d6,d7}; 131 | SliceInit(T, sind); 132 | } 133 | 134 | 135 | void SliceTensor::SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8) 136 | { 137 | std::vector sind = {d0,d1,d2,d3,d4,d5,d6,d7,d8}; 138 | SliceInit(T, sind); 139 | } 140 | 141 | 142 | void SliceTensor::SliceInit(Tensor &T, std::vector &sind) 143 | { 144 | FullT = &T; 145 | ACROBATIC_ASSERT(T.IsInitialized(), "Can't slice an uninitilized tensor."); 146 | 
ACROBATIC_ASSERT(T.GetRank() > sind.size(), "Can't slice more dimensions than the tensor rank."); 147 | std::vector dims(T.GetRank() - sind.size()); 148 | for (int d = sind.size(); d < T.GetRank(); ++d) 149 | { 150 | dims[d - sind.size()] = T.GetDim(d); 151 | } 152 | 153 | Offset = T.GetRawIndex(sind); 154 | double *hdata = T.GetData(); 155 | double *ddata = T.GetDeviceData(); 156 | if (hdata) 157 | { 158 | hdata += Offset; 159 | } 160 | 161 | if (ddata) 162 | { 163 | ddata += Offset; 164 | } 165 | 166 | Initialized = false; 167 | Init(dims, hdata, ddata, T.IsOnGPU()); 168 | } 169 | 170 | 171 | double* SliceTensor::GetData() const 172 | { 173 | return FullT->GetData() + Offset; 174 | } 175 | 176 | 177 | double* SliceTensor::GetDeviceData() const 178 | { 179 | return FullT->GetDeviceData() + Offset; 180 | } 181 | 182 | 183 | void SliceTensor::MapToGPU() 184 | { 185 | FullT->MapToGPU(); 186 | DeviceData = FullT->GetDeviceData() + Offset; 187 | } 188 | 189 | 190 | void SliceTensor::MoveToGPU() 191 | { 192 | FullT->MoveToGPU(); //May Trigger a MapToGPU() 193 | DeviceData = FullT->GetDeviceData() + Offset; 194 | } 195 | 196 | 197 | void SliceTensor::SwitchToGPU() 198 | { 199 | FullT->SwitchToGPU(); //May Trigger a MapToGPU() 200 | DeviceData = FullT->GetDeviceData() + Offset; 201 | } 202 | 203 | 204 | void SliceTensor::MoveFromGPU() 205 | { 206 | FullT->MoveFromGPU(); 207 | 208 | } 209 | 210 | 211 | void SliceTensor::SwitchFromGPU() 212 | { 213 | FullT->SwitchFromGPU(); 214 | } 215 | 216 | 217 | bool SliceTensor::IsMappedToGPU() const 218 | { 219 | return FullT->IsMappedToGPU(); 220 | } 221 | 222 | 223 | bool SliceTensor::IsOnGPU() const 224 | { 225 | return FullT->IsOnGPU(); 226 | } 227 | 228 | 229 | void SliceTensor::UnmapFromGPU() 230 | { 231 | 232 | } 233 | 234 | 235 | } 236 | -------------------------------------------------------------------------------- /tensor/SliceTensor.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
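//SliceTensor is a non-owning view into a parent Tensor: GPU mapping, data movement, and
//freshness queries are delegated to the parent, and Retarget() is intentionally unsupported.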
5 | 6 | #ifndef ACROBATIC_SLICETENSOR_HPP 7 | #define ACROBATIC_SLICETENSOR_HPP 8 | 9 | #include "Tensor.hpp" 10 | 11 | namespace acro 12 | { 13 | 14 | class SliceTensor : public Tensor 15 | { 16 | public: 17 | SliceTensor() {}; 18 | SliceTensor(Tensor &T, std::vector &sind); 19 | SliceTensor(Tensor &T, int d0); 20 | SliceTensor(Tensor &T, int d0, int d1); 21 | SliceTensor(Tensor &T, int d0, int d1, int d2); 22 | SliceTensor(Tensor &T, int d0, int d1, int d2, int d3); 23 | SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4); 24 | SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5); 25 | SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6); 26 | SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7); 27 | SliceTensor(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8); 28 | void SliceInit(Tensor &T, std::vector &sind); 29 | void SliceInit(Tensor &T, int d0); 30 | void SliceInit(Tensor &T, int d0, int d1); 31 | void SliceInit(Tensor &T, int d0, int d1, int d2); 32 | void SliceInit(Tensor &T, int d0, int d1, int d2, int d3); 33 | void SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4); 34 | void SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5); 35 | void SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6); 36 | void SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7); 37 | void SliceInit(Tensor &T, int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8); 38 | ~SliceTensor() {} 39 | 40 | virtual void Retarget(double *hdata, double*ddata=nullptr) {ACROBATIC_ASSERT(false, "Retarget not supported on SliceTensors");} 41 | 42 | //Routines for Data on the GPU 43 | virtual double* GetData() const; 44 | virtual double* GetDeviceData() const; 45 | virtual void MapToGPU(); 46 | virtual void MoveToGPU(); 47 | virtual void SwitchToGPU(); 48 | virtual void UnmapFromGPU(); 49 | virtual void MoveFromGPU(); 50 | virtual void SwitchFromGPU(); 51 | virtual bool IsMappedToGPU() const; 52 | virtual bool IsOnGPU() const; 53 | 54 | private: 55 | Tensor *FullT; 56 | int Offset; 57 | }; 58 | 59 | } 60 | 61 | #endif //ACROBATIC_SLICETENSOR_HPP 62 | -------------------------------------------------------------------------------- /tensor/Tensor.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
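//Tensor data is stored contiguously with strides computed right-to-left, so the right-most
//index varies fastest. For example, Dims = {2, 3, 4} gives Strides = {12, 4, 1} and the
//entry (i, j, k) lives at raw index 12*i + 4*j + k.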
5 | 6 | #include "Tensor.hpp" 7 | #include "Util.hpp" 8 | #include "CudaUtil.hpp" 9 | #include 10 | 11 | namespace acro 12 | { 13 | 14 | 15 | Tensor::Tensor() 16 | { 17 | Data = nullptr; 18 | DeviceData = nullptr; 19 | OwnsData = false; 20 | MappedToGPU = false; 21 | Initialized = false; 22 | } 23 | 24 | 25 | Tensor::Tensor(std::vector &dims, double *hdata, double *ddata, bool ongpu) 26 | { 27 | Initialized = false; 28 | Init(dims, hdata, ddata, ongpu); 29 | } 30 | 31 | 32 | Tensor::Tensor(int d0, double *hdata, double *ddata, bool ongpu) 33 | { 34 | Initialized = false; 35 | std::vector dims = {d0}; 36 | Init(dims, hdata, ddata, ongpu); 37 | } 38 | 39 | 40 | Tensor::Tensor(int d0, int d1, double *hdata, double *ddata, bool ongpu) 41 | { 42 | Initialized = false; 43 | std::vector dims = {d0, d1}; 44 | Init(dims, hdata, ddata, ongpu); 45 | } 46 | 47 | 48 | Tensor::Tensor(int d0, int d1, int d2, double *hdata, double *ddata, bool ongpu) 49 | { 50 | Initialized = false; 51 | std::vector dims = {d0, d1, d2}; 52 | Init(dims, hdata, ddata, ongpu); 53 | } 54 | 55 | 56 | Tensor::Tensor(int d0, int d1, int d2, int d3, double *hdata, double *ddata, bool ongpu) 57 | { 58 | Initialized = false; 59 | std::vector dims = {d0, d1, d2, d3}; 60 | Init(dims, hdata, ddata, ongpu); 61 | } 62 | 63 | 64 | Tensor::Tensor(int d0, int d1, int d2, int d3, int d4, double *hdata, double *ddata, bool ongpu) 65 | { 66 | Initialized = false; 67 | std::vector dims = {d0, d1, d2, d3, d4}; 68 | Init(dims, hdata, ddata, ongpu); 69 | } 70 | 71 | 72 | Tensor::Tensor(int d0, int d1, int d2, int d3, int d4, int d5, double *hdata, double *ddata, bool ongpu) 73 | { 74 | Initialized = false; 75 | std::vector dims = {d0, d1, d2, d3, d4, d5}; 76 | Init(dims, hdata, ddata, ongpu); 77 | } 78 | 79 | 80 | Tensor::Tensor(int d0, int d1, int d2, int d3, int d4, int d5, int d6, double *hdata, double *ddata, bool ongpu) 81 | { 82 | Initialized = false; 83 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6}; 84 | Init(dims, hdata, ddata, ongpu); 85 | } 86 | 87 | 88 | Tensor::Tensor(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, double *hdata, double *ddata, bool ongpu) 89 | { 90 | Initialized = false; 91 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6, d7}; 92 | Init(dims, hdata, ddata, ongpu); 93 | } 94 | 95 | 96 | Tensor::Tensor(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8, double *hdata, double *ddata, bool ongpu) 97 | { 98 | Initialized = false; 99 | Init(d0, d1, d2, d3, d4, d5, d6, d7, d8, hdata, ddata, ongpu); 100 | } 101 | 102 | 103 | void Tensor::Init(int d0, double *hdata, double *ddata, bool ongpu) 104 | { 105 | std::vector dims = {d0}; 106 | Init(dims, hdata, ddata, ongpu); 107 | } 108 | 109 | 110 | void Tensor::Init(int d0, int d1, double *hdata, double *ddata, bool ongpu) 111 | { 112 | std::vector dims = {d0, d1}; 113 | Init(dims, hdata, ddata, ongpu); 114 | } 115 | 116 | 117 | void Tensor::Init(int d0, int d1, int d2, double *hdata, double *ddata, bool ongpu) 118 | { 119 | std::vector dims = {d0, d1, d2}; 120 | Init(dims, hdata, ddata, ongpu); 121 | } 122 | 123 | 124 | void Tensor::Init(int d0, int d1, int d2, int d3, double *hdata, double *ddata, bool ongpu) 125 | { 126 | std::vector dims = {d0, d1, d2, d3}; 127 | Init(dims, hdata, ddata, ongpu); 128 | } 129 | 130 | 131 | void Tensor::Init(int d0, int d1, int d2, int d3, int d4, double *hdata, double *ddata, bool ongpu) 132 | { 133 | std::vector dims = {d0, d1, d2, d3, d4}; 134 | Init(dims, hdata, ddata, ongpu); 135 | } 136 | 137 | 138 
| void Tensor::Init(int d0, int d1, int d2, int d3, int d4, int d5, double *hdata, double *ddata, bool ongpu) 139 | { 140 | std::vector dims = {d0, d1, d2, d3, d4, d5}; 141 | Init(dims, hdata, ddata, ongpu); 142 | } 143 | 144 | 145 | void Tensor::Init(int d0, int d1, int d2, int d3, int d4, int d5, int d6, double *hdata, double *ddata, bool ongpu) 146 | { 147 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6}; 148 | Init(dims, hdata, ddata, ongpu); 149 | } 150 | 151 | 152 | void Tensor::Init(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, double *hdata, double *ddata, bool ongpu) 153 | { 154 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6, d7}; 155 | Init(dims, hdata, ddata, ongpu); 156 | } 157 | 158 | 159 | void Tensor::Init(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8, double *hdata, double *ddata, bool ongpu) 160 | { 161 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6, d7, d8}; 162 | Init(dims, hdata, ddata, ongpu); 163 | } 164 | 165 | 166 | void Tensor::Init(std::vector &dims, double *hdata, double *ddata, bool ongpu) 167 | { 168 | ACROBATIC_ASSERT(!IsInitialized(), "Can't initilize a tensor a second time.") 169 | ACROBATIC_ASSERT(dims.size() > 0, "Cant initilize tensor without any dimensions."); 170 | for (int d = 0; d < dims.size(); ++d) 171 | { 172 | ACROBATIC_ASSERT(dims[d] > 0, "Can't initilize tensor with non-positive dimensions."); 173 | } 174 | Dims = dims; 175 | UpdateStrides(); 176 | ComputeSize(); 177 | if (hdata == nullptr) 178 | { 179 | Data = new double[Size]; 180 | OwnsData = true; 181 | } 182 | else 183 | { 184 | Data = hdata; 185 | OwnsData = false; 186 | } 187 | 188 | MappedToGPU = false; 189 | DeviceData = ddata; 190 | if (ddata != nullptr) 191 | { 192 | ACROBATIC_ASSERT(hdata != nullptr, 193 | "Acrotensor does not currently support GPU only tensors."); 194 | MappedToGPU = true; 195 | } 196 | 197 | ACROBATIC_ASSERT(ddata != nullptr || !ongpu, 198 | "Acrotensor cannot mark external data as on the GPU if no GPU pointer is provided."); 199 | 200 | OnGPU = ongpu; 201 | Initialized = true; 202 | } 203 | 204 | 205 | Tensor::~Tensor() 206 | { 207 | if (OwnsData) 208 | { 209 | delete [] Data; 210 | if (IsMappedToGPU()) 211 | { 212 | UnmapFromGPU(); 213 | } 214 | } 215 | } 216 | 217 | void Tensor::Reshape(std::vector &dims) 218 | { 219 | ACROBATIC_ASSERT(dims.size() > 0); 220 | for (int d = 0; d < dims.size(); ++d) 221 | { 222 | ACROBATIC_ASSERT(dims[d] > 0); 223 | } 224 | 225 | int new_size = 1; 226 | for (int d = 0; d < dims.size(); ++d) 227 | { 228 | new_size *= dims[d]; 229 | } 230 | ACROBATIC_ASSERT(new_size == Size); 231 | 232 | Dims = dims; 233 | UpdateStrides(); 234 | } 235 | 236 | 237 | void Tensor::Reshape(int d0) 238 | { 239 | std::vector dims = {d0}; 240 | Reshape(dims); 241 | } 242 | 243 | 244 | void Tensor::Reshape(int d0, int d1) 245 | { 246 | std::vector dims = {d0, d1}; 247 | Reshape(dims); 248 | } 249 | 250 | 251 | void Tensor::Reshape(int d0, int d1, int d2) 252 | { 253 | std::vector dims = {d0, d1, d2}; 254 | Reshape(dims); 255 | } 256 | 257 | 258 | void Tensor::Reshape(int d0, int d1, int d2, int d3) 259 | { 260 | std::vector dims = {d0, d1, d2, d3}; 261 | Reshape(dims); 262 | } 263 | 264 | 265 | void Tensor::Reshape(int d0, int d1, int d2, int d3, int d4) 266 | { 267 | std::vector dims = {d0, d1, d2, d3, d4}; 268 | Reshape(dims); 269 | } 270 | 271 | 272 | void Tensor::Reshape(int d0, int d1, int d2, int d3, int d4, int d5) 273 | { 274 | std::vector dims = {d0, d1, d2, d3, d4, d5}; 275 | Reshape(dims); 276 | } 
277 | 278 | 279 | void Tensor::Reshape(int d0, int d1, int d2, int d3, int d4, int d5, int d6) 280 | { 281 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6}; 282 | Reshape(dims); 283 | } 284 | 285 | 286 | void Tensor::Reshape(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7) 287 | { 288 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6, d7}; 289 | Reshape(dims); 290 | } 291 | 292 | 293 | void Tensor::Reshape(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8) 294 | { 295 | std::vector dims = {d0, d1, d2, d3, d4, d5, d6, d7, d8}; 296 | Reshape(dims); 297 | } 298 | 299 | 300 | void Tensor::Retarget(double *hdata, double *ddata) 301 | { 302 | ACROBATIC_ASSERT(!OwnsData); 303 | Data = hdata; 304 | DeviceData = ddata; 305 | } 306 | 307 | 308 | void Tensor::UpdateStrides() 309 | { 310 | Strides.resize(Dims.size()); 311 | int stride = 1; 312 | for (int d = Dims.size() - 1; d >= 0; --d) 313 | { 314 | Strides[d] = stride; 315 | stride *= Dims[d]; 316 | } 317 | } 318 | 319 | 320 | void Tensor::ComputeSize() 321 | { 322 | Size = 1; 323 | for (int d = 0; d < GetRank(); ++d) 324 | { 325 | Size *= Dims[d]; 326 | } 327 | ByteSize = Size*sizeof(double); 328 | } 329 | 330 | void Tensor::Set(double val) 331 | { 332 | if (!IsOnGPU()) 333 | { 334 | for (int i = 0; i < GetSize(); ++i) 335 | { 336 | Data[i] = val; 337 | } 338 | } 339 | else 340 | { 341 | #ifdef ACRO_HAVE_CUDA 342 | ensureCudaContext(); 343 | CudaSet<<>>(DeviceData, val, GetSize()); 344 | acroCudaErrorCheck(cudaPeekAtLastError()); 345 | #endif 346 | } 347 | } 348 | 349 | 350 | void Tensor::Mult(double c) 351 | { 352 | if (!IsOnGPU()) 353 | { 354 | for (int i = 0; i < GetSize(); ++i) 355 | { 356 | Data[i] *= c; 357 | } 358 | } 359 | else 360 | { 361 | #ifdef ACRO_HAVE_CUDA 362 | ensureCudaContext(); 363 | CudaMult<<>>(DeviceData, c, GetSize()); 364 | acroCudaErrorCheck(cudaPeekAtLastError()); 365 | #endif 366 | } 367 | } 368 | 369 | 370 | void Tensor::MapToGPU() 371 | { 372 | #ifdef ACRO_HAVE_CUDA 373 | ACROBATIC_ASSERT(!IsMappedToGPU(), "Trying to map data to the GPU a second time."); 374 | ensureCudaContext(); 375 | acroCudaErrorCheck(cudaMalloc((void**)&DeviceData, ByteSize)); 376 | MappedToGPU = true; 377 | #endif 378 | } 379 | 380 | void Tensor::MoveToGPU() 381 | { 382 | #ifdef ACRO_HAVE_CUDA 383 | if (!IsMappedToGPU()) 384 | { 385 | MapToGPU(); 386 | } 387 | if (!IsOnGPU()) 388 | { 389 | ensureCudaContext(); 390 | acroCudaErrorCheck(cudaMemcpy(DeviceData, Data, ByteSize, cudaMemcpyHostToDevice)); 391 | OnGPU = true; 392 | } 393 | #endif 394 | } 395 | 396 | void Tensor::SwitchToGPU() 397 | { 398 | #ifdef ACRO_HAVE_CUDA 399 | if (!IsMappedToGPU()) 400 | { 401 | MapToGPU(); 402 | } 403 | OnGPU = true; 404 | #endif 405 | } 406 | 407 | void Tensor::UnmapFromGPU() 408 | { 409 | #ifdef ACRO_HAVE_CUDA 410 | ACROBATIC_ASSERT(IsMappedToGPU(), "Can't unmap data that is not mapped to the GPU."); 411 | ensureCudaContext(); 412 | acroCudaErrorCheck(cudaFree(DeviceData)); 413 | MappedToGPU = false; 414 | OnGPU = false; 415 | #endif 416 | } 417 | 418 | void Tensor::MoveFromGPU() 419 | { 420 | #ifdef ACRO_HAVE_CUDA 421 | if (IsOnGPU()) 422 | { 423 | ensureCudaContext(); 424 | acroCudaErrorCheck(cudaMemcpy(Data, DeviceData, ByteSize, cudaMemcpyDeviceToHost)); 425 | OnGPU = false; 426 | } 427 | #endif 428 | } 429 | 430 | 431 | void Tensor::SwitchFromGPU() 432 | { 433 | #ifdef ACRO_HAVE_CUDA 434 | OnGPU = false; 435 | #endif 436 | } 437 | 438 | 439 | void Tensor::Print() 440 | { 441 | std::cout << "Dims: "; 442 | for (int d = 
0; d < Dims.size(); ++d) 443 | { 444 | std::cout << Dims[d] << " "; 445 | } 446 | std::cout << std::endl; 447 | 448 | std::cout << "Strides: "; 449 | for (int d = 0; d < Dims.size(); ++d) 450 | { 451 | std::cout << Strides[d] << " "; 452 | } 453 | std::cout << std::endl; 454 | 455 | for (int i = 0; i < GetSize(); ++i) 456 | { 457 | std::cout << Data[i] << std::endl; 458 | } 459 | std::cout << std::endl; 460 | } 461 | 462 | } 463 | -------------------------------------------------------------------------------- /tensor/Tensor.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_TENSOR_HPP 7 | #define ACROBATIC_TENSOR_HPP 8 | 9 | #include 10 | #include "Util.hpp" 11 | 12 | namespace acro 13 | { 14 | 15 | class Tensor 16 | { 17 | public: 18 | //Construct and empty tensor to be initilized later 19 | Tensor(); 20 | 21 | //Construct a tensor with the proper dimensions 22 | Tensor(std::vector &dims, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 23 | Tensor(int d0, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 24 | Tensor(int d0, int d1, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 25 | Tensor(int d0, int d1, int d2, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 26 | Tensor(int d0, int d1, int d2, int d3, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 27 | Tensor(int d0, int d1, int d2, int d3, int d4, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 28 | Tensor(int d0, int d1, int d2, int d3, int d4, int d5, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 29 | Tensor(int d0, int d1, int d2, int d3, int d4, int d5, int d6, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 30 | Tensor(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 31 | Tensor(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 32 | 33 | void Init(std::vector &dims, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 34 | void Init(int d0, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 35 | void Init(int d0, int d1, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 36 | void Init(int d0, int d1, int d2, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 37 | void Init(int d0, int d1, int d2, int d3, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 38 | void Init(int d0, int d1, int d2, int d3, int d4, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 39 | void Init(int d0, int d1, int d2, int d3, int d4, int d5, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 40 | void Init(int d0, int d1, int d2, int d3, int d4, int d5, int d6, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 41 | void Init(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, double *hdata=nullptr, double *ddata=nullptr, bool ongpu=false); 42 | void Init(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int d8, double *hdata=nullptr, double 
*ddata=nullptr, bool ongpu=false); 43 | 44 | ~Tensor(); 45 | 46 | //Simple index into data 47 | inline double &operator[](int raw_index); 48 | 49 | //Get the simple raw linear index from the tensor indices 50 | inline int GetRawIndex(const std::vector &indices); 51 | inline int GetRawIndex(int i0); 52 | inline int GetRawIndex(int i0, int i1); 53 | inline int GetRawIndex(int i0, int i1, int i2); 54 | inline int GetRawIndex(int i0, int i1, int i2, int i3); 55 | inline int GetRawIndex(int i0, int i1, int i2, int i3, int i4); 56 | inline int GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5); 57 | inline int GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5, int i6); 58 | inline int GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7); 59 | inline int GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8); 60 | 61 | //Tensor index into the data 62 | inline double &operator()(std::vector &indices); 63 | inline double &operator()(int i0); 64 | inline double &operator()(int i0, int i1); 65 | inline double &operator()(int i0, int i1, int i2); 66 | inline double &operator()(int i0, int i1, int i2, int i3); 67 | inline double &operator()(int i0, int i1, int i2, int i3, int i4); 68 | inline double &operator()(int i0, int i1, int i2, int i3, int i4, int i5); 69 | inline double &operator()(int i0, int i1, int i2, int i3, int i4, int i5, int i6); 70 | inline double &operator()(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7); 71 | inline double &operator()(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8); 72 | 73 | //Change the dimensions of the tensor without reorganizing the data representation 74 | void Reshape(std::vector &dims); 75 | void Reshape(int d0); 76 | void Reshape(int d0, int d1); 77 | void Reshape(int d0, int d1, int d2); 78 | void Reshape(int d0, int d1, int d2, int d3); 79 | void Reshape(int d0, int d1, int d2, int d3, int d4); 80 | void Reshape(int d0, int d1, int d2, int d3, int d4, int d5); 81 | void Reshape(int d0, int d1, int d2, int d3, int d4, int d5, int d6); 82 | void Reshape(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7); 83 | void Reshape(int d0, int d1, int d2, int d3, int d4, int d5, int d6, int d7, int i8); 84 | 85 | inline int GetRank() const; 86 | inline int GetSize() const ; 87 | inline int GetDim(int d) const ; 88 | inline int GetStride(int d) const; 89 | virtual double *GetData() const; 90 | virtual double *GetDeviceData() const; 91 | inline double *GetCurrentData() const; 92 | 93 | //Change where externally owned data is pointing 94 | virtual void Retarget(double *hdata, double*ddata=nullptr); 95 | 96 | //Routines for Data on the GPU 97 | virtual void MapToGPU(); //Allocate memory for the data on the GPU 98 | virtual void MoveToGPU(); //Copy the data to the GPU and flag the data as currently on the GPU 99 | virtual void SwitchToGPU(); //Flag the data as currently onGPU 100 | virtual void UnmapFromGPU(); //Deallocate memory on the GPU 101 | virtual void MoveFromGPU(); //Copy the data back from the GPU and flag the data as currently on the CPU 102 | virtual void SwitchFromGPU(); //Flag the data as currently on the CPU 103 | virtual bool IsMappedToGPU() const {return MappedToGPU;} 104 | virtual bool IsOnGPU() const {return OnGPU;} 105 | virtual bool IsInitialized() const {return Initialized;} 106 | 107 | void Set(double val); //Sets all values in the tensor to a constant 108 | void Mult(double c); //Multiply all values by a constant 109 | 110 | 
void Print(); 111 | 112 | protected: 113 | void UpdateStrides(); 114 | void ComputeSize(); 115 | 116 | std::vector Dims; 117 | std::vector Strides; 118 | int Size; 119 | int ByteSize; 120 | 121 | bool Initialized; 122 | bool OwnsData; 123 | bool MappedToGPU; 124 | bool OnGPU; 125 | double *Data; 126 | double *DeviceData; 127 | }; 128 | 129 | 130 | inline int Tensor::GetRank() const 131 | { 132 | return Dims.size(); 133 | } 134 | 135 | 136 | inline int Tensor::GetSize() const 137 | { 138 | return Size; 139 | } 140 | 141 | 142 | inline int Tensor::GetDim(int d) const 143 | { 144 | return Dims[d]; 145 | } 146 | 147 | 148 | inline int Tensor::GetStride(int d) const 149 | { 150 | return Strides[d]; 151 | } 152 | 153 | 154 | inline double *Tensor::GetData() const 155 | { 156 | return Data; 157 | } 158 | 159 | 160 | inline double *Tensor::GetDeviceData() const 161 | { 162 | return DeviceData; 163 | } 164 | 165 | 166 | inline double *Tensor::GetCurrentData() const 167 | { 168 | return (IsOnGPU()) ? DeviceData : Data; 169 | } 170 | 171 | 172 | 173 | inline int Tensor::GetRawIndex(const std::vector &indices) 174 | { 175 | int index = 0; 176 | for (unsigned int d = 0; d < indices.size(); ++d) 177 | { 178 | index += Strides[d] * indices[d]; 179 | } 180 | return index; 181 | } 182 | 183 | 184 | inline int Tensor::GetRawIndex(int i0) 185 | { 186 | return Strides[0]*i0; 187 | } 188 | 189 | 190 | inline int Tensor::GetRawIndex(int i0, int i1) 191 | { 192 | return Strides[0]*i0 + Strides[1]*i1; 193 | } 194 | 195 | 196 | inline int Tensor::GetRawIndex(int i0, int i1, int i2) 197 | { 198 | return Strides[0]*i0 + Strides[1]*i1 + Strides[2]*i2; 199 | } 200 | 201 | 202 | inline int Tensor::GetRawIndex(int i0, int i1, int i2, int i3) 203 | { 204 | return Strides[0]*i0 + Strides[1]*i1 + Strides[2]*i2 + Strides[3]*i3; 205 | } 206 | 207 | 208 | inline int Tensor::GetRawIndex(int i0, int i1, int i2, int i3, int i4) 209 | { 210 | return Strides[0]*i0 + Strides[1]*i1 + Strides[2]*i2 + Strides[3]*i3 + 211 | Strides[4]*i4; 212 | } 213 | 214 | 215 | inline int Tensor::GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5) 216 | { 217 | return Strides[0]*i0 + Strides[1]*i1 + Strides[2]*i2 + Strides[3]*i3 + 218 | Strides[4]*i4 + Strides[5]*i5; 219 | } 220 | 221 | 222 | inline int Tensor::GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5, int i6) 223 | { 224 | return Strides[0]*i0 + Strides[1]*i1 + Strides[2]*i2 + Strides[3]*i3 + 225 | Strides[4]*i4 + Strides[5]*i5 + Strides[6]*i6; 226 | } 227 | 228 | 229 | inline int Tensor::GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7) 230 | { 231 | return Strides[0]*i0 + Strides[1]*i1 + Strides[2]*i2 + Strides[3]*i3 + 232 | Strides[4]*i4 + Strides[5]*i5 + Strides[6]*i6 + Strides[7]*i7; 233 | } 234 | 235 | 236 | inline int Tensor::GetRawIndex(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8) 237 | { 238 | return Strides[0]*i0 + Strides[1]*i1 + Strides[2]*i2 + Strides[3]*i3 + 239 | Strides[4]*i4 + Strides[5]*i5 + Strides[6]*i6 + Strides[7]*i7 + 240 | Strides[8]*i8; 241 | } 242 | 243 | 244 | inline double &Tensor::operator()(std::vector &indices) 245 | { 246 | #if DEBUG 247 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 248 | #endif 249 | return Data[GetRawIndex(indices)]; 250 | } 251 | 252 | inline double &Tensor::operator()(int i0) 253 | { 254 | #if DEBUG 255 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 256 | 
#endif 257 | return Data[GetRawIndex(i0)]; 258 | } 259 | 260 | 261 | inline double &Tensor::operator()(int i0, int i1) 262 | { 263 | #if DEBUG 264 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 265 | #endif 266 | return Data[GetRawIndex(i0, i1)]; 267 | } 268 | 269 | 270 | inline double &Tensor::operator[](int raw_index) 271 | { 272 | #if DEBUG 273 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 274 | #endif 275 | return Data[raw_index]; 276 | } 277 | 278 | inline double &Tensor::operator()(int i0, int i1, int i2) 279 | { 280 | #if DEBUG 281 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 282 | #endif 283 | return Data[GetRawIndex(i0, i1, i2)]; 284 | } 285 | 286 | 287 | inline double &Tensor::operator()(int i0, int i1, int i2, int i3) 288 | { 289 | #if DEBUG 290 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 291 | #endif 292 | return Data[GetRawIndex(i0, i1, i2, i3)]; 293 | } 294 | 295 | 296 | inline double &Tensor::operator()(int i0, int i1, int i2, int i3, int i4) 297 | { 298 | #if DEBUG 299 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 300 | #endif 301 | return Data[GetRawIndex(i0, i1, i2, i3, i4)]; 302 | } 303 | 304 | 305 | inline double &Tensor::operator()(int i0, int i1, int i2, int i3, int i4, int i5) 306 | { 307 | #if DEBUG 308 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 309 | #endif 310 | return Data[GetRawIndex(i0, i1, i2, i3, i4, i5)]; 311 | } 312 | 313 | 314 | inline double &Tensor::operator()(int i0, int i1, int i2, int i3, int i4, int i5, int i6) 315 | { 316 | #if DEBUG 317 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 318 | #endif 319 | return Data[GetRawIndex(i0, i1, i2, i3, i4, i5, i6)]; 320 | } 321 | 322 | 323 | inline double &Tensor::operator()(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7) 324 | { 325 | #if DEBUG 326 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 327 | #endif 328 | return Data[GetRawIndex(i0, i1, i2, i3, i4, i5, i6, i7)]; 329 | } 330 | 331 | 332 | inline double &Tensor::operator()(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8) 333 | { 334 | #if DEBUG 335 | ACROBATIC_ASSERT(OnGPU, "You have accessed the CPU version of the data that is fresh on the GPU."); 336 | #endif 337 | return Data[GetRawIndex(i0, i1, i2, i3, i4, i5, i6, i7, i8)]; 338 | } 339 | 340 | } 341 | 342 | #endif //ACROBATIC_TENSOR_HPP 343 | -------------------------------------------------------------------------------- /unittest/kernel/test_DimensionedKernel.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
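//These tests check DimensionedKernel's loop-extent bookkeeping: GetFlatIdxSize() is the
//product of all index extents and factors as GetOutIdxSize()*GetContIdxSize()
//(e.g. 9 = 3*3 for "A_i=B_s_iC_i_s" on 3x3 inputs, and 175781250 = 156250*1125 for the
//large multi-index kernel below).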
5 | 6 | #include "catch.hpp" 7 | #include "DimensionedKernel.hpp" 8 | 9 | using namespace acro; 10 | 11 | 12 | TEST_CASE("DimensionedKernel operations", "[DimensionedKernel]") 13 | { 14 | Tensor T1out_3(3), T2out_3_3(3, 3), T1_3(3), T1_2(2), T2_3_3(3,3); 15 | 16 | 17 | SECTION("A_i=B_iC_i") 18 | { 19 | TensorKernel Kernel("A_i=B_iC_i"); 20 | std::vector inputs; 21 | inputs.push_back(&T1_3); 22 | inputs.push_back(&T1_3); 23 | DimensionedKernel DKernel(&Kernel, &T1out_3, inputs); 24 | REQUIRE(DKernel.GetFlatIdxSize() == 3); 25 | REQUIRE(DKernel.GetOutIdxSize() == 3); 26 | REQUIRE(DKernel.GetContIdxSize() == 1); 27 | REQUIRE(DKernel.GetOutputStorageReqForInnerLoops(1) == 3); 28 | REQUIRE(DKernel.GetInputStorageReqForInnerLoops(1) == 6); 29 | } 30 | 31 | SECTION("A_i=B_s_iC_i_s") 32 | { 33 | TensorKernel Kernel("A_i=B_s_iC_i_s"); 34 | std::vector inputs; 35 | inputs.push_back(&T2_3_3); 36 | inputs.push_back(&T2_3_3); 37 | DimensionedKernel DKernel(&Kernel, &T1out_3, inputs); 38 | REQUIRE(DKernel.GetFlatIdxSize() == 9); 39 | REQUIRE(DKernel.GetOutIdxSize() == 3); 40 | REQUIRE(DKernel.GetContIdxSize() == 3); 41 | REQUIRE(DKernel.GetOutputStorageReqForInnerLoops(1) == 1); 42 | REQUIRE(DKernel.GetOutputStorageReqForInnerLoops(2) == 3); 43 | REQUIRE(DKernel.GetInputStorageReqForInnerLoops(1) == 6); 44 | REQUIRE(DKernel.GetInputStorageReqForInnerLoops(2) == 18); 45 | } 46 | 47 | SECTION("A_i=B_s_iC_i") 48 | { 49 | TensorKernel Kernel("A_i=B_s_iC_i"); 50 | std::vector inputs; 51 | inputs.push_back(&T2_3_3); 52 | inputs.push_back(&T1_3); 53 | DimensionedKernel DKernel(&Kernel, &T1out_3, inputs); 54 | REQUIRE(DKernel.GetFlatIdxSize() == 9); 55 | REQUIRE(DKernel.GetOutIdxSize() == 3); 56 | REQUIRE(DKernel.GetContIdxSize() == 3); 57 | REQUIRE(DKernel.GetOutputStorageReqForInnerLoops(1) == 1); 58 | REQUIRE(DKernel.GetOutputStorageReqForInnerLoops(2) == 3); 59 | REQUIRE(DKernel.GetInputStorageReqForInnerLoops(1) == 4); 60 | REQUIRE(DKernel.GetInputStorageReqForInnerLoops(2) == 12); 61 | } 62 | 63 | SECTION("S_e_i1_i2_i3_j1_j2_j3=B_i1_j1_k1_m_nB_i2_j2_k2_m_nB_i3_j3_k3_m_nD_e_k1_k2_k3_m_n") 64 | { 65 | std::string kernel_str = "S_e_i1_i2_i3_j1_j2_j3 =B_i1_j1_k1_m_nB_i2_j2_k2_m_nB_i3_j3_k3_m_n D_e_k1_k2_k3_m_n"; 66 | TensorKernel Kernel(kernel_str); 67 | Tensor S(10, 5, 5, 5, 5, 5, 5); 68 | Tensor Btilde1(5, 5, 5, 3, 3); 69 | Tensor Btilde2(5, 5, 5, 3, 3); 70 | Tensor Btilde3(5, 5, 5, 3, 3); 71 | Tensor D(10, 5, 5, 5, 3, 3); 72 | std::vector inputs = {&Btilde1, &Btilde2, &Btilde3, &D}; 73 | DimensionedKernel DKernel(&Kernel, &S, inputs); 74 | REQUIRE(DKernel.GetFlatIdxSize() == 175781250); 75 | REQUIRE(DKernel.GetOutIdxSize() == 156250); 76 | REQUIRE(DKernel.GetContIdxSize() == 1125); 77 | } 78 | } -------------------------------------------------------------------------------- /unittest/kernel/test_TensorKernel.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
5 | 6 | #include "catch.hpp" 7 | #include "TensorKernel.hpp" 8 | 9 | using namespace acro; 10 | 11 | std::string reconstruct_kernel_str(TensorKernel &Kernel); 12 | 13 | TEST_CASE("TensorKernel operations", "[TensorKernel]") 14 | { 15 | SECTION("Assert Parsable String") 16 | { 17 | REQUIRE_THROWS(new TensorKernel("Blah")); 18 | REQUIRE_THROWS(new TensorKernel("Blah=")); 19 | REQUIRE_THROWS(new TensorKernel("Blah=Alah")); 20 | REQUIRE_THROWS(new TensorKernel("BLA1h=Alah")); 21 | REQUIRE_NOTHROW(new TensorKernel("B_lah=A_lah")); 22 | REQUIRE_THROWS(new TensorKernel("B_lah=A_lah_")); 23 | REQUIRE_THROWS(new TensorKernel("B_lah_=A_lah")); 24 | REQUIRE_THROWS(new TensorKernel("a_lah_=A_lah")); 25 | } 26 | 27 | SECTION("Can Reconstruct Various Kernels") 28 | { 29 | REQUIRE(reconstruct_kernel_str(*(new TensorKernel("BA_i=A_j"))) == "BA_i=A_j"); 30 | REQUIRE(reconstruct_kernel_str(*(new TensorKernel("B1_i=CB_jBr_j"))) == "B1_i=CB_jBr_j"); 31 | } 32 | 33 | SECTION("A_i=B_iC_i") 34 | { 35 | TensorKernel Kernel("A_i=B_iC_i"); 36 | 37 | SECTION("Basic Parsing") 38 | { 39 | REQUIRE(reconstruct_kernel_str(Kernel) == "A_i=B_iC_i"); 40 | REQUIRE(Kernel.AllIndexNames.size() == 1); 41 | REQUIRE(Kernel.AllIndexNames[0] == "i"); 42 | REQUIRE(Kernel.ContractionIndexNames.size() == 0); 43 | 44 | REQUIRE(Kernel.GetNumIndices() == 1); 45 | REQUIRE(Kernel.GetNumContractionIndices() == 0); 46 | REQUIRE(Kernel.GetNumVars() == 3); 47 | REQUIRE(Kernel.GetNumInputVars() == 2); 48 | REQUIRE(Kernel.GetVarDimLoopNum(0, 0) == 0); 49 | REQUIRE(Kernel.GetVarDimLoopNum(1, 0) == 0); 50 | REQUIRE(Kernel.IsVarDependentOnLoop(-1, 0)); 51 | REQUIRE(Kernel.IsVarDependentOnLoop(0, 0)); 52 | REQUIRE(Kernel.IsVarDependentOnLoop(1, 0)); 53 | } 54 | } 55 | 56 | SECTION("A_i=B_s_iC_i_s") 57 | { 58 | TensorKernel Kernel("A_i=B_s_iC_i_s"); 59 | SECTION("Basic Parsing") 60 | { 61 | REQUIRE(reconstruct_kernel_str(Kernel) == "A_i=B_s_iC_i_s"); 62 | REQUIRE(Kernel.AllIndexNames.size() == 2); 63 | REQUIRE(Kernel.AllIndexNames[0] == "i"); 64 | REQUIRE(Kernel.AllIndexNames[1] == "s"); 65 | REQUIRE(Kernel.ContractionIndexNames.size() == 1); 66 | REQUIRE(Kernel.ContractionIndexNames[0] == "s"); 67 | 68 | REQUIRE(Kernel.GetNumIndices() == 2); 69 | REQUIRE(Kernel.GetNumContractionIndices() == 1); 70 | REQUIRE(Kernel.GetNumVars() == 3); 71 | REQUIRE(Kernel.GetNumInputVars() == 2); 72 | REQUIRE(Kernel.GetVarDimLoopNum(0, 0) == 1); 73 | REQUIRE(Kernel.GetVarDimLoopNum(0, 1) == 0); 74 | REQUIRE(Kernel.GetVarDimLoopNum(1, 0) == 0); 75 | REQUIRE(Kernel.GetVarDimLoopNum(1, 1) == 1); 76 | REQUIRE(Kernel.IsVarDependentOnLoop(-1, 0)); 77 | REQUIRE(!Kernel.IsVarDependentOnLoop(-1, 1)); 78 | REQUIRE(Kernel.IsVarDependentOnLoop(0, 0)); 79 | REQUIRE(Kernel.IsVarDependentOnLoop(0, 1)); 80 | REQUIRE(Kernel.IsVarDependentOnLoop(1, 0)); 81 | REQUIRE(Kernel.IsVarDependentOnLoop(1, 1)); 82 | } 83 | } 84 | 85 | SECTION("A_i=B_s_iC_i") 86 | { 87 | TensorKernel Kernel("A_i=B_s_iC_i"); 88 | SECTION("Basic Parsing") 89 | { 90 | REQUIRE(reconstruct_kernel_str(Kernel) == "A_i=B_s_iC_i"); 91 | REQUIRE(Kernel.AllIndexNames.size() == 2); 92 | REQUIRE(Kernel.AllIndexNames[0] == "i"); 93 | REQUIRE(Kernel.AllIndexNames[1] == "s"); 94 | REQUIRE(Kernel.ContractionIndexNames.size() == 1); 95 | REQUIRE(Kernel.ContractionIndexNames[0] == "s"); 96 | 97 | REQUIRE(Kernel.GetNumIndices() == 2); 98 | REQUIRE(Kernel.GetNumContractionIndices() == 1); 99 | REQUIRE(Kernel.GetNumVars() == 3); 100 | REQUIRE(Kernel.GetNumInputVars() == 2); 101 | REQUIRE(Kernel.GetVarDimLoopNum(0, 0) == 1); 102 | 
REQUIRE(Kernel.GetVarDimLoopNum(0, 1) == 0); 103 | REQUIRE(Kernel.GetVarDimLoopNum(1, 0) == 0); 104 | REQUIRE(Kernel.IsVarDependentOnLoop(-1, 0)); 105 | REQUIRE(!Kernel.IsVarDependentOnLoop(-1, 1)); 106 | REQUIRE(Kernel.IsVarDependentOnLoop(0, 0)); 107 | REQUIRE(Kernel.IsVarDependentOnLoop(0, 1)); 108 | REQUIRE(Kernel.IsVarDependentOnLoop(1, 0)); 109 | REQUIRE(!Kernel.IsVarDependentOnLoop(1, 1)); 110 | } 111 | } 112 | 113 | SECTION("A_i1_i2=B_i1_i2_sum1") 114 | { 115 | TensorKernel Kernel("A_i1_i2=B_i1_i2_sum1"); 116 | SECTION("Basic Parsing") 117 | { 118 | REQUIRE(reconstruct_kernel_str(Kernel) == "A_i1_i2=B_i1_i2_sum1"); 119 | REQUIRE(Kernel.AllIndexNames.size() == 3); 120 | REQUIRE(Kernel.AllIndexNames[0] == "i1"); 121 | REQUIRE(Kernel.AllIndexNames[1] == "i2"); 122 | REQUIRE(Kernel.AllIndexNames[2] == "sum1"); 123 | REQUIRE(Kernel.ContractionIndexNames.size() == 1); 124 | REQUIRE(Kernel.ContractionIndexNames[0] == "sum1"); 125 | } 126 | } 127 | 128 | SECTION("S_e_i1_i2_i3_j1_j2_j3=B_i1_j1_k1_m_nB_i2_j2_k2_m_nB_i3_j3_k3_m_nD_e_k1_k2_k3_m_n") 129 | { 130 | std::string kernel_str = "S_e_i1_i2_i3_j1_j2_j3 =B_i1_j1_k1_m_nB_i2_j2_k2_m_nB_i3_j3_k3_m_n D_e_k1_k2_k3_m_n"; 131 | TensorKernel Kernel(kernel_str); 132 | SECTION("Basic Parsing") 133 | { 134 | REQUIRE(Kernel.AllIndexNames.size() == 12); 135 | REQUIRE(Kernel.AllIndexNames[0] == "e"); 136 | REQUIRE(Kernel.AllIndexNames[1] == "i1"); 137 | REQUIRE(Kernel.AllIndexNames[2] == "i2"); 138 | REQUIRE(Kernel.AllIndexNames[3] == "i3"); 139 | REQUIRE(Kernel.AllIndexNames[4] == "j1"); 140 | REQUIRE(Kernel.AllIndexNames[5] == "j2"); 141 | REQUIRE(Kernel.AllIndexNames[6] == "j3"); 142 | REQUIRE(Kernel.AllIndexNames[7] == "k1"); 143 | REQUIRE(Kernel.AllIndexNames[8] == "m"); 144 | REQUIRE(Kernel.AllIndexNames[9] == "n"); 145 | REQUIRE(Kernel.AllIndexNames[10] == "k2"); 146 | REQUIRE(Kernel.AllIndexNames[11] == "k3"); 147 | 148 | REQUIRE(Kernel.ContractionIndexNames.size() == 5); 149 | REQUIRE(Kernel.ContractionIndexNames[0] == "k1"); 150 | REQUIRE(Kernel.ContractionIndexNames[1] == "m"); 151 | REQUIRE(Kernel.ContractionIndexNames[2] == "n"); 152 | REQUIRE(Kernel.ContractionIndexNames[3] == "k2"); 153 | REQUIRE(Kernel.ContractionIndexNames[4] == "k3"); 154 | 155 | REQUIRE(Kernel.GetNumIndices() == 12); 156 | REQUIRE(Kernel.GetNumContractionIndices() == 5); 157 | REQUIRE(Kernel.GetNumVars() == 5); 158 | REQUIRE(Kernel.GetNumInputVars() == 4); 159 | } 160 | } 161 | } 162 | 163 | std::string reconstruct_kernel_str(TensorKernel &Kernel) 164 | { 165 | std::string str; 166 | str += Kernel.OutputVar.Name; 167 | for (int d = 0; d < Kernel.OutputVar.IndexNames.size(); ++d) 168 | { 169 | str += "_"; 170 | str += Kernel.OutputVar.IndexNames[d]; 171 | } 172 | 173 | str += Kernel.EqOperator; 174 | 175 | for (int vari = 0; vari < Kernel.InputVars.size(); ++vari) 176 | { 177 | str += Kernel.InputVars[vari].Name; 178 | for (int d = 0; d < Kernel.InputVars[vari].IndexNames.size(); ++d) 179 | { 180 | str += "_"; 181 | str += Kernel.InputVars[vari].IndexNames[d]; 182 | } 183 | } 184 | return str; 185 | } 186 | -------------------------------------------------------------------------------- /unittest/makefile: -------------------------------------------------------------------------------- 1 | #Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | #Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | #All rights reserved. 4 | #This file is part of Acrotensor. 
For details, see https://github.com/LLNL/acrotensor. 5 | 6 | # Serial compiler 7 | ACRO_BASEDIR = .. 8 | 9 | CXX = clang++ 10 | CXX_FLAGS = -O3 -fopenmp=libomp -std=c++11 -stdlib=libc++ 11 | CCC = $(CXX) $(CXX_FLAGS) 12 | LD_FLAGS = -Wl,-rpath,$(ACRO_BASEDIR) 13 | 14 | DIRS = exec tensor util kernel 15 | SRCDIRS = $(foreach dir,$(DIRS),$(ACRO_BASEDIR)/$(dir)) 16 | INCLUDES = -I. $(foreach dir,$(SRCDIRS),-I$(dir)) -I../inc 17 | 18 | SOURCE_FILES = unit_test_main.cpp $(sort $(wildcard ./*/*.cpp)) 19 | HEADER_FILES = catch.hpp 20 | OBJECT_FILES = $(SOURCE_FILES:.cpp=.o) 21 | ACRO_LIB = $(ACRO_BASEDIR)/lib/shared/libacrotensor.so 22 | 23 | .SUFFIXES: .cpp .o 24 | .cpp.o: 25 | $(CCC) -c $( 9 | using namespace acro; 10 | 11 | 12 | TEST_CASE("Basic SliceTensor unit tests", "[SliceTensor]") 13 | { 14 | SECTION("Prefixed sliced indexing works") 15 | { 16 | Tensor T(2, 3, 4); 17 | SliceTensor S0(T, 0); 18 | SliceTensor S1(T, 1); 19 | SliceTensor S00(T, 0, 0); 20 | SliceTensor S01(T, 0, 1); 21 | SliceTensor S02(T, 0, 2); 22 | SliceTensor S10(T, 1, 0); 23 | SliceTensor S11(T, 1, 1); 24 | SliceTensor S12(T, 1, 2); 25 | SliceTensor S20(T, 2, 0); 26 | SliceTensor S21(T, 2, 1); 27 | SliceTensor S22(T, 2, 2); 28 | 29 | REQUIRE(S0.GetRank() == 2); 30 | REQUIRE(S1.GetRank() == 2); 31 | REQUIRE(S00.GetRank() == 1); 32 | REQUIRE(S01.GetRank() == 1); 33 | REQUIRE(S10.GetRank() == 1); 34 | REQUIRE(S11.GetRank() == 1); 35 | 36 | REQUIRE(S0.GetSize() == 12); 37 | REQUIRE(S1.GetSize() == 12); 38 | REQUIRE(S00.GetSize() == 4); 39 | REQUIRE(S01.GetSize() == 4); 40 | REQUIRE(S10.GetSize() == 4); 41 | REQUIRE(S11.GetSize() == 4); 42 | 43 | REQUIRE(S0.GetDim(0) == 3); 44 | REQUIRE(S1.GetDim(0) == 3); 45 | REQUIRE(S0.GetDim(1) == 4); 46 | REQUIRE(S1.GetDim(1) == 4); 47 | REQUIRE(S00.GetDim(0) == 4); 48 | REQUIRE(S01.GetDim(0) == 4); 49 | REQUIRE(S10.GetDim(0) == 4); 50 | REQUIRE(S11.GetDim(0) == 4); 51 | 52 | REQUIRE(S0.GetStride(0) == 4); 53 | REQUIRE(S1.GetStride(0) == 4); 54 | REQUIRE(S0.GetStride(1) == 1); 55 | REQUIRE(S1.GetStride(1) == 1); 56 | REQUIRE(S00.GetStride(0) == 1); 57 | REQUIRE(S01.GetStride(0) == 1); 58 | REQUIRE(S10.GetStride(0) == 1); 59 | REQUIRE(S11.GetStride(0) == 1); 60 | 61 | for (int idx = 0; idx < 24; ++idx) 62 | { 63 | T[idx] = idx; 64 | } 65 | 66 | for (int k = 0; k < 4; ++k) 67 | { 68 | for (int j = 0; j < 3; ++j) 69 | { 70 | REQUIRE(S0(j, k) == T(0, j, k)); 71 | REQUIRE(S1(j, k) == T(1, j, k)); 72 | } 73 | REQUIRE(S00(k) == T(0,0,k)); 74 | REQUIRE(S01(k) == T(0,1,k)); 75 | REQUIRE(S02(k) == T(0,2,k)); 76 | REQUIRE(S10(k) == T(1,0,k)); 77 | REQUIRE(S11(k) == T(1,1,k)); 78 | REQUIRE(S12(k) == T(1,2,k)); 79 | REQUIRE(S20(k) == T(2,0,k)); 80 | REQUIRE(S21(k) == T(2,1,k)); 81 | REQUIRE(S22(k) == T(2,2,k)); 82 | } 83 | } 84 | 85 | SECTION("Prefixed sliced Set Method") 86 | { 87 | Tensor T(2, 3, 4); 88 | SliceTensor S0(T, 0); 89 | SliceTensor S1(T, 1); 90 | 91 | S0.Set(1.0); 92 | S1.Set(2.0); 93 | for (int j = 0; j < 3; ++ j) 94 | { 95 | for (int k = 0; k < 4; ++ k) 96 | { 97 | REQUIRE(T(0,j,k) == Approx(1.0)); 98 | REQUIRE(T(1,j,k) == Approx(2.0)); 99 | REQUIRE(S0(j,k) == Approx(1.0)); 100 | REQUIRE(S1(j,k) == Approx(2.0)); 101 | } 102 | } 103 | } 104 | 105 | SECTION("Prefixed sliced tensor Set Method on GPU") 106 | { 107 | if (isCudaReady()) 108 | { 109 | Tensor T(2, 10, 4, 4); 110 | T.SwitchToGPU(); 111 | SliceTensor S0(T, 0); 112 | SliceTensor S1(T, 1); 113 | 114 | S0.Set(1.0); 115 | S1.Set(2.0); 116 | T.MoveFromGPU(); 117 | for (int i = 0; i < 10; ++ i) 118 | { 119 | for (int j = 0; j < 4; ++ 
j) 120 | { 121 | for (int k = 0; k < 4; ++ k) 122 | { 123 | REQUIRE(T(0,i,j,k) == Approx(1.0)); 124 | REQUIRE(T(1,i,j,k) == Approx(2.0)); 125 | REQUIRE(S0(i,j,k) == Approx(1.0)); 126 | REQUIRE(S1(i,j,k) == Approx(2.0)); 127 | } 128 | } 129 | } 130 | } 131 | } 132 | 133 | SECTION("GPU Move Semantics") 134 | { 135 | if (isCudaReady()) 136 | { 137 | Tensor T(2, 3); 138 | T.MapToGPU(); 139 | double *t_cpu = T.GetData(); 140 | double *t_gpu = T.GetDeviceData(); 141 | CHECK(t_cpu != t_gpu); 142 | CHECK(T.GetCurrentData() == t_cpu); 143 | 144 | SliceTensor S(T, 0); 145 | double *s_cpu = S.GetData(); 146 | double *s_gpu = S.GetDeviceData(); 147 | CHECK(s_cpu != s_gpu); 148 | CHECK(S.GetCurrentData() == s_cpu); 149 | CHECK(S.IsMappedToGPU()); 150 | 151 | S.MoveToGPU(); 152 | CHECK(T.IsOnGPU()); 153 | CHECK(S.IsOnGPU()); 154 | CHECK(T.GetCurrentData() == t_gpu); 155 | CHECK(S.GetCurrentData() == s_gpu); 156 | 157 | S.Set(2.0); 158 | T.MoveFromGPU(); 159 | CHECK(!T.IsOnGPU()); 160 | CHECK(!S.IsOnGPU()); 161 | CHECK(T.GetCurrentData() == t_cpu); 162 | CHECK(S.GetCurrentData() == s_cpu); 163 | for (int i = 0; i < S.GetSize(); ++i) 164 | { 165 | CHECK(S[i] == Approx(2.0)); 166 | } 167 | } 168 | } 169 | } -------------------------------------------------------------------------------- /unittest/tensor/test_Tensor.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #include "catch.hpp" 7 | #include "AcroTensor.hpp" 8 | #include 9 | using namespace acro; 10 | 11 | 12 | TEST_CASE("Basic Tensor unit tests", "[Tensor]") 13 | { 14 | Tensor T1a(10), T2a(5, 6), T3a(4, 3, 1), T4a(5, 1, 3, 2), T5a(1, 1, 1, 2, 3); 15 | std::vector dims = {1, 4, 3, 2}; 16 | Tensor Tdims(dims); 17 | 18 | SECTION("Assert dims > 0") 19 | { 20 | REQUIRE_NOTHROW(new Tensor(1)); 21 | REQUIRE_THROWS(new Tensor(-1)); 22 | REQUIRE_THROWS(new Tensor(10, 0)); 23 | REQUIRE_THROWS(new Tensor(-1, 10)); 24 | REQUIRE_THROWS(new Tensor(10, -1, 10)); 25 | 26 | std::vector empty_dims; 27 | REQUIRE_THROWS(new Tensor(empty_dims)); 28 | 29 | std::vector bogus_dims = {1, 2, 3, 4, 5, 6, 7, -100}; 30 | REQUIRE_THROWS(new Tensor(bogus_dims)); 31 | 32 | REQUIRE_NOTHROW(new Tensor(dims)); 33 | } 34 | 35 | SECTION("Dimensions set properly") 36 | { 37 | SECTION("Ranks") 38 | { 39 | REQUIRE(T1a.GetRank() == 1); 40 | REQUIRE(T2a.GetRank() == 2); 41 | REQUIRE(T3a.GetRank() == 3); 42 | REQUIRE(T4a.GetRank() == 4); 43 | REQUIRE(T5a.GetRank() == 5); 44 | REQUIRE(Tdims.GetRank() == 4); 45 | } 46 | 47 | SECTION("Dims") 48 | { 49 | REQUIRE(T1a.GetDim(0) == 10); 50 | REQUIRE(T2a.GetDim(0) == 5); 51 | REQUIRE(T2a.GetDim(1) == 6); 52 | REQUIRE(T3a.GetDim(0) == 4); 53 | REQUIRE(T3a.GetDim(1) == 3); 54 | REQUIRE(T3a.GetDim(2) == 1); 55 | REQUIRE(T4a.GetDim(0) == 5); 56 | REQUIRE(T4a.GetDim(1) == 1); 57 | REQUIRE(T4a.GetDim(2) == 3); 58 | REQUIRE(T4a.GetDim(3) == 2); 59 | REQUIRE(T5a.GetDim(0) == 1); 60 | REQUIRE(T5a.GetDim(1) == 1); 61 | REQUIRE(T5a.GetDim(2) == 1); 62 | REQUIRE(T5a.GetDim(3) == 2); 63 | REQUIRE(T5a.GetDim(4) == 3); 64 | REQUIRE(Tdims.GetDim(0) == 1); 65 | REQUIRE(Tdims.GetDim(1) == 4); 66 | REQUIRE(Tdims.GetDim(2) == 3); 67 | REQUIRE(Tdims.GetDim(3) == 2); 68 | } 69 | 70 | SECTION("Sizes") 71 | { 72 | 
REQUIRE(T1a.GetSize() == 10); 73 | REQUIRE(T2a.GetSize() == 30); 74 | REQUIRE(T3a.GetSize() == 12); 75 | REQUIRE(T4a.GetSize() == 30); 76 | REQUIRE(T5a.GetSize() == 6); 77 | REQUIRE(Tdims.GetSize() == 24); 78 | } 79 | } 80 | 81 | SECTION("Index Space Covered") 82 | { 83 | std::vector covered(T4a.GetSize(), false); 84 | for (int i = 0; i < T4a.GetDim(0); ++i) 85 | { 86 | for (int j = 0; j < T4a.GetDim(1); ++j) 87 | { 88 | for (int k = 0; k < T4a.GetDim(2); ++k) 89 | { 90 | for (int l = 0; l < T4a.GetDim(3); ++l) 91 | { 92 | int raw_index = T4a.GetRawIndex(i,j,k,l); 93 | REQUIRE(raw_index >= 0); 94 | REQUIRE(raw_index < T4a.GetSize()); 95 | covered[raw_index] = true; 96 | } 97 | } 98 | } 99 | } 100 | 101 | for (int raw_index = 0; raw_index < T4a.GetSize(); ++raw_index) 102 | { 103 | REQUIRE(covered[raw_index]); 104 | } 105 | } 106 | 107 | SECTION("Accessing the Data") 108 | { 109 | T1a.Set(0.0); 110 | T2a.Set(0.0); 111 | T3a.Set(0.0); 112 | T4a.Set(0.0); 113 | T5a.Set(0.0); 114 | 115 | T1a(3) = 4.0; 116 | REQUIRE(T1a(3) == Approx(4.0)); 117 | REQUIRE(T1a[3] == Approx(4.0)); 118 | 119 | T2a(2,1) = 3.0; 120 | REQUIRE(T2a(2,1) == Approx(3.0)); 121 | REQUIRE(T2a[T2a.GetRawIndex(2,1)] == Approx(3.0)); 122 | } 123 | 124 | SECTION("Reshaping") 125 | { 126 | Tensor T(6); 127 | for (int flatidx = 0; flatidx < T.GetSize(); ++flatidx) 128 | { 129 | T[flatidx] = double(flatidx); 130 | } 131 | 132 | T.Reshape(3, 2); 133 | REQUIRE_NOTHROW(T(1,0)); 134 | REQUIRE(T(1,0) == Approx(2.0)); 135 | REQUIRE_THROWS(T.Reshape(3,4)); 136 | } 137 | 138 | SECTION("Tensor on existing data") 139 | { 140 | double data[6] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}; 141 | Tensor T(2, 3, data); 142 | REQUIRE(T(1,1) == Approx(4.0)); 143 | } 144 | 145 | SECTION("Defered initialization") 146 | { 147 | Tensor T; 148 | REQUIRE(!T.IsInitialized()); 149 | REQUIRE_NOTHROW(T.Init(2, 2)); 150 | REQUIRE(T.IsInitialized()); 151 | REQUIRE_NOTHROW(T(0,0) = 2.0); 152 | REQUIRE(T(0,0) == 2.0); 153 | } 154 | 155 | SECTION("Basic CUDA tests") 156 | { 157 | if (isCudaReady()) 158 | { 159 | Tensor T(2); 160 | T.Set(3.0); 161 | REQUIRE(T(0) == Approx(3.0)); 162 | REQUIRE(T(1) == Approx(3.0)); 163 | 164 | T.MapToGPU(); 165 | REQUIRE(T.IsMappedToGPU()); 166 | REQUIRE(!T.IsOnGPU()); 167 | 168 | T.SwitchToGPU(); 169 | REQUIRE(T.IsOnGPU()); 170 | 171 | T.Set(9.0); 172 | REQUIRE(T(0) == Approx(3.0)); //Not moved back from GPU yet 173 | REQUIRE(T(1) == Approx(3.0)); 174 | 175 | T.MoveFromGPU(); 176 | REQUIRE(!T.IsOnGPU()); 177 | REQUIRE(T(0) == Approx(9.0)); 178 | REQUIRE(T(1) == Approx(9.0)); 179 | } 180 | else 181 | { 182 | std::cout << "No GPU found. Ignoring CUDA tests." << std::endl; 183 | } 184 | } 185 | } -------------------------------------------------------------------------------- /unittest/unit_test_main.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 
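The "Index Space Covered" section above only verifies that GetRawIndex maps every multi-index to a unique slot in [0, GetSize()). The sketch below shows the usual row-major linearization that satisfies this, assuming the left-most index is the most significant; `raw_index` is a hypothetical stand-in, not the library's GetRawIndex.
```
// Sketch (not library code): row-major linearization with the left-most
// index most significant, checked against the T4a(5, 1, 3, 2) example above.
#include <cassert>
#include <vector>

static int raw_index(const std::vector<int> &dims, const std::vector<int> &idx)
{
   int raw = 0;
   for (size_t d = 0; d < dims.size(); ++d)
   {
      raw = raw * dims[d] + idx[d];   // fold in one dimension at a time
   }
   return raw;
}

int main()
{
   std::vector<int> dims = {5, 1, 3, 2};              // 5*1*3*2 == 30 entries
   assert(raw_index(dims, {0, 0, 0, 0}) == 0);
   assert(raw_index(dims, {4, 0, 2, 1}) == 29);       // last entry
   assert(raw_index(dims, {2, 0, 1, 1}) == 2*(1*3*2) + 0*(3*2) + 1*2 + 1);
   return 0;
}
```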
5 | 6 | #define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do this in one cpp file 7 | #include "catch.hpp" 8 | -------------------------------------------------------------------------------- /util/CudaUtil.cpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #include "CudaUtil.hpp" 7 | #include 8 | #include 9 | 10 | #ifdef ACRO_HAVE_CUDA 11 | namespace acro 12 | { 13 | CUcontext theCudaContext = NULL; 14 | 15 | CudaKernel::CudaKernel() : 16 | IntOpsPerIndex(0), 17 | FloatOpsPerIndex(0), 18 | MemReadsPerIndex(0), 19 | NumBlocks(0), 20 | ThreadsPerBlock(0), 21 | MaxRegCount(-1), 22 | IsMultipleBlockPerOutput(true) 23 | { 24 | 25 | } 26 | 27 | 28 | CudaKernel::~CudaKernel() 29 | { 30 | for (auto it = Textures.begin(); it != Textures.end(); ++it) 31 | { 32 | cudaDestroyTextureObject(it->second); 33 | } 34 | } 35 | 36 | 37 | cudaTextureObject_t CudaKernel::GetTextureObject(int id) 38 | { 39 | return Textures[id]; 40 | } 41 | 42 | 43 | void CudaKernel::GenerateFunction() 44 | { 45 | ensureCudaContext(); 46 | nvrtcProgram prog; 47 | acroCudaErrorCheck(nvrtcCreateProgram(&prog, // prog 48 | Code.c_str(), // buffer 49 | NULL, // name 50 | 0, // numHeaders 51 | NULL, // headers 52 | NULL)); // includeNames 53 | 54 | std::string regstr = "--maxrregcount=" + std::to_string(MaxRegCount); 55 | const char *opts[5] = {"--restrict","--use_fast_math","--gpu-architecture=compute_60","-lineinfo",regstr.c_str()}; 56 | int num_options = (MaxRegCount > 0) ? 5 : 4; 57 | nvrtcResult rcode = nvrtcCompileProgram(prog, // prog 58 | num_options, // numOptions 59 | opts); // options 60 | if (rcode != NVRTC_SUCCESS) 61 | { 62 | std::cout << "NVRTC Compilation error found in:" << std::endl; 63 | std::cout << Code << std::endl; 64 | size_t log_size; 65 | nvrtcGetProgramLogSize(prog, &log_size); 66 | char *compile_log = new char[log_size]; 67 | nvrtcGetProgramLog(prog, compile_log); 68 | std::cout << compile_log << std::endl; 69 | delete[] compile_log; 70 | throw_error("Encountered in CudaKernel::GenerateFunction()"); 71 | } 72 | 73 | 74 | size_t ptxSize; 75 | acroCudaErrorCheck(nvrtcGetPTXSize(prog, &ptxSize)); 76 | char *ptx = new char[ptxSize]; 77 | acroCudaErrorCheck(nvrtcGetPTX(prog, ptx)); 78 | // Load the generated PTX and get a handle to the kernel. 
79 | acroCudaErrorCheck(cuModuleLoadDataEx(&Module, ptx, 0, 0, 0)); 80 | acroCudaErrorCheck(cuModuleGetFunction(&Function, Module, FunctionName.c_str())); 81 | acroCudaErrorCheck(nvrtcDestroyProgram(&prog)); 82 | 83 | delete [] ptx; 84 | } 85 | 86 | 87 | void CudaKernel::SetGlobalArray(std::string &name, std::vector &arr) 88 | { 89 | CUdeviceptr device_arr; 90 | int bytesize = sizeof(int)*arr.size(); 91 | acroCudaErrorCheck(cuModuleGetGlobal(&device_arr, NULL, Module, name.c_str())); 92 | acroCudaErrorCheck(cudaMemcpy((void*)device_arr, &arr[0], bytesize, cudaMemcpyHostToDevice)); 93 | } 94 | 95 | void CudaKernel::Launch(std::vector &kernel_params, cudaStream_t cuda_stream) 96 | { 97 | ensureCudaContext(); 98 | acroCudaErrorCheck(cuLaunchKernel(Function, 99 | NumBlocks, 1, 1, // grid dim 100 | ThreadsPerBlock, 1, 1, // threads per block 101 | 0, cuda_stream, // shared mem and stream 102 | &kernel_params[0], 0)); // arguments 103 | } 104 | 105 | 106 | void CudaKernel::WriteCodeToFile(const char *fname) 107 | { 108 | std::string fname_str(fname); 109 | WriteCodeToFile(fname_str); 110 | } 111 | 112 | 113 | void CudaKernel::WriteCodeToFile(std::string &fname) 114 | { 115 | std::ofstream file; 116 | file.open(fname); 117 | file << Code; 118 | file.close(); 119 | } 120 | 121 | 122 | 123 | __global__ void CudaSet(double *d, double val, int N) 124 | { 125 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 126 | if (idx < N) 127 | { 128 | d[idx] = val; 129 | } 130 | } 131 | 132 | 133 | __global__ void CudaMult(double *d, double c, int N) 134 | { 135 | int idx = blockIdx.x*blockDim.x + threadIdx.x; 136 | if (idx < N) 137 | { 138 | d[idx] *= c; 139 | } 140 | } 141 | 142 | 143 | __device__ int2 CudaWarpSort(int2 val) 144 | { 145 | int2 val2 = val; 146 | // const int lanei = threadIdx.x % 32; 147 | // const bool odd = threadIdx.x % 2 == 1; 148 | // const bool even = !odd; 149 | // bool comp_less; 150 | // int2 comp_val; 151 | // for (int pass = 0; pass < 32; ++pass) 152 | // { 153 | // //Even pass 154 | // comp_val.x = __shfl_sync(0xFFFF, val, lanei + even - odd); 155 | // comp_val.y = __shfl_sync(0xFFFF, val, lanei + even - odd); 156 | // comp_less = (comp_val.x < val.x) || ((comp_val.x == val.x) && (comp_val.y < val.y)); 157 | // val.x = int(even && (comp_less) || odd && (!comp_less)) * comp_val.x + 158 | // int(even && (!comp_less) || odd && (comp_less)) * val.x; 159 | // val.y = int(even && (comp_less) || odd && (!comp_less)) * comp_val.y + 160 | // int(even && (!comp_less) || odd && (comp_less)) * val.y; 161 | 162 | // //Odd pass 163 | // comp_val.x = __shfl_sync(0xFFFF, val, min(max(lanei - even + odd, 0), 31)); 164 | // comp_val.y = __shfl_sync(0xFFFF, val, min(max(lanei - even + odd, 0), 31)); 165 | // comp_less = (comp_val.x < val.x) || (comp_val.x == val.x) && (comp_val.y < val.y); 166 | // val.x = int(odd && (comp_less) || even && (!comp_less)) * comp_val.x + 167 | // int(odd && (!comp_less) || even && (comp_less)) * val.x; 168 | // val.y = int(odd && (comp_less) || even && (!comp_less)) * comp_val.y + 169 | // int(odd && (!comp_less) || even && (comp_less)) * val.y; 170 | // } 171 | return val2; 172 | } 173 | 174 | 175 | __device__ int2 shfl_sync_int2(unsigned mask, int2 val, int srcLane, int width) 176 | { 177 | int2 retval; 178 | retval.x = __shfl_sync(mask, val.x, srcLane, width); 179 | retval.y = __shfl_sync(mask, val.y, srcLane, width); 180 | return retval; 181 | } 182 | 183 | } 184 | 185 | #endif -------------------------------------------------------------------------------- 
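For orientation, here is a rough sketch of how the CudaKernel pieces defined above fit together. This is an assumption about driving the class directly (in the library it is presumably driven by the executors); the kernel source, function name, and launch shape are invented for illustration.
```
// Hypothetical sketch only: drives CudaKernel by hand with an invented kernel.
#ifdef ACRO_HAVE_CUDA
#include <vector>
#include "CudaUtil.hpp"

void example_scale(double *d_buf, double c, int n)     // d_buf is a device pointer
{
   acro::CudaKernel kernel;
   kernel.FunctionName = "scale";                      // must match the __global__ name
   kernel.Code =
      "extern \"C\" __global__ void scale(double *d, double c, int n)\n"
      "{\n"
      "   int idx = blockIdx.x*blockDim.x + threadIdx.x;\n"
      "   if (idx < n) d[idx] *= c;\n"
      "}\n";
   kernel.GenerateFunction();                          // NVRTC compile + module load

   kernel.ThreadsPerBlock = 256;
   kernel.NumBlocks = (n + 255) / 256;

   std::vector<void*> params = {&d_buf, &c, &n};       // pointers to each argument
   kernel.Launch(params);                              // default stream
}
#endif
```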
/util/CudaUtil.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_CUDA_UTIL_HPP 7 | #define ACROBATIC_CUDA_UTIL_HPP 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "Error.hpp" 15 | #include 16 | 17 | #ifdef ACRO_HAVE_CUDA 18 | #include "cuda.h" 19 | #include "nvrtc.h" 20 | #include "cuda_runtime.h" 21 | #endif 22 | 23 | namespace acro 24 | { 25 | 26 | #define RESTRICT __restrict__ 27 | 28 | #ifdef ACRO_HAVE_CUDA 29 | class CudaKernel 30 | { 31 | public: 32 | CudaKernel(); 33 | ~CudaKernel(); 34 | void GenerateFunction(); 35 | void SetGlobalArray(std::string &ame, std::vector &arr); 36 | void WriteCodeToFile(const char *fname); 37 | void WriteCodeToFile(std::string &fname); 38 | template 39 | inline void AddTextureData(int id, std::vector &data); 40 | cudaTextureObject_t GetTextureObject(int id); 41 | void Launch(std::vector &kernel_params, cudaStream_t cuda_stream = NULL); 42 | 43 | std::string FunctionName; 44 | std::string Code; 45 | CUmodule Module; 46 | CUfunction Function; 47 | int IntOpsPerIndex; 48 | int FloatOpsPerIndex; 49 | int MemReadsPerIndex; 50 | int NumBlocks; 51 | int ThreadsPerBlock; 52 | int MaxRegCount; 53 | bool IsMultipleBlockPerOutput; 54 | 55 | private: 56 | std::map Textures; 57 | }; 58 | 59 | #define acroCudaErrorCheck(ans) acroCudaAssert((ans), __FILE__, __LINE__); 60 | inline void acroCudaAssert(cudaError_t code, const char *file, int line) 61 | { 62 | if (code != cudaSuccess) 63 | { 64 | fprintf(stderr,"CUDA Error: %s\n", cudaGetErrorString(code)); 65 | throw_error(std::string("Encountered at: ") + std::string(file) + ": " + std::to_string(line)); 66 | } 67 | } 68 | 69 | 70 | inline void acroCudaAssert(nvrtcResult code, const char *file, int line) 71 | { 72 | if (code != NVRTC_SUCCESS) 73 | { 74 | fprintf(stderr,"NVRTC Error: %s\n", nvrtcGetErrorString(code)); 75 | throw_error(std::string("Encountered at: ") + std::string(file) + ": " + std::to_string(line)); 76 | } 77 | } 78 | 79 | 80 | inline void acroCudaAssert(CUresult code, const char *file, int line) 81 | { 82 | if (code != CUDA_SUCCESS) 83 | { 84 | const char *msg; 85 | cuGetErrorName(code, &msg); 86 | fprintf(stderr,"CUDA Error: %s\n", msg); 87 | throw_error(std::string("Encountered at: ") + std::string(file) + ": " + std::to_string(line)); 88 | } 89 | } 90 | 91 | 92 | extern CUcontext theCudaContext; 93 | inline void setCudaContext(void *ctx) 94 | { 95 | 96 | theCudaContext = (CUcontext) ctx; 97 | 98 | } 99 | 100 | 101 | inline void ensureCudaContext() 102 | { 103 | if (!theCudaContext) 104 | { 105 | acroCudaErrorCheck(cuCtxCreate(&theCudaContext, 0, 0)); 106 | } 107 | acroCudaErrorCheck(cuCtxSetCurrent(theCudaContext)); 108 | } 109 | 110 | 111 | template 112 | inline void CudaKernel::AddTextureData(int id, std::vector &data) 113 | { 114 | int Tsize = sizeof(T); 115 | int bitT = Tsize * 8; 116 | int bitTo2 = bitT / 2; 117 | int bitTo4 = bitT / 4; 118 | int arr_bytesize = Tsize*data.size(); 119 | T *buffer; 120 | acroCudaErrorCheck(cudaMalloc(&buffer, arr_bytesize)); 121 | acroCudaErrorCheck(cudaMemcpy((void*)buffer, &data[0], arr_bytesize, cudaMemcpyHostToDevice)); 122 | 123 | // create texture 
object 124 | cudaResourceDesc resDesc; 125 | std::memset(&resDesc, 0, sizeof(resDesc)); 126 | resDesc.resType = cudaResourceTypeLinear; 127 | resDesc.res.linear.devPtr = buffer; 128 | resDesc.res.linear.sizeInBytes = arr_bytesize; 129 | if (std::is_same::value || std::is_same::value || std::is_same::value) 130 | { 131 | resDesc.res.linear.desc = cudaCreateChannelDesc( bitTo2, bitTo2, 0, 0, cudaChannelFormatKindUnsigned); 132 | } 133 | else if (std::is_same::value || std::is_same::value || std::is_same::value) 134 | { 135 | resDesc.res.linear.desc = cudaCreateChannelDesc( bitTo2, bitTo2, 0, 0, cudaChannelFormatKindSigned); 136 | } 137 | else if (std::is_same::value) 138 | { 139 | resDesc.res.linear.desc = cudaCreateChannelDesc( bitTo2, bitTo2, 0, 0, cudaChannelFormatKindFloat); 140 | } 141 | else if (std::is_same::value || std::is_same::value || std::is_same::value) 142 | { 143 | resDesc.res.linear.desc = cudaCreateChannelDesc( bitTo4, bitTo4, bitTo4, bitTo4, cudaChannelFormatKindUnsigned); 144 | } 145 | else if (std::is_same::value || std::is_same::value || std::is_same::value) 146 | { 147 | resDesc.res.linear.desc = cudaCreateChannelDesc( bitTo4, bitTo4, bitTo4, bitTo4, cudaChannelFormatKindSigned); 148 | } 149 | else if (std::is_same::value) 150 | { 151 | resDesc.res.linear.desc = cudaCreateChannelDesc( bitTo4, bitTo4, bitTo4, bitTo4, cudaChannelFormatKindFloat); 152 | } 153 | else 154 | { 155 | resDesc.res.linear.desc = cudaCreateChannelDesc( bitTo2, bitTo2, 0, 0, cudaChannelFormatKindUnsigned); 156 | } 157 | 158 | 159 | cudaTextureDesc texDesc; 160 | std::memset(&texDesc, 0, sizeof(texDesc)); 161 | texDesc.readMode = cudaReadModeElementType; 162 | 163 | Textures[id] = 0; 164 | cudaCreateTextureObject(&Textures[id], &resDesc, &texDesc, NULL); 165 | } 166 | 167 | __global__ void CudaSet(double *d, double val, int N); 168 | __global__ void CudaMult(double *d, double c, int N); 169 | 170 | __device__ int2 CudaWarpSort(int2 val); 171 | __device__ int2 shfl_sync_int2(unsigned mask, int2 var, int srcLane, int width=32); 172 | 173 | #endif 174 | 175 | inline bool isCudaReady() 176 | { 177 | #ifndef ACRO_HAVE_CUDA 178 | return false; 179 | #else 180 | int cuda_device_count = -1; 181 | cudaGetDeviceCount(&cuda_device_count); 182 | return (cuda_device_count > 0); 183 | #endif 184 | } 185 | 186 | 187 | 188 | } 189 | 190 | #endif //ACROBATIC_CUDA_UTIL_HPP -------------------------------------------------------------------------------- /util/Error.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_ERROR_HPP 7 | #define ACROBATIC_ERROR_HPP 8 | 9 | #include 10 | #include 11 | 12 | namespace acro 13 | { 14 | 15 | #define GET_MACRO(_1,_2,NAME,...) NAME 16 | #define ACROBATIC_ASSERT(...) 
GET_MACRO(__VA_ARGS__, ACROBATIC_ASSERT2, ACROBATIC_ASSERT1)(__VA_ARGS__) 17 | #define ACROBATIC_ASSERT1(EX) if (!(EX)) throw_error(std::string(__FILE__) + ": " + std::to_string(__LINE__)); 18 | #define ACROBATIC_ASSERT2(EX, STR) if (!(EX)) throw_error(std::string(__FILE__) + ": " + std::to_string(__LINE__) + " " + STR); 19 | 20 | inline void throw_error(std::string error) 21 | { 22 | throw std::runtime_error(error); 23 | } 24 | 25 | } 26 | 27 | #endif //ACROBATIC_ERROR_HPP -------------------------------------------------------------------------------- /util/StringUtil.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_STRING_UTIL_HPP 7 | #define ACROBATIC_STRING_UTIL_HPP 8 | 9 | #include <string> 10 | 11 | namespace acro 12 | { 13 | 14 | inline void str_replace_all(std::string &instr, const std::string &keystr, const std::string &repstr) 15 | { 16 | std::size_t instr_pos = instr.find(keystr); 17 | while (instr_pos != std::string::npos) 18 | { 19 | instr.replace(instr_pos, keystr.length(), repstr); 20 | instr_pos = instr.find(keystr); 21 | } 22 | } 23 | 24 | inline void str_replace_all(std::string &instr, const std::string &keystr, const int repint) 25 | { 26 | str_replace_all(instr, keystr, std::to_string(repint)); 27 | } 28 | 29 | } 30 | 31 | #endif //ACROBATIC_STRING_UTIL_HPP -------------------------------------------------------------------------------- /util/Util.hpp: -------------------------------------------------------------------------------- 1 | //Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory 2 | //Written by Aaron Fisher (fisher47@llnl.gov). LLNL-CODE-738419. 3 | //All rights reserved. 4 | //This file is part of Acrotensor. For details, see https://github.com/LLNL/acrotensor. 5 | 6 | #ifndef ACROBATIC_UTIL_HPP 7 | #define ACROBATIC_UTIL_HPP 8 | 9 | #include "Error.hpp" 10 | #include "CudaUtil.hpp" 11 | #include "StringUtil.hpp" 12 | 13 | #endif --------------------------------------------------------------------------------
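The str_replace_all helpers in StringUtil.hpp substitute every occurrence of a key in place; note that the search restarts from the beginning of the string after each replacement, so the replacement text should not re-introduce the key. A small usage sketch with illustrative values, presumably in the spirit of how generated kernel source gets specialized to fixed sizes:
```
// Usage sketch for str_replace_all (illustrative values only).
#include <cassert>
#include <string>
#include "StringUtil.hpp"

int main()
{
   std::string code = "for (int i = 0; i < N; ++i) { out[i] = in[i]; }";
   acro::str_replace_all(code, "N", 16);   // int overload routes through std::to_string
   assert(code == "for (int i = 0; i < 16; ++i) { out[i] = in[i]; }");
   return 0;
}
```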