├── .gitignore ├── doc ├── bw_k20x.png ├── bw_titanx.png └── k20x_bench.png ├── src ├── calls.h ├── cuttTypes.h ├── cuttkernel.h ├── CudaMemcpy.h ├── cuttGpuModelKernel.h ├── TensorTester.h ├── LRUCache.h ├── cutt.h ├── cuttTimer.h ├── cuttGpuModel.h ├── CudaMemcpy.cu ├── CudaUtils.cu ├── CudaUtils.h ├── cuttplan.h ├── cuttTimer.cpp ├── TensorTester.cu ├── int_vector.h ├── cutt.cpp ├── cutt_test.cpp ├── cuttGpuModelKernel.cu ├── cutt_bench.cpp └── cuttkernel.cu ├── include └── cutt.h ├── Makefile └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | build/ 3 | lib/ 4 | -------------------------------------------------------------------------------- /doc/bw_k20x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jittor/cutt/HEAD/doc/bw_k20x.png -------------------------------------------------------------------------------- /doc/bw_titanx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jittor/cutt/HEAD/doc/bw_titanx.png -------------------------------------------------------------------------------- /doc/k20x_bench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jittor/cutt/HEAD/doc/k20x_bench.png -------------------------------------------------------------------------------- /src/calls.h: -------------------------------------------------------------------------------- 1 | #if MAX_REG_STORAGE >= 1 2 | CALL(1); 3 | #endif 4 | #if MAX_REG_STORAGE >= 2 5 | CALL(2); 6 | #endif 7 | #if MAX_REG_STORAGE >= 3 8 | CALL(3); 9 | #endif 10 | #if MAX_REG_STORAGE >= 4 11 | CALL(4); 12 | #endif 13 | #if MAX_REG_STORAGE >= 5 14 | CALL(5); 15 | #endif 16 | #if MAX_REG_STORAGE >= 6 17 | CALL(6); 18 | #endif 19 | #if MAX_REG_STORAGE >= 7 20 | CALL(7); 21 | #endif 22 | #if MAX_REG_STORAGE >= 8 23 | CALL(8); 24 | #endif 25 | #if MAX_REG_STORAGE >= 9 26 | CALL(9); 27 | #endif 28 | #if MAX_REG_STORAGE >= 10 29 | CALL(10); 30 | #endif 31 | 32 | #if MAX_REG_STORAGE >= 11 33 | CALL(11); 34 | #endif 35 | #if MAX_REG_STORAGE >= 12 36 | CALL(12); 37 | #endif 38 | #if MAX_REG_STORAGE >= 13 39 | CALL(13); 40 | #endif 41 | #if MAX_REG_STORAGE >= 14 42 | CALL(14); 43 | #endif 44 | #if MAX_REG_STORAGE >= 15 45 | CALL(15); 46 | #endif 47 | #if MAX_REG_STORAGE >= 16 48 | CALL(16); 49 | #endif 50 | #if MAX_REG_STORAGE >= 17 51 | CALL(17); 52 | #endif 53 | #if MAX_REG_STORAGE >= 18 54 | CALL(18); 55 | #endif 56 | #if MAX_REG_STORAGE >= 19 57 | CALL(19); 58 | #endif 59 | #if MAX_REG_STORAGE >= 20 60 | CALL(20); 61 | #endif 62 | 63 | #if MAX_REG_STORAGE >= 21 64 | CALL(21); 65 | #endif 66 | #if MAX_REG_STORAGE >= 22 67 | CALL(22); 68 | #endif 69 | #if MAX_REG_STORAGE >= 23 70 | CALL(23); 71 | #endif 72 | #if MAX_REG_STORAGE >= 24 73 | CALL(24); 74 | #endif 75 | #if MAX_REG_STORAGE >= 25 76 | CALL(25); 77 | #endif 78 | #if MAX_REG_STORAGE >= 26 79 | CALL(26); 80 | #endif 81 | #if MAX_REG_STORAGE >= 27 82 | CALL(27); 83 | #endif 84 | #if MAX_REG_STORAGE >= 28 85 | CALL(28); 86 | #endif 87 | #if MAX_REG_STORAGE >= 29 88 | CALL(29); 89 | #endif 90 | #if MAX_REG_STORAGE >= 30 91 | CALL(30); 92 | #endif 93 | -------------------------------------------------------------------------------- /src/cuttTypes.h: -------------------------------------------------------------------------------- 1 | 
/****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #ifndef CUTTTYPES_H 26 | #define CUTTTYPES_H 27 | 28 | #define MAX_REG_STORAGE 8 29 | 30 | struct TensorConv { 31 | int c; 32 | int d; 33 | int ct; 34 | }; 35 | 36 | struct TensorConvInOut { 37 | int c_in; 38 | int d_in; 39 | int ct_in; 40 | int c_out; 41 | int d_out; 42 | int ct_out; 43 | 44 | }; 45 | 46 | #endif // CUTTTYPES_H 47 | -------------------------------------------------------------------------------- /src/cuttkernel.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | #ifndef CUTTKERNEL_H 26 | #define CUTTKERNEL_H 27 | #include "cuttplan.h" 28 | 29 | void cuttKernelSetSharedMemConfig(); 30 | 31 | int cuttKernelLaunchConfiguration(const int sizeofType, const TensorSplit& ts, 32 | const int deviceID, const cudaDeviceProp& prop, LaunchConfig& lc); 33 | 34 | bool cuttKernel(cuttPlan_t& plan, void* dataIn, void* dataOut); 35 | 36 | #endif // CUTTKERNEL_H 37 | -------------------------------------------------------------------------------- /src/CudaMemcpy.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #ifndef CUDAMEMCPY_H 26 | #define CUDAMEMCPY_H 27 | 28 | #include 29 | 30 | template void scalarCopy(const int n, const T* data_in, T* data_out, cudaStream_t stream); 31 | template void vectorCopy(const int n, T* data_in, T* data_out, cudaStream_t stream); 32 | void memcpyFloat(const int n, float* data_in, float* data_out, cudaStream_t stream); 33 | 34 | #endif // CUDAMEMCPY_H 35 | -------------------------------------------------------------------------------- /src/cuttGpuModelKernel.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #ifndef CUTTGPUMODELKERNEL_H 26 | #define CUTTGPUMODELKERNEL_H 27 | #include "cuttplan.h" 28 | 29 | void runCounters(const int warpSize, const int* hostPosData, const int numPosData, 30 | const int accWidth, const int cacheWidth, int* host_tran, int* host_cl_full, int* host_cl_part); 31 | 32 | bool cuttGpuModelKernel(cuttPlan_t& plan, const int accWidth, const int cacheWidth, 33 | int& gld_tran, int& gst_tran, int& gld_req, int& gst_req, 34 | int& cl_full_l2, int& cl_part_l2, int& cl_full_l1, int& cl_part_l1); 35 | 36 | #endif // CUTTGPUMODELKERNEL_H 37 | -------------------------------------------------------------------------------- /src/TensorTester.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | #ifndef TENSORTESTER_H 26 | #define TENSORTESTER_H 27 | #include "cuttTypes.h" 28 | 29 | // 30 | // Simple tensor transpose tester class 31 | // 32 | 33 | struct TensorError_t { 34 | int refVal; 35 | int dataVal; 36 | unsigned int pos; 37 | }; 38 | 39 | class TensorTester { 40 | private: 41 | static int calcTensorConv(const int rank, const int* dim, const int* permutation, TensorConv* tensorConv); 42 | 43 | const int maxRank; 44 | const int maxNumblock; 45 | 46 | public: 47 | TensorConv* h_tensorConv; 48 | TensorConv* d_tensorConv; 49 | TensorError_t* h_error; 50 | TensorError_t* d_error; 51 | int* d_fail; 52 | 53 | TensorTester(); 54 | ~TensorTester(); 55 | 56 | void setTensorCheckPattern(unsigned int* data, unsigned int ndata); 57 | 58 | template bool checkTranspose(int rank, int* dim, int* permutation, T* data); 59 | 60 | }; 61 | 62 | #endif // TENSORTESTER_H 63 | -------------------------------------------------------------------------------- /src/LRUCache.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 NVIDIA 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | 26 | #include 27 | #include 28 | #include 29 | 30 | using namespace std; 31 | 32 | // 33 | // Simple LRU cache implementation 34 | // 35 | template 36 | class LRUCache { 37 | private: 38 | 39 | struct ValueIterator { 40 | value_type value; 41 | typename list::iterator it; 42 | }; 43 | 44 | // Size of the cache 45 | const size_t capacity; 46 | 47 | // Value that is returned when the key is not found 48 | const value_type null_value; 49 | 50 | // Double linked list of keys. 
Oldest is at the back 51 | list keys; 52 | 53 | // Cache: (hash table) 54 | // key = key 55 | // value = {value, pointer to linked list} 56 | unordered_map cache; 57 | 58 | public: 59 | 60 | LRUCache(const size_t capacity, const value_type null_value) : capacity(capacity), null_value(null_value) {} 61 | 62 | value_type get(key_type key) { 63 | auto it = cache.find(key); 64 | if (it == cache.end()) return null_value; 65 | touch(it); 66 | return it->second.value; 67 | } 68 | 69 | void set(key_type key, value_type value) { 70 | auto it = cache.find(key); 71 | if (it != cache.end()) { 72 | // key found 73 | it->second.value = value; 74 | touch(it); 75 | } else { 76 | // key not found 77 | if (cache.size() == capacity) { 78 | key_type oldest_key = keys.back(); 79 | keys.pop_back(); 80 | cache.erase( cache.find(oldest_key) ); 81 | } 82 | keys.push_front(key); 83 | ValueIterator vi; 84 | vi.value = value; 85 | vi.it = keys.begin(); 86 | pair boo(key, vi); 87 | cache.insert(boo); 88 | } 89 | } 90 | 91 | private: 92 | 93 | void touch(typename unordered_map::iterator it) { 94 | keys.erase(it->second.it); 95 | keys.push_front(it->first); 96 | it->second.it = keys.begin(); 97 | } 98 | }; 99 | -------------------------------------------------------------------------------- /include/cutt.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | #ifndef CUTT_H 26 | #define CUTT_H 27 | 28 | #include 29 | 30 | // Handle type that is used to store and access cutt plans 31 | typedef unsigned int cuttHandle; 32 | 33 | // Return value 34 | typedef enum cuttResult_t { 35 | CUTT_SUCCESS, // Success 36 | CUTT_INVALID_PLAN, // Invalid plan handle 37 | CUTT_INVALID_PARAMETER, // Invalid input parameter 38 | CUTT_INVALID_DEVICE, // Execution tried on device different than where plan was created 39 | CUTT_INTERNAL_ERROR, // Internal error 40 | CUTT_UNDEFINED_ERROR, // Undefined error 41 | } cuttResult; 42 | 43 | // 44 | // Create plan 45 | // 46 | // Parameters 47 | // handle = Returned handle to cuTT plan 48 | // rank = Rank of the tensor 49 | // dim[rank] = Dimensions of the tensor 50 | // permutation[rank] = Transpose permutation 51 | // sizeofType = Size of the elements of the tensor in bytes (=4 or 8) 52 | // stream = CUDA stream (0 if no stream is used) 53 | // 54 | // Returns 55 | // Success/unsuccess code 56 | // 57 | cuttResult cuttPlan(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 58 | cudaStream_t stream); 59 | 60 | // 61 | // Create plan and choose implementation by measuring performance 62 | // 63 | // Parameters 64 | // handle = Returned handle to cuTT plan 65 | // rank = Rank of the tensor 66 | // dim[rank] = Dimensions of the tensor 67 | // permutation[rank] = Transpose permutation 68 | // sizeofType = Size of the elements of the tensor in bytes (=4 or 8) 69 | // stream = CUDA stream (0 if no stream is used) 70 | // idata = Input data size product(dim) 71 | // odata = Output data size product(dim) 72 | // 73 | // Returns 74 | // Success/unsuccess code 75 | // 76 | cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 77 | cudaStream_t stream, void* idata, void* odata); 78 | 79 | // 80 | // Destroy plan 81 | // 82 | // Parameters 83 | // handle = Handle to the cuTT plan 84 | // 85 | // Returns 86 | // Success/unsuccess code 87 | // 88 | cuttResult cuttDestroy(cuttHandle handle); 89 | 90 | // 91 | // Execute plan out-of-place 92 | // 93 | // Parameters 94 | // handle = Returned handle to cuTT plan 95 | // idata = Input data size product(dim) 96 | // odata = Output data size product(dim) 97 | // 98 | // Returns 99 | // Success/unsuccess code 100 | // 101 | cuttResult cuttExecute(cuttHandle handle, void* idata, void* odata); 102 | 103 | #endif // CUTT_H 104 | -------------------------------------------------------------------------------- /src/cutt.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #ifndef CUTT_H 26 | #define CUTT_H 27 | 28 | #include 29 | 30 | // Handle type that is used to store and access cutt plans 31 | typedef unsigned int cuttHandle; 32 | 33 | // Return value 34 | typedef enum cuttResult_t { 35 | CUTT_SUCCESS, // Success 36 | CUTT_INVALID_PLAN, // Invalid plan handle 37 | CUTT_INVALID_PARAMETER, // Invalid input parameter 38 | CUTT_INVALID_DEVICE, // Execution tried on device different than where plan was created 39 | CUTT_INTERNAL_ERROR, // Internal error 40 | CUTT_UNDEFINED_ERROR, // Undefined error 41 | } cuttResult; 42 | 43 | // 44 | // Create plan 45 | // 46 | // Parameters 47 | // handle = Returned handle to cuTT plan 48 | // rank = Rank of the tensor 49 | // dim[rank] = Dimensions of the tensor 50 | // permutation[rank] = Transpose permutation 51 | // sizeofType = Size of the elements of the tensor in bytes (=4 or 8) 52 | // stream = CUDA stream (0 if no stream is used) 53 | // 54 | // Returns 55 | // Success/unsuccess code 56 | // 57 | cuttResult cuttPlan(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 58 | cudaStream_t stream); 59 | 60 | // 61 | // Create plan and choose implementation by measuring performance 62 | // 63 | // Parameters 64 | // handle = Returned handle to cuTT plan 65 | // rank = Rank of the tensor 66 | // dim[rank] = Dimensions of the tensor 67 | // permutation[rank] = Transpose permutation 68 | // sizeofType = Size of the elements of the tensor in bytes (=4 or 8) 69 | // stream = CUDA stream (0 if no stream is used) 70 | // idata = Input data size product(dim) 71 | // odata = Output data size product(dim) 72 | // 73 | // Returns 74 | // Success/unsuccess code 75 | // 76 | cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 77 | cudaStream_t stream, void* idata, void* odata); 78 | 79 | // 80 | // Destroy plan 81 | // 82 | // Parameters 83 | // handle = Handle to the cuTT plan 84 | // 85 | // Returns 86 | // Success/unsuccess code 87 | // 88 | cuttResult cuttDestroy(cuttHandle handle); 89 | 90 | // 91 | // Execute plan out-of-place 92 | // 93 | // Parameters 94 | // handle = Returned handle to cuTT plan 95 | // idata = Input data size product(dim) 96 | // odata = Output data size product(dim) 97 | // 98 | // Returns 99 | // Success/unsuccess code 100 | // 101 | cuttResult cuttExecute(cuttHandle handle, void* idata, void* odata); 102 | 103 | #endif // CUTT_H 104 | -------------------------------------------------------------------------------- /src/cuttTimer.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated 
documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | 26 | #ifndef CUTTTIMER_H 27 | #define CUTTTIMER_H 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | // ------------------------------------------------- 35 | // By default uses CUDA event timer. Comment out 36 | // this line if you want to use the wallclock 37 | #define CUDA_EVENT_TIMER 38 | // ------------------------------------------------- 39 | #ifdef CUDA_EVENT_TIMER 40 | #include 41 | #endif 42 | 43 | // 44 | // Simple raw timer 45 | // 46 | class Timer { 47 | private: 48 | #ifdef CUDA_EVENT_TIMER 49 | cudaEvent_t tmstart, tmend; 50 | #else 51 | std::chrono::high_resolution_clock::time_point tmstart, tmend; 52 | #endif 53 | public: 54 | #ifdef CUDA_EVENT_TIMER 55 | Timer(); 56 | ~Timer(); 57 | #endif 58 | void start(); 59 | void stop(); 60 | double seconds(); 61 | }; 62 | 63 | // 64 | // Records timings for cuTT and gives out bandwidths and other data 65 | // 66 | class cuttTimer { 67 | private: 68 | // Size of the type we're measuring 69 | const int sizeofType; 70 | 71 | // Dimension and permutation of the current run 72 | std::vector curDim; 73 | std::vector curPermutation; 74 | 75 | // Bytes transposed in the current run 76 | size_t curBytes; 77 | 78 | // Timer for current run 79 | Timer timer; 80 | 81 | struct Stat { 82 | double totBW; 83 | double minBW; 84 | double maxBW; 85 | std::vector BW; 86 | std::vector worstDim; 87 | std::vector worstPermutation; 88 | Stat() { 89 | totBW = 0.0; 90 | minBW = 1.0e20; 91 | maxBW = -1.0; 92 | } 93 | }; 94 | 95 | // List of ranks that have been recorded 96 | std::set ranks; 97 | 98 | // Statistics for every rank 99 | std::unordered_map stats; 100 | 101 | public: 102 | cuttTimer(int sizeofType); 103 | ~cuttTimer(); 104 | void start(std::vector& dim, std::vector& permutation); 105 | void stop(); 106 | double seconds(); 107 | double GBs(); 108 | double GiBs(); 109 | double getBest(int rank); 110 | double getWorst(int rank); 111 | double getWorst(int rank, std::vector& dim, std::vector& permutation); 112 | double getMedian(int rank); 113 | double getAverage(int rank); 114 | std::vector getData(int rank); 115 | 116 | double getWorst(std::vector& dim, std::vector& permutation); 117 | 118 | std::set::const_iterator ranksBegin() { 119 | return ranks.begin(); 120 | } 121 | 122 | std::set::const_iterator ranksEnd() { 123 | return ranks.end(); 124 | } 125 | }; 126 | 127 | #endif // CUTTTIMER_H 128 | 
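The plan/execute/destroy API declared in include/cutt.h (and mirrored in src/cutt.h above) is easiest to see in a short host-side sketch. This is a minimal illustration, not part of the library sources: the rank, dimensions and permutation are made-up values, error handling is reduced to early returns, and only calls documented in the header above are used.

#include <cuda_runtime.h>
#include "cutt.h"

// Minimal out-of-place transpose of a rank-3 float tensor (illustrative sizes).
// d_idata and d_odata are device buffers holding product(dim) = 64*32*48 floats each.
cuttResult exampleTranspose(float* d_idata, float* d_odata, cudaStream_t stream) {
  int dim[3]         = {64, 32, 48};   // tensor dimensions (illustrative)
  int permutation[3] = {2, 0, 1};      // transpose permutation (illustrative)

  cuttHandle plan;
  cuttResult err = cuttPlan(&plan, 3, dim, permutation, sizeof(float), stream);
  if (err != CUTT_SUCCESS) return err;

  // cuttPlanMeasure() could be used instead when idata/odata are available at
  // planning time and the implementation should be chosen by measurement.
  err = cuttExecute(plan, d_idata, d_odata);

  cuttDestroy(plan);
  return err;
}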
-------------------------------------------------------------------------------- /src/cuttGpuModel.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #ifndef CUTTGPUMODEL_H 26 | #define CUTTGPUMODEL_H 27 | 28 | #include 29 | #include "cuttTypes.h" 30 | #include "cuttplan.h" 31 | #include "int_vector.h" 32 | 33 | void computePos(const int vol0, const int vol1, 34 | const TensorConvInOut* conv, const int numConv, 35 | int* posIn, int* posOut); 36 | 37 | void computePos0(const int vol, 38 | const TensorConvInOut* conv, const int numConv, 39 | int* posIn, int* posOut); 40 | 41 | void computePosRef(int vol0, int vol1, 42 | std::vector::iterator it0, std::vector::iterator it1, 43 | std::vector& posIn, std::vector& posOut); 44 | 45 | void countPackedGlTransactions(const int warpSize, const int accWidth, const int cacheWidth, 46 | const int numthread, const int posMbarIn, const int posMbarOut, const int volMmk, 47 | std::vector& posMmkIn, std::vector& posMmkOut, 48 | int& gld_tran, int& gst_tran, int& gld_req, int& gst_req, 49 | int& cl_full_l2, int& cl_part_l2, int& cl_full_l1, int& cl_part_l1); 50 | 51 | void countPackedGlTransactions0(const int warpSize, const int accWidth, const int cacheWidth, 52 | const int numthread, 53 | const int numPos, const int posMbarIn[INT_VECTOR_LEN], const int posMbarOut[INT_VECTOR_LEN], 54 | const int volMmk, const int* __restrict__ posMmkIn, const int* __restrict__ posMmkOut, 55 | int& gld_tran, int& gst_tran, int& gld_req, int& gst_req, 56 | int& cl_full_l2, int& cl_part_l2, int& cl_full_l1, int& cl_part_l1); 57 | 58 | void countPackedShTransactions(const int warpSize, const int bankWidth, const int numthread, 59 | const int volMmk, const TensorConv* msh, const int numMsh, 60 | int& sld_tran, int& sst_tran, int& sld_req, int& sst_req); 61 | 62 | void countPackedShTransactions0(const int warpSize, const int bankWidth, const int numthread, 63 | const int volMmk, const TensorConv* msh, const int numMsh, 64 | int& sld_tran, int& sst_tran, int& sld_req, int& sst_req); 65 | 66 | void countPackedShTransactionsRef(const int warpSize, const int 
bankWidth, const int numthread, 67 | const int volMmk, const TensorConv* msh, const int numMsh, 68 | int& sld_tran, int& sst_tran, int& sld_req, int& sst_req); 69 | 70 | void countTiledGlTransactions(const bool leadVolSame, 71 | const int numPosMbarSample, const int volMm, const int volMk, const int volMbar, 72 | const int cIn, const int cOut, const int accWidth, const int cacheWidth, 73 | std::vector& hostMbar, const int sizeMbar, 74 | int& num_iter, float& mlp, int& gld_tran, int& gst_tran, int& gld_req, int& gst_req, int& cl_full, int& cl_part); 75 | 76 | double cyclesPacked(const bool isSplit, const size_t sizeofType, cudaDeviceProp& prop, 77 | int nthread, int numActiveBlock, float mlp, 78 | int gld_req, int gst_req, int gld_tran, int gst_tran, 79 | int sld_req, int sst_req, int sld_tran, int sst_tran, int num_iter, int cl_full, int cl_part); 80 | 81 | double cyclesTiled(const bool isCopy, const size_t sizeofType, cudaDeviceProp& prop, 82 | int nthread, int numActiveBlock, float mlp, 83 | int gld_req, int gst_req, int gld_tran, int gst_tran, 84 | int sld_req, int sst_req, int sld_tran, int sst_tran, int num_iter, int cl_full, int cl_part); 85 | 86 | bool testCounters(const int warpSize, const int accWidth, const int cacheWidth); 87 | 88 | #endif // CUTTGPUMODEL_H -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #****************************************************************************** 2 | #MIT License 3 | # 4 | #Copyright (c) 2016 Antti-Pekka Hynninen 5 | #Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | # 7 | #Permission is hereby granted, free of charge, to any person obtaining a copy 8 | #of this software and associated documentation files (the "Software"), to deal 9 | #in the Software without restriction, including without limitation the rights 10 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | #copies of the Software, and to permit persons to whom the Software is 12 | #furnished to do so, subject to the following conditions: 13 | # 14 | #The above copyright notice and this permission notice shall be included in all 15 | #copies or substantial portions of the Software. 16 | # 17 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | #SOFTWARE. 
24 | #******************************************************************************* 25 | 26 | #################### User Settings #################### 27 | 28 | # C++ compiler 29 | CC = g++ -fPIC 30 | 31 | # CUDA compiler 32 | ifeq ($(nvcc_path),) 33 | CUDAC = /usr/local/cuda/bin/nvcc -Xcompiler -fPIC 34 | else 35 | CUDAC = $(nvcc_path) -Xcompiler -fPIC 36 | endif 37 | 38 | # Enable nvvp profiling of CPU code by using "make ENABLE_NVTOOLS=1" 39 | # If aligned_alloc() is not available, use "make NO_ALIGNED_ALLOC=1" 40 | 41 | # SM versions for which code is generated must be sm_30 and above 42 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 43 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 44 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 45 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 46 | GENCODE_SM75 := -gencode arch=compute_75,code=sm_75 47 | GENCODE_FLAGS := $(GENCODE_SM35) $(GENCODE_SM52) $(GENCODE_SM60) $(GENCODE_SM75) 48 | GENCODE_FLAGS := $(NVCC_GENCODE) 49 | 50 | ####################################################### 51 | 52 | # Detect OS 53 | ifeq ($(shell uname -a|grep Linux|wc -l|tr -d ' '), 1) 54 | OS = linux 55 | endif 56 | 57 | ifeq ($(shell uname -a|grep titan|wc -l|tr -d ' '), 1) 58 | OS = linux 59 | endif 60 | 61 | ifeq ($(shell uname -a|grep Darwin|wc -l|tr -d ' '), 1) 62 | OS = osx 63 | endif 64 | 65 | # Detect x86_64 vs. Power 66 | CPU = unknown 67 | 68 | ifeq ($(shell uname -a|grep x86_64|wc -l|tr -d ' '), 1) 69 | CPU = x86_64 70 | endif 71 | 72 | ifeq ($(shell uname -a|grep ppc64|wc -l|tr -d ' '), 1) 73 | CPU = ppc64 74 | endif 75 | 76 | # Set optimization level 77 | OPTLEV = -O3 78 | 79 | # Defines 80 | DEFS = 81 | 82 | ifdef ENABLE_NVTOOLS 83 | DEFS += -DENABLE_NVTOOLS 84 | endif 85 | 86 | ifdef NO_ALIGNED_ALLOC 87 | DEFS += -DNO_ALIGNED_ALLOC 88 | endif 89 | 90 | OBJSLIB = build/cutt.o build/cuttplan.o build/cuttkernel.o build/cuttGpuModel.o build/CudaUtils.o build/cuttTimer.o build/cuttGpuModelKernel.o 91 | OBJSTEST = build/cutt_test.o build/TensorTester.o build/CudaUtils.o build/cuttTimer.o 92 | OBJSBENCH = build/cutt_bench.o build/TensorTester.o build/CudaUtils.o build/cuttTimer.o build/CudaMemcpy.o 93 | OBJS = $(OBJSLIB) $(OBJSTEST) $(OBJSBENCH) 94 | 95 | #CUDAROOT = $(subst /bin/,,$(dir $(shell which nvcc))) 96 | #CUDAROOT = $(subst /bin/,,$(dir $(shell which $(CUDAC)))) 97 | 98 | ifeq ($(nvcc_path),) 99 | CUDAROOT = /usr/local/cuda 100 | else 101 | CUDAROOT = $(subst /bin/nvcc,, $(nvcc_path)) 102 | endif 103 | 104 | CFLAGS = -I${CUDAROOT}/include -std=c++11 $(DEFS) $(OPTLEV) 105 | ifeq ($(CPU),x86_64) 106 | CFLAGS += -march=native 107 | endif 108 | 109 | CUDA_CFLAGS = -I${CUDAROOT}/include -std=c++11 $(OPTLEV) -Xptxas -dlcm=ca -lineinfo $(GENCODE_FLAGS) --resource-usage -Xcompiler "$(CUDA_CCFLAGS)" $(DEFS) -D_FORCE_INLINES 110 | 111 | ifeq ($(OS),osx) 112 | CUDA_LFLAGS = -L$(CUDAROOT)/lib 113 | else 114 | CUDA_LFLAGS = -L$(CUDAROOT)/lib64 115 | endif 116 | 117 | CUDA_LFLAGS += -Llib -lcudart -lcutt 118 | ifdef ENABLE_NVTOOLS 119 | CUDA_LFLAGS += -lnvToolsExt 120 | endif 121 | 122 | all: create_build lib/libcutt.so bin/cutt_test bin/cutt_bench 123 | 124 | create_build: 125 | mkdir -p build 126 | 127 | lib/libcutt.so: $(OBJSLIB) 128 | mkdir -p lib 129 | rm -f lib/libcutt.so 130 | g++ -fPIC --share -o lib/libcutt.so $(OBJSLIB) 131 | mkdir -p include 132 | cp -f src/cutt.h include/cutt.h 133 | 134 | bin/cutt_test : lib/libcutt.so $(OBJSTEST) 135 | mkdir -p bin 136 | $(CC) -o bin/cutt_test $(OBJSTEST) $(CUDA_LFLAGS) 137 | 138 | 
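# Example invocations of the user settings above (illustrative paths and SM
# versions, not part of the original Makefile; adjust to the local CUDA setup):
#   make nvcc_path=/usr/local/cuda/bin/nvcc \
#        NVCC_GENCODE="-gencode arch=compute_70,code=sm_70"
#   make ENABLE_NVTOOLS=1        # enable nvtx ranges for nvvp profiling of CPU code
#   make NO_ALIGNED_ALLOC=1      # if aligned_alloc() is not available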
bin/cutt_bench : lib/libcutt.so $(OBJSBENCH) 139 | mkdir -p bin 140 | $(CC) -o bin/cutt_bench $(OBJSBENCH) $(CUDA_LFLAGS) 141 | 142 | clean: 143 | rm -f $(OBJS) 144 | rm -f build/*.d 145 | rm -f *~ 146 | rm -f lib/libcutt.so 147 | rm -f bin/cutt_test 148 | rm -f bin/cutt_bench 149 | 150 | # Pull in dependencies that already exist 151 | -include $(OBJS:.o=.d) 152 | 153 | build/%.o : src/%.cu 154 | $(CUDAC) -c $(CUDA_CFLAGS) -o build/$*.o $< 155 | echo -e 'build/\c' > build/$*.d 156 | $(CUDAC) -M $(CUDA_CFLAGS) $< >> build/$*.d 157 | 158 | build/%.o : src/%.cpp 159 | $(CC) -c $(CFLAGS) -o build/$*.o $< 160 | echo -e 'build/\c' > build/$*.d 161 | $(CC) -M $(CFLAGS) $< >> build/$*.d 162 | -------------------------------------------------------------------------------- /src/CudaMemcpy.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | #include 26 | #include "CudaUtils.h" 27 | #include "CudaMemcpy.h" 28 | 29 | const int numthread = 64; 30 | 31 | // ----------------------------------------------------------------------------------- 32 | // 33 | // Copy using scalar loads and stores 34 | // 35 | template 36 | __global__ void scalarCopyKernel(const int n, const T* data_in, T* data_out) { 37 | 38 | for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < n;i += blockDim.x*gridDim.x) { 39 | data_out[i] = data_in[i]; 40 | } 41 | 42 | } 43 | template 44 | void scalarCopy(const int n, const T* data_in, T* data_out, cudaStream_t stream) { 45 | 46 | int numblock = (n - 1)/numthread + 1; 47 | // numblock = min(65535, numblock); 48 | // numblock = min(256, numblock); 49 | 50 | scalarCopyKernel <<< numblock, numthread, 0, stream >>> 51 | (n, data_in, data_out); 52 | 53 | cudaCheck(cudaGetLastError()); 54 | } 55 | // ----------------------------------------------------------------------------------- 56 | 57 | // ----------------------------------------------------------------------------------- 58 | // 59 | // Copy using vectorized loads and stores 60 | // 61 | template 62 | __global__ void vectorCopyKernel(const int n, T* data_in, T* data_out) { 63 | 64 | // Maximum vector load is 128 bits = 16 bytes 65 | const int vectorLength = 16/sizeof(T); 66 | 67 | int idx = threadIdx.x + blockIdx.x*blockDim.x; 68 | 69 | // Vector elements 70 | for (int i = idx;i < n/vectorLength;i += blockDim.x*gridDim.x) { 71 | reinterpret_cast(data_out)[i] = reinterpret_cast(data_in)[i]; 72 | } 73 | 74 | // Remaining elements 75 | for (int i = idx + (n/vectorLength)*vectorLength;i < n;i += blockDim.x*gridDim.x + threadIdx.x) { 76 | data_out[i] = data_in[i]; 77 | } 78 | 79 | } 80 | 81 | template 82 | void vectorCopy(const int n, T* data_in, T* data_out, cudaStream_t stream) { 83 | 84 | const int vectorLength = 16/sizeof(T); 85 | 86 | int numblock = (n/vectorLength - 1)/numthread + 1; 87 | // numblock = min(65535, numblock); 88 | int shmemsize = 0; 89 | 90 | vectorCopyKernel <<< numblock, numthread, shmemsize, stream >>> 91 | (n, data_in, data_out); 92 | 93 | cudaCheck(cudaGetLastError()); 94 | } 95 | // ----------------------------------------------------------------------------------- 96 | 97 | // ----------------------------------------------------------------------------------- 98 | // 99 | // Copy using vectorized loads and stores 100 | // 101 | template 102 | __global__ void memcpyFloatKernel(const int n, float4 *data_in, float4* data_out) { 103 | int index = threadIdx.x + numElem*blockIdx.x*blockDim.x; 104 | float4 a[numElem]; 105 | #pragma unroll 106 | for (int i=0;i < numElem;i++) { 107 | if (index + i*blockDim.x < n) a[i] = data_in[index + i*blockDim.x]; 108 | } 109 | #pragma unroll 110 | for (int i=0;i < numElem;i++) { 111 | if (index + i*blockDim.x < n) data_out[index + i*blockDim.x] = a[i]; 112 | } 113 | } 114 | 115 | template 116 | __global__ void memcpyFloatLoopKernel(const int n, float4 *data_in, float4* data_out) { 117 | for (int index=threadIdx.x + blockIdx.x*numElem*blockDim.x;index < n;index += numElem*gridDim.x*blockDim.x) 118 | { 119 | float4 a[numElem]; 120 | #pragma unroll 121 | for (int i=0;i < numElem;i++) { 122 | if (index + i*blockDim.x < n) a[i] = data_in[index + i*blockDim.x]; 123 | } 124 | #pragma unroll 125 | for (int i=0;i < numElem;i++) { 126 | if (index + i*blockDim.x < n) data_out[index + i*blockDim.x] = a[i]; 127 | } 128 | } 129 | } 130 | 
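// Worked example of the launch arithmetic used by memcpyFloat() below
// (illustrative numbers, assuming numthread = 64 as set above and NUM_ELEM = 2
//  as defined just below; this note is not part of the original source):
//   n = 1,000,000 floats are copied as n/4 = 250,000 float4 elements.
//   Each thread moves NUM_ELEM = 2 float4's, so
//   numblock = (n/(4*NUM_ELEM) - 1)/numthread + 1 = (125000 - 1)/64 + 1 = 1954.
//   The "index + i*blockDim.x < n" guards in the kernels above handle the
//   partial final block.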
131 | #define NUM_ELEM 2 132 | void memcpyFloat(const int n, float* data_in, float* data_out, cudaStream_t stream) { 133 | 134 | int numblock = (n/(4*NUM_ELEM) - 1)/numthread + 1; 135 | int shmemsize = 0; 136 | memcpyFloatKernel <<< numblock, numthread, shmemsize, stream >>> 137 | (n/4, (float4 *)data_in, (float4 *)data_out); 138 | 139 | // int numblock = 64; 140 | // int shmemsize = 0; 141 | // memcpyFloatLoopKernel <<< numblock, numthread, shmemsize, stream >>> 142 | // (n/4, (float4 *)data_in, (float4 *)data_out); 143 | 144 | cudaCheck(cudaGetLastError()); 145 | } 146 | // ----------------------------------------------------------------------------------- 147 | 148 | // Explicit instances 149 | template void scalarCopy(const int n, const int* data_in, int* data_out, cudaStream_t stream); 150 | template void scalarCopy(const int n, const long long int* data_in, long long int* data_out, cudaStream_t stream); 151 | template void vectorCopy(const int n, int* data_in, int* data_out, cudaStream_t stream); 152 | template void vectorCopy(const int n, long long int* data_in, long long int* data_out, cudaStream_t stream); 153 | void memcpyFloat(const int n, float* data_in, float* data_out, cudaStream_t stream); 154 | -------------------------------------------------------------------------------- /src/CudaUtils.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | 26 | #include 27 | #ifdef ENABLE_NVTOOLS 28 | #include 29 | #endif 30 | #include "CudaUtils.h" 31 | 32 | //---------------------------------------------------------------------------------------- 33 | 34 | void set_device_array_async_T(void *data, int value, const size_t ndata, cudaStream_t stream, const size_t sizeofT) { 35 | cudaCheck(cudaMemsetAsync(data, value, sizeofT*ndata, stream)); 36 | } 37 | 38 | void set_device_array_T(void *data, int value, const size_t ndata, const size_t sizeofT) { 39 | cudaCheck(cudaMemset(data, value, sizeofT*ndata)); 40 | } 41 | 42 | //---------------------------------------------------------------------------------------- 43 | // 44 | // Jittor malloc & free 45 | // 46 | void cutt_malloc(void** p, size_t len, size_t& allocation) { 47 | cudaCheck(cudaMalloc(p, len)); 48 | } 49 | 50 | void cutt_free(void* p, size_t len, size_t& allocation) { 51 | cudaCheck(cudaFree(p)); 52 | } 53 | 54 | void (*custom_cuda_malloc)(void** p, size_t len, size_t& allocation) = NULL; 55 | 56 | void (*custom_cuda_free)(void* p, size_t len, size_t& allocation) = NULL; 57 | 58 | //---------------------------------------------------------------------------------------- 59 | // 60 | // Allocate gpu memory 61 | // pp = memory pointer 62 | // len = length of the array 63 | // 64 | void allocate_device_T(void **pp, const size_t len, const size_t sizeofT) { 65 | cudaCheck(cudaMalloc(pp, sizeofT*len)); 66 | } 67 | 68 | //---------------------------------------------------------------------------------------- 69 | // 70 | // Deallocate gpu memory 71 | // pp = memory pointer 72 | // 73 | void deallocate_device_T(void **pp) { 74 | if (*pp != NULL) { 75 | cudaCheck(cudaFree((void *)(*pp))); 76 | *pp = NULL; 77 | } 78 | 79 | } 80 | 81 | //---------------------------------------------------------------------------------------- 82 | // 83 | // Jittor allocate gpu memory 84 | // pp = memory pointer 85 | // len = length of the array 86 | // 87 | void jit_allocate_device_T(void **pp, const size_t len, const size_t sizeofT, size_t& allocation) { 88 | if (custom_cuda_malloc==NULL){ 89 | cutt_malloc(pp, sizeofT*len, allocation); 90 | }else custom_cuda_malloc(pp, sizeofT*len, allocation); 91 | } 92 | 93 | //---------------------------------------------------------------------------------------- 94 | // 95 | // Jittor deallocate gpu memory 96 | // pp = memory pointer 97 | // 98 | void jit_deallocate_device_T(void **pp, const size_t len, const size_t sizeofT, size_t& allocation) { 99 | if (*pp != NULL) { 100 | if (custom_cuda_free==NULL){ 101 | cutt_free((void *)(*pp), sizeofT*len, allocation); 102 | }else custom_cuda_free((void *)(*pp), sizeofT*len, allocation); 103 | *pp = NULL; 104 | } 105 | 106 | } 107 | 108 | //---------------------------------------------------------------------------------------- 109 | // 110 | // Copies memory Host -> Device 111 | // 112 | void copy_HtoD_async_T(const void *h_array, void *d_array, size_t array_len, cudaStream_t stream, 113 | const size_t sizeofT) { 114 | cudaCheck(cudaMemcpyAsync(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice, stream)); 115 | } 116 | 117 | void copy_HtoD_T(const void *h_array, void *d_array, size_t array_len, 118 | const size_t sizeofT) { 119 | cudaCheck(cudaMemcpy(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice)); 120 | } 121 | 122 | //---------------------------------------------------------------------------------------- 123 | // 
124 | // Copies memory Device -> Host 125 | // 126 | void copy_DtoH_async_T(const void *d_array, void *h_array, const size_t array_len, cudaStream_t stream, 127 | const size_t sizeofT) { 128 | cudaCheck(cudaMemcpyAsync(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost, stream)); 129 | } 130 | 131 | void copy_DtoH_T(const void *d_array, void *h_array, const size_t array_len, const size_t sizeofT) { 132 | cudaCheck(cudaMemcpy(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost)); 133 | } 134 | 135 | //---------------------------------------------------------------------------------------- 136 | #ifdef ENABLE_NVTOOLS 137 | void gpuRangeStart(const char *range_name) { 138 | static int color_id=0; 139 | nvtxEventAttributes_t att; 140 | att.version = NVTX_VERSION; 141 | att.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; 142 | att.colorType = NVTX_COLOR_ARGB; 143 | if (color_id == 0) { 144 | att.color = 0xFFFF0000; 145 | } else if (color_id == 1) { 146 | att.color = 0xFF00FF00; 147 | } else if (color_id == 2) { 148 | att.color = 0xFF0000FF; 149 | } else if (color_id == 3) { 150 | att.color = 0xFFFF00FF; 151 | } 152 | color_id++; 153 | if (color_id > 3) color_id = 0; 154 | att.messageType = NVTX_MESSAGE_TYPE_ASCII; 155 | att.message.ascii = range_name; 156 | nvtxRangePushEx(&att); 157 | } 158 | 159 | void gpuRangeStop() { 160 | nvtxRangePop(); 161 | } 162 | #endif 163 | -------------------------------------------------------------------------------- /src/CudaUtils.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | #ifndef CUDAUTILS_H 26 | #define CUDAUTILS_H 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #define S(x) #x 35 | #define S_(x) S(x) 36 | #define S__LINE__ S_(__LINE__) 37 | #define FULL_MASK 0xffffffff 38 | 39 | 40 | // 41 | // Error checking wrapper for CUDA 42 | // 43 | #define cudaCheck(stmt) do { \ 44 | cudaError_t err = stmt; \ 45 | if (err != cudaSuccess) { \ 46 | std::string msg = #stmt; \ 47 | msg += " in file "; \ 48 | msg += __FILE__; \ 49 | msg += ":"; \ 50 | msg += S__LINE__; \ 51 | msg += ", function "; \ 52 | msg += __FUNCTION__; \ 53 | msg += "\nError message: "; \ 54 | msg += cudaGetErrorString(err); \ 55 | throw std::runtime_error(msg); \ 56 | } \ 57 | } while(0) 58 | 59 | void set_device_array_async_T(void *data, int value, const size_t ndata, cudaStream_t stream, const size_t sizeofT); 60 | void set_device_array_T(void *data, int value, const size_t ndata, const size_t sizeofT); 61 | 62 | template 63 | void set_device_array(T *data, int value, const size_t ndata, cudaStream_t stream=0) { 64 | set_device_array_async_T(data, value, ndata, stream, sizeof(T)); 65 | } 66 | 67 | template 68 | void set_device_array_sync(T *data, int value, const size_t ndata) { 69 | set_device_array_T(data, value, ndata, sizeof(T)); 70 | } 71 | 72 | void allocate_device_T(void **pp, const size_t len, const size_t sizeofT); 73 | //---------------------------------------------------------------------------------------- 74 | // 75 | // Allocate gpu memory 76 | // pp = memory pointer 77 | // len = length of the array 78 | // 79 | template 80 | void allocate_device(T **pp, const size_t len) { 81 | allocate_device_T((void **)pp, len, sizeof(T)); 82 | } 83 | 84 | void deallocate_device_T(void **pp); 85 | //---------------------------------------------------------------------------------------- 86 | // 87 | // Deallocate gpu memory 88 | // pp = memory pointer 89 | // 90 | template 91 | void deallocate_device(T **pp) { 92 | deallocate_device_T((void **)pp); 93 | } 94 | //---------------------------------------------------------------------------------------- 95 | 96 | void jit_allocate_device_T(void **pp, const size_t len, const size_t sizeofT, size_t &allocation); 97 | //---------------------------------------------------------------------------------------- 98 | // 99 | // Allocate gpu memory 100 | // pp = memory pointer 101 | // len = length of the array 102 | // 103 | template 104 | void jit_allocate_device(T **pp, const size_t len, size_t& allocation) { 105 | jit_allocate_device_T((void **)pp, len, sizeof(T), allocation); 106 | } 107 | 108 | void jit_deallocate_device_T(void **pp, const size_t len, const size_t sizeofT, size_t& allocation); 109 | //---------------------------------------------------------------------------------------- 110 | // 111 | // Deallocate gpu memory 112 | // pp = memory pointer 113 | // 114 | template 115 | void jit_deallocate_device(T **pp, const size_t len, size_t& allocation) { 116 | jit_deallocate_device_T((void **)pp, len, sizeof(T), allocation); 117 | } 118 | //---------------------------------------------------------------------------------------- 119 | 120 | void copy_HtoD_async_T(const void *h_array, void *d_array, size_t array_len, cudaStream_t stream, 121 | const size_t sizeofT); 122 | void copy_HtoD_T(const void *h_array, void *d_array, size_t array_len, 123 | const size_t sizeofT); 124 | void copy_DtoH_async_T(const void *d_array, 
void *h_array, const size_t array_len, cudaStream_t stream, 125 | const size_t sizeofT); 126 | void copy_DtoH_T(const void *d_array, void *h_array, const size_t array_len, const size_t sizeofT); 127 | 128 | //---------------------------------------------------------------------------------------- 129 | // 130 | // Copies memory Host -> Device 131 | // 132 | template 133 | void copy_HtoD(const T *h_array, T *d_array, size_t array_len, cudaStream_t stream=0) { 134 | copy_HtoD_async_T(h_array, d_array, array_len, stream, sizeof(T)); 135 | } 136 | 137 | //---------------------------------------------------------------------------------------- 138 | // 139 | // Copies memory Host -> Device using synchronous calls 140 | // 141 | template 142 | void copy_HtoD_sync(const T *h_array, T *d_array, size_t array_len) { 143 | copy_HtoD_T(h_array, d_array, array_len, sizeof(T)); 144 | } 145 | 146 | //---------------------------------------------------------------------------------------- 147 | // 148 | // Copies memory Device -> Host 149 | // 150 | template 151 | void copy_DtoH(const T *d_array, T *h_array, const size_t array_len, cudaStream_t stream=0) { 152 | copy_DtoH_async_T(d_array, h_array, array_len, stream, sizeof(T)); 153 | } 154 | //---------------------------------------------------------------------------------------- 155 | // 156 | // Copies memory Device -> Host using synchronous calls 157 | // 158 | template 159 | void copy_DtoH_sync(const T *d_array, T *h_array, const size_t array_len) { 160 | copy_DtoH_T(d_array, h_array, array_len, sizeof(T)); 161 | } 162 | 163 | #ifdef ENABLE_NVTOOLS 164 | void gpuRangeStart(const char *range_name); 165 | void gpuRangeStop(); 166 | #endif 167 | 168 | //---------------------------------------------------------------------------------------- 169 | // 170 | // Jittor malloc & free 171 | // 172 | void cutt_malloc(void** p, size_t len, size_t& allocation); 173 | 174 | void cutt_free(void* p, size_t len, size_t& allocation); 175 | 176 | extern void (*custom_cuda_malloc)(void** p, size_t len, size_t& allocation); 177 | 178 | extern void (*custom_cuda_free)(void* p, size_t len, size_t& allocation); 179 | 180 | #endif // CUDAUTILS_H -------------------------------------------------------------------------------- /src/cuttplan.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #ifndef CUTTPLAN_H 26 | #define CUTTPLAN_H 27 | 28 | #include 29 | #include 30 | #include 31 | #include "cuttTypes.h" 32 | 33 | const int TILEDIM = 32; 34 | const int TILEROWS = 8; 35 | 36 | // Transposing methods 37 | enum {Unknown, Trivial, Packed, PackedSplit, 38 | Tiled, TiledCopy, 39 | NumTransposeMethods}; 40 | 41 | // Tells how tensor is split into Mm and Mk and what method is used 42 | // NOTE: sizeMm and sizeMk fully define the split 43 | class TensorSplit { 44 | public: 45 | // Transposing method 46 | int method; 47 | 48 | // Input volume 49 | int sizeMm; 50 | int volMm; 51 | 52 | // Output volume 53 | int sizeMk; 54 | int volMk; 55 | 56 | // {Input} U {Output} 57 | int sizeMmk; 58 | int volMmk; 59 | 60 | // {Input} CUT {Output} = Mk which is not in Mm 61 | int sizeMkBar; 62 | int volMkBar; 63 | 64 | // Remaining volume 65 | int sizeMbar; 66 | int volMbar; 67 | 68 | // For Packed and PackedSplit methods: 69 | // Amount of contigious volume 70 | int volMmkInCont; 71 | int volMmkOutCont; 72 | 73 | // For PackedSplit method: 74 | // Number of splits 75 | int numSplit; 76 | 77 | // Rank that is split 78 | int splitRank; 79 | int splitDim; 80 | 81 | // volMmk that is left unsplit 82 | int volMmkUnsplit; 83 | 84 | TensorSplit(); 85 | 86 | void print(); 87 | 88 | void update(const int sizeMm_in, const int sizeMk_in, const int rank, 89 | const int* dim, const int* permutation); 90 | 91 | // Number of elements in shared memory space 92 | size_t shmem() const; 93 | 94 | // Number of elements in Mmk that are used effectively 95 | size_t volMmkUsed() const; 96 | 97 | // Bytes the shared memory space that needs to be allocated 98 | // (can be larger than volShmem() due to padding) 99 | size_t shmemAlloc(int sizeofType) const; 100 | 101 | }; 102 | 103 | class LaunchConfig { 104 | public: 105 | // Kernel launch configuration 106 | dim3 numthread; 107 | dim3 numblock; 108 | size_t shmemsize; 109 | 110 | // For the Packed method, number of registers to use for storage 111 | int numRegStorage; 112 | 113 | void print(); 114 | 115 | }; 116 | 117 | // Class that stores the plan data 118 | class cuttPlan_t { 119 | public: 120 | // Device for which this plan was made 121 | int deviceID; 122 | 123 | // CUDA stream associated with the plan 124 | cudaStream_t stream; 125 | 126 | // Kernel launch configuration 127 | LaunchConfig launchConfig; 128 | 129 | // Rank of the tensor 130 | int rank; 131 | 132 | // Size of the tensor elements in bytes 133 | size_t sizeofType; 134 | 135 | TensorSplit tensorSplit; 136 | 137 | // Number of active thread blocks 138 | int numActiveBlock; 139 | 140 | int cuDimMk; 141 | int cuDimMm; 142 | 143 | int2 tiledVol; 144 | 145 | // Number of iterations of the kernel 146 | int num_iter; 147 | // Average memory level parallelism = average unroll count 148 | float mlp; 149 | int gld_req, gst_req, gld_tran, gst_tran; 150 | int cl_full_l2, cl_part_l2; 151 | int cl_full_l1, cl_part_l1; 152 | int sld_req, sst_req, sld_tran, sst_tran; 153 | double cycles; 154 | 155 | //-------------- 156 | // Host buffers 157 | //-------------- 158 | std::vector hostMbar; 159 | std::vector hostMmk; 160 | std::vector hostMsh; 161 | 162 | 
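  // Host-side staging copies of the conversion constants. The matching
  // device buffers below are allocated and filled from these when the
  // plan is activated (see activate()).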
//---------------- 163 | // Device buffers 164 | //---------------- 165 | // sizeMbar 166 | TensorConvInOut* Mbar; 167 | size_t MbarSz; 168 | size_t MbarAllocation; 169 | 170 | // sizeMmk 171 | TensorConvInOut* Mmk; 172 | size_t MmkSz; 173 | size_t MmkAllocation; 174 | 175 | // sizeMmk 176 | TensorConv* Msh; 177 | size_t MshSz; 178 | size_t MshAllocation; 179 | 180 | // For TiledSingleInRank 181 | TensorConv* Mk; 182 | size_t MkSz; 183 | size_t MkAllocation; 184 | 185 | // For TiledSingleOutRank 186 | TensorConv* Mm; 187 | size_t MmSz; 188 | size_t MmAllocation; 189 | 190 | cuttPlan_t(); 191 | ~cuttPlan_t(); 192 | void print(); 193 | void setStream(cudaStream_t stream_in); 194 | bool countCycles(cudaDeviceProp& prop, const int numPosMbarSample=0); 195 | void activate(); 196 | void nullDevicePointers(); 197 | 198 | static bool createPlans(const int rank, const int* dim, const int* permutation, 199 | const int redRank, const int* redDim, const int* redPermutation, 200 | const size_t sizeofType, const int deviceID, const cudaDeviceProp& prop, std::list& plans); 201 | 202 | private: 203 | static bool createTrivialPlans(const int rank, const int* dim, const int* permutation, 204 | const size_t sizeofType, const int deviceID, const cudaDeviceProp& prop, std::list& plans); 205 | 206 | static bool createTiledPlans(const int rank, const int* dim, const int* permutation, 207 | const size_t sizeofType, const int deviceID, const cudaDeviceProp& prop, std::list& plans); 208 | 209 | static bool createTiledCopyPlans(const int rank, const int* dim, const int* permutation, 210 | const size_t sizeofType, const int deviceID, const cudaDeviceProp& prop, std::list& plans); 211 | 212 | static bool createPackedPlans(const int rank, const int* dim, const int* permutation, 213 | const size_t sizeofType, const int deviceID, const cudaDeviceProp& prop, std::list& plans); 214 | 215 | static bool createPackedSplitPlans(const int rank, const int* dim, const int* permutation, 216 | const size_t sizeofType, const int deviceID, const cudaDeviceProp& prop, std::list& plans); 217 | 218 | bool setup(const int rank_in, const int* dim, const int* permutation, 219 | const size_t sizeofType_in, const TensorSplit& tensorSplit_in, 220 | const LaunchConfig& launchConfig_in, const int numActiveBlock_in); 221 | 222 | }; 223 | 224 | void printMatlab(cudaDeviceProp& prop, std::list& plans, std::vector& times); 225 | 226 | void reduceRanks(const int rank, const int* dim, const int* permutation, 227 | std::vector& redDim, std::vector& redPermutation); 228 | 229 | std::list::iterator choosePlanHeuristic(std::list& plans); 230 | 231 | #endif // CUTTPLAN_H 232 | -------------------------------------------------------------------------------- /src/cuttTimer.cpp: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright 
notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | 26 | #include "cuttTimer.h" 27 | #include "CudaUtils.h" 28 | // #include // std::numeric_limits 29 | #include 30 | #ifdef CUDA_EVENT_TIMER 31 | #include "CudaUtils.h" 32 | #endif 33 | 34 | #ifdef CUDA_EVENT_TIMER 35 | Timer::Timer() { 36 | cudaCheck(cudaEventCreate(&tmstart)); 37 | cudaCheck(cudaEventCreate(&tmend)); 38 | } 39 | Timer::~Timer() { 40 | cudaCheck(cudaEventDestroy(tmstart)); 41 | cudaCheck(cudaEventDestroy(tmend)); 42 | } 43 | #endif 44 | 45 | void Timer::start() { 46 | #ifdef CUDA_EVENT_TIMER 47 | cudaCheck(cudaEventRecord(tmstart, 0)); 48 | #else 49 | tmstart = std::chrono::high_resolution_clock::now(); 50 | #endif 51 | } 52 | 53 | void Timer::stop() { 54 | #ifdef CUDA_EVENT_TIMER 55 | cudaCheck(cudaEventRecord(tmend, 0)); 56 | cudaCheck(cudaEventSynchronize(tmend)); 57 | #else 58 | cudaCheck(cudaDeviceSynchronize()); 59 | tmend = std::chrono::high_resolution_clock::now(); 60 | #endif 61 | } 62 | 63 | // 64 | // Returns the duration of the last run in seconds 65 | // 66 | double Timer::seconds() { 67 | #ifdef CUDA_EVENT_TIMER 68 | float ms; 69 | cudaCheck(cudaEventElapsedTime(&ms, tmstart, tmend)); 70 | return (double)(ms/1000.0f); 71 | #else 72 | return std::chrono::duration_cast< std::chrono::duration >(tmend - tmstart).count(); 73 | #endif 74 | } 75 | 76 | // 77 | // Class constructor 78 | // 79 | cuttTimer::cuttTimer(int sizeofType) : sizeofType(sizeofType) {} 80 | 81 | // 82 | // Class destructor 83 | // 84 | cuttTimer::~cuttTimer() {} 85 | 86 | // 87 | // Start timer 88 | // 89 | void cuttTimer::start(std::vector& dim, std::vector& permutation) { 90 | curDim = dim; 91 | curPermutation = permutation; 92 | curBytes = sizeofType*2; // "2x" because every element is read and also written out 93 | for (int i=0;i < curDim.size();i++) { 94 | curBytes *= dim[i]; 95 | } 96 | ranks.insert(curDim.size()); 97 | timer.start(); 98 | } 99 | 100 | // 101 | // Stop timer and record statistics 102 | // 103 | void cuttTimer::stop() { 104 | timer.stop(); 105 | double bandwidth = GBs(); 106 | auto it = stats.find(curDim.size()); 107 | if (it == stats.end()) { 108 | Stat new_stat; 109 | std::pair new_elem(curDim.size(), new_stat); 110 | auto retval = stats.insert(new_elem); 111 | it = retval.first; 112 | } 113 | Stat& stat = it->second; 114 | stat.totBW += bandwidth; 115 | if (bandwidth < stat.minBW) { 116 | stat.minBW = bandwidth; 117 | stat.worstDim = curDim; 118 | stat.worstPermutation = curPermutation; 119 | } 120 | stat.maxBW = std::max(stat.maxBW, bandwidth); 121 | stat.BW.push_back(bandwidth); 122 | } 123 | 124 | // 125 | // Returns the duration of the last run in seconds 126 | // 127 | double cuttTimer::seconds() { 128 | return timer.seconds(); 129 | } 130 | 131 | // 132 | // Returns the bandwidth of the last run in GB/s 133 | // 134 | double cuttTimer::GBs() { 135 | const double BILLION = 
1000000000.0; 136 | double sec = seconds(); 137 | return (sec == 0.0) ? 0.0 : (double)(curBytes)/(BILLION*sec); 138 | } 139 | 140 | // 141 | // Returns the bandwidth of the last run in GiB/s 142 | // 143 | double cuttTimer::GiBs() { 144 | const double iBILLION = 1073741824.0; 145 | double sec = seconds(); 146 | return (sec == 0.0) ? 0.0 : (double)(curBytes)/(iBILLION*sec); 147 | } 148 | 149 | // 150 | // Returns the best performing tensor transpose for rank 151 | // 152 | double cuttTimer::getBest(int rank) { 153 | auto it = stats.find(rank); 154 | if (it == stats.end()) return 0.0; 155 | Stat& stat = it->second; 156 | return stat.maxBW; 157 | } 158 | 159 | // 160 | // Returns the worst performing tensor transpose for rank 161 | // 162 | double cuttTimer::getWorst(int rank) { 163 | auto it = stats.find(rank); 164 | if (it == stats.end()) return 0.0; 165 | Stat& stat = it->second; 166 | return stat.minBW; 167 | } 168 | 169 | // 170 | // Returns the worst performing tensor transpose for rank 171 | // 172 | double cuttTimer::getWorst(int rank, std::vector& dim, std::vector& permutation) { 173 | auto it = stats.find(rank); 174 | if (it == stats.end()) return 0.0; 175 | Stat& stat = it->second; 176 | dim = stat.worstDim; 177 | permutation = stat.worstPermutation; 178 | return stat.minBW; 179 | } 180 | 181 | // 182 | // Returns the median bandwidth for rank 183 | // 184 | double cuttTimer::getMedian(int rank) { 185 | auto it = stats.find(rank); 186 | if (it == stats.end()) return 0.0; 187 | Stat& stat = it->second; 188 | if (stat.BW.size() == 0) return 0.0; 189 | // Set middle element in to correct position 190 | std::nth_element(stat.BW.begin(), stat.BW.begin() + stat.BW.size()/2, stat.BW.end()); 191 | double median = stat.BW[stat.BW.size()/2]; 192 | if (stat.BW.size() % 2 == 0) { 193 | // For even number of elements, set middle - 1 element in to correct position 194 | // and take average 195 | std::nth_element(stat.BW.begin(), stat.BW.begin() + stat.BW.size()/2 - 1, stat.BW.end()); 196 | median += stat.BW[stat.BW.size()/2 - 1]; 197 | median *= 0.5; 198 | } 199 | return median; 200 | } 201 | 202 | // 203 | // Returns the average bandwidth for rank 204 | // 205 | double cuttTimer::getAverage(int rank) { 206 | auto it = stats.find(rank); 207 | if (it == stats.end()) return 0.0; 208 | Stat& stat = it->second; 209 | return stat.totBW/(double)stat.BW.size(); 210 | } 211 | 212 | // 213 | // Returns all data for rank 214 | // 215 | std::vector cuttTimer::getData(int rank) { 216 | std::vector res; 217 | auto it = stats.find(rank); 218 | if (it != stats.end()) { 219 | Stat& stat = it->second; 220 | res = stat.BW; 221 | } 222 | return res; 223 | } 224 | 225 | // 226 | // Returns the worst performing tensor transpose of all 227 | // 228 | double cuttTimer::getWorst(std::vector& dim, std::vector& permutation) { 229 | double worstBW = 1.0e20; 230 | int worstRank = 0; 231 | for (auto it=ranks.begin(); it != ranks.end(); it++) { 232 | double bw = stats.find(*it)->second.minBW; 233 | if (worstBW > bw) { 234 | worstRank = *it; 235 | worstBW = bw; 236 | } 237 | } 238 | if (worstRank == 0) { 239 | dim.resize(0); 240 | permutation.resize(0); 241 | return 0.0; 242 | } 243 | dim = stats.find(worstRank)->second.worstDim; 244 | permutation = stats.find(worstRank)->second.worstPermutation; 245 | return worstBW; 246 | } 247 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | cuTT - CUDA Tensor 
Transpose 2 | ============================ 3 | 4 | cuTT is a high-performance tensor transpose library for NVIDIA GPUs. It works with Kepler (SM 3.0) and later GPUs. 5 | 6 | Copyright (c) 2016 Antti-Pekka Hynninen 7 | 8 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 9 | 10 | Version 1.1 11 | 12 | Installation 13 | ============ 14 | 15 | Software requirements: 16 | * C++ compiler with C++11 compatibility 17 | * CUDA compiler 18 | 19 | Hardware requirements: 20 | * Kepler (SM 3.0) or above NVIDIA GPU 21 | 22 | To compile the cuTT library as well as the test cases and benchmarks, simply do 23 | 24 | make 25 | 26 | This will create the library itself: 27 | 28 | * include/cutt.h 29 | * lib/libcutt.a 30 | 31 | as well as the tests and benchmarks 32 | 33 | * bin/cutt_test 34 | * bin/cutt_bench 35 | 36 | In order to use cuTT, you only need the include (include/cutt.h) and the library (lib/libcutt.a) files. 37 | 38 | Running tests and benchmarks 39 | ============================ 40 | 41 | Test and benchmark executables are in the bin/ directory and can be run without any options. 42 | An option to the test executable lets you choose the device ID on which to run: 43 | 44 | cutt_test [options] 45 | Options: 46 | -device gpuid : use GPU with ID gpuid 47 | 48 | For the benchmark executable, there is an additional option that lets you run the benchmarks using 49 | plans that are chosen optimally by measuring the performance of every possible implementation and 50 | choosing the best one. 51 | 52 | cutt_bench [options] 53 | Options: 54 | -device gpuid : use GPU with ID gpuid 55 | -measure : use cuttPlanMeasure (default is cuttPlan) 56 | 57 | Performance 58 | =========== 59 | 60 | cuTT was designed with performance as the main goal. Here are performance benchmarks for a random set of tensors with 200M `double` elements and ranks 2 to 7. The benchmarks were run with the measurement flag on 61 | (cutt_bench -measure) 62 | 63 | ![k20x](https://raw.githubusercontent.com/ap-hynninen/cutt/master/doc/k20x_bench.png) 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | Usage 72 | ===== 73 | 74 | cuTT uses a "plan structure" similar to the FFTW and cuFFT libraries, where the 75 | user first creates a plan for the transpose and then executes that plan. 76 | Here is an example: 77 | 78 | ```c++ 79 | #include <cutt.h> 80 | 81 | // 82 | // Error checking wrapper for cutt 83 | // 84 | #define cuttCheck(stmt) do { \ 85 | cuttResult err = stmt; \ 86 | if (err != CUTT_SUCCESS) { \ 87 | fprintf(stderr, "%s in file %s, function %s\n", #stmt,__FILE__,__FUNCTION__); \ 88 | exit(1); \ 89 | } \ 90 | } while(0) 91 | 92 | int main() { 93 | 94 | // Four dimensional tensor 95 | // Transpose (31, 549, 2, 3) -> (3, 31, 2, 549) 96 | int dim[4] = {31, 549, 2, 3}; 97 | int permutation[4] = {3, 0, 2, 1}; 98 | 99 | // ... input and output data is set up here ... 100 | // double* idata : size product(dim) 101 | // double* odata : size product(dim) 102 | 103 | // Option 1: Create plan on NULL stream and choose implementation based on heuristics 104 | cuttHandle plan; 105 | cuttCheck(cuttPlan(&plan, 4, dim, permutation, sizeof(double), 0)); 106 | 107 | // Option 2: Create plan on NULL stream and choose implementation based on performance measurements 108 | // cuttCheck(cuttPlanMeasure(&plan, 4, dim, permutation, sizeof(double), 0, idata, odata)); 109 | 110 | // Execute plan 111 | cuttCheck(cuttExecute(plan, idata, odata)); 112 | 113 | // ... do stuff with your output and deallocate data ... 
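  // (The same plan can be reused for any number of transposes of tensors with
  //  this shape and permutation; destroy it only when it is no longer needed.)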
114 | 115 | // Destroy plan 116 | cuttCheck(cuttDestroy(plan)); 117 | 118 | return 0; 119 | } 120 | ``` 121 | 122 | Input (idata) and output (odata) data are both in GPU memory and must point to different 123 | memory areas for correct operation. That is, cuTT only currently supports out-of-place 124 | transposes. Note that using Option 2 to create the plan can take up some time especially 125 | for high-rank tensors. 126 | 127 | cuTT API 128 | ======== 129 | 130 | ```c++ 131 | // 132 | // Create plan 133 | // 134 | // Parameters 135 | // handle = Returned handle to cuTT plan 136 | // rank = Rank of the tensor 137 | // dim[rank] = Dimensions of the tensor 138 | // permutation[rank] = Transpose permutation 139 | // sizeofType = Size of the elements of the tensor in bytes (=4 or 8) 140 | // stream = CUDA stream (0 if no stream is used) 141 | // 142 | // Returns 143 | // Success/unsuccess code 144 | // 145 | cuttResult cuttPlan(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 146 | cudaStream_t stream); 147 | 148 | // 149 | // Create plan and choose implementation by measuring performance 150 | // 151 | // Parameters 152 | // handle = Returned handle to cuTT plan 153 | // rank = Rank of the tensor 154 | // dim[rank] = Dimensions of the tensor 155 | // permutation[rank] = Transpose permutation 156 | // sizeofType = Size of the elements of the tensor in bytes (=4 or 8) 157 | // stream = CUDA stream (0 if no stream is used) 158 | // idata = Input data size product(dim) 159 | // odata = Output data size product(dim) 160 | // 161 | // Returns 162 | // Success/unsuccess code 163 | // 164 | cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 165 | cudaStream_t stream, void* idata, void* odata); 166 | 167 | // 168 | // Destroy plan 169 | // 170 | // Parameters 171 | // handle = Handle to the cuTT plan 172 | // 173 | // Returns 174 | // Success/unsuccess code 175 | // 176 | cuttResult cuttDestroy(cuttHandle handle); 177 | 178 | // 179 | // Execute plan out-of-place 180 | // 181 | // Parameters 182 | // handle = Returned handle to cuTT plan 183 | // idata = Input data size product(dim) 184 | // odata = Output data size product(dim) 185 | // 186 | // Returns 187 | // Success/unsuccess code 188 | // 189 | cuttResult cuttExecute(cuttHandle handle, void* idata, void* odata); 190 | ``` 191 | 192 | KNOWN BUGS 193 | ========== 194 | * Benchmarks sometime fail due to the stupid algorithm I have now to create 195 | random tensors with fixed volume. 196 | 197 | TODO 198 | ==== 199 | * Make "tiled" method work with sets of ranks (where ranks in M_m and M_k remain in same order) 200 | 201 | Licence 202 | ======= 203 | 204 | MIT License 205 | 206 | Copyright (c) 2016 Antti-Pekka Hynninen 207 | 208 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 209 | 210 | Permission is hereby granted, free of charge, to any person obtaining a copy 211 | of this software and associated documentation files (the "Software"), to deal 212 | in the Software without restriction, including without limitation the rights 213 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 214 | copies of the Software, and to permit persons to whom the Software is 215 | furnished to do so, subject to the following conditions: 216 | 217 | The above copyright notice and this permission notice shall be included in all 218 | copies or substantial portions of the Software. 
219 | 220 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 221 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 222 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 223 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 224 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 225 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 226 | SOFTWARE. 227 | -------------------------------------------------------------------------------- /src/TensorTester.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | 26 | // 27 | // Testing utilities 28 | // 29 | #include 30 | #include "CudaUtils.h" 31 | #include "TensorTester.h" 32 | 33 | __global__ void setTensorCheckPatternKernel(unsigned int* data, unsigned int ndata) { 34 | for (unsigned int i = threadIdx.x + blockIdx.x*blockDim.x;i < ndata;i += blockDim.x*gridDim.x) { 35 | data[i] = i; 36 | } 37 | } 38 | 39 | template 40 | __global__ void checkTransposeKernel(T* data, unsigned int ndata, int rank, TensorConv* glTensorConv, 41 | TensorError_t* glError, int* glFail) { 42 | 43 | extern __shared__ unsigned int shPos[]; 44 | 45 | const int warpLane = threadIdx.x & (warpSize - 1); 46 | TensorConv tc; 47 | if (warpLane < rank) { 48 | tc = glTensorConv[warpLane]; 49 | } 50 | 51 | TensorError_t error; 52 | error.pos = 0xffffffff; 53 | error.refVal = 0; 54 | error.dataVal = 0; 55 | 56 | for (int base = blockIdx.x*blockDim.x;base < ndata;base += blockDim.x*gridDim.x) { 57 | int i = base + threadIdx.x; 58 | T dataValT = (i < ndata) ? 
data[i] : -1; 59 | int refVal = 0; 60 | for (int j=0;j < rank;j++) { 61 | refVal += ((i/__shfl_sync(FULL_MASK, tc.c,j)) % __shfl_sync(FULL_MASK, tc.d,j))*__shfl_sync(FULL_MASK, tc.ct,j); 62 | } 63 | 64 | int dataVal = (dataValT & 0xffffffff)/(sizeof(T)/4); 65 | 66 | if (i < ndata && refVal != dataVal && i < error.pos) { 67 | error.pos = i; 68 | error.refVal = refVal; 69 | error.dataVal = dataVal; 70 | } 71 | } 72 | 73 | // Set FAIL flag 74 | if (error.pos != 0xffffffff) { 75 | // printf("error %d %d %d\n", error.pos, error.refVal, error.dataVal); 76 | *glFail = 1; 77 | } 78 | 79 | shPos[threadIdx.x] = error.pos; 80 | __syncthreads(); 81 | for (int d=1;d < blockDim.x;d *= 2) { 82 | int t = threadIdx.x + d; 83 | unsigned int posval = (t < blockDim.x) ? shPos[t] : 0xffffffff; 84 | __syncthreads(); 85 | shPos[threadIdx.x] = min(posval, shPos[threadIdx.x]); 86 | __syncthreads(); 87 | } 88 | // Minimum error.pos is in shPos[0] (or 0xffffffff in case of no error) 89 | 90 | if (shPos[0] != 0xffffffff && shPos[0] == error.pos) { 91 | // Error has occured and this thread has the minimum error.pos 92 | // printf("BOO error %d %d %d | %d\n", error.pos, error.refVal, error.dataVal, blockIdx.x); 93 | glError[blockIdx.x] = error; 94 | } 95 | 96 | } 97 | 98 | // ################################################################################ 99 | // ################################################################################ 100 | // ################################################################################ 101 | 102 | // 103 | // Class constructor 104 | // 105 | TensorTester::TensorTester() : maxRank(32), maxNumblock(256) { 106 | h_tensorConv = new TensorConv[maxRank]; 107 | h_error = new TensorError_t[maxNumblock]; 108 | allocate_device(&d_tensorConv, maxRank); 109 | allocate_device(&d_error, maxNumblock); 110 | allocate_device(&d_fail, 1); 111 | } 112 | 113 | // 114 | // Class destructor 115 | // 116 | TensorTester::~TensorTester() { 117 | delete [] h_tensorConv; 118 | delete [] h_error; 119 | deallocate_device(&d_tensorConv); 120 | deallocate_device(&d_error); 121 | deallocate_device(&d_fail); 122 | } 123 | 124 | void TensorTester::setTensorCheckPattern(unsigned int* data, unsigned int ndata) { 125 | int numthread = 512; 126 | int numblock = min(65535, (ndata - 1)/numthread + 1 ); 127 | setTensorCheckPatternKernel<<< numblock, numthread >>>(data, ndata); 128 | cudaCheck(cudaGetLastError()); 129 | } 130 | 131 | // void calcTensorConv(const int rank, const int* dim, const int* permutation, 132 | // TensorConv* tensorConv) { 133 | 134 | // tensorConv[0].c = 1; 135 | // tensorConv[0].d = dim[0]; 136 | // tensorConv[permutation[0]].ct = 1; 137 | // int ct_prev = 1; 138 | // for (int i=1;i < rank;i++) { 139 | // tensorConv[i].c = tensorConv[i-1].c*dim[i-1]; 140 | // tensorConv[i].d = dim[i]; 141 | // int ct = ct_prev*dim[permutation[i-1]]; 142 | // tensorConv[permutation[i]].ct = ct; 143 | // ct_prev = ct; 144 | // } 145 | 146 | // } 147 | 148 | // 149 | // Calculates tensor conversion constants. 
Returns total volume of tensor 150 | // 151 | int TensorTester::calcTensorConv(const int rank, const int* dim, const int* permutation, 152 | TensorConv* tensorConv) { 153 | 154 | int vol = dim[0]; 155 | 156 | tensorConv[permutation[0]].c = 1; 157 | tensorConv[0].ct = 1; 158 | tensorConv[0].d = dim[0]; 159 | for (int i=1;i < rank;i++) { 160 | vol *= dim[i]; 161 | 162 | tensorConv[permutation[i]].c = tensorConv[permutation[i-1]].c*dim[permutation[i-1]]; 163 | 164 | tensorConv[i].d = dim[i]; 165 | tensorConv[i].ct = tensorConv[i-1].ct*dim[i-1]; 166 | 167 | } 168 | 169 | return vol; 170 | } 171 | 172 | template bool TensorTester::checkTranspose(int rank, int* dim, int* permutation, T* data) { 173 | 174 | if (rank > 32) { 175 | return false; 176 | } 177 | 178 | int ndata = calcTensorConv(rank, dim, permutation, h_tensorConv); 179 | copy_HtoD(h_tensorConv, d_tensorConv, rank); 180 | 181 | // printf("tensorConv\n"); 182 | // for (int i=0;i < rank;i++) { 183 | // printf("%d %d %d\n", h_tensorConv[i].c, h_tensorConv[i].d, h_tensorConv[i].ct); 184 | // } 185 | 186 | set_device_array(d_error, 0, maxNumblock); 187 | set_device_array(d_fail, 0, 1); 188 | 189 | int numthread = 512; 190 | int numblock = min(maxNumblock, (ndata - 1)/numthread + 1 ); 191 | int shmemsize = numthread*sizeof(unsigned int); 192 | checkTransposeKernel<<< numblock, numthread, shmemsize >>>(data, ndata, rank, d_tensorConv, d_error, d_fail); 193 | cudaCheck(cudaGetLastError()); 194 | 195 | int h_fail; 196 | copy_DtoH(d_fail, &h_fail, 1); 197 | cudaCheck(cudaDeviceSynchronize()); 198 | 199 | if (h_fail) { 200 | copy_DtoH_sync(d_error, h_error, maxNumblock); 201 | TensorError_t error; 202 | error.pos = 0x0fffffff; 203 | for (int i=0;i < numblock;i++) { 204 | // printf("%d %d %d\n", error.pos, error.refVal, error.dataVal); 205 | if (h_error[i].refVal != h_error[i].dataVal && error.pos > h_error[i].pos) { 206 | error = h_error[i]; 207 | } 208 | } 209 | printf("TensorTester::checkTranspose FAIL at %d ref %d data %d\n", error.pos, error.refVal, error.dataVal); 210 | return false; 211 | } 212 | 213 | return true; 214 | } 215 | 216 | // Explicit instances 217 | template bool TensorTester::checkTranspose(int rank, int* dim, int* permutation, int* data); 218 | template bool TensorTester::checkTranspose(int rank, int* dim, int* permutation, long long int* data); 219 | -------------------------------------------------------------------------------- /src/int_vector.h: -------------------------------------------------------------------------------- 1 | #ifndef INT_VECTOR_H 2 | #define INT_VECTOR_H 3 | 4 | // Intel: Minimum SSE2 required for vectorization. 5 | // SSE can't be used because it does not support integer operations. 
SSE defaults to scalar 6 | 7 | #if defined(__SSE2__) 8 | // Intel x86 9 | #include 10 | 11 | #if defined(__AVX2__) 12 | #define USE_AVX 13 | const int INT_VECTOR_LEN = 8; 14 | // #include 15 | const char INT_VECTOR_TYPE[] = "AVX2"; 16 | #else 17 | #define USE_SSE 18 | const int INT_VECTOR_LEN = 4; 19 | const char INT_VECTOR_TYPE[] = "SSE2"; 20 | #endif 21 | 22 | #elif defined(__ALTIVEC__) // #if defined(__SSE2__) 23 | #define USE_ALTIVEC 24 | // IBM altivec 25 | #include 26 | #undef bool 27 | const int INT_VECTOR_LEN = 4; 28 | const char INT_VECTOR_TYPE[] = "ALTIVEC"; 29 | 30 | #else // #if defined(__SSE2__) 31 | // Nothing 32 | const int INT_VECTOR_LEN = 1; 33 | const char INT_VECTOR_TYPE[] = "SCALAR"; 34 | #endif 35 | 36 | // 37 | // Integer vector class for Intel and IBM CPU platforms 38 | // 39 | class int_vector { 40 | private: 41 | 42 | #if defined(USE_AVX) 43 | __m256i x; 44 | #elif defined(USE_SSE) 45 | __m128i x; 46 | #elif defined(USE_ALTIVEC) 47 | vector signed int x; 48 | #else 49 | int x; 50 | #endif 51 | 52 | public: 53 | 54 | inline int_vector() { 55 | } 56 | 57 | inline int_vector(const int a) { 58 | #if defined(USE_AVX) 59 | x = _mm256_set1_epi32(a); 60 | #elif defined(USE_SSE) 61 | x = _mm_set1_epi32(a); 62 | #elif defined(USE_ALTIVEC) 63 | x = (vector signed int){a, a, a, a}; 64 | #else 65 | x = a; 66 | #endif 67 | } 68 | 69 | inline int_vector(const int a[]) { 70 | #if defined(USE_AVX) 71 | x = _mm256_set_epi32(a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]); 72 | #elif defined(USE_SSE) 73 | x = _mm_set_epi32(a[3], a[2], a[1], a[0]); 74 | #elif defined(USE_ALTIVEC) 75 | x = vec_ld(0, a); 76 | #else 77 | x = a[0]; 78 | #endif 79 | } 80 | 81 | #if defined(USE_AVX) 82 | inline int_vector(const __m256i ax) { 83 | x = ax; 84 | } 85 | #elif defined(USE_SSE) 86 | inline int_vector(const __m128i ax) { 87 | x = ax; 88 | } 89 | #elif defined(USE_ALTIVEC) 90 | inline int_vector(const vector signed int ax) { 91 | x = ax; 92 | } 93 | #endif 94 | 95 | // 96 | // Member functions 97 | // 98 | 99 | inline int_vector operator+=(const int_vector a) { 100 | #if defined(USE_AVX) 101 | x = _mm256_add_epi32(x, a.x); 102 | #elif defined(USE_SSE) 103 | x = _mm_add_epi32(x, a.x); 104 | #elif defined(USE_ALTIVEC) 105 | x += a.x; 106 | #else 107 | x += a.x; 108 | #endif 109 | return *this; 110 | } 111 | 112 | inline int_vector operator-=(const int_vector a) { 113 | #if defined(USE_AVX) 114 | x = _mm256_sub_epi32(x, a.x); 115 | #elif defined(USE_SSE) 116 | x = _mm_sub_epi32(x, a.x); 117 | #elif defined(USE_ALTIVEC) 118 | x -= a.x; 119 | #else 120 | x -= a.x; 121 | #endif 122 | return *this; 123 | } 124 | 125 | inline int_vector operator&=(const int_vector a) { 126 | #if defined(USE_AVX) 127 | x = _mm256_and_si256(x, a.x); 128 | #elif defined(USE_SSE) 129 | x = _mm_and_si128(x, a.x); 130 | #elif defined(USE_ALTIVEC) 131 | x &= a.x; 132 | #else 133 | x &= a.x; 134 | #endif 135 | return *this; 136 | } 137 | 138 | inline int_vector operator|=(const int_vector a) { 139 | #if defined(USE_AVX) 140 | x = _mm256_or_si256(x, a.x); 141 | #elif defined(USE_SSE) 142 | x = _mm_or_si128(x, a.x); 143 | #elif defined(USE_ALTIVEC) 144 | x |= a.x; 145 | #else 146 | x |= a.x; 147 | #endif 148 | return *this; 149 | } 150 | 151 | inline int_vector operator~() { 152 | #if defined(USE_AVX) 153 | int_vector fullmask = int_vector(-1); 154 | return int_vector( _mm256_andnot_si256(x, fullmask.x) ); 155 | #elif defined(USE_SSE) 156 | int_vector fullmask = int_vector(-1); 157 | return int_vector( _mm_andnot_si128(x, 
fullmask.x) ); 158 | #elif defined(USE_ALTIVEC) 159 | return int_vector( ~x ); 160 | #else 161 | return ~x; 162 | #endif 163 | } 164 | 165 | // Sign extended shift by a constant. 166 | // Note: 0 <= n <= 31. Otherwise results are unpredictable 167 | inline int_vector operator>>=(const int n) { 168 | #if defined(USE_AVX) 169 | x = _mm256_srai_epi32(x, n); 170 | #elif defined(USE_SSE) 171 | x = _mm_srai_epi32(x, n); 172 | #elif defined(USE_ALTIVEC) 173 | x >>= n; 174 | #else 175 | x >>= n; 176 | #endif 177 | return *this; 178 | } 179 | 180 | // Sign extended shift by a constant 181 | // Note: 0 <= n <= 31. Otherwise results are unpredictable 182 | inline int_vector operator<<=(const int n) { 183 | #if defined(USE_AVX) 184 | x = _mm256_slli_epi32(x, n); 185 | #elif defined(USE_SSE) 186 | x = _mm_slli_epi32(x, n); 187 | #elif defined(USE_ALTIVEC) 188 | x <<= n; 189 | #else 190 | x <<= n; 191 | #endif 192 | return *this; 193 | } 194 | 195 | // Copy contest to int array 196 | void copy(int* a) const { 197 | #if defined(USE_AVX) 198 | _mm256_storeu_si256((__m256i *)a, x); 199 | #elif defined(USE_SSE) 200 | _mm_storeu_si128((__m128i *)a, x); 201 | #elif defined(USE_ALTIVEC) 202 | // void vec_stl (vector signed int, int, int *); 203 | vec_stl(x, 0, a); 204 | #else 205 | a[0] = x; 206 | #endif 207 | } 208 | 209 | // 210 | // Non-member functions 211 | // 212 | 213 | inline friend int_vector operator+(int_vector a, const int_vector b) { 214 | a += b; 215 | return a; 216 | } 217 | 218 | inline friend int_vector operator-(int_vector a, const int_vector b) { 219 | a -= b; 220 | return a; 221 | } 222 | 223 | inline friend int_vector operator&(int_vector a, const int_vector b) { 224 | a &= b; 225 | return a; 226 | } 227 | 228 | inline friend int_vector operator|(int_vector a, const int_vector b) { 229 | a |= b; 230 | return a; 231 | } 232 | 233 | inline friend int_vector operator>>(int_vector a, const int n) { 234 | a >>= n; 235 | return a; 236 | } 237 | 238 | inline friend int_vector operator<<(int_vector a, const int n) { 239 | a <<= n; 240 | return a; 241 | } 242 | 243 | // Returns 0xffffffff = -1 on the vector elements that are equal 244 | inline friend int_vector eq_mask(const int_vector a, const int_vector b) { 245 | #if defined(USE_AVX) 246 | return int_vector(_mm256_cmpeq_epi32(a.x, b.x)); 247 | #elif defined(USE_SSE) 248 | return int_vector(_mm_cmpeq_epi32(a.x, b.x)); 249 | #elif defined(USE_ALTIVEC) 250 | return int_vector(a.x == b.x); 251 | #else 252 | return int_vector((a.x == b.x)*(-1)); 253 | #endif 254 | } 255 | 256 | inline friend int_vector neq_mask(const int_vector a, const int_vector b) { 257 | return ~eq_mask(a, b); 258 | } 259 | 260 | // 0xffffffff => 1 261 | inline friend int_vector mask_to_bool(const int_vector a) { 262 | #if defined(USE_AVX) 263 | return int_vector(_mm256_srli_epi32(a.x, 31)); 264 | #elif defined(USE_SSE) 265 | return int_vector(_mm_srli_epi32(a.x, 31)); 266 | #elif defined(USE_ALTIVEC) 267 | return int_vector((vector signed int)((vector unsigned int)a.x >> 31)); 268 | #else 269 | return ((unsigned int)a.x >> 31); 270 | #endif 271 | } 272 | 273 | inline friend int_vector operator==(const int_vector a, const int_vector b) { 274 | return mask_to_bool(eq_mask(a, b)); 275 | } 276 | 277 | inline friend int_vector operator!=(const int_vector a, const int_vector b) { 278 | return mask_to_bool(neq_mask(a, b)); 279 | } 280 | 281 | // 1 => 0xffffffff 282 | inline friend int_vector bool_to_mask(const int_vector a) { 283 | #if defined(USE_AVX) 284 | return neq_mask(a, 
int_vector(0)); 285 | #elif defined(USE_SSE) 286 | return neq_mask(a, int_vector(0)); 287 | #elif defined(USE_ALTIVEC) 288 | return neq_mask(a, int_vector(0)); 289 | #else 290 | return (a ? -1 : 0); 291 | #endif 292 | } 293 | 294 | // Implicit type conversion 295 | // Returns true if any of the elements are != 0 296 | operator bool() const { 297 | #if defined(USE_AVX) 298 | int_vector a = neq_mask(*this, int_vector(0)); 299 | return (_mm256_movemask_epi8(a.x) != 0); 300 | #elif defined(USE_SSE) 301 | int_vector a = neq_mask(*this, int_vector(0)); 302 | return (_mm_movemask_epi8(a.x) != 0); 303 | #elif defined(USE_ALTIVEC) 304 | return vec_any_ne(x, ((const vector signed int){0, 0, 0, 0})); 305 | #else 306 | return x; 307 | #endif 308 | } 309 | 310 | // 311 | // Helper functions 312 | // 313 | void print() { 314 | int vec[INT_VECTOR_LEN]; 315 | this->copy(vec); 316 | for (int i=0;i < INT_VECTOR_LEN;i++) { 317 | printf("%d ", vec[i]); 318 | } 319 | } 320 | 321 | }; 322 | 323 | 324 | #if defined(USE_ALTIVEC) 325 | #undef vector 326 | #undef pixel 327 | #endif 328 | 329 | #endif // INT_VECTOR_H 330 | -------------------------------------------------------------------------------- /src/cutt.cpp: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | #include 26 | #include 27 | #include 28 | #include "CudaUtils.h" 29 | #include "cuttplan.h" 30 | #include "cuttkernel.h" 31 | #include "cuttTimer.h" 32 | #include "cutt.h" 33 | // #include 34 | 35 | // Hash table to store the plans 36 | static std::unordered_map< cuttHandle, cuttPlan_t* > planStorage; 37 | 38 | // Current handle 39 | static cuttHandle curHandle = 0; 40 | 41 | // Table of devices that have been initialized 42 | static std::unordered_map deviceProps; 43 | 44 | // Checks prepares device if it's not ready yet and returns device properties 45 | // Also sets shared memory configuration 46 | void getDeviceProp(int& deviceID, cudaDeviceProp &prop) { 47 | cudaCheck(cudaGetDevice(&deviceID)); 48 | auto it = deviceProps.find(deviceID); 49 | if (it == deviceProps.end()) { 50 | // Get device properties and store it for later use 51 | cudaCheck(cudaGetDeviceProperties(&prop, deviceID)); 52 | cuttKernelSetSharedMemConfig(); 53 | deviceProps.insert({deviceID, prop}); 54 | } else { 55 | prop = it->second; 56 | } 57 | } 58 | 59 | cuttResult cuttPlanCheckInput(int rank, int* dim, int* permutation, size_t sizeofType) { 60 | // Check sizeofType 61 | if (sizeofType != 4 && sizeofType != 8) return CUTT_INVALID_PARAMETER; 62 | // Check rank 63 | if (rank <= 1) return CUTT_INVALID_PARAMETER; 64 | // Check dim[] 65 | for (int i=0;i < rank;i++) { 66 | if (dim[i] <= 1) return CUTT_INVALID_PARAMETER; 67 | } 68 | // Check permutation 69 | bool permutation_fail = false; 70 | int* check = new int[rank]; 71 | for (int i=0;i < rank;i++) check[i] = 0; 72 | for (int i=0;i < rank;i++) { 73 | if (permutation[i] < 0 || permutation[i] >= rank || check[permutation[i]]++) { 74 | permutation_fail = true; 75 | break; 76 | } 77 | } 78 | delete [] check; 79 | if (permutation_fail) return CUTT_INVALID_PARAMETER; 80 | 81 | return CUTT_SUCCESS; 82 | } 83 | 84 | cuttResult cuttPlan(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 85 | cudaStream_t stream) { 86 | 87 | #ifdef ENABLE_NVTOOLS 88 | gpuRangeStart("init"); 89 | #endif 90 | 91 | // Check that input parameters are valid 92 | cuttResult inpCheck = cuttPlanCheckInput(rank, dim, permutation, sizeofType); 93 | if (inpCheck != CUTT_SUCCESS) return inpCheck; 94 | 95 | // Create new handle 96 | *handle = curHandle; 97 | curHandle++; 98 | 99 | // Check that the current handle is available (it better be!) 
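  // (Handles are assigned from the monotonically increasing curHandle counter
  //  above, so this lookup should never find an existing entry.)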
100 | if (planStorage.count(*handle) != 0) return CUTT_INTERNAL_ERROR; 101 | 102 | // Prepare device 103 | int deviceID; 104 | cudaDeviceProp prop; 105 | getDeviceProp(deviceID, prop); 106 | 107 | // Reduce ranks 108 | std::vector redDim; 109 | std::vector redPermutation; 110 | reduceRanks(rank, dim, permutation, redDim, redPermutation); 111 | 112 | // Create plans from reduced ranks 113 | std::list plans; 114 | // if (rank != redDim.size()) { 115 | // if (!createPlans(redDim.size(), redDim.data(), redPermutation.data(), sizeofType, prop, plans)) return CUTT_INTERNAL_ERROR; 116 | // } 117 | 118 | // // Create plans from non-reduced ranks 119 | // if (!createPlans(rank, dim, permutation, sizeofType, prop, plans)) return CUTT_INTERNAL_ERROR; 120 | 121 | #if 0 122 | if (!cuttKernelDatabase(deviceID, prop)) return CUTT_INTERNAL_ERROR; 123 | #endif 124 | 125 | #ifdef ENABLE_NVTOOLS 126 | gpuRangeStop(); 127 | gpuRangeStart("createPlans"); 128 | #endif 129 | 130 | // std::chrono::high_resolution_clock::time_point plan_start; 131 | // plan_start = std::chrono::high_resolution_clock::now(); 132 | 133 | if (!cuttPlan_t::createPlans(rank, dim, permutation, redDim.size(), redDim.data(), redPermutation.data(), 134 | sizeofType, deviceID, prop, plans)) return CUTT_INTERNAL_ERROR; 135 | 136 | // std::chrono::high_resolution_clock::time_point plan_end; 137 | // plan_end = std::chrono::high_resolution_clock::now(); 138 | // double plan_duration = std::chrono::duration_cast< std::chrono::duration >(plan_end - plan_start).count(); 139 | // printf("createPlans took %lf ms\n", plan_duration*1000.0); 140 | 141 | #ifdef ENABLE_NVTOOLS 142 | gpuRangeStop(); 143 | gpuRangeStart("countCycles"); 144 | #endif 145 | 146 | // Count cycles 147 | for (auto it=plans.begin();it != plans.end();it++) { 148 | if (!it->countCycles(prop, 10)) return CUTT_INTERNAL_ERROR; 149 | } 150 | 151 | #ifdef ENABLE_NVTOOLS 152 | gpuRangeStop(); 153 | gpuRangeStart("rest"); 154 | #endif 155 | 156 | // Choose the plan 157 | std::list::iterator bestPlan = choosePlanHeuristic(plans); 158 | if (bestPlan == plans.end()) return CUTT_INTERNAL_ERROR; 159 | 160 | // bestPlan->print(); 161 | 162 | // Create copy of the plan outside the list 163 | cuttPlan_t* plan = new cuttPlan_t(); 164 | // NOTE: No deep copy needed here since device memory hasn't been allocated yet 165 | *plan = *bestPlan; 166 | // Set device pointers to NULL in the old copy of the plan so 167 | // that they won't be deallocated later when the object is destroyed 168 | bestPlan->nullDevicePointers(); 169 | 170 | // Set stream 171 | plan->setStream(stream); 172 | 173 | // Activate plan 174 | plan->activate(); 175 | 176 | // Insert plan into storage 177 | planStorage.insert( {*handle, plan} ); 178 | 179 | #ifdef ENABLE_NVTOOLS 180 | gpuRangeStop(); 181 | #endif 182 | 183 | return CUTT_SUCCESS; 184 | } 185 | 186 | cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 187 | cudaStream_t stream, void* idata, void* odata) { 188 | 189 | // Check that input parameters are valid 190 | cuttResult inpCheck = cuttPlanCheckInput(rank, dim, permutation, sizeofType); 191 | if (inpCheck != CUTT_SUCCESS) return inpCheck; 192 | 193 | if (idata == odata) return CUTT_INVALID_PARAMETER; 194 | 195 | // Create new handle 196 | *handle = curHandle; 197 | curHandle++; 198 | 199 | // Check that the current handle is available (it better be!) 
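  // (From here on this mirrors cuttPlan(), except that every candidate plan is
  //  timed below on the user-supplied idata/odata and the fastest one is kept,
  //  instead of relying on the cycle-count heuristic.)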
200 | if (planStorage.count(*handle) != 0) return CUTT_INTERNAL_ERROR; 201 | 202 | // Prepare device 203 | int deviceID; 204 | cudaDeviceProp prop; 205 | getDeviceProp(deviceID, prop); 206 | 207 | // Reduce ranks 208 | std::vector redDim; 209 | std::vector redPermutation; 210 | reduceRanks(rank, dim, permutation, redDim, redPermutation); 211 | 212 | // Create plans from reduced ranks 213 | std::list plans; 214 | #if 0 215 | // if (rank != redDim.size()) { 216 | if (!createPlans(redDim.size(), redDim.data(), redPermutation.data(), sizeofType, prop, plans)) return CUTT_INTERNAL_ERROR; 217 | // } 218 | 219 | // Create plans from non-reduced ranks 220 | // if (!createPlans(rank, dim, permutation, sizeofType, prop, plans)) return CUTT_INTERNAL_ERROR; 221 | #else 222 | if (!cuttPlan_t::createPlans(rank, dim, permutation, redDim.size(), redDim.data(), redPermutation.data(), 223 | sizeofType, deviceID, prop, plans)) return CUTT_INTERNAL_ERROR; 224 | #endif 225 | 226 | // // Count cycles 227 | // for (auto it=plans.begin();it != plans.end();it++) { 228 | // if (!it->countCycles(prop, 10)) return CUTT_INTERNAL_ERROR; 229 | // } 230 | 231 | // // Count the number of elements 232 | size_t numBytes = sizeofType; 233 | for (int i=0;i < rank;i++) numBytes *= dim[i]; 234 | 235 | // Choose the plan 236 | double bestTime = 1.0e40; 237 | auto bestPlan = plans.end(); 238 | Timer timer; 239 | std::vector times; 240 | for (auto it=plans.begin();it != plans.end();it++) { 241 | // Activate plan 242 | it->activate(); 243 | // Clear output data to invalidate caches 244 | set_device_array((char *)odata, -1, numBytes); 245 | cudaCheck(cudaDeviceSynchronize()); 246 | timer.start(); 247 | // Execute plan 248 | if (!cuttKernel(*it, idata, odata)) return CUTT_INTERNAL_ERROR; 249 | timer.stop(); 250 | double curTime = timer.seconds(); 251 | // it->print(); 252 | // printf("curTime %1.2lf\n", curTime*1000.0); 253 | times.push_back(curTime); 254 | if (curTime < bestTime) { 255 | bestTime = curTime; 256 | bestPlan = it; 257 | } 258 | } 259 | if (bestPlan == plans.end()) return CUTT_INTERNAL_ERROR; 260 | 261 | // bestPlan = plans.begin(); 262 | 263 | // printMatlab(prop, plans, times); 264 | // findMispredictionBest(plans, times, bestPlan, bestTime); 265 | // bestPlan->print(); 266 | 267 | // Create copy of the plan outside the list 268 | cuttPlan_t* plan = new cuttPlan_t(); 269 | *plan = *bestPlan; 270 | // Set device pointers to NULL in the old copy of the plan so 271 | // that they won't be deallocated later when the object is destroyed 272 | bestPlan->nullDevicePointers(); 273 | 274 | // Set stream 275 | plan->setStream(stream); 276 | 277 | // Activate plan 278 | plan->activate(); 279 | 280 | // Insert plan into storage 281 | planStorage.insert( {*handle, plan} ); 282 | 283 | return CUTT_SUCCESS; 284 | } 285 | 286 | cuttResult cuttDestroy(cuttHandle handle) { 287 | auto it = planStorage.find(handle); 288 | if (it == planStorage.end()) return CUTT_INVALID_PLAN; 289 | // Delete instance of cuttPlan_t 290 | delete it->second; 291 | // Delete entry from plan storage 292 | planStorage.erase(it); 293 | return CUTT_SUCCESS; 294 | } 295 | 296 | cuttResult cuttExecute(cuttHandle handle, void* idata, void* odata) { 297 | auto it = planStorage.find(handle); 298 | if (it == planStorage.end()) return CUTT_INVALID_PLAN; 299 | 300 | if (idata == odata) return CUTT_INVALID_PARAMETER; 301 | 302 | cuttPlan_t& plan = *(it->second); 303 | 304 | int deviceID; 305 | cudaCheck(cudaGetDevice(&deviceID)); 306 | if (deviceID != plan.deviceID) 
return CUTT_INVALID_DEVICE; 307 | 308 | if (!cuttKernel(plan, idata, odata)) return CUTT_INTERNAL_ERROR; 309 | return CUTT_SUCCESS; 310 | } 311 | -------------------------------------------------------------------------------- /src/cutt_test.cpp: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #include 26 | #include 27 | #include // std::time 28 | #include // strcmp 29 | #include 30 | #include "cutt.h" 31 | #include "CudaUtils.h" 32 | #include "TensorTester.h" 33 | #include "cuttTimer.h" 34 | #include "cuttGpuModel.h" // testCounters 35 | 36 | // 37 | // Error checking wrapper for cutt 38 | // 39 | #define cuttCheck(stmt) do { \ 40 | cuttResult err = stmt; \ 41 | if (err != CUTT_SUCCESS) { \ 42 | fprintf(stderr, "%s in file %s, function %s\n", #stmt,__FILE__,__FUNCTION__); \ 43 | exit(1); \ 44 | } \ 45 | } while(0) 46 | 47 | cuttTimer* timerFloat; 48 | cuttTimer* timerDouble; 49 | 50 | long long int* dataIn = NULL; 51 | long long int* dataOut = NULL; 52 | int dataSize = 200000000; 53 | TensorTester* tester = NULL; 54 | 55 | bool test1(); 56 | bool test2(); 57 | bool test3(); 58 | bool test4(); 59 | bool test5(); 60 | template bool test_tensor(std::vector& dim, std::vector& permutation); 61 | void printVec(std::vector& vec); 62 | 63 | int main(int argc, char *argv[]) { 64 | 65 | int gpuid = -1; 66 | bool arg_ok = true; 67 | if (argc >= 3) { 68 | if (strcmp(argv[1], "-device") == 0) { 69 | sscanf(argv[2], "%d", &gpuid); 70 | } else { 71 | arg_ok = false; 72 | } 73 | } else if (argc > 1) { 74 | arg_ok = false; 75 | } 76 | 77 | if (!arg_ok) { 78 | printf("cutt_test [options]\n"); 79 | printf("Options:\n"); 80 | printf("-device gpuid : use GPU with ID gpuid\n"); 81 | return 1; 82 | } 83 | 84 | if (gpuid >= 0) { 85 | cudaCheck(cudaSetDevice(gpuid)); 86 | } 87 | 88 | cudaCheck(cudaDeviceReset()); 89 | cudaCheck(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); 90 | 91 | timerFloat = new cuttTimer(4); 92 | timerDouble = new cuttTimer(8); 93 | 94 | // Allocate device data, 100M elements 95 | allocate_device(&dataIn, dataSize); 96 | allocate_device(&dataOut, dataSize); 
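  // (Note: dataSize is 200000000, i.e. 200M long long elements per buffer,
  //  about 1.6 GB each, so roughly 3.2 GB of free device memory is needed.)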
97 | 98 | // Create tester 99 | tester = new TensorTester(); 100 | tester->setTensorCheckPattern((unsigned int *)dataIn, dataSize*2); 101 | 102 | if (!test1()) goto fail; 103 | if (!test2()) goto fail; 104 | if (!test3()) goto fail; 105 | if (!test4()) goto fail; 106 | if (!test5()) goto fail; 107 | 108 | { 109 | std::vector worstDim; 110 | std::vector worstPermutation; 111 | double worstBW = timerDouble->getWorst(worstDim, worstPermutation); 112 | printf("worstBW %4.2lf GB/s\n", worstBW); 113 | printf("dim\n"); 114 | printVec(worstDim); 115 | printf("permutation\n"); 116 | printVec(worstPermutation); 117 | } 118 | 119 | printf("test OK\n"); 120 | goto end; 121 | fail: 122 | printf("test FAIL\n"); 123 | end: 124 | deallocate_device(&dataIn); 125 | deallocate_device(&dataOut); 126 | delete tester; 127 | 128 | delete timerFloat; 129 | delete timerDouble; 130 | 131 | cudaCheck(cudaDeviceReset()); 132 | return 0; 133 | } 134 | 135 | // 136 | // Test 1: Test all permutations up to rank 7 on smallish tensors 137 | // 138 | bool test1() { 139 | const int minDim = 2; 140 | const int maxDim = 16; 141 | for (int rank = 2;rank <= 7;rank++) { 142 | 143 | std::vector dim(rank); 144 | std::vector permutation(rank); 145 | for (int r=0;r < rank;r++) { 146 | permutation[r] = r; 147 | dim[r] = minDim + r*(maxDim - minDim)/rank; 148 | } 149 | 150 | do { 151 | if (!test_tensor(dim, permutation)) return false; 152 | if (!test_tensor(dim, permutation)) return false; 153 | } while (std::next_permutation(permutation.begin(), permutation.begin() + rank)); 154 | 155 | } 156 | 157 | return true; 158 | } 159 | 160 | // 161 | // Test 2: Test ranks 2-15, random volume, random permutation, random dimensions 162 | // 100 samples each rank 163 | // 164 | bool test2() { 165 | double minDim = 2.0; 166 | 167 | std::srand(unsigned (std::time(0))); 168 | 169 | for (int rank = 2;rank <= 15;rank++) { 170 | double volmin = pow(minDim+1, rank); 171 | double volmax = (double)dataSize; 172 | 173 | for (int isample=0;isample < 100;isample++) { 174 | 175 | std::vector dim(rank); 176 | std::vector permutation(rank); 177 | for (int r=0;r < rank;r++) permutation[r] = r; 178 | double vol = 1.0; 179 | double curvol = 1.0; 180 | int iter = 0; 181 | do { 182 | vol = (volmin + (volmax - volmin)*((double)rand())/((double)RAND_MAX) ); 183 | 184 | int subiter = 0; 185 | do { 186 | for (int r=0;r < rank;r++) { 187 | double vol_left = vol/(curvol*pow(minDim, (double)(rank-r))); 188 | double aveDim = pow(vol, 1.0/(double)rank); 189 | double dimSpread = (aveDim - minDim); 190 | // rn = -1 ... 
1 191 | double rn = 2.0*(((double)rand())/((double)RAND_MAX) - 0.5); 192 | dim[r] = (int)(aveDim + dimSpread*rn); 193 | curvol *= (double)dim[r]; 194 | } 195 | 196 | // printf("vol %lf curvol %lf\n", vol, curvol); 197 | // printf("dim"); 198 | // for (int r=0;r < rank;r++) printf(" %d", dim[r]); 199 | // printf("\n"); 200 | 201 | double vol_scale = pow(vol/curvol, 1.0/(double)rank); 202 | // printf("vol_scale %lf\n", vol_scale); 203 | curvol = 1.0; 204 | for (int r=0;r < rank;r++) { 205 | dim[r] = std::max(2, (int)round((double)dim[r]*vol_scale)); 206 | curvol *= dim[r]; 207 | } 208 | 209 | // printf("vol %lf curvol %lf\n", vol, curvol); 210 | // printf("dim"); 211 | // for (int r=0;r < rank;r++) printf(" %d", dim[r]); 212 | // printf("\n"); 213 | // return false; 214 | 215 | subiter++; 216 | } while (subiter < 50 && (curvol > volmax || fabs(curvol-vol)/(double)vol > 2.3)); 217 | 218 | // printf("vol %lf curvol %lf volmin %lf volmax %lf\n", vol, curvol, volmin, volmax); 219 | // printf("dim"); 220 | // for (int r=0;r < rank;r++) printf(" %d", dim[r]); 221 | // printf("\n"); 222 | 223 | iter++; 224 | if (iter == 1000) { 225 | printf("vol %lf\n", vol); 226 | printf("Unable to determine dimensions in 1000 iterations\n"); 227 | return false; 228 | } 229 | } while (curvol > volmax || fabs(curvol-vol)/(double)vol > 2.3); 230 | 231 | std::random_shuffle(permutation.begin(), permutation.end()); 232 | 233 | if (!test_tensor(dim, permutation)) return false; 234 | if (!test_tensor(dim, permutation)) return false; 235 | } 236 | 237 | } 238 | 239 | return true; 240 | } 241 | 242 | // 243 | // Test 3: hand picked examples 244 | // 245 | bool test3() { 246 | 247 | { 248 | int rank = 2; 249 | std::vector dim(rank); 250 | std::vector permutation(rank); 251 | dim[0] = 43; 252 | dim[1] = 67; 253 | permutation[0] = 1; 254 | permutation[1] = 0; 255 | if (!test_tensor(dim, permutation)) return false; 256 | if (!test_tensor(dim, permutation)) return false; 257 | dim[0] = 65536*32; 258 | dim[1] = 2; 259 | permutation[0] = 1; 260 | permutation[1] = 0; 261 | if (!test_tensor(dim, permutation)) return false; 262 | if (!test_tensor(dim, permutation)) return false; 263 | } 264 | 265 | { 266 | int rank = 3; 267 | std::vector dim(rank); 268 | std::vector permutation(rank); 269 | dim[0] = 1305; 270 | dim[1] = 599; 271 | dim[2] = 88; 272 | permutation[0] = 0; 273 | permutation[1] = 2; 274 | permutation[2] = 1; 275 | if (!test_tensor(dim, permutation)) return false; 276 | if (!test_tensor(dim, permutation)) return false; 277 | } 278 | 279 | { 280 | int rank = 4; 281 | std::vector dim(rank); 282 | std::vector permutation(rank); 283 | dim[0] = 24; 284 | dim[1] = 330; 285 | dim[2] = 64; 286 | dim[3] = 147; 287 | permutation[0] = 1; 288 | permutation[1] = 0; 289 | permutation[2] = 2; 290 | permutation[3] = 3; 291 | if (!test_tensor(dim, permutation)) return false; 292 | if (!test_tensor(dim, permutation)) return false; 293 | } 294 | 295 | { 296 | int rank = 4; 297 | std::vector dim(rank); 298 | std::vector permutation(rank); 299 | dim[0] = 2; 300 | dim[1] = 5; 301 | dim[2] = 9; 302 | dim[3] = 12; 303 | permutation[0] = 0; 304 | permutation[1] = 1; 305 | permutation[2] = 2; 306 | permutation[3] = 3; 307 | if (!test_tensor(dim, permutation)) return false; 308 | if (!test_tensor(dim, permutation)) return false; 309 | } 310 | 311 | { 312 | int rank = 6; 313 | std::vector dim(rank); 314 | std::vector permutation(rank); 315 | dim[0] = 2; 316 | dim[1] = 4; 317 | dim[2] = 6; 318 | dim[3] = 9; 319 | dim[4] = 11; 320 | dim[5] = 13; 321 
| permutation[0] = 0; 322 | permutation[1] = 1; 323 | permutation[2] = 2; 324 | permutation[3] = 3; 325 | permutation[4] = 4; 326 | permutation[5] = 5; 327 | if (!test_tensor(dim, permutation)) return false; 328 | if (!test_tensor(dim, permutation)) return false; 329 | } 330 | 331 | { 332 | std::vector dim(5); 333 | std::vector permutation(5); 334 | dim[0] = 5; 335 | dim[1] = 42; 336 | dim[2] = 75; 337 | dim[3] = 86; 338 | dim[4] = 57; 339 | permutation[0] = 2 - 1; 340 | permutation[1] = 4 - 1; 341 | permutation[2] = 5 - 1; 342 | permutation[3] = 3 - 1; 343 | permutation[4] = 1 - 1; 344 | if (!test_tensor(dim, permutation)) return false; 345 | if (!test_tensor(dim, permutation)) return false; 346 | } 347 | 348 | { 349 | std::vector dim(5); 350 | std::vector permutation(5); 351 | dim[0] = 5; 352 | dim[1] = 3; 353 | dim[2] = 2; 354 | dim[3] = 9; 355 | dim[4] = 14; 356 | permutation[0] = 0; 357 | permutation[1] = 1; 358 | permutation[2] = 3; 359 | permutation[3] = 2; 360 | permutation[4] = 4; 361 | if (!test_tensor(dim, permutation)) return false; 362 | if (!test_tensor(dim, permutation)) return false; 363 | } 364 | 365 | return true; 366 | } 367 | 368 | // 369 | // Test 4: streaming 370 | // 371 | bool test4() { 372 | 373 | std::vector dim = {24, 32, 16, 36, 43, 9}; 374 | std::vector permutation = {5, 1, 4, 2, 3, 0}; 375 | 376 | const int numStream = 10; 377 | 378 | cudaStream_t streams[numStream]; 379 | for (int i=0;i < numStream;i++) { 380 | cudaCheck(cudaStreamCreate(&streams[i])); 381 | } 382 | 383 | cudaCheck(cudaDeviceSynchronize()); 384 | 385 | cuttHandle plans[numStream]; 386 | 387 | for (int i=0;i < numStream;i++) { 388 | cuttCheck(cuttPlan(&plans[i], dim.size(), dim.data(), permutation.data(), sizeof(double), streams[i])); 389 | cuttCheck(cuttExecute(plans[i], dataIn, dataOut)); 390 | } 391 | 392 | cudaCheck(cudaDeviceSynchronize()); 393 | 394 | bool run_ok = tester->checkTranspose(dim.size(), dim.data(), permutation.data(), (long long int *)dataOut); 395 | 396 | cudaCheck(cudaDeviceSynchronize()); 397 | 398 | for (int i=0;i < numStream;i++) { 399 | cuttCheck(cuttDestroy(plans[i])); 400 | cudaCheck(cudaStreamDestroy(streams[i])); 401 | } 402 | 403 | return run_ok; 404 | } 405 | 406 | 407 | // 408 | // Test 5: Transaction and cache line counters 409 | // 410 | bool test5() { 411 | 412 | { 413 | // Number of elements that are loaded per memory transaction: 414 | // 128 bytes per transaction 415 | const int accWidth = 128/sizeof(double); 416 | // L2 cache line width is 32 bytes 417 | const int cacheWidth = 32/sizeof(double); 418 | if (!testCounters(32, accWidth, cacheWidth)) return false; 419 | } 420 | 421 | { 422 | // Number of elements that are loaded per memory transaction: 423 | // 128 bytes per transaction 424 | const int accWidth = 128/sizeof(float); 425 | // L2 cache line width is 32 bytes 426 | const int cacheWidth = 32/sizeof(float); 427 | if (!testCounters(32, accWidth, cacheWidth)) return false; 428 | } 429 | 430 | return true; 431 | } 432 | 433 | 434 | template 435 | bool test_tensor(std::vector& dim, std::vector& permutation) { 436 | 437 | int rank = dim.size(); 438 | 439 | int vol = 1; 440 | for (int r=0;r < rank;r++) { 441 | vol *= dim[r]; 442 | } 443 | 444 | size_t volmem = vol*sizeof(T); 445 | size_t datamem = dataSize*sizeof(long long int); 446 | if (volmem > datamem) { 447 | printf("test_tensor, data size exceeded\n"); 448 | return false; 449 | } 450 | 451 | printf("number of elements %d\n", vol); 452 | printf("dimensions\n"); 453 | printVec(dim); 454 | 
printf("permutation\n"); 455 | printVec(permutation); 456 | 457 | cuttTimer* timer; 458 | if (sizeof(T) == 4) { 459 | timer = timerFloat; 460 | } else { 461 | timer = timerDouble; 462 | } 463 | 464 | cuttHandle plan; 465 | cuttCheck(cuttPlan(&plan, rank, dim.data(), permutation.data(), sizeof(T), 0)); 466 | set_device_array((T *)dataOut, -1, vol); 467 | cudaCheck(cudaDeviceSynchronize()); 468 | 469 | if (vol > 1000000) timer->start(dim, permutation); 470 | cuttCheck(cuttExecute(plan, dataIn, dataOut)); 471 | if (vol > 1000000) timer->stop(); 472 | 473 | cuttCheck(cuttDestroy(plan)); 474 | 475 | return tester->checkTranspose(rank, dim.data(), permutation.data(), (T *)dataOut); 476 | } 477 | 478 | void printVec(std::vector& vec) { 479 | for (int i=0;i < vec.size();i++) { 480 | printf("%d ", vec[i]); 481 | } 482 | printf("\n"); 483 | } 484 | 485 | -------------------------------------------------------------------------------- /src/cuttGpuModelKernel.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #include 26 | #include "CudaUtils.h" 27 | #include "cuttGpuModelKernel.h" 28 | 29 | #define RESTRICT //__restrict__ 30 | 31 | // 32 | // Global memory access statistics 33 | // 34 | struct MemStat { 35 | int gld_tran; 36 | int gst_tran; 37 | int gld_req; 38 | int gst_req; 39 | int cl_full_l2; 40 | int cl_part_l2; 41 | int cl_full_l1; 42 | int cl_part_l1; 43 | // int l1_tran; 44 | __device__ __forceinline__ void clear() { 45 | gld_tran = 0; 46 | gst_tran = 0; 47 | gld_req = 0; 48 | gst_req = 0; 49 | cl_full_l2 = 0; 50 | cl_part_l2 = 0; 51 | cl_full_l1 = 0; 52 | cl_part_l1 = 0; 53 | // l1_tran = 0; 54 | } 55 | }; 56 | 57 | // 58 | // Returns scalar tensor position. Each lane has the same p 59 | // NOTE: c and d on inactive warps must be 1 !! 
60 | // 61 | __device__ __forceinline__ 62 | int tensorPos( 63 | const int p, const int rank, const int c, const int d, const int ct, 64 | const int numLane=warpSize 65 | ) { 66 | 67 | int r = ((p/c) % d)*ct; 68 | #pragma unroll 69 | for (int i=numLane/2;i >= 1;i/=2) { 70 | r += __shfl_xor_sync(FULL_MASK, r, i); 71 | } 72 | return r; 73 | 74 | } 75 | 76 | // 77 | // Counts number of global memory transactions for a warp that accesses 78 | // memory at pos using warp lanes 0, ..., n - 1 79 | // 80 | __device__ __forceinline__ 81 | int countGlTransactions(const int pos, const int n, const int accWidth, const int warpLane) { 82 | int seg0 = pos/accWidth; 83 | int srcLane = (warpLane == 0 || warpLane >= n) ? (warpLane) : (warpLane - 1); 84 | int seg1 = __shfl_sync(FULL_MASK, seg0, srcLane); 85 | int count = __popc(__ballot_sync(FULL_MASK, seg0 != seg1)) + 1; 86 | count = (n == 0) ? 0 : count; 87 | return count; 88 | } 89 | 90 | // 91 | // Counts number of global memory transactions for a warp that accesses 92 | // memory at pos using warp lanes 0, ..., n - 1 93 | // 94 | __device__ __forceinline__ 95 | int countGlTransactions(const int* segbuf, const int n) { 96 | int count = 0; 97 | for (int i = threadIdx.x;i < n;i += blockDim.x) { 98 | int seg = segbuf[i]; 99 | int seg_prev = (i - 1 >= 0) ? segbuf[i - 1] : -1; 100 | count += (seg != seg_prev); 101 | } 102 | return count; 103 | } 104 | 105 | // 106 | // Counts number of full and partial cache lines for a warp that accesses per warp 107 | // memory at pos using warp lanes 0, ..., n - 1 108 | // 109 | __device__ __forceinline__ 110 | void countCacheLines(const int pos, const int n, const int cacheWidth, const int warpLane, 111 | int& cl_full, int& cl_part) { 112 | 113 | int seg = pos/cacheWidth; 114 | // Lane is at the beginning of a full cache line, if seg0 matches seg0 cacheWidth - 1 away 115 | int readLane = warpLane + (cacheWidth - 1); 116 | int val = (seg == __shfl_sync(FULL_MASK, seg, readLane)); 117 | val = (readLane < n) ? val : 0; 118 | cl_full += val; 119 | 120 | unsigned int valbit = (((val << cacheWidth) - 1)*val) << warpLane; 121 | // Perform warpSize-way bitwise or 122 | #pragma unroll 123 | for (int i=warpSize/2;i >= 1;i/=2) { 124 | valbit |= __shfl_xor_sync(FULL_MASK, valbit, i); 125 | } 126 | // Now: lanes with valbit set are part of a full cache line, 127 | // lanes with valbit unset are part of a partial cache line 128 | int full = (valbit >> warpLane) & 1; 129 | 130 | seg = (warpLane < n) ? seg : -1; 131 | int segP1 = __shfl_down_sync(FULL_MASK, seg, 1); 132 | segP1 = (warpLane + 1 < warpSize) ? segP1 : -1; 133 | int val2 = ((!full) && seg != segP1); 134 | cl_part += val2; 135 | } 136 | 137 | // 138 | // Counts number of full and partial cache lines for a warp that accesses 139 | // memory at cachelines segbuf[0] ... 
segbuf[n - 1] 140 | // 141 | __device__ __forceinline__ 142 | void countCacheLines(int* segbuf, const int n, const int cacheWidth, 143 | int& cl_full, int& cl_part) { 144 | 145 | const int topbit = (1 << 31); 146 | const int lowbits = ~(1 << 31); 147 | 148 | for (int i = threadIdx.x;i < n;i += blockDim.x) { 149 | // seg[i] is at the beginning of a full cache line, if seg[i] matches seg[i + cacheWidth - 1] 150 | int i1 = i + (cacheWidth - 1); 151 | int val = 0; 152 | if (i1 < n) val = ((segbuf[i] & lowbits) == (segbuf[i1] & lowbits)); 153 | cl_full += val; 154 | // Mark full cache lines with top bit set to 1 155 | if (val) { 156 | for (int j=0;j < cacheWidth;j++) { 157 | if (i + j < n) segbuf[i + j] |= topbit; 158 | } 159 | } 160 | } 161 | __syncthreads(); 162 | 163 | for (int i = threadIdx.x;i < n;i += blockDim.x) { 164 | int seg = segbuf[i]; 165 | int segP1 = (i + 1 < n) ? segbuf[i + 1] : -1; 166 | int part = ((seg & topbit) == 0); 167 | int val2 = (part && seg != segP1); 168 | cl_part += val2; 169 | } 170 | 171 | // Clear top bits 172 | __syncthreads(); 173 | for (int i = threadIdx.x;i < n;i += blockDim.x) { 174 | segbuf[i] &= lowbits; 175 | } 176 | 177 | } 178 | 179 | // 180 | // Runs countGlTransactions and countCacheLines counters for testing 181 | // Unused values in posData[] are marked with "-1" 182 | // 183 | __global__ void runCountersKernel(const int* posData, const int numPosData, 184 | const int accWidth, const int cacheWidth, int* tranData, int* cl_fullData, int* cl_partData) { 185 | 186 | const int warpLane = threadIdx.x & (warpSize - 1); 187 | 188 | for (int i=threadIdx.x + blockIdx.x*blockDim.x;i < numPosData;i+=blockDim.x*gridDim.x) { 189 | int pos = posData[i]; 190 | int flag = (pos == -1); 191 | int ffsval = __ffs(__ballot_sync(FULL_MASK, flag)) - 1; 192 | int n = (__any_sync(FULL_MASK, flag)) ? ffsval : warpSize; 193 | int tran = countGlTransactions(pos, n, accWidth, warpLane); 194 | int cl_full = 0; 195 | int cl_part = 0; 196 | countCacheLines(pos, n, cacheWidth, warpLane, cl_full, cl_part); 197 | #pragma unroll 198 | for (int k=warpSize/2;k >= 1;k/=2) { 199 | cl_full += __shfl_xor_sync(FULL_MASK, cl_full, k); 200 | cl_part += __shfl_xor_sync(FULL_MASK, cl_part, k); 201 | } 202 | int j = i / warpSize; 203 | tranData[j] = tran; 204 | cl_fullData[j] = cl_full; 205 | cl_partData[j] = cl_part; 206 | } 207 | 208 | } 209 | 210 | // 211 | // Reduce memStat within warp and write result to global memory 212 | // NOTE: Not super-efficient since every warp does atomicAdd(). 
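// Only the per-lane cache-line counters (cl_full_l1/l2, cl_part_l1/l2) need the
// butterfly reduction below; the transaction and request counters are already
// warp-uniform, which is why their shuffle lines are commented out. Lane 0 then
// accumulates the warp's totals into the global MemStat with one atomicAdd per field.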
213 | // 214 | __device__ __forceinline__ 215 | void writeMemStat(const int warpLane, MemStat memStat, MemStat* RESTRICT glMemStat) { 216 | for (int i=16;i >= 1;i/=2) { 217 | // memStat.gld_tran += __shfl_xor_sync(FULL_MASK, memStat.gld_tran, i); 218 | // memStat.gst_tran += __shfl_xor_sync(FULL_MASK, memStat.gst_tran, i); 219 | // memStat.gld_req += __shfl_xor_sync(FULL_MASK, memStat.gld_req, i); 220 | // memStat.gst_req += __shfl_xor_sync(FULL_MASK, memStat.gst_req, i); 221 | memStat.cl_full_l2 += __shfl_xor_sync(FULL_MASK, memStat.cl_full_l2, i); 222 | memStat.cl_part_l2 += __shfl_xor_sync(FULL_MASK, memStat.cl_part_l2, i); 223 | memStat.cl_full_l1 += __shfl_xor_sync(FULL_MASK, memStat.cl_full_l1, i); 224 | memStat.cl_part_l1 += __shfl_xor_sync(FULL_MASK, memStat.cl_part_l1, i); 225 | // memStat.l1_tran += __shfl_xor_sync(FULL_MASK, memStat.l1_tran, i); 226 | } 227 | if (warpLane == 0) { 228 | atomicAdd(&(glMemStat->gld_tran), memStat.gld_tran); 229 | atomicAdd(&(glMemStat->gst_tran), memStat.gst_tran); 230 | atomicAdd(&(glMemStat->gld_req), memStat.gld_req); 231 | atomicAdd(&(glMemStat->gst_req), memStat.gst_req); 232 | atomicAdd(&(glMemStat->cl_full_l2), memStat.cl_full_l2); 233 | atomicAdd(&(glMemStat->cl_part_l2), memStat.cl_part_l2); 234 | atomicAdd(&(glMemStat->cl_full_l1), memStat.cl_full_l1); 235 | atomicAdd(&(glMemStat->cl_part_l1), memStat.cl_part_l1); 236 | // atomicAdd(&(glMemStat->l1_tran), memStat.l1_tran); 237 | } 238 | } 239 | 240 | // 241 | // Transpose when Mm and Mk don't overlap and contain only single rank 242 | // 243 | // dim3 numthread(TILEDIM, TILEROWS, 1); 244 | // dim3 numblock( ((plan.volMm-1)/TILEDIM+1)*((plan.volMk-1)/TILEDIM+1), 1, plan.volMbar); 245 | // 246 | __global__ void 247 | __launch_bounds__(TILEDIM*TILEROWS, 1) 248 | countTiled( 249 | const int numMm, const int volMbar, const int sizeMbar, 250 | const int2 tiledVol, const int cuDimMk, const int cuDimMm, 251 | const TensorConvInOut* RESTRICT glMbar, 252 | const int accWidth, const int cacheWidth, 253 | MemStat* RESTRICT glMemStat) { 254 | 255 | const int warpLane = threadIdx.x & (warpSize - 1); 256 | TensorConvInOut Mbar; 257 | Mbar.c_in = 1; 258 | Mbar.d_in = 1; 259 | Mbar.c_out = 1; 260 | Mbar.d_out = 1; 261 | if (warpLane < sizeMbar) { 262 | Mbar = glMbar[warpLane]; 263 | } 264 | 265 | const int bx = (blockIdx.x % numMm)*TILEDIM; 266 | const int by = (blockIdx.x / numMm)*TILEDIM; 267 | 268 | const int xin = bx + threadIdx.x; 269 | const int yin = by + threadIdx.y; 270 | 271 | const int xout = bx + threadIdx.y; 272 | const int yout = by + threadIdx.x; 273 | 274 | const unsigned int maskIny = __ballot_sync(FULL_MASK, (yin + warpLane < tiledVol.y))*(xin < tiledVol.x); 275 | const unsigned int maskOutx = __ballot_sync(FULL_MASK, (xout + warpLane < tiledVol.x))*(yout < tiledVol.y); 276 | 277 | const int posMinorIn = xin + yin*cuDimMk; 278 | const int posMinorOut = yout + xout*cuDimMm; 279 | const int posInAdd = TILEROWS*cuDimMk; 280 | const int posOutAdd = TILEROWS*cuDimMm; 281 | 282 | MemStat memStat; 283 | memStat.clear(); 284 | 285 | for (int posMbar=blockIdx.z;posMbar < volMbar;posMbar += gridDim.z) 286 | { 287 | 288 | // Compute global memory positions 289 | int posMajorIn = ((posMbar/Mbar.c_in) % Mbar.d_in)*Mbar.ct_in; 290 | int posMajorOut = ((posMbar/Mbar.c_out) % Mbar.d_out)*Mbar.ct_out; 291 | #pragma unroll 292 | for (int i=16;i >= 1;i/=2) { 293 | posMajorIn += __shfl_xor_sync(FULL_MASK, posMajorIn, i); 294 | posMajorOut += __shfl_xor_sync(FULL_MASK, posMajorOut, i); 295 | } 296 | int 
posIn = posMajorIn + posMinorIn; 297 | int posOut = posMajorOut + posMinorOut; 298 | 299 | // Read data into shared memory tile 300 | #pragma unroll 301 | for (int j=0;j < TILEDIM;j += TILEROWS) { 302 | int n = __popc(__ballot_sync(FULL_MASK, maskIny & (1 << j))); 303 | memStat.gld_tran += countGlTransactions(posIn, n, accWidth, warpLane); 304 | memStat.gld_req += __any_sync(FULL_MASK, n > 0); 305 | posIn += posInAdd; 306 | } 307 | 308 | #pragma unroll 309 | for (int j=0;j < TILEDIM;j += TILEROWS) { 310 | int n = __popc(__ballot_sync(FULL_MASK, maskOutx & (1 << j))); 311 | memStat.gst_tran += countGlTransactions(posOut, n, accWidth, warpLane); 312 | memStat.gst_req += __any_sync(FULL_MASK, n > 0); 313 | countCacheLines(posOut, n, cacheWidth, warpLane, memStat.cl_full_l2, memStat.cl_part_l2); 314 | posOut += posOutAdd; 315 | } 316 | 317 | } 318 | 319 | // Reduce memStat within thread block and write result to global memory 320 | writeMemStat(warpLane, memStat, glMemStat); 321 | 322 | } 323 | 324 | // 325 | // Packed transpose. Thread block loads plan.volMmk number of elements 326 | // 327 | template 328 | __global__ void 329 | __launch_bounds__(1024, 1) 330 | countPacked( 331 | const int volMmk, const int volMbar, 332 | const int sizeMmk, const int sizeMbar, 333 | const TensorConvInOut* RESTRICT gl_Mmk, 334 | const TensorConvInOut* RESTRICT gl_Mbar, 335 | const int accWidth, const int cacheWidth, 336 | MemStat* RESTRICT glMemStat) { 337 | 338 | extern __shared__ int shSegOut[]; 339 | 340 | const int warpLane = threadIdx.x & (warpSize - 1); 341 | 342 | TensorConvInOut Mmk; 343 | Mmk.c_in = 1; 344 | Mmk.d_in = 1; 345 | Mmk.c_out = 1; 346 | Mmk.d_out = 1; 347 | if (warpLane < sizeMmk) { 348 | Mmk = gl_Mmk[warpLane]; 349 | } 350 | 351 | // Pre-compute tensor positions in Mmk 352 | // 3*numRegStorage registers 353 | int posMmkIn[numRegStorage]; 354 | int posMmkOut[numRegStorage]; 355 | #pragma unroll 356 | for (int j=0;j < numRegStorage;j++) { 357 | posMmkIn[j] = 0; 358 | posMmkOut[j] = 0; 359 | } 360 | for (int i=0;i < sizeMmk;i++) { 361 | #pragma unroll 362 | for (int j=0;j < numRegStorage;j++) { 363 | int posMmk = threadIdx.x + j*blockDim.x; 364 | posMmkIn[j] += ((posMmk / __shfl_sync(FULL_MASK, Mmk.c_in,i)) % __shfl_sync(FULL_MASK, Mmk.d_in,i))*__shfl_sync(FULL_MASK, Mmk.ct_in,i); 365 | posMmkOut[j] += ((posMmk / __shfl_sync(FULL_MASK, Mmk.c_out,i)) % __shfl_sync(FULL_MASK, Mmk.d_out,i))*__shfl_sync(FULL_MASK, Mmk.ct_out,i); 366 | } 367 | } 368 | 369 | // 6 registers 370 | TensorConvInOut Mbar; 371 | Mbar.c_in = 1; 372 | Mbar.d_in = 1; 373 | Mbar.c_out = 1; 374 | Mbar.d_out = 1; 375 | if (warpLane < sizeMbar) { 376 | Mbar = gl_Mbar[warpLane]; 377 | } 378 | 379 | MemStat memStat; 380 | memStat.clear(); 381 | 382 | for (int posMbar=blockIdx.x;posMbar < volMbar;posMbar += gridDim.x) 383 | { 384 | 385 | int posMbarOut = ((posMbar/Mbar.c_out) % Mbar.d_out)*Mbar.ct_out; 386 | #pragma unroll 387 | for (int i=16;i >= 1;i/=2) { 388 | posMbarOut += __shfl_xor_sync(FULL_MASK, posMbarOut, i); 389 | } 390 | 391 | int posMbarIn = ((posMbar/Mbar.c_in) % Mbar.d_in)*Mbar.ct_in; 392 | #pragma unroll 393 | for (int i=16;i >= 1;i/=2) { 394 | posMbarIn += __shfl_xor_sync(FULL_MASK, posMbarIn, i); 395 | } 396 | 397 | // Read from global memory 398 | #pragma unroll 399 | for (int j=0;j < numRegStorage;j++) { 400 | int posMmk = threadIdx.x + j*blockDim.x; 401 | int posIn = posMbarIn + posMmkIn[j]; 402 | int n = __popc(__ballot_sync(FULL_MASK, posMmk < volMmk)); 403 | memStat.gld_tran += countGlTransactions(posIn, 
n, accWidth, warpLane); 404 | memStat.gld_req += __any_sync(FULL_MASK, n > 0); 405 | } 406 | 407 | // Write to global memory 408 | #pragma unroll 409 | for (int j=0;j < numRegStorage;j++) { 410 | int posMmk = threadIdx.x + j*blockDim.x; 411 | int posOut = posMbarOut + posMmkOut[j]; 412 | int n = __popc(__ballot_sync(FULL_MASK, posMmk < volMmk)); 413 | memStat.gst_tran += countGlTransactions(posOut, n, accWidth, warpLane); 414 | memStat.gst_req += __any_sync(FULL_MASK, n > 0); 415 | if (posMmk < volMmk) shSegOut[posMmk] = posOut/cacheWidth; 416 | } 417 | 418 | __syncthreads(); 419 | countCacheLines(shSegOut, volMmk, cacheWidth, memStat.cl_full_l2, memStat.cl_part_l2); 420 | // Go from L2 segments to L1 segments 421 | __syncthreads(); 422 | const int L2toL1 = accWidth/cacheWidth; 423 | for (int i=threadIdx.x;i < volMmk;i+=blockDim.x) { 424 | shSegOut[i] /= L2toL1; 425 | } 426 | __syncthreads(); 427 | countCacheLines(shSegOut, volMmk, accWidth, memStat.cl_full_l1, memStat.cl_part_l1); 428 | 429 | // __syncthreads(); 430 | // memStat.l1_tran += countGlTransactions(shSegOut, volMmk); 431 | 432 | } 433 | 434 | // Reduce memStat within thread block and write result to global memory 435 | writeMemStat(warpLane, memStat, glMemStat); 436 | 437 | } 438 | 439 | // 440 | // Packed method with a split rank 441 | // 442 | // dim nthread(((volMmkWithSplit - 1)/(prop.warpSize*lc.numRegStorage) + 1)*prop.warpSize, 1, 1) 443 | // dim nblock(ts.numSplit, min(256, max(1, ts.volMbar)), 1) 444 | // 445 | template 446 | __global__ void 447 | __launch_bounds__(1024, 1) 448 | countPackedSplit( 449 | const int splitDim, const int volMmkUnsplit, const int volMbar, 450 | const int sizeMmk, const int sizeMbar, 451 | const int cMmSplit, const int cMkSplit, 452 | const TensorConvInOut* RESTRICT glMmk, 453 | const TensorConvInOut* RESTRICT glMbar, 454 | const int accWidth, const int cacheWidth, 455 | MemStat* RESTRICT glMemStat) { 456 | 457 | extern __shared__ int shSegOut[]; 458 | 459 | const int warpLane = threadIdx.x & (warpSize - 1); 460 | 461 | // const int plusone = (blockIdx.x < (splitDim % gridDim.x)); 462 | const int p0 = blockIdx.x*splitDim/gridDim.x; 463 | const int volSplit = (blockIdx.x + 1)*splitDim/gridDim.x - p0; 464 | const int plusone = volSplit - splitDim/gridDim.x; 465 | 466 | TensorConvInOut Mmk; 467 | Mmk.c_in = 1; 468 | Mmk.d_in = 1; 469 | Mmk.c_out = 1; 470 | Mmk.d_out = 1; 471 | if (warpLane < sizeMmk) { 472 | Mmk = glMmk[warpLane + plusone*sizeMmk]; 473 | } 474 | 475 | // gridDim.x = number of splits 476 | // blockIdx.x = {0 ... 
gridDim.x - 1} is the split-index 477 | // Volume of this split 478 | // const int volSplit = (splitDim/gridDim.x) + plusone; 479 | // Start position in this split 480 | // const int p0 = (splitDim/gridDim.x)*blockIdx.x + min(blockIdx.x, (splitDim % gridDim.x)); 481 | const int posMmkIn0 = p0*cMmSplit; 482 | const int posMmkOut0 = p0*cMkSplit; 483 | // Volume of split Mmk 484 | const int volMmkSplit = volSplit*volMmkUnsplit; 485 | 486 | // Pre-compute tensor positions in Mmk 487 | // 3*numRegStorage registers 488 | int posMmkIn[numRegStorage]; 489 | int posMmkOut[numRegStorage]; 490 | #pragma unroll 491 | for (int j=0;j < numRegStorage;j++) { 492 | posMmkIn[j] = posMmkIn0; 493 | posMmkOut[j] = posMmkOut0; 494 | } 495 | for (int i=0;i < sizeMmk;i++) { 496 | #pragma unroll 497 | for (int j=0;j < numRegStorage;j++) { 498 | int t = threadIdx.x + j*blockDim.x; 499 | posMmkIn[j] += ((t/__shfl_sync(FULL_MASK, Mmk.c_in,i)) % __shfl_sync(FULL_MASK, Mmk.d_in,i))*__shfl_sync(FULL_MASK, Mmk.ct_in,i); 500 | posMmkOut[j] += ((t/__shfl_sync(FULL_MASK, Mmk.c_out,i)) % __shfl_sync(FULL_MASK, Mmk.d_out,i))*__shfl_sync(FULL_MASK, Mmk.ct_out,i); 501 | } 502 | } 503 | 504 | TensorConvInOut Mbar; 505 | Mbar.c_in = 1; 506 | Mbar.d_in = 1; 507 | Mbar.c_out = 1; 508 | Mbar.d_out = 1; 509 | if (warpLane < sizeMbar) { 510 | Mbar = glMbar[warpLane]; 511 | } 512 | 513 | MemStat memStat; 514 | memStat.clear(); 515 | 516 | for (int posMbar=blockIdx.y;posMbar < volMbar;posMbar+=gridDim.y) 517 | { 518 | 519 | int posMbarOut = ((posMbar/Mbar.c_out) % Mbar.d_out)*Mbar.ct_out; 520 | #pragma unroll 521 | for (int i=16;i >= 1;i/=2) { 522 | posMbarOut += __shfl_xor_sync(FULL_MASK, posMbarOut, i); 523 | } 524 | 525 | int posMbarIn = ((posMbar/Mbar.c_in) % Mbar.d_in)*Mbar.ct_in; 526 | #pragma unroll 527 | for (int i=16;i >= 1;i/=2) { 528 | posMbarIn += __shfl_xor_sync(FULL_MASK, posMbarIn, i); 529 | } 530 | 531 | // Read from global memory 532 | #pragma unroll 533 | for (int j=0;j < numRegStorage;j++) { 534 | int posMmk = threadIdx.x + j*blockDim.x; 535 | int posIn = posMbarIn + posMmkIn[j]; 536 | int n = __popc(__ballot_sync(FULL_MASK, posMmk < volMmkSplit)); 537 | memStat.gld_tran += countGlTransactions(posIn, n, accWidth, warpLane); 538 | memStat.gld_req += __any_sync(FULL_MASK, n > 0); 539 | } 540 | 541 | // Write to global memory 542 | #pragma unroll 543 | for (int j=0;j < numRegStorage;j++) { 544 | int posMmk = threadIdx.x + j*blockDim.x; 545 | int posOut = posMbarOut + posMmkOut[j]; 546 | int n = __popc(__ballot_sync(FULL_MASK, posMmk < volMmkSplit)); 547 | memStat.gst_tran += countGlTransactions(posOut, n, accWidth, warpLane); 548 | memStat.gst_req += __any_sync(FULL_MASK, n > 0); 549 | if (posMmk < volMmkSplit) shSegOut[posMmk] = posOut / cacheWidth; 550 | // countCacheLines(posOut, n, cacheWidth, warpLane, memStat.cl_full, memStat.cl_part); 551 | } 552 | 553 | __syncthreads(); 554 | countCacheLines(shSegOut, volMmkSplit, cacheWidth, memStat.cl_full_l2, memStat.cl_part_l2); 555 | // Go from L2 segments to L1 segments 556 | __syncthreads(); 557 | const int L2toL1 = accWidth/cacheWidth; 558 | for (int i=threadIdx.x;i < volMmkSplit;i+=blockDim.x) { 559 | shSegOut[i] /= L2toL1; 560 | } 561 | __syncthreads(); 562 | countCacheLines(shSegOut, volMmkSplit, accWidth, memStat.cl_full_l1, memStat.cl_part_l1); 563 | 564 | // __syncthreads(); 565 | // memStat.l1_tran += countGlTransactions(shSegOut, volMmkSplit); 566 | 567 | } 568 | 569 | // Reduce memStat within thread block and write result to global memory 570 | 
writeMemStat(warpLane, memStat, glMemStat); 571 | 572 | } 573 | 574 | // 575 | // Transpose when the lead dimension is the same, e.g. (1, 2, 3) -> (1, 3, 2) 576 | // 577 | // dim3 numthread(TILEDIM, TILEROWS, 1); 578 | // dim3 numblock( ((plan.volMm-1)/TILEDIM+1)*((plan.volMkBar-1)/TILEDIM+1), 1, plan.volMbar); 579 | // 580 | __global__ void 581 | __launch_bounds__(TILEDIM*TILEROWS, 1) 582 | countTiledCopy( 583 | const int numMm, const int volMbar, const int sizeMbar, 584 | const int cuDimMk, const int cuDimMm, 585 | const int2 tiledVol, 586 | const TensorConvInOut* RESTRICT gl_Mbar, 587 | const int accWidth, const int cacheWidth, 588 | MemStat* RESTRICT glMemStat) { 589 | 590 | const int warpLane = threadIdx.x & (warpSize - 1); 591 | TensorConvInOut Mbar; 592 | Mbar.c_in = 1; 593 | Mbar.d_in = 1; 594 | Mbar.c_out = 1; 595 | Mbar.d_out = 1; 596 | if (warpLane < sizeMbar) { 597 | Mbar = gl_Mbar[warpLane]; 598 | } 599 | 600 | const int bx = (blockIdx.x % numMm)*TILEDIM; 601 | const int by = (blockIdx.x / numMm)*TILEDIM; 602 | 603 | const int x = bx + threadIdx.x; 604 | const int y = by + threadIdx.y; 605 | 606 | MemStat memStat; 607 | memStat.clear(); 608 | 609 | for (int posMbar=blockIdx.z;posMbar < volMbar;posMbar += gridDim.z) 610 | { 611 | 612 | // Read global memory 613 | { 614 | int pos0 = tensorPos(posMbar, sizeMbar, Mbar.c_in, Mbar.d_in, Mbar.ct_in); 615 | pos0 += x + y*cuDimMk; 616 | 617 | #pragma unroll 618 | for (int j=0;j < TILEDIM;j += TILEROWS) { 619 | int pos = pos0 + j*cuDimMk; 620 | int n = __popc(__ballot_sync(FULL_MASK, (x < tiledVol.x) && (y + j < tiledVol.y))); 621 | memStat.gld_tran += countGlTransactions(pos, n, accWidth, warpLane); 622 | memStat.gld_req += __any_sync(FULL_MASK, n > 0); 623 | } 624 | } 625 | 626 | // Write global memory 627 | { 628 | int pos0 = tensorPos(posMbar, sizeMbar, Mbar.c_out, Mbar.d_out, Mbar.ct_out); 629 | pos0 += x + y*cuDimMm; 630 | 631 | #pragma unroll 632 | for (int j=0;j < TILEDIM;j += TILEROWS) { 633 | int pos = pos0 + j*cuDimMm; 634 | int n = __popc(__ballot_sync(FULL_MASK, (x < tiledVol.x) && (y + j < tiledVol.y))); 635 | memStat.gst_tran += countGlTransactions(pos, n, accWidth, warpLane); 636 | memStat.gst_req += __any_sync(FULL_MASK, n > 0); 637 | countCacheLines(pos, n, cacheWidth, warpLane, memStat.cl_full_l2, memStat.cl_part_l2); 638 | } 639 | } 640 | 641 | } 642 | 643 | // Reduce memStat within thread block and write result to global memory 644 | writeMemStat(warpLane, memStat, glMemStat); 645 | 646 | } 647 | 648 | //###################################################################################### 649 | //###################################################################################### 650 | //###################################################################################### 651 | 652 | void runCounters(const int warpSize, const int* hostPosData, const int numPosData, 653 | const int accWidth, const int cacheWidth, int* host_tran, int* host_cl_full, int* host_cl_part) { 654 | 655 | const int numWarp = numPosData/warpSize; 656 | 657 | int* devPosData; 658 | allocate_device(&devPosData, numPosData); 659 | copy_HtoD(hostPosData, devPosData, numPosData); 660 | 661 | int* dev_tran; 662 | int* dev_cl_full; 663 | int* dev_cl_part; 664 | allocate_device(&dev_tran, numWarp); 665 | allocate_device(&dev_cl_full, numWarp); 666 | allocate_device(&dev_cl_part, numWarp); 667 | 668 | int nthread = 512; 669 | int nblock = (numPosData - 1)/nthread + 1; 670 | runCountersKernel<<< nblock, nthread >>>(devPosData, numPosData, 671 | 
accWidth, cacheWidth, dev_tran, dev_cl_full, dev_cl_part); 672 | cudaCheck(cudaGetLastError()); 673 | 674 | copy_DtoH(dev_tran, host_tran, numWarp); 675 | copy_DtoH(dev_cl_full, host_cl_full, numWarp); 676 | copy_DtoH(dev_cl_part, host_cl_part, numWarp); 677 | cudaCheck(cudaDeviceSynchronize()); 678 | 679 | deallocate_device(&dev_tran); 680 | deallocate_device(&dev_cl_full); 681 | deallocate_device(&dev_cl_part); 682 | 683 | deallocate_device(&devPosData); 684 | } 685 | 686 | bool cuttGpuModelKernel(cuttPlan_t& plan, const int accWidth, const int cacheWidth, 687 | int& gld_tran, int& gst_tran, int& gld_req, int& gst_req, 688 | int& cl_full_l2, int& cl_part_l2, int& cl_full_l1, int& cl_part_l1) { 689 | 690 | LaunchConfig& lc = plan.launchConfig; 691 | TensorSplit& ts = plan.tensorSplit; 692 | 693 | MemStat* devMemStat; 694 | allocate_device(&devMemStat, 1); 695 | set_device_array(devMemStat, 0, 1, plan.stream); 696 | 697 | switch(ts.method) { 698 | case Trivial: 699 | { 700 | return false; 701 | } 702 | 703 | case Packed: 704 | { 705 | switch(lc.numRegStorage) { 706 | #define CALL0(NREG) \ 707 | countPacked <<< lc.numblock, lc.numthread, ts.volMmk*sizeof(int), plan.stream >>> \ 708 | (ts.volMmk, ts.volMbar, ts.sizeMmk, ts.sizeMbar, \ 709 | plan.Mmk, plan.Mbar, accWidth, cacheWidth, devMemStat) 710 | #define CALL(ICASE) case ICASE: CALL0(ICASE); break 711 | #include "calls.h" 712 | default: 713 | printf("cuttGpuModelKernel no template implemented for numRegStorage %d\n", lc.numRegStorage); 714 | return false; 715 | #undef CALL 716 | #undef CALL0 717 | } 718 | 719 | } 720 | break; 721 | 722 | case PackedSplit: 723 | { 724 | 725 | // Calculate max. volume of split Mmk 726 | const int volSplit = (ts.splitDim/ts.numSplit) + ((ts.splitDim % ts.numSplit) != 0); 727 | const int volMmkSplit = volSplit*ts.volMmkUnsplit; 728 | 729 | switch(lc.numRegStorage) { 730 | #define CALL0(NREG) \ 731 | countPackedSplit <<< lc.numblock, lc.numthread, volMmkSplit*sizeof(int), plan.stream >>> \ 732 | (ts.splitDim, ts.volMmkUnsplit, ts. 
volMbar, ts.sizeMmk, ts.sizeMbar, \ 733 | plan.cuDimMm, plan.cuDimMk, plan.Mmk, plan.Mbar, accWidth, cacheWidth, devMemStat) 734 | #define CALL(ICASE) case ICASE: CALL0(ICASE); break 735 | #include "calls.h" 736 | default: 737 | printf("cuttGpuModelKernel no template implemented for numRegStorage %d\n", lc.numRegStorage); 738 | return false; 739 | #undef CALL 740 | #undef CALL0 741 | } 742 | 743 | } 744 | break; 745 | 746 | case Tiled: 747 | { 748 | countTiled <<< lc.numblock, lc.numthread, 0, plan.stream >>> 749 | (((ts.volMm - 1)/TILEDIM + 1), ts.volMbar, ts.sizeMbar, plan.tiledVol, plan.cuDimMk, plan.cuDimMm, 750 | plan.Mbar, accWidth, cacheWidth, devMemStat); 751 | } 752 | break; 753 | 754 | case TiledCopy: 755 | { 756 | countTiledCopy <<< lc.numblock, lc.numthread, 0, plan.stream >>> 757 | (((ts.volMm - 1)/TILEDIM + 1), ts.volMbar, ts.sizeMbar, plan.cuDimMk, plan.cuDimMm, plan.tiledVol, 758 | plan.Mbar, accWidth, cacheWidth, devMemStat); 759 | } 760 | break; 761 | 762 | } 763 | 764 | cudaCheck(cudaGetLastError()); 765 | 766 | MemStat hostMemStat; 767 | copy_DtoH(devMemStat, &hostMemStat, 1, plan.stream); 768 | cudaCheck(cudaDeviceSynchronize()); 769 | deallocate_device(&devMemStat); 770 | 771 | gld_tran = hostMemStat.gld_tran; 772 | gst_tran = hostMemStat.gst_tran; 773 | gld_req = hostMemStat.gld_req; 774 | gst_req = hostMemStat.gst_req; 775 | cl_full_l2 = hostMemStat.cl_full_l2; 776 | cl_part_l2 = hostMemStat.cl_part_l2; 777 | cl_full_l1 = hostMemStat.cl_full_l1; 778 | cl_part_l1 = hostMemStat.cl_part_l1; 779 | // l1_tran = hostMemStat.l1_tran; 780 | 781 | return true; 782 | } 783 | -------------------------------------------------------------------------------- /src/cutt_bench.cpp: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | #include 26 | #include 27 | #include // strcmp 28 | #include // std::time 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include "cutt.h" 34 | #include "CudaUtils.h" 35 | #include "TensorTester.h" 36 | #include "cuttTimer.h" 37 | #include "CudaMemcpy.h" 38 | #include "int_vector.h" 39 | 40 | #define MILLION 1000000 41 | #define BILLION 1000000000 42 | 43 | // 44 | // Error checking wrapper for cutt 45 | // 46 | #define cuttCheck(stmt) do { \ 47 | cuttResult err = stmt; \ 48 | if (err != CUTT_SUCCESS) { \ 49 | fprintf(stderr, "%s in file %s, function %s\n", #stmt,__FILE__,__FUNCTION__); \ 50 | exit(1); \ 51 | } \ 52 | } while(0) 53 | 54 | char* dataIn = NULL; 55 | char* dataOut = NULL; 56 | size_t dataSize = 0; 57 | TensorTester* tester = NULL; 58 | 59 | cuttTimer* timer; 60 | bool use_cuttPlanMeasure; 61 | bool use_plantimer; 62 | 63 | std::default_random_engine generator; 64 | 65 | bool bench1(int numElem); 66 | bool bench2(int numElem); 67 | bool bench3(int numElem); 68 | bool bench4(); 69 | template bool bench5(int numElem, int ratio); 70 | bool bench6(); 71 | template bool bench7(); 72 | template bool bench_input(std::vector& dim, std::vector& permutation); 73 | template bool bench_memcpy(int numElem); 74 | 75 | bool isTrivial(std::vector& permutation); 76 | void getRandomDim(double vol, std::vector& dim); 77 | template bool bench_tensor(std::vector& dim, std::vector& permutation); 78 | void printVec(std::vector& vec); 79 | void printDeviceInfo(); 80 | 81 | int main(int argc, char *argv[]) { 82 | 83 | int gpuid = -1; 84 | unsigned seed = unsigned (std::time(0)); 85 | bool arg_ok = true; 86 | int benchID = 0; 87 | use_cuttPlanMeasure = false; 88 | use_plantimer = false; 89 | int elemsize = 8; 90 | std::vector dimIn; 91 | std::vector permutationIn; 92 | if (argc >= 2) { 93 | int i = 1; 94 | while (i < argc) { 95 | if (strcmp(argv[i], "-device") == 0) { 96 | sscanf(argv[i+1], "%d", &gpuid); 97 | i += 2; 98 | } else if (strcmp(argv[i], "-bench") == 0) { 99 | sscanf(argv[i+1], "%d", &benchID); 100 | i += 2; 101 | } else if (strcmp(argv[i], "-measure") == 0) { 102 | use_cuttPlanMeasure = true; 103 | i++; 104 | } else if (strcmp(argv[i], "-seed") == 0) { 105 | sscanf(argv[i+1], "%u", &seed); 106 | i += 2; 107 | } else if (strcmp(argv[i], "-plantimer") == 0) { 108 | use_plantimer = true; 109 | i++; 110 | } else if (strcmp(argv[i], "-elemsize") == 0) { 111 | sscanf(argv[i+1], "%u", &elemsize); 112 | i += 2; 113 | } else if (strcmp(argv[i], "-dim") == 0) { 114 | i++; 115 | while (i < argc && isdigit(*argv[i])) { 116 | int val; 117 | sscanf(argv[i++], "%d", &val); 118 | dimIn.push_back(val); 119 | } 120 | } else if (strcmp(argv[i], "-permutation") == 0) { 121 | i++; 122 | while (i < argc && isdigit(*argv[i])) { 123 | int val; 124 | sscanf(argv[i++], "%d", &val); 125 | permutationIn.push_back(val); 126 | } 127 | } else { 128 | arg_ok = false; 129 | break; 130 | } 131 | } 132 | } else if (argc > 1) { 133 | arg_ok = false; 134 | } 135 | 136 | if (elemsize != 4 && elemsize != 8) { 137 | arg_ok = false; 138 | } 139 | 140 | if (!arg_ok) { 141 | printf("cutt_bench [options]\n"); 142 | printf("Options:\n"); 143 | printf("-device [int] : GPU ID (default is 0)\n"); 144 | printf("-measure : use cuttPlanMeasure (default is cuttPlan)\n"); 145 | printf("-plantimer : planning is timed (default is no)\n"); 146 | printf("-seed [int] : seed value for random number generator (default is system timer)\n"); 147 | 
printf("-elemsize [int] : size of elements in bytes, 4 or 8. (default is 8)\n"); 148 | printf("-dim ... : space-separated list of dimensions\n"); 149 | printf("-permutation ... : space-separated list of permutations\n"); 150 | printf("-bench benchID : benchmark to run\n"); 151 | return 1; 152 | } 153 | 154 | if (gpuid >= 0) { 155 | cudaCheck(cudaSetDevice(gpuid)); 156 | } 157 | 158 | cudaCheck(cudaDeviceReset()); 159 | if (elemsize == 4) { 160 | cudaCheck(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); 161 | } else { 162 | cudaCheck(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); 163 | } 164 | 165 | printDeviceInfo(); 166 | printf("CPU using vector type %s of length %d\n", INT_VECTOR_TYPE, INT_VECTOR_LEN); 167 | 168 | timer = new cuttTimer(elemsize); 169 | 170 | dataSize = (elemsize == 4) ? 420*MILLION : 370*MILLION; 171 | 172 | // Allocate device data, 100M elements 173 | allocate_device(&dataIn, dataSize*(size_t)elemsize); 174 | allocate_device(&dataOut, dataSize*(size_t)elemsize); 175 | 176 | // Create tester 177 | tester = new TensorTester(); 178 | tester->setTensorCheckPattern((unsigned int *)dataIn, dataSize*(size_t)elemsize/sizeof(unsigned int)); 179 | 180 | std::vector worstDim; 181 | std::vector worstPermutation; 182 | 183 | std::srand(seed); 184 | generator.seed(seed); 185 | 186 | // if (!bench1(40*MILLION, bandwidths)) goto fail; 187 | // printf("bench1:\n"); 188 | // for (int i=0;i < bandwidths.size();i++) { 189 | // printf("%lf\n", bandwidths[i]); 190 | // } 191 | 192 | // if (!bench2(40*MILLION, bandwidths)) goto fail; 193 | // printf("bench2:\n"); 194 | // for (int i=0;i < bandwidths.size();i++) { 195 | // printf("%lf\n", bandwidths[i]); 196 | // } 197 | 198 | if (dimIn.size() > 0) { 199 | bool ok = (elemsize == 4) ? bench_input(dimIn, permutationIn) : bench_input(dimIn, permutationIn); 200 | if (ok) goto benchOK; 201 | goto fail; 202 | } 203 | 204 | if (benchID == 3) { 205 | if (elemsize == 4) { 206 | printf("bench 3 not implemented for elemsize = 4\n"); 207 | goto fail; 208 | } 209 | if (bench3(200*MILLION)) { 210 | printf("bench3:\n"); 211 | printf("rank best worst average median\n"); 212 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 213 | double worstBW = timer->getWorst(*it); 214 | double bestBW = timer->getBest(*it); 215 | double aveBW = timer->getAverage(*it); 216 | double medBW = timer->getMedian(*it); 217 | printf("%d %6.2lf %6.2lf %6.2lf %6.2lf\n", *it, bestBW, worstBW, aveBW, medBW); 218 | } 219 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 220 | std::vector dim; 221 | std::vector permutation; 222 | double worstBW = timer->getWorst(*it, dim, permutation); 223 | printf("rank %d BW %4.2lf\n", *it, worstBW); 224 | printf("dimensions\n"); 225 | printVec(dim); 226 | printf("permutation\n"); 227 | printVec(permutation); 228 | } 229 | goto benchOK; 230 | } else { 231 | goto fail; 232 | } 233 | } 234 | 235 | if (benchID/100 == 5) { 236 | bool ok = (elemsize == 4) ? 
bench5(200*MILLION, benchID % 100) : bench5(200*MILLION, benchID % 100); 237 | if (ok) { 238 | printf("bench5:\n"); 239 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 240 | std::vector v = timer->getData(*it); 241 | printf("RANK%d", *it); 242 | for (int i=0;i < v.size();i++) { 243 | printf(" %1.2lf", v[i]); 244 | } 245 | printf("\n"); 246 | } 247 | printf("rank best worst average median\n"); 248 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 249 | double worstBW = timer->getWorst(*it); 250 | double bestBW = timer->getBest(*it); 251 | double aveBW = timer->getAverage(*it); 252 | double medBW = timer->getMedian(*it); 253 | printf("%d %6.2lf %6.2lf %6.2lf %6.2lf\n", *it, bestBW, worstBW, aveBW, medBW); 254 | } 255 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 256 | std::vector dim; 257 | std::vector permutation; 258 | double worstBW = timer->getWorst(*it, dim, permutation); 259 | printf("rank %d BW %4.2lf\n", *it, worstBW); 260 | printf("dimensions\n"); 261 | printVec(dim); 262 | printf("permutation\n"); 263 | printVec(permutation); 264 | } 265 | goto benchOK; 266 | } else { 267 | goto fail; 268 | } 269 | } 270 | 271 | if (benchID == 6) { 272 | if (elemsize == 4) { 273 | printf("bench 6 not implemented for elemsize = 4\n"); 274 | goto fail; 275 | } 276 | if (bench6()) { 277 | printf("bench6:\n"); 278 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 279 | std::vector v = timer->getData(*it); 280 | printf("RANK%d", *it); 281 | for (int i=0;i < v.size();i++) { 282 | printf(" %1.2lf", v[i]); 283 | } 284 | printf("\n"); 285 | } 286 | printf("rank best worst average\n"); 287 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 288 | double worstBW = timer->getWorst(*it); 289 | double bestBW = timer->getBest(*it); 290 | double aveBW = timer->getAverage(*it); 291 | printf("%d %6.2lf %6.2lf %6.2lf\n", *it, bestBW, worstBW, aveBW); 292 | } 293 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 294 | std::vector dim; 295 | std::vector permutation; 296 | double worstBW = timer->getWorst(*it, dim, permutation); 297 | printf("rank %d BW %4.2lf\n", *it, worstBW); 298 | printf("dimensions\n"); 299 | printVec(dim); 300 | printf("permutation\n"); 301 | printVec(permutation); 302 | } 303 | goto benchOK; 304 | } else { 305 | goto fail; 306 | } 307 | } 308 | 309 | if (benchID == 7) { 310 | bool ok = (elemsize == 4) ? 
bench7() : bench7(); 311 | if (ok) { 312 | printf("bench7:\n"); 313 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 314 | std::vector v = timer->getData(*it); 315 | printf("RANK%d", *it); 316 | for (int i=0;i < v.size();i++) { 317 | printf(" %1.2lf", v[i]); 318 | } 319 | printf("\n"); 320 | } 321 | printf("rank best worst average median\n"); 322 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 323 | double worstBW = timer->getWorst(*it); 324 | double bestBW = timer->getBest(*it); 325 | double aveBW = timer->getAverage(*it); 326 | double medBW = timer->getMedian(*it); 327 | printf("%d %6.2lf %6.2lf %6.2lf %6.2lf\n", *it, bestBW, worstBW, aveBW, medBW); 328 | } 329 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 330 | std::vector dim; 331 | std::vector permutation; 332 | double worstBW = timer->getWorst(*it, dim, permutation); 333 | printf("rank %d BW %4.2lf\n", *it, worstBW); 334 | printf("dimensions\n"); 335 | printVec(dim); 336 | printf("permutation\n"); 337 | printVec(permutation); 338 | } 339 | goto benchOK; 340 | } else { 341 | goto fail; 342 | } 343 | } 344 | 345 | // Otherwise, do memcopy benchmark 346 | { 347 | bool ok = (elemsize == 4) ? bench_memcpy(benchID) : bench_memcpy(benchID); 348 | if (ok) goto benchOK; 349 | goto fail; 350 | } 351 | 352 | benchOK: 353 | printf("bench OK\n"); 354 | 355 | goto end; 356 | fail: 357 | printf("bench FAIL\n"); 358 | end: 359 | deallocate_device(&dataIn); 360 | deallocate_device(&dataOut); 361 | delete tester; 362 | 363 | printf("seed %u\n", seed); 364 | 365 | delete timer; 366 | 367 | cudaCheck(cudaDeviceSynchronize()); 368 | 369 | cudaCheck(cudaDeviceReset()); 370 | return 0; 371 | } 372 | 373 | // 374 | // Benchmark 1: ranks 2-8,15 in inverse permutation. 32 start and end dimension 375 | // 376 | bool bench1(int numElem) { 377 | int ranks[8] = {2, 3, 4, 5, 6, 7, 8, 15}; 378 | for (int i=0;i <= 7;i++) { 379 | std::vector dim(ranks[i]); 380 | std::vector permutation(ranks[i]); 381 | int dimave = (int)pow(numElem, 1.0/(double)ranks[i]); 382 | 383 | if (dimave < 100.0) { 384 | dim[0] = 32; 385 | dim[ranks[i] - 1] = 32; 386 | } else { 387 | dim[0] = dimave; 388 | dim[ranks[i] - 1] = dimave; 389 | } 390 | // Distribute remaining volume to the middle ranks 391 | int ranks_left = ranks[i] - 2; 392 | double numElem_left = numElem/(double)(dim[0]*dim[ranks[i] - 1]); 393 | for (int r=1;r < ranks[i] - 1;r++) { 394 | dim[r] = (int)pow(numElem_left, 1.0/(double)ranks_left); 395 | numElem_left /= (double)dim[r]; 396 | ranks_left--; 397 | } 398 | 399 | // Inverse order 400 | for (int r=0;r < ranks[i];r++) { 401 | permutation[r] = ranks[i] - 1 - r; 402 | } 403 | 404 | if (!bench_tensor(dim, permutation)) return false; 405 | } 406 | 407 | return true; 408 | } 409 | 410 | // 411 | // Benchmark 2: ranks 2-8,15 in inverse permutation. Even spread of dimensions. 
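// "Even spread" here means dim[r] is chosen as the (rank - r)-th root of the volume
// still left to distribute, so every dimension lands close to numElem^(1/rank);
// for example, with rank 3 and roughly 2.0e8 elements each dimension comes out near 585.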
412 | // 413 | bool bench2(int numElem) { 414 | int ranks[8] = {2, 3, 4, 5, 6, 7, 8, 15}; 415 | for (int i=0;i <= 7;i++) { 416 | std::vector dim(ranks[i]); 417 | std::vector permutation(ranks[i]); 418 | int dimave = (int)pow(numElem, 1.0/(double)ranks[i]); 419 | 420 | double numElem_left = numElem; 421 | for (int r=0;r < ranks[i];r++) { 422 | dim[r] = (int)pow(numElem_left, 1.0/(double)(ranks[i] - r)); 423 | numElem_left /= (double)dim[r]; 424 | } 425 | 426 | // Inverse order 427 | for (int r=0;r < ranks[i];r++) { 428 | permutation[r] = ranks[i] - 1 - r; 429 | } 430 | 431 | if (!bench_tensor(dim, permutation)) return false; 432 | } 433 | 434 | return true; 435 | } 436 | 437 | // 438 | // Benchmark 3: ranks 2-8,15 in random permutation and dimensions. 439 | // 440 | bool bench3(int numElem) { 441 | 442 | int ranks[8] = {2, 3, 4, 5, 6, 7, 8, 15}; 443 | 444 | for (int i=0;i <= 7;i++) { 445 | std::vector dim(ranks[i]); 446 | std::vector permutation(ranks[i]); 447 | for (int r=0;r < ranks[i];r++) permutation[r] = r; 448 | for (int nsample=0;nsample < 50;nsample++) { 449 | std::random_shuffle(permutation.begin(), permutation.end()); 450 | getRandomDim((double)numElem, dim); 451 | if (!bench_tensor(dim, permutation)) return false; 452 | } 453 | } 454 | 455 | return true; 456 | } 457 | 458 | // 459 | // Benchmark 4: specific examples 460 | // 461 | bool bench4() { 462 | } 463 | 464 | template 465 | bool bench_input(std::vector& dim, std::vector& permutation) { 466 | if (!bench_tensor(dim, permutation)) return false; 467 | printf("dimensions\n"); 468 | printVec(dim); 469 | printf("permutation\n"); 470 | printVec(permutation); 471 | printf("bandwidth %4.2lf GB/s\n", timer->GBs()); 472 | return true; 473 | } 474 | 475 | // 476 | // Benchmark 5: All permutations for ranks 2-4, limited permutations for ranks 5-7 477 | // 478 | template 479 | bool bench5(int numElemAvg, int ratio) { 480 | 481 | std::normal_distribution numElem_dist((double)numElemAvg, (double)numElemAvg*0.2); 482 | 483 | const int minDim = 2; 484 | const int maxDim = 16; 485 | for (int rank=2;rank <= 7;rank++) { 486 | 487 | for (int iter=0;iter < 500;iter++) { 488 | 489 | int numElem = (int)numElem_dist(generator); 490 | 491 | std::vector dim(rank); 492 | std::vector permutation(rank); 493 | std::vector dimf(rank); 494 | double volf = 1.0; 495 | for (int r=0;r < rank;r++) { 496 | permutation[r] = r; 497 | dimf[r] = 1.0 + (double)r*(ratio - 1.0)/(double)(rank - 1); 498 | volf *= dimf[r]; 499 | } 500 | // fprintf(stderr, "volf %lf\n", volf); 501 | double scale = pow((double)numElem/volf, 1.0/(double)rank); 502 | // fprintf(stderr, "scale %lf\n", scale); 503 | int vol = 1; 504 | for (int r=0;r < rank;r++) { 505 | if (r == rank - 1) { 506 | dim[r] = ratio*dim[0]; 507 | } else { 508 | dim[r] = (int)round(dimf[r]*scale); 509 | } 510 | dim[r] = std::max(2, dim[r]); 511 | vol *= dim[r]; 512 | } 513 | // fprintf(stderr, "dim[0] %lf\n", dim[0]); 514 | double cur_ratio = (double)dim[rank-1]/(double)dim[0]; 515 | double vol_re = fabs((double)(vol - numElem)/(double)numElem); 516 | // fprintf(stderr, "cur_ratio %lf vol_re %lf\n", cur_ratio, vol_re); 517 | // Fix dimensions if volume is off by more than 5% 518 | if (vol_re > 0.05) { 519 | int d = (vol < numElem) ? 
1 : -1; 520 | int r = 1; 521 | while (vol_re > 0.05 && r < rank) { 522 | int dim_plus_d = std::max(2, dim[r] + d); 523 | // fprintf(stderr, "r %d vol %lf dim[r] %d dim_plus_d %d\n", vol, dim[r], dim_plus_d); 524 | vol = (vol/dim[r])*dim_plus_d; 525 | dim[r] = dim_plus_d; 526 | vol_re = fabs((double)(vol - numElem)/(double)numElem); 527 | r++; 528 | } 529 | } 530 | int minDim = *(std::min_element(dim.begin(), dim.end())); 531 | int maxDim = *(std::max_element(dim.begin(), dim.end())); 532 | // fprintf(stderr, "minDim %lf maxDim\n", minDim, maxDim); 533 | cur_ratio = (double)maxDim/(double)minDim; 534 | printf("vol %d cur_ratio %lf | %lf\n", vol, cur_ratio, vol_re); 535 | printVec(dim); 536 | 537 | std::random_shuffle(dim.begin(), dim.end()); 538 | while (isTrivial(permutation)) { 539 | std::random_shuffle(permutation.begin(), permutation.end()); 540 | } 541 | if (!bench_tensor(dim, permutation)) return false; 542 | } 543 | } 544 | 545 | return true; 546 | } 547 | 548 | // 549 | // Benchmark 6: from "TTC: A Tensor Transposition Compiler for Multiple Architectures" 550 | // 551 | bool bench6() { 552 | 553 | std::vector< std::vector > dims = { 554 | std::vector{7248,7248}, 555 | std::vector{43408,1216}, 556 | std::vector{1216,43408}, 557 | std::vector{368,384,384}, 558 | std::vector{2144,64,384}, 559 | std::vector{368,64,2307}, 560 | std::vector{384,384,355}, 561 | std::vector{2320,384,59}, 562 | std::vector{384,2320,59}, 563 | std::vector{384,355,384}, 564 | std::vector{2320,59,384}, 565 | std::vector{384,59,2320}, 566 | std::vector{80,96,75,96}, 567 | std::vector{464,16,75,96}, 568 | std::vector{80,16,75,582}, 569 | std::vector{96,75,96,75}, 570 | std::vector{608,12,96,75}, 571 | std::vector{96,12,608,75}, 572 | std::vector{96,75,96,75}, 573 | std::vector{608,12,96,75}, 574 | std::vector{96,12,608,75}, 575 | std::vector{96,96,75,75}, 576 | std::vector{608,96,12,75}, 577 | std::vector{96,608,12,75}, 578 | std::vector{96,75,75,96}, 579 | std::vector{608,12,75,96}, 580 | std::vector{96,12,75,608}, 581 | std::vector{32,48,28,28,48}, 582 | std::vector{176,8,28,28,48}, 583 | std::vector{32,8,28,28,298}, 584 | std::vector{48,28,28,48,28}, 585 | std::vector{352,4,28,48,28}, 586 | std::vector{48,4,28,352,28}, 587 | std::vector{48,28,48,28,28}, 588 | std::vector{352,4,48,28,28}, 589 | std::vector{48,4,352,28,28}, 590 | std::vector{48,48,28,28,28}, 591 | std::vector{352,48,4,28,28}, 592 | std::vector{48,352,4,28,28}, 593 | std::vector{48,28,28,28,48}, 594 | std::vector{352,4,28,28,48}, 595 | std::vector{48,4,28,28,352}, 596 | std::vector{16,32,15,32,15,15}, 597 | std::vector{48,10,15,32,15,15}, 598 | std::vector{16,10,15,103,15,15}, 599 | std::vector{32,15,15,32,15,15}, 600 | std::vector{112,5,15,32,15,15}, 601 | std::vector{32,5,15,112,15,15}, 602 | std::vector{32,15,32,15,15,15}, 603 | std::vector{112,5,32,15,15,15}, 604 | std::vector{32,5,112,15,15,15}, 605 | std::vector{32,15,15,32,15,15}, 606 | std::vector{112,5,15,32,15,15}, 607 | std::vector{32,5,15,112,15,15}, 608 | std::vector{32,15,15,15,15,32}, 609 | std::vector{112,5,15,15,15,32}, 610 | std::vector{32,5,15,15,15,112} 611 | }; 612 | 613 | std::vector< std::vector > permutations = { 614 | std::vector{1,0}, 615 | std::vector{1,0}, 616 | std::vector{1,0}, 617 | std::vector{0,2,1}, 618 | std::vector{0,2,1}, 619 | std::vector{0,2,1}, 620 | std::vector{1,0,2}, 621 | std::vector{1,0,2}, 622 | std::vector{1,0,2}, 623 | std::vector{2,1,0}, 624 | std::vector{2,1,0}, 625 | std::vector{2,1,0}, 626 | std::vector{0,3,2,1}, 627 | std::vector{0,3,2,1}, 628 
| std::vector{0,3,2,1}, 629 | std::vector{2,1,3,0}, 630 | std::vector{2,1,3,0}, 631 | std::vector{2,1,3,0}, 632 | std::vector{2,0,3,1}, 633 | std::vector{2,0,3,1}, 634 | std::vector{2,0,3,1}, 635 | std::vector{1,0,3,2}, 636 | std::vector{1,0,3,2}, 637 | std::vector{1,0,3,2}, 638 | std::vector{3,2,1,0}, 639 | std::vector{3,2,1,0}, 640 | std::vector{3,2,1,0}, 641 | std::vector{0,4,2,1,3}, 642 | std::vector{0,4,2,1,3}, 643 | std::vector{0,4,2,1,3}, 644 | std::vector{3,2,1,4,0}, 645 | std::vector{3,2,1,4,0}, 646 | std::vector{3,2,1,4,0}, 647 | std::vector{2,0,4,1,3}, 648 | std::vector{2,0,4,1,3}, 649 | std::vector{2,0,4,1,3}, 650 | std::vector{1,3,0,4,2}, 651 | std::vector{1,3,0,4,2}, 652 | std::vector{1,3,0,4,2}, 653 | std::vector{4,3,2,1,0}, 654 | std::vector{4,3,2,1,0}, 655 | std::vector{4,3,2,1,0}, 656 | std::vector{0,3,2,5,4,1}, 657 | std::vector{0,3,2,5,4,1}, 658 | std::vector{0,3,2,5,4,1}, 659 | std::vector{3,2,0,5,1,4}, 660 | std::vector{3,2,0,5,1,4}, 661 | std::vector{3,2,0,5,1,4}, 662 | std::vector{2,0,4,1,5,3}, 663 | std::vector{2,0,4,1,5,3}, 664 | std::vector{2,0,4,1,5,3}, 665 | std::vector{3,2,5,1,0,4}, 666 | std::vector{3,2,5,1,0,4}, 667 | std::vector{3,2,5,1,0,4}, 668 | std::vector{5,4,3,2,1,0}, 669 | std::vector{5,4,3,2,1,0}, 670 | std::vector{5,4,3,2,1,0} 671 | }; 672 | 673 | for (int i=0;i < dims.size();i++) { 674 | if (!bench_tensor(dims[i], permutations[i])) return false; 675 | printf("dimensions\n"); 676 | printVec(dims[i]); 677 | printf("permutation\n"); 678 | printVec(permutations[i]); 679 | printf("bandwidth %4.2lf GiB/s\n", timer->GiBs()); 680 | } 681 | 682 | return true; 683 | } 684 | 685 | // 686 | // Benchmark 7: ranks 8 and 12 with 4 large dimensions and rest small dimensions 687 | // 688 | template 689 | bool bench7() { 690 | 691 | // 199584000 elements 692 | { 693 | std::vector dim = {5, 3, 2, 4, 35, 33, 37, 40}; 694 | std::vector permutation(8); 695 | // Inverse 696 | for (int r=0;r < dim.size();r++) permutation[r] = dim.size() - 1 - r; 697 | if (!bench_tensor(dim, permutation)) return false; 698 | // Random 699 | for (int r=0;r < dim.size();r++) permutation[r] = r; 700 | for (int nsample=0;nsample < 500;nsample++) { 701 | std::random_shuffle(dim.begin(), dim.end()); 702 | std::random_shuffle(permutation.begin(), permutation.end()); 703 | if (!isTrivial(permutation)) { 704 | if (!bench_tensor(dim, permutation)) return false; 705 | } 706 | } 707 | } 708 | 709 | // 328458240 elements 710 | { 711 | std::vector dim = {2, 3, 4, 3, 2, 2, 3, 2, 20, 18, 22, 24}; 712 | std::vector permutation(12); 713 | // Inverse 714 | for (int r=0;r < dim.size();r++) permutation[r] = dim.size() - 1 - r; 715 | if (!bench_tensor(dim, permutation)) return false; 716 | // Random 717 | for (int r=0;r < dim.size();r++) permutation[r] = r; 718 | for (int nsample=0;nsample < 500;nsample++) { 719 | std::random_shuffle(dim.begin(), dim.end()); 720 | std::random_shuffle(permutation.begin(), permutation.end()); 721 | if (!isTrivial(permutation)) { 722 | if (!bench_tensor(dim, permutation)) return false; 723 | } 724 | } 725 | } 726 | 727 | return true; 728 | } 729 | 730 | // 731 | // Returns true for trivial permutation 732 | // 733 | bool isTrivial(std::vector& permutation) { 734 | for (int i=0;i < permutation.size();i++) { 735 | if (permutation[i] != i) return false; 736 | } 737 | return true; 738 | } 739 | 740 | // 741 | // Get random dimensions for a fixed volume tensor 742 | // 743 | void getRandomDim(double vol, std::vector& dim) { 744 | double dimave = floor(pow(vol, 
1.0/(double)dim.size()));
745 | double curvol = 1.0;
746 | int iter = 0;
747 | do {
748 | curvol = 1.0;
749 | for (int r=0;r < dim.size();r++) {
750 | // p is -1 ... 1
751 | double p = (((double)rand()/(double)RAND_MAX) - 0.5)*2.0;
752 | dim[r] = round(dimave + p*(dimave - 2.0));
753 | curvol *= (double)dim[r];
754 | }
755 | 
756 | double vol_scale = pow(vol/curvol, 1.0/(double)dim.size());
757 | curvol = 1.0;
758 | for (int r=0;r < dim.size();r++) {
759 | dim[r] = std::max(2, (int)(dim[r]*vol_scale));
760 | curvol *= dim[r];
761 | }
762 | // printf("curvol %lf\n", curvol/MILLION);
763 | iter++;
764 | } while (iter < 5000 && (fabs(curvol-vol)/(double)vol > 0.3));
765 | 
766 | if (iter == 5000) {
767 | printf("getRandomDim: Unable to determine dimensions in 5000 iterations\n");
768 | exit(1);
769 | }
770 | }
771 | 
772 | template <typename T>
773 | bool bench_tensor(std::vector<int>& dim, std::vector<int>& permutation) {
774 | 
775 | int rank = dim.size();
776 | 
777 | int vol = 1;
778 | for (int r=0;r < rank;r++) {
779 | vol *= dim[r];
780 | }
781 | 
782 | size_t volmem = vol*sizeof(T);
783 | size_t datamem = dataSize*sizeof(long long int);
784 | if (volmem > datamem) {
785 | printf("test_tensor, data size exceeded\n");
786 | return false;
787 | }
788 | 
789 | std::vector<int> dimp(rank);
790 | for (int r=0;r < rank;r++) {
791 | dimp[r] = dim[permutation[r]];
792 | }
793 | 
794 | printf("number of elements %d\n", vol);
795 | printf("dimensions\n");
796 | printVec(dim);
797 | printVec(dimp);
798 | printf("permutation\n");
799 | printVec(permutation);
800 | 
801 | cuttHandle plan;
802 | std::chrono::high_resolution_clock::time_point plan_start;
803 | if (use_plantimer) {
804 | plan_start = std::chrono::high_resolution_clock::now();
805 | }
806 | if (use_cuttPlanMeasure) {
807 | cuttCheck(cuttPlanMeasure(&plan, rank, dim.data(), permutation.data(), sizeof(T), 0, dataIn, dataOut));
808 | } else {
809 | cuttCheck(cuttPlan(&plan, rank, dim.data(), permutation.data(), sizeof(T), 0));
810 | }
811 | if (use_plantimer) {
812 | std::chrono::high_resolution_clock::time_point plan_end;
813 | plan_end = std::chrono::high_resolution_clock::now();
814 | double plan_duration = std::chrono::duration_cast< std::chrono::duration<double> >(plan_end - plan_start).count();
815 | printf("plan took %lf ms\n", plan_duration*1000.0);
816 | }
817 | 
818 | for (int i=0;i < 4;i++) {
819 | set_device_array<T>((T *)dataOut, -1, vol);
820 | cudaCheck(cudaDeviceSynchronize());
821 | 
822 | timer->start(dim, permutation);
823 | cuttCheck(cuttExecute(plan, dataIn, dataOut));
824 | timer->stop();
825 | 
826 | printf("wall time %lf ms %lf GB/s\n", timer->seconds()*1000.0, timer->GBs());
827 | }
828 | 
829 | cuttCheck(cuttDestroy(plan));
830 | return tester->checkTranspose(rank, dim.data(), permutation.data(), (T *)dataOut);
831 | }
832 | 
833 | void printVec(std::vector<int>& vec) {
834 | for (int i=0;i < vec.size();i++) {
835 | printf("%d ", vec[i]);
836 | }
837 | printf("\n");
838 | }
839 | 
840 | //
841 | // Benchmarks memory copy. Returns bandwidth in GB/s
842 | //
843 | template <typename T>
844 | bool bench_memcpy(int numElem) {
845 | 
846 | std::vector<int> dim(1, numElem);
847 | std::vector<int> permutation(1, 0);
848 | 
849 | {
850 | cuttTimer timer(sizeof(T));
851 | for (int i=0;i < 4;i++) {
852 | set_device_array<T>((T *)dataOut, -1, numElem);
853 | cudaCheck(cudaDeviceSynchronize());
854 | timer.start(dim, permutation);
855 | scalarCopy<T>(numElem, (T *)dataIn, (T *)dataOut, 0);
856 | timer.stop();
857 | printf("%4.2lf GB/s\n", timer.GBs());
858 | }
859 | if (!tester->checkTranspose(1, dim.data(), permutation.data(), (T *)dataOut)) return false;
860 | printf("scalarCopy %lf GB/s\n", timer.getAverage(1));
861 | }
862 | 
863 | {
864 | cuttTimer timer(sizeof(T));
865 | for (int i=0;i < 4;i++) {
866 | set_device_array<T>((T *)dataOut, -1, numElem);
867 | cudaCheck(cudaDeviceSynchronize());
868 | timer.start(dim, permutation);
869 | vectorCopy<T>(numElem, (T *)dataIn, (T *)dataOut, 0);
870 | timer.stop();
871 | printf("%4.2lf GB/s\n", timer.GBs());
872 | }
873 | if (!tester->checkTranspose(1, dim.data(), permutation.data(), (T *)dataOut)) return false;
874 | printf("vectorCopy %lf GB/s\n", timer.getAverage(1));
875 | }
876 | 
877 | {
878 | cuttTimer timer(sizeof(T));
879 | for (int i=0;i < 4;i++) {
880 | set_device_array<T>((T *)dataOut, -1, numElem);
881 | cudaCheck(cudaDeviceSynchronize());
882 | timer.start(dim, permutation);
883 | memcpyFloat(numElem*sizeof(T)/sizeof(float), (float *)dataIn, (float *)dataOut, 0);
884 | timer.stop();
885 | printf("%4.2lf GB/s\n", timer.GBs());
886 | }
887 | if (!tester->checkTranspose(1, dim.data(), permutation.data(), (T *)dataOut)) return false;
888 | printf("memcpyFloat %lf GB/s\n", timer.getAverage(1));
889 | }
890 | 
891 | return true;
892 | }
893 | 
894 | void printDeviceInfo() {
895 | int deviceID;
896 | cudaCheck(cudaGetDevice(&deviceID));
897 | cudaDeviceProp prop;
898 | cudaCheck(cudaGetDeviceProperties(&prop, deviceID));
899 | cudaSharedMemConfig pConfig;
900 | cudaCheck(cudaDeviceGetSharedMemConfig(&pConfig));
901 | int shMemBankSize = 4;
902 | if (pConfig == cudaSharedMemBankSizeEightByte) shMemBankSize = 8;
903 | double mem_BW = (double)(prop.memoryClockRate*2*(prop.memoryBusWidth/8))/1.0e6;
904 | printf("Using %s SM version %d.%d\n", prop.name, prop.major, prop.minor);
905 | printf("Clock %1.3lfGhz numSM %d ECC %d mem BW %1.2lfGB/s shMemBankSize %dB\n", (double)prop.clockRate/1e6,
906 | prop.multiProcessorCount, prop.ECCEnabled, mem_BW, shMemBankSize);
907 | printf("L2 %1.2lfMB\n", (double)prop.l2CacheSize/(double)(1024*1024));
908 | 
909 | }
910 | 
--------------------------------------------------------------------------------
/src/cuttkernel.cu:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | MIT License
3 | 
4 | Copyright (c) 2016 Antti-Pekka Hynninen
5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle)
6 | 
7 | Permission is hereby granted, free of charge, to any person obtaining a copy
8 | of this software and associated documentation files (the "Software"), to deal
9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 | 
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial
portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #include 26 | #include "CudaUtils.h" 27 | #include "LRUCache.h" 28 | #include "cuttkernel.h" 29 | 30 | #define RESTRICT __restrict__ 31 | 32 | // 33 | // Transpose when Mm and Mk don't overlap and contain only single rank 34 | // 35 | // dim3 numthread(TILEDIM, TILEROWS, 1); 36 | // dim3 numblock( ((plan.volMm-1)/TILEDIM+1)*((plan.volMk-1)/TILEDIM+1), 1, plan.volMbar); 37 | // 38 | template 39 | __global__ void transposeTiled( 40 | const int numMm, const int volMbar, const int sizeMbar, 41 | const int2 tiledVol, const int cuDimMk, const int cuDimMm, 42 | const TensorConvInOut* RESTRICT glMbar, 43 | const T* RESTRICT dataIn, T* RESTRICT dataOut) { 44 | 45 | // Shared memory 46 | __shared__ T shTile[TILEDIM][TILEDIM+1]; 47 | 48 | const int warpLane = threadIdx.x & (warpSize - 1); 49 | TensorConvInOut Mbar; 50 | Mbar.c_in = 1; 51 | Mbar.d_in = 1; 52 | Mbar.c_out = 1; 53 | Mbar.d_out = 1; 54 | if (warpLane < sizeMbar) { 55 | Mbar = glMbar[warpLane]; 56 | } 57 | 58 | const int bx = (blockIdx.x % numMm)*TILEDIM; 59 | const int by = (blockIdx.x / numMm)*TILEDIM; 60 | 61 | const int xin = bx + threadIdx.x; 62 | const int yin = by + threadIdx.y; 63 | 64 | const int xout = bx + threadIdx.y; 65 | const int yout = by + threadIdx.x; 66 | 67 | const unsigned int maskIny = __ballot_sync(FULL_MASK, (yin + warpLane < tiledVol.y))*(xin < tiledVol.x); 68 | const unsigned int maskOutx = __ballot_sync(FULL_MASK, (xout + warpLane < tiledVol.x))*(yout < tiledVol.y); 69 | 70 | const int posMinorIn = xin + yin*cuDimMk; 71 | const int posMinorOut = yout + xout*cuDimMm; 72 | const int posInAdd = TILEROWS*cuDimMk; 73 | const int posOutAdd = TILEROWS*cuDimMm; 74 | 75 | for (int posMbar=blockIdx.z;posMbar < volMbar;posMbar += gridDim.z) 76 | { 77 | 78 | // Compute global memory positions 79 | int posMajorIn = ((posMbar/Mbar.c_in) % Mbar.d_in)*Mbar.ct_in; 80 | int posMajorOut = ((posMbar/Mbar.c_out) % Mbar.d_out)*Mbar.ct_out; 81 | #pragma unroll 82 | for (int i=16;i >= 1;i/=2) { 83 | posMajorIn += __shfl_xor_sync(FULL_MASK, posMajorIn, i); 84 | posMajorOut += __shfl_xor_sync(FULL_MASK, posMajorOut, i); 85 | } 86 | int posIn = posMajorIn + posMinorIn; 87 | int posOut = posMajorOut + posMinorOut; 88 | 89 | // Read from global memory 90 | __syncthreads(); 91 | 92 | // Read data into shared memory tile 93 | #pragma unroll 94 | for (int j=0;j < TILEDIM;j += TILEROWS) { 95 | // int pos = posIn + j*cuDimMk; 96 | // if (xin < readVol.x && yin + j < readVol.y) { 97 | if ((maskIny & (1 << j)) != 0) { 98 | shTile[threadIdx.y + j][threadIdx.x] = dataIn[posIn]; 99 | } 100 | posIn += posInAdd; 101 | } 102 | 103 | // Write to global memory 104 | __syncthreads(); 105 | 106 | #pragma unroll 107 | for (int j=0;j < TILEDIM;j += TILEROWS) { 108 | // int pos = posOut + j*cuDimMm; 109 | // if (xout + j < readVol.x && yout < readVol.y) { 110 | if ((maskOutx & (1 << j)) != 0 ) { 111 | dataOut[posOut] = shTile[threadIdx.x][threadIdx.y + 
j]; 112 | } 113 | posOut += posOutAdd; 114 | } 115 | 116 | } 117 | 118 | } 119 | 120 | // 121 | // Packed transpose. Thread block loads plan.volMmk number of elements 122 | // 123 | template 124 | __global__ void transposePacked( 125 | const int volMmk, const int volMbar, 126 | const int sizeMmk, const int sizeMbar, 127 | const TensorConvInOut* RESTRICT gl_Mmk, 128 | const TensorConvInOut* RESTRICT gl_Mbar, 129 | const TensorConv* RESTRICT gl_Msh, 130 | const T* RESTRICT dataIn, T* RESTRICT dataOut) { 131 | 132 | // Shared memory. volMmk elements 133 | extern __shared__ char shBuffer_char[]; 134 | T* shBuffer = (T *)shBuffer_char; 135 | 136 | const int warpLane = threadIdx.x & (warpSize - 1); 137 | 138 | TensorConvInOut Mmk; 139 | Mmk.c_in = 1; 140 | Mmk.d_in = 1; 141 | Mmk.c_out = 1; 142 | Mmk.d_out = 1; 143 | if (warpLane < sizeMmk) { 144 | Mmk = gl_Mmk[warpLane]; 145 | } 146 | TensorConv Msh; 147 | Msh.c = 1; 148 | Msh.d = 1; 149 | if (warpLane < sizeMmk) { 150 | Msh = gl_Msh[warpLane]; 151 | } 152 | 153 | // Pre-compute tensor positions in Mmk 154 | // 3*numRegStorage registers 155 | int posMmkIn[numRegStorage]; 156 | int posMmkOut[numRegStorage]; 157 | int posSh[numRegStorage]; 158 | #pragma unroll 159 | for (int j=0;j < numRegStorage;j++) { 160 | posMmkIn[j] = 0; 161 | posMmkOut[j] = 0; 162 | posSh[j] = 0; 163 | } 164 | for (int i=0;i < sizeMmk;i++) { 165 | #pragma unroll 166 | for (int j=0;j < numRegStorage;j++) { 167 | int posMmk = threadIdx.x + j*blockDim.x; 168 | posMmkIn[j] += ((posMmk / __shfl_sync(FULL_MASK, Mmk.c_in,i)) % __shfl_sync(FULL_MASK, Mmk.d_in,i))*__shfl_sync(FULL_MASK, Mmk.ct_in,i); 169 | posMmkOut[j] += ((posMmk / __shfl_sync(FULL_MASK, Mmk.c_out,i)) % __shfl_sync(FULL_MASK, Mmk.d_out,i))*__shfl_sync(FULL_MASK, Mmk.ct_out,i); 170 | posSh[j] += ((posMmk / __shfl_sync(FULL_MASK, Msh.c,i)) % __shfl_sync(FULL_MASK, Msh.d,i))*__shfl_sync(FULL_MASK, Msh.ct,i); 171 | } 172 | } 173 | 174 | // 6 registers 175 | TensorConvInOut Mbar; 176 | Mbar.c_in = 1; 177 | Mbar.d_in = 1; 178 | Mbar.c_out = 1; 179 | Mbar.d_out = 1; 180 | if (warpLane < sizeMbar) { 181 | Mbar = gl_Mbar[warpLane]; 182 | } 183 | 184 | for (int posMbar=blockIdx.x;posMbar < volMbar;posMbar += gridDim.x) 185 | { 186 | 187 | int posMbarOut = ((posMbar/Mbar.c_out) % Mbar.d_out)*Mbar.ct_out; 188 | #pragma unroll 189 | for (int i=16;i >= 1;i/=2) { 190 | posMbarOut += __shfl_xor_sync(FULL_MASK, posMbarOut, i); 191 | } 192 | 193 | int posMbarIn = ((posMbar/Mbar.c_in) % Mbar.d_in)*Mbar.ct_in; 194 | #pragma unroll 195 | for (int i=16;i >= 1;i/=2) { 196 | posMbarIn += __shfl_xor_sync(FULL_MASK, posMbarIn, i); 197 | } 198 | 199 | __syncthreads(); 200 | 201 | // Read from global memory 202 | #pragma unroll 203 | for (int j=0;j < numRegStorage;j++) { 204 | int posMmk = threadIdx.x + j*blockDim.x; 205 | int posIn = posMbarIn + posMmkIn[j]; 206 | if (posMmk < volMmk) shBuffer[posMmk] = dataIn[posIn]; 207 | } 208 | 209 | __syncthreads(); 210 | 211 | // Write to global memory 212 | #pragma unroll 213 | for (int j=0;j < numRegStorage;j++) { 214 | int posMmk = threadIdx.x + j*blockDim.x; 215 | int posOut = posMbarOut + posMmkOut[j]; 216 | if (posMmk < volMmk) dataOut[posOut] = shBuffer[posSh[j]]; 217 | } 218 | 219 | 220 | } 221 | 222 | } 223 | 224 | // 225 | // Packed method with a split rank 226 | // 227 | // dim nthread(((volMmkWithSplit - 1)/(prop.warpSize*lc.numRegStorage) + 1)*prop.warpSize, 1, 1) 228 | // dim nblock(ts.numSplit, min(256, max(1, ts.volMbar)), 1) 229 | // 230 | template 231 | __global__ void 
transposePackedSplit( 232 | const int splitDim, const int volMmkUnsplit, const int volMbar, 233 | const int sizeMmk, const int sizeMbar, 234 | const int cMmSplit, const int cMkSplit, 235 | const TensorConvInOut* RESTRICT glMmk, 236 | const TensorConvInOut* RESTRICT glMbar, 237 | const TensorConv* RESTRICT glMsh, 238 | const T* RESTRICT dataIn, T* RESTRICT dataOut) { 239 | 240 | // Shared memory. max(volSplit)*volMmkUnsplit T elements 241 | extern __shared__ char shBuffer_char[]; 242 | T* shBuffer = (T *)shBuffer_char; 243 | 244 | const int warpLane = threadIdx.x & (warpSize - 1); 245 | 246 | // const int plusone = (blockIdx.x < (splitDim % gridDim.x)); 247 | const int p0 = blockIdx.x*splitDim/gridDim.x; 248 | const int volSplit = (blockIdx.x + 1)*splitDim/gridDim.x - p0; 249 | const int plusone = volSplit - splitDim/gridDim.x; 250 | 251 | TensorConvInOut Mmk; 252 | Mmk.c_in = 1; 253 | Mmk.d_in = 1; 254 | Mmk.c_out = 1; 255 | Mmk.d_out = 1; 256 | if (warpLane < sizeMmk) { 257 | Mmk = glMmk[warpLane + plusone*sizeMmk]; 258 | } 259 | TensorConv Msh; 260 | Msh.c = 1; 261 | Msh.d = 1; 262 | if (warpLane < sizeMmk) { 263 | Msh = glMsh[warpLane + plusone*sizeMmk]; 264 | } 265 | 266 | // gridDim.x = number of splits 267 | // blockIdx.x = {0 ... gridDim.x - 1} is the split-index 268 | // Volume of this split 269 | // const int volSplit = (splitDim/gridDim.x) + plusone; 270 | // Start position in this split 271 | // const int p0 = (splitDim/gridDim.x)*blockIdx.x + min(blockIdx.x, (splitDim % gridDim.x)); 272 | const int posMmkIn0 = p0*cMmSplit; 273 | const int posMmkOut0 = p0*cMkSplit; 274 | // Volume of split Mmk 275 | const int volMmkSplit = volSplit*volMmkUnsplit; 276 | 277 | // Pre-compute tensor positions in Mmk 278 | // 3*numRegStorage registers 279 | int posMmkIn[numRegStorage]; 280 | int posMmkOut[numRegStorage]; 281 | int posSh[numRegStorage]; 282 | #pragma unroll 283 | for (int j=0;j < numRegStorage;j++) { 284 | posMmkIn[j] = posMmkIn0; 285 | posMmkOut[j] = posMmkOut0; 286 | posSh[j] = 0; 287 | } 288 | for (int i=0;i < sizeMmk;i++) { 289 | #pragma unroll 290 | for (int j=0;j < numRegStorage;j++) { 291 | int t = threadIdx.x + j*blockDim.x; 292 | posMmkIn[j] += ((t/__shfl_sync(FULL_MASK, Mmk.c_in,i)) % __shfl_sync(FULL_MASK, Mmk.d_in,i))*__shfl_sync(FULL_MASK, Mmk.ct_in,i); 293 | posMmkOut[j] += ((t/__shfl_sync(FULL_MASK, Mmk.c_out,i)) % __shfl_sync(FULL_MASK, Mmk.d_out,i))*__shfl_sync(FULL_MASK, Mmk.ct_out,i); 294 | posSh[j] += ((t/__shfl_sync(FULL_MASK, Msh.c,i)) % __shfl_sync(FULL_MASK, Msh.d,i))*__shfl_sync(FULL_MASK, Msh.ct,i); 295 | } 296 | } 297 | 298 | TensorConvInOut Mbar; 299 | Mbar.c_in = 1; 300 | Mbar.d_in = 1; 301 | Mbar.c_out = 1; 302 | Mbar.d_out = 1; 303 | if (warpLane < sizeMbar) { 304 | Mbar = glMbar[warpLane]; 305 | } 306 | 307 | const int posMbar0 = blockIdx.y*volMbar/gridDim.y; 308 | const int posMbar1 = (blockIdx.y + 1)*volMbar/gridDim.y; 309 | for (int posMbar=posMbar0;posMbar < posMbar1;posMbar++) 310 | // for (int posMbar=blockIdx.y;posMbar < volMbar;posMbar+=gridDim.y) 311 | { 312 | 313 | int posMbarOut = ((posMbar/Mbar.c_out) % Mbar.d_out)*Mbar.ct_out; 314 | #pragma unroll 315 | for (int i=16;i >= 1;i/=2) { 316 | posMbarOut += __shfl_xor_sync(FULL_MASK, posMbarOut, i); 317 | } 318 | 319 | int posMbarIn = ((posMbar/Mbar.c_in) % Mbar.d_in)*Mbar.ct_in; 320 | #pragma unroll 321 | for (int i=16;i >= 1;i/=2) { 322 | posMbarIn += __shfl_xor_sync(FULL_MASK, posMbarIn, i); 323 | } 324 | 325 | // Read from global memory 326 | __syncthreads(); 327 | 328 | #pragma unroll 329 | 
for (int j=0;j < numRegStorage;j++) { 330 | int posMmk = threadIdx.x + j*blockDim.x; 331 | int posIn = posMbarIn + posMmkIn[j]; 332 | if (posMmk < volMmkSplit) shBuffer[posMmk] = dataIn[posIn]; 333 | } 334 | 335 | // Write to global memory 336 | __syncthreads(); 337 | 338 | #pragma unroll 339 | for (int j=0;j < numRegStorage;j++) { 340 | int posMmk = threadIdx.x + j*blockDim.x; 341 | int posOut = posMbarOut + posMmkOut[j]; 342 | if (posMmk < volMmkSplit) dataOut[posOut] = shBuffer[posSh[j]]; 343 | } 344 | 345 | } 346 | 347 | } 348 | 349 | #if 1 350 | // 351 | // Transpose when the lead dimension is the same, e.g. (1, 2, 3) -> (1, 3, 2) 352 | // 353 | // dim3 numthread(TILEDIM, TILEROWS, 1); 354 | // dim3 numblock( ((plan.volMm-1)/TILEDIM+1)*((plan.volMkBar-1)/TILEDIM+1), 1, plan.volMbar); 355 | // 356 | template 357 | __global__ void transposeTiledCopy( 358 | const int numMm, const int volMbar, const int sizeMbar, 359 | const int cuDimMk, const int cuDimMm, 360 | const int2 tiledVol, 361 | const TensorConvInOut* RESTRICT gl_Mbar, 362 | const T* RESTRICT dataIn, T* RESTRICT dataOut) { 363 | 364 | const int warpLane = threadIdx.x & (warpSize - 1); 365 | TensorConvInOut Mbar; 366 | Mbar.c_in = 1; 367 | Mbar.d_in = 1; 368 | Mbar.c_out = 1; 369 | Mbar.d_out = 1; 370 | if (warpLane < sizeMbar) { 371 | Mbar = gl_Mbar[warpLane]; 372 | } 373 | 374 | const int bx = (blockIdx.x % numMm)*TILEDIM; 375 | const int by = (blockIdx.x / numMm)*TILEDIM; 376 | 377 | const int x = bx + threadIdx.x; 378 | const int y = by + threadIdx.y; 379 | 380 | const unsigned int mask = __ballot_sync(FULL_MASK, (y + warpLane < tiledVol.y))*(x < tiledVol.x); 381 | 382 | const int posMinorIn = x + y*cuDimMk; 383 | const int posMinorOut = x + y*cuDimMm; 384 | const int posInAdd = TILEROWS*cuDimMk; 385 | const int posOutAdd = TILEROWS*cuDimMm; 386 | 387 | for (int posMbar=blockIdx.z;posMbar < volMbar;posMbar += gridDim.z) 388 | { 389 | 390 | // Compute global memory positions 391 | int posMajorIn = ((posMbar/Mbar.c_in) % Mbar.d_in)*Mbar.ct_in; 392 | int posMajorOut = ((posMbar/Mbar.c_out) % Mbar.d_out)*Mbar.ct_out; 393 | #pragma unroll 394 | for (int i=16;i >= 1;i/=2) { 395 | posMajorIn += __shfl_xor_sync(FULL_MASK, posMajorIn, i); 396 | posMajorOut += __shfl_xor_sync(FULL_MASK, posMajorOut, i); 397 | } 398 | int posIn = posMajorIn + posMinorIn; 399 | int posOut = posMajorOut + posMinorOut; 400 | 401 | // Variables where values are stored 402 | T val[TILEDIM/TILEROWS]; 403 | 404 | // Read global memory 405 | #pragma unroll 406 | for (int j=0;j < TILEDIM;j += TILEROWS) { 407 | // if ((x < tiledVol.x) && (y + j < tiledVol.y)) { 408 | if ((mask & (1 << j)) != 0) { 409 | val[j/TILEROWS] = dataIn[posIn]; 410 | } 411 | posIn += posInAdd; 412 | } 413 | 414 | // Write global memory 415 | #pragma unroll 416 | for (int j=0;j < TILEDIM;j += TILEROWS) { 417 | // if ((x < tiledVol.x) && (y + j < tiledVol.y)) { 418 | if ((mask & (1 << j)) != 0) { 419 | dataOut[posOut] = val[j/TILEROWS]; 420 | } 421 | posOut += posOutAdd; 422 | } 423 | 424 | } 425 | 426 | } 427 | #else 428 | 429 | // 430 | // Returns scalar tensor position. Each lane has the same p 431 | // NOTE: c and d on inactive warps must be 1 !! 
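// Each of the first 'rank' lanes holds the (c, d, ct) entries of one tensor rank; every lane
// computes its own term ((p/c) % d)*ct and the __shfl_xor_sync butterfly below sums the
// per-lane terms, so all lanes of the warp end up with the same total position. Lanes beyond
// 'rank' must carry c = d = 1 so that their term evaluates to zero (hence the NOTE above).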
432 | // 433 | __device__ __forceinline__ 434 | int tensorPos( 435 | const int p, const int rank, const int c, const int d, const int ct, 436 | const int numLane=warpSize 437 | ) { 438 | 439 | int r = ((p/c) % d)*ct; 440 | #pragma unroll 441 | for (int i=numLane/2;i >= 1;i/=2) { 442 | r += __shfl_xor_sync(FULL_MASK, r, i); 443 | } 444 | return r; 445 | 446 | } 447 | 448 | // 449 | // Transpose when the lead dimension is the same, e.g. (1, 2, 3) -> (1, 3, 2) 450 | // 451 | // dim3 numthread(TILEDIM, TILEROWS, 1); 452 | // dim3 numblock( ((plan.volMm-1)/TILEDIM+1)*((plan.volMkBar-1)/TILEDIM+1), 1, plan.volMbar); 453 | // 454 | template 455 | __global__ void transposeTiledCopy( 456 | const int numMm, const int volMbar, const int sizeMbar, 457 | const int cuDimMk, const int cuDimMm, 458 | const int2 tiledVol, 459 | const TensorConvInOut* RESTRICT gl_Mbar, 460 | const T* RESTRICT dataIn, T* RESTRICT dataOut) { 461 | 462 | const int warpLane = threadIdx.x & (warpSize - 1); 463 | TensorConvInOut Mbar; 464 | Mbar.c_in = 1; 465 | Mbar.d_in = 1; 466 | Mbar.c_out = 1; 467 | Mbar.d_out = 1; 468 | if (warpLane < sizeMbar) { 469 | Mbar = gl_Mbar[warpLane]; 470 | } 471 | 472 | const int bx = (blockIdx.x % numMm)*TILEDIM; 473 | const int by = (blockIdx.x / numMm)*TILEDIM; 474 | 475 | const int x = bx + threadIdx.x; 476 | const int y = by + threadIdx.y; 477 | 478 | for (int posMbar=blockIdx.z;posMbar < volMbar;posMbar += gridDim.z) 479 | { 480 | 481 | // Variables where values are stored 482 | T val[TILEDIM/TILEROWS]; 483 | 484 | // Read global memory 485 | { 486 | int pos0 = tensorPos(posMbar, sizeMbar, Mbar.c_in, Mbar.d_in, Mbar.ct_in); 487 | pos0 += x + y*cuDimMk; 488 | 489 | #pragma unroll 490 | for (int j=0;j < TILEDIM;j += TILEROWS) { 491 | int pos = pos0 + j*cuDimMk; 492 | if ((x < tiledVol.x) && (y + j < tiledVol.y)) { 493 | val[j/TILEROWS] = dataIn[pos]; 494 | } 495 | } 496 | } 497 | 498 | // Write global memory 499 | { 500 | int pos0 = tensorPos(posMbar, sizeMbar, Mbar.c_out, Mbar.d_out, Mbar.ct_out); 501 | pos0 += x + y*cuDimMm; 502 | 503 | #pragma unroll 504 | for (int j=0;j < TILEDIM;j += TILEROWS) { 505 | int pos = pos0 + j*cuDimMm; 506 | if ((x < tiledVol.x) && (y + j < tiledVol.y)) { 507 | dataOut[pos] = val[j/TILEROWS]; 508 | } 509 | } 510 | } 511 | 512 | } 513 | 514 | } 515 | #endif 516 | 517 | //###################################################################################### 518 | //###################################################################################### 519 | //###################################################################################### 520 | 521 | // 522 | // Sets shared memory bank configuration for all kernels. Needs to be called once per device. 
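// (Each kernel instantiation below is registered with the shared-memory bank width that matches
// its element size: 4-byte banks for the float kernels and 8-byte banks for the double kernels,
// so consecutive tile elements fall into distinct banks.)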
523 | //
524 | void cuttKernelSetSharedMemConfig() {
525 | #define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePacked<float, NREG>, cudaSharedMemBankSizeFourByte ))
526 | #include "calls.h"
527 | #undef CALL
528 | 
529 | #define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePacked<double, NREG>, cudaSharedMemBankSizeEightByte ))
530 | #include "calls.h"
531 | #undef CALL
532 | 
533 | #define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePackedSplit<float, NREG>, cudaSharedMemBankSizeFourByte ))
534 | #include "calls.h"
535 | #undef CALL
536 | 
537 | #define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePackedSplit<double, NREG>, cudaSharedMemBankSizeEightByte ))
538 | #include "calls.h"
539 | #undef CALL
540 | 
541 | cudaCheck(cudaFuncSetSharedMemConfig(transposeTiled<float>, cudaSharedMemBankSizeFourByte));
542 | cudaCheck(cudaFuncSetSharedMemConfig(transposeTiledCopy<float>, cudaSharedMemBankSizeFourByte));
543 | 
544 | cudaCheck(cudaFuncSetSharedMemConfig(transposeTiled<double>, cudaSharedMemBankSizeEightByte));
545 | cudaCheck(cudaFuncSetSharedMemConfig(transposeTiledCopy<double>, cudaSharedMemBankSizeEightByte));
546 | 
547 | }
548 | 
549 | // Caches for PackedSplit kernels. One cache for all devices
550 | // NOTE: Not thread safe
551 | const int CACHE_SIZE = 100000;
552 | const int MAX_NUMWARP = (1024/32);
553 | const int MAX_NUMTYPE = 2;
554 | static int numDevices = -1;
555 | LRUCache<unsigned long long int, int> nabCache(CACHE_SIZE, -1);
556 | 
557 | //
558 | // Returns the maximum number of active blocks per SM
559 | //
560 | int getNumActiveBlock(const int method, const int sizeofType, const LaunchConfig& lc,
561 | const int deviceID, const cudaDeviceProp& prop) {
562 | 
563 | int numActiveBlock;
564 | int numthread = lc.numthread.x * lc.numthread.y * lc.numthread.z;
565 | switch(method) {
566 | case Trivial:
567 | {
568 | // This value does not matter, but should be > 0
569 | numActiveBlock = 1;
570 | }
571 | break;
572 | 
573 | case Packed:
574 | {
575 | #define CALL0(TYPE, NREG) \
576 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock, \
577 | transposePacked<TYPE, NREG>, numthread, lc.shmemsize)
578 | switch(lc.numRegStorage) {
579 | #define CALL(ICASE) case ICASE: if (sizeofType == 4) CALL0(float, ICASE); if (sizeofType == 8) CALL0(double, ICASE); break
580 | #include "calls.h"
581 | }
582 | #undef CALL
583 | #undef CALL0
584 | }
585 | break;
586 | 
587 | case PackedSplit:
588 | {
589 | // Allocate cache structure if needed
590 | if (numDevices == -1) {
591 | cudaCheck(cudaGetDeviceCount(&numDevices));
592 | }
593 | // Build unique key for cache
594 | int key_warp = (numthread/prop.warpSize - 1);
595 | if (key_warp >= MAX_NUMWARP) {
596 | printf("getNumActiveBlock maximum number of warps exceeded\n");
597 | exit(1);
598 | }
599 | int key_reg = (lc.numRegStorage - 1);
600 | int key_type = (sizeofType == 4);
601 | unsigned long long int key =
602 | (unsigned long long int)(lc.shmemsize/sizeofType)*MAX_NUMWARP*MAX_REG_STORAGE*MAX_NUMTYPE*numDevices +
603 | (unsigned long long int)deviceID*MAX_NUMWARP*MAX_REG_STORAGE*MAX_NUMTYPE +
604 | (unsigned long long int)key_type*MAX_NUMWARP*MAX_REG_STORAGE +
605 | (unsigned long long int)key_reg*MAX_NUMWARP +
606 | (unsigned long long int)key_warp;
607 | 
608 | numActiveBlock = nabCache.get(key);
609 | if (numActiveBlock == -1) {
610 | // key not found in cache, determine value and add it to cache
611 | #define CALL0(TYPE, NREG) \
612 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock, \
613 | transposePackedSplit<TYPE, NREG>, numthread, lc.shmemsize)
614 | switch(lc.numRegStorage) {
615 | #define CALL(ICASE) case ICASE: if (sizeofType == 4) CALL0(float, ICASE); if (sizeofType == 8) CALL0(double, ICASE); break
616 | #include "calls.h"
617 | }
618 | #undef CALL
619 | #undef CALL0
620 | nabCache.set(key, numActiveBlock);
621 | }
622 | }
623 | break;
624 | 
625 | case Tiled:
626 | {
627 | if (sizeofType == 4) {
628 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock,
629 | transposeTiled<float>, numthread, lc.shmemsize);
630 | } else {
631 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock,
632 | transposeTiled<double>, numthread, lc.shmemsize);
633 | }
634 | }
635 | break;
636 | 
637 | case TiledCopy:
638 | {
639 | if (sizeofType == 4) {
640 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock,
641 | transposeTiledCopy<float>, numthread, lc.shmemsize);
642 | } else {
643 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock,
644 | transposeTiledCopy<double>, numthread, lc.shmemsize);
645 | }
646 | }
647 | break;
648 | }
649 | 
650 | return numActiveBlock;
651 | }
652 | 
653 | //
654 | // Sets up kernel launch configuration
655 | //
656 | // Returns the number of active blocks per SM that can be achieved on the Packed kernel
657 | // NOTE: Returns 0 when kernel execution is not possible
658 | //
659 | // Sets:
660 | // lc.numthread
661 | // lc.numblock
662 | // lc.shmemsize
663 | // lc.numRegStorage (for Packed method)
664 | //
665 | int cuttKernelLaunchConfiguration(const int sizeofType, const TensorSplit& ts,
666 | const int deviceID, const cudaDeviceProp& prop, LaunchConfig& lc) {
667 | 
668 | // Return value of numActiveBlock
669 | int numActiveBlockReturn = -1;
670 | 
671 | switch(ts.method) {
672 | case Trivial:
673 | {
674 | // These values don't matter
675 | lc.numthread.x = 1;
676 | lc.numthread.y = 1;
677 | lc.numthread.z = 1;
678 | lc.numblock.x = 1;
679 | lc.numblock.y = 1;
680 | lc.numblock.z = 1;
681 | lc.numblock.z = 1;
682 | lc.numblock.z = 1;
683 | lc.shmemsize = 0;
684 | lc.numRegStorage = 0;
685 | }
686 | break;
687 | 
688 | case Packed:
689 | {
690 | // Amount of shared memory required
691 | lc.shmemsize = ts.shmemAlloc(sizeofType); //ts.volMmk*sizeofType;
692 | 
693 | // Check that we're not using too much shared memory per block
694 | if (lc.shmemsize > prop.sharedMemPerBlock) {
695 | // printf("lc.shmemsize %d prop.sharedMemPerBlock %d\n", lc.shmemsize, prop.sharedMemPerBlock);
696 | return 0;
697 | }
698 | 
699 | // Min and max number of threads we can use
700 | int minNumthread = ((ts.volMmk - 1)/(prop.warpSize*MAX_REG_STORAGE) + 1)*prop.warpSize;
701 | int maxNumthread = ((ts.volMmk - 1)/(prop.warpSize) + 1)*prop.warpSize;
702 | if (minNumthread > prop.maxThreadsPerBlock) return 0;
703 | maxNumthread = min(prop.maxThreadsPerBlock, maxNumthread);
704 | // printf("minNumthread %d maxNumthread %d\n", minNumthread, maxNumthread);
705 | 
706 | // Min and max number of register storage we can use
707 | int minNumRegStorage = (ts.volMmk - 1)/maxNumthread + 1;
708 | int maxNumRegStorage = (ts.volMmk - 1)/minNumthread + 1;
709 | // printf("minNumRegStorage %d maxNumRegStorage %d\n", minNumRegStorage, maxNumRegStorage);
710 | 
711 | int bestVal = 0;
712 | int bestNumRegStorage = 0;
713 | int bestNumActiveBlock = 0;
714 | 
715 | lc.numthread.y = 1;
716 | lc.numthread.z = 1;
717 | lc.numblock.x = max(1, ts.volMbar);
718 | lc.numblock.x = min(prop.multiProcessorCount*18, lc.numblock.x);
719 | lc.numblock.y = 1;
720 | lc.numblock.z = 1;
721 | 
722 | for (lc.numRegStorage=minNumRegStorage;lc.numRegStorage <= maxNumRegStorage;lc.numRegStorage++) {
723 | lc.numthread.x = ((ts.volMmk -
1)/(prop.warpSize*lc.numRegStorage) + 1)*prop.warpSize; 724 | 725 | int numActiveBlock = getNumActiveBlock(ts.method, sizeofType, lc, deviceID, prop); 726 | // int val = numActiveBlock*lc.numthread.x; 727 | int val = ts.volMmkUsed()*numActiveBlock; 728 | if (val > bestVal) { 729 | bestVal = val; 730 | bestNumRegStorage = lc.numRegStorage; 731 | bestNumActiveBlock = numActiveBlock; 732 | } 733 | } 734 | 735 | if (bestNumRegStorage == 0) return 0; 736 | 737 | lc.numRegStorage = bestNumRegStorage; 738 | lc.numthread.x = ((ts.volMmk - 1)/(prop.warpSize*lc.numRegStorage) + 1)*prop.warpSize; 739 | numActiveBlockReturn = bestNumActiveBlock; 740 | } 741 | break; 742 | 743 | case PackedSplit: 744 | { 745 | // Amount of shared memory required 746 | lc.shmemsize = ts.shmemAlloc(sizeofType); 747 | 748 | // Check that we're not using too much shared memory per block 749 | if (lc.shmemsize > prop.sharedMemPerBlock) { 750 | // printf("lc.shmemsize %d prop.sharedMemPerBlock %d\n", lc.shmemsize, prop.sharedMemPerBlock); 751 | return 0; 752 | } 753 | 754 | int volMmkWithSplit = (ts.splitDim/ts.numSplit + ((ts.splitDim % ts.numSplit) > 0))*ts.volMmkUnsplit; 755 | 756 | // Min and max number of threads we can use 757 | int minNumthread = ((volMmkWithSplit - 1)/(prop.warpSize*MAX_REG_STORAGE) + 1)*prop.warpSize; 758 | int maxNumthread = ((volMmkWithSplit - 1)/(prop.warpSize) + 1)*prop.warpSize; 759 | if (minNumthread > prop.maxThreadsPerBlock) return 0; 760 | maxNumthread = min(prop.maxThreadsPerBlock, maxNumthread); 761 | // printf("minNumthread %d maxNumthread %d\n", minNumthread, maxNumthread); 762 | 763 | // Min and max number of register storage we can use 764 | int minNumRegStorage = (volMmkWithSplit - 1)/maxNumthread + 1; 765 | int maxNumRegStorage = (volMmkWithSplit - 1)/minNumthread + 1; 766 | // printf("minNumRegStorage %d maxNumRegStorage %d\n", minNumRegStorage, maxNumRegStorage); 767 | 768 | int bestVal = 0; 769 | int bestNumRegStorage = 0; 770 | int bestNumActiveBlock = 0; 771 | 772 | lc.numthread.y = 1; 773 | lc.numthread.z = 1; 774 | lc.numblock.x = ts.numSplit; 775 | lc.numblock.y = max(1, min((prop.multiProcessorCount*18)/lc.numblock.x, ts.volMbar)); 776 | lc.numblock.z = 1; 777 | 778 | for (lc.numRegStorage=minNumRegStorage;lc.numRegStorage <= maxNumRegStorage;lc.numRegStorage++) { 779 | lc.numthread.x = ((volMmkWithSplit - 1)/(prop.warpSize*lc.numRegStorage) + 1)*prop.warpSize; 780 | 781 | int numActiveBlock = getNumActiveBlock(ts.method, sizeofType, lc, deviceID, prop); 782 | // int val = numActiveBlock*lc.numthread.x*lc.numRegStorage; 783 | int val = ts.volMmkUsed()*numActiveBlock; 784 | if (val > bestVal) { 785 | bestVal = val; 786 | bestNumRegStorage = lc.numRegStorage; 787 | bestNumActiveBlock = numActiveBlock; 788 | } 789 | } 790 | 791 | if (bestNumRegStorage == 0) return 0; 792 | 793 | lc.numRegStorage = bestNumRegStorage; 794 | lc.numthread.x = ((volMmkWithSplit - 1)/(prop.warpSize*lc.numRegStorage) + 1)*prop.warpSize; 795 | numActiveBlockReturn = bestNumActiveBlock; 796 | } 797 | break; 798 | 799 | case Tiled: 800 | { 801 | lc.numthread.x = TILEDIM; 802 | lc.numthread.y = TILEROWS; 803 | lc.numthread.z = 1; 804 | lc.numblock.x = ((ts.volMm - 1)/TILEDIM + 1)*((ts.volMk - 1)/TILEDIM + 1); 805 | lc.numblock.y = 1; 806 | lc.numblock.z = max(1, min((prop.multiProcessorCount*8)/(lc.numblock.x*lc.numblock.y), ts.volMbar)); 807 | lc.shmemsize = 0; 808 | lc.numRegStorage = 0; 809 | } 810 | break; 811 | 812 | case TiledCopy: 813 | { 814 | lc.numthread.x = TILEDIM; 815 | lc.numthread.y = 
TILEROWS;
816 | lc.numthread.z = 1;
817 | lc.numblock.x = ((ts.volMm - 1)/TILEDIM + 1)*((ts.volMkBar - 1)/TILEDIM + 1);
818 | lc.numblock.y = 1;
819 | lc.numblock.z = ts.volMbar;
820 | lc.numblock.z = min((prop.multiProcessorCount*8)/(lc.numblock.x*lc.numblock.y), lc.numblock.z);
821 | lc.numblock.z = max(1, lc.numblock.z);
822 | lc.shmemsize = 0;
823 | lc.numRegStorage = 0;
824 | }
825 | break;
826 | }
827 | 
828 | if (lc.numblock.x > prop.maxGridSize[0] ||
829 | lc.numblock.y > prop.maxGridSize[1] ||
830 | lc.numblock.z > prop.maxGridSize[2]) return 0;
831 | 
832 | // Return the number of active blocks with these settings
833 | if (numActiveBlockReturn == -1) {
834 | // Not set, get it
835 | numActiveBlockReturn = getNumActiveBlock(ts.method, sizeofType, lc, deviceID, prop);
836 | }
837 | return numActiveBlockReturn;
838 | }
839 | 
840 | bool cuttKernel(cuttPlan_t& plan, void* dataIn, void* dataOut) {
841 | LaunchConfig& lc = plan.launchConfig;
842 | TensorSplit& ts = plan.tensorSplit;
843 | 
844 | switch(ts.method) {
845 | case Trivial:
846 | {
847 | cudaCheck(cudaMemcpyAsync(dataOut, dataIn, ts.volMmk*ts.volMbar*plan.sizeofType,
848 | cudaMemcpyDeviceToDevice, plan.stream));
849 | }
850 | break;
851 | 
852 | case Packed:
853 | {
854 | switch(lc.numRegStorage) {
855 | #define CALL0(TYPE, NREG) \
856 | transposePacked<TYPE, NREG> <<< lc.numblock, lc.numthread, lc.shmemsize, plan.stream >>> \
857 | (ts.volMmk, ts.volMbar, ts.sizeMmk, ts.sizeMbar, \
858 | plan.Mmk, plan.Mbar, plan.Msh, (TYPE *)dataIn, (TYPE *)dataOut)
859 | #define CALL(ICASE) case ICASE: if (plan.sizeofType == 4) CALL0(float, ICASE); if (plan.sizeofType == 8) CALL0(double, ICASE); break
860 | #include "calls.h"
861 | default:
862 | printf("cuttKernel no template implemented for numRegStorage %d\n", lc.numRegStorage);
863 | return false;
864 | #undef CALL
865 | #undef CALL0
866 | }
867 | 
868 | }
869 | break;
870 | 
871 | case PackedSplit:
872 | {
873 | switch(lc.numRegStorage) {
874 | #define CALL0(TYPE, NREG) \
875 | transposePackedSplit<TYPE, NREG> <<< lc.numblock, lc.numthread, lc.shmemsize, plan.stream >>> \
876 | (ts.splitDim, ts.volMmkUnsplit, ts.volMbar, ts.sizeMmk, ts.sizeMbar, \
877 | plan.cuDimMm, plan.cuDimMk, plan.Mmk, plan.Mbar, plan.Msh, (TYPE *)dataIn, (TYPE *)dataOut);
878 | #define CALL(ICASE) case ICASE: if (plan.sizeofType == 4) CALL0(float, ICASE); if (plan.sizeofType == 8) CALL0(double, ICASE); break
879 | #include "calls.h"
880 | default:
881 | printf("cuttKernel no template implemented for numRegStorage %d\n", lc.numRegStorage);
882 | return false;
883 | #undef CALL
884 | #undef CALL0
885 | }
886 | 
887 | }
888 | break;
889 | 
890 | case Tiled:
891 | {
892 | #define CALL(TYPE) \
893 | transposeTiled<TYPE> <<< lc.numblock, lc.numthread, 0, plan.stream >>> \
894 | (((ts.volMm - 1)/TILEDIM + 1), ts.volMbar, ts.sizeMbar, plan.tiledVol, plan.cuDimMk, plan.cuDimMm, \
895 | plan.Mbar, (TYPE *)dataIn, (TYPE *)dataOut)
896 | if (plan.sizeofType == 4) CALL(float);
897 | if (plan.sizeofType == 8) CALL(double);
898 | #undef CALL
899 | }
900 | break;
901 | 
902 | case TiledCopy:
903 | {
904 | #define CALL(TYPE) \
905 | transposeTiledCopy<TYPE> <<< lc.numblock, lc.numthread, 0, plan.stream >>> \
906 | (((ts.volMm - 1)/TILEDIM + 1), ts.volMbar, ts.sizeMbar, plan.cuDimMk, plan.cuDimMm, plan.tiledVol, \
907 | plan.Mbar, (TYPE *)dataIn, (TYPE *)dataOut)
908 | if (plan.sizeofType == 4) CALL(float);
909 | if (plan.sizeofType == 8) CALL(double);
910 | #undef CALL
911 | }
912 | break;
913 | 
914 | }
915 | 
916 | cudaCheck(cudaGetLastError());
917 | return true;
918 | }
919 | 
--------------------------------------------------------------------------------
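The benchmark driver above (bench_tensor in /src/cutt_bench.cpp) already exercises the whole public API: plan with cuttPlan or cuttPlanMeasure, run with cuttExecute, release with cuttDestroy. A minimal standalone sketch of that same call sequence follows; the rank-3 extents, the permutation, the function name and the bare-bones error handling are illustrative only, and dataIn/dataOut are assumed to be device buffers large enough for the tensor.

#include <cuda_runtime.h>
#include <cutt.h>

// Transpose a rank-3 tensor from dataIn to dataOut on the default stream.
// dataIn and dataOut must be device allocations of at least 256*192*64 doubles.
bool transposeExample(double* dataIn, double* dataOut) {
  int dim[3]         = {256, 192, 64};   // input extents (illustrative)
  int permutation[3] = {2, 0, 1};        // output rank r takes input rank permutation[r]

  cuttHandle plan;
  // cuttPlan picks the kernel variant from heuristics; cuttPlanMeasure would
  // instead time the candidate plans on the actual buffers (see bench_tensor above).
  if (cuttPlan(&plan, 3, dim, permutation, sizeof(double), 0) != CUTT_SUCCESS) return false;

  if (cuttExecute(plan, dataIn, dataOut) != CUTT_SUCCESS) return false;  // asynchronous launch
  cudaDeviceSynchronize();  // wait for the transpose to finish

  cuttDestroy(plan);
  return true;
}

Note that cuttExecute only enqueues the kernel on the stream supplied to cuttPlan, so the output must not be read back until the stream has been synchronized.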