├── .gitignore ├── doc ├── bw_k20x.png ├── bw_titanx.png └── k20x_bench.png ├── src ├── calls.h ├── cuttTypes.h ├── cuttkernel.h ├── CudaMemcpy.h ├── cuttGpuModelKernel.h ├── TensorTester.h ├── LRUCache.h ├── cutt.h ├── cuttTimer.h ├── cuttGpuModel.h ├── CudaMemcpy.cu ├── CudaUtils.cu ├── CudaUtils.h ├── cuttplan.h ├── cuttTimer.cpp ├── TensorTester.cu ├── int_vector.h ├── cutt.cpp ├── cutt_test.cpp ├── cuttGpuModelKernel.cu ├── cutt_bench.cpp └── cuttkernel.cu ├── include └── cutt.h ├── Makefile └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | build/ 3 | lib/ 4 | -------------------------------------------------------------------------------- /doc/bw_k20x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jittor/cutt/HEAD/doc/bw_k20x.png -------------------------------------------------------------------------------- /doc/bw_titanx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jittor/cutt/HEAD/doc/bw_titanx.png -------------------------------------------------------------------------------- /doc/k20x_bench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jittor/cutt/HEAD/doc/k20x_bench.png -------------------------------------------------------------------------------- /src/calls.h: -------------------------------------------------------------------------------- 1 | #if MAX_REG_STORAGE >= 1 2 | CALL(1); 3 | #endif 4 | #if MAX_REG_STORAGE >= 2 5 | CALL(2); 6 | #endif 7 | #if MAX_REG_STORAGE >= 3 8 | CALL(3); 9 | #endif 10 | #if MAX_REG_STORAGE >= 4 11 | CALL(4); 12 | #endif 13 | #if MAX_REG_STORAGE >= 5 14 | CALL(5); 15 | #endif 16 | #if MAX_REG_STORAGE >= 6 17 | CALL(6); 18 | #endif 19 | #if MAX_REG_STORAGE >= 7 20 | CALL(7); 21 | #endif 22 | #if MAX_REG_STORAGE >= 8 23 | CALL(8); 24 | #endif 25 | #if MAX_REG_STORAGE >= 9 26 | CALL(9); 27 | #endif 28 | #if MAX_REG_STORAGE >= 10 29 | CALL(10); 30 | #endif 31 | 32 | #if MAX_REG_STORAGE >= 11 33 | CALL(11); 34 | #endif 35 | #if MAX_REG_STORAGE >= 12 36 | CALL(12); 37 | #endif 38 | #if MAX_REG_STORAGE >= 13 39 | CALL(13); 40 | #endif 41 | #if MAX_REG_STORAGE >= 14 42 | CALL(14); 43 | #endif 44 | #if MAX_REG_STORAGE >= 15 45 | CALL(15); 46 | #endif 47 | #if MAX_REG_STORAGE >= 16 48 | CALL(16); 49 | #endif 50 | #if MAX_REG_STORAGE >= 17 51 | CALL(17); 52 | #endif 53 | #if MAX_REG_STORAGE >= 18 54 | CALL(18); 55 | #endif 56 | #if MAX_REG_STORAGE >= 19 57 | CALL(19); 58 | #endif 59 | #if MAX_REG_STORAGE >= 20 60 | CALL(20); 61 | #endif 62 | 63 | #if MAX_REG_STORAGE >= 21 64 | CALL(21); 65 | #endif 66 | #if MAX_REG_STORAGE >= 22 67 | CALL(22); 68 | #endif 69 | #if MAX_REG_STORAGE >= 23 70 | CALL(23); 71 | #endif 72 | #if MAX_REG_STORAGE >= 24 73 | CALL(24); 74 | #endif 75 | #if MAX_REG_STORAGE >= 25 76 | CALL(25); 77 | #endif 78 | #if MAX_REG_STORAGE >= 26 79 | CALL(26); 80 | #endif 81 | #if MAX_REG_STORAGE >= 27 82 | CALL(27); 83 | #endif 84 | #if MAX_REG_STORAGE >= 28 85 | CALL(28); 86 | #endif 87 | #if MAX_REG_STORAGE >= 29 88 | CALL(29); 89 | #endif 90 | #if MAX_REG_STORAGE >= 30 91 | CALL(30); 92 | #endif 93 | -------------------------------------------------------------------------------- /src/cuttTypes.h: -------------------------------------------------------------------------------- 1 | 
/****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #ifndef CUTTTYPES_H 26 | #define CUTTTYPES_H 27 | 28 | #define MAX_REG_STORAGE 8 29 | 30 | struct TensorConv { 31 | int c; 32 | int d; 33 | int ct; 34 | }; 35 | 36 | struct TensorConvInOut { 37 | int c_in; 38 | int d_in; 39 | int ct_in; 40 | int c_out; 41 | int d_out; 42 | int ct_out; 43 | 44 | }; 45 | 46 | #endif // CUTTTYPES_H 47 | -------------------------------------------------------------------------------- /src/cuttkernel.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | #ifndef CUTTKERNEL_H 26 | #define CUTTKERNEL_H 27 | #include "cuttplan.h" 28 | 29 | void cuttKernelSetSharedMemConfig(); 30 | 31 | int cuttKernelLaunchConfiguration(const int sizeofType, const TensorSplit& ts, 32 | const int deviceID, const cudaDeviceProp& prop, LaunchConfig& lc); 33 | 34 | bool cuttKernel(cuttPlan_t& plan, void* dataIn, void* dataOut); 35 | 36 | #endif // CUTTKERNEL_H 37 | -------------------------------------------------------------------------------- /src/CudaMemcpy.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #ifndef CUDAMEMCPY_H 26 | #define CUDAMEMCPY_H 27 | 28 | #include 29 | 30 | template void scalarCopy(const int n, const T* data_in, T* data_out, cudaStream_t stream); 31 | template void vectorCopy(const int n, T* data_in, T* data_out, cudaStream_t stream); 32 | void memcpyFloat(const int n, float* data_in, float* data_out, cudaStream_t stream); 33 | 34 | #endif // CUDAMEMCPY_H 35 | -------------------------------------------------------------------------------- /src/cuttGpuModelKernel.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #ifndef CUTTGPUMODELKERNEL_H 26 | #define CUTTGPUMODELKERNEL_H 27 | #include "cuttplan.h" 28 | 29 | void runCounters(const int warpSize, const int* hostPosData, const int numPosData, 30 | const int accWidth, const int cacheWidth, int* host_tran, int* host_cl_full, int* host_cl_part); 31 | 32 | bool cuttGpuModelKernel(cuttPlan_t& plan, const int accWidth, const int cacheWidth, 33 | int& gld_tran, int& gst_tran, int& gld_req, int& gst_req, 34 | int& cl_full_l2, int& cl_part_l2, int& cl_full_l1, int& cl_part_l1); 35 | 36 | #endif // CUTTGPUMODELKERNEL_H 37 | -------------------------------------------------------------------------------- /src/TensorTester.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | #ifndef TENSORTESTER_H 26 | #define TENSORTESTER_H 27 | #include "cuttTypes.h" 28 | 29 | // 30 | // Simple tensor transpose tester class 31 | // 32 | 33 | struct TensorError_t { 34 | int refVal; 35 | int dataVal; 36 | unsigned int pos; 37 | }; 38 | 39 | class TensorTester { 40 | private: 41 | static int calcTensorConv(const int rank, const int* dim, const int* permutation, TensorConv* tensorConv); 42 | 43 | const int maxRank; 44 | const int maxNumblock; 45 | 46 | public: 47 | TensorConv* h_tensorConv; 48 | TensorConv* d_tensorConv; 49 | TensorError_t* h_error; 50 | TensorError_t* d_error; 51 | int* d_fail; 52 | 53 | TensorTester(); 54 | ~TensorTester(); 55 | 56 | void setTensorCheckPattern(unsigned int* data, unsigned int ndata); 57 | 58 | template bool checkTranspose(int rank, int* dim, int* permutation, T* data); 59 | 60 | }; 61 | 62 | #endif // TENSORTESTER_H 63 | -------------------------------------------------------------------------------- /src/LRUCache.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 NVIDIA 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | 26 | #include 27 | #include 28 | #include 29 | 30 | using namespace std; 31 | 32 | // 33 | // Simple LRU cache implementation 34 | // 35 | template 36 | class LRUCache { 37 | private: 38 | 39 | struct ValueIterator { 40 | value_type value; 41 | typename list::iterator it; 42 | }; 43 | 44 | // Size of the cache 45 | const size_t capacity; 46 | 47 | // Value that is returned when the key is not found 48 | const value_type null_value; 49 | 50 | // Double linked list of keys. 
Oldest is at the back 51 | list keys; 52 | 53 | // Cache: (hash table) 54 | // key = key 55 | // value = {value, pointer to linked list} 56 | unordered_map cache; 57 | 58 | public: 59 | 60 | LRUCache(const size_t capacity, const value_type null_value) : capacity(capacity), null_value(null_value) {} 61 | 62 | value_type get(key_type key) { 63 | auto it = cache.find(key); 64 | if (it == cache.end()) return null_value; 65 | touch(it); 66 | return it->second.value; 67 | } 68 | 69 | void set(key_type key, value_type value) { 70 | auto it = cache.find(key); 71 | if (it != cache.end()) { 72 | // key found 73 | it->second.value = value; 74 | touch(it); 75 | } else { 76 | // key not found 77 | if (cache.size() == capacity) { 78 | key_type oldest_key = keys.back(); 79 | keys.pop_back(); 80 | cache.erase( cache.find(oldest_key) ); 81 | } 82 | keys.push_front(key); 83 | ValueIterator vi; 84 | vi.value = value; 85 | vi.it = keys.begin(); 86 | pair boo(key, vi); 87 | cache.insert(boo); 88 | } 89 | } 90 | 91 | private: 92 | 93 | void touch(typename unordered_map::iterator it) { 94 | keys.erase(it->second.it); 95 | keys.push_front(it->first); 96 | it->second.it = keys.begin(); 97 | } 98 | }; 99 | -------------------------------------------------------------------------------- /include/cutt.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | #ifndef CUTT_H 26 | #define CUTT_H 27 | 28 | #include 29 | 30 | // Handle type that is used to store and access cutt plans 31 | typedef unsigned int cuttHandle; 32 | 33 | // Return value 34 | typedef enum cuttResult_t { 35 | CUTT_SUCCESS, // Success 36 | CUTT_INVALID_PLAN, // Invalid plan handle 37 | CUTT_INVALID_PARAMETER, // Invalid input parameter 38 | CUTT_INVALID_DEVICE, // Execution tried on device different than where plan was created 39 | CUTT_INTERNAL_ERROR, // Internal error 40 | CUTT_UNDEFINED_ERROR, // Undefined error 41 | } cuttResult; 42 | 43 | // 44 | // Create plan 45 | // 46 | // Parameters 47 | // handle = Returned handle to cuTT plan 48 | // rank = Rank of the tensor 49 | // dim[rank] = Dimensions of the tensor 50 | // permutation[rank] = Transpose permutation 51 | // sizeofType = Size of the elements of the tensor in bytes (=4 or 8) 52 | // stream = CUDA stream (0 if no stream is used) 53 | // 54 | // Returns 55 | // Success/unsuccess code 56 | // 57 | cuttResult cuttPlan(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 58 | cudaStream_t stream); 59 | 60 | // 61 | // Create plan and choose implementation by measuring performance 62 | // 63 | // Parameters 64 | // handle = Returned handle to cuTT plan 65 | // rank = Rank of the tensor 66 | // dim[rank] = Dimensions of the tensor 67 | // permutation[rank] = Transpose permutation 68 | // sizeofType = Size of the elements of the tensor in bytes (=4 or 8) 69 | // stream = CUDA stream (0 if no stream is used) 70 | // idata = Input data size product(dim) 71 | // odata = Output data size product(dim) 72 | // 73 | // Returns 74 | // Success/unsuccess code 75 | // 76 | cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 77 | cudaStream_t stream, void* idata, void* odata); 78 | 79 | // 80 | // Destroy plan 81 | // 82 | // Parameters 83 | // handle = Handle to the cuTT plan 84 | // 85 | // Returns 86 | // Success/unsuccess code 87 | // 88 | cuttResult cuttDestroy(cuttHandle handle); 89 | 90 | // 91 | // Execute plan out-of-place 92 | // 93 | // Parameters 94 | // handle = Returned handle to cuTT plan 95 | // idata = Input data size product(dim) 96 | // odata = Output data size product(dim) 97 | // 98 | // Returns 99 | // Success/unsuccess code 100 | // 101 | cuttResult cuttExecute(cuttHandle handle, void* idata, void* odata); 102 | 103 | #endif // CUTT_H 104 | -------------------------------------------------------------------------------- /src/cutt.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #ifndef CUTT_H 26 | #define CUTT_H 27 | 28 | #include 29 | 30 | // Handle type that is used to store and access cutt plans 31 | typedef unsigned int cuttHandle; 32 | 33 | // Return value 34 | typedef enum cuttResult_t { 35 | CUTT_SUCCESS, // Success 36 | CUTT_INVALID_PLAN, // Invalid plan handle 37 | CUTT_INVALID_PARAMETER, // Invalid input parameter 38 | CUTT_INVALID_DEVICE, // Execution tried on device different than where plan was created 39 | CUTT_INTERNAL_ERROR, // Internal error 40 | CUTT_UNDEFINED_ERROR, // Undefined error 41 | } cuttResult; 42 | 43 | // 44 | // Create plan 45 | // 46 | // Parameters 47 | // handle = Returned handle to cuTT plan 48 | // rank = Rank of the tensor 49 | // dim[rank] = Dimensions of the tensor 50 | // permutation[rank] = Transpose permutation 51 | // sizeofType = Size of the elements of the tensor in bytes (=4 or 8) 52 | // stream = CUDA stream (0 if no stream is used) 53 | // 54 | // Returns 55 | // Success/unsuccess code 56 | // 57 | cuttResult cuttPlan(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 58 | cudaStream_t stream); 59 | 60 | // 61 | // Create plan and choose implementation by measuring performance 62 | // 63 | // Parameters 64 | // handle = Returned handle to cuTT plan 65 | // rank = Rank of the tensor 66 | // dim[rank] = Dimensions of the tensor 67 | // permutation[rank] = Transpose permutation 68 | // sizeofType = Size of the elements of the tensor in bytes (=4 or 8) 69 | // stream = CUDA stream (0 if no stream is used) 70 | // idata = Input data size product(dim) 71 | // odata = Output data size product(dim) 72 | // 73 | // Returns 74 | // Success/unsuccess code 75 | // 76 | cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 77 | cudaStream_t stream, void* idata, void* odata); 78 | 79 | // 80 | // Destroy plan 81 | // 82 | // Parameters 83 | // handle = Handle to the cuTT plan 84 | // 85 | // Returns 86 | // Success/unsuccess code 87 | // 88 | cuttResult cuttDestroy(cuttHandle handle); 89 | 90 | // 91 | // Execute plan out-of-place 92 | // 93 | // Parameters 94 | // handle = Returned handle to cuTT plan 95 | // idata = Input data size product(dim) 96 | // odata = Output data size product(dim) 97 | // 98 | // Returns 99 | // Success/unsuccess code 100 | // 101 | cuttResult cuttExecute(cuttHandle handle, void* idata, void* odata); 102 | 103 | #endif // CUTT_H 104 | -------------------------------------------------------------------------------- /src/cuttTimer.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated 
documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | 26 | #ifndef CUTTTIMER_H 27 | #define CUTTTIMER_H 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | // ------------------------------------------------- 35 | // By default uses CUDA event timer. Comment out 36 | // this line if you want to use the wallclock 37 | #define CUDA_EVENT_TIMER 38 | // ------------------------------------------------- 39 | #ifdef CUDA_EVENT_TIMER 40 | #include 41 | #endif 42 | 43 | // 44 | // Simple raw timer 45 | // 46 | class Timer { 47 | private: 48 | #ifdef CUDA_EVENT_TIMER 49 | cudaEvent_t tmstart, tmend; 50 | #else 51 | std::chrono::high_resolution_clock::time_point tmstart, tmend; 52 | #endif 53 | public: 54 | #ifdef CUDA_EVENT_TIMER 55 | Timer(); 56 | ~Timer(); 57 | #endif 58 | void start(); 59 | void stop(); 60 | double seconds(); 61 | }; 62 | 63 | // 64 | // Records timings for cuTT and gives out bandwidths and other data 65 | // 66 | class cuttTimer { 67 | private: 68 | // Size of the type we're measuring 69 | const int sizeofType; 70 | 71 | // Dimension and permutation of the current run 72 | std::vector curDim; 73 | std::vector curPermutation; 74 | 75 | // Bytes transposed in the current run 76 | size_t curBytes; 77 | 78 | // Timer for current run 79 | Timer timer; 80 | 81 | struct Stat { 82 | double totBW; 83 | double minBW; 84 | double maxBW; 85 | std::vector BW; 86 | std::vector worstDim; 87 | std::vector worstPermutation; 88 | Stat() { 89 | totBW = 0.0; 90 | minBW = 1.0e20; 91 | maxBW = -1.0; 92 | } 93 | }; 94 | 95 | // List of ranks that have been recorded 96 | std::set ranks; 97 | 98 | // Statistics for every rank 99 | std::unordered_map stats; 100 | 101 | public: 102 | cuttTimer(int sizeofType); 103 | ~cuttTimer(); 104 | void start(std::vector& dim, std::vector& permutation); 105 | void stop(); 106 | double seconds(); 107 | double GBs(); 108 | double GiBs(); 109 | double getBest(int rank); 110 | double getWorst(int rank); 111 | double getWorst(int rank, std::vector& dim, std::vector& permutation); 112 | double getMedian(int rank); 113 | double getAverage(int rank); 114 | std::vector getData(int rank); 115 | 116 | double getWorst(std::vector& dim, std::vector& permutation); 117 | 118 | std::set::const_iterator ranksBegin() { 119 | return ranks.begin(); 120 | } 121 | 122 | std::set::const_iterator ranksEnd() { 123 | return ranks.end(); 124 | } 125 | }; 126 | 127 | #endif // CUTTTIMER_H 128 | 
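The plan/execute/destroy API declared in include/cutt.h (and mirrored in src/cutt.h above) is easiest to see in a short host-side sketch. This is a minimal illustration, not part of the library sources: the rank, dimensions and permutation are made-up values, error handling is reduced to early returns, and only calls documented in the header above are used.

#include <cuda_runtime.h>
#include "cutt.h"

// Minimal out-of-place transpose of a rank-3 float tensor (illustrative sizes).
// d_idata and d_odata are device buffers holding product(dim) = 64*32*48 floats each.
cuttResult exampleTranspose(float* d_idata, float* d_odata, cudaStream_t stream) {
  int dim[3]         = {64, 32, 48};   // tensor dimensions (illustrative)
  int permutation[3] = {2, 0, 1};      // transpose permutation (illustrative)

  cuttHandle plan;
  cuttResult err = cuttPlan(&plan, 3, dim, permutation, sizeof(float), stream);
  if (err != CUTT_SUCCESS) return err;

  // cuttPlanMeasure() could be used instead when idata/odata are available at
  // planning time and the implementation should be chosen by measurement.
  err = cuttExecute(plan, d_idata, d_odata);

  cuttDestroy(plan);
  return err;
}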
-------------------------------------------------------------------------------- /src/cuttGpuModel.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #ifndef CUTTGPUMODEL_H 26 | #define CUTTGPUMODEL_H 27 | 28 | #include 29 | #include "cuttTypes.h" 30 | #include "cuttplan.h" 31 | #include "int_vector.h" 32 | 33 | void computePos(const int vol0, const int vol1, 34 | const TensorConvInOut* conv, const int numConv, 35 | int* posIn, int* posOut); 36 | 37 | void computePos0(const int vol, 38 | const TensorConvInOut* conv, const int numConv, 39 | int* posIn, int* posOut); 40 | 41 | void computePosRef(int vol0, int vol1, 42 | std::vector::iterator it0, std::vector::iterator it1, 43 | std::vector& posIn, std::vector& posOut); 44 | 45 | void countPackedGlTransactions(const int warpSize, const int accWidth, const int cacheWidth, 46 | const int numthread, const int posMbarIn, const int posMbarOut, const int volMmk, 47 | std::vector& posMmkIn, std::vector& posMmkOut, 48 | int& gld_tran, int& gst_tran, int& gld_req, int& gst_req, 49 | int& cl_full_l2, int& cl_part_l2, int& cl_full_l1, int& cl_part_l1); 50 | 51 | void countPackedGlTransactions0(const int warpSize, const int accWidth, const int cacheWidth, 52 | const int numthread, 53 | const int numPos, const int posMbarIn[INT_VECTOR_LEN], const int posMbarOut[INT_VECTOR_LEN], 54 | const int volMmk, const int* __restrict__ posMmkIn, const int* __restrict__ posMmkOut, 55 | int& gld_tran, int& gst_tran, int& gld_req, int& gst_req, 56 | int& cl_full_l2, int& cl_part_l2, int& cl_full_l1, int& cl_part_l1); 57 | 58 | void countPackedShTransactions(const int warpSize, const int bankWidth, const int numthread, 59 | const int volMmk, const TensorConv* msh, const int numMsh, 60 | int& sld_tran, int& sst_tran, int& sld_req, int& sst_req); 61 | 62 | void countPackedShTransactions0(const int warpSize, const int bankWidth, const int numthread, 63 | const int volMmk, const TensorConv* msh, const int numMsh, 64 | int& sld_tran, int& sst_tran, int& sld_req, int& sst_req); 65 | 66 | void countPackedShTransactionsRef(const int warpSize, const int 
bankWidth, const int numthread, 67 | const int volMmk, const TensorConv* msh, const int numMsh, 68 | int& sld_tran, int& sst_tran, int& sld_req, int& sst_req); 69 | 70 | void countTiledGlTransactions(const bool leadVolSame, 71 | const int numPosMbarSample, const int volMm, const int volMk, const int volMbar, 72 | const int cIn, const int cOut, const int accWidth, const int cacheWidth, 73 | std::vector& hostMbar, const int sizeMbar, 74 | int& num_iter, float& mlp, int& gld_tran, int& gst_tran, int& gld_req, int& gst_req, int& cl_full, int& cl_part); 75 | 76 | double cyclesPacked(const bool isSplit, const size_t sizeofType, cudaDeviceProp& prop, 77 | int nthread, int numActiveBlock, float mlp, 78 | int gld_req, int gst_req, int gld_tran, int gst_tran, 79 | int sld_req, int sst_req, int sld_tran, int sst_tran, int num_iter, int cl_full, int cl_part); 80 | 81 | double cyclesTiled(const bool isCopy, const size_t sizeofType, cudaDeviceProp& prop, 82 | int nthread, int numActiveBlock, float mlp, 83 | int gld_req, int gst_req, int gld_tran, int gst_tran, 84 | int sld_req, int sst_req, int sld_tran, int sst_tran, int num_iter, int cl_full, int cl_part); 85 | 86 | bool testCounters(const int warpSize, const int accWidth, const int cacheWidth); 87 | 88 | #endif // CUTTGPUMODEL_H -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #****************************************************************************** 2 | #MIT License 3 | # 4 | #Copyright (c) 2016 Antti-Pekka Hynninen 5 | #Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | # 7 | #Permission is hereby granted, free of charge, to any person obtaining a copy 8 | #of this software and associated documentation files (the "Software"), to deal 9 | #in the Software without restriction, including without limitation the rights 10 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | #copies of the Software, and to permit persons to whom the Software is 12 | #furnished to do so, subject to the following conditions: 13 | # 14 | #The above copyright notice and this permission notice shall be included in all 15 | #copies or substantial portions of the Software. 16 | # 17 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | #SOFTWARE. 
24 | #******************************************************************************* 25 | 26 | #################### User Settings #################### 27 | 28 | # C++ compiler 29 | CC = g++ -fPIC 30 | 31 | # CUDA compiler 32 | ifeq ($(nvcc_path),) 33 | CUDAC = /usr/local/cuda/bin/nvcc -Xcompiler -fPIC 34 | else 35 | CUDAC = $(nvcc_path) -Xcompiler -fPIC 36 | endif 37 | 38 | # Enable nvvp profiling of CPU code by using "make ENABLE_NVTOOLS=1" 39 | # If aligned_alloc() is not available, use "make NO_ALIGNED_ALLOC=1" 40 | 41 | # SM versions for which code is generated must be sm_30 and above 42 | GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 43 | GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 44 | GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 45 | GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 46 | GENCODE_SM75 := -gencode arch=compute_75,code=sm_75 47 | GENCODE_FLAGS := $(GENCODE_SM35) $(GENCODE_SM52) $(GENCODE_SM60) $(GENCODE_SM75) 48 | GENCODE_FLAGS := $(NVCC_GENCODE) 49 | 50 | ####################################################### 51 | 52 | # Detect OS 53 | ifeq ($(shell uname -a|grep Linux|wc -l|tr -d ' '), 1) 54 | OS = linux 55 | endif 56 | 57 | ifeq ($(shell uname -a|grep titan|wc -l|tr -d ' '), 1) 58 | OS = linux 59 | endif 60 | 61 | ifeq ($(shell uname -a|grep Darwin|wc -l|tr -d ' '), 1) 62 | OS = osx 63 | endif 64 | 65 | # Detect x86_64 vs. Power 66 | CPU = unknown 67 | 68 | ifeq ($(shell uname -a|grep x86_64|wc -l|tr -d ' '), 1) 69 | CPU = x86_64 70 | endif 71 | 72 | ifeq ($(shell uname -a|grep ppc64|wc -l|tr -d ' '), 1) 73 | CPU = ppc64 74 | endif 75 | 76 | # Set optimization level 77 | OPTLEV = -O3 78 | 79 | # Defines 80 | DEFS = 81 | 82 | ifdef ENABLE_NVTOOLS 83 | DEFS += -DENABLE_NVTOOLS 84 | endif 85 | 86 | ifdef NO_ALIGNED_ALLOC 87 | DEFS += -DNO_ALIGNED_ALLOC 88 | endif 89 | 90 | OBJSLIB = build/cutt.o build/cuttplan.o build/cuttkernel.o build/cuttGpuModel.o build/CudaUtils.o build/cuttTimer.o build/cuttGpuModelKernel.o 91 | OBJSTEST = build/cutt_test.o build/TensorTester.o build/CudaUtils.o build/cuttTimer.o 92 | OBJSBENCH = build/cutt_bench.o build/TensorTester.o build/CudaUtils.o build/cuttTimer.o build/CudaMemcpy.o 93 | OBJS = $(OBJSLIB) $(OBJSTEST) $(OBJSBENCH) 94 | 95 | #CUDAROOT = $(subst /bin/,,$(dir $(shell which nvcc))) 96 | #CUDAROOT = $(subst /bin/,,$(dir $(shell which $(CUDAC)))) 97 | 98 | ifeq ($(nvcc_path),) 99 | CUDAROOT = /usr/local/cuda 100 | else 101 | CUDAROOT = $(subst /bin/nvcc,, $(nvcc_path)) 102 | endif 103 | 104 | CFLAGS = -I${CUDAROOT}/include -std=c++11 $(DEFS) $(OPTLEV) 105 | ifeq ($(CPU),x86_64) 106 | CFLAGS += -march=native 107 | endif 108 | 109 | CUDA_CFLAGS = -I${CUDAROOT}/include -std=c++11 $(OPTLEV) -Xptxas -dlcm=ca -lineinfo $(GENCODE_FLAGS) --resource-usage -Xcompiler "$(CUDA_CCFLAGS)" $(DEFS) -D_FORCE_INLINES 110 | 111 | ifeq ($(OS),osx) 112 | CUDA_LFLAGS = -L$(CUDAROOT)/lib 113 | else 114 | CUDA_LFLAGS = -L$(CUDAROOT)/lib64 115 | endif 116 | 117 | CUDA_LFLAGS += -Llib -lcudart -lcutt 118 | ifdef ENABLE_NVTOOLS 119 | CUDA_LFLAGS += -lnvToolsExt 120 | endif 121 | 122 | all: create_build lib/libcutt.so bin/cutt_test bin/cutt_bench 123 | 124 | create_build: 125 | mkdir -p build 126 | 127 | lib/libcutt.so: $(OBJSLIB) 128 | mkdir -p lib 129 | rm -f lib/libcutt.so 130 | g++ -fPIC --share -o lib/libcutt.so $(OBJSLIB) 131 | mkdir -p include 132 | cp -f src/cutt.h include/cutt.h 133 | 134 | bin/cutt_test : lib/libcutt.so $(OBJSTEST) 135 | mkdir -p bin 136 | $(CC) -o bin/cutt_test $(OBJSTEST) $(CUDA_LFLAGS) 137 | 138 | 
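# Example invocations of the user settings above (illustrative paths and SM
# versions, not part of the original Makefile; adjust to the local CUDA setup):
#   make nvcc_path=/usr/local/cuda/bin/nvcc \
#        NVCC_GENCODE="-gencode arch=compute_70,code=sm_70"
#   make ENABLE_NVTOOLS=1        # enable nvtx ranges for nvvp profiling of CPU code
#   make NO_ALIGNED_ALLOC=1      # if aligned_alloc() is not available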
bin/cutt_bench : lib/libcutt.so $(OBJSBENCH) 139 | mkdir -p bin 140 | $(CC) -o bin/cutt_bench $(OBJSBENCH) $(CUDA_LFLAGS) 141 | 142 | clean: 143 | rm -f $(OBJS) 144 | rm -f build/*.d 145 | rm -f *~ 146 | rm -f lib/libcutt.so 147 | rm -f bin/cutt_test 148 | rm -f bin/cutt_bench 149 | 150 | # Pull in dependencies that already exist 151 | -include $(OBJS:.o=.d) 152 | 153 | build/%.o : src/%.cu 154 | $(CUDAC) -c $(CUDA_CFLAGS) -o build/$*.o $< 155 | echo -e 'build/\c' > build/$*.d 156 | $(CUDAC) -M $(CUDA_CFLAGS) $< >> build/$*.d 157 | 158 | build/%.o : src/%.cpp 159 | $(CC) -c $(CFLAGS) -o build/$*.o $< 160 | echo -e 'build/\c' > build/$*.d 161 | $(CC) -M $(CFLAGS) $< >> build/$*.d 162 | -------------------------------------------------------------------------------- /src/CudaMemcpy.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | #include 26 | #include "CudaUtils.h" 27 | #include "CudaMemcpy.h" 28 | 29 | const int numthread = 64; 30 | 31 | // ----------------------------------------------------------------------------------- 32 | // 33 | // Copy using scalar loads and stores 34 | // 35 | template 36 | __global__ void scalarCopyKernel(const int n, const T* data_in, T* data_out) { 37 | 38 | for (int i = threadIdx.x + blockIdx.x*blockDim.x;i < n;i += blockDim.x*gridDim.x) { 39 | data_out[i] = data_in[i]; 40 | } 41 | 42 | } 43 | template 44 | void scalarCopy(const int n, const T* data_in, T* data_out, cudaStream_t stream) { 45 | 46 | int numblock = (n - 1)/numthread + 1; 47 | // numblock = min(65535, numblock); 48 | // numblock = min(256, numblock); 49 | 50 | scalarCopyKernel <<< numblock, numthread, 0, stream >>> 51 | (n, data_in, data_out); 52 | 53 | cudaCheck(cudaGetLastError()); 54 | } 55 | // ----------------------------------------------------------------------------------- 56 | 57 | // ----------------------------------------------------------------------------------- 58 | // 59 | // Copy using vectorized loads and stores 60 | // 61 | template 62 | __global__ void vectorCopyKernel(const int n, T* data_in, T* data_out) { 63 | 64 | // Maximum vector load is 128 bits = 16 bytes 65 | const int vectorLength = 16/sizeof(T); 66 | 67 | int idx = threadIdx.x + blockIdx.x*blockDim.x; 68 | 69 | // Vector elements 70 | for (int i = idx;i < n/vectorLength;i += blockDim.x*gridDim.x) { 71 | reinterpret_cast(data_out)[i] = reinterpret_cast(data_in)[i]; 72 | } 73 | 74 | // Remaining elements 75 | for (int i = idx + (n/vectorLength)*vectorLength;i < n;i += blockDim.x*gridDim.x + threadIdx.x) { 76 | data_out[i] = data_in[i]; 77 | } 78 | 79 | } 80 | 81 | template 82 | void vectorCopy(const int n, T* data_in, T* data_out, cudaStream_t stream) { 83 | 84 | const int vectorLength = 16/sizeof(T); 85 | 86 | int numblock = (n/vectorLength - 1)/numthread + 1; 87 | // numblock = min(65535, numblock); 88 | int shmemsize = 0; 89 | 90 | vectorCopyKernel <<< numblock, numthread, shmemsize, stream >>> 91 | (n, data_in, data_out); 92 | 93 | cudaCheck(cudaGetLastError()); 94 | } 95 | // ----------------------------------------------------------------------------------- 96 | 97 | // ----------------------------------------------------------------------------------- 98 | // 99 | // Copy using vectorized loads and stores 100 | // 101 | template 102 | __global__ void memcpyFloatKernel(const int n, float4 *data_in, float4* data_out) { 103 | int index = threadIdx.x + numElem*blockIdx.x*blockDim.x; 104 | float4 a[numElem]; 105 | #pragma unroll 106 | for (int i=0;i < numElem;i++) { 107 | if (index + i*blockDim.x < n) a[i] = data_in[index + i*blockDim.x]; 108 | } 109 | #pragma unroll 110 | for (int i=0;i < numElem;i++) { 111 | if (index + i*blockDim.x < n) data_out[index + i*blockDim.x] = a[i]; 112 | } 113 | } 114 | 115 | template 116 | __global__ void memcpyFloatLoopKernel(const int n, float4 *data_in, float4* data_out) { 117 | for (int index=threadIdx.x + blockIdx.x*numElem*blockDim.x;index < n;index += numElem*gridDim.x*blockDim.x) 118 | { 119 | float4 a[numElem]; 120 | #pragma unroll 121 | for (int i=0;i < numElem;i++) { 122 | if (index + i*blockDim.x < n) a[i] = data_in[index + i*blockDim.x]; 123 | } 124 | #pragma unroll 125 | for (int i=0;i < numElem;i++) { 126 | if (index + i*blockDim.x < n) data_out[index + i*blockDim.x] = a[i]; 127 | } 128 | } 129 | } 130 | 
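// Worked example of the launch arithmetic used by memcpyFloat() below
// (illustrative numbers, assuming numthread = 64 as set above and NUM_ELEM = 2
//  as defined just below; this note is not part of the original source):
//   n = 1,000,000 floats are copied as n/4 = 250,000 float4 elements.
//   Each thread moves NUM_ELEM = 2 float4's, so
//   numblock = (n/(4*NUM_ELEM) - 1)/numthread + 1 = (125000 - 1)/64 + 1 = 1954.
//   The "index + i*blockDim.x < n" guards in the kernels above handle the
//   partial final block.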
131 | #define NUM_ELEM 2 132 | void memcpyFloat(const int n, float* data_in, float* data_out, cudaStream_t stream) { 133 | 134 | int numblock = (n/(4*NUM_ELEM) - 1)/numthread + 1; 135 | int shmemsize = 0; 136 | memcpyFloatKernel <<< numblock, numthread, shmemsize, stream >>> 137 | (n/4, (float4 *)data_in, (float4 *)data_out); 138 | 139 | // int numblock = 64; 140 | // int shmemsize = 0; 141 | // memcpyFloatLoopKernel <<< numblock, numthread, shmemsize, stream >>> 142 | // (n/4, (float4 *)data_in, (float4 *)data_out); 143 | 144 | cudaCheck(cudaGetLastError()); 145 | } 146 | // ----------------------------------------------------------------------------------- 147 | 148 | // Explicit instances 149 | template void scalarCopy(const int n, const int* data_in, int* data_out, cudaStream_t stream); 150 | template void scalarCopy(const int n, const long long int* data_in, long long int* data_out, cudaStream_t stream); 151 | template void vectorCopy(const int n, int* data_in, int* data_out, cudaStream_t stream); 152 | template void vectorCopy(const int n, long long int* data_in, long long int* data_out, cudaStream_t stream); 153 | void memcpyFloat(const int n, float* data_in, float* data_out, cudaStream_t stream); 154 | -------------------------------------------------------------------------------- /src/CudaUtils.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | 26 | #include 27 | #ifdef ENABLE_NVTOOLS 28 | #include 29 | #endif 30 | #include "CudaUtils.h" 31 | 32 | //---------------------------------------------------------------------------------------- 33 | 34 | void set_device_array_async_T(void *data, int value, const size_t ndata, cudaStream_t stream, const size_t sizeofT) { 35 | cudaCheck(cudaMemsetAsync(data, value, sizeofT*ndata, stream)); 36 | } 37 | 38 | void set_device_array_T(void *data, int value, const size_t ndata, const size_t sizeofT) { 39 | cudaCheck(cudaMemset(data, value, sizeofT*ndata)); 40 | } 41 | 42 | //---------------------------------------------------------------------------------------- 43 | // 44 | // Jittor malloc & free 45 | // 46 | void cutt_malloc(void** p, size_t len, size_t& allocation) { 47 | cudaCheck(cudaMalloc(p, len)); 48 | } 49 | 50 | void cutt_free(void* p, size_t len, size_t& allocation) { 51 | cudaCheck(cudaFree(p)); 52 | } 53 | 54 | void (*custom_cuda_malloc)(void** p, size_t len, size_t& allocation) = NULL; 55 | 56 | void (*custom_cuda_free)(void* p, size_t len, size_t& allocation) = NULL; 57 | 58 | //---------------------------------------------------------------------------------------- 59 | // 60 | // Allocate gpu memory 61 | // pp = memory pointer 62 | // len = length of the array 63 | // 64 | void allocate_device_T(void **pp, const size_t len, const size_t sizeofT) { 65 | cudaCheck(cudaMalloc(pp, sizeofT*len)); 66 | } 67 | 68 | //---------------------------------------------------------------------------------------- 69 | // 70 | // Deallocate gpu memory 71 | // pp = memory pointer 72 | // 73 | void deallocate_device_T(void **pp) { 74 | if (*pp != NULL) { 75 | cudaCheck(cudaFree((void *)(*pp))); 76 | *pp = NULL; 77 | } 78 | 79 | } 80 | 81 | //---------------------------------------------------------------------------------------- 82 | // 83 | // Jittor allocate gpu memory 84 | // pp = memory pointer 85 | // len = length of the array 86 | // 87 | void jit_allocate_device_T(void **pp, const size_t len, const size_t sizeofT, size_t& allocation) { 88 | if (custom_cuda_malloc==NULL){ 89 | cutt_malloc(pp, sizeofT*len, allocation); 90 | }else custom_cuda_malloc(pp, sizeofT*len, allocation); 91 | } 92 | 93 | //---------------------------------------------------------------------------------------- 94 | // 95 | // Jittor deallocate gpu memory 96 | // pp = memory pointer 97 | // 98 | void jit_deallocate_device_T(void **pp, const size_t len, const size_t sizeofT, size_t& allocation) { 99 | if (*pp != NULL) { 100 | if (custom_cuda_free==NULL){ 101 | cutt_free((void *)(*pp), sizeofT*len, allocation); 102 | }else custom_cuda_free((void *)(*pp), sizeofT*len, allocation); 103 | *pp = NULL; 104 | } 105 | 106 | } 107 | 108 | //---------------------------------------------------------------------------------------- 109 | // 110 | // Copies memory Host -> Device 111 | // 112 | void copy_HtoD_async_T(const void *h_array, void *d_array, size_t array_len, cudaStream_t stream, 113 | const size_t sizeofT) { 114 | cudaCheck(cudaMemcpyAsync(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice, stream)); 115 | } 116 | 117 | void copy_HtoD_T(const void *h_array, void *d_array, size_t array_len, 118 | const size_t sizeofT) { 119 | cudaCheck(cudaMemcpy(d_array, h_array, sizeofT*array_len, cudaMemcpyHostToDevice)); 120 | } 121 | 122 | //---------------------------------------------------------------------------------------- 123 | // 
124 | // Copies memory Device -> Host 125 | // 126 | void copy_DtoH_async_T(const void *d_array, void *h_array, const size_t array_len, cudaStream_t stream, 127 | const size_t sizeofT) { 128 | cudaCheck(cudaMemcpyAsync(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost, stream)); 129 | } 130 | 131 | void copy_DtoH_T(const void *d_array, void *h_array, const size_t array_len, const size_t sizeofT) { 132 | cudaCheck(cudaMemcpy(h_array, d_array, sizeofT*array_len, cudaMemcpyDeviceToHost)); 133 | } 134 | 135 | //---------------------------------------------------------------------------------------- 136 | #ifdef ENABLE_NVTOOLS 137 | void gpuRangeStart(const char *range_name) { 138 | static int color_id=0; 139 | nvtxEventAttributes_t att; 140 | att.version = NVTX_VERSION; 141 | att.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; 142 | att.colorType = NVTX_COLOR_ARGB; 143 | if (color_id == 0) { 144 | att.color = 0xFFFF0000; 145 | } else if (color_id == 1) { 146 | att.color = 0xFF00FF00; 147 | } else if (color_id == 2) { 148 | att.color = 0xFF0000FF; 149 | } else if (color_id == 3) { 150 | att.color = 0xFFFF00FF; 151 | } 152 | color_id++; 153 | if (color_id > 3) color_id = 0; 154 | att.messageType = NVTX_MESSAGE_TYPE_ASCII; 155 | att.message.ascii = range_name; 156 | nvtxRangePushEx(&att); 157 | } 158 | 159 | void gpuRangeStop() { 160 | nvtxRangePop(); 161 | } 162 | #endif 163 | -------------------------------------------------------------------------------- /src/CudaUtils.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | #ifndef CUDAUTILS_H 26 | #define CUDAUTILS_H 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #define S(x) #x 35 | #define S_(x) S(x) 36 | #define S__LINE__ S_(__LINE__) 37 | #define FULL_MASK 0xffffffff 38 | 39 | 40 | // 41 | // Error checking wrapper for CUDA 42 | // 43 | #define cudaCheck(stmt) do { \ 44 | cudaError_t err = stmt; \ 45 | if (err != cudaSuccess) { \ 46 | std::string msg = #stmt; \ 47 | msg += " in file "; \ 48 | msg += __FILE__; \ 49 | msg += ":"; \ 50 | msg += S__LINE__; \ 51 | msg += ", function "; \ 52 | msg += __FUNCTION__; \ 53 | msg += "\nError message: "; \ 54 | msg += cudaGetErrorString(err); \ 55 | throw std::runtime_error(msg); \ 56 | } \ 57 | } while(0) 58 | 59 | void set_device_array_async_T(void *data, int value, const size_t ndata, cudaStream_t stream, const size_t sizeofT); 60 | void set_device_array_T(void *data, int value, const size_t ndata, const size_t sizeofT); 61 | 62 | template 63 | void set_device_array(T *data, int value, const size_t ndata, cudaStream_t stream=0) { 64 | set_device_array_async_T(data, value, ndata, stream, sizeof(T)); 65 | } 66 | 67 | template 68 | void set_device_array_sync(T *data, int value, const size_t ndata) { 69 | set_device_array_T(data, value, ndata, sizeof(T)); 70 | } 71 | 72 | void allocate_device_T(void **pp, const size_t len, const size_t sizeofT); 73 | //---------------------------------------------------------------------------------------- 74 | // 75 | // Allocate gpu memory 76 | // pp = memory pointer 77 | // len = length of the array 78 | // 79 | template 80 | void allocate_device(T **pp, const size_t len) { 81 | allocate_device_T((void **)pp, len, sizeof(T)); 82 | } 83 | 84 | void deallocate_device_T(void **pp); 85 | //---------------------------------------------------------------------------------------- 86 | // 87 | // Deallocate gpu memory 88 | // pp = memory pointer 89 | // 90 | template 91 | void deallocate_device(T **pp) { 92 | deallocate_device_T((void **)pp); 93 | } 94 | //---------------------------------------------------------------------------------------- 95 | 96 | void jit_allocate_device_T(void **pp, const size_t len, const size_t sizeofT, size_t &allocation); 97 | //---------------------------------------------------------------------------------------- 98 | // 99 | // Allocate gpu memory 100 | // pp = memory pointer 101 | // len = length of the array 102 | // 103 | template 104 | void jit_allocate_device(T **pp, const size_t len, size_t& allocation) { 105 | jit_allocate_device_T((void **)pp, len, sizeof(T), allocation); 106 | } 107 | 108 | void jit_deallocate_device_T(void **pp, const size_t len, const size_t sizeofT, size_t& allocation); 109 | //---------------------------------------------------------------------------------------- 110 | // 111 | // Deallocate gpu memory 112 | // pp = memory pointer 113 | // 114 | template 115 | void jit_deallocate_device(T **pp, const size_t len, size_t& allocation) { 116 | jit_deallocate_device_T((void **)pp, len, sizeof(T), allocation); 117 | } 118 | //---------------------------------------------------------------------------------------- 119 | 120 | void copy_HtoD_async_T(const void *h_array, void *d_array, size_t array_len, cudaStream_t stream, 121 | const size_t sizeofT); 122 | void copy_HtoD_T(const void *h_array, void *d_array, size_t array_len, 123 | const size_t sizeofT); 124 | void copy_DtoH_async_T(const void *d_array, 
void *h_array, const size_t array_len, cudaStream_t stream, 125 | const size_t sizeofT); 126 | void copy_DtoH_T(const void *d_array, void *h_array, const size_t array_len, const size_t sizeofT); 127 | 128 | //---------------------------------------------------------------------------------------- 129 | // 130 | // Copies memory Host -> Device 131 | // 132 | template 133 | void copy_HtoD(const T *h_array, T *d_array, size_t array_len, cudaStream_t stream=0) { 134 | copy_HtoD_async_T(h_array, d_array, array_len, stream, sizeof(T)); 135 | } 136 | 137 | //---------------------------------------------------------------------------------------- 138 | // 139 | // Copies memory Host -> Device using synchronous calls 140 | // 141 | template 142 | void copy_HtoD_sync(const T *h_array, T *d_array, size_t array_len) { 143 | copy_HtoD_T(h_array, d_array, array_len, sizeof(T)); 144 | } 145 | 146 | //---------------------------------------------------------------------------------------- 147 | // 148 | // Copies memory Device -> Host 149 | // 150 | template 151 | void copy_DtoH(const T *d_array, T *h_array, const size_t array_len, cudaStream_t stream=0) { 152 | copy_DtoH_async_T(d_array, h_array, array_len, stream, sizeof(T)); 153 | } 154 | //---------------------------------------------------------------------------------------- 155 | // 156 | // Copies memory Device -> Host using synchronous calls 157 | // 158 | template 159 | void copy_DtoH_sync(const T *d_array, T *h_array, const size_t array_len) { 160 | copy_DtoH_T(d_array, h_array, array_len, sizeof(T)); 161 | } 162 | 163 | #ifdef ENABLE_NVTOOLS 164 | void gpuRangeStart(const char *range_name); 165 | void gpuRangeStop(); 166 | #endif 167 | 168 | //---------------------------------------------------------------------------------------- 169 | // 170 | // Jittor malloc & free 171 | // 172 | void cutt_malloc(void** p, size_t len, size_t& allocation); 173 | 174 | void cutt_free(void* p, size_t len, size_t& allocation); 175 | 176 | extern void (*custom_cuda_malloc)(void** p, size_t len, size_t& allocation); 177 | 178 | extern void (*custom_cuda_free)(void* p, size_t len, size_t& allocation); 179 | 180 | #endif // CUDAUTILS_H -------------------------------------------------------------------------------- /src/cuttplan.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #ifndef CUTTPLAN_H 26 | #define CUTTPLAN_H 27 | 28 | #include 29 | #include 30 | #include 31 | #include "cuttTypes.h" 32 | 33 | const int TILEDIM = 32; 34 | const int TILEROWS = 8; 35 | 36 | // Transposing methods 37 | enum {Unknown, Trivial, Packed, PackedSplit, 38 | Tiled, TiledCopy, 39 | NumTransposeMethods}; 40 | 41 | // Tells how tensor is split into Mm and Mk and what method is used 42 | // NOTE: sizeMm and sizeMk fully define the split 43 | class TensorSplit { 44 | public: 45 | // Transposing method 46 | int method; 47 | 48 | // Input volume 49 | int sizeMm; 50 | int volMm; 51 | 52 | // Output volume 53 | int sizeMk; 54 | int volMk; 55 | 56 | // {Input} U {Output} 57 | int sizeMmk; 58 | int volMmk; 59 | 60 | // {Input} CUT {Output} = Mk which is not in Mm 61 | int sizeMkBar; 62 | int volMkBar; 63 | 64 | // Remaining volume 65 | int sizeMbar; 66 | int volMbar; 67 | 68 | // For Packed and PackedSplit methods: 69 | // Amount of contigious volume 70 | int volMmkInCont; 71 | int volMmkOutCont; 72 | 73 | // For PackedSplit method: 74 | // Number of splits 75 | int numSplit; 76 | 77 | // Rank that is split 78 | int splitRank; 79 | int splitDim; 80 | 81 | // volMmk that is left unsplit 82 | int volMmkUnsplit; 83 | 84 | TensorSplit(); 85 | 86 | void print(); 87 | 88 | void update(const int sizeMm_in, const int sizeMk_in, const int rank, 89 | const int* dim, const int* permutation); 90 | 91 | // Number of elements in shared memory space 92 | size_t shmem() const; 93 | 94 | // Number of elements in Mmk that are used effectively 95 | size_t volMmkUsed() const; 96 | 97 | // Bytes the shared memory space that needs to be allocated 98 | // (can be larger than volShmem() due to padding) 99 | size_t shmemAlloc(int sizeofType) const; 100 | 101 | }; 102 | 103 | class LaunchConfig { 104 | public: 105 | // Kernel launch configuration 106 | dim3 numthread; 107 | dim3 numblock; 108 | size_t shmemsize; 109 | 110 | // For the Packed method, number of registers to use for storage 111 | int numRegStorage; 112 | 113 | void print(); 114 | 115 | }; 116 | 117 | // Class that stores the plan data 118 | class cuttPlan_t { 119 | public: 120 | // Device for which this plan was made 121 | int deviceID; 122 | 123 | // CUDA stream associated with the plan 124 | cudaStream_t stream; 125 | 126 | // Kernel launch configuration 127 | LaunchConfig launchConfig; 128 | 129 | // Rank of the tensor 130 | int rank; 131 | 132 | // Size of the tensor elements in bytes 133 | size_t sizeofType; 134 | 135 | TensorSplit tensorSplit; 136 | 137 | // Number of active thread blocks 138 | int numActiveBlock; 139 | 140 | int cuDimMk; 141 | int cuDimMm; 142 | 143 | int2 tiledVol; 144 | 145 | // Number of iterations of the kernel 146 | int num_iter; 147 | // Average memory level parallelism = average unroll count 148 | float mlp; 149 | int gld_req, gst_req, gld_tran, gst_tran; 150 | int cl_full_l2, cl_part_l2; 151 | int cl_full_l1, cl_part_l1; 152 | int sld_req, sst_req, sld_tran, sst_tran; 153 | double cycles; 154 | 155 | //-------------- 156 | // Host buffers 157 | //-------------- 158 | std::vector hostMbar; 159 | std::vector hostMmk; 160 | std::vector hostMsh; 161 | 162 | 
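  // Host-side staging copies of the conversion constants. The matching
  // device buffers below are allocated and filled from these when the
  // plan is activated (see activate()).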
//---------------- 163 | // Device buffers 164 | //---------------- 165 | // sizeMbar 166 | TensorConvInOut* Mbar; 167 | size_t MbarSz; 168 | size_t MbarAllocation; 169 | 170 | // sizeMmk 171 | TensorConvInOut* Mmk; 172 | size_t MmkSz; 173 | size_t MmkAllocation; 174 | 175 | // sizeMmk 176 | TensorConv* Msh; 177 | size_t MshSz; 178 | size_t MshAllocation; 179 | 180 | // For TiledSingleInRank 181 | TensorConv* Mk; 182 | size_t MkSz; 183 | size_t MkAllocation; 184 | 185 | // For TiledSingleOutRank 186 | TensorConv* Mm; 187 | size_t MmSz; 188 | size_t MmAllocation; 189 | 190 | cuttPlan_t(); 191 | ~cuttPlan_t(); 192 | void print(); 193 | void setStream(cudaStream_t stream_in); 194 | bool countCycles(cudaDeviceProp& prop, const int numPosMbarSample=0); 195 | void activate(); 196 | void nullDevicePointers(); 197 | 198 | static bool createPlans(const int rank, const int* dim, const int* permutation, 199 | const int redRank, const int* redDim, const int* redPermutation, 200 | const size_t sizeofType, const int deviceID, const cudaDeviceProp& prop, std::list& plans); 201 | 202 | private: 203 | static bool createTrivialPlans(const int rank, const int* dim, const int* permutation, 204 | const size_t sizeofType, const int deviceID, const cudaDeviceProp& prop, std::list& plans); 205 | 206 | static bool createTiledPlans(const int rank, const int* dim, const int* permutation, 207 | const size_t sizeofType, const int deviceID, const cudaDeviceProp& prop, std::list& plans); 208 | 209 | static bool createTiledCopyPlans(const int rank, const int* dim, const int* permutation, 210 | const size_t sizeofType, const int deviceID, const cudaDeviceProp& prop, std::list& plans); 211 | 212 | static bool createPackedPlans(const int rank, const int* dim, const int* permutation, 213 | const size_t sizeofType, const int deviceID, const cudaDeviceProp& prop, std::list& plans); 214 | 215 | static bool createPackedSplitPlans(const int rank, const int* dim, const int* permutation, 216 | const size_t sizeofType, const int deviceID, const cudaDeviceProp& prop, std::list& plans); 217 | 218 | bool setup(const int rank_in, const int* dim, const int* permutation, 219 | const size_t sizeofType_in, const TensorSplit& tensorSplit_in, 220 | const LaunchConfig& launchConfig_in, const int numActiveBlock_in); 221 | 222 | }; 223 | 224 | void printMatlab(cudaDeviceProp& prop, std::list& plans, std::vector& times); 225 | 226 | void reduceRanks(const int rank, const int* dim, const int* permutation, 227 | std::vector& redDim, std::vector& redPermutation); 228 | 229 | std::list::iterator choosePlanHeuristic(std::list& plans); 230 | 231 | #endif // CUTTPLAN_H 232 | -------------------------------------------------------------------------------- /src/cuttTimer.cpp: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright 
notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | 26 | #include "cuttTimer.h" 27 | #include "CudaUtils.h" 28 | // #include // std::numeric_limits 29 | #include 30 | #ifdef CUDA_EVENT_TIMER 31 | #include "CudaUtils.h" 32 | #endif 33 | 34 | #ifdef CUDA_EVENT_TIMER 35 | Timer::Timer() { 36 | cudaCheck(cudaEventCreate(&tmstart)); 37 | cudaCheck(cudaEventCreate(&tmend)); 38 | } 39 | Timer::~Timer() { 40 | cudaCheck(cudaEventDestroy(tmstart)); 41 | cudaCheck(cudaEventDestroy(tmend)); 42 | } 43 | #endif 44 | 45 | void Timer::start() { 46 | #ifdef CUDA_EVENT_TIMER 47 | cudaCheck(cudaEventRecord(tmstart, 0)); 48 | #else 49 | tmstart = std::chrono::high_resolution_clock::now(); 50 | #endif 51 | } 52 | 53 | void Timer::stop() { 54 | #ifdef CUDA_EVENT_TIMER 55 | cudaCheck(cudaEventRecord(tmend, 0)); 56 | cudaCheck(cudaEventSynchronize(tmend)); 57 | #else 58 | cudaCheck(cudaDeviceSynchronize()); 59 | tmend = std::chrono::high_resolution_clock::now(); 60 | #endif 61 | } 62 | 63 | // 64 | // Returns the duration of the last run in seconds 65 | // 66 | double Timer::seconds() { 67 | #ifdef CUDA_EVENT_TIMER 68 | float ms; 69 | cudaCheck(cudaEventElapsedTime(&ms, tmstart, tmend)); 70 | return (double)(ms/1000.0f); 71 | #else 72 | return std::chrono::duration_cast< std::chrono::duration >(tmend - tmstart).count(); 73 | #endif 74 | } 75 | 76 | // 77 | // Class constructor 78 | // 79 | cuttTimer::cuttTimer(int sizeofType) : sizeofType(sizeofType) {} 80 | 81 | // 82 | // Class destructor 83 | // 84 | cuttTimer::~cuttTimer() {} 85 | 86 | // 87 | // Start timer 88 | // 89 | void cuttTimer::start(std::vector& dim, std::vector& permutation) { 90 | curDim = dim; 91 | curPermutation = permutation; 92 | curBytes = sizeofType*2; // "2x" because every element is read and also written out 93 | for (int i=0;i < curDim.size();i++) { 94 | curBytes *= dim[i]; 95 | } 96 | ranks.insert(curDim.size()); 97 | timer.start(); 98 | } 99 | 100 | // 101 | // Stop timer and record statistics 102 | // 103 | void cuttTimer::stop() { 104 | timer.stop(); 105 | double bandwidth = GBs(); 106 | auto it = stats.find(curDim.size()); 107 | if (it == stats.end()) { 108 | Stat new_stat; 109 | std::pair new_elem(curDim.size(), new_stat); 110 | auto retval = stats.insert(new_elem); 111 | it = retval.first; 112 | } 113 | Stat& stat = it->second; 114 | stat.totBW += bandwidth; 115 | if (bandwidth < stat.minBW) { 116 | stat.minBW = bandwidth; 117 | stat.worstDim = curDim; 118 | stat.worstPermutation = curPermutation; 119 | } 120 | stat.maxBW = std::max(stat.maxBW, bandwidth); 121 | stat.BW.push_back(bandwidth); 122 | } 123 | 124 | // 125 | // Returns the duration of the last run in seconds 126 | // 127 | double cuttTimer::seconds() { 128 | return timer.seconds(); 129 | } 130 | 131 | // 132 | // Returns the bandwidth of the last run in GB/s 133 | // 134 | double cuttTimer::GBs() { 135 | const double BILLION = 
1000000000.0; 136 | double sec = seconds(); 137 | return (sec == 0.0) ? 0.0 : (double)(curBytes)/(BILLION*sec); 138 | } 139 | 140 | // 141 | // Returns the bandwidth of the last run in GiB/s 142 | // 143 | double cuttTimer::GiBs() { 144 | const double iBILLION = 1073741824.0; 145 | double sec = seconds(); 146 | return (sec == 0.0) ? 0.0 : (double)(curBytes)/(iBILLION*sec); 147 | } 148 | 149 | // 150 | // Returns the best performing tensor transpose for rank 151 | // 152 | double cuttTimer::getBest(int rank) { 153 | auto it = stats.find(rank); 154 | if (it == stats.end()) return 0.0; 155 | Stat& stat = it->second; 156 | return stat.maxBW; 157 | } 158 | 159 | // 160 | // Returns the worst performing tensor transpose for rank 161 | // 162 | double cuttTimer::getWorst(int rank) { 163 | auto it = stats.find(rank); 164 | if (it == stats.end()) return 0.0; 165 | Stat& stat = it->second; 166 | return stat.minBW; 167 | } 168 | 169 | // 170 | // Returns the worst performing tensor transpose for rank 171 | // 172 | double cuttTimer::getWorst(int rank, std::vector& dim, std::vector& permutation) { 173 | auto it = stats.find(rank); 174 | if (it == stats.end()) return 0.0; 175 | Stat& stat = it->second; 176 | dim = stat.worstDim; 177 | permutation = stat.worstPermutation; 178 | return stat.minBW; 179 | } 180 | 181 | // 182 | // Returns the median bandwidth for rank 183 | // 184 | double cuttTimer::getMedian(int rank) { 185 | auto it = stats.find(rank); 186 | if (it == stats.end()) return 0.0; 187 | Stat& stat = it->second; 188 | if (stat.BW.size() == 0) return 0.0; 189 | // Set middle element in to correct position 190 | std::nth_element(stat.BW.begin(), stat.BW.begin() + stat.BW.size()/2, stat.BW.end()); 191 | double median = stat.BW[stat.BW.size()/2]; 192 | if (stat.BW.size() % 2 == 0) { 193 | // For even number of elements, set middle - 1 element in to correct position 194 | // and take average 195 | std::nth_element(stat.BW.begin(), stat.BW.begin() + stat.BW.size()/2 - 1, stat.BW.end()); 196 | median += stat.BW[stat.BW.size()/2 - 1]; 197 | median *= 0.5; 198 | } 199 | return median; 200 | } 201 | 202 | // 203 | // Returns the average bandwidth for rank 204 | // 205 | double cuttTimer::getAverage(int rank) { 206 | auto it = stats.find(rank); 207 | if (it == stats.end()) return 0.0; 208 | Stat& stat = it->second; 209 | return stat.totBW/(double)stat.BW.size(); 210 | } 211 | 212 | // 213 | // Returns all data for rank 214 | // 215 | std::vector cuttTimer::getData(int rank) { 216 | std::vector res; 217 | auto it = stats.find(rank); 218 | if (it != stats.end()) { 219 | Stat& stat = it->second; 220 | res = stat.BW; 221 | } 222 | return res; 223 | } 224 | 225 | // 226 | // Returns the worst performing tensor transpose of all 227 | // 228 | double cuttTimer::getWorst(std::vector& dim, std::vector& permutation) { 229 | double worstBW = 1.0e20; 230 | int worstRank = 0; 231 | for (auto it=ranks.begin(); it != ranks.end(); it++) { 232 | double bw = stats.find(*it)->second.minBW; 233 | if (worstBW > bw) { 234 | worstRank = *it; 235 | worstBW = bw; 236 | } 237 | } 238 | if (worstRank == 0) { 239 | dim.resize(0); 240 | permutation.resize(0); 241 | return 0.0; 242 | } 243 | dim = stats.find(worstRank)->second.worstDim; 244 | permutation = stats.find(worstRank)->second.worstPermutation; 245 | return worstBW; 246 | } 247 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | cuTT - CUDA Tensor 
Transpose 2 | ============================ 3 | 4 | cuTT is a high-performance tensor transpose library for NVIDIA GPUs. It works with Kepler (SM 3.0) and later GPUs. 5 | 6 | Copyright (c) 2016 Antti-Pekka Hynninen 7 | 8 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 9 | 10 | Version 1.1 11 | 12 | Installation 13 | ============ 14 | 15 | Software requirements: 16 | * C++ compiler with C++11 compatibility 17 | * CUDA compiler 18 | 19 | Hardware requirements: 20 | * Kepler (SM 3.0) or above NVIDIA GPU 21 | 22 | To compile the cuTT library as well as the test cases and benchmarks, simply do 23 | 24 | make 25 | 26 | This will create the library itself: 27 | 28 | * include/cutt.h 29 | * lib/libcutt.a 30 | 31 | as well as the tests and benchmarks 32 | 33 | * bin/cutt_test 34 | * bin/cutt_bench 35 | 36 | In order to use cuTT, you only need the include (include/cutt.h) and the library (lib/libcutt.a) files. 37 | 38 | Running tests and benchmarks 39 | ============================ 40 | 41 | Test and benchmark executables are in the bin/ directory and can be run without any options. 42 | An option to the test executable lets you choose the device ID on which to run: 43 | 44 | cutt_test [options] 45 | Options: 46 | -device gpuid : use GPU with ID gpuid 47 | 48 | For the benchmark executable, there is an additional option that lets you run the benchmarks using 49 | plans that are chosen optimally by measuring the performance of every possible implementation and 50 | choosing the best one. 51 | 52 | cutt_bench [options] 53 | Options: 54 | -device gpuid : use GPU with ID gpuid 55 | -measure : use cuttPlanMeasure (default is cuttPlan) 56 | 57 | Performance 58 | =========== 59 | 60 | cuTT was designed with performance as the main goal. Here are performance benchmarks for a random set of tensors with 200M `double` elements and ranks 2 to 7. The benchmarks were run with the measurement flag on 61 | (cutt_bench -measure) 62 | 63 | ![k20x](https://raw.githubusercontent.com/ap-hynninen/cutt/master/doc/k20x_bench.png) 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | Usage 72 | ===== 73 | 74 | cuTT uses a "plan structure" similar to the FFTW and cuFFT libraries, where the 75 | user first creates a plan for the transpose and then executes that plan. 76 | Here is an example: 77 | 78 | ```c++ 79 | #include <cutt.h> 80 | 81 | // 82 | // Error checking wrapper for cutt 83 | // 84 | #define cuttCheck(stmt) do { \ 85 | cuttResult err = stmt; \ 86 | if (err != CUTT_SUCCESS) { \ 87 | fprintf(stderr, "%s in file %s, function %s\n", #stmt,__FILE__,__FUNCTION__); \ 88 | exit(1); \ 89 | } \ 90 | } while(0) 91 | 92 | int main() { 93 | 94 | // Four dimensional tensor 95 | // Transpose (31, 549, 2, 3) -> (3, 31, 2, 549) 96 | int dim[4] = {31, 549, 2, 3}; 97 | int permutation[4] = {3, 0, 2, 1}; 98 | 99 | // ... input and output data is set up here ... 100 | // double* idata : size product(dim) 101 | // double* odata : size product(dim) 102 | 103 | // Option 1: Create plan on NULL stream and choose implementation based on heuristics 104 | cuttHandle plan; 105 | cuttCheck(cuttPlan(&plan, 4, dim, permutation, sizeof(double), 0)); 106 | 107 | // Option 2: Create plan on NULL stream and choose implementation based on performance measurements 108 | // cuttCheck(cuttPlanMeasure(&plan, 4, dim, permutation, sizeof(double), 0, idata, odata)); 109 | 110 | // Execute plan 111 | cuttCheck(cuttExecute(plan, idata, odata)); 112 | 113 | // ... do stuff with your output and deallocate data ... 
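  // (The same plan can be reused for any number of transposes of tensors with
  //  this shape and permutation; destroy it only when it is no longer needed.)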
114 | 115 | // Destroy plan 116 | cuttCheck(cuttDestroy(plan)); 117 | 118 | return 0; 119 | } 120 | ``` 121 | 122 | Input (idata) and output (odata) data are both in GPU memory and must point to different 123 | memory areas for correct operation. That is, cuTT only currently supports out-of-place 124 | transposes. Note that using Option 2 to create the plan can take up some time especially 125 | for high-rank tensors. 126 | 127 | cuTT API 128 | ======== 129 | 130 | ```c++ 131 | // 132 | // Create plan 133 | // 134 | // Parameters 135 | // handle = Returned handle to cuTT plan 136 | // rank = Rank of the tensor 137 | // dim[rank] = Dimensions of the tensor 138 | // permutation[rank] = Transpose permutation 139 | // sizeofType = Size of the elements of the tensor in bytes (=4 or 8) 140 | // stream = CUDA stream (0 if no stream is used) 141 | // 142 | // Returns 143 | // Success/unsuccess code 144 | // 145 | cuttResult cuttPlan(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 146 | cudaStream_t stream); 147 | 148 | // 149 | // Create plan and choose implementation by measuring performance 150 | // 151 | // Parameters 152 | // handle = Returned handle to cuTT plan 153 | // rank = Rank of the tensor 154 | // dim[rank] = Dimensions of the tensor 155 | // permutation[rank] = Transpose permutation 156 | // sizeofType = Size of the elements of the tensor in bytes (=4 or 8) 157 | // stream = CUDA stream (0 if no stream is used) 158 | // idata = Input data size product(dim) 159 | // odata = Output data size product(dim) 160 | // 161 | // Returns 162 | // Success/unsuccess code 163 | // 164 | cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 165 | cudaStream_t stream, void* idata, void* odata); 166 | 167 | // 168 | // Destroy plan 169 | // 170 | // Parameters 171 | // handle = Handle to the cuTT plan 172 | // 173 | // Returns 174 | // Success/unsuccess code 175 | // 176 | cuttResult cuttDestroy(cuttHandle handle); 177 | 178 | // 179 | // Execute plan out-of-place 180 | // 181 | // Parameters 182 | // handle = Returned handle to cuTT plan 183 | // idata = Input data size product(dim) 184 | // odata = Output data size product(dim) 185 | // 186 | // Returns 187 | // Success/unsuccess code 188 | // 189 | cuttResult cuttExecute(cuttHandle handle, void* idata, void* odata); 190 | ``` 191 | 192 | KNOWN BUGS 193 | ========== 194 | * Benchmarks sometime fail due to the stupid algorithm I have now to create 195 | random tensors with fixed volume. 196 | 197 | TODO 198 | ==== 199 | * Make "tiled" method work with sets of ranks (where ranks in M_m and M_k remain in same order) 200 | 201 | Licence 202 | ======= 203 | 204 | MIT License 205 | 206 | Copyright (c) 2016 Antti-Pekka Hynninen 207 | 208 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 209 | 210 | Permission is hereby granted, free of charge, to any person obtaining a copy 211 | of this software and associated documentation files (the "Software"), to deal 212 | in the Software without restriction, including without limitation the rights 213 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 214 | copies of the Software, and to permit persons to whom the Software is 215 | furnished to do so, subject to the following conditions: 216 | 217 | The above copyright notice and this permission notice shall be included in all 218 | copies or substantial portions of the Software. 
219 | 220 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 221 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 222 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 223 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 224 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 225 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 226 | SOFTWARE. 227 | -------------------------------------------------------------------------------- /src/TensorTester.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | 26 | // 27 | // Testing utilities 28 | // 29 | #include 30 | #include "CudaUtils.h" 31 | #include "TensorTester.h" 32 | 33 | __global__ void setTensorCheckPatternKernel(unsigned int* data, unsigned int ndata) { 34 | for (unsigned int i = threadIdx.x + blockIdx.x*blockDim.x;i < ndata;i += blockDim.x*gridDim.x) { 35 | data[i] = i; 36 | } 37 | } 38 | 39 | template 40 | __global__ void checkTransposeKernel(T* data, unsigned int ndata, int rank, TensorConv* glTensorConv, 41 | TensorError_t* glError, int* glFail) { 42 | 43 | extern __shared__ unsigned int shPos[]; 44 | 45 | const int warpLane = threadIdx.x & (warpSize - 1); 46 | TensorConv tc; 47 | if (warpLane < rank) { 48 | tc = glTensorConv[warpLane]; 49 | } 50 | 51 | TensorError_t error; 52 | error.pos = 0xffffffff; 53 | error.refVal = 0; 54 | error.dataVal = 0; 55 | 56 | for (int base = blockIdx.x*blockDim.x;base < ndata;base += blockDim.x*gridDim.x) { 57 | int i = base + threadIdx.x; 58 | T dataValT = (i < ndata) ? 
data[i] : -1; 59 | int refVal = 0; 60 | for (int j=0;j < rank;j++) { 61 | refVal += ((i/__shfl_sync(FULL_MASK, tc.c,j)) % __shfl_sync(FULL_MASK, tc.d,j))*__shfl_sync(FULL_MASK, tc.ct,j); 62 | } 63 | 64 | int dataVal = (dataValT & 0xffffffff)/(sizeof(T)/4); 65 | 66 | if (i < ndata && refVal != dataVal && i < error.pos) { 67 | error.pos = i; 68 | error.refVal = refVal; 69 | error.dataVal = dataVal; 70 | } 71 | } 72 | 73 | // Set FAIL flag 74 | if (error.pos != 0xffffffff) { 75 | // printf("error %d %d %d\n", error.pos, error.refVal, error.dataVal); 76 | *glFail = 1; 77 | } 78 | 79 | shPos[threadIdx.x] = error.pos; 80 | __syncthreads(); 81 | for (int d=1;d < blockDim.x;d *= 2) { 82 | int t = threadIdx.x + d; 83 | unsigned int posval = (t < blockDim.x) ? shPos[t] : 0xffffffff; 84 | __syncthreads(); 85 | shPos[threadIdx.x] = min(posval, shPos[threadIdx.x]); 86 | __syncthreads(); 87 | } 88 | // Minimum error.pos is in shPos[0] (or 0xffffffff in case of no error) 89 | 90 | if (shPos[0] != 0xffffffff && shPos[0] == error.pos) { 91 | // Error has occured and this thread has the minimum error.pos 92 | // printf("BOO error %d %d %d | %d\n", error.pos, error.refVal, error.dataVal, blockIdx.x); 93 | glError[blockIdx.x] = error; 94 | } 95 | 96 | } 97 | 98 | // ################################################################################ 99 | // ################################################################################ 100 | // ################################################################################ 101 | 102 | // 103 | // Class constructor 104 | // 105 | TensorTester::TensorTester() : maxRank(32), maxNumblock(256) { 106 | h_tensorConv = new TensorConv[maxRank]; 107 | h_error = new TensorError_t[maxNumblock]; 108 | allocate_device(&d_tensorConv, maxRank); 109 | allocate_device(&d_error, maxNumblock); 110 | allocate_device(&d_fail, 1); 111 | } 112 | 113 | // 114 | // Class destructor 115 | // 116 | TensorTester::~TensorTester() { 117 | delete [] h_tensorConv; 118 | delete [] h_error; 119 | deallocate_device(&d_tensorConv); 120 | deallocate_device(&d_error); 121 | deallocate_device(&d_fail); 122 | } 123 | 124 | void TensorTester::setTensorCheckPattern(unsigned int* data, unsigned int ndata) { 125 | int numthread = 512; 126 | int numblock = min(65535, (ndata - 1)/numthread + 1 ); 127 | setTensorCheckPatternKernel<<< numblock, numthread >>>(data, ndata); 128 | cudaCheck(cudaGetLastError()); 129 | } 130 | 131 | // void calcTensorConv(const int rank, const int* dim, const int* permutation, 132 | // TensorConv* tensorConv) { 133 | 134 | // tensorConv[0].c = 1; 135 | // tensorConv[0].d = dim[0]; 136 | // tensorConv[permutation[0]].ct = 1; 137 | // int ct_prev = 1; 138 | // for (int i=1;i < rank;i++) { 139 | // tensorConv[i].c = tensorConv[i-1].c*dim[i-1]; 140 | // tensorConv[i].d = dim[i]; 141 | // int ct = ct_prev*dim[permutation[i-1]]; 142 | // tensorConv[permutation[i]].ct = ct; 143 | // ct_prev = ct; 144 | // } 145 | 146 | // } 147 | 148 | // 149 | // Calculates tensor conversion constants. 
Returns total volume of tensor 150 | // 151 | int TensorTester::calcTensorConv(const int rank, const int* dim, const int* permutation, 152 | TensorConv* tensorConv) { 153 | 154 | int vol = dim[0]; 155 | 156 | tensorConv[permutation[0]].c = 1; 157 | tensorConv[0].ct = 1; 158 | tensorConv[0].d = dim[0]; 159 | for (int i=1;i < rank;i++) { 160 | vol *= dim[i]; 161 | 162 | tensorConv[permutation[i]].c = tensorConv[permutation[i-1]].c*dim[permutation[i-1]]; 163 | 164 | tensorConv[i].d = dim[i]; 165 | tensorConv[i].ct = tensorConv[i-1].ct*dim[i-1]; 166 | 167 | } 168 | 169 | return vol; 170 | } 171 | 172 | template bool TensorTester::checkTranspose(int rank, int* dim, int* permutation, T* data) { 173 | 174 | if (rank > 32) { 175 | return false; 176 | } 177 | 178 | int ndata = calcTensorConv(rank, dim, permutation, h_tensorConv); 179 | copy_HtoD(h_tensorConv, d_tensorConv, rank); 180 | 181 | // printf("tensorConv\n"); 182 | // for (int i=0;i < rank;i++) { 183 | // printf("%d %d %d\n", h_tensorConv[i].c, h_tensorConv[i].d, h_tensorConv[i].ct); 184 | // } 185 | 186 | set_device_array(d_error, 0, maxNumblock); 187 | set_device_array(d_fail, 0, 1); 188 | 189 | int numthread = 512; 190 | int numblock = min(maxNumblock, (ndata - 1)/numthread + 1 ); 191 | int shmemsize = numthread*sizeof(unsigned int); 192 | checkTransposeKernel<<< numblock, numthread, shmemsize >>>(data, ndata, rank, d_tensorConv, d_error, d_fail); 193 | cudaCheck(cudaGetLastError()); 194 | 195 | int h_fail; 196 | copy_DtoH(d_fail, &h_fail, 1); 197 | cudaCheck(cudaDeviceSynchronize()); 198 | 199 | if (h_fail) { 200 | copy_DtoH_sync(d_error, h_error, maxNumblock); 201 | TensorError_t error; 202 | error.pos = 0x0fffffff; 203 | for (int i=0;i < numblock;i++) { 204 | // printf("%d %d %d\n", error.pos, error.refVal, error.dataVal); 205 | if (h_error[i].refVal != h_error[i].dataVal && error.pos > h_error[i].pos) { 206 | error = h_error[i]; 207 | } 208 | } 209 | printf("TensorTester::checkTranspose FAIL at %d ref %d data %d\n", error.pos, error.refVal, error.dataVal); 210 | return false; 211 | } 212 | 213 | return true; 214 | } 215 | 216 | // Explicit instances 217 | template bool TensorTester::checkTranspose(int rank, int* dim, int* permutation, int* data); 218 | template bool TensorTester::checkTranspose(int rank, int* dim, int* permutation, long long int* data); 219 | -------------------------------------------------------------------------------- /src/int_vector.h: -------------------------------------------------------------------------------- 1 | #ifndef INT_VECTOR_H 2 | #define INT_VECTOR_H 3 | 4 | // Intel: Minimum SSE2 required for vectorization. 5 | // SSE can't be used because it does not support integer operations. 
SSE defaults to scalar 6 | 7 | #if defined(__SSE2__) 8 | // Intel x86 9 | #include 10 | 11 | #if defined(__AVX2__) 12 | #define USE_AVX 13 | const int INT_VECTOR_LEN = 8; 14 | // #include 15 | const char INT_VECTOR_TYPE[] = "AVX2"; 16 | #else 17 | #define USE_SSE 18 | const int INT_VECTOR_LEN = 4; 19 | const char INT_VECTOR_TYPE[] = "SSE2"; 20 | #endif 21 | 22 | #elif defined(__ALTIVEC__) // #if defined(__SSE2__) 23 | #define USE_ALTIVEC 24 | // IBM altivec 25 | #include 26 | #undef bool 27 | const int INT_VECTOR_LEN = 4; 28 | const char INT_VECTOR_TYPE[] = "ALTIVEC"; 29 | 30 | #else // #if defined(__SSE2__) 31 | // Nothing 32 | const int INT_VECTOR_LEN = 1; 33 | const char INT_VECTOR_TYPE[] = "SCALAR"; 34 | #endif 35 | 36 | // 37 | // Integer vector class for Intel and IBM CPU platforms 38 | // 39 | class int_vector { 40 | private: 41 | 42 | #if defined(USE_AVX) 43 | __m256i x; 44 | #elif defined(USE_SSE) 45 | __m128i x; 46 | #elif defined(USE_ALTIVEC) 47 | vector signed int x; 48 | #else 49 | int x; 50 | #endif 51 | 52 | public: 53 | 54 | inline int_vector() { 55 | } 56 | 57 | inline int_vector(const int a) { 58 | #if defined(USE_AVX) 59 | x = _mm256_set1_epi32(a); 60 | #elif defined(USE_SSE) 61 | x = _mm_set1_epi32(a); 62 | #elif defined(USE_ALTIVEC) 63 | x = (vector signed int){a, a, a, a}; 64 | #else 65 | x = a; 66 | #endif 67 | } 68 | 69 | inline int_vector(const int a[]) { 70 | #if defined(USE_AVX) 71 | x = _mm256_set_epi32(a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]); 72 | #elif defined(USE_SSE) 73 | x = _mm_set_epi32(a[3], a[2], a[1], a[0]); 74 | #elif defined(USE_ALTIVEC) 75 | x = vec_ld(0, a); 76 | #else 77 | x = a[0]; 78 | #endif 79 | } 80 | 81 | #if defined(USE_AVX) 82 | inline int_vector(const __m256i ax) { 83 | x = ax; 84 | } 85 | #elif defined(USE_SSE) 86 | inline int_vector(const __m128i ax) { 87 | x = ax; 88 | } 89 | #elif defined(USE_ALTIVEC) 90 | inline int_vector(const vector signed int ax) { 91 | x = ax; 92 | } 93 | #endif 94 | 95 | // 96 | // Member functions 97 | // 98 | 99 | inline int_vector operator+=(const int_vector a) { 100 | #if defined(USE_AVX) 101 | x = _mm256_add_epi32(x, a.x); 102 | #elif defined(USE_SSE) 103 | x = _mm_add_epi32(x, a.x); 104 | #elif defined(USE_ALTIVEC) 105 | x += a.x; 106 | #else 107 | x += a.x; 108 | #endif 109 | return *this; 110 | } 111 | 112 | inline int_vector operator-=(const int_vector a) { 113 | #if defined(USE_AVX) 114 | x = _mm256_sub_epi32(x, a.x); 115 | #elif defined(USE_SSE) 116 | x = _mm_sub_epi32(x, a.x); 117 | #elif defined(USE_ALTIVEC) 118 | x -= a.x; 119 | #else 120 | x -= a.x; 121 | #endif 122 | return *this; 123 | } 124 | 125 | inline int_vector operator&=(const int_vector a) { 126 | #if defined(USE_AVX) 127 | x = _mm256_and_si256(x, a.x); 128 | #elif defined(USE_SSE) 129 | x = _mm_and_si128(x, a.x); 130 | #elif defined(USE_ALTIVEC) 131 | x &= a.x; 132 | #else 133 | x &= a.x; 134 | #endif 135 | return *this; 136 | } 137 | 138 | inline int_vector operator|=(const int_vector a) { 139 | #if defined(USE_AVX) 140 | x = _mm256_or_si256(x, a.x); 141 | #elif defined(USE_SSE) 142 | x = _mm_or_si128(x, a.x); 143 | #elif defined(USE_ALTIVEC) 144 | x |= a.x; 145 | #else 146 | x |= a.x; 147 | #endif 148 | return *this; 149 | } 150 | 151 | inline int_vector operator~() { 152 | #if defined(USE_AVX) 153 | int_vector fullmask = int_vector(-1); 154 | return int_vector( _mm256_andnot_si256(x, fullmask.x) ); 155 | #elif defined(USE_SSE) 156 | int_vector fullmask = int_vector(-1); 157 | return int_vector( _mm_andnot_si128(x, 
fullmask.x) ); 158 | #elif defined(USE_ALTIVEC) 159 | return int_vector( ~x ); 160 | #else 161 | return ~x; 162 | #endif 163 | } 164 | 165 | // Sign extended shift by a constant. 166 | // Note: 0 <= n <= 31. Otherwise results are unpredictable 167 | inline int_vector operator>>=(const int n) { 168 | #if defined(USE_AVX) 169 | x = _mm256_srai_epi32(x, n); 170 | #elif defined(USE_SSE) 171 | x = _mm_srai_epi32(x, n); 172 | #elif defined(USE_ALTIVEC) 173 | x >>= n; 174 | #else 175 | x >>= n; 176 | #endif 177 | return *this; 178 | } 179 | 180 | // Sign extended shift by a constant 181 | // Note: 0 <= n <= 31. Otherwise results are unpredictable 182 | inline int_vector operator<<=(const int n) { 183 | #if defined(USE_AVX) 184 | x = _mm256_slli_epi32(x, n); 185 | #elif defined(USE_SSE) 186 | x = _mm_slli_epi32(x, n); 187 | #elif defined(USE_ALTIVEC) 188 | x <<= n; 189 | #else 190 | x <<= n; 191 | #endif 192 | return *this; 193 | } 194 | 195 | // Copy contest to int array 196 | void copy(int* a) const { 197 | #if defined(USE_AVX) 198 | _mm256_storeu_si256((__m256i *)a, x); 199 | #elif defined(USE_SSE) 200 | _mm_storeu_si128((__m128i *)a, x); 201 | #elif defined(USE_ALTIVEC) 202 | // void vec_stl (vector signed int, int, int *); 203 | vec_stl(x, 0, a); 204 | #else 205 | a[0] = x; 206 | #endif 207 | } 208 | 209 | // 210 | // Non-member functions 211 | // 212 | 213 | inline friend int_vector operator+(int_vector a, const int_vector b) { 214 | a += b; 215 | return a; 216 | } 217 | 218 | inline friend int_vector operator-(int_vector a, const int_vector b) { 219 | a -= b; 220 | return a; 221 | } 222 | 223 | inline friend int_vector operator&(int_vector a, const int_vector b) { 224 | a &= b; 225 | return a; 226 | } 227 | 228 | inline friend int_vector operator|(int_vector a, const int_vector b) { 229 | a |= b; 230 | return a; 231 | } 232 | 233 | inline friend int_vector operator>>(int_vector a, const int n) { 234 | a >>= n; 235 | return a; 236 | } 237 | 238 | inline friend int_vector operator<<(int_vector a, const int n) { 239 | a <<= n; 240 | return a; 241 | } 242 | 243 | // Returns 0xffffffff = -1 on the vector elements that are equal 244 | inline friend int_vector eq_mask(const int_vector a, const int_vector b) { 245 | #if defined(USE_AVX) 246 | return int_vector(_mm256_cmpeq_epi32(a.x, b.x)); 247 | #elif defined(USE_SSE) 248 | return int_vector(_mm_cmpeq_epi32(a.x, b.x)); 249 | #elif defined(USE_ALTIVEC) 250 | return int_vector(a.x == b.x); 251 | #else 252 | return int_vector((a.x == b.x)*(-1)); 253 | #endif 254 | } 255 | 256 | inline friend int_vector neq_mask(const int_vector a, const int_vector b) { 257 | return ~eq_mask(a, b); 258 | } 259 | 260 | // 0xffffffff => 1 261 | inline friend int_vector mask_to_bool(const int_vector a) { 262 | #if defined(USE_AVX) 263 | return int_vector(_mm256_srli_epi32(a.x, 31)); 264 | #elif defined(USE_SSE) 265 | return int_vector(_mm_srli_epi32(a.x, 31)); 266 | #elif defined(USE_ALTIVEC) 267 | return int_vector((vector signed int)((vector unsigned int)a.x >> 31)); 268 | #else 269 | return ((unsigned int)a.x >> 31); 270 | #endif 271 | } 272 | 273 | inline friend int_vector operator==(const int_vector a, const int_vector b) { 274 | return mask_to_bool(eq_mask(a, b)); 275 | } 276 | 277 | inline friend int_vector operator!=(const int_vector a, const int_vector b) { 278 | return mask_to_bool(neq_mask(a, b)); 279 | } 280 | 281 | // 1 => 0xffffffff 282 | inline friend int_vector bool_to_mask(const int_vector a) { 283 | #if defined(USE_AVX) 284 | return neq_mask(a, 
int_vector(0)); 285 | #elif defined(USE_SSE) 286 | return neq_mask(a, int_vector(0)); 287 | #elif defined(USE_ALTIVEC) 288 | return neq_mask(a, int_vector(0)); 289 | #else 290 | return (a ? -1 : 0); 291 | #endif 292 | } 293 | 294 | // Implicit type conversion 295 | // Returns true if any of the elements are != 0 296 | operator bool() const { 297 | #if defined(USE_AVX) 298 | int_vector a = neq_mask(*this, int_vector(0)); 299 | return (_mm256_movemask_epi8(a.x) != 0); 300 | #elif defined(USE_SSE) 301 | int_vector a = neq_mask(*this, int_vector(0)); 302 | return (_mm_movemask_epi8(a.x) != 0); 303 | #elif defined(USE_ALTIVEC) 304 | return vec_any_ne(x, ((const vector signed int){0, 0, 0, 0})); 305 | #else 306 | return x; 307 | #endif 308 | } 309 | 310 | // 311 | // Helper functions 312 | // 313 | void print() { 314 | int vec[INT_VECTOR_LEN]; 315 | this->copy(vec); 316 | for (int i=0;i < INT_VECTOR_LEN;i++) { 317 | printf("%d ", vec[i]); 318 | } 319 | } 320 | 321 | }; 322 | 323 | 324 | #if defined(USE_ALTIVEC) 325 | #undef vector 326 | #undef pixel 327 | #endif 328 | 329 | #endif // INT_VECTOR_H 330 | -------------------------------------------------------------------------------- /src/cutt.cpp: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | #include 26 | #include 27 | #include 28 | #include "CudaUtils.h" 29 | #include "cuttplan.h" 30 | #include "cuttkernel.h" 31 | #include "cuttTimer.h" 32 | #include "cutt.h" 33 | // #include 34 | 35 | // Hash table to store the plans 36 | static std::unordered_map< cuttHandle, cuttPlan_t* > planStorage; 37 | 38 | // Current handle 39 | static cuttHandle curHandle = 0; 40 | 41 | // Table of devices that have been initialized 42 | static std::unordered_map deviceProps; 43 | 44 | // Checks prepares device if it's not ready yet and returns device properties 45 | // Also sets shared memory configuration 46 | void getDeviceProp(int& deviceID, cudaDeviceProp &prop) { 47 | cudaCheck(cudaGetDevice(&deviceID)); 48 | auto it = deviceProps.find(deviceID); 49 | if (it == deviceProps.end()) { 50 | // Get device properties and store it for later use 51 | cudaCheck(cudaGetDeviceProperties(&prop, deviceID)); 52 | cuttKernelSetSharedMemConfig(); 53 | deviceProps.insert({deviceID, prop}); 54 | } else { 55 | prop = it->second; 56 | } 57 | } 58 | 59 | cuttResult cuttPlanCheckInput(int rank, int* dim, int* permutation, size_t sizeofType) { 60 | // Check sizeofType 61 | if (sizeofType != 4 && sizeofType != 8) return CUTT_INVALID_PARAMETER; 62 | // Check rank 63 | if (rank <= 1) return CUTT_INVALID_PARAMETER; 64 | // Check dim[] 65 | for (int i=0;i < rank;i++) { 66 | if (dim[i] <= 1) return CUTT_INVALID_PARAMETER; 67 | } 68 | // Check permutation 69 | bool permutation_fail = false; 70 | int* check = new int[rank]; 71 | for (int i=0;i < rank;i++) check[i] = 0; 72 | for (int i=0;i < rank;i++) { 73 | if (permutation[i] < 0 || permutation[i] >= rank || check[permutation[i]]++) { 74 | permutation_fail = true; 75 | break; 76 | } 77 | } 78 | delete [] check; 79 | if (permutation_fail) return CUTT_INVALID_PARAMETER; 80 | 81 | return CUTT_SUCCESS; 82 | } 83 | 84 | cuttResult cuttPlan(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 85 | cudaStream_t stream) { 86 | 87 | #ifdef ENABLE_NVTOOLS 88 | gpuRangeStart("init"); 89 | #endif 90 | 91 | // Check that input parameters are valid 92 | cuttResult inpCheck = cuttPlanCheckInput(rank, dim, permutation, sizeofType); 93 | if (inpCheck != CUTT_SUCCESS) return inpCheck; 94 | 95 | // Create new handle 96 | *handle = curHandle; 97 | curHandle++; 98 | 99 | // Check that the current handle is available (it better be!) 
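  // (Handles are assigned from the monotonically increasing curHandle counter
  //  above, so this lookup should never find an existing entry.)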
100 | if (planStorage.count(*handle) != 0) return CUTT_INTERNAL_ERROR; 101 | 102 | // Prepare device 103 | int deviceID; 104 | cudaDeviceProp prop; 105 | getDeviceProp(deviceID, prop); 106 | 107 | // Reduce ranks 108 | std::vector redDim; 109 | std::vector redPermutation; 110 | reduceRanks(rank, dim, permutation, redDim, redPermutation); 111 | 112 | // Create plans from reduced ranks 113 | std::list plans; 114 | // if (rank != redDim.size()) { 115 | // if (!createPlans(redDim.size(), redDim.data(), redPermutation.data(), sizeofType, prop, plans)) return CUTT_INTERNAL_ERROR; 116 | // } 117 | 118 | // // Create plans from non-reduced ranks 119 | // if (!createPlans(rank, dim, permutation, sizeofType, prop, plans)) return CUTT_INTERNAL_ERROR; 120 | 121 | #if 0 122 | if (!cuttKernelDatabase(deviceID, prop)) return CUTT_INTERNAL_ERROR; 123 | #endif 124 | 125 | #ifdef ENABLE_NVTOOLS 126 | gpuRangeStop(); 127 | gpuRangeStart("createPlans"); 128 | #endif 129 | 130 | // std::chrono::high_resolution_clock::time_point plan_start; 131 | // plan_start = std::chrono::high_resolution_clock::now(); 132 | 133 | if (!cuttPlan_t::createPlans(rank, dim, permutation, redDim.size(), redDim.data(), redPermutation.data(), 134 | sizeofType, deviceID, prop, plans)) return CUTT_INTERNAL_ERROR; 135 | 136 | // std::chrono::high_resolution_clock::time_point plan_end; 137 | // plan_end = std::chrono::high_resolution_clock::now(); 138 | // double plan_duration = std::chrono::duration_cast< std::chrono::duration >(plan_end - plan_start).count(); 139 | // printf("createPlans took %lf ms\n", plan_duration*1000.0); 140 | 141 | #ifdef ENABLE_NVTOOLS 142 | gpuRangeStop(); 143 | gpuRangeStart("countCycles"); 144 | #endif 145 | 146 | // Count cycles 147 | for (auto it=plans.begin();it != plans.end();it++) { 148 | if (!it->countCycles(prop, 10)) return CUTT_INTERNAL_ERROR; 149 | } 150 | 151 | #ifdef ENABLE_NVTOOLS 152 | gpuRangeStop(); 153 | gpuRangeStart("rest"); 154 | #endif 155 | 156 | // Choose the plan 157 | std::list::iterator bestPlan = choosePlanHeuristic(plans); 158 | if (bestPlan == plans.end()) return CUTT_INTERNAL_ERROR; 159 | 160 | // bestPlan->print(); 161 | 162 | // Create copy of the plan outside the list 163 | cuttPlan_t* plan = new cuttPlan_t(); 164 | // NOTE: No deep copy needed here since device memory hasn't been allocated yet 165 | *plan = *bestPlan; 166 | // Set device pointers to NULL in the old copy of the plan so 167 | // that they won't be deallocated later when the object is destroyed 168 | bestPlan->nullDevicePointers(); 169 | 170 | // Set stream 171 | plan->setStream(stream); 172 | 173 | // Activate plan 174 | plan->activate(); 175 | 176 | // Insert plan into storage 177 | planStorage.insert( {*handle, plan} ); 178 | 179 | #ifdef ENABLE_NVTOOLS 180 | gpuRangeStop(); 181 | #endif 182 | 183 | return CUTT_SUCCESS; 184 | } 185 | 186 | cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutation, size_t sizeofType, 187 | cudaStream_t stream, void* idata, void* odata) { 188 | 189 | // Check that input parameters are valid 190 | cuttResult inpCheck = cuttPlanCheckInput(rank, dim, permutation, sizeofType); 191 | if (inpCheck != CUTT_SUCCESS) return inpCheck; 192 | 193 | if (idata == odata) return CUTT_INVALID_PARAMETER; 194 | 195 | // Create new handle 196 | *handle = curHandle; 197 | curHandle++; 198 | 199 | // Check that the current handle is available (it better be!) 
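  // (From here on this mirrors cuttPlan(), except that every candidate plan is
  //  timed below on the user-supplied idata/odata and the fastest one is kept,
  //  instead of relying on the cycle-count heuristic.)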
200 | if (planStorage.count(*handle) != 0) return CUTT_INTERNAL_ERROR; 201 | 202 | // Prepare device 203 | int deviceID; 204 | cudaDeviceProp prop; 205 | getDeviceProp(deviceID, prop); 206 | 207 | // Reduce ranks 208 | std::vector redDim; 209 | std::vector redPermutation; 210 | reduceRanks(rank, dim, permutation, redDim, redPermutation); 211 | 212 | // Create plans from reduced ranks 213 | std::list plans; 214 | #if 0 215 | // if (rank != redDim.size()) { 216 | if (!createPlans(redDim.size(), redDim.data(), redPermutation.data(), sizeofType, prop, plans)) return CUTT_INTERNAL_ERROR; 217 | // } 218 | 219 | // Create plans from non-reduced ranks 220 | // if (!createPlans(rank, dim, permutation, sizeofType, prop, plans)) return CUTT_INTERNAL_ERROR; 221 | #else 222 | if (!cuttPlan_t::createPlans(rank, dim, permutation, redDim.size(), redDim.data(), redPermutation.data(), 223 | sizeofType, deviceID, prop, plans)) return CUTT_INTERNAL_ERROR; 224 | #endif 225 | 226 | // // Count cycles 227 | // for (auto it=plans.begin();it != plans.end();it++) { 228 | // if (!it->countCycles(prop, 10)) return CUTT_INTERNAL_ERROR; 229 | // } 230 | 231 | // // Count the number of elements 232 | size_t numBytes = sizeofType; 233 | for (int i=0;i < rank;i++) numBytes *= dim[i]; 234 | 235 | // Choose the plan 236 | double bestTime = 1.0e40; 237 | auto bestPlan = plans.end(); 238 | Timer timer; 239 | std::vector times; 240 | for (auto it=plans.begin();it != plans.end();it++) { 241 | // Activate plan 242 | it->activate(); 243 | // Clear output data to invalidate caches 244 | set_device_array((char *)odata, -1, numBytes); 245 | cudaCheck(cudaDeviceSynchronize()); 246 | timer.start(); 247 | // Execute plan 248 | if (!cuttKernel(*it, idata, odata)) return CUTT_INTERNAL_ERROR; 249 | timer.stop(); 250 | double curTime = timer.seconds(); 251 | // it->print(); 252 | // printf("curTime %1.2lf\n", curTime*1000.0); 253 | times.push_back(curTime); 254 | if (curTime < bestTime) { 255 | bestTime = curTime; 256 | bestPlan = it; 257 | } 258 | } 259 | if (bestPlan == plans.end()) return CUTT_INTERNAL_ERROR; 260 | 261 | // bestPlan = plans.begin(); 262 | 263 | // printMatlab(prop, plans, times); 264 | // findMispredictionBest(plans, times, bestPlan, bestTime); 265 | // bestPlan->print(); 266 | 267 | // Create copy of the plan outside the list 268 | cuttPlan_t* plan = new cuttPlan_t(); 269 | *plan = *bestPlan; 270 | // Set device pointers to NULL in the old copy of the plan so 271 | // that they won't be deallocated later when the object is destroyed 272 | bestPlan->nullDevicePointers(); 273 | 274 | // Set stream 275 | plan->setStream(stream); 276 | 277 | // Activate plan 278 | plan->activate(); 279 | 280 | // Insert plan into storage 281 | planStorage.insert( {*handle, plan} ); 282 | 283 | return CUTT_SUCCESS; 284 | } 285 | 286 | cuttResult cuttDestroy(cuttHandle handle) { 287 | auto it = planStorage.find(handle); 288 | if (it == planStorage.end()) return CUTT_INVALID_PLAN; 289 | // Delete instance of cuttPlan_t 290 | delete it->second; 291 | // Delete entry from plan storage 292 | planStorage.erase(it); 293 | return CUTT_SUCCESS; 294 | } 295 | 296 | cuttResult cuttExecute(cuttHandle handle, void* idata, void* odata) { 297 | auto it = planStorage.find(handle); 298 | if (it == planStorage.end()) return CUTT_INVALID_PLAN; 299 | 300 | if (idata == odata) return CUTT_INVALID_PARAMETER; 301 | 302 | cuttPlan_t& plan = *(it->second); 303 | 304 | int deviceID; 305 | cudaCheck(cudaGetDevice(&deviceID)); 306 | if (deviceID != plan.deviceID) 
return CUTT_INVALID_DEVICE; 307 | 308 | if (!cuttKernel(plan, idata, odata)) return CUTT_INTERNAL_ERROR; 309 | return CUTT_SUCCESS; 310 | } 311 | -------------------------------------------------------------------------------- /src/cutt_test.cpp: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #include 26 | #include 27 | #include // std::time 28 | #include // strcmp 29 | #include 30 | #include "cutt.h" 31 | #include "CudaUtils.h" 32 | #include "TensorTester.h" 33 | #include "cuttTimer.h" 34 | #include "cuttGpuModel.h" // testCounters 35 | 36 | // 37 | // Error checking wrapper for cutt 38 | // 39 | #define cuttCheck(stmt) do { \ 40 | cuttResult err = stmt; \ 41 | if (err != CUTT_SUCCESS) { \ 42 | fprintf(stderr, "%s in file %s, function %s\n", #stmt,__FILE__,__FUNCTION__); \ 43 | exit(1); \ 44 | } \ 45 | } while(0) 46 | 47 | cuttTimer* timerFloat; 48 | cuttTimer* timerDouble; 49 | 50 | long long int* dataIn = NULL; 51 | long long int* dataOut = NULL; 52 | int dataSize = 200000000; 53 | TensorTester* tester = NULL; 54 | 55 | bool test1(); 56 | bool test2(); 57 | bool test3(); 58 | bool test4(); 59 | bool test5(); 60 | template bool test_tensor(std::vector& dim, std::vector& permutation); 61 | void printVec(std::vector& vec); 62 | 63 | int main(int argc, char *argv[]) { 64 | 65 | int gpuid = -1; 66 | bool arg_ok = true; 67 | if (argc >= 3) { 68 | if (strcmp(argv[1], "-device") == 0) { 69 | sscanf(argv[2], "%d", &gpuid); 70 | } else { 71 | arg_ok = false; 72 | } 73 | } else if (argc > 1) { 74 | arg_ok = false; 75 | } 76 | 77 | if (!arg_ok) { 78 | printf("cutt_test [options]\n"); 79 | printf("Options:\n"); 80 | printf("-device gpuid : use GPU with ID gpuid\n"); 81 | return 1; 82 | } 83 | 84 | if (gpuid >= 0) { 85 | cudaCheck(cudaSetDevice(gpuid)); 86 | } 87 | 88 | cudaCheck(cudaDeviceReset()); 89 | cudaCheck(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); 90 | 91 | timerFloat = new cuttTimer(4); 92 | timerDouble = new cuttTimer(8); 93 | 94 | // Allocate device data, 100M elements 95 | allocate_device(&dataIn, dataSize); 96 | allocate_device(&dataOut, dataSize); 
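  // (Note: dataSize is 200000000, i.e. 200M long long elements per buffer,
  //  about 1.6 GB each, so roughly 3.2 GB of free device memory is needed.)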
97 | 98 | // Create tester 99 | tester = new TensorTester(); 100 | tester->setTensorCheckPattern((unsigned int *)dataIn, dataSize*2); 101 | 102 | if (!test1()) goto fail; 103 | if (!test2()) goto fail; 104 | if (!test3()) goto fail; 105 | if (!test4()) goto fail; 106 | if (!test5()) goto fail; 107 | 108 | { 109 | std::vector worstDim; 110 | std::vector worstPermutation; 111 | double worstBW = timerDouble->getWorst(worstDim, worstPermutation); 112 | printf("worstBW %4.2lf GB/s\n", worstBW); 113 | printf("dim\n"); 114 | printVec(worstDim); 115 | printf("permutation\n"); 116 | printVec(worstPermutation); 117 | } 118 | 119 | printf("test OK\n"); 120 | goto end; 121 | fail: 122 | printf("test FAIL\n"); 123 | end: 124 | deallocate_device(&dataIn); 125 | deallocate_device(&dataOut); 126 | delete tester; 127 | 128 | delete timerFloat; 129 | delete timerDouble; 130 | 131 | cudaCheck(cudaDeviceReset()); 132 | return 0; 133 | } 134 | 135 | // 136 | // Test 1: Test all permutations up to rank 7 on smallish tensors 137 | // 138 | bool test1() { 139 | const int minDim = 2; 140 | const int maxDim = 16; 141 | for (int rank = 2;rank <= 7;rank++) { 142 | 143 | std::vector dim(rank); 144 | std::vector permutation(rank); 145 | for (int r=0;r < rank;r++) { 146 | permutation[r] = r; 147 | dim[r] = minDim + r*(maxDim - minDim)/rank; 148 | } 149 | 150 | do { 151 | if (!test_tensor(dim, permutation)) return false; 152 | if (!test_tensor(dim, permutation)) return false; 153 | } while (std::next_permutation(permutation.begin(), permutation.begin() + rank)); 154 | 155 | } 156 | 157 | return true; 158 | } 159 | 160 | // 161 | // Test 2: Test ranks 2-15, random volume, random permutation, random dimensions 162 | // 100 samples each rank 163 | // 164 | bool test2() { 165 | double minDim = 2.0; 166 | 167 | std::srand(unsigned (std::time(0))); 168 | 169 | for (int rank = 2;rank <= 15;rank++) { 170 | double volmin = pow(minDim+1, rank); 171 | double volmax = (double)dataSize; 172 | 173 | for (int isample=0;isample < 100;isample++) { 174 | 175 | std::vector dim(rank); 176 | std::vector permutation(rank); 177 | for (int r=0;r < rank;r++) permutation[r] = r; 178 | double vol = 1.0; 179 | double curvol = 1.0; 180 | int iter = 0; 181 | do { 182 | vol = (volmin + (volmax - volmin)*((double)rand())/((double)RAND_MAX) ); 183 | 184 | int subiter = 0; 185 | do { 186 | for (int r=0;r < rank;r++) { 187 | double vol_left = vol/(curvol*pow(minDim, (double)(rank-r))); 188 | double aveDim = pow(vol, 1.0/(double)rank); 189 | double dimSpread = (aveDim - minDim); 190 | // rn = -1 ... 
1 191 | double rn = 2.0*(((double)rand())/((double)RAND_MAX) - 0.5); 192 | dim[r] = (int)(aveDim + dimSpread*rn); 193 | curvol *= (double)dim[r]; 194 | } 195 | 196 | // printf("vol %lf curvol %lf\n", vol, curvol); 197 | // printf("dim"); 198 | // for (int r=0;r < rank;r++) printf(" %d", dim[r]); 199 | // printf("\n"); 200 | 201 | double vol_scale = pow(vol/curvol, 1.0/(double)rank); 202 | // printf("vol_scale %lf\n", vol_scale); 203 | curvol = 1.0; 204 | for (int r=0;r < rank;r++) { 205 | dim[r] = std::max(2, (int)round((double)dim[r]*vol_scale)); 206 | curvol *= dim[r]; 207 | } 208 | 209 | // printf("vol %lf curvol %lf\n", vol, curvol); 210 | // printf("dim"); 211 | // for (int r=0;r < rank;r++) printf(" %d", dim[r]); 212 | // printf("\n"); 213 | // return false; 214 | 215 | subiter++; 216 | } while (subiter < 50 && (curvol > volmax || fabs(curvol-vol)/(double)vol > 2.3)); 217 | 218 | // printf("vol %lf curvol %lf volmin %lf volmax %lf\n", vol, curvol, volmin, volmax); 219 | // printf("dim"); 220 | // for (int r=0;r < rank;r++) printf(" %d", dim[r]); 221 | // printf("\n"); 222 | 223 | iter++; 224 | if (iter == 1000) { 225 | printf("vol %lf\n", vol); 226 | printf("Unable to determine dimensions in 1000 iterations\n"); 227 | return false; 228 | } 229 | } while (curvol > volmax || fabs(curvol-vol)/(double)vol > 2.3); 230 | 231 | std::random_shuffle(permutation.begin(), permutation.end()); 232 | 233 | if (!test_tensor(dim, permutation)) return false; 234 | if (!test_tensor(dim, permutation)) return false; 235 | } 236 | 237 | } 238 | 239 | return true; 240 | } 241 | 242 | // 243 | // Test 3: hand picked examples 244 | // 245 | bool test3() { 246 | 247 | { 248 | int rank = 2; 249 | std::vector dim(rank); 250 | std::vector permutation(rank); 251 | dim[0] = 43; 252 | dim[1] = 67; 253 | permutation[0] = 1; 254 | permutation[1] = 0; 255 | if (!test_tensor(dim, permutation)) return false; 256 | if (!test_tensor(dim, permutation)) return false; 257 | dim[0] = 65536*32; 258 | dim[1] = 2; 259 | permutation[0] = 1; 260 | permutation[1] = 0; 261 | if (!test_tensor(dim, permutation)) return false; 262 | if (!test_tensor(dim, permutation)) return false; 263 | } 264 | 265 | { 266 | int rank = 3; 267 | std::vector dim(rank); 268 | std::vector permutation(rank); 269 | dim[0] = 1305; 270 | dim[1] = 599; 271 | dim[2] = 88; 272 | permutation[0] = 0; 273 | permutation[1] = 2; 274 | permutation[2] = 1; 275 | if (!test_tensor(dim, permutation)) return false; 276 | if (!test_tensor(dim, permutation)) return false; 277 | } 278 | 279 | { 280 | int rank = 4; 281 | std::vector dim(rank); 282 | std::vector permutation(rank); 283 | dim[0] = 24; 284 | dim[1] = 330; 285 | dim[2] = 64; 286 | dim[3] = 147; 287 | permutation[0] = 1; 288 | permutation[1] = 0; 289 | permutation[2] = 2; 290 | permutation[3] = 3; 291 | if (!test_tensor(dim, permutation)) return false; 292 | if (!test_tensor(dim, permutation)) return false; 293 | } 294 | 295 | { 296 | int rank = 4; 297 | std::vector dim(rank); 298 | std::vector permutation(rank); 299 | dim[0] = 2; 300 | dim[1] = 5; 301 | dim[2] = 9; 302 | dim[3] = 12; 303 | permutation[0] = 0; 304 | permutation[1] = 1; 305 | permutation[2] = 2; 306 | permutation[3] = 3; 307 | if (!test_tensor(dim, permutation)) return false; 308 | if (!test_tensor(dim, permutation)) return false; 309 | } 310 | 311 | { 312 | int rank = 6; 313 | std::vector dim(rank); 314 | std::vector permutation(rank); 315 | dim[0] = 2; 316 | dim[1] = 4; 317 | dim[2] = 6; 318 | dim[3] = 9; 319 | dim[4] = 11; 320 | dim[5] = 13; 321 
| permutation[0] = 0; 322 | permutation[1] = 1; 323 | permutation[2] = 2; 324 | permutation[3] = 3; 325 | permutation[4] = 4; 326 | permutation[5] = 5; 327 | if (!test_tensor(dim, permutation)) return false; 328 | if (!test_tensor(dim, permutation)) return false; 329 | } 330 | 331 | { 332 | std::vector dim(5); 333 | std::vector permutation(5); 334 | dim[0] = 5; 335 | dim[1] = 42; 336 | dim[2] = 75; 337 | dim[3] = 86; 338 | dim[4] = 57; 339 | permutation[0] = 2 - 1; 340 | permutation[1] = 4 - 1; 341 | permutation[2] = 5 - 1; 342 | permutation[3] = 3 - 1; 343 | permutation[4] = 1 - 1; 344 | if (!test_tensor(dim, permutation)) return false; 345 | if (!test_tensor(dim, permutation)) return false; 346 | } 347 | 348 | { 349 | std::vector dim(5); 350 | std::vector permutation(5); 351 | dim[0] = 5; 352 | dim[1] = 3; 353 | dim[2] = 2; 354 | dim[3] = 9; 355 | dim[4] = 14; 356 | permutation[0] = 0; 357 | permutation[1] = 1; 358 | permutation[2] = 3; 359 | permutation[3] = 2; 360 | permutation[4] = 4; 361 | if (!test_tensor(dim, permutation)) return false; 362 | if (!test_tensor(dim, permutation)) return false; 363 | } 364 | 365 | return true; 366 | } 367 | 368 | // 369 | // Test 4: streaming 370 | // 371 | bool test4() { 372 | 373 | std::vector dim = {24, 32, 16, 36, 43, 9}; 374 | std::vector permutation = {5, 1, 4, 2, 3, 0}; 375 | 376 | const int numStream = 10; 377 | 378 | cudaStream_t streams[numStream]; 379 | for (int i=0;i < numStream;i++) { 380 | cudaCheck(cudaStreamCreate(&streams[i])); 381 | } 382 | 383 | cudaCheck(cudaDeviceSynchronize()); 384 | 385 | cuttHandle plans[numStream]; 386 | 387 | for (int i=0;i < numStream;i++) { 388 | cuttCheck(cuttPlan(&plans[i], dim.size(), dim.data(), permutation.data(), sizeof(double), streams[i])); 389 | cuttCheck(cuttExecute(plans[i], dataIn, dataOut)); 390 | } 391 | 392 | cudaCheck(cudaDeviceSynchronize()); 393 | 394 | bool run_ok = tester->checkTranspose(dim.size(), dim.data(), permutation.data(), (long long int *)dataOut); 395 | 396 | cudaCheck(cudaDeviceSynchronize()); 397 | 398 | for (int i=0;i < numStream;i++) { 399 | cuttCheck(cuttDestroy(plans[i])); 400 | cudaCheck(cudaStreamDestroy(streams[i])); 401 | } 402 | 403 | return run_ok; 404 | } 405 | 406 | 407 | // 408 | // Test 5: Transaction and cache line counters 409 | // 410 | bool test5() { 411 | 412 | { 413 | // Number of elements that are loaded per memory transaction: 414 | // 128 bytes per transaction 415 | const int accWidth = 128/sizeof(double); 416 | // L2 cache line width is 32 bytes 417 | const int cacheWidth = 32/sizeof(double); 418 | if (!testCounters(32, accWidth, cacheWidth)) return false; 419 | } 420 | 421 | { 422 | // Number of elements that are loaded per memory transaction: 423 | // 128 bytes per transaction 424 | const int accWidth = 128/sizeof(float); 425 | // L2 cache line width is 32 bytes 426 | const int cacheWidth = 32/sizeof(float); 427 | if (!testCounters(32, accWidth, cacheWidth)) return false; 428 | } 429 | 430 | return true; 431 | } 432 | 433 | 434 | template 435 | bool test_tensor(std::vector& dim, std::vector& permutation) { 436 | 437 | int rank = dim.size(); 438 | 439 | int vol = 1; 440 | for (int r=0;r < rank;r++) { 441 | vol *= dim[r]; 442 | } 443 | 444 | size_t volmem = vol*sizeof(T); 445 | size_t datamem = dataSize*sizeof(long long int); 446 | if (volmem > datamem) { 447 | printf("test_tensor, data size exceeded\n"); 448 | return false; 449 | } 450 | 451 | printf("number of elements %d\n", vol); 452 | printf("dimensions\n"); 453 | printVec(dim); 454 | 
printf("permutation\n"); 455 | printVec(permutation); 456 | 457 | cuttTimer* timer; 458 | if (sizeof(T) == 4) { 459 | timer = timerFloat; 460 | } else { 461 | timer = timerDouble; 462 | } 463 | 464 | cuttHandle plan; 465 | cuttCheck(cuttPlan(&plan, rank, dim.data(), permutation.data(), sizeof(T), 0)); 466 | set_device_array((T *)dataOut, -1, vol); 467 | cudaCheck(cudaDeviceSynchronize()); 468 | 469 | if (vol > 1000000) timer->start(dim, permutation); 470 | cuttCheck(cuttExecute(plan, dataIn, dataOut)); 471 | if (vol > 1000000) timer->stop(); 472 | 473 | cuttCheck(cuttDestroy(plan)); 474 | 475 | return tester->checkTranspose(rank, dim.data(), permutation.data(), (T *)dataOut); 476 | } 477 | 478 | void printVec(std::vector& vec) { 479 | for (int i=0;i < vec.size();i++) { 480 | printf("%d ", vec[i]); 481 | } 482 | printf("\n"); 483 | } 484 | 485 | -------------------------------------------------------------------------------- /src/cuttGpuModelKernel.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #include 26 | #include "CudaUtils.h" 27 | #include "cuttGpuModelKernel.h" 28 | 29 | #define RESTRICT //__restrict__ 30 | 31 | // 32 | // Global memory access statistics 33 | // 34 | struct MemStat { 35 | int gld_tran; 36 | int gst_tran; 37 | int gld_req; 38 | int gst_req; 39 | int cl_full_l2; 40 | int cl_part_l2; 41 | int cl_full_l1; 42 | int cl_part_l1; 43 | // int l1_tran; 44 | __device__ __forceinline__ void clear() { 45 | gld_tran = 0; 46 | gst_tran = 0; 47 | gld_req = 0; 48 | gst_req = 0; 49 | cl_full_l2 = 0; 50 | cl_part_l2 = 0; 51 | cl_full_l1 = 0; 52 | cl_part_l1 = 0; 53 | // l1_tran = 0; 54 | } 55 | }; 56 | 57 | // 58 | // Returns scalar tensor position. Each lane has the same p 59 | // NOTE: c and d on inactive warps must be 1 !! 
60 | // 61 | __device__ __forceinline__ 62 | int tensorPos( 63 | const int p, const int rank, const int c, const int d, const int ct, 64 | const int numLane=warpSize 65 | ) { 66 | 67 | int r = ((p/c) % d)*ct; 68 | #pragma unroll 69 | for (int i=numLane/2;i >= 1;i/=2) { 70 | r += __shfl_xor_sync(FULL_MASK, r, i); 71 | } 72 | return r; 73 | 74 | } 75 | 76 | // 77 | // Counts number of global memory transactions for a warp that accesses 78 | // memory at pos using warp lanes 0, ..., n - 1 79 | // 80 | __device__ __forceinline__ 81 | int countGlTransactions(const int pos, const int n, const int accWidth, const int warpLane) { 82 | int seg0 = pos/accWidth; 83 | int srcLane = (warpLane == 0 || warpLane >= n) ? (warpLane) : (warpLane - 1); 84 | int seg1 = __shfl_sync(FULL_MASK, seg0, srcLane); 85 | int count = __popc(__ballot_sync(FULL_MASK, seg0 != seg1)) + 1; 86 | count = (n == 0) ? 0 : count; 87 | return count; 88 | } 89 | 90 | // 91 | // Counts number of global memory transactions for a warp that accesses 92 | // memory at pos using warp lanes 0, ..., n - 1 93 | // 94 | __device__ __forceinline__ 95 | int countGlTransactions(const int* segbuf, const int n) { 96 | int count = 0; 97 | for (int i = threadIdx.x;i < n;i += blockDim.x) { 98 | int seg = segbuf[i]; 99 | int seg_prev = (i - 1 >= 0) ? segbuf[i - 1] : -1; 100 | count += (seg != seg_prev); 101 | } 102 | return count; 103 | } 104 | 105 | // 106 | // Counts number of full and partial cache lines for a warp that accesses per warp 107 | // memory at pos using warp lanes 0, ..., n - 1 108 | // 109 | __device__ __forceinline__ 110 | void countCacheLines(const int pos, const int n, const int cacheWidth, const int warpLane, 111 | int& cl_full, int& cl_part) { 112 | 113 | int seg = pos/cacheWidth; 114 | // Lane is at the beginning of a full cache line, if seg0 matches seg0 cacheWidth - 1 away 115 | int readLane = warpLane + (cacheWidth - 1); 116 | int val = (seg == __shfl_sync(FULL_MASK, seg, readLane)); 117 | val = (readLane < n) ? val : 0; 118 | cl_full += val; 119 | 120 | unsigned int valbit = (((val << cacheWidth) - 1)*val) << warpLane; 121 | // Perform warpSize-way bitwise or 122 | #pragma unroll 123 | for (int i=warpSize/2;i >= 1;i/=2) { 124 | valbit |= __shfl_xor_sync(FULL_MASK, valbit, i); 125 | } 126 | // Now: lanes with valbit set are part of a full cache line, 127 | // lanes with valbit unset are part of a partial cache line 128 | int full = (valbit >> warpLane) & 1; 129 | 130 | seg = (warpLane < n) ? seg : -1; 131 | int segP1 = __shfl_down_sync(FULL_MASK, seg, 1); 132 | segP1 = (warpLane + 1 < warpSize) ? segP1 : -1; 133 | int val2 = ((!full) && seg != segP1); 134 | cl_part += val2; 135 | } 136 | 137 | // 138 | // Counts number of full and partial cache lines for a warp that accesses 139 | // memory at cachelines segbuf[0] ... 
segbuf[n - 1] 140 | // 141 | __device__ __forceinline__ 142 | void countCacheLines(int* segbuf, const int n, const int cacheWidth, 143 | int& cl_full, int& cl_part) { 144 | 145 | const int topbit = (1 << 31); 146 | const int lowbits = ~(1 << 31); 147 | 148 | for (int i = threadIdx.x;i < n;i += blockDim.x) { 149 | // seg[i] is at the beginning of a full cache line, if seg[i] matches seg[i + cacheWidth - 1] 150 | int i1 = i + (cacheWidth - 1); 151 | int val = 0; 152 | if (i1 < n) val = ((segbuf[i] & lowbits) == (segbuf[i1] & lowbits)); 153 | cl_full += val; 154 | // Mark full cache lines with top bit set to 1 155 | if (val) { 156 | for (int j=0;j < cacheWidth;j++) { 157 | if (i + j < n) segbuf[i + j] |= topbit; 158 | } 159 | } 160 | } 161 | __syncthreads(); 162 | 163 | for (int i = threadIdx.x;i < n;i += blockDim.x) { 164 | int seg = segbuf[i]; 165 | int segP1 = (i + 1 < n) ? segbuf[i + 1] : -1; 166 | int part = ((seg & topbit) == 0); 167 | int val2 = (part && seg != segP1); 168 | cl_part += val2; 169 | } 170 | 171 | // Clear top bits 172 | __syncthreads(); 173 | for (int i = threadIdx.x;i < n;i += blockDim.x) { 174 | segbuf[i] &= lowbits; 175 | } 176 | 177 | } 178 | 179 | // 180 | // Runs countGlTransactions and countCacheLines counters for testing 181 | // Unused values in posData[] are marked with "-1" 182 | // 183 | __global__ void runCountersKernel(const int* posData, const int numPosData, 184 | const int accWidth, const int cacheWidth, int* tranData, int* cl_fullData, int* cl_partData) { 185 | 186 | const int warpLane = threadIdx.x & (warpSize - 1); 187 | 188 | for (int i=threadIdx.x + blockIdx.x*blockDim.x;i < numPosData;i+=blockDim.x*gridDim.x) { 189 | int pos = posData[i]; 190 | int flag = (pos == -1); 191 | int ffsval = __ffs(__ballot_sync(FULL_MASK, flag)) - 1; 192 | int n = (__any_sync(FULL_MASK, flag)) ? ffsval : warpSize; 193 | int tran = countGlTransactions(pos, n, accWidth, warpLane); 194 | int cl_full = 0; 195 | int cl_part = 0; 196 | countCacheLines(pos, n, cacheWidth, warpLane, cl_full, cl_part); 197 | #pragma unroll 198 | for (int k=warpSize/2;k >= 1;k/=2) { 199 | cl_full += __shfl_xor_sync(FULL_MASK, cl_full, k); 200 | cl_part += __shfl_xor_sync(FULL_MASK, cl_part, k); 201 | } 202 | int j = i / warpSize; 203 | tranData[j] = tran; 204 | cl_fullData[j] = cl_full; 205 | cl_partData[j] = cl_part; 206 | } 207 | 208 | } 209 | 210 | // 211 | // Reduce memStat within warp and write result to global memory 212 | // NOTE: Not super-efficient since every warp does atomicAdd(). 
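// Only the per-lane cache-line counters (cl_full_l1/l2, cl_part_l1/l2) need the
// butterfly reduction below; the transaction and request counters are already
// warp-uniform, which is why their shuffle lines are commented out. Lane 0 then
// accumulates the warp's totals into the global MemStat with one atomicAdd per field.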
213 | // 214 | __device__ __forceinline__ 215 | void writeMemStat(const int warpLane, MemStat memStat, MemStat* RESTRICT glMemStat) { 216 | for (int i=16;i >= 1;i/=2) { 217 | // memStat.gld_tran += __shfl_xor_sync(FULL_MASK, memStat.gld_tran, i); 218 | // memStat.gst_tran += __shfl_xor_sync(FULL_MASK, memStat.gst_tran, i); 219 | // memStat.gld_req += __shfl_xor_sync(FULL_MASK, memStat.gld_req, i); 220 | // memStat.gst_req += __shfl_xor_sync(FULL_MASK, memStat.gst_req, i); 221 | memStat.cl_full_l2 += __shfl_xor_sync(FULL_MASK, memStat.cl_full_l2, i); 222 | memStat.cl_part_l2 += __shfl_xor_sync(FULL_MASK, memStat.cl_part_l2, i); 223 | memStat.cl_full_l1 += __shfl_xor_sync(FULL_MASK, memStat.cl_full_l1, i); 224 | memStat.cl_part_l1 += __shfl_xor_sync(FULL_MASK, memStat.cl_part_l1, i); 225 | // memStat.l1_tran += __shfl_xor_sync(FULL_MASK, memStat.l1_tran, i); 226 | } 227 | if (warpLane == 0) { 228 | atomicAdd(&(glMemStat->gld_tran), memStat.gld_tran); 229 | atomicAdd(&(glMemStat->gst_tran), memStat.gst_tran); 230 | atomicAdd(&(glMemStat->gld_req), memStat.gld_req); 231 | atomicAdd(&(glMemStat->gst_req), memStat.gst_req); 232 | atomicAdd(&(glMemStat->cl_full_l2), memStat.cl_full_l2); 233 | atomicAdd(&(glMemStat->cl_part_l2), memStat.cl_part_l2); 234 | atomicAdd(&(glMemStat->cl_full_l1), memStat.cl_full_l1); 235 | atomicAdd(&(glMemStat->cl_part_l1), memStat.cl_part_l1); 236 | // atomicAdd(&(glMemStat->l1_tran), memStat.l1_tran); 237 | } 238 | } 239 | 240 | // 241 | // Transpose when Mm and Mk don't overlap and contain only single rank 242 | // 243 | // dim3 numthread(TILEDIM, TILEROWS, 1); 244 | // dim3 numblock( ((plan.volMm-1)/TILEDIM+1)*((plan.volMk-1)/TILEDIM+1), 1, plan.volMbar); 245 | // 246 | __global__ void 247 | __launch_bounds__(TILEDIM*TILEROWS, 1) 248 | countTiled( 249 | const int numMm, const int volMbar, const int sizeMbar, 250 | const int2 tiledVol, const int cuDimMk, const int cuDimMm, 251 | const TensorConvInOut* RESTRICT glMbar, 252 | const int accWidth, const int cacheWidth, 253 | MemStat* RESTRICT glMemStat) { 254 | 255 | const int warpLane = threadIdx.x & (warpSize - 1); 256 | TensorConvInOut Mbar; 257 | Mbar.c_in = 1; 258 | Mbar.d_in = 1; 259 | Mbar.c_out = 1; 260 | Mbar.d_out = 1; 261 | if (warpLane < sizeMbar) { 262 | Mbar = glMbar[warpLane]; 263 | } 264 | 265 | const int bx = (blockIdx.x % numMm)*TILEDIM; 266 | const int by = (blockIdx.x / numMm)*TILEDIM; 267 | 268 | const int xin = bx + threadIdx.x; 269 | const int yin = by + threadIdx.y; 270 | 271 | const int xout = bx + threadIdx.y; 272 | const int yout = by + threadIdx.x; 273 | 274 | const unsigned int maskIny = __ballot_sync(FULL_MASK, (yin + warpLane < tiledVol.y))*(xin < tiledVol.x); 275 | const unsigned int maskOutx = __ballot_sync(FULL_MASK, (xout + warpLane < tiledVol.x))*(yout < tiledVol.y); 276 | 277 | const int posMinorIn = xin + yin*cuDimMk; 278 | const int posMinorOut = yout + xout*cuDimMm; 279 | const int posInAdd = TILEROWS*cuDimMk; 280 | const int posOutAdd = TILEROWS*cuDimMm; 281 | 282 | MemStat memStat; 283 | memStat.clear(); 284 | 285 | for (int posMbar=blockIdx.z;posMbar < volMbar;posMbar += gridDim.z) 286 | { 287 | 288 | // Compute global memory positions 289 | int posMajorIn = ((posMbar/Mbar.c_in) % Mbar.d_in)*Mbar.ct_in; 290 | int posMajorOut = ((posMbar/Mbar.c_out) % Mbar.d_out)*Mbar.ct_out; 291 | #pragma unroll 292 | for (int i=16;i >= 1;i/=2) { 293 | posMajorIn += __shfl_xor_sync(FULL_MASK, posMajorIn, i); 294 | posMajorOut += __shfl_xor_sync(FULL_MASK, posMajorOut, i); 295 | } 296 | int 
posIn = posMajorIn + posMinorIn; 297 | int posOut = posMajorOut + posMinorOut; 298 | 299 | // Read data into shared memory tile 300 | #pragma unroll 301 | for (int j=0;j < TILEDIM;j += TILEROWS) { 302 | int n = __popc(__ballot_sync(FULL_MASK, maskIny & (1 << j))); 303 | memStat.gld_tran += countGlTransactions(posIn, n, accWidth, warpLane); 304 | memStat.gld_req += __any_sync(FULL_MASK, n > 0); 305 | posIn += posInAdd; 306 | } 307 | 308 | #pragma unroll 309 | for (int j=0;j < TILEDIM;j += TILEROWS) { 310 | int n = __popc(__ballot_sync(FULL_MASK, maskOutx & (1 << j))); 311 | memStat.gst_tran += countGlTransactions(posOut, n, accWidth, warpLane); 312 | memStat.gst_req += __any_sync(FULL_MASK, n > 0); 313 | countCacheLines(posOut, n, cacheWidth, warpLane, memStat.cl_full_l2, memStat.cl_part_l2); 314 | posOut += posOutAdd; 315 | } 316 | 317 | } 318 | 319 | // Reduce memStat within thread block and write result to global memory 320 | writeMemStat(warpLane, memStat, glMemStat); 321 | 322 | } 323 | 324 | // 325 | // Packed transpose. Thread block loads plan.volMmk number of elements 326 | // 327 | template 328 | __global__ void 329 | __launch_bounds__(1024, 1) 330 | countPacked( 331 | const int volMmk, const int volMbar, 332 | const int sizeMmk, const int sizeMbar, 333 | const TensorConvInOut* RESTRICT gl_Mmk, 334 | const TensorConvInOut* RESTRICT gl_Mbar, 335 | const int accWidth, const int cacheWidth, 336 | MemStat* RESTRICT glMemStat) { 337 | 338 | extern __shared__ int shSegOut[]; 339 | 340 | const int warpLane = threadIdx.x & (warpSize - 1); 341 | 342 | TensorConvInOut Mmk; 343 | Mmk.c_in = 1; 344 | Mmk.d_in = 1; 345 | Mmk.c_out = 1; 346 | Mmk.d_out = 1; 347 | if (warpLane < sizeMmk) { 348 | Mmk = gl_Mmk[warpLane]; 349 | } 350 | 351 | // Pre-compute tensor positions in Mmk 352 | // 3*numRegStorage registers 353 | int posMmkIn[numRegStorage]; 354 | int posMmkOut[numRegStorage]; 355 | #pragma unroll 356 | for (int j=0;j < numRegStorage;j++) { 357 | posMmkIn[j] = 0; 358 | posMmkOut[j] = 0; 359 | } 360 | for (int i=0;i < sizeMmk;i++) { 361 | #pragma unroll 362 | for (int j=0;j < numRegStorage;j++) { 363 | int posMmk = threadIdx.x + j*blockDim.x; 364 | posMmkIn[j] += ((posMmk / __shfl_sync(FULL_MASK, Mmk.c_in,i)) % __shfl_sync(FULL_MASK, Mmk.d_in,i))*__shfl_sync(FULL_MASK, Mmk.ct_in,i); 365 | posMmkOut[j] += ((posMmk / __shfl_sync(FULL_MASK, Mmk.c_out,i)) % __shfl_sync(FULL_MASK, Mmk.d_out,i))*__shfl_sync(FULL_MASK, Mmk.ct_out,i); 366 | } 367 | } 368 | 369 | // 6 registers 370 | TensorConvInOut Mbar; 371 | Mbar.c_in = 1; 372 | Mbar.d_in = 1; 373 | Mbar.c_out = 1; 374 | Mbar.d_out = 1; 375 | if (warpLane < sizeMbar) { 376 | Mbar = gl_Mbar[warpLane]; 377 | } 378 | 379 | MemStat memStat; 380 | memStat.clear(); 381 | 382 | for (int posMbar=blockIdx.x;posMbar < volMbar;posMbar += gridDim.x) 383 | { 384 | 385 | int posMbarOut = ((posMbar/Mbar.c_out) % Mbar.d_out)*Mbar.ct_out; 386 | #pragma unroll 387 | for (int i=16;i >= 1;i/=2) { 388 | posMbarOut += __shfl_xor_sync(FULL_MASK, posMbarOut, i); 389 | } 390 | 391 | int posMbarIn = ((posMbar/Mbar.c_in) % Mbar.d_in)*Mbar.ct_in; 392 | #pragma unroll 393 | for (int i=16;i >= 1;i/=2) { 394 | posMbarIn += __shfl_xor_sync(FULL_MASK, posMbarIn, i); 395 | } 396 | 397 | // Read from global memory 398 | #pragma unroll 399 | for (int j=0;j < numRegStorage;j++) { 400 | int posMmk = threadIdx.x + j*blockDim.x; 401 | int posIn = posMbarIn + posMmkIn[j]; 402 | int n = __popc(__ballot_sync(FULL_MASK, posMmk < volMmk)); 403 | memStat.gld_tran += countGlTransactions(posIn, 
n, accWidth, warpLane); 404 | memStat.gld_req += __any_sync(FULL_MASK, n > 0); 405 | } 406 | 407 | // Write to global memory 408 | #pragma unroll 409 | for (int j=0;j < numRegStorage;j++) { 410 | int posMmk = threadIdx.x + j*blockDim.x; 411 | int posOut = posMbarOut + posMmkOut[j]; 412 | int n = __popc(__ballot_sync(FULL_MASK, posMmk < volMmk)); 413 | memStat.gst_tran += countGlTransactions(posOut, n, accWidth, warpLane); 414 | memStat.gst_req += __any_sync(FULL_MASK, n > 0); 415 | if (posMmk < volMmk) shSegOut[posMmk] = posOut/cacheWidth; 416 | } 417 | 418 | __syncthreads(); 419 | countCacheLines(shSegOut, volMmk, cacheWidth, memStat.cl_full_l2, memStat.cl_part_l2); 420 | // Go from L2 segments to L1 segments 421 | __syncthreads(); 422 | const int L2toL1 = accWidth/cacheWidth; 423 | for (int i=threadIdx.x;i < volMmk;i+=blockDim.x) { 424 | shSegOut[i] /= L2toL1; 425 | } 426 | __syncthreads(); 427 | countCacheLines(shSegOut, volMmk, accWidth, memStat.cl_full_l1, memStat.cl_part_l1); 428 | 429 | // __syncthreads(); 430 | // memStat.l1_tran += countGlTransactions(shSegOut, volMmk); 431 | 432 | } 433 | 434 | // Reduce memStat within thread block and write result to global memory 435 | writeMemStat(warpLane, memStat, glMemStat); 436 | 437 | } 438 | 439 | // 440 | // Packed method with a split rank 441 | // 442 | // dim nthread(((volMmkWithSplit - 1)/(prop.warpSize*lc.numRegStorage) + 1)*prop.warpSize, 1, 1) 443 | // dim nblock(ts.numSplit, min(256, max(1, ts.volMbar)), 1) 444 | // 445 | template 446 | __global__ void 447 | __launch_bounds__(1024, 1) 448 | countPackedSplit( 449 | const int splitDim, const int volMmkUnsplit, const int volMbar, 450 | const int sizeMmk, const int sizeMbar, 451 | const int cMmSplit, const int cMkSplit, 452 | const TensorConvInOut* RESTRICT glMmk, 453 | const TensorConvInOut* RESTRICT glMbar, 454 | const int accWidth, const int cacheWidth, 455 | MemStat* RESTRICT glMemStat) { 456 | 457 | extern __shared__ int shSegOut[]; 458 | 459 | const int warpLane = threadIdx.x & (warpSize - 1); 460 | 461 | // const int plusone = (blockIdx.x < (splitDim % gridDim.x)); 462 | const int p0 = blockIdx.x*splitDim/gridDim.x; 463 | const int volSplit = (blockIdx.x + 1)*splitDim/gridDim.x - p0; 464 | const int plusone = volSplit - splitDim/gridDim.x; 465 | 466 | TensorConvInOut Mmk; 467 | Mmk.c_in = 1; 468 | Mmk.d_in = 1; 469 | Mmk.c_out = 1; 470 | Mmk.d_out = 1; 471 | if (warpLane < sizeMmk) { 472 | Mmk = glMmk[warpLane + plusone*sizeMmk]; 473 | } 474 | 475 | // gridDim.x = number of splits 476 | // blockIdx.x = {0 ... 
gridDim.x - 1} is the split-index 477 | // Volume of this split 478 | // const int volSplit = (splitDim/gridDim.x) + plusone; 479 | // Start position in this split 480 | // const int p0 = (splitDim/gridDim.x)*blockIdx.x + min(blockIdx.x, (splitDim % gridDim.x)); 481 | const int posMmkIn0 = p0*cMmSplit; 482 | const int posMmkOut0 = p0*cMkSplit; 483 | // Volume of split Mmk 484 | const int volMmkSplit = volSplit*volMmkUnsplit; 485 | 486 | // Pre-compute tensor positions in Mmk 487 | // 3*numRegStorage registers 488 | int posMmkIn[numRegStorage]; 489 | int posMmkOut[numRegStorage]; 490 | #pragma unroll 491 | for (int j=0;j < numRegStorage;j++) { 492 | posMmkIn[j] = posMmkIn0; 493 | posMmkOut[j] = posMmkOut0; 494 | } 495 | for (int i=0;i < sizeMmk;i++) { 496 | #pragma unroll 497 | for (int j=0;j < numRegStorage;j++) { 498 | int t = threadIdx.x + j*blockDim.x; 499 | posMmkIn[j] += ((t/__shfl_sync(FULL_MASK, Mmk.c_in,i)) % __shfl_sync(FULL_MASK, Mmk.d_in,i))*__shfl_sync(FULL_MASK, Mmk.ct_in,i); 500 | posMmkOut[j] += ((t/__shfl_sync(FULL_MASK, Mmk.c_out,i)) % __shfl_sync(FULL_MASK, Mmk.d_out,i))*__shfl_sync(FULL_MASK, Mmk.ct_out,i); 501 | } 502 | } 503 | 504 | TensorConvInOut Mbar; 505 | Mbar.c_in = 1; 506 | Mbar.d_in = 1; 507 | Mbar.c_out = 1; 508 | Mbar.d_out = 1; 509 | if (warpLane < sizeMbar) { 510 | Mbar = glMbar[warpLane]; 511 | } 512 | 513 | MemStat memStat; 514 | memStat.clear(); 515 | 516 | for (int posMbar=blockIdx.y;posMbar < volMbar;posMbar+=gridDim.y) 517 | { 518 | 519 | int posMbarOut = ((posMbar/Mbar.c_out) % Mbar.d_out)*Mbar.ct_out; 520 | #pragma unroll 521 | for (int i=16;i >= 1;i/=2) { 522 | posMbarOut += __shfl_xor_sync(FULL_MASK, posMbarOut, i); 523 | } 524 | 525 | int posMbarIn = ((posMbar/Mbar.c_in) % Mbar.d_in)*Mbar.ct_in; 526 | #pragma unroll 527 | for (int i=16;i >= 1;i/=2) { 528 | posMbarIn += __shfl_xor_sync(FULL_MASK, posMbarIn, i); 529 | } 530 | 531 | // Read from global memory 532 | #pragma unroll 533 | for (int j=0;j < numRegStorage;j++) { 534 | int posMmk = threadIdx.x + j*blockDim.x; 535 | int posIn = posMbarIn + posMmkIn[j]; 536 | int n = __popc(__ballot_sync(FULL_MASK, posMmk < volMmkSplit)); 537 | memStat.gld_tran += countGlTransactions(posIn, n, accWidth, warpLane); 538 | memStat.gld_req += __any_sync(FULL_MASK, n > 0); 539 | } 540 | 541 | // Write to global memory 542 | #pragma unroll 543 | for (int j=0;j < numRegStorage;j++) { 544 | int posMmk = threadIdx.x + j*blockDim.x; 545 | int posOut = posMbarOut + posMmkOut[j]; 546 | int n = __popc(__ballot_sync(FULL_MASK, posMmk < volMmkSplit)); 547 | memStat.gst_tran += countGlTransactions(posOut, n, accWidth, warpLane); 548 | memStat.gst_req += __any_sync(FULL_MASK, n > 0); 549 | if (posMmk < volMmkSplit) shSegOut[posMmk] = posOut / cacheWidth; 550 | // countCacheLines(posOut, n, cacheWidth, warpLane, memStat.cl_full, memStat.cl_part); 551 | } 552 | 553 | __syncthreads(); 554 | countCacheLines(shSegOut, volMmkSplit, cacheWidth, memStat.cl_full_l2, memStat.cl_part_l2); 555 | // Go from L2 segments to L1 segments 556 | __syncthreads(); 557 | const int L2toL1 = accWidth/cacheWidth; 558 | for (int i=threadIdx.x;i < volMmkSplit;i+=blockDim.x) { 559 | shSegOut[i] /= L2toL1; 560 | } 561 | __syncthreads(); 562 | countCacheLines(shSegOut, volMmkSplit, accWidth, memStat.cl_full_l1, memStat.cl_part_l1); 563 | 564 | // __syncthreads(); 565 | // memStat.l1_tran += countGlTransactions(shSegOut, volMmkSplit); 566 | 567 | } 568 | 569 | // Reduce memStat within thread block and write result to global memory 570 | 
writeMemStat(warpLane, memStat, glMemStat); 571 | 572 | } 573 | 574 | // 575 | // Transpose when the lead dimension is the same, e.g. (1, 2, 3) -> (1, 3, 2) 576 | // 577 | // dim3 numthread(TILEDIM, TILEROWS, 1); 578 | // dim3 numblock( ((plan.volMm-1)/TILEDIM+1)*((plan.volMkBar-1)/TILEDIM+1), 1, plan.volMbar); 579 | // 580 | __global__ void 581 | __launch_bounds__(TILEDIM*TILEROWS, 1) 582 | countTiledCopy( 583 | const int numMm, const int volMbar, const int sizeMbar, 584 | const int cuDimMk, const int cuDimMm, 585 | const int2 tiledVol, 586 | const TensorConvInOut* RESTRICT gl_Mbar, 587 | const int accWidth, const int cacheWidth, 588 | MemStat* RESTRICT glMemStat) { 589 | 590 | const int warpLane = threadIdx.x & (warpSize - 1); 591 | TensorConvInOut Mbar; 592 | Mbar.c_in = 1; 593 | Mbar.d_in = 1; 594 | Mbar.c_out = 1; 595 | Mbar.d_out = 1; 596 | if (warpLane < sizeMbar) { 597 | Mbar = gl_Mbar[warpLane]; 598 | } 599 | 600 | const int bx = (blockIdx.x % numMm)*TILEDIM; 601 | const int by = (blockIdx.x / numMm)*TILEDIM; 602 | 603 | const int x = bx + threadIdx.x; 604 | const int y = by + threadIdx.y; 605 | 606 | MemStat memStat; 607 | memStat.clear(); 608 | 609 | for (int posMbar=blockIdx.z;posMbar < volMbar;posMbar += gridDim.z) 610 | { 611 | 612 | // Read global memory 613 | { 614 | int pos0 = tensorPos(posMbar, sizeMbar, Mbar.c_in, Mbar.d_in, Mbar.ct_in); 615 | pos0 += x + y*cuDimMk; 616 | 617 | #pragma unroll 618 | for (int j=0;j < TILEDIM;j += TILEROWS) { 619 | int pos = pos0 + j*cuDimMk; 620 | int n = __popc(__ballot_sync(FULL_MASK, (x < tiledVol.x) && (y + j < tiledVol.y))); 621 | memStat.gld_tran += countGlTransactions(pos, n, accWidth, warpLane); 622 | memStat.gld_req += __any_sync(FULL_MASK, n > 0); 623 | } 624 | } 625 | 626 | // Write global memory 627 | { 628 | int pos0 = tensorPos(posMbar, sizeMbar, Mbar.c_out, Mbar.d_out, Mbar.ct_out); 629 | pos0 += x + y*cuDimMm; 630 | 631 | #pragma unroll 632 | for (int j=0;j < TILEDIM;j += TILEROWS) { 633 | int pos = pos0 + j*cuDimMm; 634 | int n = __popc(__ballot_sync(FULL_MASK, (x < tiledVol.x) && (y + j < tiledVol.y))); 635 | memStat.gst_tran += countGlTransactions(pos, n, accWidth, warpLane); 636 | memStat.gst_req += __any_sync(FULL_MASK, n > 0); 637 | countCacheLines(pos, n, cacheWidth, warpLane, memStat.cl_full_l2, memStat.cl_part_l2); 638 | } 639 | } 640 | 641 | } 642 | 643 | // Reduce memStat within thread block and write result to global memory 644 | writeMemStat(warpLane, memStat, glMemStat); 645 | 646 | } 647 | 648 | //###################################################################################### 649 | //###################################################################################### 650 | //###################################################################################### 651 | 652 | void runCounters(const int warpSize, const int* hostPosData, const int numPosData, 653 | const int accWidth, const int cacheWidth, int* host_tran, int* host_cl_full, int* host_cl_part) { 654 | 655 | const int numWarp = numPosData/warpSize; 656 | 657 | int* devPosData; 658 | allocate_device(&devPosData, numPosData); 659 | copy_HtoD(hostPosData, devPosData, numPosData); 660 | 661 | int* dev_tran; 662 | int* dev_cl_full; 663 | int* dev_cl_part; 664 | allocate_device(&dev_tran, numWarp); 665 | allocate_device(&dev_cl_full, numWarp); 666 | allocate_device(&dev_cl_part, numWarp); 667 | 668 | int nthread = 512; 669 | int nblock = (numPosData - 1)/nthread + 1; 670 | runCountersKernel<<< nblock, nthread >>>(devPosData, numPosData, 671 | 
accWidth, cacheWidth, dev_tran, dev_cl_full, dev_cl_part); 672 | cudaCheck(cudaGetLastError()); 673 | 674 | copy_DtoH(dev_tran, host_tran, numWarp); 675 | copy_DtoH(dev_cl_full, host_cl_full, numWarp); 676 | copy_DtoH(dev_cl_part, host_cl_part, numWarp); 677 | cudaCheck(cudaDeviceSynchronize()); 678 | 679 | deallocate_device(&dev_tran); 680 | deallocate_device(&dev_cl_full); 681 | deallocate_device(&dev_cl_part); 682 | 683 | deallocate_device(&devPosData); 684 | } 685 | 686 | bool cuttGpuModelKernel(cuttPlan_t& plan, const int accWidth, const int cacheWidth, 687 | int& gld_tran, int& gst_tran, int& gld_req, int& gst_req, 688 | int& cl_full_l2, int& cl_part_l2, int& cl_full_l1, int& cl_part_l1) { 689 | 690 | LaunchConfig& lc = plan.launchConfig; 691 | TensorSplit& ts = plan.tensorSplit; 692 | 693 | MemStat* devMemStat; 694 | allocate_device(&devMemStat, 1); 695 | set_device_array(devMemStat, 0, 1, plan.stream); 696 | 697 | switch(ts.method) { 698 | case Trivial: 699 | { 700 | return false; 701 | } 702 | 703 | case Packed: 704 | { 705 | switch(lc.numRegStorage) { 706 | #define CALL0(NREG) \ 707 | countPacked <<< lc.numblock, lc.numthread, ts.volMmk*sizeof(int), plan.stream >>> \ 708 | (ts.volMmk, ts.volMbar, ts.sizeMmk, ts.sizeMbar, \ 709 | plan.Mmk, plan.Mbar, accWidth, cacheWidth, devMemStat) 710 | #define CALL(ICASE) case ICASE: CALL0(ICASE); break 711 | #include "calls.h" 712 | default: 713 | printf("cuttGpuModelKernel no template implemented for numRegStorage %d\n", lc.numRegStorage); 714 | return false; 715 | #undef CALL 716 | #undef CALL0 717 | } 718 | 719 | } 720 | break; 721 | 722 | case PackedSplit: 723 | { 724 | 725 | // Calculate max. volume of split Mmk 726 | const int volSplit = (ts.splitDim/ts.numSplit) + ((ts.splitDim % ts.numSplit) != 0); 727 | const int volMmkSplit = volSplit*ts.volMmkUnsplit; 728 | 729 | switch(lc.numRegStorage) { 730 | #define CALL0(NREG) \ 731 | countPackedSplit <<< lc.numblock, lc.numthread, volMmkSplit*sizeof(int), plan.stream >>> \ 732 | (ts.splitDim, ts.volMmkUnsplit, ts. 
volMbar, ts.sizeMmk, ts.sizeMbar, \ 733 | plan.cuDimMm, plan.cuDimMk, plan.Mmk, plan.Mbar, accWidth, cacheWidth, devMemStat) 734 | #define CALL(ICASE) case ICASE: CALL0(ICASE); break 735 | #include "calls.h" 736 | default: 737 | printf("cuttGpuModelKernel no template implemented for numRegStorage %d\n", lc.numRegStorage); 738 | return false; 739 | #undef CALL 740 | #undef CALL0 741 | } 742 | 743 | } 744 | break; 745 | 746 | case Tiled: 747 | { 748 | countTiled <<< lc.numblock, lc.numthread, 0, plan.stream >>> 749 | (((ts.volMm - 1)/TILEDIM + 1), ts.volMbar, ts.sizeMbar, plan.tiledVol, plan.cuDimMk, plan.cuDimMm, 750 | plan.Mbar, accWidth, cacheWidth, devMemStat); 751 | } 752 | break; 753 | 754 | case TiledCopy: 755 | { 756 | countTiledCopy <<< lc.numblock, lc.numthread, 0, plan.stream >>> 757 | (((ts.volMm - 1)/TILEDIM + 1), ts.volMbar, ts.sizeMbar, plan.cuDimMk, plan.cuDimMm, plan.tiledVol, 758 | plan.Mbar, accWidth, cacheWidth, devMemStat); 759 | } 760 | break; 761 | 762 | } 763 | 764 | cudaCheck(cudaGetLastError()); 765 | 766 | MemStat hostMemStat; 767 | copy_DtoH(devMemStat, &hostMemStat, 1, plan.stream); 768 | cudaCheck(cudaDeviceSynchronize()); 769 | deallocate_device(&devMemStat); 770 | 771 | gld_tran = hostMemStat.gld_tran; 772 | gst_tran = hostMemStat.gst_tran; 773 | gld_req = hostMemStat.gld_req; 774 | gst_req = hostMemStat.gst_req; 775 | cl_full_l2 = hostMemStat.cl_full_l2; 776 | cl_part_l2 = hostMemStat.cl_part_l2; 777 | cl_full_l1 = hostMemStat.cl_full_l1; 778 | cl_part_l1 = hostMemStat.cl_part_l1; 779 | // l1_tran = hostMemStat.l1_tran; 780 | 781 | return true; 782 | } 783 | -------------------------------------------------------------------------------- /src/cutt_bench.cpp: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | MIT License 3 | 4 | Copyright (c) 2016 Antti-Pekka Hynninen 5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | *******************************************************************************/ 25 | #include 26 | #include 27 | #include // strcmp 28 | #include // std::time 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include "cutt.h" 34 | #include "CudaUtils.h" 35 | #include "TensorTester.h" 36 | #include "cuttTimer.h" 37 | #include "CudaMemcpy.h" 38 | #include "int_vector.h" 39 | 40 | #define MILLION 1000000 41 | #define BILLION 1000000000 42 | 43 | // 44 | // Error checking wrapper for cutt 45 | // 46 | #define cuttCheck(stmt) do { \ 47 | cuttResult err = stmt; \ 48 | if (err != CUTT_SUCCESS) { \ 49 | fprintf(stderr, "%s in file %s, function %s\n", #stmt,__FILE__,__FUNCTION__); \ 50 | exit(1); \ 51 | } \ 52 | } while(0) 53 | 54 | char* dataIn = NULL; 55 | char* dataOut = NULL; 56 | size_t dataSize = 0; 57 | TensorTester* tester = NULL; 58 | 59 | cuttTimer* timer; 60 | bool use_cuttPlanMeasure; 61 | bool use_plantimer; 62 | 63 | std::default_random_engine generator; 64 | 65 | bool bench1(int numElem); 66 | bool bench2(int numElem); 67 | bool bench3(int numElem); 68 | bool bench4(); 69 | template bool bench5(int numElem, int ratio); 70 | bool bench6(); 71 | template bool bench7(); 72 | template bool bench_input(std::vector& dim, std::vector& permutation); 73 | template bool bench_memcpy(int numElem); 74 | 75 | bool isTrivial(std::vector& permutation); 76 | void getRandomDim(double vol, std::vector& dim); 77 | template bool bench_tensor(std::vector& dim, std::vector& permutation); 78 | void printVec(std::vector& vec); 79 | void printDeviceInfo(); 80 | 81 | int main(int argc, char *argv[]) { 82 | 83 | int gpuid = -1; 84 | unsigned seed = unsigned (std::time(0)); 85 | bool arg_ok = true; 86 | int benchID = 0; 87 | use_cuttPlanMeasure = false; 88 | use_plantimer = false; 89 | int elemsize = 8; 90 | std::vector dimIn; 91 | std::vector permutationIn; 92 | if (argc >= 2) { 93 | int i = 1; 94 | while (i < argc) { 95 | if (strcmp(argv[i], "-device") == 0) { 96 | sscanf(argv[i+1], "%d", &gpuid); 97 | i += 2; 98 | } else if (strcmp(argv[i], "-bench") == 0) { 99 | sscanf(argv[i+1], "%d", &benchID); 100 | i += 2; 101 | } else if (strcmp(argv[i], "-measure") == 0) { 102 | use_cuttPlanMeasure = true; 103 | i++; 104 | } else if (strcmp(argv[i], "-seed") == 0) { 105 | sscanf(argv[i+1], "%u", &seed); 106 | i += 2; 107 | } else if (strcmp(argv[i], "-plantimer") == 0) { 108 | use_plantimer = true; 109 | i++; 110 | } else if (strcmp(argv[i], "-elemsize") == 0) { 111 | sscanf(argv[i+1], "%u", &elemsize); 112 | i += 2; 113 | } else if (strcmp(argv[i], "-dim") == 0) { 114 | i++; 115 | while (i < argc && isdigit(*argv[i])) { 116 | int val; 117 | sscanf(argv[i++], "%d", &val); 118 | dimIn.push_back(val); 119 | } 120 | } else if (strcmp(argv[i], "-permutation") == 0) { 121 | i++; 122 | while (i < argc && isdigit(*argv[i])) { 123 | int val; 124 | sscanf(argv[i++], "%d", &val); 125 | permutationIn.push_back(val); 126 | } 127 | } else { 128 | arg_ok = false; 129 | break; 130 | } 131 | } 132 | } else if (argc > 1) { 133 | arg_ok = false; 134 | } 135 | 136 | if (elemsize != 4 && elemsize != 8) { 137 | arg_ok = false; 138 | } 139 | 140 | if (!arg_ok) { 141 | printf("cutt_bench [options]\n"); 142 | printf("Options:\n"); 143 | printf("-device [int] : GPU ID (default is 0)\n"); 144 | printf("-measure : use cuttPlanMeasure (default is cuttPlan)\n"); 145 | printf("-plantimer : planning is timed (default is no)\n"); 146 | printf("-seed [int] : seed value for random number generator (default is system timer)\n"); 147 | 
printf("-elemsize [int] : size of elements in bytes, 4 or 8. (default is 8)\n"); 148 | printf("-dim ... : space-separated list of dimensions\n"); 149 | printf("-permutation ... : space-separated list of permutations\n"); 150 | printf("-bench benchID : benchmark to run\n"); 151 | return 1; 152 | } 153 | 154 | if (gpuid >= 0) { 155 | cudaCheck(cudaSetDevice(gpuid)); 156 | } 157 | 158 | cudaCheck(cudaDeviceReset()); 159 | if (elemsize == 4) { 160 | cudaCheck(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); 161 | } else { 162 | cudaCheck(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); 163 | } 164 | 165 | printDeviceInfo(); 166 | printf("CPU using vector type %s of length %d\n", INT_VECTOR_TYPE, INT_VECTOR_LEN); 167 | 168 | timer = new cuttTimer(elemsize); 169 | 170 | dataSize = (elemsize == 4) ? 420*MILLION : 370*MILLION; 171 | 172 | // Allocate device data, 100M elements 173 | allocate_device(&dataIn, dataSize*(size_t)elemsize); 174 | allocate_device(&dataOut, dataSize*(size_t)elemsize); 175 | 176 | // Create tester 177 | tester = new TensorTester(); 178 | tester->setTensorCheckPattern((unsigned int *)dataIn, dataSize*(size_t)elemsize/sizeof(unsigned int)); 179 | 180 | std::vector worstDim; 181 | std::vector worstPermutation; 182 | 183 | std::srand(seed); 184 | generator.seed(seed); 185 | 186 | // if (!bench1(40*MILLION, bandwidths)) goto fail; 187 | // printf("bench1:\n"); 188 | // for (int i=0;i < bandwidths.size();i++) { 189 | // printf("%lf\n", bandwidths[i]); 190 | // } 191 | 192 | // if (!bench2(40*MILLION, bandwidths)) goto fail; 193 | // printf("bench2:\n"); 194 | // for (int i=0;i < bandwidths.size();i++) { 195 | // printf("%lf\n", bandwidths[i]); 196 | // } 197 | 198 | if (dimIn.size() > 0) { 199 | bool ok = (elemsize == 4) ? bench_input(dimIn, permutationIn) : bench_input(dimIn, permutationIn); 200 | if (ok) goto benchOK; 201 | goto fail; 202 | } 203 | 204 | if (benchID == 3) { 205 | if (elemsize == 4) { 206 | printf("bench 3 not implemented for elemsize = 4\n"); 207 | goto fail; 208 | } 209 | if (bench3(200*MILLION)) { 210 | printf("bench3:\n"); 211 | printf("rank best worst average median\n"); 212 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 213 | double worstBW = timer->getWorst(*it); 214 | double bestBW = timer->getBest(*it); 215 | double aveBW = timer->getAverage(*it); 216 | double medBW = timer->getMedian(*it); 217 | printf("%d %6.2lf %6.2lf %6.2lf %6.2lf\n", *it, bestBW, worstBW, aveBW, medBW); 218 | } 219 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 220 | std::vector dim; 221 | std::vector permutation; 222 | double worstBW = timer->getWorst(*it, dim, permutation); 223 | printf("rank %d BW %4.2lf\n", *it, worstBW); 224 | printf("dimensions\n"); 225 | printVec(dim); 226 | printf("permutation\n"); 227 | printVec(permutation); 228 | } 229 | goto benchOK; 230 | } else { 231 | goto fail; 232 | } 233 | } 234 | 235 | if (benchID/100 == 5) { 236 | bool ok = (elemsize == 4) ? 
bench5(200*MILLION, benchID % 100) : bench5(200*MILLION, benchID % 100); 237 | if (ok) { 238 | printf("bench5:\n"); 239 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 240 | std::vector v = timer->getData(*it); 241 | printf("RANK%d", *it); 242 | for (int i=0;i < v.size();i++) { 243 | printf(" %1.2lf", v[i]); 244 | } 245 | printf("\n"); 246 | } 247 | printf("rank best worst average median\n"); 248 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 249 | double worstBW = timer->getWorst(*it); 250 | double bestBW = timer->getBest(*it); 251 | double aveBW = timer->getAverage(*it); 252 | double medBW = timer->getMedian(*it); 253 | printf("%d %6.2lf %6.2lf %6.2lf %6.2lf\n", *it, bestBW, worstBW, aveBW, medBW); 254 | } 255 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 256 | std::vector dim; 257 | std::vector permutation; 258 | double worstBW = timer->getWorst(*it, dim, permutation); 259 | printf("rank %d BW %4.2lf\n", *it, worstBW); 260 | printf("dimensions\n"); 261 | printVec(dim); 262 | printf("permutation\n"); 263 | printVec(permutation); 264 | } 265 | goto benchOK; 266 | } else { 267 | goto fail; 268 | } 269 | } 270 | 271 | if (benchID == 6) { 272 | if (elemsize == 4) { 273 | printf("bench 6 not implemented for elemsize = 4\n"); 274 | goto fail; 275 | } 276 | if (bench6()) { 277 | printf("bench6:\n"); 278 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 279 | std::vector v = timer->getData(*it); 280 | printf("RANK%d", *it); 281 | for (int i=0;i < v.size();i++) { 282 | printf(" %1.2lf", v[i]); 283 | } 284 | printf("\n"); 285 | } 286 | printf("rank best worst average\n"); 287 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 288 | double worstBW = timer->getWorst(*it); 289 | double bestBW = timer->getBest(*it); 290 | double aveBW = timer->getAverage(*it); 291 | printf("%d %6.2lf %6.2lf %6.2lf\n", *it, bestBW, worstBW, aveBW); 292 | } 293 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 294 | std::vector dim; 295 | std::vector permutation; 296 | double worstBW = timer->getWorst(*it, dim, permutation); 297 | printf("rank %d BW %4.2lf\n", *it, worstBW); 298 | printf("dimensions\n"); 299 | printVec(dim); 300 | printf("permutation\n"); 301 | printVec(permutation); 302 | } 303 | goto benchOK; 304 | } else { 305 | goto fail; 306 | } 307 | } 308 | 309 | if (benchID == 7) { 310 | bool ok = (elemsize == 4) ? 
bench7() : bench7(); 311 | if (ok) { 312 | printf("bench7:\n"); 313 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 314 | std::vector v = timer->getData(*it); 315 | printf("RANK%d", *it); 316 | for (int i=0;i < v.size();i++) { 317 | printf(" %1.2lf", v[i]); 318 | } 319 | printf("\n"); 320 | } 321 | printf("rank best worst average median\n"); 322 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 323 | double worstBW = timer->getWorst(*it); 324 | double bestBW = timer->getBest(*it); 325 | double aveBW = timer->getAverage(*it); 326 | double medBW = timer->getMedian(*it); 327 | printf("%d %6.2lf %6.2lf %6.2lf %6.2lf\n", *it, bestBW, worstBW, aveBW, medBW); 328 | } 329 | for (auto it=timer->ranksBegin();it != timer->ranksEnd();it++) { 330 | std::vector dim; 331 | std::vector permutation; 332 | double worstBW = timer->getWorst(*it, dim, permutation); 333 | printf("rank %d BW %4.2lf\n", *it, worstBW); 334 | printf("dimensions\n"); 335 | printVec(dim); 336 | printf("permutation\n"); 337 | printVec(permutation); 338 | } 339 | goto benchOK; 340 | } else { 341 | goto fail; 342 | } 343 | } 344 | 345 | // Otherwise, do memcopy benchmark 346 | { 347 | bool ok = (elemsize == 4) ? bench_memcpy(benchID) : bench_memcpy(benchID); 348 | if (ok) goto benchOK; 349 | goto fail; 350 | } 351 | 352 | benchOK: 353 | printf("bench OK\n"); 354 | 355 | goto end; 356 | fail: 357 | printf("bench FAIL\n"); 358 | end: 359 | deallocate_device(&dataIn); 360 | deallocate_device(&dataOut); 361 | delete tester; 362 | 363 | printf("seed %u\n", seed); 364 | 365 | delete timer; 366 | 367 | cudaCheck(cudaDeviceSynchronize()); 368 | 369 | cudaCheck(cudaDeviceReset()); 370 | return 0; 371 | } 372 | 373 | // 374 | // Benchmark 1: ranks 2-8,15 in inverse permutation. 32 start and end dimension 375 | // 376 | bool bench1(int numElem) { 377 | int ranks[8] = {2, 3, 4, 5, 6, 7, 8, 15}; 378 | for (int i=0;i <= 7;i++) { 379 | std::vector dim(ranks[i]); 380 | std::vector permutation(ranks[i]); 381 | int dimave = (int)pow(numElem, 1.0/(double)ranks[i]); 382 | 383 | if (dimave < 100.0) { 384 | dim[0] = 32; 385 | dim[ranks[i] - 1] = 32; 386 | } else { 387 | dim[0] = dimave; 388 | dim[ranks[i] - 1] = dimave; 389 | } 390 | // Distribute remaining volume to the middle ranks 391 | int ranks_left = ranks[i] - 2; 392 | double numElem_left = numElem/(double)(dim[0]*dim[ranks[i] - 1]); 393 | for (int r=1;r < ranks[i] - 1;r++) { 394 | dim[r] = (int)pow(numElem_left, 1.0/(double)ranks_left); 395 | numElem_left /= (double)dim[r]; 396 | ranks_left--; 397 | } 398 | 399 | // Inverse order 400 | for (int r=0;r < ranks[i];r++) { 401 | permutation[r] = ranks[i] - 1 - r; 402 | } 403 | 404 | if (!bench_tensor(dim, permutation)) return false; 405 | } 406 | 407 | return true; 408 | } 409 | 410 | // 411 | // Benchmark 2: ranks 2-8,15 in inverse permutation. Even spread of dimensions. 
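// "Even spread" here means dim[r] is chosen as the (rank - r)-th root of the volume
// still left to distribute, so every dimension lands close to numElem^(1/rank);
// for example, with rank 3 and roughly 2.0e8 elements each dimension comes out near 585.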
412 | // 413 | bool bench2(int numElem) { 414 | int ranks[8] = {2, 3, 4, 5, 6, 7, 8, 15}; 415 | for (int i=0;i <= 7;i++) { 416 | std::vector dim(ranks[i]); 417 | std::vector permutation(ranks[i]); 418 | int dimave = (int)pow(numElem, 1.0/(double)ranks[i]); 419 | 420 | double numElem_left = numElem; 421 | for (int r=0;r < ranks[i];r++) { 422 | dim[r] = (int)pow(numElem_left, 1.0/(double)(ranks[i] - r)); 423 | numElem_left /= (double)dim[r]; 424 | } 425 | 426 | // Inverse order 427 | for (int r=0;r < ranks[i];r++) { 428 | permutation[r] = ranks[i] - 1 - r; 429 | } 430 | 431 | if (!bench_tensor(dim, permutation)) return false; 432 | } 433 | 434 | return true; 435 | } 436 | 437 | // 438 | // Benchmark 3: ranks 2-8,15 in random permutation and dimensions. 439 | // 440 | bool bench3(int numElem) { 441 | 442 | int ranks[8] = {2, 3, 4, 5, 6, 7, 8, 15}; 443 | 444 | for (int i=0;i <= 7;i++) { 445 | std::vector dim(ranks[i]); 446 | std::vector permutation(ranks[i]); 447 | for (int r=0;r < ranks[i];r++) permutation[r] = r; 448 | for (int nsample=0;nsample < 50;nsample++) { 449 | std::random_shuffle(permutation.begin(), permutation.end()); 450 | getRandomDim((double)numElem, dim); 451 | if (!bench_tensor(dim, permutation)) return false; 452 | } 453 | } 454 | 455 | return true; 456 | } 457 | 458 | // 459 | // Benchmark 4: specific examples 460 | // 461 | bool bench4() { 462 | } 463 | 464 | template 465 | bool bench_input(std::vector& dim, std::vector& permutation) { 466 | if (!bench_tensor(dim, permutation)) return false; 467 | printf("dimensions\n"); 468 | printVec(dim); 469 | printf("permutation\n"); 470 | printVec(permutation); 471 | printf("bandwidth %4.2lf GB/s\n", timer->GBs()); 472 | return true; 473 | } 474 | 475 | // 476 | // Benchmark 5: All permutations for ranks 2-4, limited permutations for ranks 5-7 477 | // 478 | template 479 | bool bench5(int numElemAvg, int ratio) { 480 | 481 | std::normal_distribution numElem_dist((double)numElemAvg, (double)numElemAvg*0.2); 482 | 483 | const int minDim = 2; 484 | const int maxDim = 16; 485 | for (int rank=2;rank <= 7;rank++) { 486 | 487 | for (int iter=0;iter < 500;iter++) { 488 | 489 | int numElem = (int)numElem_dist(generator); 490 | 491 | std::vector dim(rank); 492 | std::vector permutation(rank); 493 | std::vector dimf(rank); 494 | double volf = 1.0; 495 | for (int r=0;r < rank;r++) { 496 | permutation[r] = r; 497 | dimf[r] = 1.0 + (double)r*(ratio - 1.0)/(double)(rank - 1); 498 | volf *= dimf[r]; 499 | } 500 | // fprintf(stderr, "volf %lf\n", volf); 501 | double scale = pow((double)numElem/volf, 1.0/(double)rank); 502 | // fprintf(stderr, "scale %lf\n", scale); 503 | int vol = 1; 504 | for (int r=0;r < rank;r++) { 505 | if (r == rank - 1) { 506 | dim[r] = ratio*dim[0]; 507 | } else { 508 | dim[r] = (int)round(dimf[r]*scale); 509 | } 510 | dim[r] = std::max(2, dim[r]); 511 | vol *= dim[r]; 512 | } 513 | // fprintf(stderr, "dim[0] %lf\n", dim[0]); 514 | double cur_ratio = (double)dim[rank-1]/(double)dim[0]; 515 | double vol_re = fabs((double)(vol - numElem)/(double)numElem); 516 | // fprintf(stderr, "cur_ratio %lf vol_re %lf\n", cur_ratio, vol_re); 517 | // Fix dimensions if volume is off by more than 5% 518 | if (vol_re > 0.05) { 519 | int d = (vol < numElem) ? 
1 : -1; 520 | int r = 1; 521 | while (vol_re > 0.05 && r < rank) { 522 | int dim_plus_d = std::max(2, dim[r] + d); 523 | // fprintf(stderr, "r %d vol %lf dim[r] %d dim_plus_d %d\n", vol, dim[r], dim_plus_d); 524 | vol = (vol/dim[r])*dim_plus_d; 525 | dim[r] = dim_plus_d; 526 | vol_re = fabs((double)(vol - numElem)/(double)numElem); 527 | r++; 528 | } 529 | } 530 | int minDim = *(std::min_element(dim.begin(), dim.end())); 531 | int maxDim = *(std::max_element(dim.begin(), dim.end())); 532 | // fprintf(stderr, "minDim %lf maxDim\n", minDim, maxDim); 533 | cur_ratio = (double)maxDim/(double)minDim; 534 | printf("vol %d cur_ratio %lf | %lf\n", vol, cur_ratio, vol_re); 535 | printVec(dim); 536 | 537 | std::random_shuffle(dim.begin(), dim.end()); 538 | while (isTrivial(permutation)) { 539 | std::random_shuffle(permutation.begin(), permutation.end()); 540 | } 541 | if (!bench_tensor(dim, permutation)) return false; 542 | } 543 | } 544 | 545 | return true; 546 | } 547 | 548 | // 549 | // Benchmark 6: from "TTC: A Tensor Transposition Compiler for Multiple Architectures" 550 | // 551 | bool bench6() { 552 | 553 | std::vector< std::vector > dims = { 554 | std::vector{7248,7248}, 555 | std::vector{43408,1216}, 556 | std::vector{1216,43408}, 557 | std::vector{368,384,384}, 558 | std::vector{2144,64,384}, 559 | std::vector{368,64,2307}, 560 | std::vector{384,384,355}, 561 | std::vector{2320,384,59}, 562 | std::vector{384,2320,59}, 563 | std::vector{384,355,384}, 564 | std::vector{2320,59,384}, 565 | std::vector{384,59,2320}, 566 | std::vector{80,96,75,96}, 567 | std::vector{464,16,75,96}, 568 | std::vector{80,16,75,582}, 569 | std::vector{96,75,96,75}, 570 | std::vector{608,12,96,75}, 571 | std::vector{96,12,608,75}, 572 | std::vector{96,75,96,75}, 573 | std::vector{608,12,96,75}, 574 | std::vector{96,12,608,75}, 575 | std::vector{96,96,75,75}, 576 | std::vector{608,96,12,75}, 577 | std::vector{96,608,12,75}, 578 | std::vector{96,75,75,96}, 579 | std::vector{608,12,75,96}, 580 | std::vector{96,12,75,608}, 581 | std::vector{32,48,28,28,48}, 582 | std::vector{176,8,28,28,48}, 583 | std::vector{32,8,28,28,298}, 584 | std::vector{48,28,28,48,28}, 585 | std::vector{352,4,28,48,28}, 586 | std::vector{48,4,28,352,28}, 587 | std::vector{48,28,48,28,28}, 588 | std::vector{352,4,48,28,28}, 589 | std::vector{48,4,352,28,28}, 590 | std::vector{48,48,28,28,28}, 591 | std::vector{352,48,4,28,28}, 592 | std::vector{48,352,4,28,28}, 593 | std::vector{48,28,28,28,48}, 594 | std::vector{352,4,28,28,48}, 595 | std::vector{48,4,28,28,352}, 596 | std::vector{16,32,15,32,15,15}, 597 | std::vector{48,10,15,32,15,15}, 598 | std::vector{16,10,15,103,15,15}, 599 | std::vector{32,15,15,32,15,15}, 600 | std::vector{112,5,15,32,15,15}, 601 | std::vector{32,5,15,112,15,15}, 602 | std::vector{32,15,32,15,15,15}, 603 | std::vector{112,5,32,15,15,15}, 604 | std::vector{32,5,112,15,15,15}, 605 | std::vector{32,15,15,32,15,15}, 606 | std::vector{112,5,15,32,15,15}, 607 | std::vector{32,5,15,112,15,15}, 608 | std::vector{32,15,15,15,15,32}, 609 | std::vector{112,5,15,15,15,32}, 610 | std::vector{32,5,15,15,15,112} 611 | }; 612 | 613 | std::vector< std::vector > permutations = { 614 | std::vector{1,0}, 615 | std::vector{1,0}, 616 | std::vector{1,0}, 617 | std::vector{0,2,1}, 618 | std::vector{0,2,1}, 619 | std::vector{0,2,1}, 620 | std::vector{1,0,2}, 621 | std::vector{1,0,2}, 622 | std::vector{1,0,2}, 623 | std::vector{2,1,0}, 624 | std::vector{2,1,0}, 625 | std::vector{2,1,0}, 626 | std::vector{0,3,2,1}, 627 | std::vector{0,3,2,1}, 628 
| std::vector{0,3,2,1}, 629 | std::vector{2,1,3,0}, 630 | std::vector{2,1,3,0}, 631 | std::vector{2,1,3,0}, 632 | std::vector{2,0,3,1}, 633 | std::vector{2,0,3,1}, 634 | std::vector{2,0,3,1}, 635 | std::vector{1,0,3,2}, 636 | std::vector{1,0,3,2}, 637 | std::vector{1,0,3,2}, 638 | std::vector{3,2,1,0}, 639 | std::vector{3,2,1,0}, 640 | std::vector{3,2,1,0}, 641 | std::vector{0,4,2,1,3}, 642 | std::vector{0,4,2,1,3}, 643 | std::vector{0,4,2,1,3}, 644 | std::vector{3,2,1,4,0}, 645 | std::vector{3,2,1,4,0}, 646 | std::vector{3,2,1,4,0}, 647 | std::vector{2,0,4,1,3}, 648 | std::vector{2,0,4,1,3}, 649 | std::vector{2,0,4,1,3}, 650 | std::vector{1,3,0,4,2}, 651 | std::vector{1,3,0,4,2}, 652 | std::vector{1,3,0,4,2}, 653 | std::vector{4,3,2,1,0}, 654 | std::vector{4,3,2,1,0}, 655 | std::vector{4,3,2,1,0}, 656 | std::vector{0,3,2,5,4,1}, 657 | std::vector{0,3,2,5,4,1}, 658 | std::vector{0,3,2,5,4,1}, 659 | std::vector{3,2,0,5,1,4}, 660 | std::vector{3,2,0,5,1,4}, 661 | std::vector{3,2,0,5,1,4}, 662 | std::vector{2,0,4,1,5,3}, 663 | std::vector{2,0,4,1,5,3}, 664 | std::vector{2,0,4,1,5,3}, 665 | std::vector{3,2,5,1,0,4}, 666 | std::vector{3,2,5,1,0,4}, 667 | std::vector{3,2,5,1,0,4}, 668 | std::vector{5,4,3,2,1,0}, 669 | std::vector{5,4,3,2,1,0}, 670 | std::vector{5,4,3,2,1,0} 671 | }; 672 | 673 | for (int i=0;i < dims.size();i++) { 674 | if (!bench_tensor(dims[i], permutations[i])) return false; 675 | printf("dimensions\n"); 676 | printVec(dims[i]); 677 | printf("permutation\n"); 678 | printVec(permutations[i]); 679 | printf("bandwidth %4.2lf GiB/s\n", timer->GiBs()); 680 | } 681 | 682 | return true; 683 | } 684 | 685 | // 686 | // Benchmark 7: ranks 8 and 12 with 4 large dimensions and rest small dimensions 687 | // 688 | template 689 | bool bench7() { 690 | 691 | // 199584000 elements 692 | { 693 | std::vector dim = {5, 3, 2, 4, 35, 33, 37, 40}; 694 | std::vector permutation(8); 695 | // Inverse 696 | for (int r=0;r < dim.size();r++) permutation[r] = dim.size() - 1 - r; 697 | if (!bench_tensor(dim, permutation)) return false; 698 | // Random 699 | for (int r=0;r < dim.size();r++) permutation[r] = r; 700 | for (int nsample=0;nsample < 500;nsample++) { 701 | std::random_shuffle(dim.begin(), dim.end()); 702 | std::random_shuffle(permutation.begin(), permutation.end()); 703 | if (!isTrivial(permutation)) { 704 | if (!bench_tensor(dim, permutation)) return false; 705 | } 706 | } 707 | } 708 | 709 | // 328458240 elements 710 | { 711 | std::vector dim = {2, 3, 4, 3, 2, 2, 3, 2, 20, 18, 22, 24}; 712 | std::vector permutation(12); 713 | // Inverse 714 | for (int r=0;r < dim.size();r++) permutation[r] = dim.size() - 1 - r; 715 | if (!bench_tensor(dim, permutation)) return false; 716 | // Random 717 | for (int r=0;r < dim.size();r++) permutation[r] = r; 718 | for (int nsample=0;nsample < 500;nsample++) { 719 | std::random_shuffle(dim.begin(), dim.end()); 720 | std::random_shuffle(permutation.begin(), permutation.end()); 721 | if (!isTrivial(permutation)) { 722 | if (!bench_tensor(dim, permutation)) return false; 723 | } 724 | } 725 | } 726 | 727 | return true; 728 | } 729 | 730 | // 731 | // Returns true for trivial permutation 732 | // 733 | bool isTrivial(std::vector& permutation) { 734 | for (int i=0;i < permutation.size();i++) { 735 | if (permutation[i] != i) return false; 736 | } 737 | return true; 738 | } 739 | 740 | // 741 | // Get random dimensions for a fixed volume tensor 742 | // 743 | void getRandomDim(double vol, std::vector& dim) { 744 | double dimave = floor(pow(vol, 
1.0/(double)dim.size()));
745 | double curvol = 1.0;
746 | int iter = 0;
747 | do {
748 | curvol = 1.0;
749 | for (int r=0;r < dim.size();r++) {
750 | // p is -1 ... 1
751 | double p = (((double)rand()/(double)RAND_MAX) - 0.5)*2.0;
752 | dim[r] = round(dimave + p*(dimave - 2.0));
753 | curvol *= (double)dim[r];
754 | }
755 | 
756 | double vol_scale = pow(vol/curvol, 1.0/(double)dim.size());
757 | curvol = 1.0;
758 | for (int r=0;r < dim.size();r++) {
759 | dim[r] = std::max(2, (int)(dim[r]*vol_scale));
760 | curvol *= dim[r];
761 | }
762 | // printf("curvol %lf\n", curvol/MILLION);
763 | iter++;
764 | } while (iter < 5000 && (fabs(curvol-vol)/(double)vol > 0.3));
765 | 
766 | if (iter == 5000) {
767 | printf("getRandomDim: Unable to determine dimensions in 5000 iterations\n");
768 | exit(1);
769 | }
770 | }
771 | 
772 | template <typename T>
773 | bool bench_tensor(std::vector<int>& dim, std::vector<int>& permutation) {
774 | 
775 | int rank = dim.size();
776 | 
777 | int vol = 1;
778 | for (int r=0;r < rank;r++) {
779 | vol *= dim[r];
780 | }
781 | 
782 | size_t volmem = vol*sizeof(T);
783 | size_t datamem = dataSize*sizeof(long long int);
784 | if (volmem > datamem) {
785 | printf("test_tensor, data size exceeded\n");
786 | return false;
787 | }
788 | 
789 | std::vector<int> dimp(rank);
790 | for (int r=0;r < rank;r++) {
791 | dimp[r] = dim[permutation[r]];
792 | }
793 | 
794 | printf("number of elements %d\n", vol);
795 | printf("dimensions\n");
796 | printVec(dim);
797 | printVec(dimp);
798 | printf("permutation\n");
799 | printVec(permutation);
800 | 
801 | cuttHandle plan;
802 | std::chrono::high_resolution_clock::time_point plan_start;
803 | if (use_plantimer) {
804 | plan_start = std::chrono::high_resolution_clock::now();
805 | }
806 | if (use_cuttPlanMeasure) {
807 | cuttCheck(cuttPlanMeasure(&plan, rank, dim.data(), permutation.data(), sizeof(T), 0, dataIn, dataOut));
808 | } else {
809 | cuttCheck(cuttPlan(&plan, rank, dim.data(), permutation.data(), sizeof(T), 0));
810 | }
811 | if (use_plantimer) {
812 | std::chrono::high_resolution_clock::time_point plan_end;
813 | plan_end = std::chrono::high_resolution_clock::now();
814 | double plan_duration = std::chrono::duration_cast< std::chrono::duration<double> >(plan_end - plan_start).count();
815 | printf("plan took %lf ms\n", plan_duration*1000.0);
816 | }
817 | 
818 | for (int i=0;i < 4;i++) {
819 | set_device_array<T>((T *)dataOut, -1, vol);
820 | cudaCheck(cudaDeviceSynchronize());
821 | 
822 | timer->start(dim, permutation);
823 | cuttCheck(cuttExecute(plan, dataIn, dataOut));
824 | timer->stop();
825 | 
826 | printf("wall time %lf ms %lf GB/s\n", timer->seconds()*1000.0, timer->GBs());
827 | }
828 | 
829 | cuttCheck(cuttDestroy(plan));
830 | return tester->checkTranspose(rank, dim.data(), permutation.data(), (T *)dataOut);
831 | }
832 | 
833 | void printVec(std::vector<int>& vec) {
834 | for (int i=0;i < vec.size();i++) {
835 | printf("%d ", vec[i]);
836 | }
837 | printf("\n");
838 | }
839 | 
840 | //
841 | // Benchmarks memory copy. Returns bandwidth in GB/s
842 | //
843 | template <typename T>
844 | bool bench_memcpy(int numElem) {
845 | 
846 | std::vector<int> dim(1, numElem);
847 | std::vector<int> permutation(1, 0);
848 | 
849 | {
850 | cuttTimer timer(sizeof(T));
851 | for (int i=0;i < 4;i++) {
852 | set_device_array<T>((T *)dataOut, -1, numElem);
853 | cudaCheck(cudaDeviceSynchronize());
854 | timer.start(dim, permutation);
855 | scalarCopy<T>(numElem, (T *)dataIn, (T *)dataOut, 0);
856 | timer.stop();
857 | printf("%4.2lf GB/s\n", timer.GBs());
858 | }
859 | if (!tester->checkTranspose(1, dim.data(), permutation.data(), (T *)dataOut)) return false;
860 | printf("scalarCopy %lf GB/s\n", timer.getAverage(1));
861 | }
862 | 
863 | {
864 | cuttTimer timer(sizeof(T));
865 | for (int i=0;i < 4;i++) {
866 | set_device_array<T>((T *)dataOut, -1, numElem);
867 | cudaCheck(cudaDeviceSynchronize());
868 | timer.start(dim, permutation);
869 | vectorCopy<T>(numElem, (T *)dataIn, (T *)dataOut, 0);
870 | timer.stop();
871 | printf("%4.2lf GB/s\n", timer.GBs());
872 | }
873 | if (!tester->checkTranspose(1, dim.data(), permutation.data(), (T *)dataOut)) return false;
874 | printf("vectorCopy %lf GB/s\n", timer.getAverage(1));
875 | }
876 | 
877 | {
878 | cuttTimer timer(sizeof(T));
879 | for (int i=0;i < 4;i++) {
880 | set_device_array<T>((T *)dataOut, -1, numElem);
881 | cudaCheck(cudaDeviceSynchronize());
882 | timer.start(dim, permutation);
883 | memcpyFloat(numElem*sizeof(T)/sizeof(float), (float *)dataIn, (float *)dataOut, 0);
884 | timer.stop();
885 | printf("%4.2lf GB/s\n", timer.GBs());
886 | }
887 | if (!tester->checkTranspose(1, dim.data(), permutation.data(), (T *)dataOut)) return false;
888 | printf("memcpyFloat %lf GB/s\n", timer.getAverage(1));
889 | }
890 | 
891 | return true;
892 | }
893 | 
894 | void printDeviceInfo() {
895 | int deviceID;
896 | cudaCheck(cudaGetDevice(&deviceID));
897 | cudaDeviceProp prop;
898 | cudaCheck(cudaGetDeviceProperties(&prop, deviceID));
899 | cudaSharedMemConfig pConfig;
900 | cudaCheck(cudaDeviceGetSharedMemConfig(&pConfig));
901 | int shMemBankSize = 4;
902 | if (pConfig == cudaSharedMemBankSizeEightByte) shMemBankSize = 8;
903 | double mem_BW = (double)(prop.memoryClockRate*2*(prop.memoryBusWidth/8))/1.0e6;
904 | printf("Using %s SM version %d.%d\n", prop.name, prop.major, prop.minor);
905 | printf("Clock %1.3lfGhz numSM %d ECC %d mem BW %1.2lfGB/s shMemBankSize %dB\n", (double)prop.clockRate/1e6,
906 | prop.multiProcessorCount, prop.ECCEnabled, mem_BW, shMemBankSize);
907 | printf("L2 %1.2lfMB\n", (double)prop.l2CacheSize/(double)(1024*1024));
908 | 
909 | }
910 | 
--------------------------------------------------------------------------------
/src/cuttkernel.cu:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | MIT License
3 | 
4 | Copyright (c) 2016 Antti-Pekka Hynninen
5 | Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle)
6 | 
7 | Permission is hereby granted, free of charge, to any person obtaining a copy
8 | of this software and associated documentation files (the "Software"), to deal
9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 | 
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial
portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | *******************************************************************************/ 25 | #include 26 | #include "CudaUtils.h" 27 | #include "LRUCache.h" 28 | #include "cuttkernel.h" 29 | 30 | #define RESTRICT __restrict__ 31 | 32 | // 33 | // Transpose when Mm and Mk don't overlap and contain only single rank 34 | // 35 | // dim3 numthread(TILEDIM, TILEROWS, 1); 36 | // dim3 numblock( ((plan.volMm-1)/TILEDIM+1)*((plan.volMk-1)/TILEDIM+1), 1, plan.volMbar); 37 | // 38 | template 39 | __global__ void transposeTiled( 40 | const int numMm, const int volMbar, const int sizeMbar, 41 | const int2 tiledVol, const int cuDimMk, const int cuDimMm, 42 | const TensorConvInOut* RESTRICT glMbar, 43 | const T* RESTRICT dataIn, T* RESTRICT dataOut) { 44 | 45 | // Shared memory 46 | __shared__ T shTile[TILEDIM][TILEDIM+1]; 47 | 48 | const int warpLane = threadIdx.x & (warpSize - 1); 49 | TensorConvInOut Mbar; 50 | Mbar.c_in = 1; 51 | Mbar.d_in = 1; 52 | Mbar.c_out = 1; 53 | Mbar.d_out = 1; 54 | if (warpLane < sizeMbar) { 55 | Mbar = glMbar[warpLane]; 56 | } 57 | 58 | const int bx = (blockIdx.x % numMm)*TILEDIM; 59 | const int by = (blockIdx.x / numMm)*TILEDIM; 60 | 61 | const int xin = bx + threadIdx.x; 62 | const int yin = by + threadIdx.y; 63 | 64 | const int xout = bx + threadIdx.y; 65 | const int yout = by + threadIdx.x; 66 | 67 | const unsigned int maskIny = __ballot_sync(FULL_MASK, (yin + warpLane < tiledVol.y))*(xin < tiledVol.x); 68 | const unsigned int maskOutx = __ballot_sync(FULL_MASK, (xout + warpLane < tiledVol.x))*(yout < tiledVol.y); 69 | 70 | const int posMinorIn = xin + yin*cuDimMk; 71 | const int posMinorOut = yout + xout*cuDimMm; 72 | const int posInAdd = TILEROWS*cuDimMk; 73 | const int posOutAdd = TILEROWS*cuDimMm; 74 | 75 | for (int posMbar=blockIdx.z;posMbar < volMbar;posMbar += gridDim.z) 76 | { 77 | 78 | // Compute global memory positions 79 | int posMajorIn = ((posMbar/Mbar.c_in) % Mbar.d_in)*Mbar.ct_in; 80 | int posMajorOut = ((posMbar/Mbar.c_out) % Mbar.d_out)*Mbar.ct_out; 81 | #pragma unroll 82 | for (int i=16;i >= 1;i/=2) { 83 | posMajorIn += __shfl_xor_sync(FULL_MASK, posMajorIn, i); 84 | posMajorOut += __shfl_xor_sync(FULL_MASK, posMajorOut, i); 85 | } 86 | int posIn = posMajorIn + posMinorIn; 87 | int posOut = posMajorOut + posMinorOut; 88 | 89 | // Read from global memory 90 | __syncthreads(); 91 | 92 | // Read data into shared memory tile 93 | #pragma unroll 94 | for (int j=0;j < TILEDIM;j += TILEROWS) { 95 | // int pos = posIn + j*cuDimMk; 96 | // if (xin < readVol.x && yin + j < readVol.y) { 97 | if ((maskIny & (1 << j)) != 0) { 98 | shTile[threadIdx.y + j][threadIdx.x] = dataIn[posIn]; 99 | } 100 | posIn += posInAdd; 101 | } 102 | 103 | // Write to global memory 104 | __syncthreads(); 105 | 106 | #pragma unroll 107 | for (int j=0;j < TILEDIM;j += TILEROWS) { 108 | // int pos = posOut + j*cuDimMm; 109 | // if (xout + j < readVol.x && yout < readVol.y) { 110 | if ((maskOutx & (1 << j)) != 0 ) { 111 | dataOut[posOut] = shTile[threadIdx.x][threadIdx.y + 
j]; 112 | } 113 | posOut += posOutAdd; 114 | } 115 | 116 | } 117 | 118 | } 119 | 120 | // 121 | // Packed transpose. Thread block loads plan.volMmk number of elements 122 | // 123 | template 124 | __global__ void transposePacked( 125 | const int volMmk, const int volMbar, 126 | const int sizeMmk, const int sizeMbar, 127 | const TensorConvInOut* RESTRICT gl_Mmk, 128 | const TensorConvInOut* RESTRICT gl_Mbar, 129 | const TensorConv* RESTRICT gl_Msh, 130 | const T* RESTRICT dataIn, T* RESTRICT dataOut) { 131 | 132 | // Shared memory. volMmk elements 133 | extern __shared__ char shBuffer_char[]; 134 | T* shBuffer = (T *)shBuffer_char; 135 | 136 | const int warpLane = threadIdx.x & (warpSize - 1); 137 | 138 | TensorConvInOut Mmk; 139 | Mmk.c_in = 1; 140 | Mmk.d_in = 1; 141 | Mmk.c_out = 1; 142 | Mmk.d_out = 1; 143 | if (warpLane < sizeMmk) { 144 | Mmk = gl_Mmk[warpLane]; 145 | } 146 | TensorConv Msh; 147 | Msh.c = 1; 148 | Msh.d = 1; 149 | if (warpLane < sizeMmk) { 150 | Msh = gl_Msh[warpLane]; 151 | } 152 | 153 | // Pre-compute tensor positions in Mmk 154 | // 3*numRegStorage registers 155 | int posMmkIn[numRegStorage]; 156 | int posMmkOut[numRegStorage]; 157 | int posSh[numRegStorage]; 158 | #pragma unroll 159 | for (int j=0;j < numRegStorage;j++) { 160 | posMmkIn[j] = 0; 161 | posMmkOut[j] = 0; 162 | posSh[j] = 0; 163 | } 164 | for (int i=0;i < sizeMmk;i++) { 165 | #pragma unroll 166 | for (int j=0;j < numRegStorage;j++) { 167 | int posMmk = threadIdx.x + j*blockDim.x; 168 | posMmkIn[j] += ((posMmk / __shfl_sync(FULL_MASK, Mmk.c_in,i)) % __shfl_sync(FULL_MASK, Mmk.d_in,i))*__shfl_sync(FULL_MASK, Mmk.ct_in,i); 169 | posMmkOut[j] += ((posMmk / __shfl_sync(FULL_MASK, Mmk.c_out,i)) % __shfl_sync(FULL_MASK, Mmk.d_out,i))*__shfl_sync(FULL_MASK, Mmk.ct_out,i); 170 | posSh[j] += ((posMmk / __shfl_sync(FULL_MASK, Msh.c,i)) % __shfl_sync(FULL_MASK, Msh.d,i))*__shfl_sync(FULL_MASK, Msh.ct,i); 171 | } 172 | } 173 | 174 | // 6 registers 175 | TensorConvInOut Mbar; 176 | Mbar.c_in = 1; 177 | Mbar.d_in = 1; 178 | Mbar.c_out = 1; 179 | Mbar.d_out = 1; 180 | if (warpLane < sizeMbar) { 181 | Mbar = gl_Mbar[warpLane]; 182 | } 183 | 184 | for (int posMbar=blockIdx.x;posMbar < volMbar;posMbar += gridDim.x) 185 | { 186 | 187 | int posMbarOut = ((posMbar/Mbar.c_out) % Mbar.d_out)*Mbar.ct_out; 188 | #pragma unroll 189 | for (int i=16;i >= 1;i/=2) { 190 | posMbarOut += __shfl_xor_sync(FULL_MASK, posMbarOut, i); 191 | } 192 | 193 | int posMbarIn = ((posMbar/Mbar.c_in) % Mbar.d_in)*Mbar.ct_in; 194 | #pragma unroll 195 | for (int i=16;i >= 1;i/=2) { 196 | posMbarIn += __shfl_xor_sync(FULL_MASK, posMbarIn, i); 197 | } 198 | 199 | __syncthreads(); 200 | 201 | // Read from global memory 202 | #pragma unroll 203 | for (int j=0;j < numRegStorage;j++) { 204 | int posMmk = threadIdx.x + j*blockDim.x; 205 | int posIn = posMbarIn + posMmkIn[j]; 206 | if (posMmk < volMmk) shBuffer[posMmk] = dataIn[posIn]; 207 | } 208 | 209 | __syncthreads(); 210 | 211 | // Write to global memory 212 | #pragma unroll 213 | for (int j=0;j < numRegStorage;j++) { 214 | int posMmk = threadIdx.x + j*blockDim.x; 215 | int posOut = posMbarOut + posMmkOut[j]; 216 | if (posMmk < volMmk) dataOut[posOut] = shBuffer[posSh[j]]; 217 | } 218 | 219 | 220 | } 221 | 222 | } 223 | 224 | // 225 | // Packed method with a split rank 226 | // 227 | // dim nthread(((volMmkWithSplit - 1)/(prop.warpSize*lc.numRegStorage) + 1)*prop.warpSize, 1, 1) 228 | // dim nblock(ts.numSplit, min(256, max(1, ts.volMbar)), 1) 229 | // 230 | template 231 | __global__ void 
transposePackedSplit( 232 | const int splitDim, const int volMmkUnsplit, const int volMbar, 233 | const int sizeMmk, const int sizeMbar, 234 | const int cMmSplit, const int cMkSplit, 235 | const TensorConvInOut* RESTRICT glMmk, 236 | const TensorConvInOut* RESTRICT glMbar, 237 | const TensorConv* RESTRICT glMsh, 238 | const T* RESTRICT dataIn, T* RESTRICT dataOut) { 239 | 240 | // Shared memory. max(volSplit)*volMmkUnsplit T elements 241 | extern __shared__ char shBuffer_char[]; 242 | T* shBuffer = (T *)shBuffer_char; 243 | 244 | const int warpLane = threadIdx.x & (warpSize - 1); 245 | 246 | // const int plusone = (blockIdx.x < (splitDim % gridDim.x)); 247 | const int p0 = blockIdx.x*splitDim/gridDim.x; 248 | const int volSplit = (blockIdx.x + 1)*splitDim/gridDim.x - p0; 249 | const int plusone = volSplit - splitDim/gridDim.x; 250 | 251 | TensorConvInOut Mmk; 252 | Mmk.c_in = 1; 253 | Mmk.d_in = 1; 254 | Mmk.c_out = 1; 255 | Mmk.d_out = 1; 256 | if (warpLane < sizeMmk) { 257 | Mmk = glMmk[warpLane + plusone*sizeMmk]; 258 | } 259 | TensorConv Msh; 260 | Msh.c = 1; 261 | Msh.d = 1; 262 | if (warpLane < sizeMmk) { 263 | Msh = glMsh[warpLane + plusone*sizeMmk]; 264 | } 265 | 266 | // gridDim.x = number of splits 267 | // blockIdx.x = {0 ... gridDim.x - 1} is the split-index 268 | // Volume of this split 269 | // const int volSplit = (splitDim/gridDim.x) + plusone; 270 | // Start position in this split 271 | // const int p0 = (splitDim/gridDim.x)*blockIdx.x + min(blockIdx.x, (splitDim % gridDim.x)); 272 | const int posMmkIn0 = p0*cMmSplit; 273 | const int posMmkOut0 = p0*cMkSplit; 274 | // Volume of split Mmk 275 | const int volMmkSplit = volSplit*volMmkUnsplit; 276 | 277 | // Pre-compute tensor positions in Mmk 278 | // 3*numRegStorage registers 279 | int posMmkIn[numRegStorage]; 280 | int posMmkOut[numRegStorage]; 281 | int posSh[numRegStorage]; 282 | #pragma unroll 283 | for (int j=0;j < numRegStorage;j++) { 284 | posMmkIn[j] = posMmkIn0; 285 | posMmkOut[j] = posMmkOut0; 286 | posSh[j] = 0; 287 | } 288 | for (int i=0;i < sizeMmk;i++) { 289 | #pragma unroll 290 | for (int j=0;j < numRegStorage;j++) { 291 | int t = threadIdx.x + j*blockDim.x; 292 | posMmkIn[j] += ((t/__shfl_sync(FULL_MASK, Mmk.c_in,i)) % __shfl_sync(FULL_MASK, Mmk.d_in,i))*__shfl_sync(FULL_MASK, Mmk.ct_in,i); 293 | posMmkOut[j] += ((t/__shfl_sync(FULL_MASK, Mmk.c_out,i)) % __shfl_sync(FULL_MASK, Mmk.d_out,i))*__shfl_sync(FULL_MASK, Mmk.ct_out,i); 294 | posSh[j] += ((t/__shfl_sync(FULL_MASK, Msh.c,i)) % __shfl_sync(FULL_MASK, Msh.d,i))*__shfl_sync(FULL_MASK, Msh.ct,i); 295 | } 296 | } 297 | 298 | TensorConvInOut Mbar; 299 | Mbar.c_in = 1; 300 | Mbar.d_in = 1; 301 | Mbar.c_out = 1; 302 | Mbar.d_out = 1; 303 | if (warpLane < sizeMbar) { 304 | Mbar = glMbar[warpLane]; 305 | } 306 | 307 | const int posMbar0 = blockIdx.y*volMbar/gridDim.y; 308 | const int posMbar1 = (blockIdx.y + 1)*volMbar/gridDim.y; 309 | for (int posMbar=posMbar0;posMbar < posMbar1;posMbar++) 310 | // for (int posMbar=blockIdx.y;posMbar < volMbar;posMbar+=gridDim.y) 311 | { 312 | 313 | int posMbarOut = ((posMbar/Mbar.c_out) % Mbar.d_out)*Mbar.ct_out; 314 | #pragma unroll 315 | for (int i=16;i >= 1;i/=2) { 316 | posMbarOut += __shfl_xor_sync(FULL_MASK, posMbarOut, i); 317 | } 318 | 319 | int posMbarIn = ((posMbar/Mbar.c_in) % Mbar.d_in)*Mbar.ct_in; 320 | #pragma unroll 321 | for (int i=16;i >= 1;i/=2) { 322 | posMbarIn += __shfl_xor_sync(FULL_MASK, posMbarIn, i); 323 | } 324 | 325 | // Read from global memory 326 | __syncthreads(); 327 | 328 | #pragma unroll 329 | 
for (int j=0;j < numRegStorage;j++) { 330 | int posMmk = threadIdx.x + j*blockDim.x; 331 | int posIn = posMbarIn + posMmkIn[j]; 332 | if (posMmk < volMmkSplit) shBuffer[posMmk] = dataIn[posIn]; 333 | } 334 | 335 | // Write to global memory 336 | __syncthreads(); 337 | 338 | #pragma unroll 339 | for (int j=0;j < numRegStorage;j++) { 340 | int posMmk = threadIdx.x + j*blockDim.x; 341 | int posOut = posMbarOut + posMmkOut[j]; 342 | if (posMmk < volMmkSplit) dataOut[posOut] = shBuffer[posSh[j]]; 343 | } 344 | 345 | } 346 | 347 | } 348 | 349 | #if 1 350 | // 351 | // Transpose when the lead dimension is the same, e.g. (1, 2, 3) -> (1, 3, 2) 352 | // 353 | // dim3 numthread(TILEDIM, TILEROWS, 1); 354 | // dim3 numblock( ((plan.volMm-1)/TILEDIM+1)*((plan.volMkBar-1)/TILEDIM+1), 1, plan.volMbar); 355 | // 356 | template 357 | __global__ void transposeTiledCopy( 358 | const int numMm, const int volMbar, const int sizeMbar, 359 | const int cuDimMk, const int cuDimMm, 360 | const int2 tiledVol, 361 | const TensorConvInOut* RESTRICT gl_Mbar, 362 | const T* RESTRICT dataIn, T* RESTRICT dataOut) { 363 | 364 | const int warpLane = threadIdx.x & (warpSize - 1); 365 | TensorConvInOut Mbar; 366 | Mbar.c_in = 1; 367 | Mbar.d_in = 1; 368 | Mbar.c_out = 1; 369 | Mbar.d_out = 1; 370 | if (warpLane < sizeMbar) { 371 | Mbar = gl_Mbar[warpLane]; 372 | } 373 | 374 | const int bx = (blockIdx.x % numMm)*TILEDIM; 375 | const int by = (blockIdx.x / numMm)*TILEDIM; 376 | 377 | const int x = bx + threadIdx.x; 378 | const int y = by + threadIdx.y; 379 | 380 | const unsigned int mask = __ballot_sync(FULL_MASK, (y + warpLane < tiledVol.y))*(x < tiledVol.x); 381 | 382 | const int posMinorIn = x + y*cuDimMk; 383 | const int posMinorOut = x + y*cuDimMm; 384 | const int posInAdd = TILEROWS*cuDimMk; 385 | const int posOutAdd = TILEROWS*cuDimMm; 386 | 387 | for (int posMbar=blockIdx.z;posMbar < volMbar;posMbar += gridDim.z) 388 | { 389 | 390 | // Compute global memory positions 391 | int posMajorIn = ((posMbar/Mbar.c_in) % Mbar.d_in)*Mbar.ct_in; 392 | int posMajorOut = ((posMbar/Mbar.c_out) % Mbar.d_out)*Mbar.ct_out; 393 | #pragma unroll 394 | for (int i=16;i >= 1;i/=2) { 395 | posMajorIn += __shfl_xor_sync(FULL_MASK, posMajorIn, i); 396 | posMajorOut += __shfl_xor_sync(FULL_MASK, posMajorOut, i); 397 | } 398 | int posIn = posMajorIn + posMinorIn; 399 | int posOut = posMajorOut + posMinorOut; 400 | 401 | // Variables where values are stored 402 | T val[TILEDIM/TILEROWS]; 403 | 404 | // Read global memory 405 | #pragma unroll 406 | for (int j=0;j < TILEDIM;j += TILEROWS) { 407 | // if ((x < tiledVol.x) && (y + j < tiledVol.y)) { 408 | if ((mask & (1 << j)) != 0) { 409 | val[j/TILEROWS] = dataIn[posIn]; 410 | } 411 | posIn += posInAdd; 412 | } 413 | 414 | // Write global memory 415 | #pragma unroll 416 | for (int j=0;j < TILEDIM;j += TILEROWS) { 417 | // if ((x < tiledVol.x) && (y + j < tiledVol.y)) { 418 | if ((mask & (1 << j)) != 0) { 419 | dataOut[posOut] = val[j/TILEROWS]; 420 | } 421 | posOut += posOutAdd; 422 | } 423 | 424 | } 425 | 426 | } 427 | #else 428 | 429 | // 430 | // Returns scalar tensor position. Each lane has the same p 431 | // NOTE: c and d on inactive warps must be 1 !! 
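// Each of the first 'rank' lanes holds the (c, d, ct) entries of one tensor rank; every lane
// computes its own term ((p/c) % d)*ct and the __shfl_xor_sync butterfly below sums the
// per-lane terms, so all lanes of the warp end up with the same total position. Lanes beyond
// 'rank' must carry c = d = 1 so that their term evaluates to zero (hence the NOTE above).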
432 | // 433 | __device__ __forceinline__ 434 | int tensorPos( 435 | const int p, const int rank, const int c, const int d, const int ct, 436 | const int numLane=warpSize 437 | ) { 438 | 439 | int r = ((p/c) % d)*ct; 440 | #pragma unroll 441 | for (int i=numLane/2;i >= 1;i/=2) { 442 | r += __shfl_xor_sync(FULL_MASK, r, i); 443 | } 444 | return r; 445 | 446 | } 447 | 448 | // 449 | // Transpose when the lead dimension is the same, e.g. (1, 2, 3) -> (1, 3, 2) 450 | // 451 | // dim3 numthread(TILEDIM, TILEROWS, 1); 452 | // dim3 numblock( ((plan.volMm-1)/TILEDIM+1)*((plan.volMkBar-1)/TILEDIM+1), 1, plan.volMbar); 453 | // 454 | template 455 | __global__ void transposeTiledCopy( 456 | const int numMm, const int volMbar, const int sizeMbar, 457 | const int cuDimMk, const int cuDimMm, 458 | const int2 tiledVol, 459 | const TensorConvInOut* RESTRICT gl_Mbar, 460 | const T* RESTRICT dataIn, T* RESTRICT dataOut) { 461 | 462 | const int warpLane = threadIdx.x & (warpSize - 1); 463 | TensorConvInOut Mbar; 464 | Mbar.c_in = 1; 465 | Mbar.d_in = 1; 466 | Mbar.c_out = 1; 467 | Mbar.d_out = 1; 468 | if (warpLane < sizeMbar) { 469 | Mbar = gl_Mbar[warpLane]; 470 | } 471 | 472 | const int bx = (blockIdx.x % numMm)*TILEDIM; 473 | const int by = (blockIdx.x / numMm)*TILEDIM; 474 | 475 | const int x = bx + threadIdx.x; 476 | const int y = by + threadIdx.y; 477 | 478 | for (int posMbar=blockIdx.z;posMbar < volMbar;posMbar += gridDim.z) 479 | { 480 | 481 | // Variables where values are stored 482 | T val[TILEDIM/TILEROWS]; 483 | 484 | // Read global memory 485 | { 486 | int pos0 = tensorPos(posMbar, sizeMbar, Mbar.c_in, Mbar.d_in, Mbar.ct_in); 487 | pos0 += x + y*cuDimMk; 488 | 489 | #pragma unroll 490 | for (int j=0;j < TILEDIM;j += TILEROWS) { 491 | int pos = pos0 + j*cuDimMk; 492 | if ((x < tiledVol.x) && (y + j < tiledVol.y)) { 493 | val[j/TILEROWS] = dataIn[pos]; 494 | } 495 | } 496 | } 497 | 498 | // Write global memory 499 | { 500 | int pos0 = tensorPos(posMbar, sizeMbar, Mbar.c_out, Mbar.d_out, Mbar.ct_out); 501 | pos0 += x + y*cuDimMm; 502 | 503 | #pragma unroll 504 | for (int j=0;j < TILEDIM;j += TILEROWS) { 505 | int pos = pos0 + j*cuDimMm; 506 | if ((x < tiledVol.x) && (y + j < tiledVol.y)) { 507 | dataOut[pos] = val[j/TILEROWS]; 508 | } 509 | } 510 | } 511 | 512 | } 513 | 514 | } 515 | #endif 516 | 517 | //###################################################################################### 518 | //###################################################################################### 519 | //###################################################################################### 520 | 521 | // 522 | // Sets shared memory bank configuration for all kernels. Needs to be called once per device. 
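// (Each kernel instantiation below is registered with the shared-memory bank width that matches
// its element size: 4-byte banks for the float kernels and 8-byte banks for the double kernels,
// so consecutive tile elements fall into distinct banks.)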
523 | //
524 | void cuttKernelSetSharedMemConfig() {
525 | #define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePacked<float, NREG>, cudaSharedMemBankSizeFourByte ))
526 | #include "calls.h"
527 | #undef CALL
528 | 
529 | #define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePacked<double, NREG>, cudaSharedMemBankSizeEightByte ))
530 | #include "calls.h"
531 | #undef CALL
532 | 
533 | #define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePackedSplit<float, NREG>, cudaSharedMemBankSizeFourByte ))
534 | #include "calls.h"
535 | #undef CALL
536 | 
537 | #define CALL(NREG) cudaCheck(cudaFuncSetSharedMemConfig(transposePackedSplit<double, NREG>, cudaSharedMemBankSizeEightByte ))
538 | #include "calls.h"
539 | #undef CALL
540 | 
541 | cudaCheck(cudaFuncSetSharedMemConfig(transposeTiled<float>, cudaSharedMemBankSizeFourByte));
542 | cudaCheck(cudaFuncSetSharedMemConfig(transposeTiledCopy<float>, cudaSharedMemBankSizeFourByte));
543 | 
544 | cudaCheck(cudaFuncSetSharedMemConfig(transposeTiled<double>, cudaSharedMemBankSizeEightByte));
545 | cudaCheck(cudaFuncSetSharedMemConfig(transposeTiledCopy<double>, cudaSharedMemBankSizeEightByte));
546 | 
547 | }
548 | 
549 | // Caches for PackedSplit kernels. One cache for all devices
550 | // NOTE: Not thread safe
551 | const int CACHE_SIZE = 100000;
552 | const int MAX_NUMWARP = (1024/32);
553 | const int MAX_NUMTYPE = 2;
554 | static int numDevices = -1;
555 | LRUCache<unsigned long long int, int> nabCache(CACHE_SIZE, -1);
556 | 
557 | //
558 | // Returns the maximum number of active blocks per SM
559 | //
560 | int getNumActiveBlock(const int method, const int sizeofType, const LaunchConfig& lc,
561 | const int deviceID, const cudaDeviceProp& prop) {
562 | 
563 | int numActiveBlock;
564 | int numthread = lc.numthread.x * lc.numthread.y * lc.numthread.z;
565 | switch(method) {
566 | case Trivial:
567 | {
568 | // This value does not matter, but should be > 0
569 | numActiveBlock = 1;
570 | }
571 | break;
572 | 
573 | case Packed:
574 | {
575 | #define CALL0(TYPE, NREG) \
576 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock, \
577 | transposePacked<TYPE, NREG>, numthread, lc.shmemsize)
578 | switch(lc.numRegStorage) {
579 | #define CALL(ICASE) case ICASE: if (sizeofType == 4) CALL0(float, ICASE); if (sizeofType == 8) CALL0(double, ICASE); break
580 | #include "calls.h"
581 | }
582 | #undef CALL
583 | #undef CALL0
584 | }
585 | break;
586 | 
587 | case PackedSplit:
588 | {
589 | // Allocate cache structure if needed
590 | if (numDevices == -1) {
591 | cudaCheck(cudaGetDeviceCount(&numDevices));
592 | }
593 | // Build unique key for cache
594 | int key_warp = (numthread/prop.warpSize - 1);
595 | if (key_warp >= MAX_NUMWARP) {
596 | printf("getNumActiveBlock maximum number of warps exceeded\n");
597 | exit(1);
598 | }
599 | int key_reg = (lc.numRegStorage - 1);
600 | int key_type = (sizeofType == 4);
601 | unsigned long long int key =
602 | (unsigned long long int)(lc.shmemsize/sizeofType)*MAX_NUMWARP*MAX_REG_STORAGE*MAX_NUMTYPE*numDevices +
603 | (unsigned long long int)deviceID*MAX_NUMWARP*MAX_REG_STORAGE*MAX_NUMTYPE +
604 | (unsigned long long int)key_type*MAX_NUMWARP*MAX_REG_STORAGE +
605 | (unsigned long long int)key_reg*MAX_NUMWARP +
606 | (unsigned long long int)key_warp;
607 | 
608 | numActiveBlock = nabCache.get(key);
609 | if (numActiveBlock == -1) {
610 | // key not found in cache, determine value and add it to cache
611 | #define CALL0(TYPE, NREG) \
612 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock, \
613 | transposePackedSplit<TYPE, NREG>, numthread, lc.shmemsize)
614 | switch(lc.numRegStorage) {
615 | #define CALL(ICASE) case ICASE: if (sizeofType == 4) CALL0(float, ICASE); if (sizeofType == 8) CALL0(double, ICASE); break
616 | #include "calls.h"
617 | }
618 | #undef CALL
619 | #undef CALL0
620 | nabCache.set(key, numActiveBlock);
621 | }
622 | }
623 | break;
624 | 
625 | case Tiled:
626 | {
627 | if (sizeofType == 4) {
628 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock,
629 | transposeTiled<float>, numthread, lc.shmemsize);
630 | } else {
631 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock,
632 | transposeTiled<double>, numthread, lc.shmemsize);
633 | }
634 | }
635 | break;
636 | 
637 | case TiledCopy:
638 | {
639 | if (sizeofType == 4) {
640 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock,
641 | transposeTiledCopy<float>, numthread, lc.shmemsize);
642 | } else {
643 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlock,
644 | transposeTiledCopy<double>, numthread, lc.shmemsize);
645 | }
646 | }
647 | break;
648 | }
649 | 
650 | return numActiveBlock;
651 | }
652 | 
653 | //
654 | // Sets up kernel launch configuration
655 | //
656 | // Returns the number of active blocks per SM that can be achieved on the Packed kernel
657 | // NOTE: Returns 0 when kernel execution is not possible
658 | //
659 | // Sets:
660 | // lc.numthread
661 | // lc.numblock
662 | // lc.shmemsize
663 | // lc.numRegStorage (for Packed method)
664 | //
665 | int cuttKernelLaunchConfiguration(const int sizeofType, const TensorSplit& ts,
666 | const int deviceID, const cudaDeviceProp& prop, LaunchConfig& lc) {
667 | 
668 | // Return value of numActiveBlock
669 | int numActiveBlockReturn = -1;
670 | 
671 | switch(ts.method) {
672 | case Trivial:
673 | {
674 | // These values don't matter
675 | lc.numthread.x = 1;
676 | lc.numthread.y = 1;
677 | lc.numthread.z = 1;
678 | lc.numblock.x = 1;
679 | lc.numblock.y = 1;
680 | lc.numblock.z = 1;
681 | lc.numblock.z = 1;
682 | lc.numblock.z = 1;
683 | lc.shmemsize = 0;
684 | lc.numRegStorage = 0;
685 | }
686 | break;
687 | 
688 | case Packed:
689 | {
690 | // Amount of shared memory required
691 | lc.shmemsize = ts.shmemAlloc(sizeofType); //ts.volMmk*sizeofType;
692 | 
693 | // Check that we're not using too much shared memory per block
694 | if (lc.shmemsize > prop.sharedMemPerBlock) {
695 | // printf("lc.shmemsize %d prop.sharedMemPerBlock %d\n", lc.shmemsize, prop.sharedMemPerBlock);
696 | return 0;
697 | }
698 | 
699 | // Min and max number of threads we can use
700 | int minNumthread = ((ts.volMmk - 1)/(prop.warpSize*MAX_REG_STORAGE) + 1)*prop.warpSize;
701 | int maxNumthread = ((ts.volMmk - 1)/(prop.warpSize) + 1)*prop.warpSize;
702 | if (minNumthread > prop.maxThreadsPerBlock) return 0;
703 | maxNumthread = min(prop.maxThreadsPerBlock, maxNumthread);
704 | // printf("minNumthread %d maxNumthread %d\n", minNumthread, maxNumthread);
705 | 
706 | // Min and max number of register storage we can use
707 | int minNumRegStorage = (ts.volMmk - 1)/maxNumthread + 1;
708 | int maxNumRegStorage = (ts.volMmk - 1)/minNumthread + 1;
709 | // printf("minNumRegStorage %d maxNumRegStorage %d\n", minNumRegStorage, maxNumRegStorage);
710 | 
711 | int bestVal = 0;
712 | int bestNumRegStorage = 0;
713 | int bestNumActiveBlock = 0;
714 | 
715 | lc.numthread.y = 1;
716 | lc.numthread.z = 1;
717 | lc.numblock.x = max(1, ts.volMbar);
718 | lc.numblock.x = min(prop.multiProcessorCount*18, lc.numblock.x);
719 | lc.numblock.y = 1;
720 | lc.numblock.z = 1;
721 | 
722 | for (lc.numRegStorage=minNumRegStorage;lc.numRegStorage <= maxNumRegStorage;lc.numRegStorage++) {
723 | lc.numthread.x = ((ts.volMmk -
1)/(prop.warpSize*lc.numRegStorage) + 1)*prop.warpSize; 724 | 725 | int numActiveBlock = getNumActiveBlock(ts.method, sizeofType, lc, deviceID, prop); 726 | // int val = numActiveBlock*lc.numthread.x; 727 | int val = ts.volMmkUsed()*numActiveBlock; 728 | if (val > bestVal) { 729 | bestVal = val; 730 | bestNumRegStorage = lc.numRegStorage; 731 | bestNumActiveBlock = numActiveBlock; 732 | } 733 | } 734 | 735 | if (bestNumRegStorage == 0) return 0; 736 | 737 | lc.numRegStorage = bestNumRegStorage; 738 | lc.numthread.x = ((ts.volMmk - 1)/(prop.warpSize*lc.numRegStorage) + 1)*prop.warpSize; 739 | numActiveBlockReturn = bestNumActiveBlock; 740 | } 741 | break; 742 | 743 | case PackedSplit: 744 | { 745 | // Amount of shared memory required 746 | lc.shmemsize = ts.shmemAlloc(sizeofType); 747 | 748 | // Check that we're not using too much shared memory per block 749 | if (lc.shmemsize > prop.sharedMemPerBlock) { 750 | // printf("lc.shmemsize %d prop.sharedMemPerBlock %d\n", lc.shmemsize, prop.sharedMemPerBlock); 751 | return 0; 752 | } 753 | 754 | int volMmkWithSplit = (ts.splitDim/ts.numSplit + ((ts.splitDim % ts.numSplit) > 0))*ts.volMmkUnsplit; 755 | 756 | // Min and max number of threads we can use 757 | int minNumthread = ((volMmkWithSplit - 1)/(prop.warpSize*MAX_REG_STORAGE) + 1)*prop.warpSize; 758 | int maxNumthread = ((volMmkWithSplit - 1)/(prop.warpSize) + 1)*prop.warpSize; 759 | if (minNumthread > prop.maxThreadsPerBlock) return 0; 760 | maxNumthread = min(prop.maxThreadsPerBlock, maxNumthread); 761 | // printf("minNumthread %d maxNumthread %d\n", minNumthread, maxNumthread); 762 | 763 | // Min and max number of register storage we can use 764 | int minNumRegStorage = (volMmkWithSplit - 1)/maxNumthread + 1; 765 | int maxNumRegStorage = (volMmkWithSplit - 1)/minNumthread + 1; 766 | // printf("minNumRegStorage %d maxNumRegStorage %d\n", minNumRegStorage, maxNumRegStorage); 767 | 768 | int bestVal = 0; 769 | int bestNumRegStorage = 0; 770 | int bestNumActiveBlock = 0; 771 | 772 | lc.numthread.y = 1; 773 | lc.numthread.z = 1; 774 | lc.numblock.x = ts.numSplit; 775 | lc.numblock.y = max(1, min((prop.multiProcessorCount*18)/lc.numblock.x, ts.volMbar)); 776 | lc.numblock.z = 1; 777 | 778 | for (lc.numRegStorage=minNumRegStorage;lc.numRegStorage <= maxNumRegStorage;lc.numRegStorage++) { 779 | lc.numthread.x = ((volMmkWithSplit - 1)/(prop.warpSize*lc.numRegStorage) + 1)*prop.warpSize; 780 | 781 | int numActiveBlock = getNumActiveBlock(ts.method, sizeofType, lc, deviceID, prop); 782 | // int val = numActiveBlock*lc.numthread.x*lc.numRegStorage; 783 | int val = ts.volMmkUsed()*numActiveBlock; 784 | if (val > bestVal) { 785 | bestVal = val; 786 | bestNumRegStorage = lc.numRegStorage; 787 | bestNumActiveBlock = numActiveBlock; 788 | } 789 | } 790 | 791 | if (bestNumRegStorage == 0) return 0; 792 | 793 | lc.numRegStorage = bestNumRegStorage; 794 | lc.numthread.x = ((volMmkWithSplit - 1)/(prop.warpSize*lc.numRegStorage) + 1)*prop.warpSize; 795 | numActiveBlockReturn = bestNumActiveBlock; 796 | } 797 | break; 798 | 799 | case Tiled: 800 | { 801 | lc.numthread.x = TILEDIM; 802 | lc.numthread.y = TILEROWS; 803 | lc.numthread.z = 1; 804 | lc.numblock.x = ((ts.volMm - 1)/TILEDIM + 1)*((ts.volMk - 1)/TILEDIM + 1); 805 | lc.numblock.y = 1; 806 | lc.numblock.z = max(1, min((prop.multiProcessorCount*8)/(lc.numblock.x*lc.numblock.y), ts.volMbar)); 807 | lc.shmemsize = 0; 808 | lc.numRegStorage = 0; 809 | } 810 | break; 811 | 812 | case TiledCopy: 813 | { 814 | lc.numthread.x = TILEDIM; 815 | lc.numthread.y = 
TILEROWS;
816 | lc.numthread.z = 1;
817 | lc.numblock.x = ((ts.volMm - 1)/TILEDIM + 1)*((ts.volMkBar - 1)/TILEDIM + 1);
818 | lc.numblock.y = 1;
819 | lc.numblock.z = ts.volMbar;
820 | lc.numblock.z = min((prop.multiProcessorCount*8)/(lc.numblock.x*lc.numblock.y), lc.numblock.z);
821 | lc.numblock.z = max(1, lc.numblock.z);
822 | lc.shmemsize = 0;
823 | lc.numRegStorage = 0;
824 | }
825 | break;
826 | }
827 | 
828 | if (lc.numblock.x > prop.maxGridSize[0] ||
829 | lc.numblock.y > prop.maxGridSize[1] ||
830 | lc.numblock.z > prop.maxGridSize[2]) return 0;
831 | 
832 | // Return the number of active blocks with these settings
833 | if (numActiveBlockReturn == -1) {
834 | // Not set, get it
835 | numActiveBlockReturn = getNumActiveBlock(ts.method, sizeofType, lc, deviceID, prop);
836 | }
837 | return numActiveBlockReturn;
838 | }
839 | 
840 | bool cuttKernel(cuttPlan_t& plan, void* dataIn, void* dataOut) {
841 | LaunchConfig& lc = plan.launchConfig;
842 | TensorSplit& ts = plan.tensorSplit;
843 | 
844 | switch(ts.method) {
845 | case Trivial:
846 | {
847 | cudaCheck(cudaMemcpyAsync(dataOut, dataIn, ts.volMmk*ts.volMbar*plan.sizeofType,
848 | cudaMemcpyDeviceToDevice, plan.stream));
849 | }
850 | break;
851 | 
852 | case Packed:
853 | {
854 | switch(lc.numRegStorage) {
855 | #define CALL0(TYPE, NREG) \
856 | transposePacked<TYPE, NREG> <<< lc.numblock, lc.numthread, lc.shmemsize, plan.stream >>> \
857 | (ts.volMmk, ts.volMbar, ts.sizeMmk, ts.sizeMbar, \
858 | plan.Mmk, plan.Mbar, plan.Msh, (TYPE *)dataIn, (TYPE *)dataOut)
859 | #define CALL(ICASE) case ICASE: if (plan.sizeofType == 4) CALL0(float, ICASE); if (plan.sizeofType == 8) CALL0(double, ICASE); break
860 | #include "calls.h"
861 | default:
862 | printf("cuttKernel no template implemented for numRegStorage %d\n", lc.numRegStorage);
863 | return false;
864 | #undef CALL
865 | #undef CALL0
866 | }
867 | 
868 | }
869 | break;
870 | 
871 | case PackedSplit:
872 | {
873 | switch(lc.numRegStorage) {
874 | #define CALL0(TYPE, NREG) \
875 | transposePackedSplit<TYPE, NREG> <<< lc.numblock, lc.numthread, lc.shmemsize, plan.stream >>> \
876 | (ts.splitDim, ts.volMmkUnsplit, ts.volMbar, ts.sizeMmk, ts.sizeMbar, \
877 | plan.cuDimMm, plan.cuDimMk, plan.Mmk, plan.Mbar, plan.Msh, (TYPE *)dataIn, (TYPE *)dataOut);
878 | #define CALL(ICASE) case ICASE: if (plan.sizeofType == 4) CALL0(float, ICASE); if (plan.sizeofType == 8) CALL0(double, ICASE); break
879 | #include "calls.h"
880 | default:
881 | printf("cuttKernel no template implemented for numRegStorage %d\n", lc.numRegStorage);
882 | return false;
883 | #undef CALL
884 | #undef CALL0
885 | }
886 | 
887 | }
888 | break;
889 | 
890 | case Tiled:
891 | {
892 | #define CALL(TYPE) \
893 | transposeTiled<TYPE> <<< lc.numblock, lc.numthread, 0, plan.stream >>> \
894 | (((ts.volMm - 1)/TILEDIM + 1), ts.volMbar, ts.sizeMbar, plan.tiledVol, plan.cuDimMk, plan.cuDimMm, \
895 | plan.Mbar, (TYPE *)dataIn, (TYPE *)dataOut)
896 | if (plan.sizeofType == 4) CALL(float);
897 | if (plan.sizeofType == 8) CALL(double);
898 | #undef CALL
899 | }
900 | break;
901 | 
902 | case TiledCopy:
903 | {
904 | #define CALL(TYPE) \
905 | transposeTiledCopy<TYPE> <<< lc.numblock, lc.numthread, 0, plan.stream >>> \
906 | (((ts.volMm - 1)/TILEDIM + 1), ts.volMbar, ts.sizeMbar, plan.cuDimMk, plan.cuDimMm, plan.tiledVol, \
907 | plan.Mbar, (TYPE *)dataIn, (TYPE *)dataOut)
908 | if (plan.sizeofType == 4) CALL(float);
909 | if (plan.sizeofType == 8) CALL(double);
910 | #undef CALL
911 | }
912 | break;
913 | 
914 | }
915 | 
916 | cudaCheck(cudaGetLastError());
917 | return true;
918 | }
919 | 
--------------------------------------------------------------------------------
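The benchmark driver above (bench_tensor in /src/cutt_bench.cpp) already exercises the whole public API: plan with cuttPlan or cuttPlanMeasure, run with cuttExecute, release with cuttDestroy. A minimal standalone sketch of that same call sequence follows; the rank-3 extents, the permutation, the function name and the bare-bones error handling are illustrative only, and dataIn/dataOut are assumed to be device buffers large enough for the tensor.

#include <cuda_runtime.h>
#include <cutt.h>

// Transpose a rank-3 tensor from dataIn to dataOut on the default stream.
// dataIn and dataOut must be device allocations of at least 256*192*64 doubles.
bool transposeExample(double* dataIn, double* dataOut) {
  int dim[3]         = {256, 192, 64};   // input extents (illustrative)
  int permutation[3] = {2, 0, 1};        // output rank r takes input rank permutation[r]

  cuttHandle plan;
  // cuttPlan picks the kernel variant from heuristics; cuttPlanMeasure would
  // instead time the candidate plans on the actual buffers (see bench_tensor above).
  if (cuttPlan(&plan, 3, dim, permutation, sizeof(double), 0) != CUTT_SUCCESS) return false;

  if (cuttExecute(plan, dataIn, dataOut) != CUTT_SUCCESS) return false;  // asynchronous launch
  cudaDeviceSynchronize();  // wait for the transpose to finish

  cuttDestroy(plan);
  return true;
}

Note that cuttExecute only enqueues the kernel on the stream supplied to cuttPlan, so the output must not be read back until the stream has been synchronized.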