├── examples
│   ├── Makefile
│   └── demo.cu
├── LICENSE
├── README.md
└── include
    └── cupti_profiler.h


/examples/Makefile:
--------------------------------------------------------------------------------
# Builds the demo program against the header-only profiler in ../include.
# CUPTI_PATH may be overridden from the environment or the make command line
# when CUDA is not installed under /usr/local/cuda.
all: demo

CUPTI_PATH ?= /usr/local/cuda/extras/CUPTI
INCLUDES = -I ../include -I $(CUPTI_PATH)/include
CXXARGS = -std=c++11 -g
CXXARGS += -Xcompiler -DNDEBUG
LIBS = -lcuda -L$(CUPTI_PATH)/lib64 -lcupti

# The header is listed as a prerequisite so that editing it rebuilds the demo.
demo: demo.cu ../include/cupti_profiler.h
	nvcc $(CXXARGS) $(INCLUDES) $(LIBS) demo.cu -o demo

clean:
	rm -f *.o demo

.PHONY: all clean

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Saurav Muralidharan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CUDA Profiling Library

This library provides an API for collecting CUDA profiling metrics and events
from within a CUDA application. Programmers specify what metrics and events
they want, and start the profiler before calling one or more CUDA kernels. The library
sets up the appropriate CUPTI callbacks, calculates the number of
kernel passes required, gathers values for the specified
metrics and events, and returns them to the programmer on a per-kernel basis.

**Example Usage:**

``` c++
vector<string> event_names {
  "active_warps",
  "gst_inst_32bit",
  "active_cycles"
};
vector<string> metric_names {
  "flop_count_dp",
  "flop_count_sp",
  "inst_executed"
};

cupti_profiler::profiler profiler(event_names, metric_names);

// Get #passes required to compute all metrics and events
const int passes = profiler.get_passes();

profiler.start();
for(int i=0; i<passes; ++i) {
  // Launch the kernel(s) to be profiled, once per pass
}
profiler.stop();
```

See `examples/demo.cu` for a complete program.

--------------------------------------------------------------------------------
/examples/demo.cu:
--------------------------------------------------------------------------------
#include <cstdio>
#include <iostream>
#include <string>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include <cupti_profiler.h>

// When non-zero, profile every event and metric the device reports instead of
// the short hand-picked lists in main().
#define PROFILE_ALL_EVENTS_METRICS 1

// Adds 1 to every element in [begin, begin + size).
// Expects a 1-D grid of 1-D blocks, one element per thread; threads whose
// flat index is >= size do nothing. Uses no shared memory.
template <typename T>
__global__ void kernel(T begin, int size) {
  const int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  if(thread_id < size)
    *(begin + thread_id) += 1;
}

// Same as kernel(), but adds 2.
template <typename T>
__global__ void kernel2(T begin, int size) {
  const int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  if(thread_id < size)
    *(begin + thread_id) += 2;
}

// Launches kernel() on the contents of a device container with a single block
// of 100 threads, so at most the first 100 elements are touched.
template <typename T>
void call_kernel(T& arg) {
  kernel<<<1, 100>>>(thrust::raw_pointer_cast(&arg[0]),
                     static_cast<int>(arg.size()));
  // A launch does not return a status; a bad configuration only shows up here.
  RUNTIME_API_CALL(cudaGetLastError());
}

// Launches kernel2() with a single block of 50 threads, so at most the first
// 50 elements are touched.
// NOTE(review): for the 100-element vector used in main() only half the data
// is updated -- presumably intentional to give the two kernels different
// profiles; confirm.
template <typename T>
void call_kernel2(T& arg) {
  kernel2<<<1, 50>>>(thrust::raw_pointer_cast(&arg[0]),
                     static_cast<int>(arg.size()));
  RUNTIME_API_CALL(cudaGetLastError());
}

// Profiles kernel() and kernel2() and prints the collected event and metric
// values followed by the names of the kernels that were seen.
int main() {
  using namespace std;
  //using namespace thrust;

  CUdevice device;

  DRIVER_API_CALL(cuInit(0));
  DRIVER_API_CALL(cuDeviceGet(&device, 0));

#if PROFILE_ALL_EVENTS_METRICS
  const auto event_names = cupti_profiler::available_events(device);
  const auto metric_names = cupti_profiler::available_metrics(device);
#else
  vector<string> event_names {
    "active_warps",
    "active_cycles",
  };
  vector<string> metric_names {
    "inst_per_warp",
    "branch_efficiency",
    "warp_execution_efficiency",
    "warp_nonpred_execution_efficiency",
    "inst_replay_overhead",
  };
#endif

  constexpr int N = 100;
  // NOTE(review): the element type was lost in extraction; float is assumed.
  thrust::device_vector<float> data(N, 0);

  //cupti_profiler::profiler profiler(vector<string>{}, metric_names);

  // XXX: Disabling all metrics seems to change the values
  // of some events. Not sure if this is correct behavior.
  //cupti_profiler::profiler profiler(event_names, vector<string>{});

  cupti_profiler::profiler profiler(event_names, metric_names);
  // Get #passes required to compute all metrics and events
  const int passes = profiler.get_passes();
  printf("Passes: %d\n", passes);

  profiler.start();
  // The profiler collects one pass per launch of each kernel, so every kernel
  // has to be launched `passes` times; a fixed iteration count would leave
  // passes uncollected whenever more passes are required.
  for(int i = 0; i < passes; ++i) {
    call_kernel(data);
    RUNTIME_API_CALL(cudaDeviceSynchronize());
    call_kernel2(data);
    RUNTIME_API_CALL(cudaDeviceSynchronize());
  }
  profiler.stop();

  printf("Event Trace\n");
  profiler.print_event_values(std::cout);
  printf("Metric Trace\n");
  profiler.print_metric_values(std::cout);

  const auto names = profiler.get_kernel_names();
  for(const auto& name: names) {
    printf("%s\n", name.c_str());
  }

  thrust::host_vector<float> h_data(data);

  /*printf("\n");
  for(int i = 0; i < 10; ++i) {
    printf("%.2lf ", h_data[i]);
  }*/
  printf("\n");
  return 0;
}

--------------------------------------------------------------------------------
/include/cupti_profiler.h:
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | #define DRIVER_API_CALL(apiFuncCall) \ 12 | do { \ 13 | CUresult _status = apiFuncCall; \ 14 | if (_status != CUDA_SUCCESS) { \ 15 | fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \ 16 | __FILE__, __LINE__, #apiFuncCall, _status); \ 17 | exit(-1); \ 18 | } \ 19 | } while (0) 20 | 21 | #define RUNTIME_API_CALL(apiFuncCall) \ 22 | do { \ 23 | cudaError_t _status = apiFuncCall; \ 24 | if (_status != cudaSuccess) { \ 25 | fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ 26 | __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\ 27 | exit(-1); \ 28 | } \ 29 | } while (0) 30 | 31 | #define CUPTI_CALL(call) \ 32 | do { \ 33 | CUptiResult _status = call; \ 34 | if (_status != CUPTI_SUCCESS) { \ 35 | const char *errstr; \ 36 | cuptiGetResultString(_status, &errstr); \ 37 | fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ 38 | __FILE__, __LINE__, #call, errstr); \ 39 | exit(-1); \ 40 | } \ 41 | } while (0) 42 | 43 | #ifdef DEBUG 44 | template 45 | void _LOG(const char *msg, Args&&... args) { 46 | fprintf(stderr, "[Log]: "); 47 | fprintf(stderr, msg, args...); 48 | fprintf(stderr, "\n"); 49 | } 50 | void _LOG(const char *msg) { 51 | fprintf(stderr, "[Log]: %s\n", msg); 52 | } 53 | template 54 | void _DBG(const char *msg, Args&&... args) { 55 | fprintf(stderr, msg, args...); 56 | } 57 | void _DBG(const char *msg) { 58 | fprintf(stderr, "%s", msg); 59 | } 60 | #else 61 | #define _LOG(...) 62 | #define _DBG(...) 
63 | #endif 64 | 65 | namespace cupti_profiler { 66 | static const char *dummy_kernel_name = "^^ DUMMY ^^"; 67 | 68 | namespace detail { 69 | 70 | // Pass-specific data 71 | struct pass_data_t { 72 | // the set of event groups to collect for a pass 73 | CUpti_EventGroupSet *event_groups; 74 | // the number of entries in eventIdArray and eventValueArray 75 | uint32_t num_events; 76 | // array of event ids 77 | std::vector event_ids; 78 | // array of event values 79 | std::vector event_values; 80 | }; 81 | 82 | struct kernel_data_t { 83 | typedef std::vector event_val_t; 84 | typedef std::vector metric_val_t; 85 | 86 | kernel_data_t() : m_current_pass(0) {} 87 | 88 | std::vector m_pass_data; 89 | std::string m_name; 90 | 91 | int m_metric_passes; 92 | int m_event_passes; 93 | int m_current_pass; 94 | int m_total_passes; 95 | CUdevice m_device; 96 | 97 | event_val_t m_event_values; 98 | metric_val_t m_metric_values; 99 | 100 | }; 101 | 102 | void CUPTIAPI 103 | get_value_callback(void *userdata, 104 | CUpti_CallbackDomain domain, 105 | CUpti_CallbackId cbid, 106 | const CUpti_CallbackData *cbInfo) { 107 | 108 | // This callback is enabled only for launch so we shouldn't see 109 | // anything else. 110 | if ((cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) 111 | && (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000)) { 112 | fprintf(stderr, "%s:%d: Unexpected cbid %d\n", __FILE__, __LINE__, cbid); 113 | exit(-1); 114 | } 115 | 116 | const char *current_kernel_name = cbInfo->symbolName; 117 | 118 | // Skip execution if kernel name is NULL string 119 | // TODO: Make sure this is fine 120 | if(!current_kernel_name) { 121 | _LOG("Empty kernel name string. 
Skipping..."); 122 | return; 123 | } 124 | 125 | std::map *kernel_data = 126 | (std::map *)userdata; 127 | 128 | if (cbInfo->callbackSite == CUPTI_API_ENTER) { 129 | // If this is kernel name hasn't been seen before 130 | if(kernel_data->count(current_kernel_name) == 0) { 131 | _LOG("New kernel encountered: %s", current_kernel_name); 132 | 133 | detail::kernel_data_t dummy = 134 | (*kernel_data)[dummy_kernel_name]; 135 | detail::kernel_data_t k_data = dummy; 136 | 137 | k_data.m_name = current_kernel_name; 138 | 139 | auto& pass_data = k_data.m_pass_data; 140 | 141 | CUPTI_CALL(cuptiSetEventCollectionMode(cbInfo->context, 142 | CUPTI_EVENT_COLLECTION_MODE_KERNEL)); 143 | 144 | for (int i = 0; i < pass_data[0].event_groups->numEventGroups; i++) { 145 | _LOG(" Enabling group %d", i); 146 | uint32_t all = 1; 147 | CUPTI_CALL(cuptiEventGroupSetAttribute( 148 | pass_data[0].event_groups->eventGroups[i], 149 | CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES, 150 | sizeof(all), &all)); 151 | CUPTI_CALL(cuptiEventGroupEnable( 152 | pass_data[0].event_groups->eventGroups[i])); 153 | 154 | (*kernel_data)[current_kernel_name] = k_data; 155 | } 156 | } else { 157 | auto& current_kernel = (*kernel_data)[current_kernel_name]; 158 | auto const& pass_data = current_kernel.m_pass_data; 159 | 160 | int current_pass = current_kernel.m_current_pass; 161 | if(current_pass >= current_kernel.m_total_passes) 162 | return; 163 | 164 | _LOG("Current pass for %s: %d", current_kernel_name, current_pass); 165 | 166 | CUPTI_CALL(cuptiSetEventCollectionMode(cbInfo->context, 167 | CUPTI_EVENT_COLLECTION_MODE_KERNEL)); 168 | 169 | for (int i = 0; 170 | i < pass_data[current_pass].event_groups->numEventGroups; 171 | i++) { 172 | _LOG(" Enabling group %d", i); 173 | uint32_t all = 1; 174 | CUPTI_CALL(cuptiEventGroupSetAttribute( 175 | pass_data[current_pass].event_groups->eventGroups[i], 176 | CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES, 177 | sizeof(all), &all)); 178 | 
CUPTI_CALL(cuptiEventGroupEnable( 179 | pass_data[current_pass].event_groups->eventGroups[i])); 180 | 181 | } 182 | } 183 | } else if(cbInfo->callbackSite == CUPTI_API_EXIT) { 184 | auto& current_kernel = (*kernel_data)[current_kernel_name]; 185 | int current_pass = current_kernel.m_current_pass; 186 | 187 | if(current_pass >= current_kernel.m_total_passes) 188 | return; 189 | 190 | auto& pass_data = 191 | current_kernel.m_pass_data[current_pass]; 192 | 193 | for (int i = 0; i < pass_data.event_groups->numEventGroups; i++) { 194 | CUpti_EventGroup group = pass_data.event_groups->eventGroups[i]; 195 | CUpti_EventDomainID group_domain; 196 | uint32_t numEvents, numInstances, numTotalInstances; 197 | CUpti_EventID *eventIds; 198 | size_t groupDomainSize = sizeof(group_domain); 199 | size_t numEventsSize = sizeof(numEvents); 200 | size_t numInstancesSize = sizeof(numInstances); 201 | size_t numTotalInstancesSize = sizeof(numTotalInstances); 202 | uint64_t *values, normalized, sum; 203 | size_t valuesSize, eventIdsSize; 204 | 205 | CUPTI_CALL(cuptiEventGroupGetAttribute(group, 206 | CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID, 207 | &groupDomainSize, &group_domain)); 208 | CUPTI_CALL(cuptiDeviceGetEventDomainAttribute( 209 | current_kernel.m_device, group_domain, 210 | CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT, 211 | &numTotalInstancesSize, &numTotalInstances)); 212 | CUPTI_CALL(cuptiEventGroupGetAttribute(group, 213 | CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT, 214 | &numInstancesSize, &numInstances)); 215 | CUPTI_CALL(cuptiEventGroupGetAttribute(group, 216 | CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS, 217 | &numEventsSize, &numEvents)); 218 | eventIdsSize = numEvents * sizeof(CUpti_EventID); 219 | eventIds = (CUpti_EventID *)malloc(eventIdsSize); 220 | CUPTI_CALL(cuptiEventGroupGetAttribute(group, 221 | CUPTI_EVENT_GROUP_ATTR_EVENTS, 222 | &eventIdsSize, eventIds)); 223 | 224 | valuesSize = sizeof(uint64_t) * numInstances; 225 | values = (uint64_t *)malloc(valuesSize); 226 | 227 | 
for(int j = 0; j < numEvents; j++) { 228 | CUPTI_CALL(cuptiEventGroupReadEvent(group, CUPTI_EVENT_READ_FLAG_NONE, 229 | eventIds[j], &valuesSize, values)); 230 | /*if (metric_data->eventIdx >= metric_data->numEvents) { 231 | fprintf(stderr, "[error]: Too many events collected, metric expects only %d\n", 232 | (int)metric_data->numEvents); 233 | exit(-1); 234 | }*/ 235 | 236 | // sum collect event values from all instances 237 | sum = 0; 238 | for(int k = 0; k < numInstances; k++) 239 | sum += values[k]; 240 | 241 | // normalize the event value to represent the total number of 242 | // domain instances on the device 243 | normalized = (sum * numTotalInstances) / numInstances; 244 | 245 | pass_data.event_ids.push_back(eventIds[j]); 246 | pass_data.event_values.push_back(normalized); 247 | 248 | // print collected value 249 | { 250 | char eventName[128]; 251 | size_t eventNameSize = sizeof(eventName) - 1; 252 | CUPTI_CALL(cuptiEventGetAttribute(eventIds[j], 253 | CUPTI_EVENT_ATTR_NAME, 254 | &eventNameSize, 255 | eventName)); 256 | eventName[127] = '\0'; 257 | _DBG("\t%s = %llu (", eventName, (unsigned long long)sum); 258 | if (numInstances > 1) { 259 | for (int k = 0; k < numInstances; k++) { 260 | if (k != 0) 261 | _DBG(", "); 262 | _DBG("%llu", (unsigned long long)values[k]); 263 | } 264 | } 265 | 266 | _DBG(")\n"); 267 | _LOG("\t%s (normalized) (%llu * %u) / %u = %llu", 268 | eventName, (unsigned long long)sum, 269 | numTotalInstances, numInstances, 270 | (unsigned long long)normalized); 271 | } 272 | } 273 | free(values); 274 | free(eventIds); 275 | } 276 | 277 | for (int i = 0; 278 | i < pass_data.event_groups->numEventGroups; 279 | i++) { 280 | _LOG(" Disabling group %d", i); 281 | CUPTI_CALL(cuptiEventGroupDisable( 282 | pass_data.event_groups->eventGroups[i])); 283 | } 284 | ++(*kernel_data)[current_kernel_name].m_current_pass; 285 | } 286 | } 287 | 288 | template 289 | void print_metric(CUpti_MetricID& id, 290 | CUpti_MetricValue& value, 291 | stream_t& s) { 
292 | CUpti_MetricValueKind value_kind; 293 | size_t value_kind_sz = sizeof(value_kind); 294 | CUPTI_CALL(cuptiMetricGetAttribute(id, CUPTI_METRIC_ATTR_VALUE_KIND, 295 | &value_kind_sz, &value_kind)); 296 | switch(value_kind) { 297 | case CUPTI_METRIC_VALUE_KIND_DOUBLE: 298 | s << value.metricValueDouble; 299 | break; 300 | case CUPTI_METRIC_VALUE_KIND_UINT64: 301 | s << value.metricValueUint64; 302 | break; 303 | case CUPTI_METRIC_VALUE_KIND_INT64: 304 | s << value.metricValueInt64; 305 | break; 306 | case CUPTI_METRIC_VALUE_KIND_PERCENT: 307 | s << value.metricValuePercent; 308 | break; 309 | case CUPTI_METRIC_VALUE_KIND_THROUGHPUT: 310 | s << value.metricValueThroughput; 311 | break; 312 | case CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL: 313 | s << value.metricValueUtilizationLevel; 314 | break; 315 | default: 316 | std::cerr << "[error]: unknown value kind\n"; 317 | exit(-1); 318 | } 319 | } 320 | 321 | } // namespace detail 322 | 323 | struct profiler { 324 | typedef std::vector strvec_t; 325 | using event_val_t = detail::kernel_data_t::event_val_t; 326 | using metric_val_t = detail::kernel_data_t::metric_val_t; 327 | 328 | profiler(const strvec_t& events, 329 | const strvec_t& metrics, 330 | const int device_num = 0) : 331 | m_event_names(events), 332 | m_metric_names(metrics), 333 | m_device_num(device_num), 334 | m_num_metrics(metrics.size()), 335 | m_num_events(events.size()), 336 | m_metric_passes(0), 337 | m_event_passes(0) { 338 | 339 | int device_count = 0; 340 | 341 | CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); 342 | DRIVER_API_CALL(cuDeviceGetCount(&device_count)); 343 | if (device_count == 0) { 344 | fprintf(stderr, "There is no device supporting CUDA.\n"); 345 | exit(1); 346 | } 347 | 348 | m_metric_ids.resize(m_num_metrics); 349 | m_event_ids.resize(m_num_events); 350 | 351 | // Init device, context and setup callback 352 | DRIVER_API_CALL(cuDeviceGet(&m_device, device_num)); 353 | DRIVER_API_CALL(cuCtxCreate(&m_context, 0, 
m_device)); 354 | CUPTI_CALL(cuptiSubscribe(&m_subscriber, 355 | (CUpti_CallbackFunc)detail::get_value_callback, 356 | &m_kernel_data)); 357 | CUPTI_CALL(cuptiEnableCallback(1, m_subscriber, 358 | CUPTI_CB_DOMAIN_RUNTIME_API, 359 | CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020)); 360 | CUPTI_CALL(cuptiEnableCallback(1, m_subscriber, 361 | CUPTI_CB_DOMAIN_RUNTIME_API, 362 | CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000)); 363 | 364 | CUpti_MetricID *metric_ids = 365 | (CUpti_MetricID*)calloc(sizeof(CUpti_MetricID), m_num_metrics); 366 | for(int i = 0; i < m_num_metrics; ++i) { 367 | CUPTI_CALL(cuptiMetricGetIdFromName(m_device, 368 | m_metric_names[i].c_str(), 369 | &metric_ids[i])); 370 | } 371 | 372 | CUpti_EventID *event_ids = 373 | (CUpti_EventID*)calloc(sizeof(CUpti_EventID), m_num_events); 374 | for(int i = 0; i < m_num_events; ++i) { 375 | CUPTI_CALL(cuptiEventGetIdFromName(m_device, 376 | m_event_names[i].c_str(), 377 | &event_ids[i])); 378 | } 379 | 380 | if(m_num_metrics > 0) { 381 | 382 | CUPTI_CALL(cuptiMetricCreateEventGroupSets(m_context, 383 | sizeof(CUpti_MetricID) * m_num_metrics, metric_ids, 384 | &m_metric_pass_data)); 385 | m_metric_passes = m_metric_pass_data->numSets; 386 | 387 | std::copy(metric_ids, metric_ids + m_num_metrics, 388 | m_metric_ids.begin()); 389 | } 390 | if(m_num_events > 0) { 391 | CUPTI_CALL(cuptiEventGroupSetsCreate(m_context, 392 | sizeof(CUpti_EventID) * m_num_events, event_ids, 393 | &m_event_pass_data)); 394 | m_event_passes = m_event_pass_data->numSets; 395 | 396 | std::copy(event_ids, event_ids + m_num_events, 397 | m_event_ids.begin()); 398 | } 399 | 400 | _LOG("# Metric Passes: %d\n", m_metric_passes); 401 | _LOG("# Event Passes: %d\n", m_event_passes); 402 | 403 | assert((m_metric_passes + m_event_passes) > 0); 404 | 405 | detail::kernel_data_t dummy_data; 406 | dummy_data.m_name = dummy_kernel_name; 407 | dummy_data.m_metric_passes = m_metric_passes; 408 | dummy_data.m_event_passes = m_event_passes; 409 | 
dummy_data.m_device = m_device; 410 | dummy_data.m_total_passes = m_metric_passes + m_event_passes; 411 | dummy_data.m_pass_data.resize(m_metric_passes + m_event_passes); 412 | 413 | auto& pass_data = dummy_data.m_pass_data; 414 | for(int i = 0; i < m_metric_passes; ++i) { 415 | int total_events = 0; 416 | _LOG("[metric] Looking at set (pass) %d", i); 417 | uint32_t num_events = 0; 418 | size_t num_events_size = sizeof(num_events); 419 | for(int j = 0; j < m_metric_pass_data->sets[i].numEventGroups; ++j) { 420 | CUPTI_CALL(cuptiEventGroupGetAttribute( 421 | m_metric_pass_data->sets[i].eventGroups[j], 422 | CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS, 423 | &num_events_size, &num_events)); 424 | _LOG(" Event Group %d, #Events = %d", j, num_events); 425 | total_events += num_events; 426 | } 427 | pass_data[i].event_groups = m_metric_pass_data->sets + i; 428 | pass_data[i].num_events = total_events; 429 | } 430 | 431 | for(int i = 0; i < m_event_passes; ++i) { 432 | int total_events = 0; 433 | _LOG("[event] Looking at set (pass) %d", i); 434 | uint32_t num_events = 0; 435 | size_t num_events_size = sizeof(num_events); 436 | for(int j = 0; j < m_event_pass_data->sets[i].numEventGroups; ++j) { 437 | CUPTI_CALL(cuptiEventGroupGetAttribute( 438 | m_event_pass_data->sets[i].eventGroups[j], 439 | CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS, 440 | &num_events_size, &num_events)); 441 | _LOG(" Event Group %d, #Events = %d", j, num_events); 442 | total_events += num_events; 443 | } 444 | pass_data[i + m_metric_passes].event_groups = 445 | m_event_pass_data->sets + i; 446 | pass_data[i + m_metric_passes].num_events = total_events; 447 | } 448 | 449 | m_kernel_data[dummy_kernel_name] = dummy_data; 450 | free(metric_ids); 451 | free(event_ids); 452 | } 453 | 454 | ~profiler() { 455 | } 456 | 457 | int get_passes() 458 | { return m_metric_passes + m_event_passes; } 459 | 460 | void start() { 461 | } 462 | 463 | void stop() { 464 | for(auto &k: m_kernel_data) { 465 | auto& data = k.second.m_pass_data; 
466 | 467 | if(k.first == dummy_kernel_name) 468 | continue; 469 | 470 | int total_events = 0; 471 | for(int i = 0; i < m_metric_passes; ++i) { 472 | //total_events += m_metric_data[i].num_events; 473 | total_events += data[i].num_events; 474 | } 475 | CUpti_MetricValue metric_value; 476 | CUpti_EventID *event_ids = new CUpti_EventID[total_events]; 477 | uint64_t *event_values = new uint64_t[total_events]; 478 | 479 | int running_sum = 0; 480 | for(int i = 0; i < m_metric_passes; ++i) { 481 | std::copy(data[i].event_ids.begin(), 482 | data[i].event_ids.end(), 483 | event_ids + running_sum); 484 | std::copy(data[i].event_values.begin(), 485 | data[i].event_values.end(), 486 | event_values + running_sum); 487 | running_sum += data[i].num_events; 488 | } 489 | 490 | for(int i = 0; i < m_num_metrics; ++i) { 491 | CUptiResult _status = cuptiMetricGetValue(m_device, 492 | m_metric_ids[i], 493 | total_events * sizeof(CUpti_EventID), 494 | event_ids, 495 | total_events * sizeof(uint64_t), 496 | event_values, 497 | 0, &metric_value); 498 | if(_status != CUPTI_SUCCESS) { 499 | fprintf(stderr, "Metric value retrieval failed for metric %s\n", 500 | m_metric_names[i].c_str()); 501 | exit(-1); 502 | } 503 | k.second.m_metric_values.push_back(metric_value); 504 | } 505 | 506 | delete[] event_ids; 507 | delete[] event_values; 508 | 509 | std::map event_map; 510 | for(int i = m_metric_passes; 511 | i < (m_metric_passes + m_event_passes); 512 | ++i) { 513 | for(int j = 0; j < data[i].num_events; ++j) { 514 | event_map[data[i].event_ids[j]] = 515 | data[i].event_values[j]; 516 | } 517 | } 518 | 519 | for(int i = 0; i < m_num_events; ++i) { 520 | k.second.m_event_values.push_back( 521 | event_map[m_event_ids[i]]); 522 | } 523 | } 524 | 525 | // Disable callback and unsubscribe 526 | CUPTI_CALL(cuptiEnableCallback(0, m_subscriber, 527 | CUPTI_CB_DOMAIN_RUNTIME_API, 528 | CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020)); 529 | CUPTI_CALL(cuptiEnableCallback(0, m_subscriber, 530 | 
CUPTI_CB_DOMAIN_RUNTIME_API, 531 | CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000)); 532 | CUPTI_CALL(cuptiUnsubscribe(m_subscriber)); 533 | } 534 | 535 | template 536 | void print_event_values(stream& s, 537 | bool print_names=true, 538 | const char* kernel_separator = "; ") { 539 | using ull_t = unsigned long long; 540 | 541 | for(auto const& k: m_kernel_data) { 542 | if(k.first == dummy_kernel_name) 543 | continue; 544 | 545 | //printf("%s: ", 546 | // m_kernel_data[k.first].m_name.c_str()); 547 | 548 | /*for(int i = 0; i < m_num_events; ++i) { 549 | printf("Event [%s] = %llu\n", 550 | m_event_names[i].c_str(), 551 | (ull_t)m_kernel_data[k.first].m_event_values[i]); 552 | } 553 | printf("\n");*/ 554 | 555 | if(m_num_events <= 0) 556 | return; 557 | 558 | for(int i = 0; i < m_num_events; ++i) { 559 | if(print_names) 560 | s << "(" << m_event_names[i] << "," 561 | << (ull_t)m_kernel_data[k.first].m_event_values[i] 562 | << ") "; 563 | else 564 | s << (ull_t)m_kernel_data[k.first].m_event_values[i] 565 | << " "; 566 | } 567 | s << kernel_separator; 568 | } 569 | printf("\n"); 570 | } 571 | 572 | template 573 | void print_metric_values(stream& s, 574 | bool print_names=true, 575 | const char* kernel_separator = "; ") { 576 | if(m_num_metrics <= 0) 577 | return; 578 | 579 | for(auto const& k: m_kernel_data) { 580 | if(k.first == dummy_kernel_name) 581 | continue; 582 | 583 | //printf("%s: ", 584 | // m_kernel_data[k.first].m_name.c_str()); 585 | 586 | for(int i = 0; i < m_num_metrics; ++i) { 587 | if(print_names) 588 | s << "(" << m_metric_names[i] << ","; 589 | 590 | detail::print_metric( 591 | m_metric_ids[i], 592 | m_kernel_data[k.first].m_metric_values[i], 593 | s); 594 | 595 | if(print_names) s << ") "; 596 | else s << " "; 597 | } 598 | s << kernel_separator; 599 | } 600 | printf("\n"); 601 | } 602 | 603 | template 604 | void print_events_and_metrics(stream& s, 605 | bool print_names = true, 606 | const char* kernel_separator = "; ") { 607 | if(m_num_events <= 
0 && m_num_metrics <= 0) 608 | return; 609 | 610 | using ull_t = unsigned long long; 611 | for(auto const& k: m_kernel_data) { 612 | if(k.first == dummy_kernel_name) 613 | continue; 614 | 615 | //printf("New kernel: %s \n", 616 | // m_kernel_data[k.first].m_name.c_str()); 617 | 618 | for(int i = 0; i < m_num_events; ++i) { 619 | if(print_names) 620 | s << "(" << m_event_names[i] << "," 621 | << (ull_t)m_kernel_data[k.first].m_event_values[i] 622 | << ") "; 623 | else 624 | s << (ull_t)m_kernel_data[k.first].m_event_values[i] 625 | << " "; 626 | } 627 | 628 | for(int i = 0; i < m_num_metrics; ++i) { 629 | if(print_names) 630 | s << "(" << m_metric_names[i] << ","; 631 | 632 | detail::print_metric( 633 | m_metric_ids[i], 634 | m_kernel_data[k.first].m_metric_values[i], 635 | s); 636 | 637 | if(print_names) s << ") "; 638 | else s << " "; 639 | } 640 | 641 | s << kernel_separator; 642 | } 643 | printf("\n"); 644 | } 645 | 646 | std::vector get_kernel_names() { 647 | if(m_kernel_names.size() == 0) { 648 | for(auto const& k: m_kernel_data) { 649 | if(k.first == dummy_kernel_name) 650 | continue; 651 | m_kernel_names.push_back(k.first); 652 | } 653 | } 654 | return m_kernel_names; 655 | } 656 | 657 | event_val_t 658 | get_event_values(const char *kernel_name) { 659 | if(m_num_events > 0) 660 | return m_kernel_data[kernel_name].m_event_values; 661 | else 662 | return event_val_t{}; 663 | } 664 | 665 | metric_val_t get_metric_values(const char *kernel_name) { 666 | if(m_num_metrics > 0) 667 | return m_kernel_data[kernel_name].m_metric_values; 668 | else 669 | return metric_val_t{}; 670 | } 671 | 672 | private: 673 | int m_device_num; 674 | int m_num_metrics, m_num_events; 675 | const strvec_t& m_event_names; 676 | const strvec_t& m_metric_names; 677 | std::vector m_metric_ids; 678 | std::vector m_event_ids; 679 | 680 | CUcontext m_context; 681 | CUdevice m_device; 682 | CUpti_SubscriberHandle m_subscriber; 683 | 684 | CUpti_EventGroupSets *m_metric_pass_data; 685 | 
CUpti_EventGroupSets *m_event_pass_data; 686 | 687 | int m_metric_passes, m_event_passes; 688 | // Kernel-specific (indexed by name) trace data 689 | std::map m_kernel_data; 691 | std::vector m_kernel_names; 692 | int m_num_kernels; 693 | }; 694 | 695 | #ifndef __CUPTI_PROFILER_NAME_SHORT 696 | #define __CUPTI_PROFILER_NAME_SHORT 128 697 | #endif 698 | 699 | std::vector available_metrics(CUdevice device) { 700 | std::vector metric_names; 701 | uint32_t numMetric; 702 | size_t size; 703 | char metricName[__CUPTI_PROFILER_NAME_SHORT]; 704 | CUpti_MetricValueKind metricKind; 705 | CUpti_MetricID *metricIdArray; 706 | 707 | CUPTI_CALL(cuptiDeviceGetNumMetrics(device, &numMetric)); 708 | size = sizeof(CUpti_MetricID) * numMetric; 709 | metricIdArray = (CUpti_MetricID*) malloc(size); 710 | if(NULL == metricIdArray) { 711 | printf("Memory could not be allocated for metric array"); 712 | exit(-1); 713 | } 714 | 715 | CUPTI_CALL(cuptiDeviceEnumMetrics(device, &size, metricIdArray)); 716 | 717 | for (int i = 0; i < numMetric; i++) { 718 | size = __CUPTI_PROFILER_NAME_SHORT; 719 | CUPTI_CALL(cuptiMetricGetAttribute(metricIdArray[i], 720 | CUPTI_METRIC_ATTR_NAME, &size, (void *)& metricName)); 721 | size = sizeof(CUpti_MetricValueKind); 722 | CUPTI_CALL(cuptiMetricGetAttribute(metricIdArray[i], 723 | CUPTI_METRIC_ATTR_VALUE_KIND, &size, (void *)& metricKind)); 724 | if ((metricKind == CUPTI_METRIC_VALUE_KIND_THROUGHPUT) 725 | || (metricKind == CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL)) { 726 | printf("Metric %s cannot be profiled as metric requires GPU" 727 | "time duration for kernel run.\n", metricName); 728 | } else { 729 | metric_names.push_back(metricName); 730 | } 731 | } 732 | free(metricIdArray); 733 | return std::move(metric_names); 734 | } 735 | 736 | std::vector available_events(CUdevice device) { 737 | std::vector event_names; 738 | uint32_t numDomains = 0, numEvents = 0, totalEvents = 0; 739 | size_t size; 740 | CUpti_EventDomainID* domainIdArray; 741 | 
CUpti_EventID *eventIdArray; 742 | size_t eventIdArraySize; 743 | char eventName[__CUPTI_PROFILER_NAME_SHORT]; 744 | 745 | CUPTI_CALL(cuptiDeviceGetNumEventDomains(device, &numDomains)); 746 | size = sizeof(CUpti_EventDomainID) * numDomains; 747 | domainIdArray = (CUpti_EventDomainID*) malloc(size); 748 | if(NULL == domainIdArray) { 749 | printf("Memory could not be allocated for domain array"); 750 | exit(-1); 751 | } 752 | CUPTI_CALL(cuptiDeviceEnumEventDomains(device, &size, domainIdArray)); 753 | 754 | for (int i = 0; i < numDomains; i++) { 755 | CUPTI_CALL(cuptiEventDomainGetNumEvents(domainIdArray[i], &numEvents)); 756 | totalEvents += numEvents; 757 | } 758 | 759 | eventIdArraySize = sizeof(CUpti_EventID) * totalEvents; 760 | eventIdArray = (CUpti_EventID *) malloc(eventIdArraySize); 761 | 762 | totalEvents = 0; 763 | for (int i = 0; i < numDomains; i++) { 764 | // Query num of events available in the domain 765 | CUPTI_CALL(cuptiEventDomainGetNumEvents(domainIdArray[i], &numEvents)); 766 | size = numEvents * sizeof(CUpti_EventID); 767 | CUPTI_CALL( 768 | cuptiEventDomainEnumEvents(domainIdArray[i], &size, 769 | eventIdArray + totalEvents)); 770 | totalEvents += numEvents; 771 | } 772 | 773 | for (int i = 0; i < totalEvents; i++) { 774 | size = __CUPTI_PROFILER_NAME_SHORT; 775 | CUPTI_CALL(cuptiEventGetAttribute(eventIdArray[i], 776 | CUPTI_EVENT_ATTR_NAME, &size, eventName)); 777 | event_names.push_back(eventName); 778 | } 779 | free(domainIdArray); 780 | free(eventIdArray); 781 | return std::move(event_names); 782 | } 783 | 784 | } // namespace cupti_profiler 785 | --------------------------------------------------------------------------------