├── python ├── .gitignore ├── prof_common.py ├── dvdt_prof_cli │ ├── dvdt_parser-v0.1.py │ └── dvdt_function.py ├── prof_wrangler.py └── prof_parser.py ├── tests ├── pipe.sh ├── .gitignore ├── clEnqueueReadBuffer.py ├── clEnqueueWriteBuffer.py ├── clCreateKernel.cpp ├── clSetKernelArg.cpp ├── clSetKernelArg_str.cpp ├── clCreateKernelsInProgram.cpp ├── clCreateBuffer.cpp ├── clCreateCommandQueue.cpp ├── clCreateProgramWithSource.cpp ├── clBuildProgram.cpp ├── clCreateProgramWithBinary.cpp ├── clEnqueueReadBuffer.cpp ├── clEnqueueWriteBuffer.cpp ├── clEnqueueNDRangeKernel.cpp ├── clEnqueueNDRangeKernel_LWS.cpp ├── clEnqueueNDRangeKernel_LWS_NULL.cpp ├── README.md ├── clCreateKernel.py ├── clCreateCommandQueue.py ├── clSetKernelArg.py ├── clSetKernelArg_str.py ├── clCreateBuffer.py ├── clCreateProgramWithSource.py ├── clCreateKernelsInProgram.py ├── CMakeLists.txt ├── clBuildProgram.py ├── clCreateProgramWithBinary.py ├── clEnqueueNDRangeKernel.py ├── clEnqueueReadOrWriteBuffer.py ├── clEnqueueNDRangeKernel_LWS_NULL.py └── clEnqueueNDRangeKernel_LWS.py ├── .gitignore ├── CONTRIBUTORS.txt ├── CHANGES.txt ├── cpp ├── prof_info.hpp.in ├── prof.cpp └── prof.hpp ├── LICENSE.txt ├── CMakeLists.txt └── README.md /python/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /tests/pipe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | $1 2>&1 | $2 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | prof.so 2 | libprof_test.so 3 | Testing/ 4 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | libprof_test.so 2 | *.exe 3 | *.pyc 4 
| -------------------------------------------------------------------------------- /CONTRIBUTORS.txt: -------------------------------------------------------------------------------- 1 | Anton Lokhmotov, anton@dividiti.com 2 | Grigori Fursin, grigori@dividiti.com 3 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | v0.2 First public release. 2 | * Output JSON online. 3 | 4 | v0.1 Internal development. 5 | * Output to stdout online; parse to JSON offline. 6 | -------------------------------------------------------------------------------- /tests/clEnqueueReadBuffer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, re 3 | 4 | sys.path.append('.') 5 | from clEnqueueReadOrWriteBuffer import run 6 | 7 | # Test info. 8 | call = 'clEnqueueReadBuffer' 9 | _id = '' 10 | 11 | # Run test. 12 | run(call, _id) 13 | -------------------------------------------------------------------------------- /cpp/prof_info.hpp.in: -------------------------------------------------------------------------------- 1 | // 2015-2017 (c) dividiti 2 | 3 | // The configured options and settings for Prof. 4 | #define Prof_VERSION_MAJOR @Prof_VERSION_MAJOR@ 5 | #define Prof_VERSION_MINOR @Prof_VERSION_MINOR@ 6 | #define Prof_COPYRIGHT_DIVIDITI @Prof_COPYRIGHT_DIVIDITI@ 7 | -------------------------------------------------------------------------------- /tests/clEnqueueWriteBuffer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, re 3 | 4 | sys.path.append('.') 5 | from clEnqueueReadOrWriteBuffer import run 6 | 7 | # Test info. 8 | call = 'clEnqueueWriteBuffer' 9 | _id = '' 10 | 11 | # Run test. 
12 | run(call, _id) 13 | -------------------------------------------------------------------------------- /tests/clCreateKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | cl_program program = (cl_program) 0x01234567; 7 | const char * kernel_name = "DGEMM_NT_2x2"; 8 | cl_int * errcode = (cl_int *) 0x12345678; 9 | 10 | cl_kernel kernel = clCreateKernel(program, kernel_name, errcode); 11 | assert((cl_kernel) 0x00000000 == kernel); 12 | 13 | return 0; 14 | } 15 | -------------------------------------------------------------------------------- /tests/clSetKernelArg.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | cl_kernel kernel = (cl_kernel) 0x01234567; 7 | cl_uint arg_index = 1; 8 | cl_ushort arg_value = 1234; 9 | size_t arg_size = 2; 10 | 11 | cl_int errcode = clSetKernelArg(kernel, arg_index, arg_size, (const void*) &arg_value); 12 | assert(CL_SUCCESS == errcode); 13 | 14 | return 0; 15 | } 16 | -------------------------------------------------------------------------------- /tests/clSetKernelArg_str.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | cl_kernel kernel = (cl_kernel) 0x01234567; 7 | cl_uint arg_index = 2; 8 | char arg_value[] = "hello world"; 9 | size_t arg_size = 11; 10 | assert(sizeof(arg_value) == arg_size+1); 11 | 12 | cl_int errcode = clSetKernelArg(kernel, arg_index, arg_size, (const void*) &arg_value); 13 | assert(CL_SUCCESS == errcode); 14 | 15 | return 0; 16 | } 17 | -------------------------------------------------------------------------------- /tests/clCreateKernelsInProgram.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | cl_program program = (cl_program) 0x01234567; 7 | 
cl_uint num_kernels = 2; 8 | cl_kernel * kernels = (cl_kernel *) 0x12345678; 9 | cl_uint * num_kernels_ret_ptr = (cl_uint *) 0x23456789; 10 | 11 | cl_int errcode = clCreateKernelsInProgram(program, num_kernels, kernels, num_kernels_ret_ptr); 12 | assert(CL_SUCCESS == errcode); 13 | 14 | return 0; 15 | } 16 | -------------------------------------------------------------------------------- /tests/clCreateBuffer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | cl_context context = (cl_context) 0x01234567; 7 | cl_mem_flags flags = (cl_mem_flags) 17; // CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; 8 | size_t size = 4096; 9 | void * host_ptr = (void *) 0x12345678; 10 | cl_int * errcode = (cl_int *) 0x23456789; 11 | 12 | cl_mem buffer = clCreateBuffer(context, flags, size, host_ptr, errcode); 13 | assert((cl_mem) 0x00000000 == buffer); 14 | 15 | return 0; 16 | } 17 | -------------------------------------------------------------------------------- /tests/clCreateCommandQueue.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | cl_context context = (cl_context) 0x01234567; 7 | cl_device_id device = (cl_device_id) 0x12345678; 8 | cl_command_queue_properties properties = (cl_command_queue_properties) 0; 9 | cl_int * errcode = (cl_int *) 0x23456789; 10 | 11 | cl_command_queue queue = clCreateCommandQueue(context, device, properties, errcode); 12 | assert(((cl_command_queue) 0x00000000 == queue)); 13 | 14 | return 0; 15 | } 16 | -------------------------------------------------------------------------------- /tests/clCreateProgramWithSource.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | cl_context context = (cl_context) 0x01234567; 7 | cl_uint count = 1; 8 | const char * strings[1] = { "kernel void f() {}" }; 9 
| const size_t * lengths = (const size_t *) 0x00000000; 10 | cl_int * errcode = (cl_int *) 0x12345678; 11 | 12 | cl_program program = clCreateProgramWithSource(context, count, strings, lengths, errcode); 13 | assert((cl_program) 0x00000000 == program); 14 | 15 | return 0; 16 | } 17 | -------------------------------------------------------------------------------- /tests/clBuildProgram.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | typedef void (CL_CALLBACK pfn_notify_t)(cl_program program, void * user_data); 5 | 6 | int main() 7 | { 8 | cl_program program = (cl_program) 0x01234567; 9 | const cl_uint num_devices = 2; 10 | cl_device_id device_list[2] = { (cl_device_id) 0x12345678, (cl_device_id) 0x23456789 }; 11 | const char * options = "-Werror -DN=1024"; 12 | pfn_notify_t * pfn_notify = (pfn_notify_t *) 0x3456789a; 13 | void * user_data = (void *) 0x456789ab; 14 | 15 | cl_int errcode = clBuildProgram(program, num_devices, device_list, options, pfn_notify, user_data); 16 | assert(CL_SUCCESS == errcode); 17 | 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- /tests/clCreateProgramWithBinary.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() 5 | { 6 | cl_context context = (cl_context) 0x01234567; 7 | cl_uint num_devices = 2; 8 | cl_device_id device_list[2] = { (cl_device_id) 0x12345678, (cl_device_id) 0x12345678 }; 9 | const size_t * lengths = (const size_t *) 0x23456789; 10 | const unsigned char ** binaries = (const unsigned char **) 0x3456789A; 11 | cl_int * binary_status = (cl_int *) 0x456789AB; 12 | cl_int * errcode_ret = (cl_int *) 0x56789ABC; 13 | 14 | cl_program program = clCreateProgramWithBinary(context, 15 | num_devices, device_list, 16 | lengths, binaries, 17 | binary_status, errcode_ret); 18 | assert((cl_program) 0x00000000 == program); 19 | 20 | 
return 0; 21 | } 22 | -------------------------------------------------------------------------------- /python/prof_common.py: -------------------------------------------------------------------------------- 1 | # 2 | # 2015-2017 (c) dividiti 3 | # 4 | 5 | import re 6 | import json 7 | 8 | # 9 | # Common definitions. 10 | # 11 | 12 | prefix = '(\[dv\/dt\])' 13 | call_regex = '(cl[a-zA-Z]*)' 14 | opts_regex = '([ \-\w_=]*)' 15 | iso_regex = '(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6})' 16 | ptr_regex = '((0x[0-9a-fA-F]{1,8})|(0))' 17 | int_regex = '(\d+)' 18 | hex_regex = '([a-fA-F\d]+)' 19 | bool_regex = '(\d)' 20 | 21 | # Check that definitions from this file are available. 22 | def test(): 23 | print ("prof_common.py") 24 | 25 | # Convert hexadecimal string (a dump of bytes, least significant byte first) into integer. 26 | def hex_str_as_int(hex_str): 27 | hex_str_reversed = ''.join(reversed( 28 | [ hex_str[n:n+2] for n in range(0,len(hex_str),2) ] 29 | )) 30 | return int(hex_str_reversed, 16) 31 | 32 | # Convert hexadecimal string into text string.
33 | def hex_str_as_str(hex_str): 34 | return hex_str.decode('hex') 35 | -------------------------------------------------------------------------------- /tests/clEnqueueReadBuffer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main() 6 | { 7 | cl_command_queue queue = (cl_command_queue) 0x01234567; 8 | cl_mem buffer = (cl_mem) 0x12345678; 9 | cl_bool blocking = 1; 10 | size_t offset = 44; 11 | size_t size = 55; 12 | void *ptr = (void *) 0x23456789; 13 | cl_uint num_events_in_wait_list = 3; 14 | cl_event event_wait_list[3] = { (cl_event) 0x3456789a, (cl_event) 0x456789ab, (cl_event) 0x56789abc }; 15 | cl_event * event = (cl_event *) 0; 16 | 17 | cl_int errcode = clEnqueueReadBuffer(queue, buffer, blocking, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); 18 | assert(CL_SUCCESS == errcode); 19 | 20 | // Uncomment to emulate ostream profiling output (deprecated approach). 21 | // NB: Pattern matching still works even when it's commented out. 
22 | // std::cout << "[dv/dt] clEnqueueReadBuffer profiling 100200300400 100200300500 100200300600 100200300700\n"; 23 | 24 | return 0; 25 | } 26 | -------------------------------------------------------------------------------- /tests/clEnqueueWriteBuffer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main() 6 | { 7 | cl_command_queue queue = (cl_command_queue) 0x01234567; 8 | cl_mem buffer = (cl_mem) 0x12345678; 9 | cl_bool blocking = 1; 10 | size_t offset = 44; 11 | size_t size = 55; 12 | const void *ptr = (const void *) 0x23456789; 13 | cl_uint num_events_in_wait_list = 3; 14 | cl_event event_wait_list[3] = { (cl_event) 0x3456789a, (cl_event) 0x456789ab, (cl_event) 0x56789abc }; 15 | cl_event * event = (cl_event *) 0; 16 | 17 | cl_int errcode = clEnqueueWriteBuffer(queue, buffer, blocking, offset, size, ptr, num_events_in_wait_list, event_wait_list, event); 18 | assert(CL_SUCCESS == errcode); 19 | 20 | // Uncomment to emulate ostream profiling output (deprecated approach). 21 | // NB: Pattern matching still works even when it's commented out. 
22 | // std::cout << "[dv/dt] clEnqueueWriteBuffer profiling 100200300400 100200300500 100200300600 100200300700\n"; 23 | 24 | return 0; 25 | } 26 | -------------------------------------------------------------------------------- /tests/clEnqueueNDRangeKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main() 6 | { 7 | cl_command_queue queue = (cl_command_queue) 0x01234567; 8 | cl_kernel kernel = (cl_kernel) 0x12345678; 9 | cl_uint work_dim = 2; 10 | size_t global_work_offset[2] = { 0, 1 }; 11 | size_t global_work_size[2] = { 1024, 2 }; 12 | size_t * local_work_size = NULL; 13 | cl_uint num_events_in_wait_list = 2; 14 | cl_event event_wait_list[2] = { (cl_event) 0x23456789, (cl_event) 0x3456789a }; 15 | cl_event * event = (cl_event *) 0x456789ab; 16 | 17 | cl_int errcode = clEnqueueNDRangeKernel(queue, kernel, \ 18 | work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); 19 | assert(CL_SUCCESS == errcode); 20 | 21 | // Uncomment to emulate ostream profiling output (deprecated approach). 22 | // NB: Pattern matching still works even when it's commented out. 
23 | // std::cout << "[dv/dt] clEnqueueNDRangeKernel profiling 100200300400 100200300500 100200300600 100200300700\n"; 24 | 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /tests/clEnqueueNDRangeKernel_LWS.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main() 6 | { 7 | cl_command_queue queue = (cl_command_queue) 0x01234567; 8 | cl_kernel kernel = (cl_kernel) 0x12345678; 9 | cl_uint work_dim = 2; 10 | size_t global_work_offset[2] = { 0, 1 }; 11 | size_t global_work_size[2] = { 1024, 4 }; 12 | size_t local_work_size[2] = { 128, 2 }; 13 | cl_uint num_events_in_wait_list = 2; 14 | cl_event event_wait_list[2] = { (cl_event) 0x23456789, (cl_event) 0x3456789a }; 15 | cl_event * event = (cl_event *) 0x456789ab; 16 | 17 | cl_int errcode = clEnqueueNDRangeKernel(queue, kernel, \ 18 | work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); 19 | assert(CL_SUCCESS == errcode); 20 | 21 | // Uncomment to emulate ostream profiling output (deprecated approach). 22 | // NB: Pattern matching still works even when it's commented out. 
23 | // std::cout << "[dv/dt] clEnqueueNDRangeKernel profiling 100200300400 100200300500 100200300600 100200300700\n"; 24 | 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /tests/clEnqueueNDRangeKernel_LWS_NULL.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main() 6 | { 7 | cl_command_queue queue = (cl_command_queue) 0x01234567; 8 | cl_kernel kernel = (cl_kernel) 0x12345678; 9 | cl_uint work_dim = 2; 10 | size_t global_work_offset[2] = { 0, 1 }; 11 | size_t global_work_size[2] = { 1024, 4 }; 12 | size_t local_work_size[2] = { 128, 2 }; 13 | cl_uint num_events_in_wait_list = 2; 14 | cl_event event_wait_list[2] = { (cl_event) 0x23456789, (cl_event) 0x3456789a }; 15 | cl_event * event = (cl_event *) 0x456789ab; 16 | 17 | cl_int errcode = clEnqueueNDRangeKernel(queue, kernel, \ 18 | work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); 19 | assert(CL_SUCCESS == errcode); 20 | 21 | // Uncomment to emulate ostream profiling output (deprecated approach). 22 | // NB: Pattern matching still works even when it's commented out. 23 | // std::cout << "[dv/dt] clEnqueueNDRangeKernel profiling 100200300400 100200300500 100200300600 100200300700\n"; 24 | 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Rationale. 2 | 3 | The initial unit tests (for `clCreateKernel()`, `clEnqueueNDRangeKernel()`, etc.) are written using a C program and a Python program with the same base file name. 4 | 5 | The C program gets compiled and run with `libprof_test.so` so that the output resembles that of `libprof.so`. 
The only difference is that `libprof_test.so` only intercepts the arguments but does not pass them further into a real `libOpenCL.so` library. Indeed, the C program uses some random values for pointer arguments (e.g. `0x12345678`) so calling `libOpenCL.so` would result in a segmentation fault. 6 | 7 | The output of the C program is input into the Python program. The Python program parses the output using the parser in `python/prof_parser.py` producing a dictionary called `result`. The Python program also parses the C program file to extract the original values producing a dictionary called `source`. Finally, the two dictionaries are compared for equality. 8 | 9 | Getting the original values by parsing the C program is arguably hard. (Perhaps harder than it should be.) Another approach would be to generate the C program from a template. On the other hand, writing the Python program first helps writing the parser (cf. test-driven development). Proper comparison of the approaches is left for future work. 10 | -------------------------------------------------------------------------------- /tests/clCreateKernel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | import os 5 | import json 6 | 7 | sys.path.append('../python') 8 | from prof_parser import prof_parse 9 | from prof_parser import ptr_regex 10 | from prof_parser import opts_regex 11 | 12 | # Test info. 13 | call = 'clCreateKernel' 14 | _id = '' 15 | print '%s%s' % (call, _id) 16 | 17 | # Parse test source. 18 | source = {} 19 | with open(call + _id + '.cpp', 'r') as f: 20 | source['text'] = f.read() 21 | source['program'] = re.search('\(cl_program\) (?P%s)' % ptr_regex, source['text']).group('program') 22 | source['name'] = re.search('kernel_name = \"(?P%s)\"' % opts_regex, source['text']).group('name') 23 | # The following should match the assert statement. 
24 | source['kernel'] = re.search('\(cl_kernel\) (?P%s)' % ptr_regex, source['text']).group('kernel') 25 | 26 | # Read from stdin (via pipe). 27 | output = sys.stdin.read() 28 | print 'OUTPUT' 29 | print output 30 | 31 | result = prof_parse(output)[0] 32 | print 'RESULT' 33 | print result 34 | print 35 | 36 | status = True 37 | status &= (source['program'] == result['program']) 38 | status &= (source['name'] == result['name']) 39 | status &= (source['kernel'] == result['kernel']) 40 | 41 | print '%s%s: %s' % (call, _id, 'PASSED' if status else 'FAILED') 42 | print 43 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2015-2017 (c) dividiti and contributors 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, 7 | this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 13 | 3. Neither the name of dividiti nor the names of contributors may be used 14 | to endorse or promote products derived from this software without specific 15 | prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /tests/clCreateCommandQueue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | import os 5 | import json 6 | 7 | sys.path.append('../python') 8 | from prof_parser import prof_parse 9 | from prof_parser import ptr_regex 10 | 11 | # Test info. 12 | call = 'clCreateCommandQueue' 13 | _id = '' 14 | print '%s%s' % (call, _id) 15 | 16 | # Parse test source. 17 | source = {} 18 | with open(call + _id + '.cpp', 'r') as f: 19 | source['text'] = f.read() 20 | source['context'] = re.search('\(cl_context\) (?P%s)' % ptr_regex, source['text']).group('context') 21 | source['device'] = re.search('\(cl_device_id\) (?P%s)' % ptr_regex, source['text']).group('device') 22 | source['properties'] = int(re.search('\(cl_command_queue_properties\) (?P\d*)', source['text']).group('props')) 23 | source['errcode_ret'] = re.search('\(cl_int \*\) (?P%s)' % ptr_regex, source['text']).group('errcode_ret') 24 | # The following should match the assert statement. 25 | source['queue'] = re.search('\(cl_command_queue\) (?P%s)' % ptr_regex, source['text']).group('queue') 26 | 27 | # Read from stdin (via pipe). 
28 | output = sys.stdin.read() 29 | print 'OUTPUT' 30 | print output 31 | 32 | result = prof_parse(output)[0] 33 | print 'RESULT' 34 | print result 35 | print 36 | 37 | status = True 38 | status &= (source['context'] == result['context']) 39 | status &= (source['device'] == result['device']) 40 | status &= (source['properties'] == result['properties']) 41 | status &= (source['errcode_ret'] == result['errcode_ret']) 42 | 43 | print '%s%s: %s' % (call, _id, 'PASSED' if status else 'FAILED') 44 | print 45 | -------------------------------------------------------------------------------- /tests/clSetKernelArg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | import os 5 | import json 6 | 7 | sys.path.append('../python') 8 | from prof_parser import prof_parse 9 | from prof_parser import ptr_regex 10 | from prof_parser import opts_regex 11 | from prof_common import hex_str_as_int 12 | 13 | # Test info. 14 | call = 'clSetKernelArg' 15 | _id = '' 16 | print '%s%s' % (call, _id) 17 | 18 | # Parse test source. 19 | source = {} 20 | with open(call + _id + '.cpp', 'r') as f: 21 | source['text'] = f.read() 22 | source['kernel'] = re.search('\(cl_kernel\) (?P%s)' % ptr_regex, source['text']).group('kernel') 23 | source['arg_index'] = int(re.search('arg_index(\s*)=(\s*)(?P\d+)', source['text']).group('arg_index')) 24 | source['arg_value_as_int'] = int(re.search('arg_value(\s*)=(\s*)(?P\d+)', source['text']).group('arg_value')) 25 | source['arg_size'] = int(re.search('arg_size(\s*)=(\s*)(?P\d+)', source['text']).group('arg_size')) 26 | 27 | # Read from stdin (via pipe). 
28 | output = sys.stdin.read() 29 | print 'OUTPUT' 30 | print output 31 | 32 | result = prof_parse(output)[0] 33 | result['arg_value_as_int'] = hex_str_as_int(result['arg_value']) 34 | print 'RESULT' 35 | print result 36 | print 37 | 38 | 39 | status = True 40 | status &= (source['kernel'] == result['kernel']) 41 | status &= (source['arg_index'] == result['arg_index']) 42 | status &= (source['arg_size'] == result['arg_size']) 43 | status &= (source['arg_value_as_int'] == result['arg_value_as_int']) 44 | status &= (0 == result['errcode']) 45 | 46 | print '%s%s: %s' % (call, _id, 'PASSED' if status else 'FAILED') 47 | print 48 | -------------------------------------------------------------------------------- /tests/clSetKernelArg_str.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | import os 5 | import json 6 | 7 | sys.path.append('../python') 8 | from prof_parser import prof_parse 9 | from prof_parser import ptr_regex 10 | from prof_parser import opts_regex 11 | from prof_common import hex_str_as_str 12 | 13 | # Test info. 14 | call = 'clSetKernelArg' 15 | _id = '_str' 16 | print '%s%s' % (call, _id) 17 | 18 | # Parse test source. 19 | source = {} 20 | with open(call + _id + '.cpp', 'r') as f: 21 | source['text'] = f.read() 22 | source['kernel'] = re.search('\(cl_kernel\) (?P%s)' % ptr_regex, source['text']).group('kernel') 23 | source['arg_index'] = int(re.search('arg_index(\s*)=(\s*)(?P\d+)', source['text']).group('arg_index')) 24 | source['arg_value_as_str'] = re.search('arg_value\[\](\s*)=(\s*)\"(?P.+)\"', source['text']).group('arg_value') 25 | source['arg_size'] = int(re.search('arg_size(\s*)=(\s*)(?P\d+)', source['text']).group('arg_size')) 26 | 27 | # Read from stdin (via pipe). 
28 | output = sys.stdin.read() 29 | print 'OUTPUT' 30 | print output 31 | 32 | result = prof_parse(output)[0] 33 | result['arg_value_as_str'] = hex_str_as_str(result['arg_value']) 34 | print 'RESULT' 35 | print result 36 | print 37 | 38 | 39 | status = True 40 | status &= (source['kernel'] == result['kernel']) 41 | status &= (source['arg_index'] == result['arg_index']) 42 | status &= (source['arg_size'] == result['arg_size']) 43 | status &= (source['arg_value_as_str'] == result['arg_value_as_str']) 44 | status &= (0 == result['errcode']) 45 | 46 | print '%s%s: %s' % (call, _id, 'PASSED' if status else 'FAILED') 47 | print 48 | -------------------------------------------------------------------------------- /tests/clCreateBuffer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | import os 5 | import json 6 | 7 | sys.path.append('../python') 8 | from prof_parser import prof_parse 9 | from prof_parser import ptr_regex 10 | 11 | # Test info. 12 | call = 'clCreateBuffer' 13 | _id = '' 14 | print '%s%s' % (call, _id) 15 | 16 | # Parse test source. 17 | source = {} 18 | with open(call + _id + '.cpp', 'r') as f: 19 | source['text'] = f.read() 20 | source['context'] = re.search('\(cl_context\) (?P%s)' % ptr_regex, source['text']).group('context') 21 | source['flags'] = int(re.search('\(cl_mem_flags\) (?P\d*)', source['text']).group('flags')) 22 | source['size'] = int(re.search('size(\s*)=(\s*)(?P\d+)', source['text']).group('size')) 23 | source['host_ptr'] = re.search('\(void \*\) (?P%s)' % ptr_regex, source['text']).group('host_ptr') 24 | source['errcode_ret'] = re.search('\(cl_int \*\) (?P%s)' % ptr_regex, source['text']).group('errcode_ret') 25 | # The following should match the assert statement. 26 | source['buffer'] = re.search('\(cl_mem\) (?P%s)' % ptr_regex, source['text']).group('buffer') 27 | 28 | # Read from stdin (via pipe). 
29 | output = sys.stdin.read() 30 | print 'OUTPUT' 31 | print output 32 | 33 | result = prof_parse(output)[0] 34 | print 'RESULT' 35 | print result 36 | print 37 | 38 | status = True 39 | status &= (source['context'] == result['context']) 40 | status &= (source['flags'] == result['flags']) 41 | status &= (source['size'] == result['size']) 42 | status &= (source['host_ptr'] == result['host_ptr']) 43 | status &= (source['errcode_ret'] == result['errcode_ret']) 44 | 45 | print '%s%s: %s' % (call, _id, 'PASSED' if status else 'FAILED') 46 | print 47 | -------------------------------------------------------------------------------- /tests/clCreateProgramWithSource.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | import os 5 | import json 6 | 7 | sys.path.append('../python') 8 | from prof_parser import prof_parse 9 | from prof_parser import ptr_regex 10 | 11 | # Test info. 12 | call = 'clCreateProgramWithSource' 13 | _id = '' 14 | print '%s%s' % (call, _id) 15 | 16 | # Parse test source. 17 | source = {} 18 | with open(call + _id + '.cpp', 'r') as f: 19 | source['text'] = f.read() 20 | source['context'] = re.search('\(cl_context\) (?P%s)' % ptr_regex, source['text']).group('context') 21 | source['count'] = int(re.search('count(\s*)=(\s*)(?P\d+)', source['text']).group('count')) 22 | source['string0'] = re.search('strings\[\d+\](\s*)=(\s*)\{(\s*)"(?P.*)"', source['text']).group('string0') 23 | source['lengths'] = re.search('\(const size_t \*\) (?P%s)' % ptr_regex, source['text']).group('lengths') 24 | source['errcode_ret'] = re.search('\(cl_int \*\) (?P%s)' % ptr_regex, source['text']).group('errcode_ret') 25 | # The following should match the assert statement. 26 | source['program'] = re.search('\(cl_program\) (?P%s)' % ptr_regex, source['text']).group('program') 27 | 28 | # Read from stdin (via pipe). 
29 | output = sys.stdin.read() 30 | print 'OUTPUT' 31 | print output 32 | 33 | result = prof_parse(output)[0] 34 | print 'RESULT' 35 | print result 36 | print 37 | 38 | status = True 39 | status &= (source['context'] == result['context']) 40 | status &= (source['count'] == result['count']) 41 | status &= (source['string0'] == result['source']['0']) 42 | status &= (source['lengths'] == result['lengths']) 43 | status &= (source['errcode_ret'] == result['errcode_ret']) 44 | status &= (source['program'] == result['program']) 45 | 46 | print '%s%s: %s' % (call, _id, 'PASSED' if status else 'FAILED') 47 | print 48 | -------------------------------------------------------------------------------- /tests/clCreateKernelsInProgram.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | import os 5 | import json 6 | 7 | sys.path.append('../python') 8 | from prof_parser import prof_parse 9 | from prof_parser import ptr_regex 10 | from prof_parser import opts_regex 11 | 12 | # Test info. 13 | call = 'clCreateKernelsInProgram' 14 | _id = '' 15 | print '%s%s' % (call, _id) 16 | 17 | # Parse test source. 
18 | source = {} 19 | with open(call + _id + '.cpp', 'r') as f: 20 | source['text'] = f.read() 21 | # cl_program program = (cl_program) 0x01234567; 22 | source['program'] = re.search('\(cl_program\) (?P%s)' % ptr_regex, source['text']).group('program') 23 | # cl_uint num_kernels = 2; 24 | source['num_kernels'] = int(re.search('num_kernels(\s*)=(\s*)(?P\d+)', source['text']).group('num_kernels')) 25 | # cl_kernel * kernels = (cl_kernel *) 0x12345678; 26 | source['kernels'] = re.search('\(cl_kernel(\s*)\*\)(\s*)(?P%s)' % ptr_regex, source['text']).group('kernels') 27 | # cl_uint * num_kernels_ret_ptr = (cl_uint *) 0x23456789; 28 | source['num_kernels_ret_ptr'] = re.search('\(cl_uint(\s*)\*\)(\s*)(?P%s)' % ptr_regex, source['text']).group('num_kernels_ret_ptr') 29 | 30 | # Read from stdin (via pipe). 31 | output = sys.stdin.read() 32 | print 'OUTPUT' 33 | print output 34 | 35 | result = prof_parse(output)[0] 36 | print 'RESULT' 37 | print result 38 | print 39 | 40 | status = True 41 | status &= (source['program'] == result['program']) 42 | status &= (source['num_kernels'] == result['num_kernels']) 43 | status &= (source['kernels'] == result['kernels']) 44 | status &= (source['num_kernels_ret_ptr'] == result['num_kernels_ret_ptr']) 45 | status &= (0 == result['num_kernels_ret']) 46 | status &= (0 == result['errcode']) 47 | 48 | print '%s%s: %s' % (call, _id, 'PASSED' if status else 'FAILED') 49 | print 50 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2015-2017 (c) dividiti 2 | 3 | # Add an interceptor library that intercepts some calls in 'libOpenCL.so'. 4 | add_library(prof_test SHARED ${SOURCE}) 5 | set_target_properties(prof_test 6 | PROPERTIES 7 | COMPILE_FLAGS "-D DVDT_PROF_TEST=1 -D DVDT_PROF_WALLCLOCK_TIMEOFDAY=1") 8 | target_link_libraries(prof_test dl "${CJSON_LIB_PATH}") 9 | 10 | # Get path to 'libprof_test.so'. 
# The generator expression resolves to the full path of the built library;
# it is what gets exported as LD_PRELOAD for every test below.
set(PROF_TEST $<TARGET_FILE:prof_test>)

# Add OpenCL API test sources.
file(GLOB cl_api_test_sources ${PROJECT_SOURCE_DIR}/tests/cl*.cpp)
list(LENGTH cl_api_test_sources num_cl_api_test_sources)
message(STATUS "Using ${num_cl_api_test_sources} OpenCL API tests")

# Build OpenCL API test binaries.
foreach(test_cpp ${cl_api_test_sources})
  get_filename_component(test ${test_cpp} NAME_WE)
  add_executable(${test} ${test_cpp})
  add_dependencies(${test} prof_test)
  target_link_libraries(${test} OpenCL)
  list(APPEND cl_api_test_binaries ${test})
endforeach()

# Add OpenCL API tests.
# Each test pipes the output of the test binary into the Python checker of the
# same name and passes when the checker prints "<test>: PASSED".
foreach(test ${cl_api_test_binaries})
  add_test(NAME ${test}
    COMMAND
      ${PROJECT_SOURCE_DIR}/tests/pipe.sh
      "${PROJECT_BINARY_DIR}/bin/${test}"
      "${PROJECT_SOURCE_DIR}/tests/${test}.py"
    WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/tests"
  )
  string(CONCAT cl_api_test_regex "${test}" ": PASSED")
  set_tests_properties(${test}
    PROPERTIES
    PASS_REGULAR_EXPRESSION ${cl_api_test_regex}
    ENVIRONMENT "LD_PRELOAD=${PROF_TEST};PARSE_JSON=${CJSON_SET}")
endforeach()

# Extra environment for the tests that alter the local work size.
set_property(TEST clEnqueueNDRangeKernel_LWS
  APPEND PROPERTY ENVIRONMENT DVDT_PROF_LWS="dvdt_prof_kernel:1,2")
set_property(TEST clEnqueueNDRangeKernel_LWS_NULL
  APPEND PROPERTY ENVIRONMENT DVDT_PROF_LWS_NULL=1)

# Custom target: "make check"
add_custom_target(check
  COMMAND ${CMAKE_CTEST_COMMAND}
  DEPENDS prof_test ${cl_api_test_binaries}
)

--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
#
# 2015-2017 (c) dividiti
#

cmake_minimum_required(VERSION 3.0)

project(Prof)

# The Prof copyright messages.
set(Prof_COPYRIGHT_DIVIDITI "\"2015-2017 (c) dividiti\"")

# The Prof version.
set(Prof_VERSION_MAJOR 0)
set(Prof_VERSION_MINOR 2)
message("dividiti OpenCL API Profiler v${Prof_VERSION_MAJOR}.${Prof_VERSION_MINOR}")

# The WALLCLOCK option.
set(WALLCLOCK "boost" CACHE STRING "How to measure wall-clock time.")
if(WALLCLOCK STREQUAL "boost")
  message(STATUS "Measuring wall-clock time using boost::chrono")
  add_definitions(-D DVDT_PROF_WALLCLOCK_BOOST=1)
  include_directories("${BOOST_INCLUDE_DIR}")
  SET(BOOST_LIB_PATH "${BOOST_LIB_DIR}/libboost_date_time.a")
elseif(WALLCLOCK STREQUAL "timeofday")
  message(STATUS "Measuring wall-clock time using gettimeofday()")
  add_definitions(-D DVDT_PROF_WALLCLOCK_TIMEOFDAY=1)
else()
  message(WARNING "Unsupported WALLCLOCK option: ${WALLCLOCK}.")
endif()

# The CJSON option.
# The value is compared against the string "1" below and forwarded verbatim to
# the tests as PARSE_JSON, so it is declared as a STRING cache entry
# ("BOOLEAN" is not a cache entry type).
set(CJSON_SET "0" CACHE STRING "Parse JSON or default output.")
if("${CJSON_SET}" STREQUAL "1")
  add_definitions(-D DVDT_PROF_CJSON=1)
  include_directories("${CJSON_INCLUDE_DIR}")
  SET(CJSON_LIB_PATH "${CJSON_LIB_DIR}/${CJSON_LIB_NAME}")
endif()

# Set build options.
set(CMAKE_CXX_FLAGS "-O2 -W -Wall")
message(STATUS "Using compiler flags: ${CMAKE_CXX_FLAGS}")
message(STATUS "Using linker flags: ${CMAKE_SHARED_LINKER_FLAGS}")

# Output directory for executables.
SET(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}/bin)

# Output directory for libraries.
SET(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/lib)

# Pass CMake settings to the source code via a header file.
configure_file(
  "${PROJECT_SOURCE_DIR}/cpp/prof_info.hpp.in"
  "${PROJECT_BINARY_DIR}/include/prof_info.hpp"
  )

# Add the binary tree to the search path for include files
# so that 'prof_info.hpp' can be found.
include_directories("${PROJECT_BINARY_DIR}/include")

# Add source files.
61 | set(SOURCE 62 | ${CMAKE_CURRENT_SOURCE_DIR}/cpp/prof.cpp 63 | ) 64 | 65 | # Add an interceptor library that intercepts some calls in 'libOpenCL.so'. 66 | add_library(prof SHARED ${SOURCE}) 67 | target_link_libraries(prof dl "${CJSON_LIB_PATH}" "${BOOST_LIB_PATH}") 68 | 69 | # Test descriptions are in a separate file. 70 | include(CTest) 71 | include(${PROJECT_SOURCE_DIR}/tests/CMakeLists.txt) 72 | -------------------------------------------------------------------------------- /tests/clBuildProgram.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | import os 5 | import json 6 | 7 | sys.path.append('../python') 8 | from prof_parser import prof_parse 9 | from prof_parser import ptr_regex 10 | from prof_parser import opts_regex 11 | 12 | # Test info. 13 | call = 'clBuildProgram' 14 | _id = '' 15 | print '%s%s' % (call, _id) 16 | 17 | # FIXME: taken from clBuildProgram.py - avoid duplication. 18 | # Parse initialisation list of form: lhs = { elem, ... }. 19 | def match_init_list(text, lhs_regex, elem_regex): 20 | result = [] 21 | match = re.search('%s(\s*)=(\s*)\{(\s*)(?P%s)' % (lhs_regex, elem_regex), text) 22 | while match and match.group('elem') != '}': 23 | result.append(match.group('elem')) 24 | text = text[match.end():] 25 | match = re.search('(?P%s|\})' % elem_regex, text) 26 | return result 27 | 28 | # Parse test source. 
29 | source = {} 30 | with open(call + _id + '.cpp', 'r') as f: 31 | source['text'] = f.read() 32 | source['program'] = re.search('\(cl_program\) (?P%s)' % ptr_regex, source['text']).group('program') 33 | source['options'] = re.search('options(\s*)=(\s*)\"(?P%s)\"' % opts_regex, source['text']).group('options') 34 | source['pfn_notify'] = re.search('\(pfn_notify_t \*\) (?P%s)' % ptr_regex, source['text']).group('pfn_notify') 35 | source['user_data'] = re.search('\(void \*\) (?P%s)' % ptr_regex, source['text']).group('user_data') 36 | # Parse device list. 37 | num_devices = int(re.search('num_devices(\s*)=(\s*)(?P\d+)', source['text']).group('num_devices')) 38 | cl_device_ptr_list = match_init_list(source['text'], 'device_list\[%d\]' % num_devices, '\(cl_device_id\) %s' % ptr_regex) 39 | source['device_list'] = [re.match('\(cl_device_id\) (?P%s)' % ptr_regex, cl_device_ptr).group('ptr') for cl_device_ptr in cl_device_ptr_list] 40 | 41 | # Read from stdin (via pipe). 42 | output = sys.stdin.read() 43 | print 'OUTPUT' 44 | print output 45 | 46 | result = prof_parse(output)[0] 47 | print 'RESULT' 48 | print result 49 | print 50 | 51 | status = True 52 | status &= (source['program'] == result['program']) 53 | status &= (cmp(source['device_list'], result['device_list']) == 0) 54 | status &= (source['options'] == result['options']) 55 | status &= (source['pfn_notify'].lower() == result['pfn_notify'].lower()) 56 | status &= (source['user_data'].lower() == result['user_data'].lower()) 57 | status &= (0 == result['errcode']) 58 | 59 | print '%s%s: %s' % (call, _id, 'PASSED' if status else 'FAILED') 60 | print 61 | -------------------------------------------------------------------------------- /tests/clCreateProgramWithBinary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | import os 5 | import json 6 | 7 | sys.path.append('../python') 8 | from prof_parser import prof_parse 9 | 
from prof_parser import ptr_regex 10 | 11 | # Test info. 12 | call = 'clCreateProgramWithBinary' 13 | _id = '' 14 | print '%s%s' % (call, _id) 15 | 16 | # FIXME: taken from clBuildProgram.py - avoid duplication. 17 | # Parse initialisation list of form: lhs = { elem, ... }. 18 | def match_init_list(text, lhs_regex, elem_regex): 19 | result = [] 20 | match = re.search('%s(\s*)=(\s*)\{(\s*)(?P%s)' % (lhs_regex, elem_regex), text) 21 | while match and match.group('elem') != '}': 22 | result.append(match.group('elem')) 23 | text = text[match.end():] 24 | match = re.search('(?P%s|\})' % elem_regex, text) 25 | return result 26 | 27 | # Parse test source. 28 | source = {} 29 | with open(call + _id + '.cpp', 'r') as f: 30 | source['text'] = f.read() 31 | source['context'] = re.search('\(cl_context\) (?P%s)' % ptr_regex, source['text']).group('context') 32 | source['lengths'] = re.search('\(const size_t \*\) (?P%s)' % ptr_regex, source['text']).group('lengths') 33 | source['binaries'] = re.search('\(const unsigned char \*\*\) (?P%s)' % ptr_regex, source['text']).group('binaries') 34 | source['binary_status'] = re.search('binary_status(\s*)=(\s*)\(cl_int \*\) (?P%s)' % ptr_regex, source['text']).group('binary_status') 35 | source['errcode_ret'] = re.search('errcode_ret(\s*)=(\s*)\(cl_int \*\) (?P%s)' % ptr_regex, source['text']).group('errcode_ret') 36 | # Parse device list. 37 | num_devices = int(re.search('num_devices(\s*)=(\s*)(?P\d+)', source['text']).group('num_devices')) 38 | cl_device_ptr_list = match_init_list(source['text'], 'device_list\[%d\]' % num_devices, '\(cl_device_id\) %s' % ptr_regex) 39 | source['device_list'] = [re.match('\(cl_device_id\) (?P%s)' % ptr_regex, cl_device_ptr).group('ptr') for cl_device_ptr in cl_device_ptr_list] 40 | # The following should match the assert statement. 41 | source['program'] = re.search('\(cl_program\) (?P%s)' % ptr_regex, source['text']).group('program') 42 | 43 | # Read from stdin (via pipe). 
44 | output = sys.stdin.read() 45 | print 'OUTPUT' 46 | print output 47 | 48 | result = prof_parse(output)[0] 49 | print 'RESULT' 50 | print result 51 | print 52 | 53 | status = True 54 | status &= (source['context'].lower() == result['context'].lower()) 55 | status &= (cmp(source['device_list'], result['device_list']) == 0) 56 | status &= (source['lengths'].lower() == result['lengths'].lower()) 57 | status &= (source['binaries'].lower() == result['binaries'].lower()) 58 | status &= (source['binary_status'].lower() == result['binary_status'].lower()) 59 | status &= (source['errcode_ret'].lower() == result['errcode_ret'].lower()) 60 | status &= (source['program'].lower() == result['program'].lower()) 61 | 62 | print '%s%s: %s' % (call, _id, 'PASSED' if status else 'FAILED') 63 | print 64 | -------------------------------------------------------------------------------- /python/dvdt_prof_cli/dvdt_parser-v0.1.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2018 cTuning foundation. 3 | # See CK COPYRIGHT.txt for copyright details. 4 | # 5 | # SPDX-License-Identifier: BSD-3-Clause. 6 | # See CK LICENSE.txt for licensing details. 7 | # 8 | 9 | # 10 | # Developer(s): 11 | # - Grigori Fursin, cTuning foundation, 2018 12 | # - Anton Lokhmotov, dividiti, 2018 13 | # - Flavio Vella, dividiti, 2018 14 | # 15 | ############################################################################## 16 | # The program provide a friendly interface to parse and visualize information from 17 | # dividi-prof. 18 | # 19 | 20 | # MODULE 21 | # deps: tabulate. To install: $ pip install tabulate --user 22 | # 23 | import os 24 | import dateutil.parser 25 | import json 26 | import argparse 27 | from pprint import pprint 28 | from tabulate import tabulate 29 | # 30 | import dvdt_function as tools 31 | # 32 | #### 33 | ### Options todo: --call=NDRange specify opencl operations. NDRange by default 34 | #### Table options... 
Maybe we want raw data or cvs 35 | 36 | parser = argparse.ArgumentParser(description='Dividiti Profiler Command Line Interface v0') 37 | #parser.add_argument("--filter_by", action="store", dest="bool_aggregate", help="Show information of unique kernel name") 38 | parser.add_argument("--aggregate", action="store_true", dest="aggregate_bool", default=None, help="Show information by of unique kernel names") 39 | parser.add_argument("--verbose", action="store", dest="verbose_lvl", default=0, help="verbose level") 40 | parser.add_argument("--files", action="store", dest="files_name", default="tmp-dvdt-prof.json", type=str, help="pass dvdt-prof.json files column separated") 41 | parser.add_argument("--filter-by-name", action="store", dest="filter_kernel_list", help="show information by a given kernel name") 42 | parser.add_argument("--filter-by-percent", action="store", dest="filter_percent", default=0, type=float, help="show information by a given kernel name") 43 | parser.add_argument("--sort", action="store_true", dest="sort_bool", default=False, help="Show information sorted by time") 44 | 45 | args=parser.parse_args() 46 | 47 | 48 | ### from here variable 49 | _is_aggregate = args.aggregate_bool 50 | _verbose = args.verbose_lvl 51 | _limit = tools.filter_percent_manager(args.filter_percent) 52 | _files_lst = tools.files_manager(args.files_name, "tmp-dvdt-prof.json") 53 | _filters_kernel = tools.fiter_by_name_manager(args.filter_kernel_list ) 54 | _is_sorted = args.sort_bool 55 | 56 | _data_list = [] 57 | print _files_lst 58 | for i in _files_lst: 59 | _data_list.append(tools.json_manager(i)) 60 | 61 | config_dict = {} 62 | ##n Only 1 dvdt-prof file is supported at the moment. 
63 | config_dict['data'] = _data_list 64 | config_dict['filter_kernel'] = _filters_kernel 65 | config_dict['verbose'] = _verbose 66 | config_dict['percent_limit'] = _limit 67 | config_dict['aggregate'] = _is_aggregate 68 | config_dict['sort'] = _is_sorted 69 | config_dict['files_list'] = _files_lst 70 | config_dict['call_name'] = ['clEnqueueNDRangeKernel'] 71 | if _verbose > 2: 72 | tools.print_args 73 | if _verbose > 1: 74 | print "Print configuration" 75 | tools.print_configuration(config_dict) 76 | 77 | ### just one file is supported and NRange is the only supported call 78 | _NDRange_lst = tools.get_data_from_call(config_dict['data'][0], config_dict['call_name'][0]) 79 | _kernel_stat_lst = tools.get_data_from_ndrange(_NDRange_lst) 80 | 81 | if config_dict['sort'] is True: 82 | _kernel_stat_lst = sorted(_kernel_stat_lst, key=lambda k: k['total_time'], reverse=True) 83 | 84 | _app_stat_lst = tools.get_application_stat(_kernel_stat_lst) 85 | 86 | tools.computing_percent(_kernel_stat_lst, _app_stat_lst['total_kernel_time']) 87 | # othee options here https://pypi.python.org/pypi/tabulate/ 88 | print "===== " + config_dict['files_list'][0] + " =====" 89 | 90 | tools.print_table(_kernel_stat_lst, _app_stat_lst, config_dict['percent_limit'], config_dict['filter_kernel'], "simple") 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /tests/clEnqueueNDRangeKernel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | import os 5 | import json 6 | 7 | sys.path.append('../python') 8 | from prof_parser import prof_parse 9 | from prof_parser import int_regex 10 | from prof_parser import ptr_regex 11 | from prof_parser import opts_regex 12 | 13 | max_work_dim = 3 14 | default_offset = 0 15 | null_offset = 0 16 | default_gws = 1 17 | default_lws = 1 18 | null_lws = 0 19 | 20 | # Test info. 
21 | call = 'clEnqueueNDRangeKernel' 22 | _id = '' 23 | print '%s%s' % (call, _id) 24 | 25 | # Parse initialisation list of form: lhs = { elem, ... }. 26 | def match_init_list(text, lhs_regex, elem_regex): 27 | result = [] 28 | match = re.search('%s(\s*)=(\s*)\{(\s*)(?P%s)' % (lhs_regex, elem_regex), text) 29 | while match and match.group('elem') != '}': 30 | result.append(match.group('elem')) 31 | text = text[match.end():] 32 | match = re.search('(?P%s|\})' % elem_regex, text) 33 | return result 34 | 35 | # Parse test source. 36 | source = {} 37 | with open(call + _id + '.cpp', 'r') as f: 38 | source['text'] = f.read() 39 | source['queue'] = re.search('\(cl_command_queue\) (?P%s)' % ptr_regex, source['text']).group('queue') 40 | source['kernel'] = re.search('\(cl_kernel\) (?P%s)' % ptr_regex, source['text']).group('kernel') 41 | 42 | work_dim = int(re.search('work_dim(\s*)=(\s*)(?P\d+)', source['text']).group('work_dim')) 43 | gwo = match_init_list(source['text'], 'global_work_offset\[%d\]' % work_dim, int_regex) 44 | source['gwo'] = ([int(i) for i in gwo] + [default_offset] * (max_work_dim - work_dim)) if gwo else [null_offset] * max_work_dim 45 | gws = match_init_list(source['text'], 'global_work_size\[%d\]' % work_dim, int_regex) 46 | source['gws'] = [int(i) for i in gws] + [default_gws] * (max_work_dim - work_dim) 47 | lws = match_init_list(source['text'], 'local_work_size\[%d\]' % work_dim, int_regex) 48 | source['lws'] = ([int(i) for i in lws] + [default_lws] * (max_work_dim - work_dim)) if lws else [null_lws] * max_work_dim 49 | 50 | num_events = int(re.search('num_events_in_wait_list(\s*)=(\s*)(?P\d+)', source['text']).group('num_events')) 51 | cl_event_ptr_list = match_init_list(source['text'], 'event_wait_list\[%d\]' % num_events, '\(cl_event\) %s' % ptr_regex) 52 | source['event_wait_list'] = [re.match('\(cl_event\) (?P%s)' % ptr_regex, cl_event_ptr).group('ptr') for cl_event_ptr in cl_event_ptr_list] 53 | 54 | source['event'] = re.search('\(cl_event 
\*\) (?P%s)' % ptr_regex, source['text']).group('event') 55 | 56 | profiling_match = re.search('%s (?P%s) (?P%s) (?P%s) (?P%s)' % \ 57 | ('profiling', int_regex, int_regex, int_regex, int_regex), source['text']) 58 | if profiling_match: 59 | source['profiling'] = {} 60 | source['profiling']['queued'] = int(profiling_match.group('queued')) 61 | source['profiling']['submit'] = int(profiling_match.group('submit')) 62 | source['profiling']['start'] = int(profiling_match.group('start')) 63 | source['profiling']['end'] = int(profiling_match.group('end')) 64 | 65 | 66 | # Read from stdin (via pipe). 67 | output = sys.stdin.read() 68 | print 'OUTPUT' 69 | print output 70 | 71 | result = prof_parse(output)[0] 72 | print 'RESULT' 73 | print result 74 | print 75 | 76 | status = True 77 | status &= ("dvdt_prof_kernel" == result['name']) 78 | status &= (source['queue'] == result['queue']) 79 | status &= (source['kernel'] == result['kernel']) 80 | status &= (cmp(source['gwo'], result['gwo']) == 0) 81 | status &= (cmp(source['gws'], result['gws']) == 0) 82 | status &= (cmp(source['lws'], result['lws']) == 0) 83 | status &= (cmp(source['event_wait_list'], result['event_wait_list']) == 0) 84 | status &= (source['event'] == result['event']) 85 | status &= (cmp(source['profiling'], result['profiling']) == 0) 86 | 87 | print '%s%s: %s' % (call, _id, 'PASSED' if status else 'FAILED') 88 | print 89 | -------------------------------------------------------------------------------- /tests/clEnqueueReadOrWriteBuffer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | import os 5 | import json 6 | 7 | sys.path.append('../python') 8 | from prof_parser import prof_parse 9 | from prof_parser import int_regex 10 | from prof_parser import ptr_regex 11 | from prof_parser import opts_regex 12 | 13 | # Parse initialisation list of form: lhs = { elem, ... }. 
14 | def match_init_list(text, lhs_regex, elem_regex): 15 | result = [] 16 | match = re.search('%s(\s*)=(\s*)\{(\s*)(?P%s)' % (lhs_regex, elem_regex), text) 17 | while match and match.group('elem') != '}': 18 | result.append(match.group('elem')) 19 | text = text[match.end():] 20 | match = re.search('(?P%s|\})' % elem_regex, text) 21 | return result 22 | 23 | # Parse test source. 24 | def get_source(call, _id): 25 | source = {} 26 | with open('%s%s.cpp' % (call, _id), 'r') as f: 27 | text = f.read() 28 | source['queue'] = re.search('\(cl_command_queue\)(\s*)(?P%s)' % ptr_regex, text).group('queue') 29 | source['buffer'] = re.search('\(cl_mem\)(\s*)(?P%s)' % ptr_regex, text).group('buffer') 30 | source['blocking'] = int(re.search('blocking(\s*)=(\s*)(?P\d)', text).group('blocking')) 31 | source['offset'] = int(re.search('offset(\s*)=(\s*)(?P\d+)', text).group('offset')) 32 | source['size'] = int(re.search('size(\s*)=(\s*)(?P\d+)', text).group('size')) 33 | source['ptr'] = re.search('\(%s(\s*)void(\s*)\*\)(\s*)(?P%s)' % \ 34 | ('const' if call == 'clEnqueueWriteBuffer' else '', ptr_regex), text).group('ptr') 35 | 36 | num_events = int(re.search('num_events_in_wait_list(\s*)=(\s*)(?P\d+)', text).group('num_events')) 37 | cl_event_ptr_list = match_init_list(text, 'event_wait_list\[%d\]' % num_events, '\(cl_event\) %s' % ptr_regex) 38 | source['event_wait_list'] = [re.match('\(cl_event\) (?P%s)' % ptr_regex, cl_event_ptr).group('ptr') for cl_event_ptr in cl_event_ptr_list] 39 | source['event'] = re.search('\(cl_event \*\) (?P%s|0)' % ptr_regex, text).group('event') 40 | 41 | profiling_match = re.search('%s (?P%s) (?P%s) (?P%s) (?P%s)' % \ 42 | ('profiling', int_regex, int_regex, int_regex, int_regex), text) 43 | if profiling_match: 44 | source['profiling'] = {} 45 | source['profiling']['queued'] = int(profiling_match.group('queued')) 46 | source['profiling']['submit'] = int(profiling_match.group('submit')) 47 | source['profiling']['start'] = 
int(profiling_match.group('start')) 48 | source['profiling']['end'] = int(profiling_match.group('end')) 49 | return source 50 | 51 | # Get result. 52 | def get_result(output): 53 | result = prof_parse(output)[0] 54 | return result 55 | 56 | # Test source and result for comparison. 57 | def cmp_source_and_result(source, result): 58 | status = True 59 | status &= (source['queue'] == result['queue']) 60 | status &= (source['buffer'] == result['buffer']) 61 | status &= (source['blocking'] == result['blocking']) 62 | status &= (source['offset'] == result['offset']) 63 | status &= (source['size'] == result['size']) 64 | status &= (source['ptr'] == result['ptr']) 65 | status &= (cmp(source['event_wait_list'], result['event_wait_list']) == 0) 66 | # FIXME: watch for NULL pointers (0 != '0x00000000'). 67 | # status &= (source['event'] == result['event']) 68 | status &= (cmp(source['profiling'], result['profiling']) == 0) 69 | return status 70 | 71 | def run(call, _id): 72 | print '%s%s' % (call, _id) 73 | 74 | # Parse test source file. 75 | source = get_source(call, _id) 76 | print 'SOURCE' 77 | print source 78 | 79 | # Read test executable output from stdin (via pipe). 80 | output = sys.stdin.read() 81 | print 'OUTPUT' 82 | print output 83 | 84 | # Parse test executable output. 
85 | result = get_result(output) 86 | print 'RESULT' 87 | print result 88 | 89 | print '%s%s: %s' % (call, _id, 'PASSED' if cmp_source_and_result(source, result) else 'FAILED') 90 | print 91 | -------------------------------------------------------------------------------- /tests/clEnqueueNDRangeKernel_LWS_NULL.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import re 5 | import json 6 | 7 | sys.path.append('../python') 8 | from prof_parser import prof_parse 9 | from prof_parser import int_regex 10 | from prof_parser import ptr_regex 11 | from prof_parser import opts_regex 12 | 13 | max_work_dim = 3 14 | default_offset = 0 15 | null_offset = 0 16 | default_gws = 1 17 | default_lws = 1 18 | null_lws = 0 19 | 20 | # Test info. 21 | call = 'clEnqueueNDRangeKernel' 22 | _id = '_LWS_NULL' 23 | print '%s%s' % (call, _id) 24 | print 25 | 26 | # Environment. 27 | env = dict(os.environ) 28 | print 'DVDT_PROF_LWS_NULL=%s' % env['DVDT_PROF_LWS_NULL'] 29 | print 'LD_PRELOAD=%s' % env['LD_PRELOAD'] 30 | print 31 | 32 | # Parse initialisation list of form: lhs = { elem, ... }. 33 | def match_init_list(text, lhs_regex, elem_regex): 34 | result = [] 35 | match = re.search('%s(\s*)=(\s*)\{(\s*)(?P%s)' % (lhs_regex, elem_regex), text) 36 | while match and match.group('elem') != '}': 37 | result.append(match.group('elem')) 38 | text = text[match.end():] 39 | match = re.search('(?P%s|\})' % elem_regex, text) 40 | return result 41 | 42 | # Parse test source. 
43 | source = {} 44 | with open(call + _id + '.cpp', 'r') as f: 45 | source['text'] = f.read() 46 | source['queue'] = re.search('\(cl_command_queue\) (?P%s)' % ptr_regex, source['text']).group('queue') 47 | source['kernel'] = re.search('\(cl_kernel\) (?P%s)' % ptr_regex, source['text']).group('kernel') 48 | 49 | work_dim = int(re.search('work_dim(\s*)=(\s*)(?P\d+)', source['text']).group('work_dim')) 50 | gwo = match_init_list(source['text'], 'global_work_offset\[%d\]' % work_dim, int_regex) 51 | source['gwo'] = ([int(i) for i in gwo] + [default_offset] * (max_work_dim - work_dim)) if gwo else [null_offset] * max_work_dim 52 | gws = match_init_list(source['text'], 'global_work_size\[%d\]' % work_dim, int_regex) 53 | source['gws'] = [int(i) for i in gws] + [default_gws] * (max_work_dim - work_dim) 54 | 55 | # Incerceptor sets local work size to NULL when DVDT_PROF_LWS_NULL is defined, ignore test source here. 56 | lws = None 57 | source['lws'] = ([int(i) for i in lws] + [default_lws] * (max_work_dim - work_dim)) if lws else [null_lws] * max_work_dim 58 | 59 | num_events = int(re.search('num_events_in_wait_list(\s*)=(\s*)(?P\d+)', source['text']).group('num_events')) 60 | cl_event_ptr_list = match_init_list(source['text'], 'event_wait_list\[%d\]' % num_events, '\(cl_event\) %s' % ptr_regex) 61 | source['event_wait_list'] = [re.match('\(cl_event\) (?P%s)' % ptr_regex, cl_event_ptr).group('ptr') for cl_event_ptr in cl_event_ptr_list] 62 | 63 | source['event'] = re.search('\(cl_event \*\) (?P%s)' % ptr_regex, source['text']).group('event') 64 | 65 | profiling_match = re.search('%s (?P%s) (?P%s) (?P%s) (?P%s)' % \ 66 | ('profiling', int_regex, int_regex, int_regex, int_regex), source['text']) 67 | if profiling_match: 68 | source['profiling'] = {} 69 | source['profiling']['queued'] = int(profiling_match.group('queued')) 70 | source['profiling']['submit'] = int(profiling_match.group('submit')) 71 | source['profiling']['start'] = int(profiling_match.group('start')) 72 | 
source['profiling']['end'] = int(profiling_match.group('end')) 73 | 74 | 75 | # Read from stdin (via pipe). 76 | output = sys.stdin.read() 77 | print 'OUTPUT' 78 | print output 79 | 80 | result = prof_parse(output)[0] 81 | print 'RESULT' 82 | print result 83 | print 84 | 85 | status = True 86 | status &= ("dvdt_prof_kernel" == result['name']) 87 | status &= (source['queue'] == result['queue']) 88 | status &= (source['kernel'] == result['kernel']) 89 | status &= (cmp(source['gwo'], result['gwo']) == 0) 90 | status &= (cmp(source['gws'], result['gws']) == 0) 91 | status &= (cmp(source['lws'], result['lws']) == 0) 92 | status &= (cmp(source['event_wait_list'], result['event_wait_list']) == 0) 93 | status &= (source['event'] == result['event']) 94 | status &= (cmp(source['profiling'], result['profiling']) == 0) 95 | 96 | print '%s%s: %s' % (call, _id, 'PASSED' if status else 'FAILED') 97 | print 98 | -------------------------------------------------------------------------------- /tests/clEnqueueNDRangeKernel_LWS.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import re 5 | import json 6 | 7 | sys.path.append('../python') 8 | from prof_parser import prof_parse 9 | from prof_parser import int_regex 10 | from prof_parser import ptr_regex 11 | from prof_parser import opts_regex 12 | 13 | max_work_dim = 3 14 | default_offset = 0 15 | null_offset = 0 16 | default_gws = 1 17 | default_lws = 1 18 | null_lws = 0 19 | 20 | # Test info. 21 | call = 'clEnqueueNDRangeKernel' 22 | _id = '_LWS' 23 | print '%s%s' % (call, _id) 24 | print 25 | 26 | # Environment. 27 | env = dict(os.environ) 28 | print 'DVDT_PROF_LWS=%s' % env['DVDT_PROF_LWS'] 29 | print 'LD_PRELOAD=%s' % env['LD_PRELOAD'] 30 | print 31 | 32 | # Parse initialisation list of form: lhs = { elem, ... }. 
33 | def match_init_list(text, lhs_regex, elem_regex): 34 | result = [] 35 | match = re.search('%s(\s*)=(\s*)\{(\s*)(?P%s)' % (lhs_regex, elem_regex), text) 36 | while match and match.group('elem') != '}': 37 | result.append(match.group('elem')) 38 | text = text[match.end():] 39 | match = re.search('(?P%s|\})' % elem_regex, text) 40 | return result 41 | 42 | # Parse test source. 43 | source = {} 44 | with open(call + _id + '.cpp', 'r') as f: 45 | source['text'] = f.read() 46 | source['queue'] = re.search('\(cl_command_queue\) (?P%s)' % ptr_regex, source['text']).group('queue') 47 | source['kernel'] = re.search('\(cl_kernel\) (?P%s)' % ptr_regex, source['text']).group('kernel') 48 | 49 | work_dim = int(re.search('work_dim(\s*)=(\s*)(?P\d+)', source['text']).group('work_dim')) 50 | gwo = match_init_list(source['text'], 'global_work_offset\[%d\]' % work_dim, int_regex) 51 | source['gwo'] = ([int(i) for i in gwo] + [default_offset] * (max_work_dim - work_dim)) if gwo else [null_offset] * max_work_dim 52 | gws = match_init_list(source['text'], 'global_work_size\[%d\]' % work_dim, int_regex) 53 | source['gws'] = [int(i) for i in gws] + [default_gws] * (max_work_dim - work_dim) 54 | # Interceptor updates local work size from DVDT_PROF_LWS, so ignore test source and parse DVDT_PROF_LWS instead. 
55 | lws = (env['DVDT_PROF_LWS'].strip('"').split(':')[1]).split(',') 56 | source['lws'] = ([int(i) for i in lws] + [default_lws] * (max_work_dim - work_dim)) if lws else [null_lws] * max_work_dim 57 | 58 | num_events = int(re.search('num_events_in_wait_list(\s*)=(\s*)(?P\d+)', source['text']).group('num_events')) 59 | cl_event_ptr_list = match_init_list(source['text'], 'event_wait_list\[%d\]' % num_events, '\(cl_event\) %s' % ptr_regex) 60 | source['event_wait_list'] = [re.match('\(cl_event\) (?P%s)' % ptr_regex, cl_event_ptr).group('ptr') for cl_event_ptr in cl_event_ptr_list] 61 | 62 | source['event'] = re.search('\(cl_event \*\) (?P%s)' % ptr_regex, source['text']).group('event') 63 | 64 | profiling_match = re.search('%s (?P%s) (?P%s) (?P%s) (?P%s)' % \ 65 | ('profiling', int_regex, int_regex, int_regex, int_regex), source['text']) 66 | if profiling_match: 67 | source['profiling'] = {} 68 | source['profiling']['queued'] = int(profiling_match.group('queued')) 69 | source['profiling']['submit'] = int(profiling_match.group('submit')) 70 | source['profiling']['start'] = int(profiling_match.group('start')) 71 | source['profiling']['end'] = int(profiling_match.group('end')) 72 | 73 | 74 | # Read from stdin (via pipe). 
75 | output = sys.stdin.read() 76 | print 'OUTPUT' 77 | print output 78 | 79 | result = prof_parse(output)[0] 80 | print 'RESULT' 81 | print result 82 | print 83 | 84 | status = True 85 | status &= ("dvdt_prof_kernel" == result['name']) 86 | status &= (source['queue'] == result['queue']) 87 | status &= (source['kernel'] == result['kernel']) 88 | status &= (cmp(source['gwo'], result['gwo']) == 0) 89 | status &= (cmp(source['gws'], result['gws']) == 0) 90 | status &= (cmp(source['lws'], result['lws']) == 0) 91 | status &= (cmp(source['event_wait_list'], result['event_wait_list']) == 0) 92 | status &= (source['event'] == result['event']) 93 | status &= (cmp(source['profiling'], result['profiling']) == 0) 94 | 95 | print '%s%s: %s' % (call, _id, 'PASSED' if status else 'FAILED') 96 | print 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dv/dt prof: OpenCL API profiler 2 | 3 | The `dv/dt prof` profiler (`libprof.so`) intercepts some OpenCL API calls and 4 | logs their arguments before invoking the underlying OpenCL implementation. 5 | 6 | As of v0.2, the profiler supports two modes: 7 | 8 | 1. 
The `ostream` mode logs to `stdout` blocks of text like the following:
```
[dv/dt] clEnqueueNDRangeKernel
[dv/dt] clEnqueueNDRangeKernel name im2col_float
[dv/dt] clEnqueueNDRangeKernel queue 0x5d3240
[dv/dt] clEnqueueNDRangeKernel kernel 0xbb0300
[dv/dt] clEnqueueNDRangeKernel gwo 0 0 0
[dv/dt] clEnqueueNDRangeKernel gws 16384 1 1
[dv/dt] clEnqueueNDRangeKernel lws 128 1 1
[dv/dt] clEnqueueNDRangeKernel event_wait_list
[dv/dt] clEnqueueNDRangeKernel event 0
[dv/dt] clEnqueueNDRangeKernel start 2016-10-11T20:41:18.041468
[dv/dt] clEnqueueNDRangeKernel profiling 52910121520869 52910121595577 52910130751092 52910132647472
[dv/dt] clEnqueueNDRangeKernel end 2016-10-11T20:41:18.054802
[dv/dt] clEnqueueNDRangeKernel errcode 0
```

In an offline post-processing step, the Python parser (`prof_parser.py`)
converts the profiler's output into JSON such as the following:
```
{
  "kernel": "0x7f8700",
  "profiling": {
    "start": 46559873667079,
    "end": 46559875636796,
    "queued": 46559863661412,
    "submit": 46559863742203
  },
  "name": "im2col_float",
  "lws": [
    128,
    1,
    1
  ],
  "gwo": [
    0,
    0,
    0
  ],
  "errcode": 0,
  "queue": "0x1cd240",
  "call": "clEnqueueNDRangeKernel",
  "gws": [
    16384,
    1,
    1
  ],
  "timestamp": {
    "start": "2016-10-11T15:50:45.883538",
    "end": "2016-10-11T15:50:45.897364"
  },
  "enqueue_id": 24,
  "event_wait_list": [],
  "event": "0"
}
```

2. The `cjson` mode uses the [cJSON](https://github.com/DaveGamble/cJSON/)
library to build JSON online, which then gets logged to `stdout`. In an offline
post-processing step, the Python parser (`prof_parser.py`) simply loads JSON
between the `[dv/dt] <<` and `[dv/dt] >>` markers.
69 | 70 | # Effect on execution time 71 | 72 | Using the profiler can slow down the program for several reasons: 73 | 74 | - To make parsing robust, the profiler uses formatted printing which is 75 | relatively expensive. 76 | 77 | - To time non-blocking calls accurately, the profiler makes them blocking. 78 | 79 | - Optionally, the profiler can alter the program behaviour in other ways, for 80 | example, by changing the local work size for one or more kernels in the 81 | program. This functionality requires keeping additional state. 82 | 83 | The good news is that the kernel execution time and memory copy time are not 84 | affected. 85 | 86 | [OpenCL page at the Khronos Group](https://www.khronos.org/opencl) 87 | 88 | # Installing the profiler 89 | 90 | **NB:** The easiest way to install the profiler is by using 91 | [CK-Caffe](http://github.com/dividiti/ck-caffe) packages: 92 | ``` 93 | $ ck pull repo:ck-caffe --url=https://github.com/dividiti/ck-caffe 94 | $ ck install ck-caffe:package:tool-dvdt-prof 95 | $ ck install ck-caffe:package:tool-dvdt-prof-cjson 96 | ``` 97 | 98 | ## Prerequisites 99 | 100 | - CMake 3.0. 101 | 102 | - OpenCL headers and library. 103 | 104 | ## Building the profiler. 105 | 106 | Place the source into `${SRC_DIR}`. Create `${BUILD_DIR}`. 107 | 108 | ``` 109 | $ cd ${BUILD_DIR} 110 | $ cmake ${SRC_DIR} 111 | $ make prof 112 | ``` 113 | 114 | `${SRC_DIR}/lib` now contains `libprof.so`. 115 | 116 | To build and run tests: 117 | 118 | ``` 119 | $ make check 120 | ``` 121 | 122 | `${SRC_DIR}/lib` now contains `libprof_test.so` which is only useful for testing. 123 | 124 | ### Build options. 125 | 126 | By default, the profiler uses `boost::chrono` to measure wall-clock time. This 127 | can be disabled by setting the `WALLCLOCK` option as follows: 128 | 129 | ``` 130 | cmake ${SRC_DIR} -DWALLCLOCK=timeofday 131 | ``` 132 | 133 | (This is particularly handy for Android platforms.) 
134 | 135 | Alternative mechanisms like `gettimeofday()` are not supported at the moment. 136 | 137 | # Using the profiler 138 | 139 | ## Collecting runtime information 140 | ``` 141 | $ LD_PRELOAD=<path to libprof.so> <program> 142 | ``` 143 | 144 | ## Changing the program behaviour 145 | 146 | Several environment variables can be defined when launching the program. 147 | 148 | ### DVDT_PROF_LWS 149 | 150 | `DVDT_PROF_LWS` specifies changes that should be made to the local work size 151 | when launching one or more kernels in the program. 152 | 153 | ``` 154 | DVDT_PROF_LWS="kernel_A:lws_A0,lws_A1,lws_A2 kernel_B:lws_B0,lws_B1,lws_B2 ..." 155 | ``` 156 | For example: 157 | ``` 158 | DVDT_PROF_LWS="transpose:8,8 gemm:4,16" LD_PRELOAD=<path to libprof.so> <program> 159 | ``` 160 | 161 | Namely, the per-kernel list elements are separated by spaces; the kernel names 162 | (strings) are separated from the local work size tuple by colons; the tuple 163 | elements (unsigned integers) are delimited by commas. The number of elements in 164 | a tuple must match the number of work-group dimensions as specified in the 165 | program or start with the value of `0` to use `NULL` as the local work size for 166 | this kernel. 167 | 168 | Note that the profiler cannot check the correctness of any given specification. 169 | In particular, the usual execution constraints hold: the global work size 170 | dimensions must be divisible by the local work size dimensions; the total 171 | work-group size (the product of all the dimensions) cannot exceed 172 | `CL_KERNEL_WORK_GROUP_SIZE`. 173 | 174 | ### DVDT_PROF_LWS_NULL 175 | 176 | For convenience, if the environment defines `DVDT_PROF_LWS_NULL` then `NULL` is 177 | used when launching any kernel in the program. (In this case, `DVDT_PROF_LWS` 178 | gets ignored.)
def files_manager(files_str, file_default, verbosity=0):
    """Turn the comma-separated file argument into a list of file names.

    Parameters:
        files_str: comma-separated file paths, or the default marker
            (compared case-insensitively with ``file_default``).
        file_default: marker that selects the default input.
        verbosity: accepted for interface compatibility; not used here.

    Returns:
        ``[files_str.lower()]`` when the default marker is given; otherwise
        the listed paths that exist as regular files, in the order given
        (paths that do not exist are silently dropped).
    """
    ### manage default case
    # list.append() returns None, so the one-element list must be built
    # explicitly rather than returning the result of append().
    if files_str.lower() == file_default:
        return [files_str.lower()]
    ### manage string separated by commas: keep only the existing files
    return [f for f in files_str.split(',') if os.path.isfile(f)]
### !!!! THE Function modifies k
def computing_percent(k_lst, total_time):
    """Add a 'percent' entry to every kernel record, in place.

    Parameters:
        k_lst: list of dicts, each with a 'total_time' value that
            ``float()`` accepts.
        total_time: the time that corresponds to 100 percent.

    Returns:
        None. Each dict in ``k_lst`` gains ``'percent'``, set to
        ``100 * total_time_of_record / total_time``.
    """
    for record in k_lst:
        if total_time:
            record['percent'] = (float(record['total_time']) * 100.0) / total_time
        else:
            # A zero total would divide by zero; report a 0% share instead.
            record['percent'] = 0.0
total_calls + 1 136 | gws = i['configuration']['gws'] 137 | lws = i['configuration']['lws'] 138 | #value.append( [i["kernel_id"], i["kernel_name"], format(i['total_time'],'.2f'), format(i['percent'],'.2f'), gws, lws ]) 139 | value.append( [idd, i["kernel_name"], format(i['total_time'],'.2f'), format(i['percent'],'.2f'), gws, lws ]) 140 | 141 | partial_time += i['total_time'] 142 | partial_percent += i["percent"] 143 | 144 | else: 145 | for i in k_lst: 146 | idd +=1 147 | if i["percent"] > limit and i["kernel_name"] in by_kernel: 148 | total_calls = total_calls + 1 149 | gws = i['configuration']['gws'] 150 | lws = i['configuration']['lws'] 151 | #value.append( [i["kernel_id"], i["kernel_name"], format(i['total_time'],'.2f'), format(i['percent'],'.2f'), gws, lws ]) 152 | value.append( [str(idd), i["kernel_name"], format(i['total_time'],'.2f'), format(i['percent'],'.2f'), gws, lws ]) 153 | 154 | partial_time += i['total_time'] 155 | partial_percent += i["percent"] 156 | ## APP INFO 157 | by_kernel_str = '' 158 | for k in by_kernel: 159 | by_kernel_str += str(k)+'\n' 160 | app_header = ["Filter_by", "Threshold % > ", "Calls", "Partial time", "Partial percent" ] 161 | app_value = [[by_kernel_str, limit, total_calls, format(partial_time,'.2f'), format(partial_percent,'.2f') ]] 162 | #value.append([total_calls,"-", format(partial_time,'.2f') , format(partial_percent,'.2f')]) 163 | print "\n" 164 | print tabulate(value, header, tablefmt=view) 165 | print "\n" 166 | print tabulate(app_value, app_header, tablefmt="rst") 167 | 168 | -------------------------------------------------------------------------------- /python/prof_wrangler.py: -------------------------------------------------------------------------------- 1 | # 2 | # 2015-2017 (c) dividiti 3 | # 4 | 5 | import prof_common 6 | 7 | import dateutil.parser 8 | 9 | import pandas as pd 10 | 11 | # Check that definitions from this file are available. 
def test():
    print ("prof_wrangler.py")

# Return the difference between the end and start timestamps in seconds.
# Both arguments are timestamp strings that dateutil.parser.parse() accepts;
# the result is a float (negative when ts_end precedes ts_start).
def ts_delta_s(ts_end, ts_start):
    delta = dateutil.parser.parse(ts_end) - dateutil.parser.parse(ts_start)
    delta_s = delta.total_seconds()
    return delta_s

# Return the difference between the end and start timestamps in nanoseconds
# (as an integer).
def ts_delta_ns(ts_end, ts_start):
    delta_s = ts_delta_s(ts_end, ts_start)
    # Round to the nearest integer instead of truncating: the product of a
    # binary float and 1e9 can land just below the exact value, and int()
    # alone would then lose a whole nanosecond.
    delta_ns = int(round(delta_s * 1e9))
    return delta_ns

# For each call in the trace, add its index to the call dictionary.
# Returns a new list of new dictionaries; the input trace is not modified.
def index_calls(trace):
    indexed_trace = [
        dict(call, call_index=index)
        for index, call in enumerate(trace)
    ]
    return indexed_trace

# Return calls in the trace whose names are in the call_names list.
# For example, when calls_names=['clEnqueueNDRangeKernel'], return
# only kernel enqueues.
def filter_calls(trace, call_names):
    filtered_trace = [
        call for call in trace if call['call'] in call_names
    ]
    return filtered_trace

# Return a DataFrame containing the differences between the profiling markers
# and the timestamps in nanoseconds for a trace (nqs) with any enqueues
# (clEnqueueNDRangeKernel, clEnqueueReadBuffer, clEnqueueWriteBuffer, etc).
def df_enqueues_ns(nqs,
        label_fn = lambda nq: '%s' % str(nq['call_index']).zfill(6)) :
    # One row per enqueue; every value is a difference in nanoseconds.
    def _df_data():
        data = [
            {
                'p1 - p0' : nq['profiling']['submit'] - nq['profiling']['queued'], # command queueing time
                'p2 - p1' : nq['profiling']['start']  - nq['profiling']['submit'], # job queueing time
                'p3 - p2' : nq['profiling']['end']    - nq['profiling']['start'],  # kernel execution time
                'p3 - p0' : nq['profiling']['end']    - nq['profiling']['queued'], # total execution time
                't1 - t0' : ts_delta_ns(ts_end=nq['timestamp']['end'], ts_start=nq['timestamp']['start']), # chrono time
                '(t1 - t0) - (p3 - p0)' :
                    ts_delta_ns(ts_end=nq['timestamp']['end'], ts_start=nq['timestamp']['start']) -
                    nq['profiling']['end'] + nq['profiling']['queued'], # chrono overhead
            }
            for nq in nqs
        ]
        return data

    # Rows are indexed by (label, call), where label comes from label_fn
    # (by default the call index zero-padded to six digits).
    def _df_index():
        index = pd.MultiIndex.from_tuples(
            names=('label', 'call'),
            tuples=[ (label_fn(nq), nq['call']) for nq in nqs ]
        )
        return index

    df = pd.DataFrame(data=_df_data(),index=_df_index())
    return df

# Return a DataFrame containing kernel enqueue info: the flattened work
# sizes and offsets, the wall-clock difference and the profiling differences,
# all expressed in the requested unit ('ns', 'us', 'ms' or 's'; any other
# unit raises KeyError). The result is indexed by (call_index, name).
def df_kernel_enqueues(nqs, unit='ms'):
    # Profiling markers are in nanoseconds and timestamp differences are in
    # seconds, hence the two separate scale factors per unit.
    multiplier = {
        'ns' : { 'profiling' : 1e-0, 'timestamp' : 1e+9 },
        'us' : { 'profiling' : 1e-3, 'timestamp' : 1e+6 },
        'ms' : { 'profiling' : 1e-6, 'timestamp' : 1e+3 },
        's'  : { 'profiling' : 1e-9, 'timestamp' : 1e+0 }
    }

    df_kernel_enqueues = pd.DataFrame()
    df_kernel_enqueues_tmp = pd.DataFrame(nqs)

    # Flatten work size and offset lists.
    df_kernel_enqueues[['lws0','lws1','lws2']] = df_kernel_enqueues_tmp['lws'].apply(pd.Series)
    df_kernel_enqueues[['gws0','gws1','gws2']] = df_kernel_enqueues_tmp['gws'].apply(pd.Series)
    df_kernel_enqueues[['gwo0','gwo1','gwo2']] = df_kernel_enqueues_tmp['gwo'].apply(pd.Series)

    # Compute the timestamp difference.
    # NB: The 'start' and 'end' entries are read by key. Flattening the
    # dictionaries via .apply(pd.Series) and then picking columns by position
    # makes the sign of the result depend on the dictionary key order.
    df_kernel_enqueues['t1 - t0 (%s)' % unit] = df_kernel_enqueues_tmp['timestamp'] \
        .apply(lambda ts: multiplier[unit]['timestamp'] * ts_delta_s(ts_end=ts['end'], ts_start=ts['start']))

    # Flatten profiling dictionaries (by key, for the same reason).
    df_kernel_enqueues_tmp['p0'] = df_kernel_enqueues_tmp['profiling'].apply(lambda x: x['queued'])
    df_kernel_enqueues_tmp['p1'] = df_kernel_enqueues_tmp['profiling'].apply(lambda x: x['submit'])
    df_kernel_enqueues_tmp['p2'] = df_kernel_enqueues_tmp['profiling'].apply(lambda x: x['start'])
    df_kernel_enqueues_tmp['p3'] = df_kernel_enqueues_tmp['profiling'].apply(lambda x: x['end'])
    # Compute the profiling differences.
    df_kernel_enqueues['p3 - p0 (%s)' % unit] = \
        multiplier[unit]['profiling'] * (df_kernel_enqueues_tmp['p3'] - df_kernel_enqueues_tmp['p0'])
    df_kernel_enqueues['p3 - p2 (%s)' % unit] = \
        multiplier[unit]['profiling'] * (df_kernel_enqueues_tmp['p3'] - df_kernel_enqueues_tmp['p2'])
    df_kernel_enqueues['p2 - p1 (%s)' % unit] = \
        multiplier[unit]['profiling'] * (df_kernel_enqueues_tmp['p2'] - df_kernel_enqueues_tmp['p1'])
    df_kernel_enqueues['p1 - p0 (%s)' % unit] = \
        multiplier[unit]['profiling'] * (df_kernel_enqueues_tmp['p1'] - df_kernel_enqueues_tmp['p0'])

    # Set the index.
    df_kernel_enqueues[['call_index','name']] = df_kernel_enqueues_tmp[['call_index','name']]
    df_kernel_enqueues.set_index(['call_index', 'name'], inplace=True)

    return df_kernel_enqueues


# Return, per kernel name, the number of enqueues, the cumulative execution
# time ('p3 - p2') and its percentage of the total, sorted by percentage in
# descending order. The input must have an index level called 'name' and a
# 'p3 - p2 (<unit>)' column, as produced by df_kernel_enqueues().
def df_kernel_enqueues_cumulative_time_num(df_kernel_enqueues_all, unit):
    # For each kernel enqueue, create the time column and column of all ones.
    df_time_num = df_kernel_enqueues_all[['p3 - p2 (%s)' % unit]].copy()
    df_time_num['1'] = 1

    # Compute the cumulative time and the number of enqueues.
    df_cumulative_time_num = df_time_num.groupby(level='name').sum()
    # Update the column labels.
    df_cumulative_time_num.columns = ['** Execution time (%s) **' % unit, '** Number of enqueues **']
    # Update the index label.
    df_cumulative_time_num.index.name = '** Kernel name **'

    # Compute the execution time percentage.
    df_cumulative_time_num['** Execution time (%) **'] = 100 * ( \
        df_cumulative_time_num['** Execution time (%s) **' % unit] / \
        df_cumulative_time_num['** Execution time (%s) **' % unit].sum())

    # Sort the columns so that the number of enqueues comes first, and sort the rows in descending order.
    return df_cumulative_time_num[
        ['** Number of enqueues **', '** Execution time (%s) **' % unit, '** Execution time (%) **']
    ].sort_values('** Execution time (%) **', ascending=False)
#

# Building blocks for the regular expressions below. Raw strings keep the
# backslashes for the regex engine.
prefix     = r'(\[dv\/dt\])'
call_regex = r'(cl[a-zA-Z]*)'
opts_regex = r'([ \-\w_=]*)'
iso_regex  = r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6})'
# Up to 16 hex digits, so that 64-bit pointers are captured in full.
ptr_regex  = r'((0x[0-9a-fA-F]{1,16})|(0))'
# The optional sign lets negative values (e.g. OpenCL error codes) match.
int_regex  = r'(-?\d+)'
hex_regex  = r'([a-fA-F\d]+)'
bool_regex = r'(\d)'

#
# Parsers for API calls.
#
# Each parser takes the remaining profiler output and the result dictionary
# prepared by the caller, fills the dictionary with the values logged for the
# call, and returns (output after the last value it consumed, result).
# A line that is expected but missing makes re.search() return None, which
# surfaces as an AttributeError.
#

def match_clBuildProgram(output, result):
    call = 'clBuildProgram'

    # Arguments.
    result['program'] = re.search('%s %s %s (?P<program>%s)' % \
        (prefix, call, 'program', ptr_regex), output).group('program')
    result['device_list'] = re.search('%s %s %s(?P<device_list>( %s)*)' % \
        (prefix, call, 'device_list', ptr_regex), output).group('device_list').split()
    result['options'] = re.search('%s %s %s (?P<options>%s)' % \
        (prefix, call, 'options', opts_regex), output).group('options')
    result['pfn_notify'] = re.search('%s %s %s (?P<pfn_notify>%s)' % \
        (prefix, call, 'pfn_notify', ptr_regex), output).group('pfn_notify')
    result['user_data'] = re.search('%s %s %s (?P<user_data>%s)' % \
        (prefix, call, 'user_data', ptr_regex), output).group('user_data')

    # Return value.
    return_match = re.search('%s %s %s (?P<errcode>%s)' % \
        (prefix, call, 'errcode', int_regex), output)
    result['errcode'] = int(return_match.group('errcode'))

    return (output[return_match.end():], result)


def match_clCreateBuffer(output, result):
    call = 'clCreateBuffer'

    # Arguments.
    result['context'] = re.search('%s %s %s (?P<context>%s)' % \
        (prefix, call, 'context', ptr_regex), output).group('context')
    result['flags'] = int(re.search('%s %s %s (?P<flags>%s)' % \
        (prefix, call, 'flags', int_regex), output).group('flags'))
    result['size'] = int(re.search('%s %s %s (?P<size>%s)' % \
        (prefix, call, 'size', int_regex), output).group('size'))
    result['host_ptr'] = re.search('%s %s %s (?P<host_ptr>%s)' % \
        (prefix, call, 'host_ptr', ptr_regex), output).group('host_ptr')
    result['errcode_ret'] = re.search('%s %s %s (?P<errcode_ret>%s)' % \
        (prefix, call, 'errcode_ret', ptr_regex), output).group('errcode_ret')

    # Return value.
    return_match = re.search('%s %s %s (?P<buffer>%s)' % \
        (prefix, call, 'buffer', ptr_regex), output)
    result['buffer'] = return_match.group('buffer')
    # The handle used to be stored under 'queue' only; that key is kept as an
    # alias so that existing consumers keep working.
    result['queue'] = result['buffer']

    return (output[return_match.end():], result)


def match_clCreateCommandQueue(output, result):
    call = 'clCreateCommandQueue'

    # Arguments.
    result['context'] = re.search('%s %s %s (?P<context>%s)' % \
        (prefix, call, 'context', ptr_regex), output).group('context')
    result['device'] = re.search('%s %s %s (?P<device>%s)' % \
        (prefix, call, 'device', ptr_regex), output).group('device')
    result['properties'] = int(re.search('%s %s %s (?P<properties>%s)' % \
        (prefix, call, 'properties', int_regex), output).group('properties'))
    result['errcode_ret'] = re.search('%s %s %s (?P<errcode_ret>%s)' % \
        (prefix, call, 'errcode_ret', ptr_regex), output).group('errcode_ret')

    # Return value.
    return_match = re.search('%s %s %s (?P<queue>%s)' % \
        (prefix, call, 'queue', ptr_regex), output)
    result['queue'] = return_match.group('queue')

    return (output[return_match.end():], result)
96 | result['program'] = re.search('%s %s %s (?P%s)' % \ 97 | (prefix, call, 'program', ptr_regex), output).group('program') 98 | result['name'] = re.search('%s %s %s (?P%s)' % \ 99 | (prefix, call, 'name', opts_regex), output).group('name') 100 | result['errcode_ret'] = re.search('%s %s %s (?P%s)' % \ 101 | (prefix, call, 'errcode_ret', ptr_regex), output).group('errcode_ret') 102 | 103 | # Return value. 104 | return_match = re.search('%s %s %s (?P%s)' % \ 105 | (prefix, call, 'kernel', ptr_regex), output) 106 | result['kernel'] = return_match.group('kernel') 107 | 108 | return (output[return_match.end():], result) 109 | 110 | 111 | def match_clCreateKernelsInProgram(output, result): 112 | call = 'clCreateKernelsInProgram' 113 | 114 | # Arguments. 115 | result['program'] = re.search('%s %s %s (?P%s)' % \ 116 | (prefix, call, 'program', ptr_regex), output).group('program') 117 | result['num_kernels'] = int(re.search('%s %s %s (?P%s)' % \ 118 | (prefix, call, 'num_kernels', int_regex), output).group('num_kernels')) 119 | result['kernels'] = re.search('%s %s %s (?P%s)' % \ 120 | (prefix, call, 'kernels', ptr_regex), output).group('kernels') 121 | result['num_kernels_ret_ptr'] = re.search('%s %s %s (?P%s)' % \ 122 | (prefix, call, 'num_kernels_ret_ptr', ptr_regex), output).group('num_kernels_ret_ptr') 123 | result['num_kernels_ret'] = int(re.search('%s %s %s (?P%s)' % \ 124 | (prefix, call, 'num_kernels_ret', int_regex), output).group('num_kernels_ret')) 125 | 126 | # Return value. 127 | return_match = re.search('%s %s %s (?P%s)' % \ 128 | (prefix, call, 'errcode', int_regex), output) 129 | result['errcode'] = int(return_match.group('errcode')) 130 | 131 | return (output[return_match.end():], result) 132 | 133 | 134 | def match_clCreateProgramWithBinary(output, result): 135 | call = 'clCreateProgramWithBinary' 136 | 137 | # Arguments. 
138 | result['context'] = re.search('%s %s %s (?P%s)' % \ 139 | (prefix, call, 'context', ptr_regex), output).group('context') 140 | result['device_list'] = re.search('%s %s %s(?P( %s)*)' % \ 141 | (prefix, call, 'device_list', ptr_regex), output).group('device_list').split() 142 | result['lengths'] = re.search('%s %s %s (?P%s)' % \ 143 | (prefix, call, 'lengths', ptr_regex), output).group('lengths') 144 | result['binaries'] = re.search('%s %s %s (?P%s)' % \ 145 | (prefix, call, 'binaries', ptr_regex), output).group('binaries') 146 | result['binary_status'] = re.search('%s %s %s (?P%s)' % \ 147 | (prefix, call, 'binary_status', ptr_regex), output).group('binary_status') 148 | result['errcode_ret'] = re.search('%s %s %s (?P%s)' % \ 149 | (prefix, call, 'errcode_ret', ptr_regex), output).group('errcode_ret') 150 | 151 | # Return value. 152 | return_match = re.search('%s %s %s (?P%s)' % \ 153 | (prefix, call, 'program', ptr_regex), output) 154 | result['program'] = return_match.group('program') 155 | 156 | return (output[return_match.end():], result) 157 | 158 | 159 | def match_clCreateProgramWithSource(output, result): 160 | call = 'clCreateProgramWithSource' 161 | 162 | # Arguments. 
163 | result['context'] = re.search('%s %s %s (?P%s)' % \ 164 | (prefix, call, 'context', ptr_regex), output).group('context') 165 | result['count'] = int(re.search('%s %s %s (?P%s)' % \ 166 | (prefix, call, 'count', int_regex), output).group('count')) 167 | result['strings'] = re.search('%s %s %s (?P%s)' % \ 168 | (prefix, call, 'strings', ptr_regex), output).group('strings') 169 | result['lengths'] = re.search('%s %s %s (?P%s)' % \ 170 | (prefix, call, 'lengths', ptr_regex), output).group('lengths') 171 | result['errcode_ret'] = re.search('%s %s %s (?P%s)' % \ 172 | (prefix, call, 'errcode_ret', ptr_regex), output).group('errcode_ret') 173 | 174 | result['source'] = {} 175 | for k in range(result['count']): 176 | prefix_call_string_k = '%s %s %s' % \ 177 | (prefix, call, 'sources\[%d\]' % k) 178 | # NB: '.*?' matches any characters between the markers 179 | # in a non-greedy fashion. 180 | result['source'][str(k)] = \ 181 | re.search('%s <<\n(?P.*?)\n%s >>\n' % \ 182 | (prefix_call_string_k, prefix_call_string_k), \ 183 | output, re.DOTALL).group('string') 184 | 185 | # Return value. 186 | return_match = re.search('%s %s %s (?P%s)' % \ 187 | (prefix, call, 'program', ptr_regex), output) 188 | result['program'] = return_match.group('program') 189 | 190 | return (output[return_match.end():], result) 191 | 192 | 193 | def match_clEnqueueNDRangeKernel(output, result): 194 | call = 'clEnqueueNDRangeKernel' 195 | 196 | # Name. 197 | result['name'] = re.search('%s %s %s (?P%s)' % \ 198 | (prefix, call, 'name', opts_regex), output).group('name') 199 | 200 | # Arguments. 
201 | result['queue'] = re.search('%s %s %s (?P%s)' % \ 202 | (prefix, call, 'queue', ptr_regex), output).group('queue') 203 | result['kernel'] = re.search('%s %s %s (?P%s)' % \ 204 | (prefix, call, 'kernel', ptr_regex), output).group('kernel') 205 | result['gwo'] = [int(i) for i in re.search('%s %s %s (?P%s)' % \ 206 | (prefix, call, 'gwo', '.*'), output).group('gwo').split()] 207 | result['gws'] = [int(i) for i in re.search('%s %s %s (?P%s)' % \ 208 | (prefix, call, 'gws', '.*'), output).group('gws').split()] 209 | result['lws'] = [int(i) for i in re.search('%s %s %s (?P%s)' % \ 210 | (prefix, call, 'lws', '.*'), output).group('lws').split()] 211 | result['event_wait_list'] = re.search('%s %s %s(?P( %s)*)' % \ 212 | (prefix, call, 'event_wait_list', ptr_regex), output).group('event_wait_list').split() 213 | result['event'] = re.search('%s %s %s (?P%s)' % \ 214 | (prefix, call, 'event', ptr_regex), output).group('event') 215 | 216 | # Return value. 217 | return_match = re.search('%s %s %s (?P%s)' % \ 218 | (prefix, call, 'errcode', int_regex), output) 219 | result['errcode'] = int(return_match.group('errcode')) 220 | 221 | # Profiling info. 222 | profiling_match = re.search('%s %s %s (?P%s) (?P%s) (?P%s) (?P%s)' % \ 223 | (prefix, call, 'profiling', int_regex, int_regex, int_regex, int_regex), output) 224 | if profiling_match: 225 | result['profiling'] = {} 226 | result['profiling']['queued'] = int(profiling_match.group('queued')) 227 | result['profiling']['submit'] = int(profiling_match.group('submit')) 228 | result['profiling']['start'] = int(profiling_match.group('start')) 229 | result['profiling']['end'] = int(profiling_match.group('end')) 230 | 231 | last_match = return_match if not profiling_match else profiling_match 232 | 233 | return (output[last_match.end():], result) 234 | 235 | 236 | # Auxiliary function for clEnqueueReadBuffer and clEnqueueWriteBuffer. 237 | def _match_clEnqueueReadOrWriteBuffer(call, output, result): 238 | # Arguments. 
239 | result['queue'] = re.search('%s %s %s (?P%s)' % \ 240 | (prefix, call, 'queue', ptr_regex), output).group('queue') 241 | result['buffer'] = re.search('%s %s %s (?P%s)' % \ 242 | (prefix, call, 'buffer', ptr_regex), output).group('buffer') 243 | result['blocking'] = int(re.search('%s %s %s (?P%s)' % \ 244 | (prefix, call, 'blocking', bool_regex), output).group('blocking')) 245 | result['offset'] = int(re.search('%s %s %s (?P%s)' % \ 246 | (prefix, call, 'offset', int_regex), output).group('offset')) 247 | result['size'] = int(re.search('%s %s %s (?P%s)' % \ 248 | (prefix, call, 'size', int_regex), output).group('size')) 249 | result['ptr'] = re.search('%s %s %s (?P%s)' % \ 250 | (prefix, call, 'ptr', ptr_regex), output).group('ptr') 251 | result['event_wait_list'] = re.search('%s %s %s(?P( %s)*)' % \ 252 | (prefix, call, 'event_wait_list', ptr_regex), output).group('event_wait_list').split() 253 | result['event'] = re.search('%s %s %s (?P%s)' % \ 254 | (prefix, call, 'event', ptr_regex), output).group('event') 255 | 256 | # Return value. 257 | return_match = re.search('%s %s %s (?P%s)' % \ 258 | (prefix, call, 'errcode', int_regex), output) 259 | result['errcode'] = int(return_match.group('errcode')) 260 | 261 | # Profiling info. 
262 | profiling_match = re.search('%s %s %s (?P%s) (?P%s) (?P%s) (?P%s)' % \ 263 | (prefix, call, 'profiling', int_regex, int_regex, int_regex, int_regex), output) 264 | if profiling_match: 265 | result['profiling'] = {} 266 | result['profiling']['queued'] = int(profiling_match.group('queued')) 267 | result['profiling']['submit'] = int(profiling_match.group('submit')) 268 | result['profiling']['start'] = int(profiling_match.group('start')) 269 | result['profiling']['end'] = int(profiling_match.group('end')) 270 | 271 | last_match = return_match if not profiling_match else profiling_match 272 | 273 | return (output[last_match.end():], result) 274 | 275 | def match_clEnqueueReadBuffer(output, result): 276 | return _match_clEnqueueReadOrWriteBuffer(call='clEnqueueReadBuffer', output=output, result=result) 277 | 278 | def match_clEnqueueWriteBuffer(output, result): 279 | return _match_clEnqueueReadOrWriteBuffer(call='clEnqueueWriteBuffer', output=output, result=result) 280 | 281 | 282 | def match_clSetKernelArg(output, result): 283 | call = 'clSetKernelArg' 284 | 285 | # Arguments. 286 | result['kernel'] = re.search('%s %s %s (?P%s)' % \ 287 | (prefix, call, 'kernel', ptr_regex), output).group('kernel') 288 | result['arg_index'] = int(re.search('%s %s %s (?P%s)' % \ 289 | (prefix, call, 'arg_index', int_regex), output).group('arg_index')) 290 | result['arg_size'] = int(re.search('%s %s %s (?P%s)' % \ 291 | (prefix, call, 'arg_size', int_regex), output).group('arg_size')) 292 | result['arg_value'] = re.search('%s %s %s (?P%s)' % \ 293 | (prefix, call, 'arg_value', hex_regex), output).group('arg_value') 294 | 295 | # Return value. 296 | return_match = re.search('%s %s %s (?P%s)' % \ 297 | (prefix, call, 'errcode', int_regex), output) 298 | result['errcode'] = int(return_match.group('errcode')) 299 | 300 | return (output[return_match.end():], result) 301 | 302 | 303 | # Map from API calls to parsers. 
# Parse the next API call block found in the output.
#
# Returns ('', {}) when no further block is found; otherwise returns whatever
# the call-specific parser returns, i.e. (remaining output, result), where
# result holds 'call', 'timestamp' and the call-specific entries.
# Raises Exception when the call has no parser in map_call_to_parser.
def next_match(output):
    result = {}

    # For robustness, a new block starts with just an API call name.
    match = re.search('%s (?P<call>%s)\n' % (prefix, call_regex), output)
    if not match:
        return ('', {})

    result['call'] = match.group('call')
    # Use .get(): plain indexing would raise a bare KeyError for an unknown
    # call and never reach the explicit error below.
    parser = map_call_to_parser.get(result['call'])
    if not parser:
        raise Exception('OpenCL API call %s not supported!' % result['call'])

    # Start and end timestamps are optional (especially in tests) but common to all calls.
    result['timestamp'] = {}
    start_match = re.search('%s %s start (?P<start>%s)' % (prefix, result['call'], iso_regex), output[match.end():])
    if start_match:
        result['timestamp']['start'] = start_match.group('start')
    end_match = re.search('%s %s end (?P<end>%s)' % (prefix, result['call'], iso_regex), output[match.end():])
    if end_match:
        result['timestamp']['end'] = end_match.group('end')

    return parser(output, result)
20 | // 21 | // - clBuildProgram() 22 | // - clCreateBuffer() 23 | // - clCreateCommandQueue() 24 | // - clCreateKernel() 25 | // - clCreateKernelsInProgram() 26 | // - clCreateProgramWithBinary() 27 | // - clCreateProgramWithSource() 28 | // - clEnqueueNDRangeKernel() 29 | // - clEnqueueReadBuffer() 30 | // - clEnqueueWriteBuffer() 31 | // - clSetKernelArg() 32 | // 33 | 34 | // https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clBuildProgram.html 35 | extern CL_API_ENTRY cl_int CL_API_CALL 36 | clBuildProgram( 37 | cl_program program, 38 | cl_uint num_devices, 39 | const cl_device_id * device_list, 40 | const char * options, 41 | void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data), 42 | void * user_data) CL_API_SUFFIX__VERSION_1_0 43 | { 44 | // Return value. 45 | cl_int errcode = CL_SUCCESS; 46 | 47 | // API call. 48 | const char * call = "clBuildProgram"; 49 | logger.log_call(call); 50 | 51 | if (NULL == prof.interceptor.clBuildProgram_original) 52 | { 53 | prof.interceptor.clBuildProgram_original = (dvdt::Prof::Interceptor::clBuildProgram_type) dlsym(RTLD_NEXT, call); 54 | } 55 | 56 | // Arguments. 57 | logger.log_ptr(call, "program", program); 58 | logger.log_list(call, "device_list", device_list, num_devices); 59 | logger.log_str(call, "options", options ? options : ""); 60 | logger.log_ptr(call, "pfn_notify", (const void *) pfn_notify); 61 | logger.log_ptr(call, "user_data", user_data); 62 | 63 | #ifndef DVDT_PROF_TEST 64 | logger.log_timestamp_start(call); 65 | 66 | // Original call. 67 | errcode = prof.interceptor.clBuildProgram_original(\ 68 | program, num_devices, device_list, options, pfn_notify, user_data); 69 | // TODO: When pfn_notify is not NULL, still make the call blocking so that 70 | // (timestamp_end - timestamp_start) represents the actual build time. 71 | 72 | logger.log_timestamp_end(call); 73 | #endif 74 | 75 | // Return value. 
76 | logger.log_num(call, "errcode", errcode); logger.log_lf(); 77 | 78 | return errcode; 79 | 80 | } // clBuildProgram() 81 | 82 | 83 | // https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clCreateBuffer.html 84 | extern CL_API_ENTRY cl_mem CL_API_CALL 85 | clCreateBuffer( 86 | cl_context context, 87 | cl_mem_flags flags, 88 | size_t size, 89 | void *host_ptr, 90 | cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 91 | { 92 | // Return value. 93 | cl_mem buffer = (cl_mem) 0x0; 94 | 95 | // API call. 96 | const char * call = "clCreateBuffer"; 97 | logger.log_call(call); 98 | 99 | if (NULL == prof.interceptor.clCreateBuffer_original) 100 | { 101 | prof.interceptor.clCreateBuffer_original = (dvdt::Prof::Interceptor::clCreateBuffer_type) dlsym(RTLD_NEXT, call); 102 | } 103 | 104 | if (NULL == prof.interceptor.context) 105 | { 106 | prof.interceptor.context = context; 107 | } 108 | 109 | // Arguments. 110 | logger.log_ptr(call, "context", context); 111 | logger.log_num(call, "flags", flags); 112 | logger.log_num(call, "size", size); 113 | logger.log_ptr(call, "host_ptr", host_ptr); 114 | logger.log_ptr(call, "errcode_ret", errcode_ret); 115 | 116 | #ifndef DVDT_PROF_TEST 117 | logger.log_timestamp_start(call); 118 | 119 | // Original call. 120 | buffer = prof.interceptor.clCreateBuffer_original(\ 121 | context, flags, size, host_ptr, errcode_ret); 122 | 123 | logger.log_timestamp_end(call); 124 | 125 | // Error value. 126 | logger.log_num(call, "errcode", errcode_ret ? *errcode_ret : -1); 127 | #endif 128 | 129 | // Return value. 
130 | logger.log_ptr(call, "buffer", buffer); logger.log_lf(); 131 | 132 | return buffer; 133 | 134 | } // clCreateBuffer() 135 | 136 | 137 | // https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clCreateCommandQueue.html 138 | extern CL_API_ENTRY cl_command_queue CL_API_CALL 139 | clCreateCommandQueue( 140 | cl_context context, 141 | cl_device_id device, 142 | cl_command_queue_properties properties, 143 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 144 | { 145 | // Return value. 146 | cl_command_queue queue = (cl_command_queue) 0x0; 147 | 148 | // API call. 149 | const char * call = "clCreateCommandQueue"; 150 | logger.log_call(call); 151 | 152 | if (NULL == prof.interceptor.clCreateCommandQueue_original) 153 | { 154 | prof.interceptor.clCreateCommandQueue_original = (dvdt::Prof::Interceptor::clCreateCommandQueue_type) dlsym(RTLD_NEXT, call); 155 | } 156 | 157 | if (NULL == prof.interceptor.context) 158 | { 159 | prof.interceptor.context = context; 160 | } 161 | 162 | // Arguments. 163 | logger.log_ptr(call, "context", context); 164 | logger.log_ptr(call, "device", device); 165 | logger.log_num(call, "properties", properties); 166 | logger.log_ptr(call, "errcode_ret", errcode_ret); 167 | 168 | #ifndef DVDT_PROF_TEST 169 | logger.log_timestamp_start(call); 170 | 171 | // Original call. 172 | queue = prof.interceptor.clCreateCommandQueue_original(\ 173 | context, device, properties | CL_QUEUE_PROFILING_ENABLE, errcode_ret); 174 | 175 | logger.log_timestamp_end(call); 176 | 177 | // Error value. 178 | logger.log_num(call, "errcode", errcode_ret ? *errcode_ret : -1); 179 | #endif 180 | 181 | // Return value. 
182 | logger.log_ptr(call, "queue", queue); logger.log_lf(); 183 | 184 | return queue; 185 | 186 | } // clCreateCommandQueue() 187 | 188 | 189 | // https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clCreateKernel.html 190 | extern CL_API_ENTRY cl_kernel CL_API_CALL 191 | clCreateKernel( 192 | cl_program program, 193 | const char * kernel_name, 194 | cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0 195 | { 196 | // Return value. 197 | cl_kernel kernel = (cl_kernel) 0x0; 198 | 199 | // API call. 200 | const char * call = "clCreateKernel"; 201 | logger.log_call(call); 202 | 203 | if (NULL == prof.interceptor.clCreateKernel_original) 204 | { 205 | prof.interceptor.clCreateKernel_original = (dvdt::Prof::Interceptor::clCreateKernel_type) dlsym(RTLD_NEXT, call); 206 | } 207 | 208 | // Arguments. 209 | logger.log_ptr(call, "program", program); 210 | logger.log_str(call, "name", kernel_name); 211 | logger.log_ptr(call, "errcode_ret", errcode_ret); 212 | 213 | #ifndef DVDT_PROF_TEST 214 | logger.log_timestamp_start(call); 215 | 216 | // Original call. 217 | kernel = prof.interceptor.clCreateKernel_original( 218 | program, kernel_name, errcode_ret); 219 | 220 | logger.log_timestamp_end(call); 221 | 222 | // Error value. 223 | logger.log_num(call, "errcode", errcode_ret ? *errcode_ret : -1); 224 | #endif 225 | 226 | // Return value. 227 | logger.log_ptr(call, "kernel", kernel); logger.log_lf(); 228 | 229 | return kernel; 230 | 231 | } // clCreateKernel() 232 | 233 | 234 | // https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clCreateKernelsInProgram.html 235 | extern CL_API_ENTRY cl_int CL_API_CALL 236 | clCreateKernelsInProgram( 237 | cl_program program, 238 | cl_uint num_kernels, 239 | cl_kernel *kernels, 240 | cl_uint *num_kernels_ret) CL_API_SUFFIX__VERSION_1_0 241 | { 242 | // Return value. 243 | cl_int errcode = CL_SUCCESS; 244 | 245 | // API call. 
246 | const char * call = "clCreateKernelsInProgram"; 247 | logger.log_call(call); 248 | 249 | if (NULL == prof.interceptor.clCreateKernelsInProgram_original) 250 | { 251 | prof.interceptor.clCreateKernelsInProgram_original = (dvdt::Prof::Interceptor::clCreateKernelsInProgram_type) dlsym(RTLD_NEXT, call); 252 | } 253 | 254 | // Arguments. 255 | logger.log_ptr(call, "program", program); 256 | logger.log_num(call, "num_kernels", num_kernels); 257 | logger.log_ptr(call, "kernels", kernels); 258 | // TODO: log list of kernels. 259 | logger.log_ptr(call, "num_kernels_ret_ptr", num_kernels_ret); 260 | 261 | #ifndef DVDT_PROF_TEST 262 | logger.log_timestamp_start(call); 263 | 264 | // Original call. 265 | errcode = prof.interceptor.clCreateKernelsInProgram_original( 266 | program, num_kernels, kernels, num_kernels_ret); 267 | 268 | logger.log_timestamp_end(call); 269 | 270 | // Actual number of kernels in program. 271 | logger.log_num(call, "num_kernels_ret", num_kernels_ret ? *num_kernels_ret : -1); 272 | #else 273 | logger.log_num(call, "num_kernels_ret", 0); 274 | #endif 275 | 276 | // Return value. 277 | logger.log_num(call, "errcode", errcode); logger.log_lf(); 278 | 279 | return errcode; 280 | 281 | } // clCreateKernelsInProgram() 282 | 283 | 284 | // https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clCreateProgramWithBinary.html 285 | extern CL_API_ENTRY cl_program CL_API_CALL 286 | clCreateProgramWithBinary( 287 | cl_context context, 288 | cl_uint num_devices, 289 | const cl_device_id *device_list, 290 | const size_t *lengths, 291 | const unsigned char **binaries, 292 | cl_int *binary_status, 293 | cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 294 | { 295 | // Return value. 296 | cl_program program = (cl_program) 0x0; 297 | 298 | // API call. 
299 | const char * call = "clCreateProgramWithBinary"; 300 | logger.log_call(call); 301 | 302 | if (NULL == prof.interceptor.clCreateProgramWithBinary_original) 303 | { 304 | prof.interceptor.clCreateProgramWithBinary_original = (dvdt::Prof::Interceptor::clCreateProgramWithBinary_type) dlsym(RTLD_NEXT, call); 305 | } 306 | 307 | if (NULL == prof.interceptor.context) 308 | { 309 | prof.interceptor.context = context; 310 | } 311 | 312 | // Arguments. 313 | logger.log_ptr(call, "context", context); 314 | logger.log_list(call, "device_list", device_list, num_devices); 315 | logger.log_ptr(call, "lengths", lengths); 316 | logger.log_ptr(call, "binaries", binaries); 317 | logger.log_ptr(call, "binary_status", binary_status); 318 | logger.log_ptr(call, "errcode_ret", errcode_ret); 319 | 320 | #ifndef DVDT_PROF_TEST 321 | logger.log_timestamp_start(call); 322 | 323 | // Original call. 324 | program = prof.interceptor.clCreateProgramWithBinary_original(\ 325 | context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret); 326 | 327 | logger.log_timestamp_end(call); 328 | 329 | // Error value. 330 | logger.log_num(call, "errcode", errcode_ret ? *errcode_ret : -1); 331 | #endif 332 | 333 | // Return value. 334 | logger.log_ptr(call, "program", program); logger.log_lf(); 335 | 336 | return program; 337 | 338 | } // clCreateProgramWithBinary() 339 | 340 | 341 | // https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clCreateProgramWithSource.html 342 | extern CL_API_ENTRY cl_program CL_API_CALL 343 | clCreateProgramWithSource( 344 | cl_context context, 345 | cl_uint count, 346 | const char **strings, 347 | const size_t *lengths, 348 | cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 349 | { 350 | // Return value. 351 | cl_program program = (cl_program) 0x0; 352 | 353 | // API call. 
354 | const char * call = "clCreateProgramWithSource"; 355 | logger.log_call(call); 356 | 357 | if (NULL == prof.interceptor.clCreateProgramWithSource_original) 358 | { 359 | prof.interceptor.clCreateProgramWithSource_original = (dvdt::Prof::Interceptor::clCreateProgramWithSource_type) dlsym(RTLD_NEXT, call); 360 | } 361 | 362 | if (NULL == prof.interceptor.context) 363 | { 364 | prof.interceptor.context = context; 365 | } 366 | 367 | // Arguments. 368 | logger.log_ptr(call, "context", context); 369 | logger.log_num(call, "count", count); 370 | logger.log_ptr(call, "strings", strings); 371 | logger.log_ptr(call, "lengths", lengths); 372 | logger.log_src(call, count, strings, lengths); 373 | logger.log_ptr(call, "errcode_ret", errcode_ret); 374 | 375 | #ifndef DVDT_PROF_TEST 376 | logger.log_timestamp_start(call); 377 | 378 | // Original call. 379 | program = prof.interceptor.clCreateProgramWithSource_original(\ 380 | context, count, strings, lengths, errcode_ret); 381 | 382 | logger.log_timestamp_end(call); 383 | 384 | // Error value. 385 | logger.log_num(call, "errcode", errcode_ret ? *errcode_ret : -1); 386 | #endif 387 | 388 | // Return value. 389 | logger.log_ptr(call, "program", program); logger.log_lf(); 390 | 391 | return program; 392 | 393 | } // clCreateProgramWithSource() 394 | 395 | 396 | // https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clEnqueueNDRangeKernel.html 397 | extern CL_API_ENTRY cl_int CL_API_CALL 398 | clEnqueueNDRangeKernel( 399 | cl_command_queue queue, 400 | cl_kernel kernel, 401 | cl_uint work_dim, 402 | const size_t *global_work_offset, 403 | const size_t *global_work_size, 404 | const size_t *local_work_size, 405 | cl_uint num_events_in_wait_list, 406 | const cl_event *event_wait_list, 407 | cl_event *event) CL_API_SUFFIX__VERSION_1_0 408 | { 409 | // Return value. 410 | cl_int errcode = CL_SUCCESS; 411 | 412 | // API call. 
413 | const char * call = "clEnqueueNDRangeKernel"; 414 | logger.log_call(call); 415 | 416 | if (NULL == prof.interceptor.clEnqueueNDRangeKernel_original) 417 | { 418 | prof.interceptor.clEnqueueNDRangeKernel_original = (dvdt::Prof::Interceptor::clEnqueueNDRangeKernel_type) dlsym(RTLD_NEXT, call); 419 | } 420 | 421 | // Kernel name. 422 | #ifndef DVDT_PROF_TEST 423 | const size_t max_name_length = 80; 424 | char name[max_name_length]; 425 | { 426 | size_t name_length; 427 | cl_int info_errcode = clGetKernelInfo(\ 428 | kernel, CL_KERNEL_FUNCTION_NAME, max_name_length, name, &name_length); 429 | assert(info_errcode == CL_SUCCESS && "Failed to get kernel name"); 430 | assert(name_length <= max_name_length); 431 | } 432 | #else 433 | const char name[] = "dvdt_prof_kernel"; 434 | #endif 435 | logger.log_str(call, "name", name); 436 | 437 | local_work_size = prof.interceptor.update_lws(name, local_work_size); 438 | 439 | // Arguments. 440 | logger.log_ptr(call, "queue", queue); 441 | logger.log_ptr(call, "kernel", kernel); 442 | logger.log_gwo(call, work_dim, global_work_offset); 443 | logger.log_gws(call, work_dim, global_work_size); 444 | logger.log_lws(call, work_dim, local_work_size); 445 | logger.log_list(call, "event_wait_list", event_wait_list, num_events_in_wait_list); 446 | logger.log_ptr(call, "event", event); 447 | 448 | #ifndef DVDT_PROF_TEST 449 | logger.log_timestamp_start(call); 450 | 451 | // Event object needed if 'event' is NULL. 452 | cl_event prof_event_obj; 453 | cl_event * prof_event = (NULL != event ? event : &prof_event_obj); 454 | 455 | // Original call. 456 | errcode = prof.interceptor.clEnqueueNDRangeKernel_original(\ 457 | queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size,\ 458 | num_events_in_wait_list, event_wait_list, prof_event); 459 | 460 | // Wait for original call to complete. 
461 | logger.log_profiling_info(call, prof_event); 462 | 463 | logger.log_timestamp_end(call); 464 | #else 465 | logger.log_profiling_info(call, NULL); 466 | #endif 467 | 468 | // Return value. 469 | logger.log_num(call, "errcode", errcode); logger.log_lf(); 470 | 471 | return errcode; 472 | 473 | } // clEnqueueNDRangeKernel() 474 | 475 | 476 | // https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clEnqueueReadBuffer.html 477 | extern CL_API_ENTRY cl_int CL_API_CALL 478 | clEnqueueReadBuffer( 479 | cl_command_queue queue, 480 | cl_mem buffer, 481 | cl_bool blocking, 482 | size_t offset, 483 | size_t size, 484 | void *ptr, 485 | cl_uint num_events_in_wait_list, 486 | const cl_event *event_wait_list, 487 | cl_event *event) CL_API_SUFFIX__VERSION_1_0 488 | { 489 | // Return value. 490 | cl_int errcode = CL_SUCCESS; 491 | 492 | // API call. 493 | const char * call = "clEnqueueReadBuffer"; 494 | logger.log_call(call); 495 | 496 | if (NULL == prof.interceptor.clEnqueueReadBuffer_original) 497 | { 498 | prof.interceptor.clEnqueueReadBuffer_original = (dvdt::Prof::Interceptor::clEnqueueReadBuffer_type) dlsym(RTLD_NEXT, call); 499 | } 500 | 501 | // Arguments. 502 | logger.log_ptr(call, "queue", queue); 503 | logger.log_ptr(call, "buffer", buffer); 504 | logger.log_num(call, "blocking", blocking); 505 | logger.log_num(call, "offset", offset); 506 | logger.log_num(call, "size", size); 507 | logger.log_ptr(call, "ptr", ptr); 508 | // - event_wait_list 509 | logger.log_list(call, "event_wait_list", event_wait_list, num_events_in_wait_list); 510 | // - event 511 | logger.log_ptr(call, "event", event); 512 | 513 | #ifndef DVDT_PROF_TEST 514 | logger.log_timestamp_start(call); 515 | 516 | // Event object needed if 'event' is NULL. 517 | cl_event prof_event_obj; 518 | cl_event * prof_event = (NULL != event ? event : &prof_event_obj); 519 | 520 | // Original call. 
521 | errcode = prof.interceptor.clEnqueueReadBuffer_original(queue, buffer, blocking, offset, size, ptr, 522 | num_events_in_wait_list, event_wait_list, prof_event); 523 | 524 | // Wait for original call to complete. 525 | logger.log_profiling_info(call, prof_event); 526 | 527 | logger.log_timestamp_end(call); 528 | #else 529 | logger.log_profiling_info(call, NULL); 530 | #endif 531 | 532 | // Return value. 533 | logger.log_num(call, "errcode", errcode); logger.log_lf(); 534 | 535 | return errcode; 536 | 537 | } // clEnqueueReadBuffer() 538 | 539 | 540 | // https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clEnqueueWriteBuffer.html 541 | extern CL_API_ENTRY cl_int CL_API_CALL 542 | clEnqueueWriteBuffer( 543 | cl_command_queue queue, 544 | cl_mem buffer, 545 | cl_bool blocking, 546 | size_t offset, 547 | size_t size, 548 | const void *ptr, 549 | cl_uint num_events_in_wait_list, 550 | const cl_event *event_wait_list, 551 | cl_event *event) CL_API_SUFFIX__VERSION_1_0 552 | { 553 | // Return value. 554 | cl_int errcode = CL_SUCCESS; 555 | 556 | // API call. 557 | const char * call = "clEnqueueWriteBuffer"; 558 | logger.log_call(call); 559 | 560 | if (NULL == prof.interceptor.clEnqueueWriteBuffer_original) 561 | { 562 | prof.interceptor.clEnqueueWriteBuffer_original = (dvdt::Prof::Interceptor::clEnqueueWriteBuffer_type) dlsym(RTLD_NEXT, call); 563 | } 564 | 565 | // Arguments. 566 | logger.log_ptr(call, "queue", queue); 567 | logger.log_ptr(call, "buffer", buffer); 568 | logger.log_num(call, "blocking", blocking); 569 | logger.log_num(call, "offset", offset); 570 | logger.log_num(call, "size", size); 571 | logger.log_ptr(call, "ptr", ptr); 572 | logger.log_list(call, "event_wait_list", event_wait_list, num_events_in_wait_list); 573 | logger.log_ptr(call, "event", event); 574 | 575 | #ifndef DVDT_PROF_TEST 576 | logger.log_timestamp_start(call); 577 | 578 | // Event object needed if 'event' is NULL. 
579 | cl_event prof_event_obj; 580 | cl_event * prof_event = (NULL != event ? event : &prof_event_obj); 581 | 582 | // Original call. 583 | errcode = prof.interceptor.clEnqueueWriteBuffer_original(\ 584 | queue, buffer, blocking, offset, size, ptr, 585 | num_events_in_wait_list, event_wait_list, prof_event); 586 | 587 | // Wait for original call to complete. 588 | logger.log_profiling_info(call, prof_event); 589 | 590 | logger.log_timestamp_end(call); 591 | #else 592 | logger.log_profiling_info(call, NULL); 593 | #endif 594 | 595 | // Return value. 596 | logger.log_num(call, "errcode", errcode); logger.log_lf(); 597 | 598 | return errcode; 599 | 600 | } // clEnqueueWriteBuffer() 601 | 602 | 603 | // https://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clSetKernelArg.html 604 | extern CL_API_ENTRY cl_int CL_API_CALL 605 | clSetKernelArg( 606 | cl_kernel kernel, 607 | cl_uint arg_index, 608 | size_t arg_size, 609 | const void *arg_value_ptr) CL_API_SUFFIX__VERSION_1_0 610 | { 611 | // Return value. 612 | cl_int errcode = CL_SUCCESS; 613 | 614 | // API call. 615 | const char * call = "clSetKernelArg"; 616 | logger.log_call(call); 617 | 618 | if (NULL == prof.interceptor.clSetKernelArg_original) 619 | { 620 | prof.interceptor.clSetKernelArg_original = (dvdt::Prof::Interceptor::clSetKernelArg_type) dlsym(RTLD_NEXT, call); 621 | } 622 | 623 | // Arguments. 624 | logger.log_ptr(call, "kernel", kernel); 625 | logger.log_num(call, "arg_index", arg_index); 626 | logger.log_num(call, "arg_size", arg_size); 627 | logger.log_ptr(call, "arg_value_ptr", arg_value_ptr); 628 | logger.log_hex(call, "arg_value", arg_value_ptr, arg_size); 629 | 630 | #ifndef DVDT_PROF_TEST 631 | logger.log_timestamp_start(call); 632 | 633 | // Original call. 634 | errcode = prof.interceptor.clSetKernelArg_original(kernel, arg_index, arg_size, arg_value_ptr); 635 | 636 | logger.log_timestamp_end(call); 637 | #endif 638 | 639 | // Return value. 
640 | logger.log_num(call, "errcode", errcode); logger.log_lf(); 641 | 642 | return errcode; 643 | 644 | } // clSetKernelArg() 645 | -------------------------------------------------------------------------------- /cpp/prof.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // 2015-2017 (c) dividiti 3 | // 4 | 5 | #ifndef DVDT_PROF_HPP 6 | #define DVDT_PROF_HPP 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #ifdef __APPLE__ 20 | #include 21 | #else 22 | #include 23 | #endif 24 | 25 | #if (1 == DVDT_PROF_WALLCLOCK_BOOST) 26 | #include 27 | #elif (1 == DVDT_PROF_WALLCLOCK_TIMEOFDAY) 28 | #include 29 | #else 30 | #error "Don't know how to measure wall-clock time" 31 | #endif 32 | 33 | #if (1 == DVDT_PROF_CJSON) 34 | #include 35 | #endif 36 | 37 | // Log fixed width pointers. 38 | #if (1 == DVDT_PROF_TEST) 39 | #include 40 | #endif 41 | 42 | // Configure output stream at compile-time. 43 | #ifndef DVDT_PROF_OSTREAM 44 | #define DVDT_PROF_OSTREAM std::cout 45 | #endif 46 | 47 | namespace dvdt 48 | { 49 | 50 | class Prof 51 | { 52 | public: 53 | class Interceptor 54 | { 55 | public: 56 | // Types of OpenCL API functions. 
57 | typedef cl_int (*clBuildProgram_type)\ 58 | (cl_program, cl_uint, const cl_device_id *, const char *, void (CL_CALLBACK *)(cl_program, void *), void *); 59 | 60 | typedef cl_mem (*clCreateBuffer_type)\ 61 | (cl_context, cl_mem_flags, size_t, void *, cl_int *); 62 | 63 | typedef cl_command_queue (*clCreateCommandQueue_type)\ 64 | (cl_context, cl_device_id, cl_command_queue_properties, cl_int *errcode_ret); 65 | 66 | typedef cl_kernel (*clCreateKernel_type)\ 67 | (cl_program, const char * kernel_name, cl_int * errcode_ret); 68 | 69 | typedef cl_int (*clCreateKernelsInProgram_type)\ 70 | (cl_program, cl_uint num_kernels, cl_kernel * kernel, cl_uint * num_kernels_ret); 71 | 72 | typedef cl_program (*clCreateProgramWithBinary_type)\ 73 | (cl_context, cl_uint num_devices, const cl_device_id * device_list, const size_t * lengths,\ 74 | const unsigned char ** binaries, cl_int * binary_status, cl_int * errcode_ret); 75 | 76 | typedef cl_program (*clCreateProgramWithSource_type)\ 77 | (cl_context, cl_uint count, const char ** strings, const size_t * lengths, cl_int * errcode_ret); 78 | 79 | typedef cl_int (*clEnqueueNDRangeKernel_type)\ 80 | (cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *); 81 | 82 | typedef cl_int (*clEnqueueReadBuffer_type)\ 83 | (cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *); 84 | 85 | typedef cl_int (*clEnqueueWriteBuffer_type)\ 86 | (cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); 87 | 88 | typedef cl_int (*clSetKernelArg_type)\ 89 | (cl_kernel, cl_uint, size_t, const void *); 90 | 91 | // OpenCL API functions from the underlying vendor implementation. 
92 | clBuildProgram_type clBuildProgram_original; 93 | clCreateBuffer_type clCreateBuffer_original; 94 | clCreateCommandQueue_type clCreateCommandQueue_original; 95 | clCreateKernel_type clCreateKernel_original; 96 | clCreateKernelsInProgram_type clCreateKernelsInProgram_original; 97 | clCreateProgramWithBinary_type clCreateProgramWithBinary_original; 98 | clCreateProgramWithSource_type clCreateProgramWithSource_original; 99 | clEnqueueNDRangeKernel_type clEnqueueNDRangeKernel_original; 100 | clEnqueueReadBuffer_type clEnqueueReadBuffer_original; 101 | clEnqueueWriteBuffer_type clEnqueueWriteBuffer_original; 102 | clSetKernelArg_type clSetKernelArg_original; 103 | 104 | // Mapping a kernel to a local work size tuple that will be used 105 | // to override the local work size specified in the program. 106 | const size_t * update_lws(const char * name, const size_t * program_lws); 107 | 108 | // Constructor. 109 | Interceptor() : 110 | clBuildProgram_original(NULL), 111 | clCreateBuffer_original(NULL), 112 | clCreateCommandQueue_original(NULL), 113 | clCreateKernel_original(NULL), 114 | clCreateKernelsInProgram_original(NULL), 115 | clCreateProgramWithBinary_original(NULL), 116 | clCreateProgramWithSource_original(NULL), 117 | clEnqueueNDRangeKernel_original(NULL), 118 | clEnqueueReadBuffer_original(NULL), 119 | clEnqueueWriteBuffer_original(NULL), 120 | clSetKernelArg_original(NULL), 121 | kernel_lws_null(false), 122 | context(NULL) 123 | { 124 | if (getenv("DVDT_PROF_LWS_NULL")) 125 | { 126 | kernel_lws_null = true; 127 | } 128 | else if (const char * kernel_lws_list = getenv("DVDT_PROF_LWS")) 129 | { 130 | update_kernel_lws_map(kernel_lws_list); 131 | } 132 | } 133 | 134 | // Destructor. 135 | ~Interceptor() 136 | { 137 | // Free local work size values. 
138 | for (std::map::iterator i = kernel_lws_map.begin(), 139 | e = kernel_lws_map.end(); i != e; i++) 140 | { 141 | delete[] i->second; 142 | } 143 | } 144 | 145 | private: 146 | // The map is populated by parsing an environment variable DVDT_PROF_LWS 147 | // in the following format: 148 | // 149 | // "kernel_A:lws_A0,lws_A1,lws_A2 kernel_B:lws_B0,lws_B1,lws_B1 ..." 150 | // 151 | // Namely, the list elements are separated by spaces; the kernel names 152 | // (strings) are separated from the local work size tuple by colons; 153 | // the tuple elements (unsigned integers) are delimited by commas. 154 | // The number of elements in a tuple must match the number of work-group 155 | // dimensions as specified in the program or start with the value of 0 156 | // to use NULL as the local work size for this kernel. 157 | // (For convenience, kernel_lws_null allows to use NULL for all kernels 158 | // in the program.) 159 | std::map kernel_lws_map; 160 | 161 | // True if NULL is to be used as the local work size for all kernels. 162 | bool kernel_lws_null; 163 | 164 | // Helper method for update_kernel_lws_map(). 165 | std::vector split(const std::string & str, char delim) 166 | { 167 | std::vector elems; 168 | std::stringstream ss(str); 169 | std::string elem; 170 | while (std::getline(ss, elem, delim)) 171 | { 172 | elems.push_back(elem); 173 | } 174 | return elems; 175 | } 176 | 177 | // See kernel_lws_map. 178 | void update_kernel_lws_map(const char * kernel_lws_list); 179 | 180 | public: 181 | // Cached OpenCL context. (Currently unused.) 
182 | cl_context context; 183 | 184 | }; // inner class Interceptor 185 | 186 | class Logger 187 | { 188 | public: 189 | virtual inline void 190 | log_call(const char * call_name) = 0; 191 | 192 | virtual inline void 193 | log_gws(const char * call_name, cl_uint work_dim, const size_t * global_work_size) = 0; 194 | 195 | virtual inline void 196 | log_gwo(const char * call_name, cl_uint work_dim, const size_t * global_work_offset) = 0; 197 | 198 | virtual inline void 199 | log_hex(const char * call_name, const char * arg_name, const void * arg_value_ptr, size_t arg_size) = 0; 200 | 201 | // NB: Templated function cannot be virtual. 202 | template inline void 203 | log_list(const char * call_name, const char * list_name, const elem_ty * list, cl_uint list_size) { }; 204 | 205 | virtual inline void 206 | log_lws(const char * call_name, cl_uint work_dim, const size_t * local_work_size) = 0; 207 | 208 | // NB: Templated function cannot be virtual. 209 | template inline void 210 | log_num(const char * call_name, const char * arg_name, num_ty arg_value) { }; 211 | 212 | virtual inline void 213 | log_profiling_info(const char * call_name, cl_event * prof_event) = 0; 214 | 215 | virtual inline void 216 | log_ptr(const char * call_name, const char * arg_name, const void * arg_value) = 0; 217 | 218 | virtual inline void 219 | log_src(const char * call_name, cl_uint count, const char **strings, const size_t *lengths) = 0; 220 | 221 | virtual inline void 222 | log_str(const char * call_name, const char * arg_name, const char * arg_value) = 0; 223 | 224 | virtual inline void 225 | log_timestamp_end(const char * call_name) = 0; 226 | 227 | virtual inline void 228 | log_timestamp_start(const char * call_name) = 0; 229 | 230 | inline std::string 231 | ptr_to_str(const void * ptr) 232 | { 233 | std::stringstream ss; 234 | #if (1 == DVDT_PROF_TEST) 235 | ss << "0x" << std::hex << std::setw(8) << std::setfill('0') << 236 | reinterpret_cast(ptr) << std::dec; 237 | #else 238 | ss << 
ptr; 239 | #endif 240 | return ss.str(); 241 | } 242 | }; // abstract inner class Logger 243 | 244 | // Interceptor object. 245 | Interceptor interceptor; 246 | 247 | // Typical implementation-defined constants. 248 | // TODO: query the actual implementation. 249 | static const cl_uint max_work_dim = 3; 250 | 251 | // Default values of work size parameters. 252 | static const size_t default_local_work_size = 1; 253 | static const size_t null_local_work_size = 0; 254 | 255 | static const size_t default_global_work_size = 1; 256 | // NB: no null_global_work_size 257 | 258 | static const size_t default_global_work_offset = 0; 259 | static const size_t null_global_work_offset = 0; 260 | 261 | }; // class Prof 262 | 263 | 264 | #if (1 != DVDT_PROF_CJSON) 265 | class ostreamLogger : public Prof::Logger 266 | { 267 | private: 268 | std::ostream & stream; 269 | 270 | const char * prefix; 271 | const char sep; 272 | const char lf; 273 | 274 | public: 275 | // Constructor. 276 | ostreamLogger(std::ostream & _stream=DVDT_PROF_OSTREAM, 277 | const char * _prefix="[dv/dt]", 278 | const char _sep=' ', 279 | const char _lf='\n') : 280 | stream(_stream), prefix(_prefix), sep(_sep), lf(_lf) 281 | {} 282 | 283 | inline void log_prefix() { stream << prefix; } 284 | inline void log_sep() { stream << sep; } 285 | inline void log_lf() { stream << lf; } 286 | 287 | public: 288 | inline void 289 | log_call(const char * call_name) 290 | { 291 | stream << prefix << sep << call_name << lf; 292 | } // log_call() 293 | 294 | inline void 295 | log_gws(const char * call_name, cl_uint work_dim, const size_t * global_work_size) 296 | { 297 | stream << prefix << sep << call_name << sep << "gws"; 298 | for (cl_uint d = 0; d < dvdt::Prof::max_work_dim; ++d) 299 | { 300 | stream << sep << (d < work_dim ? 
global_work_size[d] : dvdt::Prof::default_global_work_size); 301 | } 302 | stream << lf; 303 | } // log_gws() 304 | 305 | inline void 306 | log_gwo(const char * call_name, cl_uint work_dim, const size_t * global_work_offset) 307 | { 308 | stream << prefix << sep << call_name << sep << "gwo"; 309 | for (cl_uint d = 0; d < dvdt::Prof::max_work_dim; ++d) 310 | { 311 | if (global_work_offset) 312 | { 313 | stream << sep << (d < work_dim ? global_work_offset[d] : dvdt::Prof::default_global_work_offset); 314 | } 315 | else 316 | { 317 | stream << sep << dvdt::Prof::null_global_work_offset; 318 | } 319 | } 320 | stream << lf; 321 | } // log_gwo() 322 | 323 | template inline void 324 | log_list(const char * call_name, const char * list_name, const elem_ty * list, cl_uint list_size) 325 | { 326 | stream << prefix << sep << call_name << sep << list_name; 327 | for (cl_uint i = 0; i < list_size; ++i) 328 | { 329 | stream << sep << list[i]; 330 | } 331 | stream << lf; 332 | } // log_list() 333 | 334 | inline void 335 | log_lws(const char * call_name, cl_uint work_dim, const size_t * local_work_size) 336 | { 337 | stream << prefix << sep << call_name << sep << "lws"; 338 | for (cl_uint d = 0; d < dvdt::Prof::max_work_dim; ++d) 339 | { 340 | if (local_work_size) 341 | { 342 | stream << sep << (d < work_dim ? 
local_work_size[d] : dvdt::Prof::default_local_work_size); 343 | } 344 | else 345 | { 346 | stream << sep << dvdt::Prof::null_local_work_size; 347 | } 348 | } 349 | stream << lf; 350 | } // log_lws() 351 | 352 | template inline void 353 | log_num(const char * call_name, const char * arg_name, num_ty arg_value) 354 | { 355 | stream << prefix << sep << call_name << sep << arg_name << sep << arg_value << lf; 356 | } // log_num() 357 | 358 | inline void 359 | log_hex(const char * call_name, const char * arg_name, const void * arg_value_ptr, size_t arg_size) 360 | { 361 | stream << prefix << sep << call_name << sep << arg_name << sep << std::hex; 362 | for (size_t i = 0; i < arg_size; ++i) 363 | { 364 | unsigned int byte = static_cast( 365 | reinterpret_cast(arg_value_ptr)[i] 366 | ); 367 | stream << std::setfill('0') << std::setw(2) << byte; 368 | } 369 | stream << std::dec << lf; 370 | } // log_hex() 371 | 372 | inline void 373 | log_profiling_info(const char * call_name, cl_event * prof_event) 374 | { 375 | cl_ulong queued, submit, start, end; 376 | #ifndef DVDT_PROF_TEST 377 | cl_int prof_errcode = CL_SUCCESS; 378 | prof_errcode |= clWaitForEvents(1, prof_event); 379 | prof_errcode |= clGetEventProfilingInfo(*prof_event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queued, NULL); 380 | prof_errcode |= clGetEventProfilingInfo(*prof_event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, NULL); 381 | prof_errcode |= clGetEventProfilingInfo(*prof_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); 382 | prof_errcode |= clGetEventProfilingInfo(*prof_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); 383 | if (CL_SUCCESS != prof_errcode) 384 | { 385 | stream << prefix << sep << call_name << sep << "output profiling info error: " << prof_errcode << lf; 386 | } 387 | #else 388 | queued = 100200300400L; 389 | submit = 100200300500L; 390 | start = 100200300600L; 391 | end = 100200300700L; 392 | #endif 393 | stream << prefix << sep << 
call_name << sep << "profiling" << 394 | sep << queued << sep << submit << sep << start << sep << end << lf; 395 | } // log_profiling_info() 396 | 397 | inline void 398 | log_ptr(const char * call_name, const char * arg_name, const void * arg_value) 399 | { 400 | stream << prefix << sep << call_name << sep << arg_name << sep << ptr_to_str(arg_value) << lf; 401 | } // log_ptr() 402 | 403 | inline void 404 | log_src(const char * call_name, cl_uint count, const char **strings, const size_t *lengths) 405 | { 406 | for (cl_uint c = 0; c < count; ++c) 407 | { 408 | stream << prefix << sep << call_name << sep << "sources[" << c << "] <<" << lf; 409 | if (NULL == lengths || 0 == lengths[c]) 410 | { 411 | // Program string is null-terminated. 412 | stream << strings[c]; 413 | } 414 | else 415 | { 416 | // When program string it not null-terminated, only 417 | // print lengths[c] characters from strings[c]. 418 | for (cl_uint k = 0; k < lengths[c]; ++ k) 419 | { 420 | stream << strings[c][k]; 421 | } 422 | } 423 | stream << std::endl; 424 | stream << prefix << sep << call_name << sep << "sources[" << c << "] >>" << lf; 425 | } 426 | } // log_src() 427 | 428 | inline void 429 | log_str(const char * call_name, const char * arg_name, const char * arg_value) 430 | { 431 | stream << prefix << sep << call_name << sep << arg_name << sep << arg_value << lf; 432 | } // log_str() 433 | 434 | private: 435 | inline void 436 | log_timestamp(const char * call_name, const char * timestamp_kind) 437 | { 438 | #if (1 == DVDT_PROF_WALLCLOCK_BOOST) 439 | const boost::posix_time::ptime time = boost::posix_time::microsec_clock::universal_time(); 440 | const std::string time_str = boost::posix_time::to_iso_extended_string(time); 441 | #elif (1 == DVDT_PROF_WALLCLOCK_TIMEOFDAY) 442 | const std::string time_str("1970-01-01 00:00:00.000"); 443 | #endif 444 | stream << prefix << sep << call_name << sep << timestamp_kind << sep << time_str << lf; 445 | } // log_timestamp() 446 | public: 447 | inline 
void 448 | log_timestamp_end(const char * call_name) 449 | { 450 | log_timestamp(call_name, "end" ); 451 | } // log_timestamp_end() 452 | 453 | inline void 454 | log_timestamp_start(const char * call_name) 455 | { 456 | log_timestamp(call_name, "start"); 457 | } // log_timestamp_start() 458 | }; // class ostreamLogger : Logger 459 | #endif // (1 != DVDT_PROF_CJSON) 460 | 461 | #if (1 == DVDT_PROF_CJSON) 462 | class cjsonLogger : public Prof::Logger 463 | { 464 | private: 465 | // Stream to write the final JSON to. 466 | std::ostream & stream; 467 | // Prefix for pattern matching the final JSON. 468 | const char * prefix; 469 | // Array (list) of call objects. 470 | cJSON * calls; 471 | // Currently open call object. 472 | cJSON * call; 473 | 474 | public: 475 | // Constructor. 476 | cjsonLogger(std::ostream & _stream=DVDT_PROF_OSTREAM, 477 | const char * _prefix="[dv/dt]") : 478 | stream(_stream), prefix(_prefix), calls(NULL), call(NULL) 479 | { 480 | calls = cJSON_CreateArray(); 481 | } 482 | 483 | // Destructor. 484 | ~cjsonLogger() 485 | { 486 | // Add last call object to calls array. 487 | if (call) 488 | { 489 | cJSON_AddItemToArray(calls, call); 490 | } 491 | // Print calls array. 492 | { 493 | char * result_ptr = cJSON_Print(calls); 494 | std::string result_str(result_ptr); 495 | stream << prefix << " <<\n"; 496 | stream << result_str << "\n"; 497 | stream << prefix << " >>\n"; 498 | stream.flush(); 499 | free(result_ptr); 500 | } 501 | // Free calls array. 502 | cJSON_Delete(calls); 503 | } 504 | 505 | // No-op. 506 | inline void log_lf() { return; } 507 | 508 | public: 509 | inline void 510 | log_call(const char * call_name) 511 | { 512 | // Add previous call object to calls array. 513 | if (call) 514 | { 515 | cJSON_AddItemToArray(calls, call); 516 | } 517 | // Create new call object. 
518 | call = cJSON_CreateObject(); 519 | cJSON_AddItemToObject(call, "call", cJSON_CreateString(call_name)); 520 | } // log_call() 521 | 522 | inline void 523 | log_gws(const char * call_name, cl_uint work_dim, const size_t * global_work_size) 524 | { 525 | cJSON * gws = cJSON_CreateArray(); 526 | for (cl_uint d = 0; d < dvdt::Prof::max_work_dim; ++d) 527 | { 528 | size_t gws_d; 529 | gws_d = d < work_dim ? 530 | global_work_size[d] : 531 | dvdt::Prof::default_global_work_size; 532 | cJSON * gws_d_as_num = cJSON_CreateNumber(gws_d); 533 | cJSON_AddItemToArray(gws, gws_d_as_num); 534 | } 535 | cJSON_AddItemToObject(call, "gws", gws); 536 | } // log_gws() 537 | 538 | inline void 539 | log_gwo(const char * call_name, cl_uint work_dim, const size_t * global_work_offset) 540 | { 541 | cJSON * gwo = cJSON_CreateArray(); 542 | for (cl_uint d = 0; d < dvdt::Prof::max_work_dim; ++d) 543 | { 544 | size_t gwo_d; 545 | if (global_work_offset) 546 | { 547 | gwo_d = d < work_dim ? 548 | global_work_offset[d] : 549 | dvdt::Prof::default_global_work_offset; 550 | } 551 | else 552 | { 553 | gwo_d = dvdt::Prof::null_global_work_offset; 554 | } 555 | cJSON * gwo_d_as_num = cJSON_CreateNumber(gwo_d); 556 | cJSON_AddItemToArray(gwo, gwo_d_as_num); 557 | } 558 | cJSON_AddItemToObject(call, "gwo", gwo); 559 | } // log_gwo() 560 | 561 | inline void 562 | log_hex(const char * call_name, const char * arg_name, const void * arg_value_ptr, size_t arg_size) 563 | { 564 | std::stringstream ss; 565 | ss << std::hex; 566 | for (size_t i = 0; i < arg_size; ++i) 567 | { 568 | unsigned int byte = static_cast( 569 | reinterpret_cast(arg_value_ptr)[i] 570 | ); 571 | ss << std::setfill('0') << std::setw(2) << byte; 572 | } 573 | const std::string arg_value_str = ss.str(); 574 | const char * arg_value_cstr = arg_value_str.c_str(); 575 | cJSON * arg_value_as_str = cJSON_CreateString(arg_value_cstr); 576 | cJSON_AddItemToObject(call, arg_name, arg_value_as_str); 577 | } // log_hex() 578 | 579 | template 
inline void 580 | log_list(const char * call_name, const char * list_name, const elem_ty * list, cl_uint list_size) 581 | { 582 | cJSON * list_as_array = cJSON_CreateArray(); 583 | for (cl_uint i = 0; i < list_size; ++i) 584 | { 585 | elem_ty list_i = list[i]; 586 | // FIXME: Currently only used for lists of cl_event's 587 | // and cl_device_id's, which are effectively pointers. 588 | cJSON_AddItemToArray(list_as_array, 589 | cJSON_CreateString(ptr_to_str(list_i).c_str())); 590 | } 591 | cJSON_AddItemToObject(call, list_name, list_as_array); 592 | } // log_list() 593 | 594 | inline void 595 | log_lws(const char * call_name, cl_uint work_dim, const size_t * local_work_size) 596 | { 597 | cJSON * lws = cJSON_CreateArray(); 598 | for (cl_uint d = 0; d < dvdt::Prof::max_work_dim; ++d) 599 | { 600 | size_t lws_d; 601 | if (local_work_size) 602 | { 603 | lws_d = d < work_dim ? 604 | local_work_size[d] : 605 | dvdt::Prof::default_local_work_size; 606 | } 607 | else 608 | { 609 | lws_d = dvdt::Prof::null_local_work_size; 610 | } 611 | cJSON * lws_d_as_num = cJSON_CreateNumber(lws_d); 612 | cJSON_AddItemToArray(lws, lws_d_as_num); 613 | } 614 | cJSON_AddItemToObject(call, "lws", lws); 615 | } // log_lws() 616 | 617 | template inline void 618 | log_num(const char * call_name, const char * arg_name, num_ty arg_value) 619 | { 620 | cJSON * arg_value_as_num = cJSON_CreateNumber( 621 | static_cast(arg_value)); 622 | cJSON_AddItemToObject(call, arg_name, arg_value_as_num); 623 | } // log_num() 624 | 625 | inline void 626 | log_profiling_info(const char * call_name, cl_event * prof_event) 627 | { 628 | cl_ulong queued, submit, start, end; 629 | #ifndef DVDT_PROF_TEST 630 | cl_int prof_errcode = CL_SUCCESS; 631 | prof_errcode |= clWaitForEvents(1, prof_event); 632 | prof_errcode |= clGetEventProfilingInfo(*prof_event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queued, NULL); 633 | prof_errcode |= clGetEventProfilingInfo(*prof_event, CL_PROFILING_COMMAND_SUBMIT, 
sizeof(cl_ulong), &submit, NULL); 634 | prof_errcode |= clGetEventProfilingInfo(*prof_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); 635 | prof_errcode |= clGetEventProfilingInfo(*prof_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); 636 | if (CL_SUCCESS != prof_errcode) 637 | { 638 | cJSON * prof_errcode_as_num = cJSON_CreateNumber(prof_errcode); 639 | cJSON_AddItemToObject(call, 640 | "output profiling_error", prof_errcode_as_num); 641 | } 642 | #else 643 | queued = 100200300400L; 644 | submit = 100200300500L; 645 | start = 100200300600L; 646 | end = 100200300700L; 647 | #endif 648 | cJSON * profiling = cJSON_CreateObject(); 649 | cJSON * queued_as_num = cJSON_CreateNumber(queued); 650 | cJSON * submit_as_num = cJSON_CreateNumber(submit); 651 | cJSON * start_as_num = cJSON_CreateNumber(start); 652 | cJSON * end_as_num = cJSON_CreateNumber(end); 653 | cJSON_AddItemToObject(profiling, "queued", queued_as_num); 654 | cJSON_AddItemToObject(profiling, "submit", submit_as_num); 655 | cJSON_AddItemToObject(profiling, "start", start_as_num); 656 | cJSON_AddItemToObject(profiling, "end", end_as_num); 657 | cJSON_AddItemToObject(call, "profiling", profiling); 658 | } // log_profiling_info() 659 | 660 | inline void 661 | log_ptr(const char * call_name, const char * arg_name, const void * arg_value) 662 | { 663 | std::string arg_value_as_ptr_str = ptr_to_str(arg_value); 664 | const char * arg_value_as_ptr_cstr = arg_value_as_ptr_str.c_str(); 665 | cJSON_AddStringToObject(call, arg_name, arg_value_as_ptr_cstr); 666 | } // log_ptr() 667 | 668 | inline void 669 | log_src(const char * call_name, cl_uint count, const char **strings, const size_t *lengths) 670 | { 671 | cJSON * source = cJSON_CreateObject(); 672 | cJSON_AddItemToObject(call, "source", source); 673 | for (cl_uint c = 0; c < count; ++c) 674 | { 675 | std::stringstream string_ss; 676 | if (NULL == lengths || 0 == lengths[c]) 677 | { 678 | // Program string is null-terminated. 
679 | string_ss << strings[c]; 680 | } 681 | else 682 | { 683 | // When program string it not null-terminated, only 684 | // print lengths[c] characters from strings[c]. 685 | for (cl_uint k = 0; k < lengths[c]; ++ k) 686 | { 687 | string_ss << strings[c][k]; 688 | } 689 | } 690 | std::stringstream c_ss; 691 | c_ss << c; 692 | 693 | const std::string string_str = string_ss.str(); 694 | const std::string c_str = c_ss.str(); 695 | 696 | const char * string_cstr = string_str.c_str(); 697 | const char * c_cstr = c_str.c_str(); 698 | 699 | cJSON_AddStringToObject(source, c_cstr, string_cstr); 700 | } 701 | } // log_src() 702 | 703 | inline void 704 | log_str(const char * call_name, const char * arg_name, const char * arg_value) 705 | { 706 | cJSON * arg_value_as_str = cJSON_CreateString(arg_value); 707 | cJSON_AddItemToObject(call, arg_name, arg_value_as_str); 708 | } // log_str() 709 | 710 | private: 711 | inline void 712 | log_timestamp(const char * call_name, const char * timestamp_kind) 713 | { 714 | #if (1 == DVDT_PROF_WALLCLOCK_BOOST) 715 | const boost::posix_time::ptime time = boost::posix_time::microsec_clock::universal_time(); 716 | const std::string time_str = boost::posix_time::to_iso_extended_string(time); 717 | #elif (1 == DVDT_PROF_WALLCLOCK_TIMEOFDAY) 718 | const std::string time_str("1970-01-01 00:00:00.000"); 719 | #endif 720 | const char * time_cstr = time_str.c_str(); 721 | cJSON * timestamp = cJSON_GetObjectItem(call, "timestamp"); 722 | if (NULL == timestamp) 723 | { 724 | assert(std::string(timestamp_kind) == std::string("start")); 725 | timestamp = cJSON_CreateObject(); 726 | cJSON_AddItemToObject(call, "timestamp", timestamp); 727 | } 728 | else 729 | { 730 | assert(std::string(timestamp_kind) == std::string("end")); 731 | } 732 | cJSON_AddStringToObject( 733 | timestamp, timestamp_kind, time_cstr); 734 | } // log_timestamp() 735 | public: 736 | inline void 737 | log_timestamp_end(const char * call_name) 738 | { 739 | log_timestamp(call_name, 
"end" ); 740 | } // log_timestamp_end() 741 | 742 | inline void 743 | log_timestamp_start(const char * call_name) 744 | { 745 | log_timestamp(call_name, "start"); 746 | } // log_timestamp_start() 747 | }; // class cjsonLogger : Logger 748 | #endif // (1 == DVDT_PROF_CJSON) 749 | 750 | void 751 | Prof::Interceptor::update_kernel_lws_map(const char * kernel_lws_list) 752 | { 753 | // Strip surrounding double quotation marks if present. 754 | std::string kernel_lws_list_str(kernel_lws_list); 755 | { 756 | const char double_quote = '\"'; 757 | const std::string::size_type first = kernel_lws_list_str.find(double_quote); 758 | const std::string::size_type last = kernel_lws_list_str.find_last_of(double_quote); 759 | kernel_lws_list_str = kernel_lws_list_str.substr(first+1, last-(first+1)); 760 | } 761 | 762 | // Split space-separated list of elements into vector of elements. 763 | const char per_kernel_delim = ' '; 764 | const std::vector per_kernel_elems = split(kernel_lws_list_str, per_kernel_delim); 765 | 766 | for (std::vector::const_iterator elems_i = per_kernel_elems.begin(), 767 | elems_e = per_kernel_elems.end(); elems_i != elems_e; ++elems_i) 768 | { 769 | const std::string elem(*elems_i); 770 | 771 | // Split element into two colon-separated strings: kernel name and lws tuple. 772 | const char kernel_lws_delim = ':'; 773 | const std::string::size_type pos = elem.find(kernel_lws_delim); 774 | assert(pos != std::string::npos); 775 | const std::string kernel = elem.substr(0, pos); 776 | const std::string lws_list = elem.substr(pos+1); 777 | 778 | // Split comma-separated lws tuple string into vector of lws dimensions. 779 | const char lws_delim = ','; 780 | const std::vector lws_vector = split(lws_list, lws_delim); 781 | const std::vector::size_type n = lws_vector.size(); 782 | assert((1 <= n) && (n <= Prof::max_work_dim)); 783 | size_t * lws = new size_t[n]; // To be deallocated in the destructor. 
784 | for (std::vector::size_type i = 0; i < n; ++i) 785 | { 786 | std::stringstream(lws_vector[i]) >> lws[i]; 787 | } 788 | // TODO: allow updating the map (e.g. for runtime adaptation). 789 | assert(kernel_lws_map.count(kernel) == 0); 790 | kernel_lws_map.insert(std::pair(kernel, lws)); 791 | } 792 | 793 | return; 794 | 795 | } // Prof::Interceptor::update_kernel_lws_map() 796 | 797 | 798 | const size_t * 799 | Prof::Interceptor::update_lws(const char * name, const size_t * program_lws) 800 | { 801 | if (kernel_lws_null) 802 | { 803 | return NULL; 804 | } 805 | std::map::iterator it = kernel_lws_map.find(std::string(name)); 806 | if (kernel_lws_map.end() != it) 807 | { 808 | const size_t * lws = it->second; 809 | if (0 == lws[0]) 810 | { 811 | program_lws = NULL; 812 | } 813 | else 814 | { 815 | program_lws = lws; 816 | } 817 | } 818 | 819 | return program_lws; 820 | 821 | } // Prof::Interceptor::update_lws() 822 | 823 | } // namespace dvdt 824 | 825 | #endif // #ifndef DVDT_PROF_HPP 826 | --------------------------------------------------------------------------------