├── LICENSE ├── Makefile ├── README.md ├── config.h ├── input.cl └── main.c /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Ralph Doncaster 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Change this path if the SDK was installed in a non-standard location 2 | OPENCL_HEADERS = "/opt/AMDAPPSDK-3.0/include" 3 | # By default libOpenCL.so is searched in default system locations, this path 4 | # lets you add one more directory to the search path.
5 | LIBOPENCL = "/opt/amdgpu-pro/lib/x86_64-linux-gnu" 6 | 7 | CC = gcc -O2 -flto 8 | CPPFLAGS = -std=gnu99 -pedantic -Wextra -Wall -ggdb \ 9 | -Wno-deprecated-declarations \ 10 | -Wno-overlength-strings \ 11 | -I${OPENCL_HEADERS} 12 | LDFLAGS = -L${LIBOPENCL} 13 | LDLIBS = -lOpenCL 14 | OBJ = main.o 15 | INCLUDES = config.h _kernel.h 16 | EXE = cl-mem 17 | 18 | all : ${EXE} 19 | 20 | ${EXE} : ${OBJ} 21 | ${CC} -o $@ ${OBJ} ${LDFLAGS} ${LDLIBS} 22 | 23 | ${OBJ} : ${INCLUDES} 24 | 25 | _kernel.h : input.cl config.h 26 | echo 'const char *ocl_code = R"_mrb_(' >$@ 27 | cpp $< >>$@ 28 | echo ')_mrb_";' >>$@ 29 | 30 | clean : 31 | rm -f ${EXE} _kernel.h *.o _temp_* 32 | 33 | re : clean all 34 | 35 | .cpp.o : 36 | ${CC} ${CPPFLAGS} -o $@ -c $< 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cl-mem 2 | 3 | cl-mem is an OpenCL memory benchmark utility. 4 | 5 | Version 0.2 tests sequential write and read speeds. 6 | Version 0.3 added sequential copy. 7 | Random read/write tests are planned for a later version. 8 | 9 | Example output on an R9 380 with memory clocked at 1.5 GHz: 10 | 11 | Running write test. 12 | 128 GB in 1150.1 ms (111.3 GB/s) 13 | Running read test. 14 | 128 GB in 779.5 ms (164.2 GB/s) 15 | Running copy test.
16 | 128 GB in 906.2 ms (141.3 GB/s) 17 | 18 | 19 | # Thanks 20 | 21 | cl-mem uses code from Marc Bevand's SILENTARMY zcash miner 22 | -------------------------------------------------------------------------------- /config.h: -------------------------------------------------------------------------------- 1 | 2 | #define REPS 128 3 | #define MEMSIZE 1024*1024*1024 4 | #define WAVES 32 5 | -------------------------------------------------------------------------------- /input.cl: -------------------------------------------------------------------------------- 1 | #include "config.h" 2 | 3 | /* 4 | * OpenCL memory benchmarks 5 | */ 6 | 7 | /* each group of threads aka WAVE uses a different section of memory */ 8 | #define DTYPE ulong 9 | #define BLOCKEND ((MEMSIZE/sizeof(DTYPE))/WAVES) 10 | #define INITPTR __global DTYPE *p = (__global DTYPE *)\ 11 | (buffer + wave * (MEMSIZE/WAVES)) 12 | 13 | /* 14 | * sequential write 1GB 15 | * should be launched with local worksize 256 16 | * WAVES should be a power of 2 17 | */ 18 | __kernel 19 | void test0(__global char *buffer) 20 | { 21 | uint tid = get_global_id(0)%256; 22 | uint wave = get_global_id(0)>>8; // 256 threads per wave 23 | DTYPE data = get_global_id(0) << 16; 24 | uint block; 25 | uint rep = REPS; 26 | 27 | while (rep--) 28 | { 29 | INITPTR; 30 | for (block = 0; block < BLOCKEND; block += 256) 31 | { 32 | *(p + block + tid) = data; 33 | } 34 | } 35 | } 36 | 37 | /* 38 | * sequential read 1GB 39 | * should be launched with local worksize 256 40 | * WAVES should be a power of 2 41 | */ 42 | __kernel 43 | void test1(__global char *buffer) 44 | { 45 | uint tid = get_global_id(0)%256; 46 | uint wave = get_global_id(0)>>8; // 256 threads per wave 47 | uint block; 48 | uint rep = REPS; 49 | DTYPE sum = 0; 50 | 51 | while (rep--) 52 | { 53 | INITPTR; 54 | for (block = 0; block < BLOCKEND; block += 256) 55 | { 56 | sum += *(p + block + tid); 57 | } 58 | } 59 | // write sum to keep compiler from optimizing away the whole 
kernel 60 | *(__global DTYPE *)(buffer + wave * (MEMSIZE/WAVES)) = sum; 61 | } 62 | 63 | /* 64 | * sequential copy 1GB/2 65 | * should be launched with local worksize 256 66 | * WAVES should be a power of 2 67 | */ 68 | __kernel 69 | void test2(__global char *buffer) 70 | { 71 | uint tid = get_global_id(0)%256; 72 | uint wave = get_global_id(0)>>8; // 256 threads per wave 73 | uint block; 74 | uint rep = REPS; 75 | 76 | while (rep--) 77 | { 78 | INITPTR; 79 | for (block = 0; block*2 < BLOCKEND; block += 256) 80 | { 81 | // 1st half of block is src, 2nd half is dst 82 | *(p + BLOCKEND/2 + block + tid) = *(p + block + tid); 83 | } 84 | } 85 | } 86 | 87 | -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 1/* memrchr */ 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "_kernel.h" 17 | #include "config.h" 18 | 19 | typedef uint8_t uchar; 20 | typedef uint32_t uint; 21 | 22 | const char* test_names[] = {"write", "read", "copy"}; 23 | 24 | int verbose = 0; 25 | uint32_t do_list_devices = 0; 26 | uint32_t gpu_to_use = 0; 27 | 28 | typedef struct debug_s 29 | { 30 | uint32_t dropped_coll; 31 | uint32_t dropped_stor; 32 | } debug_t; 33 | 34 | void debug(const char *fmt, ...) 35 | { 36 | va_list ap; 37 | if (!verbose) 38 | return ; 39 | va_start(ap, fmt); 40 | vfprintf(stderr, fmt, ap); 41 | va_end(ap); 42 | } 43 | 44 | void warn(const char *fmt, ...) 45 | { 46 | va_list ap; 47 | va_start(ap, fmt); 48 | vfprintf(stderr, fmt, ap); 49 | va_end(ap); 50 | } 51 | 52 | // fatal error 53 | void fatal(const char *fmt, ...) 
54 | { 55 | va_list ap; 56 | va_start(ap, fmt); 57 | vfprintf(stderr, fmt, ap); 58 | va_end(ap); 59 | exit(1); 60 | } 61 | 62 | #define CL_CHECK(STATUS) if (STATUS != CL_SUCCESS) fatal("Error (%d) at line%d\n", STATUS, __LINE__) 63 | 64 | uint64_t parse_num(char *str) 65 | { 66 | char *endptr; 67 | uint64_t n; 68 | n = strtoul(str, &endptr, 0); 69 | if (endptr == str || *endptr) 70 | fatal("'%s' is not a valid number\n", str); 71 | return n; 72 | } 73 | 74 | uint64_t now(void) 75 | { 76 | struct timeval tv; 77 | gettimeofday(&tv, NULL); 78 | return (uint64_t)tv.tv_sec * 1000 * 1000 + tv.tv_usec; 79 | } 80 | 81 | void show_time(uint64_t t0) 82 | { 83 | uint64_t t1; 84 | t1 = now(); 85 | fprintf(stderr, "Elapsed time: %.1f msec\n", (t1 - t0) / 1e3); 86 | } 87 | 88 | void set_blocking_mode(int fd, int block) 89 | { 90 | int f; 91 | if (-1 == (f = fcntl(fd, F_GETFL))) 92 | fatal("fcntl F_GETFL: %s\n", strerror(errno)); 93 | if (-1 == fcntl(fd, F_SETFL, block ? (f & ~O_NONBLOCK) : (f | O_NONBLOCK))) 94 | fatal("fcntl F_SETFL: %s\n", strerror(errno)); 95 | } 96 | 97 | void randomize(void *p, ssize_t l) 98 | { 99 | const char *fname = "/dev/urandom"; 100 | int fd; 101 | ssize_t ret; 102 | if (-1 == (fd = open(fname, O_RDONLY))) 103 | fatal("open %s: %s\n", fname, strerror(errno)); 104 | if (-1 == (ret = read(fd, p, l))) 105 | fatal("read %s: %s\n", fname, strerror(errno)); 106 | if (ret != l) 107 | fatal("%s: short read %d bytes out of %d\n", fname, ret, l); 108 | if (-1 == close(fd)) 109 | fatal("close %s: %s\n", fname, strerror(errno)); 110 | } 111 | 112 | cl_mem check_clCreateBuffer(cl_context ctx, cl_mem_flags flags, size_t size, 113 | void *host_ptr) 114 | { 115 | cl_int status; 116 | cl_mem ret; 117 | ret = clCreateBuffer(ctx, flags, size, host_ptr, &status); 118 | if (status != CL_SUCCESS || !ret) 119 | fatal("clCreateBuffer (%d)\n", status); 120 | return ret; 121 | } 122 | 123 | void check_clSetKernelArg(cl_kernel k, cl_uint a_pos, cl_mem *a) 124 | { 125 | cl_int 
status; 126 | status = clSetKernelArg(k, a_pos, sizeof (*a), a); 127 | if (status != CL_SUCCESS) 128 | fatal("clSetKernelArg (%d)\n", status); 129 | } 130 | 131 | void check_clEnqueueNDRangeKernel(cl_command_queue queue, cl_kernel k, cl_uint 132 | work_dim, const size_t *global_work_offset, const size_t 133 | *global_work_size, const size_t *local_work_size, cl_uint 134 | event_wait_list_size, const cl_event *event_wait_list, cl_event 135 | *event) 136 | { 137 | cl_int status; 138 | status = clEnqueueNDRangeKernel(queue, k, work_dim, global_work_offset, 139 | global_work_size, local_work_size, event_wait_list_size, 140 | event_wait_list, event); 141 | if (status != CL_SUCCESS) 142 | fatal("clEnqueueNDRangeKernel (%d)\n", status); 143 | } 144 | 145 | void check_clEnqueueReadBuffer(cl_command_queue queue, cl_mem buffer, cl_bool 146 | blocking_read, size_t offset, size_t size, void *ptr, cl_uint 147 | num_events_in_wait_list, const cl_event *event_wait_list, cl_event 148 | *event) 149 | { 150 | cl_int status; 151 | status = clEnqueueReadBuffer(queue, buffer, blocking_read, offset, 152 | size, ptr, num_events_in_wait_list, event_wait_list, event); 153 | if (status != CL_SUCCESS) 154 | fatal("clEnqueueReadBuffer (%d)\n", status); 155 | } 156 | 157 | void hexdump(uint8_t *a, uint32_t a_len) 158 | { 159 | for (uint32_t i = 0; i < a_len; i++) 160 | fprintf(stderr, "%02x", a[i]); 161 | } 162 | 163 | char *s_hexdump(const void *_a, uint32_t a_len) 164 | { 165 | const uint8_t *a = _a; 166 | static char buf[4096]; 167 | uint32_t i; 168 | for (i = 0; i < a_len && i + 2 < sizeof (buf); i++) 169 | sprintf(buf + i * 2, "%02x", a[i]); 170 | buf[i * 2] = 0; 171 | return buf; 172 | } 173 | 174 | uint8_t hex2val(const char *base, size_t off) 175 | { 176 | const char c = base[off]; 177 | if (c >= '0' && c <= '9') return c - '0'; 178 | else if (c >= 'a' && c <= 'f') return 10 + c - 'a'; 179 | else if (c >= 'A' && c <= 'F') return 10 + c - 'A'; 180 | fatal("Invalid hex char at offset 
%zd: ...%c...\n", off, c); 181 | return 0; 182 | } 183 | 184 | void get_program_build_log(cl_program program, cl_device_id device) 185 | { 186 | cl_int status; 187 | char val[2*1024*1024]; 188 | size_t ret = 0; 189 | status = clGetProgramBuildInfo(program, device, 190 | CL_PROGRAM_BUILD_LOG, 191 | sizeof (val), // size_t param_value_size 192 | &val, // void *param_value 193 | &ret); // size_t *param_value_size_ret 194 | if (status != CL_SUCCESS) 195 | fatal("clGetProgramBuildInfo (%d)\n", status); 196 | fprintf(stderr, "%s\n", val); 197 | } 198 | 199 | void dump(const char *fname, void *data, size_t len) 200 | { 201 | int fd; 202 | ssize_t ret; 203 | if (-1 == (fd = open(fname, O_WRONLY | O_CREAT | O_TRUNC, 0666))) 204 | fatal("%s: %s\n", fname, strerror(errno)); 205 | ret = write(fd, data, len); 206 | if (ret == -1) 207 | fatal("write: %s: %s\n", fname, strerror(errno)); 208 | if ((size_t)ret != len) 209 | fatal("%s: partial write\n", fname); 210 | if (-1 == close(fd)) 211 | fatal("close: %s: %s\n", fname, strerror(errno)); 212 | } 213 | 214 | void get_program_bins(cl_program program) 215 | { 216 | cl_int status; 217 | size_t sizes; 218 | unsigned char *p; 219 | size_t ret = 0; 220 | status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, 221 | sizeof (sizes), // size_t param_value_size 222 | &sizes, // void *param_value 223 | &ret); // size_t *param_value_size_ret 224 | if (status != CL_SUCCESS) 225 | fatal("clGetProgramInfo(sizes) (%d)\n", status); 226 | if (ret != sizeof (sizes)) 227 | fatal("clGetProgramInfo(sizes) did not fill sizes (%d)\n", status); 228 | debug("Program binary size is %zd bytes\n", sizes); 229 | p = (unsigned char *)malloc(sizes); 230 | status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, 231 | sizeof (p), // size_t param_value_size 232 | &p, // void *param_value 233 | &ret); // size_t *param_value_size_ret 234 | if (status != CL_SUCCESS) 235 | fatal("clGetProgramInfo (%d)\n", status); 236 | dump("dump.co", p, sizes); 237 | 
debug("program: %02x%02x%02x%02x...\n", p[0], p[1], p[2], p[3]); 238 | } 239 | 240 | void print_platform_info(cl_platform_id plat) 241 | { 242 | char name[1024]; 243 | size_t len = 0; 244 | int status; 245 | status = clGetPlatformInfo(plat, CL_PLATFORM_NAME, sizeof (name), &name, 246 | &len); 247 | if (status != CL_SUCCESS) 248 | fatal("clGetPlatformInfo (%d)\n", status); 249 | printf("Devices on platform \"%s\":\n", name); 250 | fflush(stdout); 251 | } 252 | 253 | void print_device_info(unsigned i, cl_device_id d) 254 | { 255 | char name[1024]; 256 | size_t len = 0; 257 | int status; 258 | status = clGetDeviceInfo(d, CL_DEVICE_NAME, sizeof (name), &name, &len); 259 | if (status != CL_SUCCESS) 260 | fatal("clGetDeviceInfo (%d)\n", status); 261 | printf(" ID %d: %s\n", i, name); 262 | fflush(stdout); 263 | } 264 | 265 | // non-debug version 266 | void examine_ht(unsigned round, cl_command_queue queue, cl_mem buf_ht) 267 | { 268 | (void)round; 269 | (void)queue; 270 | (void)buf_ht; 271 | } 272 | 273 | void examine_dbg(cl_command_queue queue, cl_mem buf_dbg, size_t dbg_size) 274 | { 275 | debug_t *dbg; 276 | size_t dropped_coll_total, dropped_stor_total; 277 | if (verbose < 2) 278 | return ; 279 | dbg = (debug_t *)malloc(dbg_size); 280 | if (!dbg) 281 | fatal("malloc: %s\n", strerror(errno)); 282 | check_clEnqueueReadBuffer(queue, buf_dbg, 283 | CL_TRUE, // cl_bool blocking_read 284 | 0, // size_t offset 285 | dbg_size, // size_t size 286 | dbg, // void *ptr 287 | 0, // cl_uint num_events_in_wait_list 288 | NULL, // cl_event *event_wait_list 289 | NULL); // cl_event *event 290 | dropped_coll_total = dropped_stor_total = 0; 291 | for (unsigned tid = 0; tid < dbg_size / sizeof (*dbg); tid++) 292 | { 293 | dropped_coll_total += dbg[tid].dropped_coll; 294 | dropped_stor_total += dbg[tid].dropped_stor; 295 | if (0 && (dbg[tid].dropped_coll || dbg[tid].dropped_stor)) 296 | debug("thread %6d: dropped_coll %zd dropped_stor %zd\n", tid, 297 | dbg[tid].dropped_coll, 
dbg[tid].dropped_stor); 298 | } 299 | debug("Dropped: %zd (coll) %zd (stor)\n", 300 | dropped_coll_total, dropped_stor_total); 301 | free(dbg); 302 | } 303 | 304 | /* 305 | ** Sort a pair of binary blobs (a, b) which are consecutive in memory and 306 | ** occupy a total of 2*len 32-bit words. 307 | ** 308 | ** a points to the pair 309 | ** len number of 32-bit words in each pair 310 | */ 311 | void sort_pair(uint32_t *a, uint32_t len) 312 | { 313 | uint32_t *b = a + len; 314 | uint32_t tmp, need_sorting = 0; 315 | for (uint32_t i = 0; i < len; i++) 316 | if (need_sorting || a[i] > b[i]) 317 | { 318 | need_sorting = 1; 319 | tmp = a[i]; 320 | a[i] = b[i]; 321 | b[i] = tmp; 322 | } 323 | else if (a[i] < b[i]) 324 | return ; 325 | } 326 | 327 | /* 328 | ** Read a complete line from stdin. If 2 or more lines are available, store 329 | ** only the last one in the buffer. 330 | ** 331 | ** buf buffer to store the line 332 | ** len length of the buffer 333 | ** block blocking mode: do not return until a line was read 334 | ** 335 | ** Return 1 iff a line was read. 
336 | */ 337 | int read_last_line(char *buf, size_t len, int block) 338 | { 339 | char *start; 340 | size_t pos = 0; 341 | ssize_t n; 342 | set_blocking_mode(0, block); 343 | while (42) 344 | { 345 | n = read(0, buf + pos, len - pos); 346 | if (n == -1 && errno == EINTR) 347 | continue ; 348 | else if (n == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) 349 | { 350 | if (!pos) 351 | return 0; 352 | warn("strange: a partial line was read\n"); 353 | // a partial line was read, continue reading it in blocking mode 354 | // to be sure to read it completely 355 | set_blocking_mode(0, 1); 356 | continue ; 357 | } 358 | else if (n == -1) 359 | fatal("read stdin: %s\n", strerror(errno)); 360 | else if (!n) 361 | fatal("EOF on stdin\n"); 362 | pos += n; 363 | if (buf[pos - 1] == '\n') 364 | // 1 (or more) complete lines were read 365 | break ; 366 | } 367 | start = memrchr(buf, '\n', pos - 1); 368 | if (start) 369 | { 370 | warn("strange: more than 1 line was read\n"); 371 | // more than 1 line; copy the last line to the beginning of the buffer 372 | pos -= (start + 1 - buf); 373 | memmove(buf, start + 1, pos); 374 | } 375 | // overwrite '\n' with NUL 376 | buf[pos - 1] = 0; 377 | return 1; 378 | } 379 | 380 | void run_opencl(cl_context ctx, cl_command_queue queue, cl_kernel* tests) 381 | { 382 | cl_mem buf_test, buf_dbg; 383 | void *dbg = NULL; 384 | size_t global_ws = 256*WAVES; 385 | size_t local_ws = 64; 386 | #ifdef ENABLE_DEBUG 387 | size_t dbg_size = NR_ROWS * sizeof (debug_t); 388 | #else 389 | size_t dbg_size = 1 * sizeof (debug_t); 390 | #endif 391 | //uint64_t total; 392 | if (verbose) 393 | fprintf(stderr, "Test buffers will use GB\n"); 394 | //fprintf(stderr, "Hash tables will use %.1f MB\n", 2.0 * HT_SIZE / 1e6); 395 | // Set up buffers for the host and memory objects for the kernel 396 | if (!(dbg = calloc(dbg_size, 1))) 397 | fatal("malloc: %s\n", strerror(errno)); 398 | buf_dbg = check_clCreateBuffer(ctx, CL_MEM_READ_WRITE | 399 | CL_MEM_COPY_HOST_PTR, 
dbg_size, dbg); 400 | buf_test = check_clCreateBuffer(ctx, CL_MEM_READ_WRITE, MEMSIZE, NULL); 401 | 402 | uint num_tests = sizeof(test_names)/sizeof(test_names[0]); 403 | for (unsigned test = 0; test < num_tests; test++) 404 | { 405 | check_clSetKernelArg(tests[test], 0, &buf_test); 406 | fprintf(stderr, "Running %s test.\n", test_names[test]); 407 | uint64_t t0 = now(); 408 | check_clEnqueueNDRangeKernel(queue, tests[test], 1, NULL, 409 | &global_ws, &local_ws, 0, NULL, NULL); 410 | cl_int status = clFinish(queue); 411 | CL_CHECK(status); 412 | uint64_t t1 = now(); 413 | fprintf(stderr, "%d GB in %.1f ms (%.1f GB/s)\n", REPS, 414 | (t1 - t0) / 1e3, REPS / ((t1 - t0) / 1e6)); 415 | } 416 | // Clean up 417 | if (dbg) 418 | free(dbg); 419 | clReleaseMemObject(buf_dbg); 420 | clReleaseMemObject(buf_test); 421 | } 422 | 423 | /* 424 | ** Scan the devices available on this platform. Try to find the device 425 | ** selected by the "--use " option and, if found, store the platform and 426 | ** device in plat_id and dev_id. 427 | ** 428 | ** plat platform being scanned 429 | ** nr_devs_total total number of devices detected so far, will be 430 | ** incremented by the number of devices available on this 431 | ** platform 432 | ** plat_id where to store the platform id 433 | ** dev_id where to store the device id 434 | ** 435 | ** Return 1 iff the selected device was found. 
436 | */ 437 | unsigned scan_platform(cl_platform_id plat, cl_uint *nr_devs_total, 438 | cl_platform_id *plat_id, cl_device_id *dev_id) 439 | { 440 | cl_device_type typ = CL_DEVICE_TYPE_ALL; 441 | cl_uint nr_devs = 0; 442 | cl_device_id *devices; 443 | cl_int status; 444 | unsigned found = 0; 445 | unsigned i; 446 | if (do_list_devices) 447 | print_platform_info(plat); 448 | status = clGetDeviceIDs(plat, typ, 0, NULL, &nr_devs); 449 | if (status != CL_SUCCESS) 450 | fatal("clGetDeviceIDs (%d)\n", status); 451 | if (nr_devs == 0) 452 | return 0; 453 | devices = (cl_device_id *)malloc(nr_devs * sizeof (*devices)); 454 | status = clGetDeviceIDs(plat, typ, nr_devs, devices, NULL); 455 | if (status != CL_SUCCESS) 456 | fatal("clGetDeviceIDs (%d)\n", status); 457 | i = 0; 458 | while (i < nr_devs) 459 | { 460 | if (do_list_devices) 461 | print_device_info(*nr_devs_total, devices[i]); 462 | else if (*nr_devs_total == gpu_to_use) 463 | { 464 | found = 1; 465 | *plat_id = plat; 466 | *dev_id = devices[i]; 467 | break ; 468 | } 469 | (*nr_devs_total)++; 470 | i++; 471 | } 472 | free(devices); 473 | return found; 474 | } 475 | 476 | /* 477 | ** Stores the platform id and device id that was selected by the "--use " 478 | ** option. 
479 | ** 480 | ** plat_id where to store the platform id 481 | ** dev_id where to store the device id 482 | */ 483 | void scan_platforms(cl_platform_id *plat_id, cl_device_id *dev_id) 484 | { 485 | cl_uint nr_platforms; 486 | cl_platform_id *platforms; 487 | cl_uint i, nr_devs_total; 488 | cl_int status; 489 | status = clGetPlatformIDs(0, NULL, &nr_platforms); 490 | if (status != CL_SUCCESS) 491 | fatal("Cannot get OpenCL platforms (%d)\n", status); 492 | if (!nr_platforms || verbose) 493 | fprintf(stderr, "Found %d OpenCL platform(s)\n", nr_platforms); 494 | if (!nr_platforms) 495 | exit(1); 496 | platforms = (cl_platform_id *)malloc(nr_platforms * sizeof (*platforms)); 497 | if (!platforms) 498 | fatal("malloc: %s\n", strerror(errno)); 499 | status = clGetPlatformIDs(nr_platforms, platforms, NULL); 500 | if (status != CL_SUCCESS) 501 | fatal("clGetPlatformIDs (%d)\n", status); 502 | i = nr_devs_total = 0; 503 | while (i < nr_platforms) 504 | { 505 | if (scan_platform(platforms[i], &nr_devs_total, plat_id, dev_id)) 506 | break ; 507 | i++; 508 | } 509 | if (do_list_devices) 510 | exit(0); 511 | debug("Using GPU device ID %d\n", gpu_to_use); 512 | free(platforms); 513 | } 514 | 515 | void run_bench() 516 | { 517 | cl_platform_id plat_id = 0; 518 | cl_device_id dev_id = 0; 519 | // cl_kernel k_rounds[PARAM_K]; 520 | uint num_tests = sizeof(test_names)/sizeof(test_names[0]); 521 | cl_kernel tests[num_tests]; 522 | cl_int status; 523 | scan_platforms(&plat_id, &dev_id); 524 | if (!plat_id || !dev_id) 525 | fatal("Selected device (ID %d) not found; see --list\n", gpu_to_use); 526 | /* Create context.*/ 527 | cl_context context = clCreateContext(NULL, 1, &dev_id, 528 | NULL, NULL, &status); 529 | if (status != CL_SUCCESS || !context) 530 | fatal("clCreateContext (%d)\n", status); 531 | /* Creating command queue associate with the context.*/ 532 | cl_command_queue queue = clCreateCommandQueue(context, dev_id, 533 | 0, &status); 534 | if (status != CL_SUCCESS || !queue) 
535 | fatal("clCreateCommandQueue (%d)\n", status); 536 | /* Create program object */ 537 | cl_program program; 538 | const char *source; 539 | size_t source_len; 540 | source = ocl_code; 541 | source_len = strlen(ocl_code); 542 | program = clCreateProgramWithSource(context, 1, (const char **)&source, 543 | &source_len, &status); 544 | if (status != CL_SUCCESS || !program) 545 | fatal("clCreateProgramWithSource (%d)\n", status); 546 | /* Build program. */ 547 | if (verbose) 548 | fprintf(stderr, "Building program\n"); 549 | status = clBuildProgram(program, 1, &dev_id, 550 | "", // compile options 551 | NULL, NULL); 552 | if (status != CL_SUCCESS) 553 | { 554 | warn("OpenCL build failed (%d). Build log follows:\n", status); 555 | get_program_build_log(program, dev_id); 556 | exit(1); 557 | } 558 | //get_program_bins(program); 559 | // Create kernel objects 560 | //cl_kernel k_init_ht = clCreateKernel(program, "kernel_init_ht", &status); 561 | //if (status != CL_SUCCESS || !k_init_ht) 562 | // fatal("clCreateKernel (%d)\n", status); 563 | for (unsigned test = 0; test < num_tests; test++) 564 | { 565 | char name[128]; 566 | // snprintf(name, sizeof (name), "kernel_round%d", round); 567 | snprintf(name, sizeof (name), "test%d", test); 568 | tests[test] = clCreateKernel(program, name, &status); 569 | if (status != CL_SUCCESS || !tests[test]) 570 | fatal("clCreateKernel (%d)\n", status); 571 | } 572 | // Run 573 | run_opencl(context, queue, tests); 574 | // Release resources 575 | assert(CL_SUCCESS == 0); 576 | status = CL_SUCCESS; 577 | for (unsigned test = 0; test < num_tests; test++) 578 | status |= clReleaseKernel(tests[test]); 579 | status |= clReleaseProgram(program); 580 | status |= clReleaseCommandQueue(queue); 581 | status |= clReleaseContext(context); 582 | if (status) 583 | fprintf(stderr, "Cleaning resources failed\n"); 584 | } 585 | 586 | enum 587 | { 588 | OPT_HELP, 589 | OPT_VERBOSE, 590 | OPT_NONCES, 591 | OPT_THREADS, 592 | OPT_LIST, 593 | OPT_USE, 594 | 
}; 595 | 596 | static struct option optlong[] = 597 | { 598 | {"help", no_argument, 0, OPT_HELP}, 599 | {"h", no_argument, 0, OPT_HELP}, 600 | {"verbose", no_argument, 0, OPT_VERBOSE}, 601 | {"v", no_argument, 0, OPT_VERBOSE}, 602 | {"t", required_argument, 0, OPT_THREADS}, 603 | {"list", no_argument, 0, OPT_LIST}, 604 | {"use", required_argument, 0, OPT_USE}, 605 | {0, 0, 0, 0}, 606 | }; 607 | 608 | void usage(const char *progname) 609 | { 610 | printf("Usage: %s [options]\n" 611 | "OpenCL memory benchmark v0.3\n" 612 | "\n" 613 | "Options are:\n" 614 | " -h, --help display this help and exit\n" 615 | " -v, --verbose print verbose messages\n" 616 | " --list list available OpenCL devices by ID (GPUs...)\n" 617 | " --use use GPU (default: 0)\n" 618 | , progname); 619 | } 620 | 621 | int main(int argc, char **argv) 622 | { 623 | int32_t i; 624 | while (-1 != (i = getopt_long_only(argc, argv, "", optlong, 0))) 625 | switch (i) 626 | { 627 | case OPT_HELP: 628 | usage(argv[0]), exit(0); 629 | break ; 630 | case OPT_VERBOSE: 631 | verbose += 1; 632 | break ; 633 | case OPT_THREADS: 634 | // ignored, this is just to conform to the contest CLI API 635 | break ; 636 | case OPT_LIST: 637 | do_list_devices = 1; 638 | break ; 639 | case OPT_USE: 640 | gpu_to_use = parse_num(optarg); 641 | break ; 642 | default: 643 | fatal("Try '%s --help'\n", argv[0]); 644 | break ; 645 | } 646 | run_bench(); 647 | return 0; 648 | } 649 | --------------------------------------------------------------------------------