├── .gitignore ├── LICENSE ├── README.md ├── build.bat ├── build.sh ├── main.c ├── pool.h └── spall_native_auto.h /.gitignore: -------------------------------------------------------------------------------- 1 | pool 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Colin Davidson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # workpool 2 | A cross-platform (Windows, Linux, OSX, FreeBSD, OpenBSD) clang, gcc, and msvc compatible work-stealing threadpool 3 | 4 | ![scaling_demo](https://user-images.githubusercontent.com/6327402/210031238-7394fc2b-2867-4dab-8d32-049489e80528.png) 5 | -------------------------------------------------------------------------------- /build.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | clang -fuse-ld=lld -O3 -g -o pool.exe -D_CRT_SECURE_NO_WARNINGS -finstrument-functions main.c 4 | 5 | rem cl main.c /O2 /Zi /GH /Gh /diagnostics:caret /nologo /Fe:pool.exe 6 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | OS=$(uname) 2 | 3 | OS_FLAGS="" 4 | case $OS in 5 | Darwin) 6 | OS_FLAGS="-framework System" 7 | ;; 8 | esac 9 | 10 | OPT_FLAGS="" 11 | case $1 in 12 | -t) 13 | OPT_FLAGS+="-finstrument-functions -D ENABLE_TRACING" 14 | shift 15 | ;; 16 | esac 17 | 18 | clang -g -O3 -Wall -o pool $OS_FLAGS -ldl -lpthread $OPT_FLAGS main.c 19 | -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | #include "pool.h" 2 | 3 | #if defined(_MSC_VER) 4 | #define ATOMIC_INC32(val) (_InterlockedIncrement(&(val))) 5 | #else 6 | #define ATOMIC_INC32(val) (atomic_fetch_add_explicit(&val, 1, memory_order_relaxed)) 7 | #endif 8 | 9 | #ifdef ENABLE_TRACING 10 | #include "spall_native_auto.h" 11 | #endif 12 | 13 | int thread_results[9] = {0}; 14 | _Atomic static int total_tasks = 0; 15 | void little_work(TPool *pool, void *args) { 16 | // this is my workload. 
enjoy 17 | 18 | #ifndef _WIN32 19 | int sleep_time = rand() % 201; 20 | usleep(sleep_time); 21 | #else 22 | static float aaa[10000]; 23 | for (size_t i = 0; i < 10000; i++) { 24 | aaa[i] = (rand() % 2000) * 0.25; 25 | } 26 | #endif 27 | 28 | if (total_tasks < 2000) { 29 | for (int i = 0; i < 5; i++) { 30 | TPool_Task task; 31 | task.do_work = little_work; 32 | task.args = NULL; 33 | tpool_add_task(pool, task); 34 | } 35 | } 36 | 37 | ATOMIC_INC32(total_tasks); 38 | } 39 | 40 | /* 41 | void little_work(TPool *pool, void *args) { 42 | thread_results[tpool_current_thread_idx] += 1; 43 | usleep(2); 44 | } 45 | */ 46 | 47 | int main(void) { 48 | #ifdef ENABLE_TRACING 49 | spall_auto_init((char *)"profile.spall"); 50 | spall_auto_thread_init(0, SPALL_DEFAULT_BUFFER_SIZE); 51 | #endif 52 | srand(1); 53 | 54 | TPool pool = {0}; 55 | tpool_init(&pool, 8); 56 | 57 | int initial_task_count = 10; 58 | for (int i = 0; i < initial_task_count; i++) { 59 | TPool_Task task; 60 | task.do_work = little_work; 61 | task.args = NULL; 62 | tpool_add_task(&pool, task); 63 | } 64 | tpool_wait(&pool); 65 | 66 | int total_tasks = 0; 67 | for (int i = 0; i < 9; i++) { 68 | total_tasks += thread_results[i]; 69 | } 70 | printf("%d\n", total_tasks); 71 | 72 | /* 73 | total_tasks = 0; 74 | for (int i = 0; i < initial_task_count; i++) { 75 | TPool_Task task; 76 | task.do_work = little_work; 77 | task.args = NULL; 78 | tpool_add_task(&pool, task); 79 | } 80 | tpool_wait(&pool); 81 | 82 | total_tasks = 0; 83 | for (int i = 0; i < initial_task_count; i++) { 84 | TPool_Task task; 85 | task.do_work = little_work; 86 | task.args = NULL; 87 | tpool_add_task(&pool, task); 88 | } 89 | tpool_wait(&pool); 90 | */ 91 | tpool_destroy(&pool); 92 | 93 | #ifdef ENABLE_TRACING 94 | spall_auto_thread_quit(); 95 | spall_auto_quit(); 96 | #endif 97 | } 98 | 99 | #ifdef ENABLE_TRACING 100 | #define SPALL_AUTO_IMPLEMENTATION 101 | #include "spall_native_auto.h" 102 | #endif 103 | -------------------------------------------------------------------------------- /pool.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef ENABLE_TRACING 9 | #include "spall_native_auto.h" 10 | #endif 11 | 12 | // cross-platform thread wrappers, because microsoft couldn't be arsed to take 5 seconds and 13 | // do this and save all the junior devs and codebases everywhere from this pile of nonsense. 14 | #if defined(__linux__) || defined(__APPLE__) 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | typedef pthread_t TPool_ThreadHandle; 22 | 23 | #define tpool_thread_start(t) pthread_create(&(t)->thread, NULL, _tpool_worker, (void *) (t)) 24 | #define tpool_thread_end(t) pthread_join((t)->thread, NULL) 25 | 26 | #elif defined(_WIN32) 27 | 28 | #include 29 | #include 30 | 31 | typedef ptrdiff_t ssize_t; 32 | typedef HANDLE TPool_ThreadHandle; 33 | 34 | #define tpool_thread_start(t) ((t)->thread = (HANDLE) _beginthread(_tpool_worker, 0, t)) 35 | #define tpool_thread_end(t) WaitForSingleObject((t)->thread, INFINITE) 36 | 37 | #endif 38 | 39 | // MSVC only took 11 years to put C11 atomics in, (despite the fact that MSVC/C++11 has them). 
40 | // This is the pain we suffer because microsoft got lazy 41 | #if defined(_MSC_VER) 42 | 43 | #define TPool_Thread_Local __declspec(thread) 44 | #define TPool_Atomic volatile 45 | 46 | #define TPOOL_LOAD(val) val 47 | #define TPOOL_CAS(addr, expected, desired) (InterlockedCompareExchange64(addr, desired, expected) == expected) 48 | #define TPOOL_ATOMIC_FUTEX_INC(val) (_InterlockedIncrement64(&(val))) 49 | #define TPOOL_ATOMIC_FUTEX_DEC(val) (_InterlockedDecrement64(&(val))) 50 | 51 | #else 52 | 53 | #include 54 | 55 | #define TPool_Thread_Local _Thread_local 56 | #define TPool_Atomic _Atomic 57 | 58 | #define TPOOL_LOAD(val) atomic_load(&val) 59 | #define TPOOL_CAS(addr, expected, desired) atomic_compare_exchange_weak(addr, &expected, desired) 60 | #define TPOOL_ATOMIC_FUTEX_INC(val) (atomic_fetch_add_explicit(&val, 1, memory_order_acquire)) 61 | #define TPOOL_ATOMIC_FUTEX_DEC(val) (atomic_fetch_sub_explicit(&val, 1, memory_order_acquire)) 62 | #define __debugbreak() __builtin_trap() 63 | 64 | #endif 65 | 66 | // cross-platform futex, because we can't just have nice things. All the popular platforms have them under the hood, 67 | // but giving them to users? NO! Users are too stupid to have nice things, save them for the fedora-wearing elite. 68 | #if defined(__linux__) 69 | 70 | #include 71 | #include 72 | 73 | typedef TPool_Atomic int32_t TPool_Futex; 74 | 75 | void _tpool_signal(TPool_Futex *addr) { 76 | int ret = syscall(SYS_futex, addr, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1, NULL, NULL, 0); 77 | if (ret == -1) { 78 | perror("Futex wake"); 79 | __debugbreak(); 80 | } 81 | } 82 | void _tpool_broadcast(TPool_Futex *addr) { 83 | int ret = syscall(SYS_futex, addr, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, INT32_MAX, NULL, NULL, 0); 84 | if (ret == -1) { 85 | perror("Futex wake"); 86 | __debugbreak(); 87 | } 88 | } 89 | 90 | void _tpool_wait(TPool_Futex *addr, TPool_Futex val) { 91 | for (;;) { 92 | int ret = syscall(SYS_futex, addr, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, val, NULL, NULL, 0); 93 | if (ret == -1) { 94 | if (errno != EAGAIN) { 95 | perror("Futex wait"); 96 | __debugbreak(); 97 | } else { 98 | return; 99 | } 100 | } else if (ret == 0) { 101 | if (*addr != val) { 102 | return; 103 | } 104 | } 105 | } 106 | } 107 | 108 | #elif defined(__APPLE__) 109 | 110 | typedef TPool_Atomic int64_t TPool_Futex; 111 | 112 | #define UL_COMPARE_AND_WAIT 0x00000001 113 | #define ULF_WAKE_ALL 0x00000100 114 | #define ULF_NO_ERRNO 0x01000000 115 | 116 | /* timeout is specified in microseconds */ 117 | int __ulock_wait(uint32_t operation, void *addr, uint64_t value, uint32_t timeout); 118 | int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value); 119 | 120 | void _tpool_signal(TPool_Futex *addr) { 121 | for (;;) { 122 | int ret = __ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, addr, 0); 123 | if (ret >= 0) { 124 | return; 125 | } 126 | ret = -ret; 127 | if (ret == EINTR || ret == EFAULT) { 128 | continue; 129 | } 130 | if (ret == ENOENT) { 131 | return; 132 | } 133 | printf("futex wake fail?\n"); 134 | __debugbreak(); 135 | } 136 | } 137 | 138 | void _tpool_broadcast(TPool_Futex *addr) { 139 | for (;;) { 140 | int ret = __ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO | ULF_WAKE_ALL, addr, 0); 141 | if (ret >= 0) { 142 | return; 143 | } 144 | ret = -ret; 145 | if (ret == EINTR || ret == EFAULT) { 146 | continue; 147 | } 148 | if (ret == ENOENT) { 149 | return; 150 | } 151 | printf("futex wake fail?\n"); 152 | __debugbreak(); 153 | } 154 | } 155 | 156 | void _tpool_wait(TPool_Futex *addr, 
TPool_Futex val) { 157 | for (;;) { 158 | int ret = __ulock_wait(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, addr, val, 0); 159 | if (ret >= 0) { 160 | if (*addr != val) { 161 | return; 162 | } 163 | continue; 164 | } 165 | ret = -ret; 166 | if (ret == EINTR || ret == EFAULT) { 167 | continue; 168 | } 169 | if (ret == ENOENT) { 170 | return; 171 | } 172 | 173 | printf("futex wait fail?\n"); 174 | __debugbreak(); 175 | } 176 | } 177 | 178 | #elif defined(_WIN32) 179 | typedef TPool_Atomic int64_t TPool_Futex; 180 | 181 | void _tpool_signal(TPool_Futex *addr) { 182 | WakeByAddressSingle((void *)addr); 183 | } 184 | 185 | void _tpool_broadcast(TPool_Futex *addr) { 186 | WakeByAddressAll((void *)addr); 187 | } 188 | 189 | void _tpool_wait(TPool_Futex *addr, TPool_Futex val) { 190 | for (;;) { 191 | int ret = WaitOnAddress(addr, (void *)&val, sizeof(val), INFINITE); 192 | if (*addr != val) break; 193 | } 194 | } 195 | 196 | #elif defined(__FreeBSD__) 197 | 198 | #include 199 | #include 200 | 201 | typedef TPool_Atomic int32_t TPool_Futex; 202 | 203 | void _tpool_signal(TPool_Futex *addr) { 204 | _umtx_op(addr, UMTX_OP_WAKE, 1, 0, 0); 205 | } 206 | 207 | void _tpool_broadcast(TPool_Futex *addr) { 208 | _umtx_op(addr, UMTX_OP_WAKE, INT32_MAX, 0, 0); 209 | } 210 | 211 | void _tpool_wait(TPool_Futex *addr, TPool_Futex val) { 212 | for (;;) { 213 | int ret = _umtx_op(addr, UMTX_OP_WAIT_UINT, val, 0, NULL); 214 | if (ret == 0) { 215 | if (errno == ETIMEDOUT || errno == EINTR) { 216 | continue; 217 | } 218 | 219 | perror("Futex wait"); 220 | __debugbreak(); 221 | } else if (ret == 0) { 222 | if (*addr != val) { 223 | return; 224 | } 225 | } 226 | } 227 | } 228 | 229 | #elif defined(__OpenBSD__) 230 | 231 | #include 232 | 233 | typedef TPool_Atomic int32_t TPool_Futex; 234 | 235 | void _tpool_signal(TPool_Futex *addr) { 236 | for (;;) { 237 | int ret = futex(addr, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1, NULL, NULL); 238 | if (ret == -1) { 239 | if (errno == ETIMEDOUT || errno == EINTR) { 240 | continue; 241 | } 242 | 243 | perror("Futex wake"); 244 | __debugbreak(); 245 | } else if (ret == 1) { 246 | return; 247 | } 248 | } 249 | } 250 | 251 | void _tpool_broadcast(TPool_Futex *addr) { 252 | for (;;) { 253 | int ret = futex(addr, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, INT32_MAX, NULL, NULL); 254 | if (ret == -1) { 255 | if (errno == ETIMEDOUT || errno == EINTR) { 256 | continue; 257 | } 258 | 259 | perror("Futex wake"); 260 | __debugbreak(); 261 | } else if (ret == 1) { 262 | return; 263 | } 264 | } 265 | } 266 | 267 | void _tpool_wait(TPool_Futex *addr, TPool_Futex val) { 268 | for (;;) { 269 | int ret = futex(addr, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, val, NULL, NULL); 270 | if (ret == -1) { 271 | if (*addr != val) { 272 | return; 273 | } 274 | 275 | if (errno == ETIMEDOUT || errno == EINTR) { 276 | continue; 277 | } 278 | 279 | perror("Futex wait"); 280 | __debugbreak(); 281 | } 282 | } 283 | } 284 | 285 | #endif 286 | 287 | struct TPool; 288 | typedef void tpool_task_proc(struct TPool *pool, void *data); 289 | TPool_Thread_Local int tpool_current_thread_idx; 290 | 291 | #define GRAB_SUCCESS 0 292 | #define GRAB_EMPTY 1 293 | #define GRAB_FAILED 2 294 | 295 | typedef struct TPool_Task { 296 | tpool_task_proc *do_work; 297 | void *args; 298 | } TPool_Task; 299 | 300 | typedef struct { 301 | TPool_Atomic ssize_t size; 302 | TPool_Task *buffer; 303 | } TPool_RingBuffer; 304 | 305 | typedef struct { 306 | TPool_Atomic ssize_t top; 307 | TPool_Atomic ssize_t bottom; 308 | 309 | TPool_Atomic(TPool_RingBuffer *) ring; 310 | } 
TPool_Queue; 311 | 312 | typedef struct TPool_Thread { 313 | TPool_ThreadHandle thread; 314 | int idx; 315 | 316 | TPool_Queue queue; 317 | struct TPool *pool; 318 | } TPool_Thread; 319 | 320 | typedef struct TPool { 321 | struct TPool_Thread *threads; 322 | 323 | int thread_count; 324 | TPool_Atomic bool running; 325 | 326 | TPool_Futex tasks_available; 327 | TPool_Futex tasks_left; 328 | } TPool; 329 | 330 | TPool_RingBuffer *tpool_ring_make(ssize_t size) { 331 | TPool_RingBuffer *ring = malloc(sizeof(TPool_RingBuffer)); 332 | ring->size = size; 333 | ring->buffer = calloc(ring->size, sizeof(TPool_Task)); 334 | return ring; 335 | } 336 | 337 | TPool_Queue tpool_queue_make(ssize_t size) { 338 | TPool_Queue d = {}; 339 | TPool_RingBuffer *ring = tpool_ring_make(size); 340 | atomic_store(&d.ring, ring); 341 | return d; 342 | } 343 | 344 | void tpool_queue_delete(TPool_Queue *q) { 345 | free(q->ring->buffer); 346 | free(q->ring); 347 | } 348 | 349 | TPool_RingBuffer *tpool_ring_grow(TPool_RingBuffer *ring, ssize_t bottom, ssize_t top) { 350 | TPool_RingBuffer *new_ring = tpool_ring_make(ring->size * 2); 351 | for (ssize_t i = top; i < bottom; i++) { 352 | new_ring->buffer[i % new_ring->size] = ring->buffer[i % ring->size]; 353 | } 354 | return new_ring; 355 | } 356 | 357 | void _thread_init(TPool *pool, TPool_Thread *thread, int idx) { 358 | thread->queue = tpool_queue_make(1 << 1); 359 | thread->pool = pool; 360 | thread->idx = idx; 361 | } 362 | 363 | void _tpool_queue_push(TPool_Thread *thread, TPool_Task task) { 364 | ssize_t bot = atomic_load_explicit(&thread->queue.bottom, memory_order_relaxed); 365 | ssize_t top = atomic_load_explicit(&thread->queue.top, memory_order_acquire); 366 | TPool_RingBuffer *cur_ring = atomic_load_explicit(&thread->queue.ring, memory_order_relaxed); 367 | 368 | ssize_t size = bot - top; 369 | if (size > (cur_ring->size - 1)) { 370 | // Queue is full 371 | thread->queue.ring = tpool_ring_grow(thread->queue.ring, bot, top); 372 | cur_ring = atomic_load_explicit(&thread->queue.ring, memory_order_relaxed); 373 | } 374 | 375 | cur_ring->buffer[bot % cur_ring->size] = task; 376 | atomic_thread_fence(memory_order_release); 377 | atomic_store_explicit(&thread->queue.bottom, bot + 1, memory_order_relaxed); 378 | 379 | TPOOL_ATOMIC_FUTEX_INC(thread->pool->tasks_left); 380 | TPOOL_ATOMIC_FUTEX_INC(thread->pool->tasks_available); 381 | _tpool_broadcast(&thread->pool->tasks_available); 382 | } 383 | 384 | int _tpool_queue_take(TPool_Thread *thread, TPool_Task *task) { 385 | ssize_t bot = atomic_load_explicit(&thread->queue.bottom, memory_order_relaxed) - 1; 386 | TPool_RingBuffer *cur_ring = atomic_load_explicit(&thread->queue.ring, memory_order_relaxed); 387 | atomic_store_explicit(&thread->queue.bottom, bot, memory_order_relaxed); 388 | atomic_thread_fence(memory_order_seq_cst); 389 | 390 | ssize_t top = atomic_load_explicit(&thread->queue.top, memory_order_relaxed); 391 | if (top <= bot) { 392 | // Queue is not empty 393 | 394 | *task = cur_ring->buffer[bot % cur_ring->size]; 395 | if (top == bot) { 396 | // Only one entry left in queue 397 | if (!atomic_compare_exchange_strong_explicit(&thread->queue.top, &top, top + 1, memory_order_seq_cst, memory_order_relaxed)) { 398 | // Race failed 399 | atomic_store_explicit(&thread->queue.bottom, bot + 1, memory_order_relaxed); 400 | return GRAB_EMPTY; 401 | } 402 | 403 | atomic_store_explicit(&thread->queue.bottom, bot + 1, memory_order_relaxed); 404 | return GRAB_SUCCESS; 405 | } 406 | 407 | // We got a task without hitting a 
race 408 | return GRAB_SUCCESS; 409 | } else { 410 | // Queue is empty 411 | atomic_store_explicit(&thread->queue.bottom, bot + 1, memory_order_relaxed); 412 | return GRAB_EMPTY; 413 | } 414 | } 415 | 416 | int _tpool_queue_steal(TPool_Thread *thread, TPool_Task *task) { 417 | ssize_t top = atomic_load_explicit(&thread->queue.top, memory_order_acquire); 418 | atomic_thread_fence(memory_order_seq_cst); 419 | ssize_t bot = atomic_load_explicit(&thread->queue.bottom, memory_order_acquire); 420 | 421 | int ret = GRAB_EMPTY; 422 | if (top < bot) { 423 | // Queue is not empty 424 | TPool_RingBuffer *cur_ring = atomic_load_explicit(&thread->queue.ring, memory_order_consume); 425 | *task = cur_ring->buffer[top % cur_ring->size]; 426 | 427 | if (!atomic_compare_exchange_strong_explicit(&thread->queue.top, &top, top + 1, memory_order_seq_cst, memory_order_relaxed)) { 428 | // Race failed 429 | ret = GRAB_FAILED; 430 | } else { 431 | ret = GRAB_SUCCESS; 432 | } 433 | } 434 | return ret; 435 | } 436 | 437 | #ifndef _WIN32 438 | void *_tpool_worker(void *ptr) 439 | #else 440 | void _tpool_worker(void *ptr) 441 | #endif 442 | { 443 | TPool_Task task; 444 | TPool_Thread *current_thread = (TPool_Thread *)ptr; 445 | tpool_current_thread_idx = current_thread->idx; 446 | TPool *pool = current_thread->pool; 447 | 448 | #ifdef ENABLE_TRACING 449 | spall_auto_thread_init(tpool_current_thread_idx, SPALL_DEFAULT_BUFFER_SIZE); 450 | #endif 451 | 452 | for (;;) { 453 | work_start: 454 | if (!pool->running) { 455 | break; 456 | } 457 | 458 | // If we've got tasks to process, work through them 459 | size_t finished_tasks = 0; 460 | while (!_tpool_queue_take(current_thread, &task)) { 461 | task.do_work(pool, task.args); 462 | TPOOL_ATOMIC_FUTEX_DEC(pool->tasks_left); 463 | 464 | finished_tasks += 1; 465 | } 466 | if (finished_tasks > 0 && !TPOOL_LOAD(pool->tasks_left)) { 467 | _tpool_signal(&pool->tasks_left); 468 | } 469 | 470 | // If there's still work somewhere and we don't have it, steal it 471 | if (TPOOL_LOAD(pool->tasks_left)) { 472 | int idx = current_thread->idx; 473 | for (int i = 0; i < pool->thread_count; i++) { 474 | if (!TPOOL_LOAD(pool->tasks_left)) { 475 | break; 476 | } 477 | 478 | idx = (idx + 1) % pool->thread_count; 479 | TPool_Thread *thread = &pool->threads[idx]; 480 | 481 | TPool_Task task; 482 | int ret = _tpool_queue_steal(thread, &task); 483 | if (ret == GRAB_FAILED) { 484 | goto work_start; 485 | } else if (ret == GRAB_EMPTY) { 486 | continue; 487 | } 488 | 489 | task.do_work(pool, task.args); 490 | TPOOL_ATOMIC_FUTEX_DEC(pool->tasks_left); 491 | 492 | if (!TPOOL_LOAD(pool->tasks_left)) { 493 | _tpool_signal(&pool->tasks_left); 494 | } 495 | 496 | goto work_start; 497 | } 498 | } 499 | 500 | // if we've done all our work, and there's nothing to steal, go to sleep 501 | int32_t state = TPOOL_LOAD(pool->tasks_available); 502 | if (!pool->running) { break; } 503 | _tpool_wait(&pool->tasks_available, state); 504 | } 505 | 506 | #ifdef ENABLE_TRACING 507 | spall_auto_thread_quit(); 508 | #endif 509 | 510 | #ifndef _WIN32 511 | return NULL; 512 | #endif 513 | } 514 | 515 | void tpool_add_task(TPool *pool, TPool_Task task) { 516 | TPool_Thread *current_thread = &pool->threads[tpool_current_thread_idx]; 517 | _tpool_queue_push(current_thread, task); 518 | } 519 | 520 | void tpool_wait(TPool *pool) { 521 | TPool_Task task; 522 | TPool_Thread *current_thread = &pool->threads[tpool_current_thread_idx]; 523 | 524 | while (TPOOL_LOAD(pool->tasks_left)) { 525 | 526 | // if we've got tasks on our queue, 
run them 527 | while (!_tpool_queue_take(current_thread, &task)) { 528 | task.do_work(pool, task.args); 529 | TPOOL_ATOMIC_FUTEX_DEC(pool->tasks_left); 530 | } 531 | 532 | 533 | // is this mem-barriered enough? 534 | // This *must* be executed in this order, so the futex wakes immediately 535 | // if rem_tasks has changed since we checked last, otherwise the program 536 | // will permanently sleep 537 | TPool_Futex rem_tasks = TPOOL_LOAD(pool->tasks_left); 538 | if (!rem_tasks) { 539 | break; 540 | } 541 | 542 | _tpool_wait(&pool->tasks_left, rem_tasks); 543 | } 544 | 545 | } 546 | 547 | void tpool_init(TPool *pool, int child_thread_count) { 548 | int thread_count = child_thread_count + 1; 549 | pool->thread_count = thread_count; 550 | pool->threads = malloc(sizeof(TPool_Thread) * pool->thread_count); 551 | 552 | pool->running = true; 553 | 554 | // setup the main thread 555 | _thread_init(pool, &pool->threads[0], 0); 556 | tpool_current_thread_idx = 0; 557 | 558 | for (int i = 1; i < pool->thread_count; i++) { 559 | _thread_init(pool, &pool->threads[i], i); 560 | tpool_thread_start(&pool->threads[i]); 561 | } 562 | } 563 | 564 | void tpool_destroy(TPool *pool) { 565 | pool->running = false; 566 | for (int i = 1; i < pool->thread_count; i++) { 567 | TPOOL_ATOMIC_FUTEX_INC(pool->tasks_available); 568 | _tpool_broadcast(&pool->tasks_available); 569 | tpool_thread_end(&pool->threads[i]); 570 | } 571 | for (int i = 0; i < pool->thread_count; i++) { 572 | tpool_queue_delete(&pool->threads[i].queue); 573 | } 574 | 575 | free(pool->threads); 576 | } 577 | -------------------------------------------------------------------------------- /spall_native_auto.h: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: © 2024 Colin Davidson 2 | // SPDX-License-Identifier: MIT 3 | 4 | #ifndef SPALL_AUTO_H 5 | #define SPALL_AUTO_H 6 | 7 | // THIS IS EXPERIMENTAL, BUT VERY HANDY 8 | // *should* work on clang and gcc on Windows, Mac, and Linux 9 | 10 | #define SPALL_IS_WINDOWS 0 11 | #define SPALL_IS_DARWIN 0 12 | #define SPALL_IS_LINUX 0 13 | #define SPALL_IS_GCC 0 14 | #define SPALL_IS_CLANG 0 15 | #define SPALL_IS_CPP 0 16 | #define SPALL_IS_X64 0 17 | #define SPALL_IS_ARM64 0 18 | 19 | #ifdef __cplusplus 20 | #undef SPALL_IS_CPP 21 | #define SPALL_IS_CPP 1 22 | #endif 23 | 24 | #if defined(__clang__) 25 | #undef SPALL_IS_CLANG 26 | #define SPALL_IS_CLANG 1 27 | #endif 28 | #if defined(_WIN32) 29 | #undef SPALL_IS_WINDOWS 30 | #define SPALL_IS_WINDOWS 1 31 | #elif defined(__APPLE__) 32 | #undef SPALL_IS_DARWIN 33 | #define SPALL_IS_DARWIN 1 34 | #elif defined(__linux__) 35 | #undef SPALL_IS_LINUX 36 | #define SPALL_IS_LINUX 1 37 | #endif 38 | #ifdef __GNUC__ 39 | #undef SPALL_IS_GCC 40 | #define SPALL_IS_GCC 1 41 | #endif 42 | #if defined(__x86_64__) || defined(_M_AMD64) 43 | #undef SPALL_IS_X64 44 | #define SPALL_IS_X64 1 45 | #elif defined(__aarch64__) 46 | #undef SPALL_IS_ARM64 47 | #define SPALL_IS_ARM64 1 48 | #endif 49 | 50 | #if (!SPALL_IS_CLANG && !SPALL_IS_GCC) 51 | #error "Compiler not supported!" 
52 | #endif 53 | 54 | #ifdef __cplusplus 55 | extern "C" { 56 | #endif 57 | 58 | #include 59 | #include 60 | #include 61 | 62 | bool spall_auto_init(char *filename); 63 | void spall_auto_quit(void); 64 | bool spall_auto_thread_init(uint32_t thread_id, size_t buffer_size); 65 | void spall_auto_thread_quit(void); 66 | 67 | bool spall_auto_buffer_begin(const char *name, signed long name_len, const char *args, signed long args_len); 68 | bool spall_auto_buffer_end(void); 69 | bool spall_auto_buffer_flush(void); 70 | 71 | void spall_auto_set_thread_instrumenting(bool on); 72 | 73 | #if SPALL_IS_GCC && SPALL_IS_CPP 74 | #define _Thread_local thread_local 75 | #endif 76 | 77 | #define SPALL_DEFAULT_BUFFER_SIZE (32 * 1024 * 1024) 78 | #define SPALL_MIN(a, b) (((a) < (b)) ? (a) : (b)) 79 | #define SPALL_MAX(a, b) (((a) > (b)) ? (a) : (b)) 80 | 81 | #ifdef __cplusplus 82 | } 83 | #endif 84 | #endif // endif SPALL_AUTO_H 85 | 86 | #ifdef SPALL_AUTO_IMPLEMENTATION 87 | #ifndef SPALL_AUTO_IMPLEMENTED_H 88 | #define SPALL_AUTO_IMPLEMENTED_H 89 | 90 | #if !SPALL_IS_WINDOWS 91 | #if SPALL_IS_CPP 92 | #include 93 | #else 94 | #include 95 | #endif 96 | #endif 97 | 98 | #ifdef __cplusplus 99 | extern "C" { 100 | #endif 101 | 102 | #include 103 | #include 104 | #include 105 | #include 106 | 107 | 108 | #if !SPALL_IS_WINDOWS 109 | #include 110 | #include 111 | #include 112 | #include 113 | #endif 114 | 115 | #if SPALL_IS_WINDOWS 116 | #include 117 | #include 118 | 119 | typedef ptrdiff_t ssize_t; 120 | typedef HANDLE Spall_ThreadHandle; 121 | 122 | #define spall_thread_start(t) ((t)->writer.thread = (HANDLE) _beginthread(spall_writer, 0, t)) 123 | #define spall_thread_end(t) WaitForSingleObject((t)->writer.thread, INFINITE) 124 | #else 125 | typedef pthread_t Spall_ThreadHandle; 126 | #define spall_thread_start(t) pthread_create(&(t)->writer.thread, NULL, spall_writer, (void *) (t)) 127 | #define spall_thread_end(t) pthread_join((t)->writer.thread, NULL) 128 | #endif 129 | 130 | #define SPALL_NOINSTRUMENT __attribute__((no_instrument_function)) 131 | #define SPALL_FORCEINLINE __attribute__((always_inline)) 132 | #define __debugbreak() __builtin_trap() 133 | 134 | #if SPALL_IS_CPP 135 | #define Spall_Atomic(X) std::atomic 136 | #else 137 | #define Spall_Atomic(X) _Atomic (X) 138 | #endif 139 | 140 | #define SPALL_FN static SPALL_NOINSTRUMENT 141 | 142 | #if SPALL_IS_X64 143 | #include 144 | SPALL_FN uint64_t spall_get_clock(void) { 145 | return __rdtsc(); 146 | } 147 | SPALL_FN void spall_pause(void) { 148 | _mm_pause(); 149 | } 150 | #elif SPALL_IS_ARM64 151 | SPALL_FN uint64_t spall_get_clock(void) { 152 | int64_t timer_val; 153 | asm volatile("mrs %0, cntvct_el0" : "=r"(timer_val)); 154 | return (uint64_t)timer_val; 155 | } 156 | SPALL_FN void spall_pause(void) { 157 | asm volatile("yield"); 158 | } 159 | #endif 160 | 161 | #pragma pack(push, 1) 162 | 163 | typedef struct SpallHeader { 164 | uint64_t magic_header; // = 0xABADF00D 165 | uint64_t version; // = 2 166 | double timestamp_unit; 167 | uint64_t known_address; // Address for spall_auto_init, for skew-correction 168 | uint16_t program_path_len; 169 | } SpallHeader; 170 | 171 | enum { 172 | SpallAutoEventType_Invalid = 0, 173 | SpallAutoEventType_Begin = 1, 174 | }; 175 | 176 | typedef struct SpallMicroBeginEventMax { 177 | uint8_t type; 178 | uint64_t ts; 179 | uint64_t caller; 180 | } SpallMicroBeginEventMax; 181 | 182 | typedef struct SpallMicroEndEventMax { 183 | uint8_t type; 184 | uint64_t ts; 185 | } SpallMicroEndEventMax; 186 | 187 | typedef 
struct SpallAutoBeginEvent { 188 | uint8_t type; 189 | uint64_t when; 190 | } SpallAutoBeginEvent; 191 | 192 | typedef struct SpallBufferHeader { 193 | uint32_t size; 194 | uint32_t tid; 195 | uint64_t first_ts; 196 | uint32_t max_depth; 197 | } SpallBufferHeader; 198 | 199 | #pragma pack(pop) 200 | 201 | SPALL_FN SPALL_FORCEINLINE uint64_t spall_delta_to_bits(uint64_t dt) { 202 | uint32_t count = 0; 203 | count += (dt >= 0x100); 204 | count += (dt >= 0x10000); 205 | count += (dt >= 0x100000000); 206 | return count; 207 | } 208 | 209 | typedef struct SpallProfile { 210 | double stamp_scale; 211 | FILE *file; 212 | } SpallProfile; 213 | 214 | typedef Spall_Atomic(uint64_t) Spall_Futex; 215 | typedef struct SpallBuffer { 216 | uint8_t *data; 217 | size_t length; 218 | 219 | // if true, write to upper-half, else lower-half 220 | size_t sub_length; 221 | bool write_half; 222 | 223 | struct { 224 | Spall_Atomic(bool) is_running; 225 | Spall_ThreadHandle thread; 226 | Spall_Atomic(uint64_t) ptr; 227 | Spall_Atomic(size_t) size; 228 | } writer; 229 | 230 | size_t head; 231 | uint32_t thread_id; 232 | 233 | uint64_t previous_ts; 234 | uint64_t first_ts; 235 | 236 | uint64_t previous_addr; 237 | uint64_t previous_caller; 238 | 239 | uint32_t current_depth; 240 | uint32_t max_depth; 241 | } SpallBuffer; 242 | 243 | 244 | // Cross-platform wrappers 245 | #if SPALL_IS_LINUX 246 | #include 247 | #include 248 | #include 249 | #include 250 | #include 251 | #include 252 | #include 253 | #include 254 | #include 255 | #include 256 | #include 257 | 258 | SPALL_FN bool get_program_path(char **out_path) { 259 | char path[PATH_MAX] = {0}; 260 | uint32_t size = sizeof(path); 261 | 262 | ssize_t buff_len = (ssize_t)readlink("/proc/self/exe", path, size - 1); 263 | if (buff_len == -1) { 264 | *out_path = NULL; 265 | return false; 266 | } 267 | 268 | char *post_path = (char *)calloc(PATH_MAX, 1); 269 | if (realpath(path, post_path) == NULL) { 270 | free(post_path); 271 | *out_path = NULL; 272 | return false; 273 | } 274 | 275 | *out_path = post_path; 276 | return true; 277 | } 278 | 279 | SPALL_FN uint64_t mul_u64_u32_shr(uint64_t cyc, uint32_t mult, uint32_t shift) { 280 | __uint128_t x = cyc; 281 | x *= mult; 282 | x >>= shift; 283 | return (uint64_t)x; 284 | } 285 | 286 | SPALL_FN long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { 287 | return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); 288 | } 289 | 290 | #if SPALL_IS_X64 291 | SPALL_FN double spall_get_clock_multiplier(void) { 292 | struct perf_event_attr pe = { 293 | .type = PERF_TYPE_HARDWARE, 294 | .size = sizeof(struct perf_event_attr), 295 | .config = PERF_COUNT_HW_INSTRUCTIONS, 296 | .disabled = 1, 297 | .exclude_kernel = 1, 298 | .exclude_hv = 1 299 | }; 300 | 301 | int fd = (int)perf_event_open(&pe, 0, -1, -1, 0); 302 | if (fd == -1) { 303 | perror("perf_event_open failed"); 304 | return 1; 305 | } 306 | void *addr = mmap(NULL, 4*1024, PROT_READ, MAP_SHARED, fd, 0); 307 | if (!addr) { 308 | perror("mmap failed"); 309 | return 1; 310 | } 311 | struct perf_event_mmap_page *pc = (struct perf_event_mmap_page *)addr; 312 | if (pc->cap_user_time != 1) { 313 | fprintf(stderr, "Perf system doesn't support user time\n"); 314 | return 1; 315 | } 316 | double nanos = (double)mul_u64_u32_shr(1000000000000000ull, pc->time_mult, pc->time_shift); 317 | double multiplier = nanos / 1000000000000000.0; 318 | return multiplier; 319 | } 320 | #endif 321 | 322 | 323 | SPALL_FN 
SPALL_FORCEINLINE void spall_signal(Spall_Futex *addr) { 324 | long ret = syscall(SYS_futex, addr, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1, NULL, NULL, 0); 325 | if (ret == -1) { 326 | perror("Futex wake"); 327 | __debugbreak(); 328 | } 329 | } 330 | 331 | SPALL_FN SPALL_FORCEINLINE void spall_wait(Spall_Futex *addr, uint64_t val) { 332 | for (;;) { 333 | long ret = syscall(SYS_futex, addr, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, val, NULL, NULL, 0); 334 | if (ret == -1) { 335 | if (errno != EAGAIN) { 336 | perror("Futex wait"); 337 | __debugbreak(); 338 | } else { 339 | return; 340 | } 341 | } else if (ret == 0) { 342 | return; 343 | } 344 | } 345 | } 346 | 347 | #elif SPALL_IS_DARWIN 348 | 349 | #include 350 | #include 351 | #include 352 | 353 | #if SPALL_IS_X64 354 | SPALL_FN double spall_get_clock_multiplier(void) { 355 | uint64_t freq; 356 | size_t size = sizeof(freq); 357 | 358 | sysctlbyname("machdep.tsc.frequency", &freq, &size, NULL, 0); 359 | return 1000000000.0 / (double)freq; 360 | } 361 | #elif SPALL_IS_ARM64 362 | SPALL_FN double spall_get_clock_multiplier(void) { 363 | uint64_t freq_val; 364 | asm volatile("mrs %0, cntfrq_el0" : "=r"(freq_val)); 365 | 366 | double multiplier = 1000000000.0 / (double)freq_val; 367 | return multiplier; 368 | } 369 | #endif 370 | 371 | SPALL_FN bool get_program_path(char **out_path) { 372 | char pre_path[1025]; 373 | uint32_t size = sizeof(pre_path); 374 | if (_NSGetExecutablePath(pre_path, &size) == -1) { 375 | *out_path = NULL; 376 | return false; 377 | } 378 | 379 | char *post_path = (char *)malloc(1025); 380 | if (realpath(pre_path, post_path) == NULL) { 381 | free(post_path); 382 | *out_path = NULL; 383 | return false; 384 | } 385 | 386 | *out_path = post_path; 387 | return true; 388 | } 389 | 390 | #define UL_COMPARE_AND_WAIT 0x00000001 391 | #define ULF_WAKE_ALL 0x00000100 392 | #define ULF_NO_ERRNO 0x01000000 393 | 394 | /* timeout is specified in microseconds */ 395 | int __ulock_wait(uint32_t operation, void *addr, uint64_t value, uint32_t timeout); 396 | int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value); 397 | 398 | SPALL_FN SPALL_FORCEINLINE void spall_signal(Spall_Futex *addr) { 399 | for (;;) { 400 | int ret = __ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, addr, 0); 401 | if (ret >= 0) { 402 | return; 403 | } 404 | ret = -ret; 405 | if (ret == EINTR || ret == EFAULT) { 406 | continue; 407 | } 408 | if (ret == ENOENT) { 409 | return; 410 | } 411 | printf("futex signal fail?\n"); 412 | __debugbreak(); 413 | } 414 | } 415 | 416 | SPALL_FN SPALL_FORCEINLINE void spall_wait(Spall_Futex *addr, uint64_t val) { 417 | for (;;) { 418 | int ret = __ulock_wait(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, addr, val, 0); 419 | if (ret >= 0) { 420 | return; 421 | } 422 | ret = -ret; 423 | if (ret == EINTR || ret == EFAULT) { 424 | continue; 425 | } 426 | if (ret == ENOENT) { 427 | return; 428 | } 429 | 430 | printf("futex wait fail? 
%d\n", ret); 431 | __debugbreak(); 432 | } 433 | } 434 | 435 | #elif SPALL_IS_WINDOWS 436 | 437 | SPALL_FN bool get_program_path(char **out_path) { 438 | char *post_path = (char *)calloc(MAX_PATH, 1); 439 | if (GetModuleFileNameA(NULL, post_path, MAX_PATH) == 0) { 440 | *out_path = NULL; 441 | return false; 442 | } 443 | 444 | *out_path = post_path; 445 | return true; 446 | } 447 | 448 | SPALL_FN SPALL_FORCEINLINE double spall_get_clock_multiplier(void) { 449 | 450 | // Cache the answer so that multiple calls never take the slow path more than once 451 | static double multiplier = 0; 452 | if (multiplier) { 453 | return multiplier; 454 | } 455 | 456 | uint64_t tsc_freq = 0; 457 | 458 | // Get time before sleep 459 | uint64_t qpc_begin = 0; QueryPerformanceCounter((LARGE_INTEGER *)&qpc_begin); 460 | uint64_t tsc_begin = spall_get_clock(); 461 | 462 | Sleep(2); 463 | 464 | // Get time after sleep 465 | uint64_t qpc_end = qpc_begin + 1; QueryPerformanceCounter((LARGE_INTEGER *)&qpc_end); 466 | uint64_t tsc_end = spall_get_clock(); 467 | 468 | // Do the math to extrapolate the RDTSC ticks elapsed in 1 second 469 | uint64_t qpc_freq = 0; QueryPerformanceFrequency((LARGE_INTEGER *)&qpc_freq); 470 | tsc_freq = (tsc_end - tsc_begin) * qpc_freq / (qpc_end - qpc_begin); 471 | 472 | multiplier = 1000000000.0 / (double)tsc_freq; 473 | return multiplier; 474 | } 475 | 476 | SPALL_FN SPALL_FORCEINLINE void spall_signal(Spall_Futex *addr) { 477 | WakeByAddressSingle((void *)addr); 478 | } 479 | 480 | SPALL_FN SPALL_FORCEINLINE void spall_wait(Spall_Futex *addr, uint64_t val) { 481 | WaitOnAddress(addr, (void *)&val, sizeof(val), INFINITE); 482 | } 483 | 484 | #endif 485 | 486 | // Auto-tracing impl 487 | static SpallProfile spall_ctx; 488 | static _Thread_local SpallBuffer *spall_buffer = NULL; 489 | static _Thread_local bool spall_thread_running = false; 490 | 491 | SPALL_NOINSTRUMENT void spall_auto_set_thread_instrumenting(bool on) { 492 | spall_thread_running = on; 493 | } 494 | 495 | #if SPALL_IS_WINDOWS 496 | SPALL_FN void spall_writer(void *arg) { 497 | #else 498 | SPALL_FN void *spall_writer(void *arg) { 499 | #endif 500 | 501 | SpallBuffer *buffer = (SpallBuffer *)arg; 502 | while (buffer->writer.is_running) { 503 | spall_wait(&buffer->writer.ptr, 0); 504 | if (!buffer->writer.is_running) { break; } 505 | 506 | void *buffer_ptr = (void *)atomic_load(&buffer->writer.ptr); 507 | if (buffer_ptr == 0) { continue; } 508 | 509 | size_t size = (size_t)atomic_load(&buffer->writer.size); 510 | atomic_store(&buffer->writer.ptr, 0); 511 | 512 | fwrite(buffer_ptr, size, 1, spall_ctx.file); 513 | } 514 | 515 | #if !SPALL_IS_WINDOWS 516 | return NULL; 517 | #endif 518 | } 519 | 520 | SPALL_FN SPALL_FORCEINLINE bool spall__file_write(void *p, size_t n) { 521 | atomic_store(&spall_buffer->writer.size, n); 522 | atomic_store(&spall_buffer->writer.ptr, (uint64_t)p); 523 | spall_signal(&spall_buffer->writer.ptr); 524 | 525 | for (;;) { 526 | void *ptr = (void *)atomic_load(&spall_buffer->writer.ptr); 527 | if (ptr == 0) { break; } 528 | spall_pause(); 529 | } 530 | 531 | return true; 532 | } 533 | 534 | SPALL_NOINSTRUMENT SPALL_FORCEINLINE bool spall_auto_buffer_flush(void) { 535 | if (!spall_buffer) return false; 536 | 537 | size_t data_start = spall_buffer->write_half ? 
spall_buffer->sub_length : 0; 538 | 539 | SpallBufferHeader *sbp = (SpallBufferHeader *)(spall_buffer->data + data_start); 540 | if (spall_buffer->head > 0) { 541 | sbp->size = (uint32_t)(spall_buffer->head - sizeof(SpallBufferHeader)); 542 | sbp->first_ts = spall_buffer->first_ts; 543 | sbp->max_depth = spall_buffer->max_depth; 544 | if (!spall__file_write(spall_buffer->data + data_start, spall_buffer->head)) return false; 545 | 546 | spall_buffer->write_half = !spall_buffer->write_half; 547 | } 548 | 549 | data_start = spall_buffer->write_half ? spall_buffer->sub_length : 0; 550 | sbp = (SpallBufferHeader *)(spall_buffer->data + data_start); 551 | sbp->size = 0; 552 | sbp->first_ts = 0; 553 | sbp->tid = spall_buffer->thread_id; 554 | 555 | spall_buffer->head = sizeof(SpallBufferHeader); 556 | spall_buffer->first_ts = 0; 557 | spall_buffer->previous_ts = 0; 558 | spall_buffer->previous_addr = 0; 559 | spall_buffer->previous_caller = 0; 560 | return true; 561 | } 562 | 563 | SPALL_FN SPALL_FORCEINLINE bool spall_buffer_micro_begin(uint64_t addr, uint64_t caller) { 564 | spall_buffer->current_depth += 1; 565 | spall_buffer->max_depth = SPALL_MAX(spall_buffer->max_depth, spall_buffer->current_depth); 566 | 567 | size_t ev_size = sizeof(SpallMicroBeginEventMax); 568 | if ((spall_buffer->head + ev_size) > spall_buffer->sub_length) { 569 | if (!spall_auto_buffer_flush()) { 570 | return false; 571 | } 572 | } 573 | 574 | size_t data_start = spall_buffer->write_half ? spall_buffer->sub_length : 0; 575 | uint8_t *ev_buffer = (spall_buffer->data + data_start) + spall_buffer->head; 576 | 577 | uint64_t now = spall_get_clock(); 578 | if (spall_buffer->first_ts == 0) { 579 | spall_buffer->first_ts = now; 580 | spall_buffer->previous_ts = now; 581 | } 582 | 583 | uint64_t dt = now - spall_buffer->previous_ts; 584 | uint64_t d_addr = addr ^ spall_buffer->previous_addr; 585 | uint64_t d_caller = caller ^ spall_buffer->previous_caller; 586 | 587 | uint64_t dt_bits = spall_delta_to_bits(dt); 588 | uint64_t addr_bits = spall_delta_to_bits(d_addr); 589 | uint64_t caller_bits = spall_delta_to_bits(d_caller); 590 | 591 | uint64_t dt_size = 1 << dt_bits; 592 | uint64_t addr_size = 1 << addr_bits; 593 | uint64_t caller_size = 1 << caller_bits; 594 | 595 | // [begin event tag | size of ts | size of addr | size of caller] 596 | uint8_t type_byte = (0 << 6) | (dt_bits << 4) | (addr_bits << 2) | caller_bits; 597 | 598 | int i = 0; 599 | *(ev_buffer + i) = type_byte; i += 1; 600 | memcpy(ev_buffer + i, &dt, 8); i += dt_size; 601 | memcpy(ev_buffer + i, &d_addr, 8); i += addr_size; 602 | memcpy(ev_buffer + i, &d_caller, 8); i += caller_size; 603 | 604 | spall_buffer->previous_ts = now; 605 | spall_buffer->previous_addr = addr; 606 | spall_buffer->previous_caller = caller; 607 | spall_buffer->head += i; 608 | 609 | return true; 610 | } 611 | 612 | SPALL_FN SPALL_FORCEINLINE bool spall_buffer_micro_end(void) { 613 | uint64_t now = spall_get_clock(); 614 | spall_buffer->current_depth -= 1; 615 | 616 | size_t ev_size = sizeof(SpallMicroEndEventMax); 617 | if ((spall_buffer->head + ev_size) > spall_buffer->sub_length) { 618 | if (!spall_auto_buffer_flush()) { 619 | return false; 620 | } 621 | } 622 | if (spall_buffer->first_ts == 0) { 623 | spall_buffer->first_ts = now; 624 | spall_buffer->previous_ts = now; 625 | } 626 | 627 | size_t data_start = spall_buffer->write_half ? 
spall_buffer->sub_length : 0; 628 | uint8_t *ev_buffer = (spall_buffer->data + data_start) + spall_buffer->head; 629 | 630 | uint64_t dt = now - spall_buffer->previous_ts; 631 | uint64_t dt_bits = spall_delta_to_bits(dt); 632 | uint64_t dt_size = 1 << dt_bits; 633 | 634 | // [end event tag | size of ts] 635 | uint8_t type_byte = (1 << 6) | (dt_bits << 4); 636 | 637 | int i = 0; 638 | *(ev_buffer + i) = type_byte; i += 1; 639 | memcpy(ev_buffer + i, &dt, 8); i += dt_size; 640 | 641 | spall_buffer->previous_ts = now; 642 | spall_buffer->head += i; 643 | return true; 644 | } 645 | 646 | SPALL_NOINSTRUMENT SPALL_FORCEINLINE bool spall_auto_buffer_begin(const char *name, signed long name_len, const char *args, signed long args_len) { 647 | 648 | spall_buffer->current_depth += 1; 649 | spall_buffer->max_depth = SPALL_MAX(spall_buffer->max_depth, spall_buffer->current_depth); 650 | 651 | uint16_t trunc_name_len = (uint16_t)SPALL_MIN(name_len, UINT16_MAX); 652 | uint16_t trunc_args_len = (uint16_t)SPALL_MIN(args_len, UINT16_MAX); 653 | uint64_t name_len_size = (trunc_name_len > 255) ? 2 : 1; 654 | uint64_t args_len_size = (trunc_args_len > 255) ? 2 : 1; 655 | 656 | uint64_t event_tail = trunc_name_len + name_len_size + trunc_args_len + args_len_size; 657 | if ((spall_buffer->head + sizeof(SpallAutoBeginEvent) + event_tail) > spall_buffer->sub_length) { 658 | if (!spall_auto_buffer_flush()) { 659 | return false; 660 | } 661 | } 662 | 663 | size_t data_start = spall_buffer->write_half ? spall_buffer->sub_length : 0; 664 | uint8_t *ev_buffer = (spall_buffer->data + data_start) + spall_buffer->head; 665 | 666 | uint64_t now = spall_get_clock(); 667 | if (spall_buffer->first_ts == 0) { 668 | spall_buffer->first_ts = now; 669 | spall_buffer->previous_ts = now; 670 | } 671 | uint64_t dt = now - spall_buffer->previous_ts; 672 | uint64_t dt_bits = spall_delta_to_bits(dt); 673 | uint64_t dt_size = 1 << dt_bits; 674 | 675 | // [extended tag | begin type | delta size | field lengths] 676 | uint8_t name_args_lens = ((name_len_size >> 1) << 1) | (args_len_size >> 1); 677 | uint8_t type_byte = (2 << 6) | (SpallAutoEventType_Begin << 4) | (dt_bits << 2) | name_args_lens; 678 | 679 | int i = 0; 680 | *(ev_buffer + i) = type_byte; i += 1; 681 | memcpy(ev_buffer + i, &dt, 8); i += dt_size; 682 | memcpy(ev_buffer + i, &trunc_name_len, name_len_size); i += name_len_size; 683 | memcpy(ev_buffer + i, &trunc_args_len, args_len_size); i += args_len_size; 684 | memcpy(ev_buffer + i, name, trunc_name_len); i += trunc_name_len; 685 | memcpy(ev_buffer + i, args, trunc_args_len); i += trunc_args_len; 686 | 687 | spall_buffer->previous_ts = now; 688 | spall_buffer->head += i; 689 | 690 | return true; 691 | } 692 | 693 | SPALL_NOINSTRUMENT SPALL_FORCEINLINE bool spall_auto_buffer_end(void) { 694 | return spall_buffer_micro_end(); 695 | } 696 | 697 | SPALL_NOINSTRUMENT SPALL_FORCEINLINE bool (spall_auto_thread_init)(uint32_t thread_id, size_t buffer_size) { 698 | if (buffer_size < 512) { return false; } 699 | if (spall_buffer != NULL) { return false; } 700 | 701 | spall_buffer = (SpallBuffer *)calloc(sizeof(SpallBuffer), 1); 702 | spall_buffer->data = (uint8_t *)malloc(buffer_size); 703 | spall_buffer->length = buffer_size; 704 | spall_buffer->thread_id = thread_id; 705 | spall_buffer->sub_length = buffer_size / 2; 706 | 707 | // removing initial page-fault bubbles to make the data a little more accurate, at the cost of thread spin-up time 708 | memset(spall_buffer->data, 1, spall_buffer->length); 709 | 710 | 
spall_buffer->writer.is_running = true; 711 | spall_thread_start(spall_buffer); 712 | 713 | spall_auto_buffer_flush(); 714 | spall_thread_running = true; 715 | return true; 716 | } 717 | 718 | void (spall_auto_thread_quit)(void) { 719 | spall_thread_running = false; 720 | spall_auto_buffer_flush(); 721 | 722 | spall_buffer->writer.is_running = false; 723 | spall_buffer->writer.ptr = 1; 724 | spall_signal(&spall_buffer->writer.ptr); 725 | spall_thread_end(spall_buffer); 726 | 727 | free(spall_buffer->data); 728 | free(spall_buffer); 729 | spall_buffer = NULL; 730 | } 731 | 732 | SPALL_FN void *spall_canonical_addr(void* fn) { 733 | // sometimes the pointer we get back is to a jump table; walk past that first layer. 734 | 735 | void *ret = fn; 736 | #if SPALL_IS_X64 737 | unsigned char *fn_data = (unsigned char *)fn; 738 | if (fn_data[0] == 0xE9) { 739 | // JMP rel32 740 | int32_t target = *(int32_t*) &fn_data[1]; 741 | 742 | int jump_inst_size = 5; 743 | ret = (void *)(fn_data + jump_inst_size + target); 744 | } 745 | #endif 746 | 747 | return ret; 748 | } 749 | 750 | 751 | SPALL_NOINSTRUMENT bool spall_auto_init(char *filename) { 752 | if (!filename) return false; 753 | memset(&spall_ctx, 0, sizeof(spall_ctx)); 754 | 755 | spall_ctx.file = fopen(filename, "wb"); // TODO: handle utf8 and long paths on windows 756 | if (spall_ctx.file) { // basically freopen() but we don't want to force users to lug along another macro define 757 | fclose(spall_ctx.file); 758 | spall_ctx.file = fopen(filename, "ab"); 759 | } 760 | if (!spall_ctx.file) { return false; } 761 | 762 | spall_ctx.stamp_scale = spall_get_clock_multiplier(); 763 | SpallHeader header = {0}; 764 | header.magic_header = 0xABADF00D; 765 | header.version = 2; 766 | header.timestamp_unit = spall_ctx.stamp_scale; 767 | header.known_address = (uint64_t)spall_canonical_addr((void *)spall_auto_init); 768 | 769 | char *program_path; 770 | if (!get_program_path(&program_path)) { return false; } 771 | uint16_t program_path_len = (uint16_t)strlen(program_path); 772 | 773 | header.program_path_len = program_path_len; 774 | 775 | size_t full_header_size = sizeof(SpallHeader) + (size_t)program_path_len; 776 | uint8_t *full_header = (uint8_t *)malloc(full_header_size); 777 | memcpy(full_header, &header, sizeof(SpallHeader)); 778 | memcpy(full_header + sizeof(SpallHeader), program_path, program_path_len); 779 | 780 | size_t write_ret = fwrite(full_header, 1, full_header_size, spall_ctx.file); 781 | if (write_ret < full_header_size) { return false; } 782 | 783 | free(full_header); 784 | return true; 785 | } 786 | 787 | SPALL_NOINSTRUMENT void spall_auto_quit(void) {} 788 | 789 | SPALL_NOINSTRUMENT void __cyg_profile_func_enter(void *fn, void *caller) { 790 | if (!spall_thread_running) { 791 | return; 792 | } 793 | fn = spall_canonical_addr(fn); 794 | 795 | spall_thread_running = false; 796 | spall_buffer_micro_begin((uint64_t)fn, (uint64_t)caller); 797 | spall_thread_running = true; 798 | } 799 | 800 | SPALL_NOINSTRUMENT void __cyg_profile_func_exit(void *fn, void *caller) { 801 | if (!spall_thread_running) { 802 | return; 803 | } 804 | 805 | spall_thread_running = false; 806 | spall_buffer_micro_end(); 807 | spall_thread_running = true; 808 | } 809 | 810 | #ifdef __cplusplus 811 | } 812 | #endif 813 | 814 | #endif 815 | #endif 816 | --------------------------------------------------------------------------------
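main.c only ever submits tasks with task.args = NULL, so here is a minimal sketch of handing each task its own argument block through the args pointer, assuming the pool.h API above; job_data and square_job are illustrative names, not part of this codebase.

#include <stdio.h>
#include "pool.h"

typedef struct {
    int input;
    int result;
} job_data;

// Task procedures always receive the owning pool (so they can spawn follow-up
// work) plus whatever pointer was stored in TPool_Task.args.
void square_job(TPool *pool, void *args) {
    (void)pool;
    job_data *job = (job_data *)args;
    job->result = job->input * job->input;
}

int main(void) {
    TPool pool = {0};
    tpool_init(&pool, 4);               // 4 child threads; the calling thread is worker 0

    job_data jobs[16];
    for (int i = 0; i < 16; i++) {
        jobs[i].input = i;
        TPool_Task task;
        task.do_work = square_job;
        task.args = &jobs[i];           // each task owns one slot, so no extra synchronization is needed
        tpool_add_task(&pool, task);
    }

    tpool_wait(&pool);                  // the calling thread helps drain the queues before this returns

    for (int i = 0; i < 16; i++) {
        printf("%d squared is %d\n", jobs[i].input, jobs[i].result);
    }

    tpool_destroy(&pool);
    return 0;
}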
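The commented-out little_work variant in main.c indexes thread_results with tpool_current_thread_idx. A short sketch of that pattern: with N child threads the pool runs N + 1 workers, because the thread that calls tpool_init and tpool_wait also executes tasks (which is why a 9-entry thread_results array sits next to tpool_init(&pool, 8)), so a per-worker accumulator needs N + 1 slots and no atomics. counting_task and per_thread_count are illustrative names.

#include "pool.h"

#define CHILD_THREADS 8

static long per_thread_count[CHILD_THREADS + 1];   // one slot per worker, including the main thread

void counting_task(TPool *pool, void *args) {
    (void)pool; (void)args;
    // Each worker only ever touches its own slot, so a plain increment is race-free.
    per_thread_count[tpool_current_thread_idx] += 1;
}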
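spall_native_auto.h also declares a manual region API (spall_auto_buffer_begin / spall_auto_buffer_end) and spall_auto_set_thread_instrumenting, none of which main.c exercises. A hedged sketch of naming a region by hand, assuming spall_auto_init and spall_auto_thread_init have already run on the calling thread as in main.c; load_assets is an illustrative label only.

#ifdef ENABLE_TRACING
#include <string.h>
#include "spall_native_auto.h"

static void traced_section(void) {
    const char *name = "load_assets";

    // Emits a named begin event into this thread's spall buffer; this requires
    // that spall_auto_thread_init() was already called on this thread.
    spall_auto_buffer_begin(name, (signed long)strlen(name), "", 0);

    // ... the work being measured goes here ...

    spall_auto_buffer_end();
}
#endif

spall_auto_set_thread_instrumenting(false) can likewise be used to temporarily mute the automatic __cyg_profile_func_enter/exit events around especially hot code on the current thread, and spall_auto_set_thread_instrumenting(true) turns them back on.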