├── .gitignore ├── LICENSE ├── README.md ├── build.bat ├── build.sh ├── main.c ├── pool.h └── spall_native_auto.h /.gitignore: -------------------------------------------------------------------------------- 1 | pool 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Colin Davidson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # workpool 2 | A cross-platform (Windows, Linux, OSX, FreeBSD, OpenBSD) clang, gcc, and msvc compatible work-stealing threadpool 3 | 4 | ![scaling_demo](https://user-images.githubusercontent.com/6327402/210031238-7394fc2b-2867-4dab-8d32-049489e80528.png) 5 | -------------------------------------------------------------------------------- /build.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | clang -fuse-ld=lld -O3 -g -o pool.exe -D_CRT_SECURE_NO_WARNINGS -finstrument-functions main.c 4 | 5 | rem cl main.c /O2 /Zi /GH /Gh /diagnostics:caret /nologo /Fe:pool.exe 6 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | OS=$(uname) 2 | 3 | OS_FLAGS="" 4 | case $OS in 5 | Darwin) 6 | OS_FLAGS="-framework System" 7 | ;; 8 | esac 9 | 10 | OPT_FLAGS="" 11 | case $1 in 12 | -t) 13 | OPT_FLAGS+="-finstrument-functions -D ENABLE_TRACING" 14 | shift 15 | ;; 16 | esac 17 | 18 | clang -g -O3 -Wall -o pool $OS_FLAGS -ldl -lpthread $OPT_FLAGS main.c 19 | -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | #include "pool.h" 2 | 3 | #if defined(_MSC_VER) 4 | #define ATOMIC_INC32(val) (_InterlockedIncrement(&(val))) 5 | #else 6 | #define ATOMIC_INC32(val) (atomic_fetch_add_explicit(&val, 1, memory_order_relaxed)) 7 | #endif 8 | 9 | #ifdef ENABLE_TRACING 10 | #include "spall_native_auto.h" 11 | #endif 12 | 13 | int thread_results[9] = {0}; 14 | _Atomic static int total_tasks = 0; 15 | void little_work(TPool *pool, void *args) { 16 | // this is my workload. 
enjoy 17 | 18 | #ifndef _WIN32 19 | int sleep_time = rand() % 201; 20 | usleep(sleep_time); 21 | #else 22 | static float aaa[10000]; 23 | for (size_t i = 0; i < 10000; i++) { 24 | aaa[i] = (rand() % 2000) * 0.25; 25 | } 26 | #endif 27 | 28 | if (total_tasks < 2000) { 29 | for (int i = 0; i < 5; i++) { 30 | TPool_Task task; 31 | task.do_work = little_work; 32 | task.args = NULL; 33 | tpool_add_task(pool, task); 34 | } 35 | } 36 | 37 | ATOMIC_INC32(total_tasks); 38 | } 39 | 40 | /* 41 | void little_work(TPool *pool, void *args) { 42 | thread_results[tpool_current_thread_idx] += 1; 43 | usleep(2); 44 | } 45 | */ 46 | 47 | int main(void) { 48 | #ifdef ENABLE_TRACING 49 | spall_auto_init((char *)"profile.spall"); 50 | spall_auto_thread_init(0, SPALL_DEFAULT_BUFFER_SIZE); 51 | #endif 52 | srand(1); 53 | 54 | TPool pool = {0}; 55 | tpool_init(&pool, 8); 56 | 57 | int initial_task_count = 10; 58 | for (int i = 0; i < initial_task_count; i++) { 59 | TPool_Task task; 60 | task.do_work = little_work; 61 | task.args = NULL; 62 | tpool_add_task(&pool, task); 63 | } 64 | tpool_wait(&pool); 65 | 66 | int total_tasks = 0; 67 | for (int i = 0; i < 9; i++) { 68 | total_tasks += thread_results[i]; 69 | } 70 | printf("%d\n", total_tasks); 71 | 72 | /* 73 | total_tasks = 0; 74 | for (int i = 0; i < initial_task_count; i++) { 75 | TPool_Task task; 76 | task.do_work = little_work; 77 | task.args = NULL; 78 | tpool_add_task(&pool, task); 79 | } 80 | tpool_wait(&pool); 81 | 82 | total_tasks = 0; 83 | for (int i = 0; i < initial_task_count; i++) { 84 | TPool_Task task; 85 | task.do_work = little_work; 86 | task.args = NULL; 87 | tpool_add_task(&pool, task); 88 | } 89 | tpool_wait(&pool); 90 | */ 91 | tpool_destroy(&pool); 92 | 93 | #ifdef ENABLE_TRACING 94 | spall_auto_thread_quit(); 95 | spall_auto_quit(); 96 | #endif 97 | } 98 | 99 | #ifdef ENABLE_TRACING 100 | #define SPALL_AUTO_IMPLEMENTATION 101 | #include "spall_native_auto.h" 102 | #endif 103 | -------------------------------------------------------------------------------- /pool.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef ENABLE_TRACING 9 | #include "spall_native_auto.h" 10 | #endif 11 | 12 | // cross-platform thread wrappers, because microsoft couldn't be arsed to take 5 seconds and 13 | // do this and save all the junior devs and codebases everywhere from this pile of nonsense. 14 | #if defined(__linux__) || defined(__APPLE__) 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | typedef pthread_t TPool_ThreadHandle; 22 | 23 | #define tpool_thread_start(t) pthread_create(&(t)->thread, NULL, _tpool_worker, (void *) (t)) 24 | #define tpool_thread_end(t) pthread_join((t)->thread, NULL) 25 | 26 | #elif defined(_WIN32) 27 | 28 | #include 29 | #include 30 | 31 | typedef ptrdiff_t ssize_t; 32 | typedef HANDLE TPool_ThreadHandle; 33 | 34 | #define tpool_thread_start(t) ((t)->thread = (HANDLE) _beginthread(_tpool_worker, 0, t)) 35 | #define tpool_thread_end(t) WaitForSingleObject((t)->thread, INFINITE) 36 | 37 | #endif 38 | 39 | // MSVC only took 11 years to put C11 atomics in, (despite the fact that MSVC/C++11 has them). 
40 | // This is the pain we suffer because microsoft got lazy 41 | #if defined(_MSC_VER) 42 | 43 | #define TPool_Thread_Local __declspec(thread) 44 | #define TPool_Atomic volatile 45 | 46 | #define TPOOL_LOAD(val) val 47 | #define TPOOL_CAS(addr, expected, desired) (InterlockedCompareExchange64(addr, desired, expected) == expected) 48 | #define TPOOL_ATOMIC_FUTEX_INC(val) (_InterlockedIncrement64(&(val))) 49 | #define TPOOL_ATOMIC_FUTEX_DEC(val) (_InterlockedDecrement64(&(val))) 50 | 51 | #else 52 | 53 | #include 54 | 55 | #define TPool_Thread_Local _Thread_local 56 | #define TPool_Atomic _Atomic 57 | 58 | #define TPOOL_LOAD(val) atomic_load(&val) 59 | #define TPOOL_CAS(addr, expected, desired) atomic_compare_exchange_weak(addr, &expected, desired) 60 | #define TPOOL_ATOMIC_FUTEX_INC(val) (atomic_fetch_add_explicit(&val, 1, memory_order_acquire)) 61 | #define TPOOL_ATOMIC_FUTEX_DEC(val) (atomic_fetch_sub_explicit(&val, 1, memory_order_acquire)) 62 | #define __debugbreak() __builtin_trap() 63 | 64 | #endif 65 | 66 | // cross-platform futex, because we can't just have nice things. All the popular platforms have them under the hood, 67 | // but giving them to users? NO! Users are too stupid to have nice things, save them for the fedora-wearing elite. 68 | #if defined(__linux__) 69 | 70 | #include 71 | #include 72 | 73 | typedef TPool_Atomic int32_t TPool_Futex; 74 | 75 | void _tpool_signal(TPool_Futex *addr) { 76 | int ret = syscall(SYS_futex, addr, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1, NULL, NULL, 0); 77 | if (ret == -1) { 78 | perror("Futex wake"); 79 | __debugbreak(); 80 | } 81 | } 82 | void _tpool_broadcast(TPool_Futex *addr) { 83 | int ret = syscall(SYS_futex, addr, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, INT32_MAX, NULL, NULL, 0); 84 | if (ret == -1) { 85 | perror("Futex wake"); 86 | __debugbreak(); 87 | } 88 | } 89 | 90 | void _tpool_wait(TPool_Futex *addr, TPool_Futex val) { 91 | for (;;) { 92 | int ret = syscall(SYS_futex, addr, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, val, NULL, NULL, 0); 93 | if (ret == -1) { 94 | if (errno != EAGAIN) { 95 | perror("Futex wait"); 96 | __debugbreak(); 97 | } else { 98 | return; 99 | } 100 | } else if (ret == 0) { 101 | if (*addr != val) { 102 | return; 103 | } 104 | } 105 | } 106 | } 107 | 108 | #elif defined(__APPLE__) 109 | 110 | typedef TPool_Atomic int64_t TPool_Futex; 111 | 112 | #define UL_COMPARE_AND_WAIT 0x00000001 113 | #define ULF_WAKE_ALL 0x00000100 114 | #define ULF_NO_ERRNO 0x01000000 115 | 116 | /* timeout is specified in microseconds */ 117 | int __ulock_wait(uint32_t operation, void *addr, uint64_t value, uint32_t timeout); 118 | int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value); 119 | 120 | void _tpool_signal(TPool_Futex *addr) { 121 | for (;;) { 122 | int ret = __ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, addr, 0); 123 | if (ret >= 0) { 124 | return; 125 | } 126 | ret = -ret; 127 | if (ret == EINTR || ret == EFAULT) { 128 | continue; 129 | } 130 | if (ret == ENOENT) { 131 | return; 132 | } 133 | printf("futex wake fail?\n"); 134 | __debugbreak(); 135 | } 136 | } 137 | 138 | void _tpool_broadcast(TPool_Futex *addr) { 139 | for (;;) { 140 | int ret = __ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO | ULF_WAKE_ALL, addr, 0); 141 | if (ret >= 0) { 142 | return; 143 | } 144 | ret = -ret; 145 | if (ret == EINTR || ret == EFAULT) { 146 | continue; 147 | } 148 | if (ret == ENOENT) { 149 | return; 150 | } 151 | printf("futex wake fail?\n"); 152 | __debugbreak(); 153 | } 154 | } 155 | 156 | void _tpool_wait(TPool_Futex *addr, 
TPool_Futex val) { 157 | for (;;) { 158 | int ret = __ulock_wait(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, addr, val, 0); 159 | if (ret >= 0) { 160 | if (*addr != val) { 161 | return; 162 | } 163 | continue; 164 | } 165 | ret = -ret; 166 | if (ret == EINTR || ret == EFAULT) { 167 | continue; 168 | } 169 | if (ret == ENOENT) { 170 | return; 171 | } 172 | 173 | printf("futex wait fail?\n"); 174 | __debugbreak(); 175 | } 176 | } 177 | 178 | #elif defined(_WIN32) 179 | typedef TPool_Atomic int64_t TPool_Futex; 180 | 181 | void _tpool_signal(TPool_Futex *addr) { 182 | WakeByAddressSingle((void *)addr); 183 | } 184 | 185 | void _tpool_broadcast(TPool_Futex *addr) { 186 | WakeByAddressAll((void *)addr); 187 | } 188 | 189 | void _tpool_wait(TPool_Futex *addr, TPool_Futex val) { 190 | for (;;) { 191 | int ret = WaitOnAddress(addr, (void *)&val, sizeof(val), INFINITE); 192 | if (*addr != val) break; 193 | } 194 | } 195 | 196 | #elif defined(__FreeBSD__) 197 | 198 | #include 199 | #include 200 | 201 | typedef TPool_Atomic int32_t TPool_Futex; 202 | 203 | void _tpool_signal(TPool_Futex *addr) { 204 | _umtx_op(addr, UMTX_OP_WAKE, 1, 0, 0); 205 | } 206 | 207 | void _tpool_broadcast(TPool_Futex *addr) { 208 | _umtx_op(addr, UMTX_OP_WAKE, INT32_MAX, 0, 0); 209 | } 210 | 211 | void _tpool_wait(TPool_Futex *addr, TPool_Futex val) { 212 | for (;;) { 213 | int ret = _umtx_op(addr, UMTX_OP_WAIT_UINT, val, 0, NULL); 214 | if (ret == 0) { 215 | if (errno == ETIMEDOUT || errno == EINTR) { 216 | continue; 217 | } 218 | 219 | perror("Futex wait"); 220 | __debugbreak(); 221 | } else if (ret == 0) { 222 | if (*addr != val) { 223 | return; 224 | } 225 | } 226 | } 227 | } 228 | 229 | #elif defined(__OpenBSD__) 230 | 231 | #include 232 | 233 | typedef TPool_Atomic int32_t TPool_Futex; 234 | 235 | void _tpool_signal(TPool_Futex *addr) { 236 | for (;;) { 237 | int ret = futex(addr, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1, NULL, NULL); 238 | if (ret == -1) { 239 | if (errno == ETIMEDOUT || errno == EINTR) { 240 | continue; 241 | } 242 | 243 | perror("Futex wake"); 244 | __debugbreak(); 245 | } else if (ret == 1) { 246 | return; 247 | } 248 | } 249 | } 250 | 251 | void _tpool_broadcast(TPool_Futex *addr) { 252 | for (;;) { 253 | int ret = futex(addr, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, INT32_MAX, NULL, NULL); 254 | if (ret == -1) { 255 | if (errno == ETIMEDOUT || errno == EINTR) { 256 | continue; 257 | } 258 | 259 | perror("Futex wake"); 260 | __debugbreak(); 261 | } else if (ret == 1) { 262 | return; 263 | } 264 | } 265 | } 266 | 267 | void _tpool_wait(TPool_Futex *addr, TPool_Futex val) { 268 | for (;;) { 269 | int ret = futex(addr, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, val, NULL, NULL); 270 | if (ret == -1) { 271 | if (*addr != val) { 272 | return; 273 | } 274 | 275 | if (errno == ETIMEDOUT || errno == EINTR) { 276 | continue; 277 | } 278 | 279 | perror("Futex wait"); 280 | __debugbreak(); 281 | } 282 | } 283 | } 284 | 285 | #endif 286 | 287 | struct TPool; 288 | typedef void tpool_task_proc(struct TPool *pool, void *data); 289 | TPool_Thread_Local int tpool_current_thread_idx; 290 | 291 | #define GRAB_SUCCESS 0 292 | #define GRAB_EMPTY 1 293 | #define GRAB_FAILED 2 294 | 295 | typedef struct TPool_Task { 296 | tpool_task_proc *do_work; 297 | void *args; 298 | } TPool_Task; 299 | 300 | typedef struct { 301 | TPool_Atomic ssize_t size; 302 | TPool_Task *buffer; 303 | } TPool_RingBuffer; 304 | 305 | typedef struct { 306 | TPool_Atomic ssize_t top; 307 | TPool_Atomic ssize_t bottom; 308 | 309 | TPool_Atomic(TPool_RingBuffer *) ring; 310 | } 
TPool_Queue; 311 | 312 | typedef struct TPool_Thread { 313 | TPool_ThreadHandle thread; 314 | int idx; 315 | 316 | TPool_Queue queue; 317 | struct TPool *pool; 318 | } TPool_Thread; 319 | 320 | typedef struct TPool { 321 | struct TPool_Thread *threads; 322 | 323 | int thread_count; 324 | TPool_Atomic bool running; 325 | 326 | TPool_Futex tasks_available; 327 | TPool_Futex tasks_left; 328 | } TPool; 329 | 330 | TPool_RingBuffer *tpool_ring_make(ssize_t size) { 331 | TPool_RingBuffer *ring = malloc(sizeof(TPool_RingBuffer)); 332 | ring->size = size; 333 | ring->buffer = calloc(ring->size, sizeof(TPool_Task)); 334 | return ring; 335 | } 336 | 337 | TPool_Queue tpool_queue_make(ssize_t size) { 338 | TPool_Queue d = {}; 339 | TPool_RingBuffer *ring = tpool_ring_make(size); 340 | atomic_store(&d.ring, ring); 341 | return d; 342 | } 343 | 344 | void tpool_queue_delete(TPool_Queue *q) { 345 | free(q->ring->buffer); 346 | free(q->ring); 347 | } 348 | 349 | TPool_RingBuffer *tpool_ring_grow(TPool_RingBuffer *ring, ssize_t bottom, ssize_t top) { 350 | TPool_RingBuffer *new_ring = tpool_ring_make(ring->size * 2); 351 | for (ssize_t i = top; i < bottom; i++) { 352 | new_ring->buffer[i % new_ring->size] = ring->buffer[i % ring->size]; 353 | } 354 | return new_ring; 355 | } 356 | 357 | void _thread_init(TPool *pool, TPool_Thread *thread, int idx) { 358 | thread->queue = tpool_queue_make(1 << 1); 359 | thread->pool = pool; 360 | thread->idx = idx; 361 | } 362 | 363 | void _tpool_queue_push(TPool_Thread *thread, TPool_Task task) { 364 | ssize_t bot = atomic_load_explicit(&thread->queue.bottom, memory_order_relaxed); 365 | ssize_t top = atomic_load_explicit(&thread->queue.top, memory_order_acquire); 366 | TPool_RingBuffer *cur_ring = atomic_load_explicit(&thread->queue.ring, memory_order_relaxed); 367 | 368 | ssize_t size = bot - top; 369 | if (size > (cur_ring->size - 1)) { 370 | // Queue is full 371 | thread->queue.ring = tpool_ring_grow(thread->queue.ring, bot, top); 372 | cur_ring = atomic_load_explicit(&thread->queue.ring, memory_order_relaxed); 373 | } 374 | 375 | cur_ring->buffer[bot % cur_ring->size] = task; 376 | atomic_thread_fence(memory_order_release); 377 | atomic_store_explicit(&thread->queue.bottom, bot + 1, memory_order_relaxed); 378 | 379 | TPOOL_ATOMIC_FUTEX_INC(thread->pool->tasks_left); 380 | TPOOL_ATOMIC_FUTEX_INC(thread->pool->tasks_available); 381 | _tpool_broadcast(&thread->pool->tasks_available); 382 | } 383 | 384 | int _tpool_queue_take(TPool_Thread *thread, TPool_Task *task) { 385 | ssize_t bot = atomic_load_explicit(&thread->queue.bottom, memory_order_relaxed) - 1; 386 | TPool_RingBuffer *cur_ring = atomic_load_explicit(&thread->queue.ring, memory_order_relaxed); 387 | atomic_store_explicit(&thread->queue.bottom, bot, memory_order_relaxed); 388 | atomic_thread_fence(memory_order_seq_cst); 389 | 390 | ssize_t top = atomic_load_explicit(&thread->queue.top, memory_order_relaxed); 391 | if (top <= bot) { 392 | // Queue is not empty 393 | 394 | *task = cur_ring->buffer[bot % cur_ring->size]; 395 | if (top == bot) { 396 | // Only one entry left in queue 397 | if (!atomic_compare_exchange_strong_explicit(&thread->queue.top, &top, top + 1, memory_order_seq_cst, memory_order_relaxed)) { 398 | // Race failed 399 | atomic_store_explicit(&thread->queue.bottom, bot + 1, memory_order_relaxed); 400 | return GRAB_EMPTY; 401 | } 402 | 403 | atomic_store_explicit(&thread->queue.bottom, bot + 1, memory_order_relaxed); 404 | return GRAB_SUCCESS; 405 | } 406 | 407 | // We got a task without hitting a 
race 408 | return GRAB_SUCCESS; 409 | } else { 410 | // Queue is empty 411 | atomic_store_explicit(&thread->queue.bottom, bot + 1, memory_order_relaxed); 412 | return GRAB_EMPTY; 413 | } 414 | } 415 | 416 | int _tpool_queue_steal(TPool_Thread *thread, TPool_Task *task) { 417 | ssize_t top = atomic_load_explicit(&thread->queue.top, memory_order_acquire); 418 | atomic_thread_fence(memory_order_seq_cst); 419 | ssize_t bot = atomic_load_explicit(&thread->queue.bottom, memory_order_acquire); 420 | 421 | int ret = GRAB_EMPTY; 422 | if (top < bot) { 423 | // Queue is not empty 424 | TPool_RingBuffer *cur_ring = atomic_load_explicit(&thread->queue.ring, memory_order_consume); 425 | *task = cur_ring->buffer[top % cur_ring->size]; 426 | 427 | if (!atomic_compare_exchange_strong_explicit(&thread->queue.top, &top, top + 1, memory_order_seq_cst, memory_order_relaxed)) { 428 | // Race failed 429 | ret = GRAB_FAILED; 430 | } else { 431 | ret = GRAB_SUCCESS; 432 | } 433 | } 434 | return ret; 435 | } 436 | 437 | #ifndef _WIN32 438 | void *_tpool_worker(void *ptr) 439 | #else 440 | void _tpool_worker(void *ptr) 441 | #endif 442 | { 443 | TPool_Task task; 444 | TPool_Thread *current_thread = (TPool_Thread *)ptr; 445 | tpool_current_thread_idx = current_thread->idx; 446 | TPool *pool = current_thread->pool; 447 | 448 | #ifdef ENABLE_TRACING 449 | spall_auto_thread_init(tpool_current_thread_idx, SPALL_DEFAULT_BUFFER_SIZE); 450 | #endif 451 | 452 | for (;;) { 453 | work_start: 454 | if (!pool->running) { 455 | break; 456 | } 457 | 458 | // If we've got tasks to process, work through them 459 | size_t finished_tasks = 0; 460 | while (!_tpool_queue_take(current_thread, &task)) { 461 | task.do_work(pool, task.args); 462 | TPOOL_ATOMIC_FUTEX_DEC(pool->tasks_left); 463 | 464 | finished_tasks += 1; 465 | } 466 | if (finished_tasks > 0 && !TPOOL_LOAD(pool->tasks_left)) { 467 | _tpool_signal(&pool->tasks_left); 468 | } 469 | 470 | // If there's still work somewhere and we don't have it, steal it 471 | if (TPOOL_LOAD(pool->tasks_left)) { 472 | int idx = current_thread->idx; 473 | for (int i = 0; i < pool->thread_count; i++) { 474 | if (!TPOOL_LOAD(pool->tasks_left)) { 475 | break; 476 | } 477 | 478 | idx = (idx + 1) % pool->thread_count; 479 | TPool_Thread *thread = &pool->threads[idx]; 480 | 481 | TPool_Task task; 482 | int ret = _tpool_queue_steal(thread, &task); 483 | if (ret == GRAB_FAILED) { 484 | goto work_start; 485 | } else if (ret == GRAB_EMPTY) { 486 | continue; 487 | } 488 | 489 | task.do_work(pool, task.args); 490 | TPOOL_ATOMIC_FUTEX_DEC(pool->tasks_left); 491 | 492 | if (!TPOOL_LOAD(pool->tasks_left)) { 493 | _tpool_signal(&pool->tasks_left); 494 | } 495 | 496 | goto work_start; 497 | } 498 | } 499 | 500 | // if we've done all our work, and there's nothing to steal, go to sleep 501 | int32_t state = TPOOL_LOAD(pool->tasks_available); 502 | if (!pool->running) { break; } 503 | _tpool_wait(&pool->tasks_available, state); 504 | } 505 | 506 | #ifdef ENABLE_TRACING 507 | spall_auto_thread_quit(); 508 | #endif 509 | 510 | #ifndef _WIN32 511 | return NULL; 512 | #endif 513 | } 514 | 515 | void tpool_add_task(TPool *pool, TPool_Task task) { 516 | TPool_Thread *current_thread = &pool->threads[tpool_current_thread_idx]; 517 | _tpool_queue_push(current_thread, task); 518 | } 519 | 520 | void tpool_wait(TPool *pool) { 521 | TPool_Task task; 522 | TPool_Thread *current_thread = &pool->threads[tpool_current_thread_idx]; 523 | 524 | while (TPOOL_LOAD(pool->tasks_left)) { 525 | 526 | // if we've got tasks on our queue, 
run them 527 | while (!_tpool_queue_take(current_thread, &task)) { 528 | task.do_work(pool, task.args); 529 | TPOOL_ATOMIC_FUTEX_DEC(pool->tasks_left); 530 | } 531 | 532 | 533 | // is this mem-barriered enough? 534 | // This *must* be executed in this order, so the futex wakes immediately 535 | // if rem_tasks has changed since we checked last, otherwise the program 536 | // will permanently sleep 537 | TPool_Futex rem_tasks = TPOOL_LOAD(pool->tasks_left); 538 | if (!rem_tasks) { 539 | break; 540 | } 541 | 542 | _tpool_wait(&pool->tasks_left, rem_tasks); 543 | } 544 | 545 | } 546 | 547 | void tpool_init(TPool *pool, int child_thread_count) { 548 | int thread_count = child_thread_count + 1; 549 | pool->thread_count = thread_count; 550 | pool->threads = malloc(sizeof(TPool_Thread) * pool->thread_count); 551 | 552 | pool->running = true; 553 | 554 | // setup the main thread 555 | _thread_init(pool, &pool->threads[0], 0); 556 | tpool_current_thread_idx = 0; 557 | 558 | for (int i = 1; i < pool->thread_count; i++) { 559 | _thread_init(pool, &pool->threads[i], i); 560 | tpool_thread_start(&pool->threads[i]); 561 | } 562 | } 563 | 564 | void tpool_destroy(TPool *pool) { 565 | pool->running = false; 566 | for (int i = 1; i < pool->thread_count; i++) { 567 | TPOOL_ATOMIC_FUTEX_INC(pool->tasks_available); 568 | _tpool_broadcast(&pool->tasks_available); 569 | tpool_thread_end(&pool->threads[i]); 570 | } 571 | for (int i = 0; i < pool->thread_count; i++) { 572 | tpool_queue_delete(&pool->threads[i].queue); 573 | } 574 | 575 | free(pool->threads); 576 | } 577 | -------------------------------------------------------------------------------- /spall_native_auto.h: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: © 2024 Colin Davidson 2 | // SPDX-License-Identifier: MIT 3 | 4 | #ifndef SPALL_AUTO_H 5 | #define SPALL_AUTO_H 6 | 7 | // THIS IS EXPERIMENTAL, BUT VERY HANDY 8 | // *should* work on clang and gcc on Windows, Mac, and Linux 9 | 10 | #define SPALL_IS_WINDOWS 0 11 | #define SPALL_IS_DARWIN 0 12 | #define SPALL_IS_LINUX 0 13 | #define SPALL_IS_GCC 0 14 | #define SPALL_IS_CLANG 0 15 | #define SPALL_IS_CPP 0 16 | #define SPALL_IS_X64 0 17 | #define SPALL_IS_ARM64 0 18 | 19 | #ifdef __cplusplus 20 | #undef SPALL_IS_CPP 21 | #define SPALL_IS_CPP 1 22 | #endif 23 | 24 | #if defined(__clang__) 25 | #undef SPALL_IS_CLANG 26 | #define SPALL_IS_CLANG 1 27 | #endif 28 | #if defined(_WIN32) 29 | #undef SPALL_IS_WINDOWS 30 | #define SPALL_IS_WINDOWS 1 31 | #elif defined(__APPLE__) 32 | #undef SPALL_IS_DARWIN 33 | #define SPALL_IS_DARWIN 1 34 | #elif defined(__linux__) 35 | #undef SPALL_IS_LINUX 36 | #define SPALL_IS_LINUX 1 37 | #endif 38 | #ifdef __GNUC__ 39 | #undef SPALL_IS_GCC 40 | #define SPALL_IS_GCC 1 41 | #endif 42 | #if defined(__x86_64__) || defined(_M_AMD64) 43 | #undef SPALL_IS_X64 44 | #define SPALL_IS_X64 1 45 | #elif defined(__aarch64__) 46 | #undef SPALL_IS_ARM64 47 | #define SPALL_IS_ARM64 1 48 | #endif 49 | 50 | #if (!SPALL_IS_CLANG && !SPALL_IS_GCC) 51 | #error "Compiler not supported!" 
52 | #endif 53 | 54 | #ifdef __cplusplus 55 | extern "C" { 56 | #endif 57 | 58 | #include 59 | #include 60 | #include 61 | 62 | bool spall_auto_init(char *filename); 63 | void spall_auto_quit(void); 64 | bool spall_auto_thread_init(uint32_t thread_id, size_t buffer_size); 65 | void spall_auto_thread_quit(void); 66 | 67 | bool spall_auto_buffer_begin(const char *name, signed long name_len, const char *args, signed long args_len); 68 | bool spall_auto_buffer_end(void); 69 | bool spall_auto_buffer_flush(void); 70 | 71 | void spall_auto_set_thread_instrumenting(bool on); 72 | 73 | #if SPALL_IS_GCC && SPALL_IS_CPP 74 | #define _Thread_local thread_local 75 | #endif 76 | 77 | #define SPALL_DEFAULT_BUFFER_SIZE (32 * 1024 * 1024) 78 | #define SPALL_MIN(a, b) (((a) < (b)) ? (a) : (b)) 79 | #define SPALL_MAX(a, b) (((a) > (b)) ? (a) : (b)) 80 | 81 | #ifdef __cplusplus 82 | } 83 | #endif 84 | #endif // endif SPALL_AUTO_H 85 | 86 | #ifdef SPALL_AUTO_IMPLEMENTATION 87 | #ifndef SPALL_AUTO_IMPLEMENTED_H 88 | #define SPALL_AUTO_IMPLEMENTED_H 89 | 90 | #if !SPALL_IS_WINDOWS 91 | #if SPALL_IS_CPP 92 | #include 93 | #else 94 | #include 95 | #endif 96 | #endif 97 | 98 | #ifdef __cplusplus 99 | extern "C" { 100 | #endif 101 | 102 | #include 103 | #include 104 | #include 105 | #include 106 | 107 | 108 | #if !SPALL_IS_WINDOWS 109 | #include 110 | #include 111 | #include 112 | #include 113 | #endif 114 | 115 | #if SPALL_IS_WINDOWS 116 | #include 117 | #include 118 | 119 | typedef ptrdiff_t ssize_t; 120 | typedef HANDLE Spall_ThreadHandle; 121 | 122 | #define spall_thread_start(t) ((t)->writer.thread = (HANDLE) _beginthread(spall_writer, 0, t)) 123 | #define spall_thread_end(t) WaitForSingleObject((t)->writer.thread, INFINITE) 124 | #else 125 | typedef pthread_t Spall_ThreadHandle; 126 | #define spall_thread_start(t) pthread_create(&(t)->writer.thread, NULL, spall_writer, (void *) (t)) 127 | #define spall_thread_end(t) pthread_join((t)->writer.thread, NULL) 128 | #endif 129 | 130 | #define SPALL_NOINSTRUMENT __attribute__((no_instrument_function)) 131 | #define SPALL_FORCEINLINE __attribute__((always_inline)) 132 | #define __debugbreak() __builtin_trap() 133 | 134 | #if SPALL_IS_CPP 135 | #define Spall_Atomic(X) std::atomic 136 | #else 137 | #define Spall_Atomic(X) _Atomic (X) 138 | #endif 139 | 140 | #define SPALL_FN static SPALL_NOINSTRUMENT 141 | 142 | #if SPALL_IS_X64 143 | #include 144 | SPALL_FN uint64_t spall_get_clock(void) { 145 | return __rdtsc(); 146 | } 147 | SPALL_FN void spall_pause(void) { 148 | _mm_pause(); 149 | } 150 | #elif SPALL_IS_ARM64 151 | SPALL_FN uint64_t spall_get_clock(void) { 152 | int64_t timer_val; 153 | asm volatile("mrs %0, cntvct_el0" : "=r"(timer_val)); 154 | return (uint64_t)timer_val; 155 | } 156 | SPALL_FN void spall_pause(void) { 157 | asm volatile("yield"); 158 | } 159 | #endif 160 | 161 | #pragma pack(push, 1) 162 | 163 | typedef struct SpallHeader { 164 | uint64_t magic_header; // = 0xABADF00D 165 | uint64_t version; // = 2 166 | double timestamp_unit; 167 | uint64_t known_address; // Address for spall_auto_init, for skew-correction 168 | uint16_t program_path_len; 169 | } SpallHeader; 170 | 171 | enum { 172 | SpallAutoEventType_Invalid = 0, 173 | SpallAutoEventType_Begin = 1, 174 | }; 175 | 176 | typedef struct SpallMicroBeginEventMax { 177 | uint8_t type; 178 | uint64_t ts; 179 | uint64_t caller; 180 | } SpallMicroBeginEventMax; 181 | 182 | typedef struct SpallMicroEndEventMax { 183 | uint8_t type; 184 | uint64_t ts; 185 | } SpallMicroEndEventMax; 186 | 187 | typedef 
struct SpallAutoBeginEvent { 188 | uint8_t type; 189 | uint64_t when; 190 | } SpallAutoBeginEvent; 191 | 192 | typedef struct SpallBufferHeader { 193 | uint32_t size; 194 | uint32_t tid; 195 | uint64_t first_ts; 196 | uint32_t max_depth; 197 | } SpallBufferHeader; 198 | 199 | #pragma pack(pop) 200 | 201 | SPALL_FN SPALL_FORCEINLINE uint64_t spall_delta_to_bits(uint64_t dt) { 202 | uint32_t count = 0; 203 | count += (dt >= 0x100); 204 | count += (dt >= 0x10000); 205 | count += (dt >= 0x100000000); 206 | return count; 207 | } 208 | 209 | typedef struct SpallProfile { 210 | double stamp_scale; 211 | FILE *file; 212 | } SpallProfile; 213 | 214 | typedef Spall_Atomic(uint64_t) Spall_Futex; 215 | typedef struct SpallBuffer { 216 | uint8_t *data; 217 | size_t length; 218 | 219 | // if true, write to upper-half, else lower-half 220 | size_t sub_length; 221 | bool write_half; 222 | 223 | struct { 224 | Spall_Atomic(bool) is_running; 225 | Spall_ThreadHandle thread; 226 | Spall_Atomic(uint64_t) ptr; 227 | Spall_Atomic(size_t) size; 228 | } writer; 229 | 230 | size_t head; 231 | uint32_t thread_id; 232 | 233 | uint64_t previous_ts; 234 | uint64_t first_ts; 235 | 236 | uint64_t previous_addr; 237 | uint64_t previous_caller; 238 | 239 | uint32_t current_depth; 240 | uint32_t max_depth; 241 | } SpallBuffer; 242 | 243 | 244 | // Cross-platform wrappers 245 | #if SPALL_IS_LINUX 246 | #include 247 | #include 248 | #include 249 | #include 250 | #include 251 | #include 252 | #include 253 | #include 254 | #include 255 | #include 256 | #include 257 | 258 | SPALL_FN bool get_program_path(char **out_path) { 259 | char path[PATH_MAX] = {0}; 260 | uint32_t size = sizeof(path); 261 | 262 | ssize_t buff_len = (ssize_t)readlink("/proc/self/exe", path, size - 1); 263 | if (buff_len == -1) { 264 | *out_path = NULL; 265 | return false; 266 | } 267 | 268 | char *post_path = (char *)calloc(PATH_MAX, 1); 269 | if (realpath(path, post_path) == NULL) { 270 | free(post_path); 271 | *out_path = NULL; 272 | return false; 273 | } 274 | 275 | *out_path = post_path; 276 | return true; 277 | } 278 | 279 | SPALL_FN uint64_t mul_u64_u32_shr(uint64_t cyc, uint32_t mult, uint32_t shift) { 280 | __uint128_t x = cyc; 281 | x *= mult; 282 | x >>= shift; 283 | return (uint64_t)x; 284 | } 285 | 286 | SPALL_FN long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { 287 | return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); 288 | } 289 | 290 | #if SPALL_IS_X64 291 | SPALL_FN double spall_get_clock_multiplier(void) { 292 | struct perf_event_attr pe = { 293 | .type = PERF_TYPE_HARDWARE, 294 | .size = sizeof(struct perf_event_attr), 295 | .config = PERF_COUNT_HW_INSTRUCTIONS, 296 | .disabled = 1, 297 | .exclude_kernel = 1, 298 | .exclude_hv = 1 299 | }; 300 | 301 | int fd = (int)perf_event_open(&pe, 0, -1, -1, 0); 302 | if (fd == -1) { 303 | perror("perf_event_open failed"); 304 | return 1; 305 | } 306 | void *addr = mmap(NULL, 4*1024, PROT_READ, MAP_SHARED, fd, 0); 307 | if (!addr) { 308 | perror("mmap failed"); 309 | return 1; 310 | } 311 | struct perf_event_mmap_page *pc = (struct perf_event_mmap_page *)addr; 312 | if (pc->cap_user_time != 1) { 313 | fprintf(stderr, "Perf system doesn't support user time\n"); 314 | return 1; 315 | } 316 | double nanos = (double)mul_u64_u32_shr(1000000000000000ull, pc->time_mult, pc->time_shift); 317 | double multiplier = nanos / 1000000000000000.0; 318 | return multiplier; 319 | } 320 | #endif 321 | 322 | 323 | SPALL_FN 
SPALL_FORCEINLINE void spall_signal(Spall_Futex *addr) { 324 | long ret = syscall(SYS_futex, addr, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1, NULL, NULL, 0); 325 | if (ret == -1) { 326 | perror("Futex wake"); 327 | __debugbreak(); 328 | } 329 | } 330 | 331 | SPALL_FN SPALL_FORCEINLINE void spall_wait(Spall_Futex *addr, uint64_t val) { 332 | for (;;) { 333 | long ret = syscall(SYS_futex, addr, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, val, NULL, NULL, 0); 334 | if (ret == -1) { 335 | if (errno != EAGAIN) { 336 | perror("Futex wait"); 337 | __debugbreak(); 338 | } else { 339 | return; 340 | } 341 | } else if (ret == 0) { 342 | return; 343 | } 344 | } 345 | } 346 | 347 | #elif SPALL_IS_DARWIN 348 | 349 | #include 350 | #include 351 | #include 352 | 353 | #if SPALL_IS_X64 354 | SPALL_FN double spall_get_clock_multiplier(void) { 355 | uint64_t freq; 356 | size_t size = sizeof(freq); 357 | 358 | sysctlbyname("machdep.tsc.frequency", &freq, &size, NULL, 0); 359 | return 1000000000.0 / (double)freq; 360 | } 361 | #elif SPALL_IS_ARM64 362 | SPALL_FN double spall_get_clock_multiplier(void) { 363 | uint64_t freq_val; 364 | asm volatile("mrs %0, cntfrq_el0" : "=r"(freq_val)); 365 | 366 | double multiplier = 1000000000.0 / (double)freq_val; 367 | return multiplier; 368 | } 369 | #endif 370 | 371 | SPALL_FN bool get_program_path(char **out_path) { 372 | char pre_path[1025]; 373 | uint32_t size = sizeof(pre_path); 374 | if (_NSGetExecutablePath(pre_path, &size) == -1) { 375 | *out_path = NULL; 376 | return false; 377 | } 378 | 379 | char *post_path = (char *)malloc(1025); 380 | if (realpath(pre_path, post_path) == NULL) { 381 | free(post_path); 382 | *out_path = NULL; 383 | return false; 384 | } 385 | 386 | *out_path = post_path; 387 | return true; 388 | } 389 | 390 | #define UL_COMPARE_AND_WAIT 0x00000001 391 | #define ULF_WAKE_ALL 0x00000100 392 | #define ULF_NO_ERRNO 0x01000000 393 | 394 | /* timeout is specified in microseconds */ 395 | int __ulock_wait(uint32_t operation, void *addr, uint64_t value, uint32_t timeout); 396 | int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value); 397 | 398 | SPALL_FN SPALL_FORCEINLINE void spall_signal(Spall_Futex *addr) { 399 | for (;;) { 400 | int ret = __ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, addr, 0); 401 | if (ret >= 0) { 402 | return; 403 | } 404 | ret = -ret; 405 | if (ret == EINTR || ret == EFAULT) { 406 | continue; 407 | } 408 | if (ret == ENOENT) { 409 | return; 410 | } 411 | printf("futex signal fail?\n"); 412 | __debugbreak(); 413 | } 414 | } 415 | 416 | SPALL_FN SPALL_FORCEINLINE void spall_wait(Spall_Futex *addr, uint64_t val) { 417 | for (;;) { 418 | int ret = __ulock_wait(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, addr, val, 0); 419 | if (ret >= 0) { 420 | return; 421 | } 422 | ret = -ret; 423 | if (ret == EINTR || ret == EFAULT) { 424 | continue; 425 | } 426 | if (ret == ENOENT) { 427 | return; 428 | } 429 | 430 | printf("futex wait fail? 
%d\n", ret); 431 | __debugbreak(); 432 | } 433 | } 434 | 435 | #elif SPALL_IS_WINDOWS 436 | 437 | SPALL_FN bool get_program_path(char **out_path) { 438 | char *post_path = (char *)calloc(MAX_PATH, 1); 439 | if (GetModuleFileNameA(NULL, post_path, MAX_PATH) == 0) { 440 | *out_path = NULL; 441 | return false; 442 | } 443 | 444 | *out_path = post_path; 445 | return true; 446 | } 447 | 448 | SPALL_FN SPALL_FORCEINLINE double spall_get_clock_multiplier(void) { 449 | 450 | // Cache the answer so that multiple calls never take the slow path more than once 451 | static double multiplier = 0; 452 | if (multiplier) { 453 | return multiplier; 454 | } 455 | 456 | uint64_t tsc_freq = 0; 457 | 458 | // Get time before sleep 459 | uint64_t qpc_begin = 0; QueryPerformanceCounter((LARGE_INTEGER *)&qpc_begin); 460 | uint64_t tsc_begin = spall_get_clock(); 461 | 462 | Sleep(2); 463 | 464 | // Get time after sleep 465 | uint64_t qpc_end = qpc_begin + 1; QueryPerformanceCounter((LARGE_INTEGER *)&qpc_end); 466 | uint64_t tsc_end = spall_get_clock(); 467 | 468 | // Do the math to extrapolate the RDTSC ticks elapsed in 1 second 469 | uint64_t qpc_freq = 0; QueryPerformanceFrequency((LARGE_INTEGER *)&qpc_freq); 470 | tsc_freq = (tsc_end - tsc_begin) * qpc_freq / (qpc_end - qpc_begin); 471 | 472 | multiplier = 1000000000.0 / (double)tsc_freq; 473 | return multiplier; 474 | } 475 | 476 | SPALL_FN SPALL_FORCEINLINE void spall_signal(Spall_Futex *addr) { 477 | WakeByAddressSingle((void *)addr); 478 | } 479 | 480 | SPALL_FN SPALL_FORCEINLINE void spall_wait(Spall_Futex *addr, uint64_t val) { 481 | WaitOnAddress(addr, (void *)&val, sizeof(val), INFINITE); 482 | } 483 | 484 | #endif 485 | 486 | // Auto-tracing impl 487 | static SpallProfile spall_ctx; 488 | static _Thread_local SpallBuffer *spall_buffer = NULL; 489 | static _Thread_local bool spall_thread_running = false; 490 | 491 | SPALL_NOINSTRUMENT void spall_auto_set_thread_instrumenting(bool on) { 492 | spall_thread_running = on; 493 | } 494 | 495 | #if SPALL_IS_WINDOWS 496 | SPALL_FN void spall_writer(void *arg) { 497 | #else 498 | SPALL_FN void *spall_writer(void *arg) { 499 | #endif 500 | 501 | SpallBuffer *buffer = (SpallBuffer *)arg; 502 | while (buffer->writer.is_running) { 503 | spall_wait(&buffer->writer.ptr, 0); 504 | if (!buffer->writer.is_running) { break; } 505 | 506 | void *buffer_ptr = (void *)atomic_load(&buffer->writer.ptr); 507 | if (buffer_ptr == 0) { continue; } 508 | 509 | size_t size = (size_t)atomic_load(&buffer->writer.size); 510 | atomic_store(&buffer->writer.ptr, 0); 511 | 512 | fwrite(buffer_ptr, size, 1, spall_ctx.file); 513 | } 514 | 515 | #if !SPALL_IS_WINDOWS 516 | return NULL; 517 | #endif 518 | } 519 | 520 | SPALL_FN SPALL_FORCEINLINE bool spall__file_write(void *p, size_t n) { 521 | atomic_store(&spall_buffer->writer.size, n); 522 | atomic_store(&spall_buffer->writer.ptr, (uint64_t)p); 523 | spall_signal(&spall_buffer->writer.ptr); 524 | 525 | for (;;) { 526 | void *ptr = (void *)atomic_load(&spall_buffer->writer.ptr); 527 | if (ptr == 0) { break; } 528 | spall_pause(); 529 | } 530 | 531 | return true; 532 | } 533 | 534 | SPALL_NOINSTRUMENT SPALL_FORCEINLINE bool spall_auto_buffer_flush(void) { 535 | if (!spall_buffer) return false; 536 | 537 | size_t data_start = spall_buffer->write_half ? 
spall_buffer->sub_length : 0; 538 | 539 | SpallBufferHeader *sbp = (SpallBufferHeader *)(spall_buffer->data + data_start); 540 | if (spall_buffer->head > 0) { 541 | sbp->size = (uint32_t)(spall_buffer->head - sizeof(SpallBufferHeader)); 542 | sbp->first_ts = spall_buffer->first_ts; 543 | sbp->max_depth = spall_buffer->max_depth; 544 | if (!spall__file_write(spall_buffer->data + data_start, spall_buffer->head)) return false; 545 | 546 | spall_buffer->write_half = !spall_buffer->write_half; 547 | } 548 | 549 | data_start = spall_buffer->write_half ? spall_buffer->sub_length : 0; 550 | sbp = (SpallBufferHeader *)(spall_buffer->data + data_start); 551 | sbp->size = 0; 552 | sbp->first_ts = 0; 553 | sbp->tid = spall_buffer->thread_id; 554 | 555 | spall_buffer->head = sizeof(SpallBufferHeader); 556 | spall_buffer->first_ts = 0; 557 | spall_buffer->previous_ts = 0; 558 | spall_buffer->previous_addr = 0; 559 | spall_buffer->previous_caller = 0; 560 | return true; 561 | } 562 | 563 | SPALL_FN SPALL_FORCEINLINE bool spall_buffer_micro_begin(uint64_t addr, uint64_t caller) { 564 | spall_buffer->current_depth += 1; 565 | spall_buffer->max_depth = SPALL_MAX(spall_buffer->max_depth, spall_buffer->current_depth); 566 | 567 | size_t ev_size = sizeof(SpallMicroBeginEventMax); 568 | if ((spall_buffer->head + ev_size) > spall_buffer->sub_length) { 569 | if (!spall_auto_buffer_flush()) { 570 | return false; 571 | } 572 | } 573 | 574 | size_t data_start = spall_buffer->write_half ? spall_buffer->sub_length : 0; 575 | uint8_t *ev_buffer = (spall_buffer->data + data_start) + spall_buffer->head; 576 | 577 | uint64_t now = spall_get_clock(); 578 | if (spall_buffer->first_ts == 0) { 579 | spall_buffer->first_ts = now; 580 | spall_buffer->previous_ts = now; 581 | } 582 | 583 | uint64_t dt = now - spall_buffer->previous_ts; 584 | uint64_t d_addr = addr ^ spall_buffer->previous_addr; 585 | uint64_t d_caller = caller ^ spall_buffer->previous_caller; 586 | 587 | uint64_t dt_bits = spall_delta_to_bits(dt); 588 | uint64_t addr_bits = spall_delta_to_bits(d_addr); 589 | uint64_t caller_bits = spall_delta_to_bits(d_caller); 590 | 591 | uint64_t dt_size = 1 << dt_bits; 592 | uint64_t addr_size = 1 << addr_bits; 593 | uint64_t caller_size = 1 << caller_bits; 594 | 595 | // [begin event tag | size of ts | size of addr | size of caller] 596 | uint8_t type_byte = (0 << 6) | (dt_bits << 4) | (addr_bits << 2) | caller_bits; 597 | 598 | int i = 0; 599 | *(ev_buffer + i) = type_byte; i += 1; 600 | memcpy(ev_buffer + i, &dt, 8); i += dt_size; 601 | memcpy(ev_buffer + i, &d_addr, 8); i += addr_size; 602 | memcpy(ev_buffer + i, &d_caller, 8); i += caller_size; 603 | 604 | spall_buffer->previous_ts = now; 605 | spall_buffer->previous_addr = addr; 606 | spall_buffer->previous_caller = caller; 607 | spall_buffer->head += i; 608 | 609 | return true; 610 | } 611 | 612 | SPALL_FN SPALL_FORCEINLINE bool spall_buffer_micro_end(void) { 613 | uint64_t now = spall_get_clock(); 614 | spall_buffer->current_depth -= 1; 615 | 616 | size_t ev_size = sizeof(SpallMicroEndEventMax); 617 | if ((spall_buffer->head + ev_size) > spall_buffer->sub_length) { 618 | if (!spall_auto_buffer_flush()) { 619 | return false; 620 | } 621 | } 622 | if (spall_buffer->first_ts == 0) { 623 | spall_buffer->first_ts = now; 624 | spall_buffer->previous_ts = now; 625 | } 626 | 627 | size_t data_start = spall_buffer->write_half ? 
spall_buffer->sub_length : 0; 628 | uint8_t *ev_buffer = (spall_buffer->data + data_start) + spall_buffer->head; 629 | 630 | uint64_t dt = now - spall_buffer->previous_ts; 631 | uint64_t dt_bits = spall_delta_to_bits(dt); 632 | uint64_t dt_size = 1 << dt_bits; 633 | 634 | // [end event tag | size of ts] 635 | uint8_t type_byte = (1 << 6) | (dt_bits << 4); 636 | 637 | int i = 0; 638 | *(ev_buffer + i) = type_byte; i += 1; 639 | memcpy(ev_buffer + i, &dt, 8); i += dt_size; 640 | 641 | spall_buffer->previous_ts = now; 642 | spall_buffer->head += i; 643 | return true; 644 | } 645 | 646 | SPALL_NOINSTRUMENT SPALL_FORCEINLINE bool spall_auto_buffer_begin(const char *name, signed long name_len, const char *args, signed long args_len) { 647 | 648 | spall_buffer->current_depth += 1; 649 | spall_buffer->max_depth = SPALL_MAX(spall_buffer->max_depth, spall_buffer->current_depth); 650 | 651 | uint16_t trunc_name_len = (uint16_t)SPALL_MIN(name_len, UINT16_MAX); 652 | uint16_t trunc_args_len = (uint16_t)SPALL_MIN(args_len, UINT16_MAX); 653 | uint64_t name_len_size = (trunc_name_len > 255) ? 2 : 1; 654 | uint64_t args_len_size = (trunc_args_len > 255) ? 2 : 1; 655 | 656 | uint64_t event_tail = trunc_name_len + name_len_size + trunc_args_len + args_len_size; 657 | if ((spall_buffer->head + sizeof(SpallAutoBeginEvent) + event_tail) > spall_buffer->sub_length) { 658 | if (!spall_auto_buffer_flush()) { 659 | return false; 660 | } 661 | } 662 | 663 | size_t data_start = spall_buffer->write_half ? spall_buffer->sub_length : 0; 664 | uint8_t *ev_buffer = (spall_buffer->data + data_start) + spall_buffer->head; 665 | 666 | uint64_t now = spall_get_clock(); 667 | if (spall_buffer->first_ts == 0) { 668 | spall_buffer->first_ts = now; 669 | spall_buffer->previous_ts = now; 670 | } 671 | uint64_t dt = now - spall_buffer->previous_ts; 672 | uint64_t dt_bits = spall_delta_to_bits(dt); 673 | uint64_t dt_size = 1 << dt_bits; 674 | 675 | // [extended tag | begin type | delta size | field lengths] 676 | uint8_t name_args_lens = ((name_len_size >> 1) << 1) | (args_len_size >> 1); 677 | uint8_t type_byte = (2 << 6) | (SpallAutoEventType_Begin << 4) | (dt_bits << 2) | name_args_lens; 678 | 679 | int i = 0; 680 | *(ev_buffer + i) = type_byte; i += 1; 681 | memcpy(ev_buffer + i, &dt, 8); i += dt_size; 682 | memcpy(ev_buffer + i, &trunc_name_len, name_len_size); i += name_len_size; 683 | memcpy(ev_buffer + i, &trunc_args_len, args_len_size); i += args_len_size; 684 | memcpy(ev_buffer + i, name, trunc_name_len); i += trunc_name_len; 685 | memcpy(ev_buffer + i, args, trunc_args_len); i += trunc_args_len; 686 | 687 | spall_buffer->previous_ts = now; 688 | spall_buffer->head += i; 689 | 690 | return true; 691 | } 692 | 693 | SPALL_NOINSTRUMENT SPALL_FORCEINLINE bool spall_auto_buffer_end(void) { 694 | return spall_buffer_micro_end(); 695 | } 696 | 697 | SPALL_NOINSTRUMENT SPALL_FORCEINLINE bool (spall_auto_thread_init)(uint32_t thread_id, size_t buffer_size) { 698 | if (buffer_size < 512) { return false; } 699 | if (spall_buffer != NULL) { return false; } 700 | 701 | spall_buffer = (SpallBuffer *)calloc(sizeof(SpallBuffer), 1); 702 | spall_buffer->data = (uint8_t *)malloc(buffer_size); 703 | spall_buffer->length = buffer_size; 704 | spall_buffer->thread_id = thread_id; 705 | spall_buffer->sub_length = buffer_size / 2; 706 | 707 | // removing initial page-fault bubbles to make the data a little more accurate, at the cost of thread spin-up time 708 | memset(spall_buffer->data, 1, spall_buffer->length); 709 | 710 | 
spall_buffer->writer.is_running = true; 711 | spall_thread_start(spall_buffer); 712 | 713 | spall_auto_buffer_flush(); 714 | spall_thread_running = true; 715 | return true; 716 | } 717 | 718 | void (spall_auto_thread_quit)(void) { 719 | spall_thread_running = false; 720 | spall_auto_buffer_flush(); 721 | 722 | spall_buffer->writer.is_running = false; 723 | spall_buffer->writer.ptr = 1; 724 | spall_signal(&spall_buffer->writer.ptr); 725 | spall_thread_end(spall_buffer); 726 | 727 | free(spall_buffer->data); 728 | free(spall_buffer); 729 | spall_buffer = NULL; 730 | } 731 | 732 | SPALL_FN void *spall_canonical_addr(void* fn) { 733 | // sometimes the pointer we get back is to a jump table; walk past that first layer. 734 | 735 | void *ret = fn; 736 | #if SPALL_IS_X64 737 | unsigned char *fn_data = (unsigned char *)fn; 738 | if (fn_data[0] == 0xE9) { 739 | // JMP rel32 740 | int32_t target = *(int32_t*) &fn_data[1]; 741 | 742 | int jump_inst_size = 5; 743 | ret = (void *)(fn_data + jump_inst_size + target); 744 | } 745 | #endif 746 | 747 | return ret; 748 | } 749 | 750 | 751 | SPALL_NOINSTRUMENT bool spall_auto_init(char *filename) { 752 | if (!filename) return false; 753 | memset(&spall_ctx, 0, sizeof(spall_ctx)); 754 | 755 | spall_ctx.file = fopen(filename, "wb"); // TODO: handle utf8 and long paths on windows 756 | if (spall_ctx.file) { // basically freopen() but we don't want to force users to lug along another macro define 757 | fclose(spall_ctx.file); 758 | spall_ctx.file = fopen(filename, "ab"); 759 | } 760 | if (!spall_ctx.file) { return false; } 761 | 762 | spall_ctx.stamp_scale = spall_get_clock_multiplier(); 763 | SpallHeader header = {0}; 764 | header.magic_header = 0xABADF00D; 765 | header.version = 2; 766 | header.timestamp_unit = spall_ctx.stamp_scale; 767 | header.known_address = (uint64_t)spall_canonical_addr((void *)spall_auto_init); 768 | 769 | char *program_path; 770 | if (!get_program_path(&program_path)) { return false; } 771 | uint16_t program_path_len = (uint16_t)strlen(program_path); 772 | 773 | header.program_path_len = program_path_len; 774 | 775 | size_t full_header_size = sizeof(SpallHeader) + (size_t)program_path_len; 776 | uint8_t *full_header = (uint8_t *)malloc(full_header_size); 777 | memcpy(full_header, &header, sizeof(SpallHeader)); 778 | memcpy(full_header + sizeof(SpallHeader), program_path, program_path_len); 779 | 780 | size_t write_ret = fwrite(full_header, 1, full_header_size, spall_ctx.file); 781 | if (write_ret < full_header_size) { return false; } 782 | 783 | free(full_header); 784 | return true; 785 | } 786 | 787 | SPALL_NOINSTRUMENT void spall_auto_quit(void) {} 788 | 789 | SPALL_NOINSTRUMENT void __cyg_profile_func_enter(void *fn, void *caller) { 790 | if (!spall_thread_running) { 791 | return; 792 | } 793 | fn = spall_canonical_addr(fn); 794 | 795 | spall_thread_running = false; 796 | spall_buffer_micro_begin((uint64_t)fn, (uint64_t)caller); 797 | spall_thread_running = true; 798 | } 799 | 800 | SPALL_NOINSTRUMENT void __cyg_profile_func_exit(void *fn, void *caller) { 801 | if (!spall_thread_running) { 802 | return; 803 | } 804 | 805 | spall_thread_running = false; 806 | spall_buffer_micro_end(); 807 | spall_thread_running = true; 808 | } 809 | 810 | #ifdef __cplusplus 811 | } 812 | #endif 813 | 814 | #endif 815 | #endif 816 | --------------------------------------------------------------------------------
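main.c only ever submits tasks with task.args = NULL, so here is a minimal sketch of handing each task its own argument block through the args pointer, assuming the pool.h API above; job_data and square_job are illustrative names, not part of this codebase.

#include <stdio.h>
#include "pool.h"

typedef struct {
    int input;
    int result;
} job_data;

// Task procedures always receive the owning pool (so they can spawn follow-up
// work) plus whatever pointer was stored in TPool_Task.args.
void square_job(TPool *pool, void *args) {
    (void)pool;
    job_data *job = (job_data *)args;
    job->result = job->input * job->input;
}

int main(void) {
    TPool pool = {0};
    tpool_init(&pool, 4);               // 4 child threads; the calling thread is worker 0

    job_data jobs[16];
    for (int i = 0; i < 16; i++) {
        jobs[i].input = i;
        TPool_Task task;
        task.do_work = square_job;
        task.args = &jobs[i];           // each task owns one slot, so no extra synchronization is needed
        tpool_add_task(&pool, task);
    }

    tpool_wait(&pool);                  // the calling thread helps drain the queues before this returns

    for (int i = 0; i < 16; i++) {
        printf("%d squared is %d\n", jobs[i].input, jobs[i].result);
    }

    tpool_destroy(&pool);
    return 0;
}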
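The commented-out little_work variant in main.c indexes thread_results with tpool_current_thread_idx. A short sketch of that pattern: with N child threads the pool runs N + 1 workers, because the thread that calls tpool_init and tpool_wait also executes tasks (which is why a 9-entry thread_results array sits next to tpool_init(&pool, 8)), so a per-worker accumulator needs N + 1 slots and no atomics. counting_task and per_thread_count are illustrative names.

#include "pool.h"

#define CHILD_THREADS 8

static long per_thread_count[CHILD_THREADS + 1];   // one slot per worker, including the main thread

void counting_task(TPool *pool, void *args) {
    (void)pool; (void)args;
    // Each worker only ever touches its own slot, so a plain increment is race-free.
    per_thread_count[tpool_current_thread_idx] += 1;
}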
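spall_native_auto.h also declares a manual region API (spall_auto_buffer_begin / spall_auto_buffer_end) and spall_auto_set_thread_instrumenting, none of which main.c exercises. A hedged sketch of naming a region by hand, assuming spall_auto_init and spall_auto_thread_init have already run on the calling thread as in main.c; load_assets is an illustrative label only.

#ifdef ENABLE_TRACING
#include <string.h>
#include "spall_native_auto.h"

static void traced_section(void) {
    const char *name = "load_assets";

    // Emits a named begin event into this thread's spall buffer; this requires
    // that spall_auto_thread_init() was already called on this thread.
    spall_auto_buffer_begin(name, (signed long)strlen(name), "", 0);

    // ... the work being measured goes here ...

    spall_auto_buffer_end();
}
#endif

spall_auto_set_thread_instrumenting(false) can likewise be used to temporarily mute the automatic __cyg_profile_func_enter/exit events around especially hot code on the current thread, and spall_auto_set_thread_instrumenting(true) turns them back on.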