├── .gitignore
├── README.md
└── cds_job.h

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
GPATH
GRTAGS
GTAGS
GSYMS

# Visual Studio droppings
*.exe
*.obj
*.ilk
*.pdb
*.suo

*.dSYM

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
cds_job
========

This
[single header file](https://github.com/nothings/stb/blob/master/docs/other_libs.md)
C++11 library provides a lock-free, work-stealing job queue system. It is based on
an implementation described extensively by [Stefan Reinalter](http://www.molecular-matters.com/) on
his [blog](https://blog.molecular-matters.com/tag/job-system/).

*This library is a work in progress, and should not yet be used in production code.*

No documentation yet, but here's a list of things to keep in mind:
- Each job's data is stored (as a copy) in the leftover space of the Job structure itself.
  - If you pass more data to a job than will fit in this space, createJob() asserts and returns NULL.
  - Eventually, this case should be handled by a different createJob() variant.
- Each job can have a "parent" job specified at creation time, but this has nothing to do with job dependencies.
  It's merely a way to say "when you wait on a job that's a parent, you wait on all its children (recursively) as well."
- Waiting on a job with waitForJob() is a read-only operation, completely orthogonal to job execution.
  - waitForJob() does not attempt to steal or execute the specified job; instead, it causes the calling thread to process
    jobs from its own queue (and/or steal them from other queues) until the specified job is finished.
  - The job you're waiting on may very well be executed while you're waiting for it to complete. This is fine.
    The wait will not terminate until both the parent and all its children have been executed. Note that parent/child
    execution order is NOT guaranteed.
  - Waiting on a dummy root job seems common enough that there should be a shortcut (see the usage sketch below).
- For efficiency, Jobs are allocated out of pools stored in thread-local storage. Each worker thread has a maximum number
  of jobs it can have "in flight" simultaneously.
  - There is currently no check to enforce the maxJobsPerThread limit; accidentally exceeding this limit is by far the
    most common source of nasty bugs I've encountered so far.
- This scheme is best suited to cases where the worker threads generate their own work (in roughly similar quantities),
  rather than one master thread generating an entire workload and letting the workers divvy it up. That said, see
  parallel_for() for an example of the latter strategy in action.
- Currently, threads that can't find work to do will call YieldProcessor(). This is effectively a busy-wait, and is totally
  inappropriate in production code. The original author suggests various approaches to put workers to sleep when there's no
  work left, and wake them again when more is ready.
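Here is a minimal usage sketch of the intended call pattern. It is illustrative only: `myJob`, `workerThreadFunc`, and the constants are hypothetical names chosen for this example, and error handling is omitted; see cds_job.h for the authoritative signatures.

```cpp
#define CDS_JOB_IMPLEMENTATION
#include "cds_job.h"

#include <thread>

static void myJob(cds::job::Job *job, const void *data) {
    (void)job;
    // 'data' points at the copy of the bytes passed to createJob().
    const int *value = static_cast<const int*>(data);
    (void)value; // ... do work with *value ...
}

static void workerThreadFunc(cds::job::Context *ctx) {
    cds::job::initWorker(ctx); // once per worker thread, before any other job calls
    int arg = 42;
    cds::job::Job *root = cds::job::createJob(myJob, nullptr, &arg, sizeof(arg));
    cds::job::enqueueJob(root);
    cds::job::waitForJob(root); // executes/steals queued jobs until root completes
}

int main() {
    const int kWorkers = 4;
    cds::job::Context *ctx = cds::job::createContext(kWorkers, 1024);
    std::thread threads[kWorkers];
    for (int i = 0; i < kWorkers; ++i) { threads[i] = std::thread(workerThreadFunc, ctx); }
    for (int i = 0; i < kWorkers; ++i) { threads[i].join(); }
    delete ctx;
    return 0;
}
```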
Key Features / Design Goals
---------------------------
- **Identical API on all supported platforms**. The following platforms are tested regularly:
  - Microsoft Windows 7 (Visual Studio 2010, 2012, 2013)
  - Linux Mint (LLVM/Clang 3.5, gcc 4.8.4)
  - Apple OS X (Apple LLVM/Clang 6.1.0)
- **No (mandatory) external dependencies**. Only C++11 standard library
  functions are used.
- **Dirt-simple integration**. Just a single header file to include in
  your project.
- **Public domain license terms**.

Acknowledgements
----------------
- [Sean Barrett](http://nothings.org/): master of single-header C libraries.
- [Stefan Reinalter](https://blog.molecular-matters.com/tag/job-system/): author of the Molecule Engine.

--------------------------------------------------------------------------------
/cds_job.h:
--------------------------------------------------------------------------------
/* cds_job.h -- Lock-free job queue in C++11
 * No warranty implied; use at your own risk.
 *
 * Do this:
 *   #define CDS_JOB_IMPLEMENTATION
 * before including this file in *one* C/C++ file to provide the function
 * implementations.
 *
 * For a unit test on g++/Clang:
 *   cc -Wall -pthread -std=c++11 -D_POSIX_C_SOURCE=199309L -g -x c++ -DCDS_JOB_TEST -o test_cds_job.exe cds_job.h -lstdc++ -lpthread
 * Clang users may also pass -fsanitize=thread to enable Clang's
 * ThreadSanitizer feature.
 *
 * For a unit test on Visual C++:
 *   "%VS120COMNTOOLS%\..\..\VC\vcvarsall.bat"
 *   cl -W4 -MT -nologo -EHsc -TP -DCDS_JOB_TEST /Fetest_cds_job.exe cds_job.h
 * Debug-mode:
 *   cl -W4 -Od -Z7 -FC -MTd -nologo -EHsc -TP -DCDS_JOB_TEST /Fetest_cds_job.exe cds_job.h
 *
 * LICENSE:
 * This software is in the public domain. Where that dedication is not
 * recognized, you are granted a perpetual, irrevocable license to
 * copy, distribute, and modify this file as you see fit.
 */

#ifndef CDS_JOB_H
#define CDS_JOB_H

#include <stddef.h>  // for size_t

namespace cds {
namespace job {
    struct Job;
    class Context;
    typedef void (*JobFunction)(struct Job*, const void*);

    // Called by the main thread to create the shared job context for a pool of worker threads.
    Context *createContext(int numWorkers, int maxJobsPerWorker);

    // Called once by each worker thread.
    int initWorker(Context *ctx);

    // Called by worker threads to create a new job to execute. This function does *not* enqueue the new job for execution.
    Job *createJob(JobFunction function, Job *parent, const void *embeddedData, size_t embeddedDataBytes);

    // Called by worker threads to enqueue a job for execution. This gives the next available thread permission to execute
    // this job. All prior dependencies must be complete before a job is enqueued.
    int enqueueJob(Job *job);

    // Fetch and run any available queued jobs until the specified job is complete.
    void waitForJob(const Job *job);

    // Return the worker ID of the calling thread. If initWorker()
    // was called by this thread, the worker ID will be an index
    // in the range [0..numWorkers-1]. Otherwise, the worker ID is undefined.
    int workerId(void);

    template <typename T, typename S>
    struct ParallelForJobData {
        typedef T DataType;
        typedef S SplitterType;
        typedef void (*FunctionType)(DataType*, unsigned int, void*);

        ParallelForJobData(DataType* data, unsigned int count, void *userData, FunctionType function, const SplitterType& splitter)
            : data(data)
            , userData(userData)
            , function(function)
            , splitter(splitter)
            , count(count)
        {
        }

        DataType* data;
        void *userData;
        FunctionType function;
        SplitterType splitter;
        unsigned int count;
    };

    template <typename JobData>
    void parallelForJobFunc(struct Job* job, const void* jobData);

    template <typename T, typename S>
    Job* createParallelForJob(T* data, unsigned int count, void *userData, void (*function)(T*, unsigned int, void*),
            const S& splitter, Job *parent = nullptr)
    {
        typedef ParallelForJobData<T, S> JobData;
        const JobData jobData(data, count, userData, function, splitter);

        return createJob(parallelForJobFunc<JobData>, parent, &jobData, sizeof(jobData));
    }

    template <typename JobData>
    void parallelForJobFunc(struct Job* job, const void* jobData) {
        const JobData* data = static_cast<const JobData*>(jobData);
        const typename JobData::SplitterType& splitter = data->splitter;
        if (splitter.template split<typename JobData::DataType>(data->count)) {
            // split the range in two
            const unsigned int leftCount = data->count / 2U;
            const JobData leftData(data->data + 0, leftCount, data->userData, data->function, splitter);
            Job *leftJob = createJob(parallelForJobFunc<JobData>, job, &leftData, sizeof(leftData));
            enqueueJob(leftJob);

            const unsigned int rightCount = data->count - leftCount;
            const JobData rightData(data->data + leftCount, rightCount, data->userData, data->function, splitter);
            Job *rightJob = createJob(parallelForJobFunc<JobData>, job, &rightData, sizeof(rightData));
            enqueueJob(rightJob);
        } else {
            // the range is small enough; execute the function on it directly
            (data->function)(data->data, data->count, data->userData);
        }
    }

    // Splits a parallel_for range if it contains more than m_count elements.
    class CountSplitter {
    public:
        explicit CountSplitter(unsigned int count) : m_count(count) {}
        template <typename T> inline bool split(unsigned int count) const { return (count > m_count); }
    private:
        unsigned int m_count;
    };

    // Splits a parallel_for range if its elements occupy more than m_size bytes.
    class DataSizeSplitter {
    public:
        explicit DataSizeSplitter(unsigned int size) : m_size(size) {}
        template <typename T> inline bool split(unsigned int count) const { return (count*sizeof(T) > m_size); }
    private:
        unsigned int m_size;
    };
}  // namespace job
}  // namespace cds

#endif  // CDS_JOB_H ///////////////////////////// end header file
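// Example usage of the parallel_for facility (an illustrative sketch only;
// doubleElements and the 64-element cutoff are hypothetical, not part of the
// library). From a worker thread:
//
//   void doubleElements(float *data, unsigned int count, void *userData) {
//       (void)userData;
//       for (unsigned int i = 0; i < count; ++i) { data[i] *= 2.0f; }
//   }
//
//   Job *job = cds::job::createParallelForJob(myFloats, floatCount, nullptr,
//       doubleElements, cds::job::CountSplitter(64));
//   cds::job::enqueueJob(job);
//   cds::job::waitForJob(job); // recursively splits until subranges have <= 64 elements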
#if defined(CDS_JOB_TEST)
# if !defined(CDS_JOB_IMPLEMENTATION)
#   define CDS_JOB_IMPLEMENTATION
# endif
#endif

#ifdef CDS_JOB_IMPLEMENTATION

#if defined(_MSC_VER)
# if _MSC_VER < 1900
#   define CDS_JOB_THREADLOCAL __declspec(thread)
# else
#   define CDS_JOB_THREADLOCAL thread_local
# endif
#elif defined(__GNUC__)
# define CDS_JOB_THREADLOCAL __thread  // note: Clang also defines __GNUC__, so it takes this branch
#elif defined(__clang__)
# if defined(__APPLE__) || defined(__MACH__)
#   define CDS_JOB_THREADLOCAL __thread
# else
#   define CDS_JOB_THREADLOCAL thread_local
# endif
#endif

#ifdef _MSC_VER
# include <windows.h>
# define JOB_YIELD() YieldProcessor()
# define JOB_COMPILER_BARRIER _ReadWriteBarrier()
# define JOB_MEMORY_BARRIER std::atomic_thread_fence(std::memory_order_seq_cst)
#else
# include <emmintrin.h>
# define JOB_YIELD() _mm_pause()
# define JOB_COMPILER_BARRIER asm volatile("" ::: "memory")
# define JOB_MEMORY_BARRIER asm volatile("mfence" ::: "memory")
#endif
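// Note on the two barriers (explanatory comment, not part of the original
// macros): JOB_COMPILER_BARRIER only prevents the *compiler* from reordering
// memory accesses across it; the CPU may still reorder them at runtime.
// JOB_MEMORY_BARRIER (mfence / a seq_cst fence) also constrains the CPU. On
// x86's strong memory model, only StoreLoad reordering happens in hardware,
// which is why Pop() below needs the full JOB_MEMORY_BARRIER between writing
// m_bottom and reading m_top, while Push() and Steal() can rely on the
// cheaper compiler-only barrier.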
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <atomic>

using namespace cds::job;

namespace {
    // Chase-Lev style work-stealing deque. The owning thread Push()es and
    // Pop()s jobs at the bottom; other threads Steal() jobs from the top.
    class WorkStealingQueue {
    public:
        static size_t BufferSize(int capacity) {
            return capacity*sizeof(Job*);
        }

        int Init(int capacity, void *buffer, size_t bufferSize);
        int Push(Job *job);
        Job *Pop();
        Job *Steal();

    private:
        Job **m_entries;
        std::atomic<uint64_t> m_top;
        uint64_t m_bottom;
        int m_capacity;
    };
}

int WorkStealingQueue::Init(int capacity, void *buffer, size_t bufferSize) {
    if ( (capacity & (capacity-1)) != 0) {
        return -2; // capacity must be a power of 2
    }
    size_t minBufferSize = BufferSize(capacity);
    if (bufferSize < minBufferSize) {
        return -1; // inadequate buffer size
    }
    uint8_t *bufferNext = (uint8_t*)buffer;
    m_entries = (Job**)bufferNext;
    bufferNext += capacity*sizeof(Job*);
    assert( bufferNext - (uint8_t*)buffer == (intptr_t)minBufferSize );

    for(int iEntry=0; iEntry<capacity; ++iEntry) {
        m_entries[iEntry] = nullptr;
    }
    m_top = 0;
    m_bottom = 0;
    m_capacity = capacity;
    return 0;
}

int WorkStealingQueue::Push(Job *job) {
    // TODO: assert that this is only called by the owning thread
    uint64_t jobIndex = m_bottom;
    m_entries[jobIndex & (m_capacity-1)] = job;

    // Ensure the job is written before m_bottom is incremented.
    // A StoreStore memory barrier would also be necessary on platforms with a weak memory model.
    JOB_COMPILER_BARRIER;

    m_bottom = jobIndex+1;
    return 0;
}

Job *WorkStealingQueue::Pop() {
    // TODO: assert that this is only called by the owning thread
    uint64_t bottom = m_bottom-1;
    m_bottom = bottom;

    // Ensure the write to m_bottom is published before m_top is read.
    // This requires a full StoreLoad memory barrier, even on x86.
    JOB_MEMORY_BARRIER;

    uint64_t top = m_top;
    if (top <= bottom) {
        Job *job = m_entries[bottom & (m_capacity-1)];
        if (top != bottom) {
            // still >0 jobs left in the queue
            return job;
        } else {
            // popping the last element in the queue
            if (!std::atomic_compare_exchange_strong(&m_top, &top, top)) {
                // failed race against Steal()
                job = nullptr;
            }
            m_bottom = top+1;
            return job;
        }
    } else {
        // queue already empty
        m_bottom = top;
        return nullptr;
    }
}

Job *WorkStealingQueue::Steal() {
    // TODO: assert that this is never called by the owning thread
    uint64_t top = m_top;

    // Ensure top is always read before bottom.
    // A LoadLoad memory barrier would also be necessary on platforms with a weak memory model.
    JOB_COMPILER_BARRIER;

    uint64_t bottom = m_bottom;
    if (top < bottom) {
        Job *job = m_entries[top & (m_capacity-1)];
        // The CAS serves as a compiler barrier as-is.
        if (!std::atomic_compare_exchange_strong(&m_top, &top, top+1)) {
            // a concurrent Steal() or Pop() claimed this entry first
            return nullptr;
        }
        m_entries[top & (m_capacity-1)] = nullptr;
        return job;
    } else {
        return nullptr; // queue empty
    }
}
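// Worked example of the deque invariant (illustrative only):
//   capacity=4, top=2, bottom=5  =>  three live jobs at logical indices 2,3,4,
//   stored in slots 2,3,0 (index & (capacity-1)).
//   The owning thread Pop()s logical index 4 from the bottom while thieves
//   Steal() logical index 2 from the top. The two ends only collide when a
//   single job remains, and the CAS on m_top decides who gets it.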
///////////////////

#define kCdsJobCacheLineBytes 64
#define kCdsJobPaddingBytes ( (kCdsJobCacheLineBytes) - (sizeof(JobFunction) + sizeof(struct Job*) + sizeof(void*) + sizeof(std::atomic_int_fast32_t)) )

#ifdef _MSC_VER
# define JOB_ATTR_ALIGN(alignment) __declspec(align(alignment))
#else
# define JOB_ATTR_ALIGN(alignment) __attribute__((aligned(alignment)))
#endif

namespace cds {
namespace job {
    typedef JOB_ATTR_ALIGN(kCdsJobCacheLineBytes) struct Job {
        JobFunction function;
        struct Job *parent;
        void *data;
        std::atomic_int_fast32_t unfinishedJobs;
        char padding[kCdsJobPaddingBytes]; // embedded per-job data lives here
    } Job;

    class Context {
    public:
        Context() = delete;
        Context(const Context &ctx) = delete;
        Context(int numWorkerThreads, int maxJobsPerThread);
        ~Context();

        WorkStealingQueue **m_workerJobQueues;
        void *m_jobPoolBuffer;
        void *m_queueEntryBuffer;
        std::atomic<int> m_nextWorkerId;
        int m_numWorkerThreads;
        int m_maxJobsPerThread;
    };
}
}

static_assert((sizeof(struct Job) % kCdsJobCacheLineBytes) == 0, "Job struct is not cache-line-aligned!");

static CDS_JOB_THREADLOCAL Context *tls_jobContext = nullptr;
static CDS_JOB_THREADLOCAL uint64_t tls_jobCount = 0;
static CDS_JOB_THREADLOCAL int tls_workerId = -1;
static CDS_JOB_THREADLOCAL Job *tls_jobPool = nullptr;

// Round x up to the next power of two; powers of two map to themselves.
static inline uint32_t nextPowerOfTwo(uint32_t x)
{
    x = x-1;
    x = x | (x>> 1);
    x = x | (x>> 2);
    x = x | (x>> 4);
    x = x | (x>> 8);
    x = x | (x>>16);
    return x+1;
}

Context::Context(int numWorkerThreads, int maxJobsPerThread)
    : m_workerJobQueues(nullptr)
    , m_nextWorkerId(0)
    , m_numWorkerThreads(numWorkerThreads)
{
    // The work-stealing queues require a power-of-two capacity.
    maxJobsPerThread = nextPowerOfTwo(maxJobsPerThread);
    m_maxJobsPerThread = maxJobsPerThread;

    m_workerJobQueues = new WorkStealingQueue*[numWorkerThreads];
    const size_t jobPoolBufferSize = numWorkerThreads*maxJobsPerThread*sizeof(Job) + kCdsJobCacheLineBytes-1;
    m_jobPoolBuffer = malloc(jobPoolBufferSize);
    size_t queueBufferSize = WorkStealingQueue::BufferSize(maxJobsPerThread);
    m_queueEntryBuffer = malloc(queueBufferSize * numWorkerThreads);
    for(int iWorker=0; iWorker<numWorkerThreads; ++iWorker) {
        m_workerJobQueues[iWorker] = new WorkStealingQueue();
        int initError = m_workerJobQueues[iWorker]->Init(
            maxJobsPerThread,
            (void*)( intptr_t(m_queueEntryBuffer) + iWorker*queueBufferSize ),
            queueBufferSize);
        (void)initError;
        assert(initError == 0);
    }
}

Context::~Context()
{
    for(int iWorker=0; iWorker<m_numWorkerThreads; ++iWorker) {
        delete m_workerJobQueues[iWorker];
    }
    delete [] m_workerJobQueues;
    free(m_queueEntryBuffer);
    free(m_jobPoolBuffer);
}

static Job *AllocateJob(void) {
    // TODO(cort): no protection against a thread exceeding maxJobsPerThread
    // jobs in flight; the pool index simply wraps around.
    uint64_t index = tls_jobCount++;
    return &tls_jobPool[index & (tls_jobContext->m_maxJobsPerThread-1)];
}

static inline bool IsJobComplete(const Job *job) {
    return (job->unfinishedJobs == 0);
}

static void FinishJob(Job *job) {
    const int32_t unfinishedJobs = --(job->unfinishedJobs);
    assert(unfinishedJobs >= 0);
    if (unfinishedJobs == 0 && job->parent) {
        FinishJob(job->parent);
    }
}
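// Illustrative trace of the unfinishedJobs counter (jobs f and g are
// hypothetical):
//   Job *root  = createJob(f, nullptr, ...); // root->unfinishedJobs == 1
//   Job *child = createJob(g, root, ...);    // root -> 2, child -> 1
//   ExecuteJob(child); // FinishJob drops child to 0, then recurses: root -> 1
//   ExecuteJob(root);  // FinishJob drops root to 0
// Only now does IsJobComplete(root) hold, so waitForJob(root) can return.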
static Job *GetJob(void) {
    WorkStealingQueue *myQueue = tls_jobContext->m_workerJobQueues[tls_workerId];
    Job *job = myQueue->Pop();
    if (!job) {
        // This worker's queue is empty; try to steal a job from another thread's queue.
        if (tls_jobContext->m_numWorkerThreads < 2) {
            JOB_YIELD(); // nobody to steal from
            return nullptr;
        }
        // Note the parentheses: without them, "rand() % N - 1" could yield an
        // offset of 0, making a thread try to steal from itself.
        int victimOffset = 1 + (rand() % (tls_jobContext->m_numWorkerThreads-1));
        int victimIndex = (tls_workerId + victimOffset) % tls_jobContext->m_numWorkerThreads;
        WorkStealingQueue *victimQueue = tls_jobContext->m_workerJobQueues[victimIndex];
        job = victimQueue->Steal();
        if (!job) { // nothing to steal
            JOB_YIELD(); // TODO(cort): busy-waiting is bad, but there may be a job to steal in ANOTHER queue, so we should try again shortly.
            return nullptr;
        }
    }
    return job;
}

static inline void ExecuteJob(Job *job) {
    (job->function)(job, job->data);
    FinishJob(job);
}

Context *cds::job::createContext(int numWorkers, int maxJobsPerWorker)
{
    return new Context(numWorkers, maxJobsPerWorker);
}

int cds::job::initWorker(Context *ctx)
{
    tls_jobContext = ctx;
    tls_jobCount = 0;
    tls_workerId = ctx->m_nextWorkerId++;
    assert(tls_workerId < ctx->m_numWorkerThreads);
    void *jobPoolBufferAligned = (void*)( (uintptr_t(ctx->m_jobPoolBuffer) + kCdsJobCacheLineBytes-1) & ~(kCdsJobCacheLineBytes-1) );
    assert( (uintptr_t(jobPoolBufferAligned) % kCdsJobCacheLineBytes) == 0 );
    tls_jobPool = (Job*)(jobPoolBufferAligned) + tls_workerId*ctx->m_maxJobsPerThread;
    return tls_workerId;
}

Job *cds::job::createJob(JobFunction function, Job *parent, const void *embeddedData, size_t embeddedDataBytes) {
    if (embeddedData != nullptr && embeddedDataBytes > kCdsJobPaddingBytes) {
        assert(0); // embedded data is too large to fit in the Job's padding
        return nullptr;
    }
    if (parent) {
        parent->unfinishedJobs++;
    }
    Job *job = AllocateJob();
    job->function = function;
    job->parent = parent;
    job->unfinishedJobs = 1;
    if (embeddedData) {
        memcpy(job->padding, embeddedData, embeddedDataBytes);
        job->data = job->padding;
    } else {
        job->data = nullptr;
    }
    return job;
}

// Enqueues a job for eventual execution.
int cds::job::enqueueJob(Job *job) {
    int pushError = tls_jobContext->m_workerJobQueues[tls_workerId]->Push(job);
    return pushError;
}

// Fetch and run queued jobs until the specified job is complete.
void cds::job::waitForJob(const Job *job) {
    while(!IsJobComplete(job)) {
        Job *nextJob = GetJob();
        if (nextJob) {
            ExecuteJob(nextJob);
        }
    }
}

int cds::job::workerId(void) {
    return tls_workerId;
}

#endif // defined(CDS_JOB_IMPLEMENTATION)
#ifdef CDS_JOB_TEST ////////////////////////////// test code

#include <chrono>
#include <stdio.h>
#include <thread>

#define kNumWorkers 16
#define kTotalJobCount (64*1024)
static const int kMaxJobsPerThread = (kTotalJobCount / kNumWorkers);
static std::atomic_int_fast32_t g_finishedJobCount(0);

static void empty_job(Job *job, const void *data) {
    (void)job;
    (void)data;
    g_finishedJobCount++;
    //int *jobId = (int*)data;
    //printf("worker %2d, job 0x%08X\n", tls_workerId, *jobId);
}

// Test 1: each worker floods its own queue with empty jobs (all children of a
// single root job), then waits for the whole batch to finish.
static void emptyWorkerTest(Context *jobCtx) {
    int workerId = cds::job::initWorker(jobCtx);

    const int jobCount = jobCtx->m_maxJobsPerThread;
    int jobId = (workerId<<16) | 0;
    Job *root = createJob(empty_job, nullptr, &jobId, sizeof(int));
    enqueueJob(root);
    for(int iJob=1; iJob<jobCount; ++iJob) {
        jobId = (workerId<<16) | iJob;
        Job *job = createJob(empty_job, root, &jobId, sizeof(int));
        enqueueJob(job);
    }
    waitForJob(root);
}

static void square_job(uint64_t *data, unsigned int count, void *userData) {
    (void)userData;
    for(unsigned int i=0; i<count; ++i) {
        data[i] = data[i]*data[i];
    }
}

// Test 2: each worker runs a parallel_for over its own slice of a shared
// array, splitting the slice into jobs of at most 4KiB of data apiece.
static void squareWorkerTest(Context *jobCtx, uint64_t *squares, unsigned int squareCount) {
    int workerId = cds::job::initWorker(jobCtx);
    const unsigned int countPerWorker = squareCount / kNumWorkers;
    uint64_t *workerSlice = squares + (unsigned int)workerId*countPerWorker;
    Job *root = createParallelForJob(workerSlice, countPerWorker, nullptr, square_job,
        DataSizeSplitter(4*1024));
    enqueueJob(root);
    waitForJob(root);
}

int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    {
        cds::job::Context *jobCtx = cds::job::createContext(kNumWorkers, kMaxJobsPerThread);

        auto startTime = std::chrono::high_resolution_clock::now();
        std::thread workers[kNumWorkers];
        for(int iThread=0; iThread<kNumWorkers; ++iThread) {
            workers[iThread] = std::thread(emptyWorkerTest, jobCtx);
        }
        for(int iThread=0; iThread<kNumWorkers; ++iThread) {
            workers[iThread].join();
        }
        auto endTime = std::chrono::high_resolution_clock::now();
        auto elapsedNanos = std::chrono::duration_cast<std::chrono::nanoseconds>(endTime-startTime).count();
        printf("%d jobs complete in %.3fms\n", (int)g_finishedJobCount.load(), (double)elapsedNanos/1e6);
        delete jobCtx;
    }

    {
        const int kNumSquares = 1*1024*1024;
        uint64_t *squares = new uint64_t[kNumSquares];
        for(uint64_t i=0; i<(uint64_t)kNumSquares; ++i) {
            squares[i] = i;
        }
        cds::job::Context *jobCtx = cds::job::createContext(kNumWorkers, kMaxJobsPerThread);

        auto startTime = std::chrono::high_resolution_clock::now();
        std::thread workers[kNumWorkers];
        for(int iThread=0; iThread<kNumWorkers; ++iThread) {
            workers[iThread] = std::thread(squareWorkerTest, jobCtx, squares, (unsigned int)kNumSquares);
        }
        for(int iThread=0; iThread<kNumWorkers; ++iThread) {
            workers[iThread].join();
        }
        auto endTime = std::chrono::high_resolution_clock::now();
        auto elapsedNanos = std::chrono::duration_cast<std::chrono::nanoseconds>(endTime-startTime).count();
        printf("%d squares computed in %.3fms\n", kNumSquares, (double)elapsedNanos/1e6);
        for(uint64_t i=0; i<(uint64_t)kNumSquares; ++i) {
            assert(squares[i] == i*i);
        }
        delete [] squares;
        delete jobCtx;
    }

    return 0;
}

#endif // defined(CDS_JOB_TEST)
--------------------------------------------------------------------------------