├── Benchmarks.hs
├── LICENSE
├── MVarExperiment.hs
├── Main1.hs
├── MainN.hs
├── README.md
├── RetryExperiment.hs
├── Setup.hs
├── TVarExperiment.hs
├── TestAtomics.hs
├── chan-benchmarks.cabal
└── reports
    ├── ghc-7.6
        ├── bench-multi.chans.html
        ├── bench-multi.vars-oddly-little-ioref-variance.html
        ├── bench-multi.vars.html
        ├── bench-single.arrays.html
        ├── bench-single.chan_latency.html
        ├── bench-single.chan_throughput.html
        ├── bench-single.cons-reverse.html
        ├── bench-single.misc.html
        └── bench-single.primitives.html
    ├── ghc-7.6_8-core
        ├── bench-multi.chans.html
        ├── bench-multi.vars.html
        └── bench-single.primitives.html
    └── ghc-7.8
        ├── bench-multi.chans.html
        ├── bench-multi.vars.html
        ├── bench-single.arrays.html
        ├── bench-single.chan_latency.html
        ├── bench-single.chan_throughput.html
        └── bench-single.primitives.html


/Benchmarks.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE RankNTypes #-}
  2 | {-# LANGUAGE PackageImports #-}
  3 | {-# LANGUAGE BangPatterns #-}
  4 | module Benchmarks
  5 |     where 
  6 | 
  7 | import Control.Concurrent.Async
  8 | import Control.Monad
  9 | import System.Environment
 10 | 
 11 | import Control.Concurrent
 12 | import Control.Concurrent.Chan
 13 | import Control.Concurrent.STM
 14 | import Control.Concurrent.STM.TQueue
 15 | import Control.Concurrent.STM.TBQueue
 16 | 
 17 | import Control.Concurrent.MVar
 18 | import Data.IORef
 19 | import Control.Exception(evaluate)
 20 | 
 21 | 
 22 | import qualified "chan-split-fast" Control.Concurrent.Chan.Split as S
 23 | import qualified "split-channel" Control.Concurrent.Chan.Split as SC
 24 | import Data.Primitive.MutVar
 25 | import Control.Monad.Primitive(PrimState)
 26 | 
 27 | import Data.Atomics
 28 | 
 29 | -- Hack since these aren't currently working with ghc 7.8
 30 | #if MIN_VERSION_base(4,7,0)
 31 | #else
 32 | import qualified Data.Concurrent.Queue.MichaelScott as MS
 33 | #endif
 34 | import qualified Data.Concurrent.Deque.ChaseLev as CL
 35 | 
 36 | 
 37 | -- -----------------------------------------------------------
 38 | atomicModifyIORefCAS' :: IORef a      -- ^ Mutable location to modify
 39 |                      -> (a -> (a,b)) -- ^ Computation runs one or more times (speculation)
 40 |                      -> IO b
 41 | atomicModifyIORefCAS' ref fn = do
 42 |    -- TODO: Should handle contention in a better way...
 43 |    tick <- readForCAS ref
 44 |    loop tick 
 45 |   where 
 46 |    loop old = do 
 47 |      let !(~new,~result) = fn $ peekTicket old
 48 |      (b,tick) <- casIORef ref old new
 49 |      if b 
 50 |       then new `seq` result `seq` return result
 51 |       else loop tick
 52 | 
 53 | 
 54 | -- -----------------------------------------------------------
 55 | 
 56 | -- These tests initially taken from stm/bench/chanbench.hs, ported to
 57 | -- criterion, with some additions, and have now changed quite a bit.
 58 | --
 59 | -- The original used CPP to avoid code duplication while also ensuring GHC
 60 | -- optimized the code in a realistic fashion. Here we just copy paste.
 61 | 
 62 | 
 63 | runtestChan1, runtestChan2 :: Int -> IO ()
 64 | runtestChan1 n = do
 65 |   c <- newChan
 66 |   replicateM_ n $ writeChan c ()
 67 |   replicateM_ n $ readChan c
 68 | 
 69 | runtestChan2 n = do
 70 |   c <- newChan
 71 |   let n1000 = n `quot` 1000
 72 |   replicateM_ 1000 $ do
 73 |     replicateM_ n1000 $ writeChan c ()
 74 |     replicateM_ n1000 $ readChan c
 75 | 
 76 | runtestChanAsync :: Int -> Int -> Int -> IO ()
 77 | runtestChanAsync writers readers n = do
 78 |   let nNice = n - rem n (lcm writers readers)
 79 |   c <- newChan
 80 |   rcvrs <- replicateM readers $ async $ replicateM_ (nNice `quot` readers) $ readChan c
 81 |   senders <- replicateM writers $ async $ replicateM_ (nNice `quot` writers) $ writeChan c ()
 82 |   mapM_ wait rcvrs
 83 | 
 84 | -- ----------
 85 | -- Hack since these aren't currently working with ghc 7.8
 86 | #if MIN_VERSION_base(4,7,0)
 87 | #else
 88 | -- from "lockfree-queue"
 89 | runtestLockfreeQueue1, runtestLockfreeQueue2 :: Int -> IO ()
 90 | runtestLockfreeQueue1 n = do
 91 |   c <- MS.newQ
 92 |   replicateM_ n $ MS.pushL c ()
 93 |   replicateM_ n $ msreadR c
 94 | 
 95 | runtestLockfreeQueue2 n = do
 96 |   c <- MS.newQ
 97 |   let n1000 = n `quot` 1000
 98 |   replicateM_ 1000 $ do
 99 |     replicateM_ n1000 $ MS.pushL c ()
100 |     replicateM_ n1000 $ msreadR c
101 | 
102 | runtestLockfreeQueueAsync :: Int -> Int -> Int -> IO ()
103 | runtestLockfreeQueueAsync writers readers n = do
104 |   let nNice = n - rem n (lcm writers readers)
105 |   c <- MS.newQ
106 |   rcvrs <- replicateM readers $ async $ replicateM_ (nNice `quot` readers) $ msreadR c
107 |   senders <- replicateM writers $ async $ replicateM_ (nNice `quot` writers) $ MS.pushL c ()
108 |   mapM_ wait rcvrs
109 | 
110 | -- a busy-blocking read:
111 | msreadR :: MS.LinkedQueue a -> IO a
112 | msreadR q = MS.tryPopR q >>= maybe (msreadR q) return
113 | #endif
114 | 
115 | 
116 | -- ----------
117 | -- from "chaselev-dequeue"
118 | -- NOTE: this is generally to get a sense of how the techniques used perform;
119 | --       this is not a general-purpose concurrent FIFO queue.
120 | runtestChaseLevQueue1, runtestChaseLevQueue2 :: Int -> IO ()
121 | runtestChaseLevQueue1 n = do
122 |   c <- CL.newQ
123 |   replicateM_ n $ CL.pushL c ()
124 |   replicateM_ n $ clreadR c
125 | 
126 | runtestChaseLevQueue2 n = do
127 |   c <- CL.newQ
128 |   let n1000 = n `quot` 1000
129 |   replicateM_ 1000 $ do
130 |     replicateM_ n1000 $ CL.pushL c ()
131 |     replicateM_ n1000 $ clreadR c
132 | 
133 | -- One reader / one writer (we can have at most one writer safely); copy-pasta
134 | runtestChaseLevQueueAsync_1_1 :: Int -> IO ()
135 | runtestChaseLevQueueAsync_1_1 n = do
136 |   let nNice = n - rem n (lcm 1 1)
137 |   c <- CL.newQ
138 |   rcvrs <- replicateM 1 $ async $ replicateM_ (nNice `quot` 1) $ clreadR c
139 |   senders <- replicateM 1 $ async $ replicateM_ (nNice `quot` 1) $ CL.pushL c ()
140 |   mapM_ wait rcvrs
141 | 
142 | -- a busy-blocking read:
143 | clreadR :: CL.ChaseLevDeque a -> IO a
144 | clreadR q = CL.tryPopR q >>= maybe (clreadR q) return
145 | 
146 | -- ----------
147 | 
148 | runtestTChan1, runtestTChan2 :: Int -> IO ()
149 | runtestTChan1 n = do
150 |   c <- newTChanIO
151 |   replicateM_ n $ atomically $ writeTChan c ()
152 |   replicateM_ n $ atomically $ readTChan c
153 | 
154 | runtestTChan2 n = do
155 |   c <- newTChanIO
156 |   let n1000 = n `quot` 1000
157 |   replicateM_ 1000 $ do
158 |     replicateM_ n1000 $ atomically $ writeTChan c ()
159 |     replicateM_ n1000 $ atomically $ readTChan c
160 | 
161 | runtestTChanAsync :: Int -> Int -> Int -> IO ()
162 | runtestTChanAsync writers readers n = do
163 |   let nNice = n - rem n (lcm writers readers)
164 |   c <- newTChanIO
165 |   rcvrs <- replicateM readers $ async $ replicateM_ (nNice `quot` readers) $ atomically $ readTChan c
166 |   senders <- replicateM writers $ async $ replicateM_ (nNice `quot` writers) $ atomically $ writeTChan c ()
167 |   mapM_ wait rcvrs
168 | 
169 | -- ----------
170 | 
171 | runtestTQueue1, runtestTQueue2 :: Int -> IO ()
172 | runtestTQueue1 n = do
173 |   c <- newTQueueIO
174 |   replicateM_ n $ atomically $ writeTQueue c ()
175 |   replicateM_ n $ atomically $ readTQueue c
176 | 
177 | runtestTQueue2 n = do
178 |   c <- newTQueueIO
179 |   let n1000 = n `quot` 1000
180 |   replicateM_ 1000 $ do
181 |     replicateM_ n1000 $ atomically $ writeTQueue c ()
182 |     replicateM_ n1000 $ atomically $ readTQueue c
183 | 
184 | runtestTQueueAsync :: Int -> Int -> Int -> IO ()
185 | runtestTQueueAsync writers readers n = do
186 |   let nNice = n - rem n (lcm writers readers)
187 |   c <- newTQueueIO
188 |   rcvrs <- replicateM readers $ async $ replicateM_ (nNice `quot` readers) $ atomically $ readTQueue c
189 |   senders <- replicateM writers $ async $ replicateM_ (nNice `quot` writers) $ atomically $ writeTQueue c ()
190 |   mapM_ wait rcvrs
191 | 
192 | -- ----------
193 | 
194 | runtestTBQueue1, runtestTBQueue2 :: Int -> IO ()
195 | runtestTBQueue1 n = do
196 |   c <- newTBQueueIO n -- The original benchmark must have blocked indefinitely here, no?
197 |   replicateM_ n $ atomically $ writeTBQueue c ()
198 |   replicateM_ n $ atomically $ readTBQueue c
199 | 
200 | runtestTBQueue2 n = do
201 |   c <- newTBQueueIO 4096
202 |   let n1000 = n `quot` 1000
203 |   replicateM_ 1000 $ do
204 |     replicateM_ n1000 $ atomically $ writeTBQueue c ()
205 |     replicateM_ n1000 $ atomically $ readTBQueue c
206 | 
207 | runtestTBQueueAsync :: Int -> Int -> Int -> IO ()
208 | runtestTBQueueAsync writers readers n = do
209 |   let nNice = n - rem n (lcm writers readers)
210 |   c <- newTBQueueIO 4096
211 |   rcvrs <- replicateM readers $ async $ replicateM_ (nNice `quot` readers) $ atomically $ readTBQueue c
212 |   senders <- replicateM writers $ async $ replicateM_ (nNice `quot` writers) $ atomically $ writeTBQueue c ()
213 |   mapM_ wait rcvrs
214 | 
215 | 
216 | -- OTHER CHAN IMPLEMENTATIONS:
217 | 
218 | -- chan-split-fast
219 | 
220 | runtestSplitChan1, runtestSplitChan2 :: Int -> IO ()
221 | runtestSplitChan1 n = do
222 |   (i,o) <- S.newSplitChan
223 |   replicateM_ n $ S.writeChan i ()
224 |   replicateM_ n $ S.readChan o
225 | 
226 | runtestSplitChan2 n = do
227 |   (i,o) <- S.newSplitChan
228 |   let n1000 = n `quot` 1000
229 |   replicateM_ 1000 $ do
230 |     replicateM_ n1000 $ S.writeChan i ()
231 |     replicateM_ n1000 $ S.readChan o
232 | 
233 | 
234 | runtestSplitChanAsync :: Int -> Int -> Int -> IO ()
235 | runtestSplitChanAsync writers readers n = do
236 |   let nNice = n - rem n (lcm writers readers)
237 |   (i,o) <- S.newSplitChan
238 |   rcvrs <- replicateM readers $ async $ replicateM_ (nNice `quot` readers) $ S.readChan o
239 |   senders <- replicateM writers $ async $ replicateM_ (nNice `quot` writers) $ S.writeChan i ()
240 |   mapM_ wait rcvrs
241 | 
242 | 
243 | 
244 | -- split-channel
245 | 
246 | runtestSplitChannel1, runtestSplitChannel2 :: Int -> IO ()
247 | runtestSplitChannel1 n = do
248 |   (i,o) <- SC.new
249 |   replicateM_ n $ SC.send i ()
250 |   replicateM_ n $ SC.receive o
251 | 
252 | runtestSplitChannel2 n = do
253 |   (i,o) <- SC.new
254 |   let n1000 = n `quot` 1000
255 |   replicateM_ 1000 $ do
256 |     replicateM_ n1000 $ SC.send i ()
257 |     replicateM_ n1000 $ SC.receive o
258 | 
259 | runtestSplitChannelAsync :: Int -> Int -> Int -> IO ()
260 | runtestSplitChannelAsync writers readers n = do
261 |   let nNice = n - rem n (lcm writers readers)
262 |   (i,o) <- SC.new
263 |   rcvrs <- replicateM readers $ async $ replicateM_ (nNice `quot` readers) $ SC.receive o
264 |   senders <- replicateM writers $ async $ replicateM_ (nNice `quot` writers) $ SC.send i ()
265 |   mapM_ wait rcvrs
266 | 
267 | 
268 | 
269 | -- --------------------------
270 | -- Misc Components
271 | 
272 | testCompositionAppend :: Int -> [Int]
273 | testCompositionAppend n = (go id [1..n]) [] where
274 |     go f [] = f
275 |     go f (a:as) = go (f . (a:)) as
276 | 
277 | -- are appends just as cheap as prepends?
278 | testCompositionAppendPrepend :: Int -> [Int]
279 | testCompositionAppendPrepend n = (go id [1..n]) [] where
280 |     go f [] = f
281 |     go f (a:as) 
282 |         | even a = go (f . (a:)) as
283 |         | otherwise = go ((a:) . f) as
284 | 
285 | testConsReverse :: Int -> [Int]
286 | testConsReverse n = reverse $ go [1..n] [] where 
287 |     go [] as = as
288 |     go (a:xs) as = go xs (a:as)
289 | 
290 | -- test an optimization for small writer dequeues? what about branch prediction's effects on this test?
291 | -- LITTLE BENEFIT
292 | testConsUnrolledReverse :: Int -> [Int]
293 | testConsUnrolledReverse n = rev [] $ go [1..n] [] where 
294 |     go [] as = as
295 |     go (a:xs) as = go xs (a:as)
296 | 
297 |     rev a [z,y,x,w,v,u,t] = t:u:v:w:x:y:z:a
298 |     rev a [z,y,x,w,v,u] = u:v:w:x:y:z:a
299 |     rev a [z,y,x,w,v] = v:w:x:y:z:a
300 |     rev a [z,y,x,w] = w:x:y:z:a
301 |     rev a [z,y,x] = x:y:z:a
302 |     rev a [z,y] = y:z:a
303 |     rev a [z] = z:a
304 |     rev a []     = a
305 |     rev a (x:xs) = rev xs (x:a)
306 | 
307 | -- This is more realistic, eliminating any benefits from inlining and rewriting
308 | -- we might get from above
309 | testCompositionAppendInMVar :: Int -> IO [Int]
310 | testCompositionAppendInMVar n = do
311 |     v <- newMVar id
312 |     mapM_ (go v) [1..n]
313 |     fmap ($ []) $ takeMVar v
314 |   where go v a = do
315 |             f <- takeMVar v
316 |             fa <- evaluate (f . (a:))
317 |             putMVar v fa
318 | 
319 | testConsReverseInMVar :: Int -> IO [Int]
320 | testConsReverseInMVar n = do
321 |     v <- newMVar []
322 |     mapM_ (go v) [1..n]
323 |     fmap reverse $ takeMVar v
324 |   where go v a = do
325 |             zs <- takeMVar v
326 |             azs <- evaluate (a:zs)
327 |             putMVar v azs
328 | 
329 | -- get an idea of the impact on writers:
330 | testStoreCompositionAppendInMVar :: Int -> IO ()
331 | testStoreCompositionAppendInMVar n = do
332 |     v <- newMVar id
333 |     mapM_ (go v) [1..n]
334 |   where go v a = do
335 |             f <- takeMVar v
336 |             fa <- evaluate (f . (a:))
337 |             putMVar v fa
338 | 
339 | testStoreConsReverseInMVar :: Int -> IO ()
340 | testStoreConsReverseInMVar n = do
341 |     v <- newMVar []
342 |     mapM_ (go v) [1..n]
343 |   where go v a = do
344 |             zs <- takeMVar v
345 |             azs <- evaluate (a:zs)
346 |             putMVar v azs
347 | 
348 | -- -------------------------------------------------------------------------
349 | 
350 | -- we'd like to know whether in practice contention can be reduced on a shared
351 | -- counter by first doing a read, and only doing an atomicModify when the
352 | -- counter is not seen to have been incremented yet (note: even so, by the time
353 | -- we're in the atomic block it may have been incremented, in which case it's a
354 | -- NOOP)
355 | readMaybeAtomicModifyIORef :: Int -> IO ()
356 | readMaybeAtomicModifyIORef n = do
357 |     counter <- newIORef 0
358 |     stack1 <- newIORef [] -- non-contentious work done on these:
359 |     stack2 <- newIORef []
360 |     let op stck = do cnt <- readIORef counter
361 |                      atomicModifyIORef' stck (\st-> (cnt:st,()))
362 |                      cnt' <- readIORef counter
363 |                      if cnt' == cnt
364 |                          then atomicModifyIORef' counter (\cnt1-> (if cnt1 == cnt then cnt+1 else cnt1, ()))
365 |                          else return ()
366 | 
367 |     w1 <- async $ replicateM_ n $ op stack1
368 |     w2 <- async $ replicateM_ n $ op stack2
369 |     waitBoth w1 w2
370 |     return ()
371 | 
372 | readMaybeCAS :: Int -> IO ()
373 | readMaybeCAS n = do
374 |     counter <- newIORef 0
375 |     stack1 <- newIORef [] -- non-contentious work done on these:
376 |     stack2 <- newIORef []
377 |     let op stck = do cntTicket <- readForCAS counter
378 |                      let cnt = peekTicket cntTicket
379 |                      atomicModifyIORef' stck (\st-> (cnt:st,()))
380 |                      incrCnt <- evaluate (cnt + 1)
381 |                      (weIncremented, _) <- casIORef counter cntTicket incrCnt
382 |                      return ()
383 | 
384 |     w1 <- async $ replicateM_ n $ op stack1
385 |     w2 <- async $ replicateM_ n $ op stack2
386 |     waitBoth w1 w2
387 |     return ()
388 | 
389 | atomicMaybeModifyIORef :: Int -> IO ()
390 | atomicMaybeModifyIORef n = do
391 |     counter <- newIORef 0
392 |     stack1 <- newIORef [] -- non-contentious work done on these:
393 |     stack2 <- newIORef []
394 |     let op stck = do cnt <- readIORef counter
395 |                      atomicModifyIORef' stck (\st-> (cnt:st,()))
396 |                      atomicModifyIORef' counter (\cnt1-> (if cnt1 == cnt then cnt+1 else cnt1, ()))
397 | 
398 |     w1 <- async $ replicateM_ n $ op stack1
399 |     w2 <- async $ replicateM_ n $ op stack2
400 |     waitBoth w1 w2
401 |     return ()
402 | 
403 | readMaybeAtomicModifyTVar :: Int -> IO ()
404 | readMaybeAtomicModifyTVar n = do
405 |     counter <- newTVarIO 0
406 |     stack1 <- newTVarIO [] -- non-contentious work done on these:
407 |     stack2 <- newTVarIO []
408 |     let op stck = do cnt <- readTVarIO counter
409 |                      atomically $ modifyTVar stck (\st-> cnt:st)
410 |                      cnt' <- readTVarIO counter
411 |                      if cnt' == cnt
412 |                          then atomically $ modifyTVar counter (\cnt1-> if cnt1 == cnt then cnt+1 else cnt1)
413 |                          else return ()
414 | 
415 |     w1 <- async $ replicateM_ n $ op stack1
416 |     w2 <- async $ replicateM_ n $ op stack2
417 |     waitBoth w1 w2
418 |     return ()
419 | 
420 | atomicMaybeModifyTVar :: Int -> IO ()
421 | atomicMaybeModifyTVar n = do
422 |     counter <- newTVarIO 0
423 |     stack1 <- newTVarIO [] -- non-contentious work done on these:
424 |     stack2 <- newTVarIO []
425 |     let op stck = do cnt <- readTVarIO counter
426 |                      atomically $ modifyTVar stck (\st-> cnt:st)
427 |                      atomically $ modifyTVar counter (\cnt1-> if cnt1 == cnt then cnt+1 else cnt1)
428 | 
429 |     w1 <- async $ replicateM_ n $ op stack1
430 |     w2 <- async $ replicateM_ n $ op stack2
431 |     waitBoth w1 w2
432 |     return ()
433 | 
434 | -- variants with a less realistic payload, simulating higher contention with more writers:
435 | {-
436 | -- NOTE: REMOVING: this isn't representative of the behavior of multiple
437 | -- threads, each with a sizable payload between read and atomicModifyIORef,
438 | -- which is what we care about.
439 | readMaybeAtomicModifyIORefHiC :: Int -> IO ()
440 | readMaybeAtomicModifyIORefHiC n = do
441 |     counter <- newIORef 0
442 |     let op  =     do cnt <- readIORef counter
443 |                      evaluate (show $ sqrt cnt)
444 |                      cnt' <- readIORef counter
445 |                      if cnt' == cnt
446 |                          then atomicModifyIORef' counter (\cnt1-> (if cnt1 == cnt then cnt+1 else cnt1, ()))
447 |                          else return ()
448 | 
449 |     w1 <- async $ replicateM_ n $ op 
450 |     w2 <- async $ replicateM_ n $ op 
451 |     waitBoth w1 w2
452 |     return ()
453 | 
454 | atomicMaybeModifyIORefHiC :: Int -> IO ()
455 | atomicMaybeModifyIORefHiC n = do
456 |     counter <- newIORef 0
457 |     let op      = do cnt <- readIORef counter
458 |                      evaluate (show $ sqrt cnt)
459 |                      atomicModifyIORef' counter (\cnt1-> (if cnt1 == cnt then cnt+1 else cnt1, ()))
460 | 
461 |     w1 <- async $ replicateM_ n $ op 
462 |     w2 <- async $ replicateM_ n $ op 
463 |     waitBoth w1 w2
464 |     return ()
465 | -}
466 | 
467 | 
468 | -- Do atomicModifyIORefs block readers?
469 | -- NO
470 | readsAgainstAtomicModifyIORefs :: Int -> IO ()
471 | readsAgainstAtomicModifyIORefs n = do
472 |     cntr <- newIORef 0
473 |     t <- async $ forever (atomicModifyIORef' cntr (\c-> (c+1,())))
474 |     replicateM_ n (readIORef cntr >>= evaluate)
475 |     cancel t
476 | 
477 | readsAgainstNonAtomicModify :: Int -> IO ()
478 | readsAgainstNonAtomicModify n = do
479 |     cntr <- newIORef 0
480 |     t <- async $ forever (modifyIORef' cntr (\c-> c+1))
481 |     replicateM_ n (readIORef cntr >>= evaluate)
482 |     cancel t
483 |     
484 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2013, Brandon Simmons
 2 | 
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 |     * Redistributions of source code must retain the above copyright
 9 |       notice, this list of conditions and the following disclaimer.
10 | 
11 |     * Redistributions in binary form must reproduce the above
12 |       copyright notice, this list of conditions and the following
13 |       disclaimer in the documentation and/or other materials provided
14 |       with the distribution.
15 | 
16 |     * Neither the name of Brandon Simmons nor the names of other
17 |       contributors may be used to endorse or promote products derived
18 |       from this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 


--------------------------------------------------------------------------------
/MVarExperiment.hs:
--------------------------------------------------------------------------------
 1 | module Main
 2 |     where
 3 | 
 4 | import Control.Concurrent
 5 | import Control.Monad
 6 | 
 7 | -- this to be run on one thread (i.e. +RTS -N1) to observe what happens to the
 8 | -- fairness property of MVars as green threads exceed HECs.
 9 | --
10 | -- Interesting:
11 | -- https://ghc.haskell.org/trac/ghc/ticket/7606
12 | -- https://ghc.haskell.org/trac/ghc/wiki/Commentary/Rts/Scheduler
13 | main = do
14 |     st <- newMVar []
15 |     vs <- mapM (\n-> do  v <- newEmptyMVar
16 |                          forkIO (replicateM_ 10 $ modifyMVar_ st (return . (n:)) >> putMVar v ())
17 |                          return v
18 |           ) [1..10]
19 |     mapM_ takeMVar vs -- wait
20 |     takeMVar st >>= print
21 |     
22 | 
23 | 


--------------------------------------------------------------------------------
/Main1.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE RankNTypes #-}
  2 | {-# LANGUAGE PackageImports #-}
  3 | module Main
  4 |     where 
  5 | 
  6 | import Control.Concurrent.Async
  7 | import Control.Monad
  8 | import System.Environment
  9 | 
 10 | import Control.Concurrent
 11 | import Control.Concurrent.Chan
 12 | import Control.Concurrent.STM
 13 | import Control.Concurrent.STM.TQueue
 14 | import Control.Concurrent.STM.TBQueue
 15 | 
 16 | import Control.Concurrent.MVar
 17 | import Data.IORef
 18 | import Criterion.Main
 19 | import Control.Exception(evaluate,mask_)
 20 | 
 21 | import qualified "chan-split-fast" Control.Concurrent.Chan.Split as S
 22 | import qualified "split-channel" Control.Concurrent.Chan.Split as SC
 23 | import Data.Primitive.MutVar
 24 | import Control.Monad.Primitive(PrimState)
 25 | import Data.Atomics.Counter
 26 | import Data.Atomics
 27 | import System.Random
 28 | import System.Random.MWC
 29 | 
 30 | import Data.Array.IArray
 31 | import qualified Data.Vector as V
 32 | import qualified Data.Vector.Generic.Mutable as MV
 33 | import qualified Data.Vector.Unboxed.Mutable as UMV
 34 | import qualified Data.Primitive as P
 35 | -- TODO fix imports above
 36 | import qualified Data.Vector.Mutable as MVec
 37 | 
 38 | import System.Time(getClockTime)
 39 | import Data.Time.Clock.POSIX
 40 | 
 41 | import Data.Word
 42 | import Data.Bits
 43 | 
 44 | -- Hack since these aren't currently working with ghc 7.8
 45 | #if MIN_VERSION_base(4,7,0)
 46 | #else
 47 | import qualified Data.Concurrent.Queue.MichaelScott as MS
 48 | #endif
 49 | import qualified Data.Concurrent.Deque.ChaseLev as CL
 50 | 
 51 | import Benchmarks
 52 | 
 53 | -- These tests initially taken from stm/bench/chanbench.hs, ported to
 54 | -- criterion, with some additions.
 55 | --
 56 | -- The original used CPP to avoid code duplication while also ensuring GHC
 57 | -- optimized the code in a realistic fashion. Here we just copy paste.
 58 | 
 59 | main = do 
 60 |   let n = 100000
 61 | --let n = 2000000  -- original suggested value, bugs if exceeded
 62 | 
 63 |   mv <- newEmptyMVar -- This to be left empty after each test
 64 |   mvFull <- newMVar '1'
 65 |   -- --
 66 |   -- mvWithFinalizer <- newEmptyMVar
 67 |   -- mkWeakMVar mvWithFinalizer $ return ()
 68 |   -- --
 69 |   -- mvFinalizee <- newMVar 'a'
 70 |   -- mvWithFinalizer <- newMVar ()
 71 |   -- mkWeakMVar mvWithFinalizer $
 72 |   --     modifyMVar_ mvFinalizee (const $ return 'b')
 73 |   -- --
 74 |   tmv <- newEmptyTMVarIO 
 75 |   tv <- newTVarIO '1' 
 76 |   ior <- newIORef '1'
 77 |   mutv <- newMutVar '1'
 78 | 
 79 |   atomic_counter <- newCounter 0
 80 | 
 81 |   -- to be left empty at emd of each test:
 82 | #if MIN_VERSION_base(4,7,0)
 83 | #else
 84 |   lockfreeQEmpty <- MS.newQ
 85 | #endif
 86 |   chaselevQEmpty <- CL.newQ
 87 |   chanEmpty <- newChan
 88 |   tchanEmpty <- newTChanIO
 89 |   tqueueEmpty <- newTQueueIO
 90 |   tbqueueEmpty <- newTBQueueIO 2
 91 |   (fastEmptyI,fastEmptyO) <- S.newSplitChan
 92 |   (splitchannelEmptyI,splitchannelEmptyO) <- SC.new
 93 | 
 94 |   -- random generators
 95 |   mwc_gen <- createSystemRandom
 96 |   sys_rand_gen <- newStdGen
 97 | 
 98 |   let arr8 = listArray (0,7) [1..8] :: Array Int Int
 99 |   let arr16 = listArray (0,15) [1..16] :: Array Int Int
100 |   let vec8 = V.fromList [1..8] :: V.Vector Int
101 |   let vec16 = V.fromList [1..16] :: V.Vector Int
102 |   mvec8 <- V.thaw vec8 -- :: V.MVector (PrimState IO) Int
103 |   mvec16 <- V.thaw vec16 -- :: V.MVector (PrimState IO) Int
104 |   parr8 <- P.newArray 8 (0::Int)
105 |   parr16 <- P.newArray 16 (0::Int)
106 |   parr32 <- P.newArray 32 (0::Int)
107 |   parr128 <- P.newArray 128 (0::Int)
108 |   parr512 <- P.newArray 512 (0::Int)
109 |   parr32Justs <- P.newArray 32 (Just 0 :: Maybe Int)
110 |   iparr8 <- ((P.newArray 8 (0::Int)) >>= P.unsafeFreezeArray) :: IO (P.Array Int)
111 |   iparr16 <- ((P.newArray 16 (0::Int)) >>= P.unsafeFreezeArray) :: IO (P.Array Int)
112 | 
113 |   ba16 <- (P.newByteArray (16 * P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO)))
114 |   forM_ [0..15] $ \i-> P.writeByteArray ba16 i (0::Int)
115 |   umvvec16 <- UMV.new 16 :: IO (UMV.IOVector Int)
116 |   UMV.set umvvec16 0 :: IO ()
117 | 
118 |   defaultMain $
119 |         [ bgroup "Channel implementations" $
120 |             -- Very artificial; just adding up the consts of the
121 |             -- takes/puts/reads involved in getting a single message in and out
122 |             [ bgroup "Latency micro-benchmark" $
123 |                 [ bench "Chan" (writeChan chanEmpty () >> readChan chanEmpty)
124 |                 , bench "TChan" (atomically (writeTChan tchanEmpty () >>  readTChan tchanEmpty))
125 |                 , bench "TQueue" (atomically (writeTQueue tqueueEmpty () >>  readTQueue tqueueEmpty))
126 |                 , bench "TBQueue" (atomically (writeTBQueue tbqueueEmpty () >>  readTBQueue tbqueueEmpty))
127 |                 , bench "chan-split-fast" (S.writeChan fastEmptyI () >> S.readChan fastEmptyO)
128 |                 , bench "split-channel" (SC.send splitchannelEmptyI () >> SC.receive splitchannelEmptyO)
129 | #if MIN_VERSION_base(4,7,0)
130 | #else
131 |                 , bench "lockfree-queue" (MS.pushL lockfreeQEmpty () >> msreadR lockfreeQEmpty)
132 | #endif
133 |                 , bench "chaselev-dequeue" (CL.pushL chaselevQEmpty () >> clreadR chaselevQEmpty)
134 |                 ]
135 |             , bgroup ("Single-thread throughput with "++show n++" messages") $
136 |                 -- some pure operations we'd like a rough measurement for, e.g.
137 |                 -- the TQueue performs a reverse [1..n] in a test run, so we'd
138 |                 -- like an idea of how much that factor affects throughput.
139 |                 [ bgroup "For scale" $
140 |                       [ bench "reverse [1..n]" $ nf (\n'-> reverse [1..n']) n
141 |                       , bench "reverse replicate n 1" $ nf (\n'-> replicate n' (1::Int)) n
142 |                       ]
143 |                 , bgroup "Chan" $
144 |                       -- original tests from chanbench.hs
145 |                       [ bench "sequential write all then read all" $ runtestChan1 n
146 |                       , bench "repeated write some, read some" $ runtestChan2 n
147 |                       ]
148 |                 , bgroup "TChan" $
149 |                       [ bench "sequential write all then read all" $ runtestTChan1 n
150 |                       , bench "repeated write some, read some" $ runtestTChan2 n
151 |                       ]
152 |                 , bgroup "TQueue" $
153 |                       [ bench "sequential write all then read all" $ runtestTQueue1 n
154 |                       , bench "repeated write some, read some" $ runtestTQueue2 n
155 |                       ]
156 |                 , bgroup "TBQueue" $
157 |                       [ bench "sequential write all then read all" $ runtestTBQueue1 n
158 |                       , bench "repeated write some, read some" $ runtestTBQueue2 n
159 |                       ]
160 |                 -- OTHER CHAN IMPLEMENTATIONS:
161 |                 , bgroup "chan-split-fast" $
162 |                       [ bench "sequential write all then read all" $ runtestSplitChan1 n
163 |                       , bench "repeated write some, read some" $ runtestSplitChan2 n
164 |                       ]
165 |                 , bgroup "split-channel" $
166 |                       [ bench "sequential write all then read all" $ runtestSplitChannel1 n
167 |                       , bench "repeated write some, read some" $ runtestSplitChannel2 n
168 |                       ]
169 | #if MIN_VERSION_base(4,7,0)
170 | #else
171 |                 , bgroup "lockfree-queue" $
172 |                       [ bench "sequential write all then read all" $ runtestLockfreeQueue1 n
173 |                       , bench "repeated write some, read some" $ runtestLockfreeQueue2 n
174 |                       ]
175 | #endif
176 |                 , bgroup "chaselev-dequeue" $
177 |                       [ bench "sequential write all then read all" $ runtestChaseLevQueue1 n
178 |                       , bench "repeated write some, read some" $ runtestChaseLevQueue2 n
179 |                       ]
180 |                 ]
181 |             ]
182 |         , bgroup "Forking, context switches, and Misc. on a single core" $
183 |             [ bench "forkIO" (forkIO (return ()) >> return ())
184 |             , bench "put,take MVar" $ do
185 |                 putMVar mv '0'
186 |                 takeMVar mv
187 |             , bench "put,takeMVar + forkIO + 2 context switches" $ do
188 |                 forkIO $ putMVar mv '0'
189 |                 takeMVar mv
190 |             , bench "getNumCapabilities" getNumCapabilities
191 |             , bench "myThreadId" myThreadId
192 | 
193 |             , bench "myThreadId >>= threadCapability" $ myThreadId >>= threadCapability
194 |             -- It may not be possible to re-use generators for our applications of random, so:
195 |             , bench "random new_gen" $ newStdGen
196 |             , bench "random Int range: (1,8)" $ whnf (randomR (1 :: Int, 8)) sys_rand_gen
197 |           -- THIS IS CRAZY SLOW TODO move out of this so we can actually compare graphs:
198 |           --, bench "mwc-random new_gen" $ createSystemRandom
199 |             , bench "mwc-random Int range: (1,8)" $ ((uniformR (1 :: Int, 8) mwc_gen) :: IO Int)
200 |             , bench "randomRIO (1,8)" $ randomRIO (1::Int,8)
201 | 
202 |             , bench "old-time getClockTime" $ getClockTime
203 |             , bench "time getPOSIXTime" getPOSIXTime
204 |             ]
205 | 
206 |         , bgroup "Var primitives" $
207 |             [ bgroup "For scale" $ 
208 |                 [ bench "mod (Int)" $ nf (2147483647 `mod`) (8 :: Int)
209 |                 , bench "rem (Int)" $ nf (2147483647 `rem`) (8 :: Int)
210 |                 , bench "mod (Word)" $ nf ((2147483647::Word) `mod`) (8 :: Word)
211 |                 , bench "rem (Word)" $ nf ((2147483647::Word) `rem`) (8 :: Word)
212 |                 -- tricks since we only need powers of two: a `mod` 2^i = a .&. (2^i–1) 
213 |                 , bench "AND modulo (Int)" $ nf (((2147483647::Int) .&.) . subtract 1) (8 :: Int)
214 |                 , bench "AND modulo (Int) easier" $ nf (((2147483647::Int) .&.)) (7 :: Int)
215 |                 , bench "AND modulo (Word)" $ nf (((2147483647::Word) .&.) . subtract 1) (8 :: Word)
216 |                 
217 |                 -- we might end up just using subtraction:
218 |                 , bench "subtraction (Int)" $ nf (subtract 2147483640) (2147483647 :: Int)
219 |                 -- ALL OF ABOVE IS STUPID SLOW; probably measuring overhead of function call, boxing/unboxing, etc.
220 |                 , bench "subtraction on counter incr" $ nfIO $ fmap (subtract 2147483640) (incrCounter 1 atomic_counter)
221 |                 ]
222 |             , bgroup "atomic-primops (puts on safety goggles...)" $ 
223 |                 [ bench "newCounter" $ newCounter 0
224 |                 , bench "incrCounter 1" $ incrCounter 1 atomic_counter
225 |                 , bench "incrCounter 1024" $ incrCounter 1024 atomic_counter
226 |                 ]
227 |             , bgroup "IORef" $ 
228 |                 [ bench "newIORef Char" $ (newIORef '0')
229 |                 , bench "writeIORef" $ (writeIORef ior '1')
230 |                 , bench "readIORef" $ (readIORef ior)
231 |                 , bench "readIORef (w result forced)" $ nfIO (readIORef ior) -- what does this tell us w/r/t above?
232 |                 , bench "readForCAS" $ (fmap peekTicket $ readForCAS ior)
233 |                 , bench "readForCAS (w result forced)" $ nfIO (fmap peekTicket $ readForCAS ior)
234 |                 , bench "modifyIORef' (i.e. readIORef + writeIORef)" (modifyIORef' ior $ const '2')
235 |                 , bench "atomicModifyIORef' (i.e. (in GHC) strict atomicModifyMutVar)" $ (atomicModifyIORef' ior $ const ('3','3'))
236 |                 , bench "atomicModifyIORef (i.e. (in GHC) atomicModifyMutVar)" $ (atomicModifyIORef ior $ const ('3','3'))
237 |                 
238 |                 , bench "atomicModifyIORefCAS (i.e. atomic-primops CAS loop)" $ (atomicModifyIORefCAS ior $ const ('4','4'))
239 |                 , bench "atomicModifyIORefCAS' (my CAS loop)" $ (atomicModifyIORefCAS' ior $ const ('4','4'))
240 | 
241 |                 ]
242 |             , bgroup "MVar" $
243 |                 [ bench "newEmptyMVar" $ newEmptyMVar
244 |                 , bench "newEmptyMVar + putMVar (i.e. newMVar Char)" (newEmptyMVar >>= \v->putMVar v '0')
245 |                 , bench "putMVar + takeMVar" $ (putMVar mv '1' >> takeMVar mv)
246 |                 , bench "modifyMVarMasked_ (i.e. mask + takeMVar + putMVar + exception-handling)" $ (modifyMVarMasked_ mvFull (const $ return '2'))
247 |                 , bench "mask_ + takeMVar-and-mod + putMVar)" $ mask_ (takeMVar mvFull >>= (putMVar mvFull . const '2'))
248 |                 , bench "newMVar + mkWeakMVar with finalizer" $ (newMVar '1' >>= flip mkWeakMVar (return ()))
249 |                 -- These show no effect of a finalizer:
250 |                 -- , bench "on MVar with finalizer: putMVar, takeMVar" $ (putMVar mvWithFinalizer '1' >> takeMVar mvWithFinalizer)
251 |                 -- , bench "On target of an MVar finalizer: takeMVar, putMVar" $ (takeMVar mvFinalizee >>= putMVar mvFinalizee)
252 | #if MIN_VERSION_base(4,7,0)
253 |                 , bench "tryReadMVar" $ tryReadMVar mvFull
254 |                 , bench "tryReadMVar (w result forced)" $ nfIO $ tryReadMVar mvFull
255 | #endif
256 |                 , bench "readMVar" $ readMVar mvFull
257 |                 , bench "readMVar (w result forced)" $ nfIO $ readMVar mvFull
258 |                 ]
259 | 
260 |             , bgroup "TVar" $
261 |                 [ bench "newTVarIO ()" $ (newTVarIO ())
262 |                 , bench "atomically writeTVar" $ (atomically $ writeTVar tv '1')
263 |                 , bench "atomically readTVar" $ (atomically $ readTVar tv)
264 |                 , bench "readTVarIO" $ (readTVarIO tv)
265 |                 , bench "atomically modifyTVar' (i.e. atomically (readTVar + writeTVar))" $ (atomically $ modifyTVar' tv (const '2'))
266 |                 , bench "(atomically writeTVar) + (atomically readTVar)" $ ((atomically $ writeTVar tv '1') >> (atomically $ readTVar tv))
267 |                 ]
268 |             , bgroup "TMVar" $
269 |                 [ bench "newEmptyTMVar (i.e. newTVarIO)" $ newEmptyTMVarIO
270 |                 , bench "atomically (newEmptyTMVar + putTMVar) (i.e. newTVar + readTVar + writeTVar,retry)" (atomically (newEmptyTMVar >>= \v->putTMVar v ()))
271 |                 , bench "atomically (putTMVar + takeTMVar)" $ (atomically $ (putTMVar tmv '1' >> takeTMVar tmv))
272 |                 , bench "(atomically putTMVar) + (atomically takeTMVar)" $ ((atomically $ putTMVar tmv '1') >> (atomically $ takeTMVar tmv))
273 |                 ]
274 |             , bgroup "MutVar" $
275 |                 [ bench "newMutVar ()" $ (newMutVar () :: IO (MutVar (PrimState IO) ()))
276 |                 , bench "writeMutVar" $ (writeMutVar mutv '1' :: IO ())
277 |                 , bench "readMutVar" $ (readMutVar mutv :: IO Char)
278 |                 , bench "atomicModifyMutVar" $ (atomicModifyMutVar mutv $ const ('2','2') :: IO Char)
279 |                 ]
280 |             ]
281 |         -- Some more tests of pure operations relevant to TQueue style dequeue
282 |         -- performance.
283 |         , bgroup "Misc" $
284 |             -- of interest for TQueue style approach
285 |             [ bgroup "cons and reverse" $ 
286 |                 [ bench "cons" $ nf (:[]) True
287 |                 , bench "pure unrolled cons then final reverse x10" $       nf testConsUnrolledReverse 10
288 |                 , bench "pure cons then final reverse x10" $       nf testConsReverse 10
289 |                 , bench "pure unrolled cons then final reverse x5" $       nf testConsUnrolledReverse 5
290 |                 , bench "pure cons then final reverse x5" $       nf testConsReverse 5
291 | 
292 |                 , bench "pure cons-composition append x10" $ nf testCompositionAppend 10
293 |                 , bench "pure cons then final reverse x10" $       nf testConsReverse 10
294 |                 , bench "pure cons-composition append x100" $ nf testCompositionAppend 100
295 |                 , bench "pure cons then final reverse x100" $       nf testConsReverse 100
296 |                 , bench "pure cons-composition append x10000" $ nf testCompositionAppend 10000
297 |                 , bench "pure cons then final reverse x10000" $       nf testConsReverse 10000
298 | 
299 |                 , bench "pure cons-composition append and prepend x10" $ nf testCompositionAppendPrepend 10
300 |                 , bench "pure cons-composition append and prepend x100" $ nf testCompositionAppendPrepend 100
301 |                 , bench "pure cons-composition append and prepend x10000" $ nf testCompositionAppendPrepend 10000
302 | 
303 |                 , bench "mvar-stored cons-composition append x10" $ nfIO $ testCompositionAppendInMVar 10
304 |                 , bench "mvar-stored cons then final reverse x10" $ nfIO $       testConsReverseInMVar 10
305 |                 , bench "mvar-stored cons-composition append x100" $ nfIO $ testCompositionAppendInMVar 100
306 |                 , bench "mvar-stored cons then final reverse x100" $ nfIO $       testConsReverseInMVar 100
307 |                 , bench "mvar-stored cons-composition append x10000" $ nfIO $ testCompositionAppendInMVar 10000
308 |                 , bench "mvar-stored cons then final reverse x10000" $ nfIO $       testConsReverseInMVar 10000
309 |                 
310 |                 , bench "storing mvar-stored cons-composition append x10" $ nfIO $ testStoreCompositionAppendInMVar 10
311 |                 , bench "storing mvar-stored cons then final reverse x10" $ nfIO $       testStoreConsReverseInMVar 10
312 |                 , bench "storing mvar-stored cons-composition append x100" $ nfIO $ testStoreCompositionAppendInMVar 100
313 |                 , bench "storing mvar-stored cons then final reverse x100" $ nfIO $       testStoreConsReverseInMVar 100
314 |                 , bench "storing mvar-stored cons-composition append x10000" $ nfIO $ testStoreCompositionAppendInMVar 10000
315 |                 , bench "storing mvar-stored cons then final reverse x10000" $ nfIO $       testStoreConsReverseInMVar 10000
316 |                 ]
317 |             , bgroup "arrays" $
318 |                 [ bgroup "indexing" $ 
319 |                     [ bench "index 8th list" $ nf ([(1::Int)..8] !!) 7
320 |                     , bench "index 8th IArray" $ nf (arr8 !) 7
321 |                     , bench "index 8th Vector" $ nf (V.unsafeIndex vec8) 7
322 |                     , bench "index 8th MVector" $ nfIO $ (MV.unsafeRead mvec8) 7
323 |                     -- I think this is basically MVector AFAICT
324 |                     , bench "index 8th Primitiv MutableArray" $ nfIO $ (P.readArray parr8) 7
325 |                     , bench "index 8th Primitiv Array" $ nf (P.indexArray iparr8) 7
326 | 
327 |                     , bench "index 16th list" $ nf ([(1::Int)..16] !!) 15
328 |                     , bench "index 16th IArray" $ nf (arr16 !) 15
329 |                     , bench "index 16th Vector" $ nf (V.unsafeIndex vec16) 15
330 |                     , bench "index 16th MVector" $ nfIO $ (MV.unsafeRead mvec16) 15
331 |                     , bench "index 16th Primitiv MutableArray" $ nfIO $ (P.readArray parr16) 15
332 |                     , bench "index 16th Primitiv Array" $ nf (P.indexArray iparr16) 15
333 | 
334 |                     , bench "readArrayElem for CAS (MutableArray)" $ nfIO (fmap peekTicket $ readArrayElem parr16 15 )
335 |                     , bench "readArray (MutableArray)" $ nfIO (P.readArray parr16 15 )
336 |                     , bench "readByteArray (MutableByteArray, usable for CAS)" $ nfIO (P.readByteArray ba16 15 :: IO Int)
337 |                     , bench "read Mutable Unboxed Vector" $ (UMV.read umvvec16 15 :: IO Int)
338 |                     ]
339 |                 , bgroup "writing" $
340 |                     [ bench "write MutableArray" $ (P.writeArray parr16 15 1 :: IO ())
341 |                     , bench "CAS MutableArray (along with a readArrayElem)" (readArrayElem parr16 15 >>= (\t-> casArrayElem parr16 15 t 2))
342 |                     , bench "write MutableByteArray" (P.writeByteArray ba16 15 (1::Int) :: IO ())
343 |                     , bench "CAS MutableByteArray (along with a readByteArray)" (P.readByteArray ba16 15 >>= (\t-> casByteArrayInt ba16 15 t (2::Int)))
344 |                     , bench "write Mutable Unboxed Vector" $ (UMV.write umvvec16 15 2 :: IO ())
345 |                     , bench "fetchAddByteArrayInt" $ fetchAddByteArrayInt ba16 14 1
346 |                     , bench "fetchAddByteArrayInt w result forced" $ nfIO $ fetchAddByteArrayInt ba16 14 1
347 |                     ]
348 |                 , bgroup "creating" $
349 |                     [ bench "new MVector 8 Ints" $ (MVec.new 8 :: IO (MVec.IOVector Int))
350 |                     , bench "new MVector 32 Ints" $ (MVec.new 32 :: IO (MVec.IOVector Int))
351 |                     , bench "unsafeNew MVector 8 Ints" $ (MVec.unsafeNew 8 :: IO (MVec.IOVector Int))
352 |                     , bench "unsafeNew MVector 32 Ints" $ (MVec.unsafeNew 32 :: IO (MVec.IOVector Int))
353 | 
354 |                     , bench "new MutableArray 8 Ints" $ (P.newArray 8 0 :: IO (P.MutableArray (PrimState IO) Int))
355 |                     , bench "new MutableArray 32 Ints" $ (P.newArray 32 0 :: IO (P.MutableArray (PrimState IO) Int))
356 |                     , bench "new MutableArray 32 Nothing :: Maybe Ints" $ (P.newArray 32 Nothing :: IO (P.MutableArray (PrimState IO) (Maybe Int)))
357 |                     , bench "cloned MutableArray 8 Ints" $ (P.cloneMutableArray parr8 0 8 :: IO (P.MutableArray (PrimState IO) Int))
358 |                     , bench "cloned MutableArray 32 Ints" $ (P.cloneMutableArray parr32 0 32 :: IO (P.MutableArray (PrimState IO) Int))
359 |                     , bench "cloned MutableArray 128 Ints" $ (P.cloneMutableArray parr128 0 128 :: IO (P.MutableArray (PrimState IO) Int))
360 |                     , bench "cloned MutableArray 512 Ints" $ (P.cloneMutableArray parr512 0 512 :: IO (P.MutableArray (PrimState IO) Int))
361 | 
362 |                     , bench "new MutableByteArray 8 Ints" (P.newByteArray (8* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO)))
363 |                     , bench "new MutableByteArray 32 Ints" (P.newByteArray (32* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO)))
364 |                     , bench "new set MutableByteArray 32 Ints" $ (P.newByteArray (32* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO))) >>= \a-> P.setByteArray a 0 32 (0 :: Int)
365 |                     , bench "new MutableByteArray 128 Ints" (P.newByteArray (128* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO)))
366 |                     , bench "new set MutableByteArray 128 Ints" $ (P.newByteArray (128* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO))) >>= \a-> P.setByteArray a 0 128 (0 :: Int)
367 |                     , bench "new MutableByteArray 512 Ints" (P.newByteArray (512* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO)))
368 |                     , bench "new set MutableByteArray 512 Ints" $ (P.newByteArray (512* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO))) >>= \a-> P.setByteArray a 0 512 (0 :: Int)
369 |                     , bench "new MutableByteArray 2048 Ints" (P.newByteArray (2048* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO)))
370 |                     , bench "new MutableByteArray 8192 Ints" (P.newByteArray (8192* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO)))
371 |                     , bench "new unboxed Mutable Vector 8 Ints" (UMV.new 8 :: IO (UMV.IOVector Int))
372 |                     , bench "new unboxed Mutable Vector 32 Ints" (UMV.new 32 :: IO (UMV.IOVector Int))
373 |                     , bench "new unboxed Mutable Vector 128 Ints" (UMV.new 128 :: IO (UMV.IOVector Int))
374 |                     ]
375 |                 ]
376 |             ]
377 |         ]
378 |   -- takeMVar mvWithFinalizer -- keep finalizer from actually running
379 | 
380 | 


--------------------------------------------------------------------------------
/MainN.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE RankNTypes #-}
  2 | {-# LANGUAGE PackageImports #-}
  3 | module Main
  4 |     where 
  5 | 
  6 | import Control.Concurrent.Async
  7 | import Control.Monad
  8 | import System.Environment
  9 | 
 10 | import qualified Data.Primitive as P
 11 | import Control.Concurrent
 12 | import Control.Concurrent.Chan
 13 | import Control.Concurrent.STM
 14 | import Control.Concurrent.STM.TQueue
 15 | import Control.Concurrent.STM.TBQueue
 16 | 
 17 | import Control.Concurrent.MVar
 18 | import Data.IORef
 19 | import Criterion.Main
 20 | import Control.Exception(evaluate)
 21 | 
 22 | 
 23 | import qualified "chan-split-fast" Control.Concurrent.Chan.Split as S
 24 | import qualified "split-channel" Control.Concurrent.Chan.Split as SC
 25 | import Data.Primitive.MutVar
 26 | import Control.Monad.Primitive(PrimState)
 27 | import Data.Atomics.Counter
 28 | import Data.Atomics
 29 | 
 30 | #if MIN_VERSION_base(4,7,0)
 31 | #else
 32 | import qualified Data.Concurrent.Queue.MichaelScott as MS
 33 | #endif
 34 | 
 35 | import GHC.Conc
 36 | 
 37 | import Benchmarks
 38 | 
 39 | -- These tests initially taken from stm/bench/chanbench.hs, ported to
 40 | -- criterion, with some additions.
 41 | --
 42 | -- The original used CPP to avoid code duplication while also ensuring GHC
 43 | -- optimized the code in a realistic fashion. Here we just copy paste.
 44 | 
 45 | main = do 
 46 |   let n = 100000
 47 | --let n = 2000000  -- original suggested value, bugs if exceeded
 48 | 
 49 |   procs <- getNumCapabilities
 50 |   let procs_div2 = procs `div` 2
 51 |   if procs_div2 >= 0 then return ()
 52 |                      else error "Run with RTS +N2 or more"
 53 | 
 54 |   mv <- newEmptyMVar -- This to be left empty after each test
 55 |   mvFull <- newMVar undefined
 56 |   -- --
 57 |   -- mvWithFinalizer <- newEmptyMVar
 58 |   -- mkWeakMVar mvWithFinalizer $ return ()
 59 |   -- --
 60 |   -- mvFinalizee <- newMVar 'a'
 61 |   -- mvWithFinalizer <- newMVar ()
 62 |   -- mkWeakMVar mvWithFinalizer $
 63 |   --     modifyMVar_ mvFinalizee (const $ return 'b')
 64 |   -- --
 65 |   tmv <- newEmptyTMVarIO 
 66 |   tv <- newTVarIO undefined 
 67 |   ior <- newIORef undefined
 68 |   mutv <- newMutVar undefined
 69 | 
 70 |   counter_mvar <- newMVar (1::Int)
 71 |   counter_ioref <- newIORef (1::Int)
 72 |   counter_tvar <- newTVarIO (1::Int)
 73 |   counter_atomic_counter <- newCounter (1::Int)
 74 | 
 75 |   fill_empty_chan <- newChan
 76 |   fill_empty_tchan <- newTChanIO
 77 |   fill_empty_tqueue <- newTQueueIO
 78 |   fill_empty_tbqueue <- newTBQueueIO maxBound
 79 |   (fill_empty_fastI, fill_empty_fastO) <- S.newSplitChan
 80 |   (fill_empty_splitchannelI, fill_empty_splitchannelO) <- SC.new
 81 | #if MIN_VERSION_base(4,7,0)
 82 | #else
 83 |   fill_empty_lockfree <- MS.newQ
 84 | #endif
 85 | 
 86 |   defaultMain $
 87 |         [ bgroup "Var primitives" $
 88 |             -- This gives us an idea of how long a lock is held by these atomic
 89 |             -- ops, and the effects of retry/blocking scheduling behavior.
 90 |             -- compare this with latency measure in Main1 to get the whole
 91 |             -- picture:
 92 |             -- Subtract the cost of:
 93 |             --   - 2 context switches
 94 |             --   - 4 newEmptyMVar
 95 |             --   - 4 takeMVar
 96 |             --   - 4 putMVar
 97 |             -- TODO: also test with N green threads per core.
 98 |             [ bgroup ("Throughput on "++(show n)++" concurrent atomic mods") $
 99 |                 -- just forks some threads all atomically modifying a variable:
100 |                 let {-# INLINE mod_test #-}
101 |                     mod_test = mod_test_n n
102 |                     {-# INLINE mod_test_n #-}
103 |                     mod_test_n n' = \threads modf -> do
104 |                       dones <- replicateM threads newEmptyMVar ; starts <- replicateM threads newEmptyMVar
105 |                       mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n' `div` threads) modf >> putMVar done1 ()) $ zip starts dones
106 |                       mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
107 | 
108 |                     -- We use this payload to scale contention; on my machine
109 |                     -- timesN values of 1,2,3,4 run at fairly consistent: 15ns,
110 |                     -- 19ns, 29ns, and 37ns (note: 22.4ns for an atomicModifyIORef)
111 |                     {-# NOINLINE payload #-}
112 |                     payload timesN = (evaluate $ (foldr ($) 2 $ replicate timesN sqrt) :: IO Float)
113 | 
114 |                     varGroupPayload perProc numPL = [
115 |                          bench "modifyMVar_" $ mod_test (procs*perProc) $
116 |                           (modifyMVar_ counter_mvar (return . (+1)) >> payload numPL)
117 | 
118 |                         , bench "modifyMVarMasked_" $ mod_test (procs*perProc) $
119 |                             (modifyMVarMasked_ counter_mvar (return . (+1)) >> payload numPL)
120 |                         
121 |                         , bench "atomicModifyIORef'" $ mod_test (procs*perProc) $
122 |                             (atomicModifyIORef' counter_ioref (\x-> (x+1,()) ) >> payload numPL)
123 | 
124 |                         , bench "atomically modifyTVar'" $ mod_test (procs*perProc) $
125 |                             ((atomically $ modifyTVar' counter_tvar ((+1))) >> payload numPL) 
126 | 
127 |                         , bench "incrCounter (atomic-primops)" $ mod_test (procs*perProc) $
128 |                             (incrCounter 1 counter_atomic_counter >> payload numPL)
129 |                         
130 |                         , bench "atomicModifyIORefCAS (atomic-primops)" $ mod_test (procs*perProc) $
131 |                             (atomicModifyIORefCAS counter_ioref (\x-> (x+1,()) ) >> payload numPL)
132 | 
133 |                         , bench "atomicModifyIORefCAS' (my CAS loop)" $ mod_test (procs*perProc) $
134 |                             (atomicModifyIORefCAS' counter_ioref (\x-> (x+1,()) ) >> payload numPL)
135 | 
136 |                         ]
137 | 
138 |                  in [ bgroup "1 thread per HEC, full contention" $
139 |                        [ bench "modifyMVar_" $ mod_test procs $
140 |                           (modifyMVar_ counter_mvar (return . (+1)))
141 | 
142 |                         , bench "modifyMVarMasked_" $ mod_test procs $
143 |                             (modifyMVarMasked_ counter_mvar (return . (+1)))
144 |                         
145 |                         , bench "atomicModifyIORef'" $ mod_test procs $
146 |                             (atomicModifyIORef' counter_ioref (\x-> (x+1,()) ))
147 | 
148 |                         , bench "atomically modifyTVar'" $ mod_test procs $
149 |                             (atomically $ modifyTVar' counter_tvar ((+1))) 
150 | 
151 |                         , bench "incrCounter (atomic-primops)" $ mod_test procs $
152 |                             (incrCounter 1 counter_atomic_counter)
153 |                         
154 |                         , bench "atomicModifyIORefCAS (atomic-primops)" $ mod_test procs $
155 |                             (atomicModifyIORefCAS counter_ioref (\x-> (x+1,()) ))
156 |                         
157 |                         , bench "atomicModifyIORefCAS' (my CAS loop)" $ mod_test procs $
158 |                             (atomicModifyIORefCAS' counter_ioref (\x-> (x+1,()) ))
159 |                         
160 |                         -- I want to compare these with the same results above;
161 |                         -- see also TVarExperiment:
162 |                         -- , bench "atomicModifyIORef' x10" $ mod_test_n (10*n) procs $
163 |                         --     (atomicModifyIORef' counter_ioref (\x-> (x+1,()) ))
164 |                         -- , bench "atomically modifyTVar' x10" $ mod_test_n (10*n) procs $
165 |                         --     (atomically $ modifyTVar' counter_tvar ((+1))) 
166 |                         ]
167 |                     , bgroup "2 threads per HEC, full contention" $
168 |                        [ bench "modifyMVar_" $ mod_test (procs*2) $
169 |                           (modifyMVar_ counter_mvar (return . (+1)))
170 | 
171 |                         , bench "modifyMVarMasked_" $ mod_test (procs*2) $
172 |                             (modifyMVarMasked_ counter_mvar (return . (+1)))
173 |                         
174 |                         -- WTF! This is suddenly giving me a stack overflow....
175 |                         -- , bench "atomicModifyIORef'" $ mod_test (procs*2) $
176 |                         --     (atomicModifyIORef' counter_ioref (\x-> (x+1,()) ))
177 | 
178 |                         , bench "atomically modifyTVar'" $ mod_test (procs*2) $
179 |                             (atomically $ modifyTVar' counter_tvar ((+1))) 
180 | 
181 |                         , bench "incrCounter (atomic-primops)" $ mod_test (procs*2) $
182 |                             (incrCounter 1 counter_atomic_counter)
183 |                         
184 |                         , bench "atomicModifyIORefCAS (atomic-primops)" $ mod_test (procs*2) $
185 |                             (atomicModifyIORefCAS counter_ioref (\x-> (x+1,()) ))
186 | 
187 |                         ]
188 |                    
189 |                    {- COMMENTING, since the atomicModifyIORef' below is *again*
190 |                       causing stack overflow for no apparent reason TODO why?
191 | 
192 |                     -- NOTE: adding more threads per-HEC at this point shows
193 |                     -- little difference (very bad MVar locking behavior has
194 |                     -- mostly disappeared)
195 |                     --
196 |                     -- test dialing back the contention:
197 |                     , bgroup "1 threads per HEC, 1 payload" $ 
198 |                         varGroupPayload 1 1
199 |                     , bgroup "1 threads per HEC, 2 payload" $
200 |                         varGroupPayload 1 2
201 |                     , bgroup "1 threads per HEC, 4 payload" $
202 |                         varGroupPayload 1 4
203 |                     , bgroup "1 threads per HEC, 8 payload" $
204 |                         varGroupPayload 1 8
205 | 
206 |                     -- this is an attempt to see if a somewhat random delay can
207 |                     -- get rid of (some or all) the very slow runs; hypothesis
208 |                     -- being that those runs get into some bad harmonics and
209 |                     -- contention is slow to resolve.
210 |                     , bgroup "1 thread per HEC, scattered payloads with IORefs" $
211 |                         let benchRandPayloadIORef evry pyld = 
212 |                                 bench ("atomicModifyIORef' "++(show evry)++" "++(show pyld)) $ 
213 |                                   mod_test procs $
214 |                                     (atomicModifyIORef' counter_ioref (\x-> (x+1,x) ) 
215 |                                      >>= \x-> if x `mod` evry == 0 then payload pyld else return 1)
216 |                          in [ benchRandPayloadIORef 2 1
217 |                             , benchRandPayloadIORef 2 4
218 |                             , benchRandPayloadIORef 2 16
219 |                             , benchRandPayloadIORef 8 1
220 |                             , benchRandPayloadIORef 8 4
221 |                             , benchRandPayloadIORef 8 16
222 |                             , benchRandPayloadIORef 32 1
223 |                             , benchRandPayloadIORef 32 4
224 |                             , benchRandPayloadIORef 32 16
225 |                             ]
226 | 
227 |                     , bgroup "Test Payload" $
228 |                         [ bench "payload x1" $ payload 1
229 |                         , bench "payload x2" $ payload 2
230 |                         , bench "payload x4" $ payload 4
231 |                         , bench "payload x8" $ payload 8
232 |                         ]
233 |                      -}
234 |                     ]
235 |             , bgroup "Misc" $
236 |                 -- If the second shows some benefit on just two threads, then 
237 |                 -- it represents a useful technique for reducing contention:
238 |                 [ bench "contentious atomic-maybe-modify IORef" $ atomicMaybeModifyIORef n
239 |                 , bench "read first, then maybe contentious atomic-maybe-modify IORef" $ readMaybeAtomicModifyIORef n
240 |                 , bench "readForCAS, then CAS (atomic-primops)" $ readMaybeCAS n
241 |               -- NOT RELEVANT:
242 |                 -- , bench "Higher contention, contentious atomic-maybe-modify IORef" $ atomicMaybeModifyIORefHiC n
243 |                 -- , bench "Higher contention, read first, then maybe contentious atomic-maybe-modify IORef" $ readMaybeAtomicModifyIORefHiC n
244 |                 , bench "contentious atomic-maybe-modify TVar" $ atomicMaybeModifyTVar n
245 |                 , bench "read first, then maybe contentious atomic-maybe-modify TVar" $ readMaybeAtomicModifyTVar n
246 | 
247 |                 -- we should expect these to be the same:
248 |                 , bench "reads against atomicModifyIORefs" $ readsAgainstAtomicModifyIORefs n
249 |                 , bench "reads against modifyIORefs" $ readsAgainstNonAtomicModify n
250 |                 -- TODO how do these compare with STM?
251 |                 ]
252 |             ]
253 |             -- TODO: define these in terms of numCapabilities:
254 |             -- 1 r thread 1 w thread: measuring r/w contention
255 |             -- 2 w threads ONLY: meeasure w/w contention, THEN:
256 |             -- 2 r threads ONLY: meeasure r/r contention
257 |             -- more threads: measuring descheduling bottlenecks, context switching overheads (+ above)
258 |             --    above better tested outside criterion, w/ eventlogging
259 |             --    also test equivalents of above on 8-core
260 |         , bgroup "Channel implementations" $
261 |             [ bgroup ("Operations on "++(show n)++" messages") $
262 |                 [ bgroup "For scale" $
263 |                       -- For TQueue style chans, test the cost of reverse
264 |                       [ bench "reverse [1..n]" $ nf (\n'-> reverse [1..n']) n
265 |                       , bench "reverse replicate n 1" $ nf (\n'-> replicate n' (1::Int)) n
266 |                       ]
267 |                 , bgroup "Chan" $
268 |                       -- this gives us a measure of effects of contention between
269 |                       -- readers and writers when compared with single-threaded
270 |                       -- version:
271 |                       [ bench "async 1 writer 1 readers" $ runtestChanAsync 1 1 n
272 |                       -- NOTE: this is a bit hackish, filling in one test and
273 |                       -- reading in the other; make sure memory usage isn't
274 |                       -- influencing mean:
275 |                       --
276 |                       -- This measures writer/writer contention, in this case I
277 |                       -- think we see a lot of thread blocking/waiting delays
278 |                       , bench ("async "++(show procs)++" writers") $ do
279 |                           dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
280 |                           mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (writeChan fill_empty_chan ()) >> putMVar done1 ()) $ zip starts dones
281 |                           mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
282 |                       -- This measures reader/reader contention:
283 |                       , bench ("async "++(show procs)++" readers") $ do
284 |                           dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
285 |                           mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (readChan fill_empty_chan) >> putMVar done1 ()) $ zip starts dones
286 |                           mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
287 |                       -- This is measuring the effects of bottlenecks caused by
288 |                       -- descheduling, context-switching overhead (forced my
289 |                       -- fairness properties in the case of MVar), as well as
290 |                       -- all of the above; this is probably less than
291 |                       -- informative. Try threadscope on a standalone test:
292 |                       , bench "contention: async 100 writers 100 readers" $ runtestChanAsync 100 100 n
293 |                       ]
294 |                 , bgroup "TChan" $
295 |                       [ bench "async 1 writers 1 readers" $ runtestTChanAsync 1 1 n
296 |                       -- This measures writer/writer contention:
297 |                       {- LIVELOCK!!!
298 |                       , bench ("async "++(show procs)++" writers") $ do
299 |                           dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
300 |                           mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (atomically $ writeTChan fill_empty_tchan ()) >> putMVar done1 ()) $ zip starts dones
301 |                           mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
302 |                       -- This measures reader/reader contention:
303 |                       , bench ("async "++(show procs)++" readers") $ do
304 |                           dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
305 |                           mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (atomically $ readTChan fill_empty_tchan) >> putMVar done1 ()) $ zip starts dones
306 |                           mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
307 |                       , bench "contention: async 100 writers 100 readers" $ runtestTChanAsync 100 100 n
308 |                       -}
309 |                       ]
310 |                 , bgroup "TQueue" $
311 |                       [ bench "async 1 writers 1 readers" $ runtestTQueueAsync 1 1 n
312 |                       -- This measures writer/writer contention:
313 |                       , bench ("async "++(show procs)++" writers") $ do
314 |                           dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
315 |                           mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (atomically $ writeTQueue fill_empty_tqueue ()) >> putMVar done1 ()) $ zip starts dones
316 |                           mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
317 |                       -- This measures reader/reader contention:
318 |                       , bench ("async "++(show procs)++" readers") $ do
319 |                           dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
320 |                           mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (atomically $ readTQueue fill_empty_tqueue) >> putMVar done1 ()) $ zip starts dones
321 |                           mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
322 |                       , bench "contention: async 100 writers 100 readers" $ runtestTQueueAsync 100 100 n
323 |                       ]
324 |                 , bgroup "TBQueue" $
325 |                       [ bench "async 1 writers 1 readers" $ runtestTBQueueAsync 1 1 n
326 |                       -- This measures writer/writer contention:
327 |                       , bench ("async "++(show procs)++" writers") $ do
328 |                           dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
329 |                           mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (atomically $ writeTBQueue fill_empty_tbqueue ()) >> putMVar done1 ()) $ zip starts dones
330 |                           mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
331 |                       -- This measures reader/reader contention:
332 |                       , bench ("async "++(show procs)++" readers") $ do
333 |                           dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
334 |                           mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (atomically $ readTBQueue fill_empty_tbqueue) >> putMVar done1 ()) $ zip starts dones
335 |                           mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
336 |                       , bench "contention: async 100 writers 100 readers" $ runtestTBQueueAsync 100 100 n
337 |                       ]
338 |                 -- OTHER CHAN IMPLEMENTATIONS:
339 |                 , bgroup "chan-split-fast" $
340 |                       [ bench "async 1 writers 1 readers" $ runtestSplitChanAsync 1 1 n
341 |                       -- This measures writer/writer contention:
342 |                       , bench ("async "++(show procs)++" writers") $ do
343 |                           dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
344 |                           mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (S.writeChan fill_empty_fastI ()) >> putMVar done1 ()) $ zip starts dones
345 |                           mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
346 |                       -- This measures reader/reader contention:
347 |                       , bench ("async "++(show procs)++" readers") $ do
348 |                           dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
349 |                           mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (S.readChan fill_empty_fastO) >> putMVar done1 ()) $ zip starts dones
350 |                           mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
351 |                       , bench "contention: async 100 writers 100 readers" $ runtestSplitChanAsync 100 100 n
352 |                       ]
353 |                 , bgroup "split-channel" $
354 |                       [ bench "async 1 writers 1 readers" $ runtestSplitChannelAsync 1 1 n
355 |                       -- This measures writer/writer contention:
356 |                       , bench ("async "++(show procs)++" writers") $ do
357 |                           dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
358 |                           mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (SC.send fill_empty_splitchannelI ()) >> putMVar done1 ()) $ zip starts dones
359 |                           mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
360 |                       -- This measures reader/reader contention:
361 |                       , bench ("async "++(show procs)++" readers") $ do
362 |                           dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
363 |                           mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (SC.receive fill_empty_splitchannelO) >> putMVar done1 ()) $ zip starts dones
364 |                           mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
365 |                       , bench "contention: async 100 writers 100 readers" $ runtestSplitChannelAsync 100 100 n
366 |                       ]
367 |                 -- michael-scott queue implementation, using atomic-primops
368 | #if MIN_VERSION_base(4,7,0)
369 | #else
370 |                 , bgroup "lockfree-queue" $
371 |                       [ bench "async 1 writer 1 readers" $ runtestLockfreeQueueAsync 1 1 n
372 |                       , bench ("async "++(show procs)++" writers") $ do
373 |                           dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
374 |                           mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (MS.pushL fill_empty_lockfree ()) >> putMVar done1 ()) $ zip starts dones
375 |                           mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
376 |                       , bench ("async "++(show procs)++" readers") $ do
377 |                           dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
378 |                           mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (msreadR fill_empty_lockfree) >> putMVar done1 ()) $ zip starts dones
379 |                           mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
380 |                       , bench "contention: async 100 writers 100 readers" $ runtestLockfreeQueueAsync 100 100 n
381 |                       ]
382 | #endif
383 |                 -- Chase / Lev work-stealing queue
384 |                 -- NOTE: we can have at most 1 writer (pushL); not a general-purpose queue, so don't do more tests
385 |                 , bgroup "chaselev-dequeue" $
386 |                       [ bench "async 1 writer 1 readers" $ runtestChaseLevQueueAsync_1_1 n
387 |                       ]
388 |                 ]
389 |             ]
390 |         , bgroup "Arrays misc" $
391 |             -- be sure to subtract "cost" of 2 forkIO's and context switch
392 |             [ bench "baseline" $
393 |                 do x <- newEmptyMVar
394 |                    y <- newEmptyMVar
395 |                    forkIO $ (replicateM_ 500 $ return ()) >> putMVar x ()
396 |                    forkIO $ (replicateM_ 500 $ return ()) >> putMVar y ()
397 |                    takeMVar x
398 |                    takeMVar y
399 |             , bench "New 32-length MutableArrays x1000 across two threads" $
400 |                 do x <- newEmptyMVar
401 |                    y <- newEmptyMVar
402 |                    forkIO $ (replicateM_ 500 $ (P.newArray 32 0 :: IO (P.MutableArray (PrimState IO) Int))) >> putMVar x ()
403 |                    forkIO $ (replicateM_ 500 $ (P.newArray 32 0 :: IO (P.MutableArray (PrimState IO) Int))) >> putMVar y ()
404 |                    takeMVar x
405 |                    takeMVar y
406 |             , bench "New MVar x1000 across two threads" $
407 |                 do x <- newEmptyMVar
408 |                    y <- newEmptyMVar
409 |                    forkIO $ (replicateM_ 500 $ (newEmptyMVar :: IO (MVar Int))) >> putMVar x ()
410 |                    forkIO $ (replicateM_ 500 $ (newEmptyMVar :: IO (MVar Int))) >> putMVar y ()
411 |                    takeMVar x
412 |                    takeMVar y
413 |             ]
414 |         ]
415 |   -- to make sure the counter is actually being incremented!:
416 |   cntv <- readCounter counter_atomic_counter
417 |   putStrLn $ "Final counter val is "++(show cntv)
418 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Criterion benchmarks for the different haskell concurrent channel
  2 | implementations in `base` and `stm` and elsewhere, as well as simple var
  3 | read/write benchmarks for `MVar`, `IORef`, and `TVar` and others.
  4 | 
  5 | These benchmarks were originally taken from bench/chanbench.hs in the `stm`
  6 | package, ported to criterion with some additions. To run them on your machine:
  7 | 
  8 |     cabal sandbox init
  9 |     cabal install
 10 |     # For HTML reports:
 11 |     ./.cabal-sandbox/bin/chan-benchmarks -g -o Benchmarks.html +RTS -N
 12 | 
 13 | Feel free to send pull requests with new or improved benchmarks.
 14 | 
 15 | # Sample Results
 16 | 
 17 | Nice HTML output for a sample run performed on:
 18 | 
 19 |     $ lscpu 
 20 |     Architecture:          i686
 21 |     CPU op-mode(s):        32-bit, 64-bit
 22 |     Byte Order:            Little Endian
 23 |     CPU(s):                4
 24 |     On-line CPU(s) list:   0-3
 25 |     Thread(s) per core:    2
 26 |     Core(s) per socket:    2
 27 |     Socket(s):             1
 28 |     Vendor ID:             GenuineIntel
 29 |     CPU family:            6
 30 |     Model:                 58
 31 |     Stepping:              9
 32 |     CPU MHz:               1400.000
 33 |     BogoMIPS:              4988.38
 34 |     Virtualization:        VT-x
 35 |     L1d cache:             32K
 36 |     L1i cache:             32K
 37 |     L2 cache:              256K
 38 |     L3 cache:              4096K
 39 | 
 40 | 
 41 | ...are at `Benchmarks.chans_sample.html` and `Benchmarks.vars_sample.html`.
 42 | 
 43 | ## Some analysis of primitive operations
 44 | 
 45 |     forkIO           309.ns
 46 |     context switch  2975.ns
 47 | 
 48 |     getNumCapabilities    4.1ns
 49 |     myThreadId            4.7ns
 50 | 
 51 |     newIORef      7.19
 52 |     readIORef     3.74ns
 53 |     writeIORef    7.02ns
 54 |     modifyIORef'  7.02ns  -- even though this is implemented as read+write??
 55 |     atomicModifyIORef'   22.43ns
 56 |     atomicModifyIORef    53.67ns  -- variable; showing cost of lazy creation of (const 'x') thunks?
 57 | 
 58 |     newEmptyMVar  7.32ns
 59 |     takeMVar     16.21ns
 60 |     putMVar       9.02ns 
 61 |     modifyMVarMasked_   35.09ns  -- handler overhead ~ 10ns
 62 | 
 63 |     newTVarIO    12.96ns
 64 |     atomically writeTVar    53.35ns
 65 |     atomically readTVar     54.29ns
 66 |     readTVarIO               4.13ns
 67 |     atomically modifyTVar'  63.76ns
 68 | 
 69 |     --counter from atomic-primops v0.5
 70 |     -- NOTE: ACTUALLY 
 71 |     newCounter   11.99ns
 72 |     incrCounter   9.28ns
 73 | 
 74 | 
 75 |     Throughput of incrementing atomic counter with...
 76 |         modifyMVar_        331.29ms / 100000  = 3312.9ns per increment
 77 |         modifyMVarMasked_  323.41ms / 100000    3234.1ns    -- NOTE: small variance
 78 |         atomicModifyIORef'  87.66ms / 100000     876.6ns    -- NOTE: sort of folded normal distribution beginning at 9.76ms , where the max we saw was 256.12ms
 79 |         modifyTVar'         19.10ms / 100000     191.0ns    -- NOTE: much lower variance, with samples 17 - 21 ms
 80 | 
 81 |         incrCounter          0.97ms / 100000  -- NOTE: GARBAGE; COUNTER NOT ATOMIC
 82 | 
 83 | 
 84 | ## Random resources
 85 | 
 86 | Some discussion of nitty-gritty of `atomicModifyIORef`:
 87 | 
 88 |     http://stackoverflow.com/questions/10102881/haskell-how-does-atomicmodifyioref-work
 89 |     
 90 | 
 91 | ## Random Analysis
 92 | 
 93 | Back-of-envelope look at how primitive var read write cost relates to chan RW
 94 | cost.
 95 | 
 96 | As of the current test run, looking at the mean times for the fastest three
 97 | contenders on the easiest test (write some, read some), we get the following
 98 | mean timings for *one read and write* (although reads and writes might vary
 99 | widely)
100 | 
101 |     Chan             135 ns
102 |     TQueue           175 ns
103 |     chan-split-fast   88 ns
104 | 
105 | Measured timings for an atomic `modify` (or take/put; again not ideal) divided
106 | by 2 (i.e. very approx timing for a take/read or put/write):
107 | 
108 |     MVar    15 ns
109 |     TVar    36 ns
110 | 
111 | and var creation:
112 | 
113 |     MVar    24 ns
114 |     TVar    21 ns
115 | 
116 | Counting var operations, with (count) around slowest path
117 | 
118 |     chan-split-fast
119 |     ---------------
120 |                  puts  takes  creates   TOTAL
121 |     readChan     1(2)  1(3)   (1)       30-99 ns
122 |     writeChan    1(2)  1      /         30-45 ns
123 |               TOTAL for read and write: 60-144 ns
124 | 
125 | So if we didn't screw that up:
126 |     - yes, read/write timing dominates
127 |     - but there might still be room to shave time elsewhere
128 |     - in "write some / read some" we don't take the slow reader-blocked path much, as expected
129 | 


--------------------------------------------------------------------------------
/RetryExperiment.hs:
--------------------------------------------------------------------------------
 1 | module Main
 2 |     where
 3 | 
 4 | import Control.Concurrent
 5 | import Control.Concurrent.STM
 6 | import Control.Concurrent.STM.TSem
 7 | import Control.Monad
 8 | 
 9 | import System.IO
10 | import Debug.Trace
11 | 
12 | main = do
13 |     hSetBuffering stdout NoBuffering
14 |     noMansLand <- replicateM 998 $ newTVarIO 0
15 |     t0 <- newTVarIO (1::Int)
16 |     t999 <- newTVarIO (-1)
17 |     let ts = t0:noMansLand++[t999]
18 | 
19 |     done <- atomically $ newTSem 0
20 |     forkIO $ atomically $ nestedOrElseMap done ts
21 | 
22 |     -- need enough time here for nestedOrElseMap thread above to move past t0
23 | 
24 |     -- in this version, the modifications to t0 force nestedOrElseMap to be restarted
25 |     forkIO (trace "starting vexing!" $ forever $ atomically $ (modifyTVar' t0 (+1) >> trace "vex" (return ())))
26 |     -- in this version nestedOrElseMap causes this transaction to be restarted and never makes progress:
27 |     --forkIO (atomically (trace "starting vexing!" $ forever $ (modifyTVar' t0 (+1) >> trace "vex" (return ()))))
28 | 
29 |     atomically $ waitTSem done
30 |     putStrLn "No livelock! Did the t0 counter get incremented?: "
31 |     atomically (readTVar t0) >>= print
32 |    
33 | nestedOrElseMap :: TSem -> [TVar Int] -> STM ()
34 | nestedOrElseMap done ts = trace "nestedOrElseMap starting" $ foldl1 orElse $ map transaction $ zip [(1::Int)..] ts
35 |     where transaction (cnt,v) = do
36 |             n <- traceShow cnt $ readTVar v
37 |             if n < 0 
38 |                 then trace "@" (modifyTVar' v (subtract 1)) >> signalTSem done
39 |                 else  retry
40 | 
41 | -- NOTE: this shows at least that we get livelock as we wait on the first transaction (it may also be executed; add Debug.Trace)
42 | -- that's not really working...
43 | --
44 | -- CONSIDER:
45 | -- The behavior we see ensures that all branches of orElse see the same view of
46 | -- the same variables, but is overzealous! It should do validation for each
47 | -- subtransaction by only checking oldest parent read of each variable *used*
48 | -- in the transaction, and if any changed, then go up only to the last
49 | -- inconsistency)
50 | --     ""If both t1 and t2 execute retry then even though the effects of t1 are
51 | --     thrown away, it could be that a change to a TVar that is only in the
52 | --     access set of t1 will allow the whole transaction to succeed when it is
53 | --     woken.  To solve this problem, when a branch on a nested transaction is
54 | --     aborted the access set of the nested transaction is merged as a read set
55 | --     into the parent TRec. Specifically if the TVar is in any TRec up the
56 | --     chain of nested transactions it must be ignored, otherwise it is entered
57 | --     as a new entry (retaining just the read) in the parent TRec.""
58 | --
59 | -- "aborted"? "read set"?
60 | --     -- is this the culprit? No. Clearly we move on from that branch in test.
61 | --     "A validation failure in the first branch aborts the entire transaction, not just the nested part"
62 | --        "validation"?
63 | --          "Before a transaction can make its effects visible to other threads
64 | --          it must check that it has seen a consistent view of memory while it
65 | --          was executing. Most of the work is done in
66 | --          validate_and_acquire_ownership by checking that TVars hold their
67 | --          expected values. "
68 | --          
69 | --     when commiting, do we force that the read set is unchanged?
70 | -- 
71 | -- Soooo
72 | -- The writes to variables from other branches are causing a validation failure
73 | -- and causing whole transaction to reset
74 | 


--------------------------------------------------------------------------------
/Setup.hs:
--------------------------------------------------------------------------------
1 | import Distribution.Simple
2 | main = defaultMain
3 | 


--------------------------------------------------------------------------------
/TVarExperiment.hs:
--------------------------------------------------------------------------------
 1 | module Main
 2 |     where
 3 | 
 4 | -- Testing interleaving on IORefs and TVars under contention. Most interested
 5 | -- in 2 competing HECs
 6 | 
 7 | import Control.Concurrent.STM
 8 | import Control.Concurrent
 9 | import Data.IORef
10 | import Control.Monad
11 | import Data.List
12 | import System.Environment
13 | 
14 | main = do
15 |     [msgsS] <- getArgs
16 | 
17 |     let msgs = read msgsS :: Int -- per thread (with 2 threads)
18 |     tmsgs <- testTVar msgs
19 |     iormsgs <- testIORef msgs 
20 | 
21 |     -- 69% mean
22 |     pprint $ analyze iormsgs
23 |     --  9% mean
24 |     pprint $ analyze tmsgs
25 | 
26 | pprint (prefix, rest) = 
27 |     putStrLn ( (show $ round $ (prefix*100))++"% of messages in constant prefix, with "++(show $ round (rest*100))++"% interleaving for rest"  )
28 | 
29 | analyze :: [Int] -> (Float, Float)
30 | analyze l@(a:as) = 
31 |     -- strip off starting list
32 |     let lenL = length l
33 |         lSuff = dropWhile (/=a) as
34 |         lenPrefix = length (a:takeWhile (==a) as)
35 |      in ( fromIntegral lenPrefix / fromIntegral lenL
36 |         , fromIntegral (flops lSuff) / fromIntegral (lenL - lenPrefix - 1) )
37 | 
38 | flops  = subtract 1 . length . group
39 | 
40 | testTVar :: Int -> IO [Int]
41 | testTVar msgs = do
42 |     st <- newTVarIO []
43 |     start1 <- newEmptyMVar
44 |     start2 <- newEmptyMVar
45 |     vs <- mapM (\(n,start)-> do 
46 |                         v <- newEmptyMVar 
47 |                         forkIO $ takeMVar start >> (replicateM_ msgs $ atomically $ modifyTVar' st (n:)) >> putMVar v ()
48 |                         return v
49 |                ) [(1, start1),(2,start2)]
50 |     mapM_ (flip putMVar ()) [start2,start1]
51 |     mapM_ takeMVar vs -- wait
52 |     readTVarIO st
53 | 
54 | testIORef :: Int -> IO [Int]
55 | testIORef msgs = do
56 |     st <- newIORef []
57 |     start1 <- newEmptyMVar
58 |     start2 <- newEmptyMVar
59 |     vs <- mapM (\(n,start)-> do 
60 |                         v <- newEmptyMVar 
61 |                         forkIO $ takeMVar start >> (replicateM_ msgs $ atomicModifyIORef' st (\st'-> (n:st',()))) >> putMVar v ()
62 |                         return v
63 |                ) [(1, start1),(2,start2)]
64 |     mapM_ (flip putMVar ()) [start2,start1]
65 |     mapM_ takeMVar vs -- wait
66 |     readIORef st
67 | 


--------------------------------------------------------------------------------
/TestAtomics.hs:
--------------------------------------------------------------------------------
  1 | import Control.Concurrent
  2 | import Data.Atomics.Counter
  3 | import Data.Atomics
  4 | import Data.IORef
  5 | import Control.Monad
  6 | import GHC.Conc
  7 | import Control.Exception(evaluate)
  8 | import qualified Data.Set as Set
  9 | import Data.List
 10 | 
 11 | main = do
 12 |     testCounterOverflow
 13 |     testConsistentSuccessFailure
 14 |     counterTest
 15 | {-
 16 |     testCAS_ABA_0
 17 |     testCAS_ABA_0_modAfterAtomicMod
 18 |     testCAS_ABA_0_modBefAtomicMod
 19 |     testCAS_ABA_1
 20 |     testCAS_ABA_2
 21 |     testCAS_ABA_3
 22 | 
 23 | -}
 24 | cHUNK_SIZE = 32
 25 | maxInt = maxBound :: Int
 26 | minInt = minBound :: Int
 27 | testCounterOverflow = do
 28 |     let ourMod = mod -- or something more fancy?
 29 |     cntr <- newCounter (maxInt - (cHUNK_SIZE `div` 2)) 
 30 |     spanningCntr <- replicateM cHUNK_SIZE (incrCounter 1 cntr)
 31 |     -- make sure our test is working
 32 |     if all (>0) spanningCntr || all (<0) spanningCntr
 33 |         then error "Sequence meant to span maxBound of counter not actually spanning"
 34 |         else return ()
 35 | 
 36 |     let l = map (`ourMod` cHUNK_SIZE) spanningCntr
 37 |         l' = (dropWhile (/= 0) l) ++ (takeWhile (/= 0) l)
 38 | 
 39 |     -- (1) test that we overflow the counter without any breaks and our mod function is working properly:
 40 |     if l' == [0..(cHUNK_SIZE - 1)]
 41 |         then putStrLn $ "OK"
 42 |         else error $ "Uh Oh: "++(show l')
 43 | 
 44 |     -- (2) test that Ints and counter overflow in exactly the same way
 45 |     let spanningInts = take cHUNK_SIZE $ iterate (+1) (maxInt - (cHUNK_SIZE `div` 2) + 1) 
 46 |     if spanningInts == spanningCntr 
 47 |         then putStrLn "OK"
 48 |         else do putStrLn $ "Ints overflow differently than counter: "
 49 |                 putStrLn $ "Int: "++(show spanningInts)
 50 |                 putStrLn $ "Counter: "++(show spanningCntr)
 51 |                 error "Fail"
 52 | 
 53 |     -- We don't use this property
 54 |     cntr2 <- newCounter maxBound
 55 |     mbnd <- incrCounter 1 cntr2
 56 |     if mbnd == minBound
 57 |         then putStrLn "OK"
 58 |         else error $ "Incrementing counter at maxbound didn't yield minBound"
 59 | 
 60 |     -- (3) test subtraction across boundary: count - newFirstIndex, for window spanning boundary.
 61 |     cntr3 <- newCounter (maxBound - 1)
 62 |     let ls = take 30 $ iterate (+1) $ maxBound - 10
 63 |     cs <- mapM (\l-> fmap (subtract l) $ incrCounter 1 cntr3) ls
 64 |     if cs == replicate 30 10
 65 |         then putStrLn "OK"
 66 |         else error $ "Derp. We don't know how subtraction works: "++(show cs)
 67 |     -- (4) readIORef before fetchAndAdd w/ barriers
 68 | 
 69 | 
 70 | -- Test these assumptions:
 71 | --   1) If a CAS fails in thread 1 then another CAS (in thread 2, say) succeeded
 72 | --   2) In the case that thread 1's CAS failed, the ticket returned with (False,tk) will contain that newly-written value from thread 2
 73 | testConsistentSuccessFailure = do
 74 |     var <- newIORef "0"
 75 | 
 76 |     sem <- newIORef (0::Int)
 77 |     outs <- replicateM 2 newEmptyMVar 
 78 | 
 79 |     forkSync sem 2 $ test "a" var (outs!!0)
 80 |     forkSync sem 2 $ test "b" var (outs!!1)
 81 | 
 82 |     mapM takeMVar outs >>= examine
 83 |        -- w/r/t (2) above: we only try to find an element read along with False
 84 |        -- which wasn't sent by another thread, which isn't ideal
 85 |  where attempts = 100000
 86 |        test tag var out = do
 87 |          
 88 |          res <- forM [(1::Int)..attempts] $ \x-> do
 89 |                     let str = (tag++(show x))
 90 |                     tk <- readForCAS var
 91 |                     (b,tk') <- casIORef var tk str
 92 |                     return (if b then str else peekTicket tk' , b)
 93 |          putMVar out res
 94 | 
 95 |        examine [res1, res2] = do
 96 |          -- any failures in either should be marked as successes in the other
 97 |          let (successes1,failures1) = (\(x,y)-> (Set.fromList $ map fst x, map fst y)) $ partition snd res1
 98 |              (successes2,failures2) = (\(x,y)-> (Set.fromList $ map fst x, map fst y)) $ partition snd res2
 99 |              ok1 = all (flip Set.member successes2) failures1
100 |              ok2 = all (flip Set.member successes1) failures2
101 |          if ok1 && ok2
102 |              then if length failures1 < (attempts `div` 6) || length failures2 < (attempts `div` 6) 
103 |                     then error "There was not enough contention to trust test. Please retry."
104 |                     else putStrLn "OK"
105 |              else do print res1
106 |                      print res2
107 |                      error "FAILURE!"
108 | 
109 |                    
110 | -- forkSync :: IORef Int -> Int -> IO a -> IO ThreadId
111 | forkSync sem target io = 
112 |     forkIO $ (busyWait >> io)
113 |   where busyWait =
114 |            atomicModifyIORef' sem (\n-> (n+1,())) >> wait
115 |         wait = do
116 |             n <- readIORef sem
117 |             unless (n == target) wait
118 |     
119 | 
120 | counterTest = do
121 |     n0 <- testAtomicCount newCounter readCounter incrCounter
122 |     n1 <- testAtomicCount newMVar takeMVar (\n v-> modifyMVar_ v (evaluate . (+1)) )
123 |     if n0 /= n1
124 |         then putStrLn $ "Counter broken: expecting "++(show n1)++" got "++(show n0)
125 |         else putStrLn "OK"
126 | 
127 | testAtomicCount new read incr = do
128 |   let n = 1000000
129 |   procs <- getNumCapabilities
130 | 
131 |   counter <- new (1::Int)
132 |   dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
133 |   mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (incr 1 counter) >> putMVar done1 ()) $ zip starts dones
134 |   mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
135 |   
136 |   read counter
137 | 
138 | -- test ABA issue with these three cases:
139 | --   ()
140 | --   Bool
141 | --       {-# NOINLINE True #-}
142 | --   let true = True
143 | 
144 | 
145 | 
146 | -- returns False
147 | testCAS_ABA_0 = do
148 |     a <- newIORef ()
149 |     ta <- readForCAS a
150 |     atomicModifyIORef' a (\u-> (u,u))
151 |     (res, _) <- casIORef a ta () 
152 |     print res
153 | {- same
154 | testCAS_ABA_0_nonstrict = do
155 |     a <- newIORef ()
156 |     ta <- readForCAS a
157 |     atomicModifyIORef a (\u-> (u,u))
158 |     (res, _) <- casIORef a ta () 
159 |     print res
160 | testCAS_ABA_0_u = do
161 |     a <- newIORef ()
162 |     ta <- readForCAS a
163 |     atomicModifyIORef' a (const ((),()))
164 |     (res, _) <- casIORef a ta () 
165 |     print res
166 | testCAS_ABA_0_sameu = do
167 |     let {-# NOINLINE u #-} 
168 |         u = ()
169 |     a <- newIORef u
170 |     ta <- readForCAS a
171 |     atomicModifyIORef' a (const (u,u))
172 |     (res, _) <- casIORef a ta () 
173 |     print res
174 | -}
175 | 
176 | -- returns True
177 | testCAS_ABA_1 = do
178 |     a <- newIORef ()
179 |     ta <- readForCAS a
180 |     modifyIORef a (const ())   -- i.e. readIORef >> writeIORef
181 |     (res, _) <- casIORef a ta () 
182 |     print res
183 | 
184 | {- same
185 | testCAS_ABA_1_casMod = do
186 |     a <- newIORef ()
187 |     ta <- readForCAS a
188 |     atomicModifyIORefCAS_ a id
189 |     (res, _) <- casIORef a ta () 
190 |     print res
191 | testCAS_ABA_1_id = do
192 |     a <- newIORef ()
193 |     ta <- readForCAS a
194 |     modifyIORef a id   -- i.e. readIORef >> writeIORef
195 |     (res, _) <- casIORef a ta () 
196 |     print res
197 | -}
198 | 
199 | -- returns True
200 | -- ... so the issue isn't re-ordering of readForCas and the read in modifyIORef
201 | -- in fact, no combination of the barriers provided seem to work.
202 | testCAS_ABA_2 = do
203 |     a <- newIORef ()
204 |     ta <- readForCAS a
205 |     loadLoadBarrier
206 |     modifyIORef a (const ())   -- i.e. readIORef >> writeIORef
207 |     (res, _) <- casIORef a ta () 
208 |     print res
209 | 
210 | testCAS_ABA_3 = do
211 |     barrier <- newIORef ()
212 | 
213 |     a <- newIORef ()
214 |     ta <- readForCAS a
215 | 
216 |     atomicModifyIORef' barrier (\u-> (u,u))  -- just a barrier
217 |     modifyIORef a (const ())   -- i.e. readIORef >> writeIORef
218 |     atomicModifyIORef' barrier (\u-> (u,u))  -- just a barrier
219 |     
220 |     (res, _) <- casIORef a ta () 
221 |     print res
222 | 
223 | -- INTERESTING!: /adding/ the modifyIORef /after/ the atomicModifyIORef causes this to return True!
224 | testCAS_ABA_0_modAfterAtomicMod = do
225 |     barrier <- newIORef ()
226 | 
227 |     a <- newIORef ()
228 |     ta <- readForCAS a
229 | 
230 |     atomicModifyIORef' a (\u-> (u,u))
231 |     modifyIORef a (const ())   -- i.e. readIORef >> writeIORef
232 |     
233 |     (res, _) <- casIORef a ta () 
234 |     print res
235 | 
236 | -- ...whereas this one returns False again
237 | testCAS_ABA_0_modBefAtomicMod = do
238 |     barrier <- newIORef ()
239 | 
240 |     a <- newIORef ()
241 |     ta <- readForCAS a
242 | 
243 |     modifyIORef a (const ())   -- i.e. readIORef >> writeIORef
244 |     atomicModifyIORef' a (\u-> (u,u))
245 |     
246 |     (res, _) <- casIORef a ta () 
247 |     print res
248 | 


--------------------------------------------------------------------------------
/chan-benchmarks.cabal:
--------------------------------------------------------------------------------
  1 | -- Initial chan-benchmarks.cabal generated by cabal init.  For further 
  2 | -- documentation, see http://haskell.org/cabal/users-guide/
  3 | 
  4 | name:                chan-benchmarks
  5 | version:             0.1.0.0
  6 | license:             BSD3
  7 | license-file:        LICENSE
  8 | author:              Brandon Simmons
  9 | maintainer:          brandon.m.simmons@gmail.com
 10 | build-type:          Simple
 11 | cabal-version:       >=1.10
 12 | 
 13 | -- We provide two executables which run on a single core and all available
 14 | -- cores, respectively. Benchmarks to be tested live in Benchmarks.hs.
 15 | -- There is some overlap in which tests are run in which, but we omit tests
 16 | -- from bench-multi where we suspect running on multiple cores only confuses
 17 | -- things, and omit tests from bench-single where a test on a single core wouldn't
 18 | -- be relevant (e.g. testing contention or the scheduler directly)
 19 | 
 20 | executable bench-single
 21 |   main-is:           Main1.hs
 22 |   other-modules:     Benchmarks
 23 |   build-depends:       base >=4.6
 24 |                      , stm
 25 |                      , async
 26 |                      -- random number generation:
 27 |                      , random
 28 |                      , mwc-random
 29 |                      , criterion
 30 |                      , primitive > 0.5.2.0
 31 |                      , atomic-primops == 0.6
 32 |             -- other chans:
 33 |                      , chan-split-fast
 34 |                      , split-channel
 35 |               --TODO (other FIFO chans/queues with blocking read):
 36 |                    --, cml
 37 |                      , chaselev-deque
 38 | 
 39 |                      -- arrays:
 40 |                      , array
 41 |                      , vector
 42 |                      -- time:
 43 |                      , old-time
 44 |                      , time
 45 |   -- Since this doesn't support 7.8 yet
 46 |   if impl(ghc < 7.8.0)
 47 |     build-depends: lockfree-queue == 0.2.3
 48 |                    
 49 |   -- Still testing the threaded runtime, but removing OS thread scheduling from
 50 |   -- the picture:
 51 |   ghc-options: -O2  -rtsopts  -threaded -with-rtsopts=-N1
 52 |   ghc-options: -fforce-recomp
 53 |   default-language:    Haskell2010
 54 |   default-extensions: CPP
 55 | 
 56 | executable bench-multi
 57 |   main-is:           MainN.hs
 58 |   other-modules:     Benchmarks
 59 |   build-depends:       base >=4.6
 60 |                      , stm
 61 |                      , async
 62 |                      , criterion
 63 |                      , primitive > 0.5.2.0
 64 |                      , atomic-primops == 0.6
 65 |             -- other chans:
 66 |                      , chan-split-fast
 67 |                      , split-channel
 68 |               --TODO (other FIFO chans/queues with blocking read):
 69 |                    --, cml
 70 |                      , chaselev-deque
 71 |   -- Since this doesn't support 7.8 yet
 72 |   if impl(ghc < 7.8.0)
 73 |     build-depends: lockfree-queue == 0.2.3
 74 | 
 75 |   -- We fix this to 2 cores, so that we can be a little less careful about how
 76 |   -- we write and interpret our tests.
 77 |   -- These would be interesting to play with on a >2 core machine:
 78 |   --    -feager-blackholing  -qa   -qg1   -fllvm
 79 |   ghc-options: -O2  -rtsopts  -threaded -with-rtsopts=-N2
 80 |   ghc-options: -fforce-recomp
 81 |   default-language:    Haskell2010
 82 |   default-extensions: CPP
 83 | 
 84 | -- --------------
 85 | 
 86 | executable test-atomics
 87 |   main-is:           TestAtomics.hs
 88 |   build-depends:       base >=4.6
 89 |                      , stm
 90 |                      , async
 91 |                      , atomic-primops == 0.6
 92 |                      , containers
 93 | 
 94 |   ghc-options: -O2  -rtsopts  -threaded -with-rtsopts=-N2
 95 |   default-language:    Haskell2010
 96 | 
 97 | -- --------------
 98 | 
 99 | executable experiment-mvar-fairness
100 |   main-is:           MVarExperiment.hs
101 |   other-modules:     Benchmarks
102 |   build-depends:       base >=4.6
103 |                      , stm
104 |                      , async
105 |                      -- random number generation:
106 |                      , random
107 |                      , mwc-random
108 |                      , criterion
109 |                      , primitive > 0.5.2.0
110 |                      , atomic-primops == 0.6
111 |                      , chan-split-fast
112 |                      , split-channel
113 |   
114 |   ghc-options: -O2  -rtsopts  -threaded -with-rtsopts=-N1
115 |   default-language:    Haskell2010
116 | 
117 | executable experiment-tvar-interleaving
118 |   main-is:           TVarExperiment.hs
119 |   other-modules:     Benchmarks
120 |   build-depends:       base >=4.6
121 |                      , stm
122 |                      , async
123 |                      -- random number generation:
124 |                      , random
125 |                      , mwc-random
126 |                      , criterion
127 |                      , primitive > 0.5.2.0
128 |                      , atomic-primops == 0.6
129 |                      , chan-split-fast
130 |                      , split-channel
131 |   
132 |   ghc-options: -O2  -rtsopts  -threaded -with-rtsopts=-N2
133 |   default-language:    Haskell2010
134 | 
135 | executable retry-experiment
136 |   main-is:           RetryExperiment.hs
137 |   other-modules:     Benchmarks
138 |   build-depends:       base >=4.6
139 |                      , stm
140 |   
141 |   ghc-options: -O2  -rtsopts  -threaded -with-rtsopts=-N2
142 |   ghc-options: -Wall
143 |   default-language:    Haskell2010
144 | 


--------------------------------------------------------------------------------