├── Benchmarks.hs ├── LICENSE ├── MVarExperiment.hs ├── Main1.hs ├── MainN.hs ├── README.md ├── RetryExperiment.hs ├── Setup.hs ├── TVarExperiment.hs ├── TestAtomics.hs ├── chan-benchmarks.cabal └── reports ├── ghc-7.6 ├── bench-multi.chans.html ├── bench-multi.vars-oddly-little-ioref-variance.html ├── bench-multi.vars.html ├── bench-single.arrays.html ├── bench-single.chan_latency.html ├── bench-single.chan_throughput.html ├── bench-single.cons-reverse.html ├── bench-single.misc.html └── bench-single.primitives.html ├── ghc-7.6_8-core ├── bench-multi.chans.html ├── bench-multi.vars.html └── bench-single.primitives.html └── ghc-7.8 ├── bench-multi.chans.html ├── bench-multi.vars.html ├── bench-single.arrays.html ├── bench-single.chan_latency.html ├── bench-single.chan_throughput.html └── bench-single.primitives.html /Benchmarks.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RankNTypes #-} 2 | {-# LANGUAGE PackageImports #-} 3 | {-# LANGUAGE BangPatterns #-} 4 | module Benchmarks 5 | where 6 | 7 | import Control.Concurrent.Async 8 | import Control.Monad 9 | import System.Environment 10 | 11 | import Control.Concurrent 12 | import Control.Concurrent.Chan 13 | import Control.Concurrent.STM 14 | import Control.Concurrent.STM.TQueue 15 | import Control.Concurrent.STM.TBQueue 16 | 17 | import Control.Concurrent.MVar 18 | import Data.IORef 19 | import Control.Exception(evaluate) 20 | 21 | 22 | import qualified "chan-split-fast" Control.Concurrent.Chan.Split as S 23 | import qualified "split-channel" Control.Concurrent.Chan.Split as SC 24 | import Data.Primitive.MutVar 25 | import Control.Monad.Primitive(PrimState) 26 | 27 | import Data.Atomics 28 | 29 | -- Hack since these aren't currently working with ghc 7.8 30 | #if MIN_VERSION_base(4,7,0) 31 | #else 32 | import qualified Data.Concurrent.Queue.MichaelScott as MS 33 | #endif 34 | import qualified Data.Concurrent.Deque.ChaseLev as CL 35 | 36 | 37 | -- 
-- -----------------------------------------------------------

-- | A hand-rolled CAS-loop variant of 'atomicModifyIORef''.
--
-- The modification function may run more than once (speculation): we read a
-- ticket, apply @fn@, and retry with the freshly-observed ticket whenever the
-- CAS fails because another thread got in first.
atomicModifyIORefCAS' :: IORef a      -- ^ Mutable location to modify
                      -> (a -> (a,b)) -- ^ Computation runs one or more times (speculation)
                      -> IO b
atomicModifyIORefCAS' ref fn = do
    -- TODO: Should handle contention in a better way...
    tick <- readForCAS ref
    loop tick
  where
    loop old = do
        -- Lazy pattern inside a bang: force the pair only, deferring the
        -- components until the CAS succeeds (then 'seq' both below).
        let !(~new,~result) = fn $ peekTicket old
        (b,tick) <- casIORef ref old new
        if b
          then new `seq` result `seq` return result
          else loop tick


-- -----------------------------------------------------------

-- These tests initially taken from stm/bench/chanbench.hs, ported to
-- criterion, with some additions, and have now changed quite a bit.
--
-- The original used CPP to avoid code duplication while also ensuring GHC
-- optimized the code in a realistic fashion. Here we just copy paste.


runtestChan1, runtestChan2 :: Int -> IO ()
runtestChan1 n = do
    c <- newChan
    replicateM_ n $ writeChan c ()
    replicateM_ n $ readChan c

runtestChan2 n = do
    c <- newChan
    let n1000 = n `quot` 1000
    replicateM_ 1000 $ do
        replicateM_ n1000 $ writeChan c ()
        replicateM_ n1000 $ readChan c

runtestChanAsync :: Int -> Int -> Int -> IO ()
runtestChanAsync writers readers n = do
    -- round n down so it divides evenly among readers and writers:
    let nNice = n - rem n (lcm writers readers)
    c <- newChan
    rcvrs <- replicateM readers $ async $ replicateM_ (nNice `quot` readers) $ readChan c
    senders <- replicateM writers $ async $ replicateM_ (nNice `quot` writers) $ writeChan c ()
    mapM_ wait rcvrs
    -- FIX: also wait on the writers so that an exception raised in a sender
    -- is re-thrown here instead of being silently discarded (previously the
    -- 'senders' binding was unused). By the time all reads have completed,
    -- all writes have too, so this cannot block.
    mapM_ wait senders

-- ----------
-- Hack since these aren't currently working with ghc 7.8
-- NOTE(review): these #if blocks require the CPP extension, which is not in
-- this module's LANGUAGE pragmas; presumably it is enabled via cpp-options
-- in the cabal file -- TODO confirm.
#if MIN_VERSION_base(4,7,0)
#else
-- from "lockfree-queue"
runtestLockfreeQueue1, runtestLockfreeQueue2 :: Int -> IO ()
runtestLockfreeQueue1 n = do
    c <- MS.newQ
    replicateM_ n $ MS.pushL c ()
    replicateM_ n $ msreadR c

runtestLockfreeQueue2 n = do
    c <- MS.newQ
    let n1000 = n `quot` 1000
    replicateM_ 1000 $ do
        replicateM_ n1000 $ MS.pushL c ()
        replicateM_ n1000 $ msreadR c

runtestLockfreeQueueAsync :: Int -> Int -> Int -> IO ()
runtestLockfreeQueueAsync writers readers n = do
    let nNice = n - rem n (lcm writers readers)
    c <- MS.newQ
    rcvrs <- replicateM readers $ async $ replicateM_ (nNice `quot` readers) $ msreadR c
    senders <- replicateM writers $ async $ replicateM_ (nNice `quot` writers) $ MS.pushL c ()
    mapM_ wait rcvrs
    mapM_ wait senders  -- propagate writer exceptions (see runtestChanAsync)

-- a busy-blocking read:
msreadR :: MS.LinkedQueue a -> IO a
msreadR q = MS.tryPopR q >>= maybe (msreadR q) return
#endif


-- ----------
-- from "chaselev-dequeue"
-- NOTE: this is generally to get a sense of how the techniques used perform;
-- this is not a general-purpose concurrent FIFO queue.
runtestChaseLevQueue1, runtestChaseLevQueue2 :: Int -> IO ()
runtestChaseLevQueue1 n = do
    c <- CL.newQ
    replicateM_ n $ CL.pushL c ()
    replicateM_ n $ clreadR c

runtestChaseLevQueue2 n = do
    c <- CL.newQ
    let n1000 = n `quot` 1000
    replicateM_ 1000 $ do
        replicateM_ n1000 $ CL.pushL c ()
        replicateM_ n1000 $ clreadR c

-- One reader / one writer (we can have at most one writer safely); copy-pasta
runtestChaseLevQueueAsync_1_1 :: Int -> IO ()
runtestChaseLevQueueAsync_1_1 n = do
    let nNice = n - rem n (lcm 1 1)
    c <- CL.newQ
    rcvrs <- replicateM 1 $ async $ replicateM_ (nNice `quot` 1) $ clreadR c
    senders <- replicateM 1 $ async $ replicateM_ (nNice `quot` 1) $ CL.pushL c ()
    mapM_ wait rcvrs
    mapM_ wait senders  -- propagate writer exceptions (see runtestChanAsync)

-- a busy-blocking read:
clreadR :: CL.ChaseLevDeque a -> IO a
clreadR q = CL.tryPopR q >>= maybe (clreadR q) return

-- ----------

runtestTChan1,
runtestTChan2 :: Int -> IO ()
runtestTChan1 n = do
    c <- newTChanIO
    replicateM_ n $ atomically $ writeTChan c ()
    replicateM_ n $ atomically $ readTChan c

runtestTChan2 n = do
    c <- newTChanIO
    let n1000 = n `quot` 1000
    replicateM_ 1000 $ do
        replicateM_ n1000 $ atomically $ writeTChan c ()
        replicateM_ n1000 $ atomically $ readTChan c

runtestTChanAsync :: Int -> Int -> Int -> IO ()
runtestTChanAsync writers readers n = do
    -- round n down so it divides evenly among readers and writers:
    let nNice = n - rem n (lcm writers readers)
    c <- newTChanIO
    rcvrs <- replicateM readers $ async $ replicateM_ (nNice `quot` readers) $ atomically $ readTChan c
    senders <- replicateM writers $ async $ replicateM_ (nNice `quot` writers) $ atomically $ writeTChan c ()
    mapM_ wait rcvrs
    -- FIX: also wait on the writers so their exceptions are re-thrown here
    -- instead of silently discarded (the 'senders' binding was unused).
    -- All writes must have finished for all reads to have finished, so this
    -- cannot block.
    mapM_ wait senders

-- ----------

runtestTQueue1, runtestTQueue2 :: Int -> IO ()
runtestTQueue1 n = do
    c <- newTQueueIO
    replicateM_ n $ atomically $ writeTQueue c ()
    replicateM_ n $ atomically $ readTQueue c

runtestTQueue2 n = do
    c <- newTQueueIO
    let n1000 = n `quot` 1000
    replicateM_ 1000 $ do
        replicateM_ n1000 $ atomically $ writeTQueue c ()
        replicateM_ n1000 $ atomically $ readTQueue c

runtestTQueueAsync :: Int -> Int -> Int -> IO ()
runtestTQueueAsync writers readers n = do
    let nNice = n - rem n (lcm writers readers)
    c <- newTQueueIO
    rcvrs <- replicateM readers $ async $ replicateM_ (nNice `quot` readers) $ atomically $ readTQueue c
    senders <- replicateM writers $ async $ replicateM_ (nNice `quot` writers) $ atomically $ writeTQueue c ()
    mapM_ wait rcvrs
    mapM_ wait senders  -- propagate writer exceptions (see runtestTChanAsync)

-- ----------

runtestTBQueue1, runtestTBQueue2 :: Int -> IO ()
runtestTBQueue1 n = do
    c <- newTBQueueIO n -- The original benchmark must have blocked indefinitely here, no?
197 | replicateM_ n $ atomically $ writeTBQueue c () 198 | replicateM_ n $ atomically $ readTBQueue c 199 | 200 | runtestTBQueue2 n = do 201 | c <- newTBQueueIO 4096 202 | let n1000 = n `quot` 1000 203 | replicateM_ 1000 $ do 204 | replicateM_ n1000 $ atomically $ writeTBQueue c () 205 | replicateM_ n1000 $ atomically $ readTBQueue c 206 | 207 | runtestTBQueueAsync :: Int -> Int -> Int -> IO () 208 | runtestTBQueueAsync writers readers n = do 209 | let nNice = n - rem n (lcm writers readers) 210 | c <- newTBQueueIO 4096 211 | rcvrs <- replicateM readers $ async $ replicateM_ (nNice `quot` readers) $ atomically $ readTBQueue c 212 | senders <- replicateM writers $ async $ replicateM_ (nNice `quot` writers) $ atomically $ writeTBQueue c () 213 | mapM_ wait rcvrs 214 | 215 | 216 | -- OTHER CHAN IMPLEMENTATIONS: 217 | 218 | -- chan-split-fast 219 | 220 | runtestSplitChan1, runtestSplitChan2 :: Int -> IO () 221 | runtestSplitChan1 n = do 222 | (i,o) <- S.newSplitChan 223 | replicateM_ n $ S.writeChan i () 224 | replicateM_ n $ S.readChan o 225 | 226 | runtestSplitChan2 n = do 227 | (i,o) <- S.newSplitChan 228 | let n1000 = n `quot` 1000 229 | replicateM_ 1000 $ do 230 | replicateM_ n1000 $ S.writeChan i () 231 | replicateM_ n1000 $ S.readChan o 232 | 233 | 234 | runtestSplitChanAsync :: Int -> Int -> Int -> IO () 235 | runtestSplitChanAsync writers readers n = do 236 | let nNice = n - rem n (lcm writers readers) 237 | (i,o) <- S.newSplitChan 238 | rcvrs <- replicateM readers $ async $ replicateM_ (nNice `quot` readers) $ S.readChan o 239 | senders <- replicateM writers $ async $ replicateM_ (nNice `quot` writers) $ S.writeChan i () 240 | mapM_ wait rcvrs 241 | 242 | 243 | 244 | -- split-channel 245 | 246 | runtestSplitChannel1, runtestSplitChannel2 :: Int -> IO () 247 | runtestSplitChannel1 n = do 248 | (i,o) <- SC.new 249 | replicateM_ n $ SC.send i () 250 | replicateM_ n $ SC.receive o 251 | 252 | runtestSplitChannel2 n = do 253 | (i,o) <- SC.new 254 | let n1000 = n 
`quot` 1000 255 | replicateM_ 1000 $ do 256 | replicateM_ n1000 $ SC.send i () 257 | replicateM_ n1000 $ SC.receive o 258 | 259 | runtestSplitChannelAsync :: Int -> Int -> Int -> IO () 260 | runtestSplitChannelAsync writers readers n = do 261 | let nNice = n - rem n (lcm writers readers) 262 | (i,o) <- SC.new 263 | rcvrs <- replicateM readers $ async $ replicateM_ (nNice `quot` readers) $ SC.receive o 264 | senders <- replicateM writers $ async $ replicateM_ (nNice `quot` writers) $ SC.send i () 265 | mapM_ wait rcvrs 266 | 267 | 268 | 269 | -- -------------------------- 270 | -- Misc Components 271 | 272 | testCompositionAppend :: Int -> [Int] 273 | testCompositionAppend n = (go id [1..n]) [] where 274 | go f [] = f 275 | go f (a:as) = go (f . (a:)) as 276 | 277 | -- are appends just as cheap as prepends? 278 | testCompositionAppendPrepend :: Int -> [Int] 279 | testCompositionAppendPrepend n = (go id [1..n]) [] where 280 | go f [] = f 281 | go f (a:as) 282 | | even a = go (f . (a:)) as 283 | | otherwise = go ((a:) . f) as 284 | 285 | testConsReverse :: Int -> [Int] 286 | testConsReverse n = reverse $ go [1..n] [] where 287 | go [] as = as 288 | go (a:xs) as = go xs (a:as) 289 | 290 | -- test an optimization for small writer dequeues? what about branch prediction's effects on this test? 
-- LITTLE BENEFIT
-- NOTE(review): the "little benefit" conclusion may need re-measuring; the
-- original catch-all clause read @rev a (x:xs) = rev xs (x:a)@, which swapped
-- the accumulator and list arguments, so any input longer than 7 elements was
-- returned in reversed order ([n..1]) instead of [1..n]. Fixed below.

-- | Like 'testConsReverse' ([1..n] via cons-then-reverse), but with the
-- final reverse hand-unrolled for lists of up to 7 elements.
testConsUnrolledReverse :: Int -> [Int]
testConsUnrolledReverse n = rev [] $ go [1..n] [] where
    -- go xs as == reverse xs ++ as, so go [1..n] [] == [n,n-1..1]
    go [] as = as
    go (a:xs) as = go xs (a:as)

    -- rev a xs == reverse xs ++ a, with unrolled cases for short lists.
    rev a [z,y,x,w,v,u,t] = t:u:v:w:x:y:z:a
    rev a [z,y,x,w,v,u] = u:v:w:x:y:z:a
    rev a [z,y,x,w,v] = v:w:x:y:z:a
    rev a [z,y,x,w] = w:x:y:z:a
    rev a [z,y,x] = x:y:z:a
    rev a [z,y] = y:z:a
    rev a [z] = z:a
    rev a [] = a
    -- BUGFIX: was @rev xs (x:a)@ (arguments transposed); peel one element
    -- onto the accumulator and recurse on the rest:
    rev a (x:xs) = rev (x:a) xs

-- This is more realistic, eliminating any benefits from inlining and rewriting
-- we might get from above

-- | Accumulate (a:) compositions in an MVar, then apply to [] -> [1..n].
testCompositionAppendInMVar :: Int -> IO [Int]
testCompositionAppendInMVar n = do
    v <- newMVar id
    mapM_ (go v) [1..n]
    fmap ($ []) $ takeMVar v
  where go v a = do
          f <- takeMVar v
          fa <- evaluate (f . (a:))
          putMVar v fa

-- | Cons onto a list held in an MVar, then reverse -> [1..n].
testConsReverseInMVar :: Int -> IO [Int]
testConsReverseInMVar n = do
    v <- newMVar []
    mapM_ (go v) [1..n]
    fmap reverse $ takeMVar v
  where go v a = do
          zs <- takeMVar v
          azs <- evaluate (a:zs)
          putMVar v azs

-- get an idea of the impact on writers:
testStoreCompositionAppendInMVar :: Int -> IO ()
testStoreCompositionAppendInMVar n = do
    v <- newMVar id
    mapM_ (go v) [1..n]
  where go v a = do
          f <- takeMVar v
          fa <- evaluate (f . (a:))
          putMVar v fa
(a:)) 337 | putMVar v fa 338 | 339 | testStoreConsReverseInMVar :: Int -> IO () 340 | testStoreConsReverseInMVar n = do 341 | v <- newMVar [] 342 | mapM_ (go v) [1..n] 343 | where go v a = do 344 | zs <- takeMVar v 345 | azs <- evaluate (a:zs) 346 | putMVar v azs 347 | 348 | -- ------------------------------------------------------------------------- 349 | 350 | -- we'd like to know whether in practice contention can be reduced on a shared 351 | -- counter by first doing a read, and only doing an atomicModify when the 352 | -- counter is not seen to have been incremented yet (note: even so, by the time 353 | -- we're in the atomic block it may have been incremented, in which case it's a 354 | -- NOOP) 355 | readMaybeAtomicModifyIORef :: Int -> IO () 356 | readMaybeAtomicModifyIORef n = do 357 | counter <- newIORef 0 358 | stack1 <- newIORef [] -- non-contentious work done on these: 359 | stack2 <- newIORef [] 360 | let op stck = do cnt <- readIORef counter 361 | atomicModifyIORef' stck (\st-> (cnt:st,())) 362 | cnt' <- readIORef counter 363 | if cnt' == cnt 364 | then atomicModifyIORef' counter (\cnt1-> (if cnt1 == cnt then cnt+1 else cnt1, ())) 365 | else return () 366 | 367 | w1 <- async $ replicateM_ n $ op stack1 368 | w2 <- async $ replicateM_ n $ op stack2 369 | waitBoth w1 w2 370 | return () 371 | 372 | readMaybeCAS :: Int -> IO () 373 | readMaybeCAS n = do 374 | counter <- newIORef 0 375 | stack1 <- newIORef [] -- non-contentious work done on these: 376 | stack2 <- newIORef [] 377 | let op stck = do cntTicket <- readForCAS counter 378 | let cnt = peekTicket cntTicket 379 | atomicModifyIORef' stck (\st-> (cnt:st,())) 380 | incrCnt <- evaluate (cnt + 1) 381 | (weIncremented, _) <- casIORef counter cntTicket incrCnt 382 | return () 383 | 384 | w1 <- async $ replicateM_ n $ op stack1 385 | w2 <- async $ replicateM_ n $ op stack2 386 | waitBoth w1 w2 387 | return () 388 | 389 | atomicMaybeModifyIORef :: Int -> IO () 390 | atomicMaybeModifyIORef n = do 391 | 
counter <- newIORef 0 392 | stack1 <- newIORef [] -- non-contentious work done on these: 393 | stack2 <- newIORef [] 394 | let op stck = do cnt <- readIORef counter 395 | atomicModifyIORef' stck (\st-> (cnt:st,())) 396 | atomicModifyIORef' counter (\cnt1-> (if cnt1 == cnt then cnt+1 else cnt1, ())) 397 | 398 | w1 <- async $ replicateM_ n $ op stack1 399 | w2 <- async $ replicateM_ n $ op stack2 400 | waitBoth w1 w2 401 | return () 402 | 403 | readMaybeAtomicModifyTVar :: Int -> IO () 404 | readMaybeAtomicModifyTVar n = do 405 | counter <- newTVarIO 0 406 | stack1 <- newTVarIO [] -- non-contentious work done on these: 407 | stack2 <- newTVarIO [] 408 | let op stck = do cnt <- readTVarIO counter 409 | atomically $ modifyTVar stck (\st-> cnt:st) 410 | cnt' <- readTVarIO counter 411 | if cnt' == cnt 412 | then atomically $ modifyTVar counter (\cnt1-> if cnt1 == cnt then cnt+1 else cnt1) 413 | else return () 414 | 415 | w1 <- async $ replicateM_ n $ op stack1 416 | w2 <- async $ replicateM_ n $ op stack2 417 | waitBoth w1 w2 418 | return () 419 | 420 | atomicMaybeModifyTVar :: Int -> IO () 421 | atomicMaybeModifyTVar n = do 422 | counter <- newTVarIO 0 423 | stack1 <- newTVarIO [] -- non-contentious work done on these: 424 | stack2 <- newTVarIO [] 425 | let op stck = do cnt <- readTVarIO counter 426 | atomically $ modifyTVar stck (\st-> cnt:st) 427 | atomically $ modifyTVar counter (\cnt1-> if cnt1 == cnt then cnt+1 else cnt1) 428 | 429 | w1 <- async $ replicateM_ n $ op stack1 430 | w2 <- async $ replicateM_ n $ op stack2 431 | waitBoth w1 w2 432 | return () 433 | 434 | -- variants with a less realistic payload, simulating higher contention with more writers: 435 | {- 436 | -- NOTE: REMOVING: this isn't representative of the behavior of multiple 437 | -- threads, each with a sizable payload between read and atomicModifyIORef, 438 | -- which is what we care about. 
439 | readMaybeAtomicModifyIORefHiC :: Int -> IO () 440 | readMaybeAtomicModifyIORefHiC n = do 441 | counter <- newIORef 0 442 | let op = do cnt <- readIORef counter 443 | evaluate (show $ sqrt cnt) 444 | cnt' <- readIORef counter 445 | if cnt' == cnt 446 | then atomicModifyIORef' counter (\cnt1-> (if cnt1 == cnt then cnt+1 else cnt1, ())) 447 | else return () 448 | 449 | w1 <- async $ replicateM_ n $ op 450 | w2 <- async $ replicateM_ n $ op 451 | waitBoth w1 w2 452 | return () 453 | 454 | atomicMaybeModifyIORefHiC :: Int -> IO () 455 | atomicMaybeModifyIORefHiC n = do 456 | counter <- newIORef 0 457 | let op = do cnt <- readIORef counter 458 | evaluate (show $ sqrt cnt) 459 | atomicModifyIORef' counter (\cnt1-> (if cnt1 == cnt then cnt+1 else cnt1, ())) 460 | 461 | w1 <- async $ replicateM_ n $ op 462 | w2 <- async $ replicateM_ n $ op 463 | waitBoth w1 w2 464 | return () 465 | -} 466 | 467 | 468 | -- Do atomicModifyIORefs block readers? 469 | -- NO 470 | readsAgainstAtomicModifyIORefs :: Int -> IO () 471 | readsAgainstAtomicModifyIORefs n = do 472 | cntr <- newIORef 0 473 | t <- async $ forever (atomicModifyIORef' cntr (\c-> (c+1,()))) 474 | replicateM_ n (readIORef cntr >>= evaluate) 475 | cancel t 476 | 477 | readsAgainstNonAtomicModify :: Int -> IO () 478 | readsAgainstNonAtomicModify n = do 479 | cntr <- newIORef 0 480 | t <- async $ forever (modifyIORef' cntr (\c-> c+1)) 481 | replicateM_ n (readIORef cntr >>= evaluate) 482 | cancel t 483 | 484 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, Brandon Simmons 2 | 3 | All rights reserved. 
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Brandon Simmons nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /MVarExperiment.hs: -------------------------------------------------------------------------------- 1 | module Main 2 | where 3 | 4 | import Control.Concurrent 5 | import Control.Monad 6 | 7 | -- this to be run on one thread (i.e. +RTS -N1) to observe what happens to the 8 | -- fairness property of MVars as green threads exceed HECs. 
9 | -- 10 | -- Interesting: 11 | -- https://ghc.haskell.org/trac/ghc/ticket/7606 12 | -- https://ghc.haskell.org/trac/ghc/wiki/Commentary/Rts/Scheduler 13 | main = do 14 | st <- newMVar [] 15 | vs <- mapM (\n-> do v <- newEmptyMVar 16 | forkIO (replicateM_ 10 $ modifyMVar_ st (return . (n:)) >> putMVar v ()) 17 | return v 18 | ) [1..10] 19 | mapM_ takeMVar vs -- wait 20 | takeMVar st >>= print 21 | 22 | 23 | -------------------------------------------------------------------------------- /Main1.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RankNTypes #-} 2 | {-# LANGUAGE PackageImports #-} 3 | module Main 4 | where 5 | 6 | import Control.Concurrent.Async 7 | import Control.Monad 8 | import System.Environment 9 | 10 | import Control.Concurrent 11 | import Control.Concurrent.Chan 12 | import Control.Concurrent.STM 13 | import Control.Concurrent.STM.TQueue 14 | import Control.Concurrent.STM.TBQueue 15 | 16 | import Control.Concurrent.MVar 17 | import Data.IORef 18 | import Criterion.Main 19 | import Control.Exception(evaluate,mask_) 20 | 21 | import qualified "chan-split-fast" Control.Concurrent.Chan.Split as S 22 | import qualified "split-channel" Control.Concurrent.Chan.Split as SC 23 | import Data.Primitive.MutVar 24 | import Control.Monad.Primitive(PrimState) 25 | import Data.Atomics.Counter 26 | import Data.Atomics 27 | import System.Random 28 | import System.Random.MWC 29 | 30 | import Data.Array.IArray 31 | import qualified Data.Vector as V 32 | import qualified Data.Vector.Generic.Mutable as MV 33 | import qualified Data.Vector.Unboxed.Mutable as UMV 34 | import qualified Data.Primitive as P 35 | -- TODO fix imports above 36 | import qualified Data.Vector.Mutable as MVec 37 | 38 | import System.Time(getClockTime) 39 | import Data.Time.Clock.POSIX 40 | 41 | import Data.Word 42 | import Data.Bits 43 | 44 | -- Hack since these aren't currently working with ghc 7.8 45 | #if MIN_VERSION_base(4,7,0) 46 | #else 
47 | import qualified Data.Concurrent.Queue.MichaelScott as MS 48 | #endif 49 | import qualified Data.Concurrent.Deque.ChaseLev as CL 50 | 51 | import Benchmarks 52 | 53 | -- These tests initially taken from stm/bench/chanbench.hs, ported to 54 | -- criterion, with some additions. 55 | -- 56 | -- The original used CPP to avoid code duplication while also ensuring GHC 57 | -- optimized the code in a realistic fashion. Here we just copy paste. 58 | 59 | main = do 60 | let n = 100000 61 | --let n = 2000000 -- original suggested value, bugs if exceeded 62 | 63 | mv <- newEmptyMVar -- This to be left empty after each test 64 | mvFull <- newMVar '1' 65 | -- -- 66 | -- mvWithFinalizer <- newEmptyMVar 67 | -- mkWeakMVar mvWithFinalizer $ return () 68 | -- -- 69 | -- mvFinalizee <- newMVar 'a' 70 | -- mvWithFinalizer <- newMVar () 71 | -- mkWeakMVar mvWithFinalizer $ 72 | -- modifyMVar_ mvFinalizee (const $ return 'b') 73 | -- -- 74 | tmv <- newEmptyTMVarIO 75 | tv <- newTVarIO '1' 76 | ior <- newIORef '1' 77 | mutv <- newMutVar '1' 78 | 79 | atomic_counter <- newCounter 0 80 | 81 | -- to be left empty at emd of each test: 82 | #if MIN_VERSION_base(4,7,0) 83 | #else 84 | lockfreeQEmpty <- MS.newQ 85 | #endif 86 | chaselevQEmpty <- CL.newQ 87 | chanEmpty <- newChan 88 | tchanEmpty <- newTChanIO 89 | tqueueEmpty <- newTQueueIO 90 | tbqueueEmpty <- newTBQueueIO 2 91 | (fastEmptyI,fastEmptyO) <- S.newSplitChan 92 | (splitchannelEmptyI,splitchannelEmptyO) <- SC.new 93 | 94 | -- random generators 95 | mwc_gen <- createSystemRandom 96 | sys_rand_gen <- newStdGen 97 | 98 | let arr8 = listArray (0,7) [1..8] :: Array Int Int 99 | let arr16 = listArray (0,15) [1..16] :: Array Int Int 100 | let vec8 = V.fromList [1..8] :: V.Vector Int 101 | let vec16 = V.fromList [1..16] :: V.Vector Int 102 | mvec8 <- V.thaw vec8 -- :: V.MVector (PrimState IO) Int 103 | mvec16 <- V.thaw vec16 -- :: V.MVector (PrimState IO) Int 104 | parr8 <- P.newArray 8 (0::Int) 105 | parr16 <- P.newArray 16 (0::Int) 
106 | parr32 <- P.newArray 32 (0::Int) 107 | parr128 <- P.newArray 128 (0::Int) 108 | parr512 <- P.newArray 512 (0::Int) 109 | parr32Justs <- P.newArray 32 (Just 0 :: Maybe Int) 110 | iparr8 <- ((P.newArray 8 (0::Int)) >>= P.unsafeFreezeArray) :: IO (P.Array Int) 111 | iparr16 <- ((P.newArray 16 (0::Int)) >>= P.unsafeFreezeArray) :: IO (P.Array Int) 112 | 113 | ba16 <- (P.newByteArray (16 * P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO))) 114 | forM_ [0..15] $ \i-> P.writeByteArray ba16 i (0::Int) 115 | umvvec16 <- UMV.new 16 :: IO (UMV.IOVector Int) 116 | UMV.set umvvec16 0 :: IO () 117 | 118 | defaultMain $ 119 | [ bgroup "Channel implementations" $ 120 | -- Very artificial; just adding up the consts of the 121 | -- takes/puts/reads involved in getting a single message in and out 122 | [ bgroup "Latency micro-benchmark" $ 123 | [ bench "Chan" (writeChan chanEmpty () >> readChan chanEmpty) 124 | , bench "TChan" (atomically (writeTChan tchanEmpty () >> readTChan tchanEmpty)) 125 | , bench "TQueue" (atomically (writeTQueue tqueueEmpty () >> readTQueue tqueueEmpty)) 126 | , bench "TBQueue" (atomically (writeTBQueue tbqueueEmpty () >> readTBQueue tbqueueEmpty)) 127 | , bench "chan-split-fast" (S.writeChan fastEmptyI () >> S.readChan fastEmptyO) 128 | , bench "split-channel" (SC.send splitchannelEmptyI () >> SC.receive splitchannelEmptyO) 129 | #if MIN_VERSION_base(4,7,0) 130 | #else 131 | , bench "lockfree-queue" (MS.pushL lockfreeQEmpty () >> msreadR lockfreeQEmpty) 132 | #endif 133 | , bench "chaselev-dequeue" (CL.pushL chaselevQEmpty () >> clreadR chaselevQEmpty) 134 | ] 135 | , bgroup ("Single-thread throughput with "++show n++" messages") $ 136 | -- some pure operations we'd like a rough measurement for, e.g. 137 | -- the TQueue performs a reverse [1..n] in a test run, so we'd 138 | -- like an idea of how much that factor affects throughput. 
139 | [ bgroup "For scale" $ 140 | [ bench "reverse [1..n]" $ nf (\n'-> reverse [1..n']) n 141 | , bench "reverse replicate n 1" $ nf (\n'-> replicate n' (1::Int)) n 142 | ] 143 | , bgroup "Chan" $ 144 | -- original tests from chanbench.hs 145 | [ bench "sequential write all then read all" $ runtestChan1 n 146 | , bench "repeated write some, read some" $ runtestChan2 n 147 | ] 148 | , bgroup "TChan" $ 149 | [ bench "sequential write all then read all" $ runtestTChan1 n 150 | , bench "repeated write some, read some" $ runtestTChan2 n 151 | ] 152 | , bgroup "TQueue" $ 153 | [ bench "sequential write all then read all" $ runtestTQueue1 n 154 | , bench "repeated write some, read some" $ runtestTQueue2 n 155 | ] 156 | , bgroup "TBQueue" $ 157 | [ bench "sequential write all then read all" $ runtestTBQueue1 n 158 | , bench "repeated write some, read some" $ runtestTBQueue2 n 159 | ] 160 | -- OTHER CHAN IMPLEMENTATIONS: 161 | , bgroup "chan-split-fast" $ 162 | [ bench "sequential write all then read all" $ runtestSplitChan1 n 163 | , bench "repeated write some, read some" $ runtestSplitChan2 n 164 | ] 165 | , bgroup "split-channel" $ 166 | [ bench "sequential write all then read all" $ runtestSplitChannel1 n 167 | , bench "repeated write some, read some" $ runtestSplitChannel2 n 168 | ] 169 | #if MIN_VERSION_base(4,7,0) 170 | #else 171 | , bgroup "lockfree-queue" $ 172 | [ bench "sequential write all then read all" $ runtestLockfreeQueue1 n 173 | , bench "repeated write some, read some" $ runtestLockfreeQueue2 n 174 | ] 175 | #endif 176 | , bgroup "chaselev-dequeue" $ 177 | [ bench "sequential write all then read all" $ runtestChaseLevQueue1 n 178 | , bench "repeated write some, read some" $ runtestChaseLevQueue2 n 179 | ] 180 | ] 181 | ] 182 | , bgroup "Forking, context switches, and Misc. 
on a single core" $ 183 | [ bench "forkIO" (forkIO (return ()) >> return ()) 184 | , bench "put,take MVar" $ do 185 | putMVar mv '0' 186 | takeMVar mv 187 | , bench "put,takeMVar + forkIO + 2 context switches" $ do 188 | forkIO $ putMVar mv '0' 189 | takeMVar mv 190 | , bench "getNumCapabilities" getNumCapabilities 191 | , bench "myThreadId" myThreadId 192 | 193 | , bench "myThreadId >>= threadCapability" $ myThreadId >>= threadCapability 194 | -- It may not be possible to re-use generators for our applications of random, so: 195 | , bench "random new_gen" $ newStdGen 196 | , bench "random Int range: (1,8)" $ whnf (randomR (1 :: Int, 8)) sys_rand_gen 197 | -- THIS IS CRAZY SLOW TODO move out of this so we can actually compare graphs: 198 | --, bench "mwc-random new_gen" $ createSystemRandom 199 | , bench "mwc-random Int range: (1,8)" $ ((uniformR (1 :: Int, 8) mwc_gen) :: IO Int) 200 | , bench "randomRIO (1,8)" $ randomRIO (1::Int,8) 201 | 202 | , bench "old-time getClockTime" $ getClockTime 203 | , bench "time getPOSIXTime" getPOSIXTime 204 | ] 205 | 206 | , bgroup "Var primitives" $ 207 | [ bgroup "For scale" $ 208 | [ bench "mod (Int)" $ nf (2147483647 `mod`) (8 :: Int) 209 | , bench "rem (Int)" $ nf (2147483647 `rem`) (8 :: Int) 210 | , bench "mod (Word)" $ nf ((2147483647::Word) `mod`) (8 :: Word) 211 | , bench "rem (Word)" $ nf ((2147483647::Word) `rem`) (8 :: Word) 212 | -- tricks since we only need powers of two: a `mod` 2^i = a .&. (2^i–1) 213 | , bench "AND modulo (Int)" $ nf (((2147483647::Int) .&.) . subtract 1) (8 :: Int) 214 | , bench "AND modulo (Int) easier" $ nf (((2147483647::Int) .&.)) (7 :: Int) 215 | , bench "AND modulo (Word)" $ nf (((2147483647::Word) .&.) . subtract 1) (8 :: Word) 216 | 217 | -- we might end up just using subtraction: 218 | , bench "subtraction (Int)" $ nf (subtract 2147483640) (2147483647 :: Int) 219 | -- ALL OF ABOVE IS STUPID SLOW; probably measuring overhead of function call, boxing/unboxing, etc. 
220 | , bench "subtraction on counter incr" $ nfIO $ fmap (subtract 2147483640) (incrCounter 1 atomic_counter) 221 | ] 222 | , bgroup "atomic-primops (puts on safety goggles...)" $ 223 | [ bench "newCounter" $ newCounter 0 224 | , bench "incrCounter 1" $ incrCounter 1 atomic_counter 225 | , bench "incrCounter 1024" $ incrCounter 1024 atomic_counter 226 | ] 227 | , bgroup "IORef" $ 228 | [ bench "newIORef Char" $ (newIORef '0') 229 | , bench "writeIORef" $ (writeIORef ior '1') 230 | , bench "readIORef" $ (readIORef ior) 231 | , bench "readIORef (w result forced)" $ nfIO (readIORef ior) -- what does this tell us w/r/t above? 232 | , bench "readForCAS" $ (fmap peekTicket $ readForCAS ior) 233 | , bench "readForCAS (w result forced)" $ nfIO (fmap peekTicket $ readForCAS ior) 234 | , bench "modifyIORef' (i.e. readIORef + writeIORef)" (modifyIORef' ior $ const '2') 235 | , bench "atomicModifyIORef' (i.e. (in GHC) strict atomicModifyMutVar)" $ (atomicModifyIORef' ior $ const ('3','3')) 236 | , bench "atomicModifyIORef (i.e. (in GHC) atomicModifyMutVar)" $ (atomicModifyIORef ior $ const ('3','3')) 237 | 238 | , bench "atomicModifyIORefCAS (i.e. atomic-primops CAS loop)" $ (atomicModifyIORefCAS ior $ const ('4','4')) 239 | , bench "atomicModifyIORefCAS' (my CAS loop)" $ (atomicModifyIORefCAS' ior $ const ('4','4')) 240 | 241 | ] 242 | , bgroup "MVar" $ 243 | [ bench "newEmptyMVar" $ newEmptyMVar 244 | , bench "newEmptyMVar + putMVar (i.e. newMVar Char)" (newEmptyMVar >>= \v->putMVar v '0') 245 | , bench "putMVar + takeMVar" $ (putMVar mv '1' >> takeMVar mv) 246 | , bench "modifyMVarMasked_ (i.e. mask + takeMVar + putMVar + exception-handling)" $ (modifyMVarMasked_ mvFull (const $ return '2')) 247 | , bench "mask_ + takeMVar-and-mod + putMVar)" $ mask_ (takeMVar mvFull >>= (putMVar mvFull . 
const '2')) 248 | , bench "newMVar + mkWeakMVar with finalizer" $ (newMVar '1' >>= flip mkWeakMVar (return ())) 249 | -- These show no effect of a finalizer: 250 | -- , bench "on MVar with finalizer: putMVar, takeMVar" $ (putMVar mvWithFinalizer '1' >> takeMVar mvWithFinalizer) 251 | -- , bench "On target of an MVar finalizer: takeMVar, putMVar" $ (takeMVar mvFinalizee >>= putMVar mvFinalizee) 252 | #if MIN_VERSION_base(4,7,0) 253 | , bench "tryReadMVar" $ tryReadMVar mvFull 254 | , bench "tryReadMVar (w result forced)" $ nfIO $ tryReadMVar mvFull 255 | #endif 256 | , bench "readMVar" $ readMVar mvFull 257 | , bench "readMVar (w result forced)" $ nfIO $ readMVar mvFull 258 | ] 259 | 260 | , bgroup "TVar" $ 261 | [ bench "newTVarIO ()" $ (newTVarIO ()) 262 | , bench "atomically writeTVar" $ (atomically $ writeTVar tv '1') 263 | , bench "atomically readTVar" $ (atomically $ readTVar tv) 264 | , bench "readTVarIO" $ (readTVarIO tv) 265 | , bench "atomically modifyTVar' (i.e. atomically (readTVar + writeTVar))" $ (atomically $ modifyTVar' tv (const '2')) 266 | , bench "(atomically writeTVar) + (atomically readTVar)" $ ((atomically $ writeTVar tv '1') >> (atomically $ readTVar tv)) 267 | ] 268 | , bgroup "TMVar" $ 269 | [ bench "newEmptyTMVar (i.e. newTVarIO)" $ newEmptyTMVarIO 270 | , bench "atomically (newEmptyTMVar + putTMVar) (i.e. 
newTVar + readTVar + writeTVar,retry)" (atomically (newEmptyTMVar >>= \v->putTMVar v ())) 271 | , bench "atomically (putTMVar + takeTMVar)" $ (atomically $ (putTMVar tmv '1' >> takeTMVar tmv)) 272 | , bench "(atomically putTMVar) + (atomically takeTMVar)" $ ((atomically $ putTMVar tmv '1') >> (atomically $ takeTMVar tmv)) 273 | ] 274 | , bgroup "MutVar" $ 275 | [ bench "newMutVar ()" $ (newMutVar () :: IO (MutVar (PrimState IO) ())) 276 | , bench "writeMutVar" $ (writeMutVar mutv '1' :: IO ()) 277 | , bench "readMutVar" $ (readMutVar mutv :: IO Char) 278 | , bench "atomicModifyMutVar" $ (atomicModifyMutVar mutv $ const ('2','2') :: IO Char) 279 | ] 280 | ] 281 | -- Some more tests of pure operations relevant to TQueue style dequeue 282 | -- performance. 283 | , bgroup "Misc" $ 284 | -- of interest for TQueue style approach 285 | [ bgroup "cons and reverse" $ 286 | [ bench "cons" $ nf (:[]) True 287 | , bench "pure unrolled cons then final reverse x10" $ nf testConsUnrolledReverse 10 288 | , bench "pure cons then final reverse x10" $ nf testConsReverse 10 289 | , bench "pure unrolled cons then final reverse x5" $ nf testConsUnrolledReverse 5 290 | , bench "pure cons then final reverse x5" $ nf testConsReverse 5 291 | 292 | , bench "pure cons-composition append x10" $ nf testCompositionAppend 10 293 | , bench "pure cons then final reverse x10" $ nf testConsReverse 10 294 | , bench "pure cons-composition append x100" $ nf testCompositionAppend 100 295 | , bench "pure cons then final reverse x100" $ nf testConsReverse 100 296 | , bench "pure cons-composition append x10000" $ nf testCompositionAppend 10000 297 | , bench "pure cons then final reverse x10000" $ nf testConsReverse 10000 298 | 299 | , bench "pure cons-composition append and prepend x10" $ nf testCompositionAppendPrepend 10 300 | , bench "pure cons-composition append and prepend x100" $ nf testCompositionAppendPrepend 100 301 | , bench "pure cons-composition append and prepend x10000" $ nf 
testCompositionAppendPrepend 10000 302 | 303 | , bench "mvar-stored cons-composition append x10" $ nfIO $ testCompositionAppendInMVar 10 304 | , bench "mvar-stored cons then final reverse x10" $ nfIO $ testConsReverseInMVar 10 305 | , bench "mvar-stored cons-composition append x100" $ nfIO $ testCompositionAppendInMVar 100 306 | , bench "mvar-stored cons then final reverse x100" $ nfIO $ testConsReverseInMVar 100 307 | , bench "mvar-stored cons-composition append x10000" $ nfIO $ testCompositionAppendInMVar 10000 308 | , bench "mvar-stored cons then final reverse x10000" $ nfIO $ testConsReverseInMVar 10000 309 | 310 | , bench "storing mvar-stored cons-composition append x10" $ nfIO $ testStoreCompositionAppendInMVar 10 311 | , bench "storing mvar-stored cons then final reverse x10" $ nfIO $ testStoreConsReverseInMVar 10 312 | , bench "storing mvar-stored cons-composition append x100" $ nfIO $ testStoreCompositionAppendInMVar 100 313 | , bench "storing mvar-stored cons then final reverse x100" $ nfIO $ testStoreConsReverseInMVar 100 314 | , bench "storing mvar-stored cons-composition append x10000" $ nfIO $ testStoreCompositionAppendInMVar 10000 315 | , bench "storing mvar-stored cons then final reverse x10000" $ nfIO $ testStoreConsReverseInMVar 10000 316 | ] 317 | , bgroup "arrays" $ 318 | [ bgroup "indexing" $ 319 | [ bench "index 8th list" $ nf ([(1::Int)..8] !!) 7 320 | , bench "index 8th IArray" $ nf (arr8 !) 7 321 | , bench "index 8th Vector" $ nf (V.unsafeIndex vec8) 7 322 | , bench "index 8th MVector" $ nfIO $ (MV.unsafeRead mvec8) 7 323 | -- I think this is basically MVector AFAICT 324 | , bench "index 8th Primitiv MutableArray" $ nfIO $ (P.readArray parr8) 7 325 | , bench "index 8th Primitiv Array" $ nf (P.indexArray iparr8) 7 326 | 327 | , bench "index 16th list" $ nf ([(1::Int)..16] !!) 15 328 | , bench "index 16th IArray" $ nf (arr16 !) 
15 329 | , bench "index 16th Vector" $ nf (V.unsafeIndex vec16) 15 330 | , bench "index 16th MVector" $ nfIO $ (MV.unsafeRead mvec16) 15 331 | , bench "index 16th Primitiv MutableArray" $ nfIO $ (P.readArray parr16) 15 332 | , bench "index 16th Primitiv Array" $ nf (P.indexArray iparr16) 15 333 | 334 | , bench "readArrayElem for CAS (MutableArray)" $ nfIO (fmap peekTicket $ readArrayElem parr16 15 ) 335 | , bench "readArray (MutableArray)" $ nfIO (P.readArray parr16 15 ) 336 | , bench "readByteArray (MutableByteArray, usable for CAS)" $ nfIO (P.readByteArray ba16 15 :: IO Int) 337 | , bench "read Mutable Unboxed Vector" $ (UMV.read umvvec16 15 :: IO Int) 338 | ] 339 | , bgroup "writing" $ 340 | [ bench "write MutableArray" $ (P.writeArray parr16 15 1 :: IO ()) 341 | , bench "CAS MutableArray (along with a readArrayElem)" (readArrayElem parr16 15 >>= (\t-> casArrayElem parr16 15 t 2)) 342 | , bench "write MutableByteArray" (P.writeByteArray ba16 15 (1::Int) :: IO ()) 343 | , bench "CAS MutableByteArray (along with a readByteArray)" (P.readByteArray ba16 15 >>= (\t-> casByteArrayInt ba16 15 t (2::Int))) 344 | , bench "write Mutable Unboxed Vector" $ (UMV.write umvvec16 15 2 :: IO ()) 345 | , bench "fetchAddByteArrayInt" $ fetchAddByteArrayInt ba16 14 1 346 | , bench "fetchAddByteArrayInt w result forced" $ nfIO $ fetchAddByteArrayInt ba16 14 1 347 | ] 348 | , bgroup "creating" $ 349 | [ bench "new MVector 8 Ints" $ (MVec.new 8 :: IO (MVec.IOVector Int)) 350 | , bench "new MVector 32 Ints" $ (MVec.new 32 :: IO (MVec.IOVector Int)) 351 | , bench "unsafeNew MVector 8 Ints" $ (MVec.unsafeNew 8 :: IO (MVec.IOVector Int)) 352 | , bench "unsafeNew MVector 32 Ints" $ (MVec.unsafeNew 32 :: IO (MVec.IOVector Int)) 353 | 354 | , bench "new MutableArray 8 Ints" $ (P.newArray 8 0 :: IO (P.MutableArray (PrimState IO) Int)) 355 | , bench "new MutableArray 32 Ints" $ (P.newArray 32 0 :: IO (P.MutableArray (PrimState IO) Int)) 356 | , bench "new MutableArray 32 Nothing :: Maybe Ints" 
$ (P.newArray 32 Nothing :: IO (P.MutableArray (PrimState IO) (Maybe Int))) 357 | , bench "cloned MutableArray 8 Ints" $ (P.cloneMutableArray parr8 0 8 :: IO (P.MutableArray (PrimState IO) Int)) 358 | , bench "cloned MutableArray 32 Ints" $ (P.cloneMutableArray parr32 0 32 :: IO (P.MutableArray (PrimState IO) Int)) 359 | , bench "cloned MutableArray 128 Ints" $ (P.cloneMutableArray parr128 0 128 :: IO (P.MutableArray (PrimState IO) Int)) 360 | , bench "cloned MutableArray 512 Ints" $ (P.cloneMutableArray parr512 0 512 :: IO (P.MutableArray (PrimState IO) Int)) 361 | 362 | , bench "new MutableByteArray 8 Ints" (P.newByteArray (8* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO))) 363 | , bench "new MutableByteArray 32 Ints" (P.newByteArray (32* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO))) 364 | , bench "new set MutableByteArray 32 Ints" $ (P.newByteArray (32* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO))) >>= \a-> P.setByteArray a 0 32 (0 :: Int) 365 | , bench "new MutableByteArray 128 Ints" (P.newByteArray (128* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO))) 366 | , bench "new set MutableByteArray 128 Ints" $ (P.newByteArray (128* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO))) >>= \a-> P.setByteArray a 0 128 (0 :: Int) 367 | , bench "new MutableByteArray 512 Ints" (P.newByteArray (512* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO))) 368 | , bench "new set MutableByteArray 512 Ints" $ (P.newByteArray (512* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO))) >>= \a-> P.setByteArray a 0 512 (0 :: Int) 369 | , bench "new MutableByteArray 2048 Ints" (P.newByteArray (2048* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO))) 370 | , bench "new MutableByteArray 8192 Ints" (P.newByteArray (8192* P.sizeOf (0 :: Int)) :: IO (P.MutableByteArray (PrimState IO))) 371 | , bench "new unboxed Mutable Vector 8 Ints" (UMV.new 8 :: IO (UMV.IOVector Int)) 372 | , 
bench "new unboxed Mutable Vector 32 Ints" (UMV.new 32 :: IO (UMV.IOVector Int)) 373 | , bench "new unboxed Mutable Vector 128 Ints" (UMV.new 128 :: IO (UMV.IOVector Int)) 374 | ] 375 | ] 376 | ] 377 | ] 378 | -- takeMVar mvWithFinalizer -- keep finalizer from actually running 379 | 380 | -------------------------------------------------------------------------------- /MainN.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RankNTypes #-} 2 | {-# LANGUAGE PackageImports #-} 3 | module Main 4 | where 5 | 6 | import Control.Concurrent.Async 7 | import Control.Monad 8 | import System.Environment 9 | 10 | import qualified Data.Primitive as P 11 | import Control.Concurrent 12 | import Control.Concurrent.Chan 13 | import Control.Concurrent.STM 14 | import Control.Concurrent.STM.TQueue 15 | import Control.Concurrent.STM.TBQueue 16 | 17 | import Control.Concurrent.MVar 18 | import Data.IORef 19 | import Criterion.Main 20 | import Control.Exception(evaluate) 21 | 22 | 23 | import qualified "chan-split-fast" Control.Concurrent.Chan.Split as S 24 | import qualified "split-channel" Control.Concurrent.Chan.Split as SC 25 | import Data.Primitive.MutVar 26 | import Control.Monad.Primitive(PrimState) 27 | import Data.Atomics.Counter 28 | import Data.Atomics 29 | 30 | #if MIN_VERSION_base(4,7,0) 31 | #else 32 | import qualified Data.Concurrent.Queue.MichaelScott as MS 33 | #endif 34 | 35 | import GHC.Conc 36 | 37 | import Benchmarks 38 | 39 | -- These tests initially taken from stm/bench/chanbench.hs, ported to 40 | -- criterion, with some additions. 41 | -- 42 | -- The original used CPP to avoid code duplication while also ensuring GHC 43 | -- optimized the code in a realistic fashion. Here we just copy paste. 
-- | Entry point for the multi-threaded benchmark suite.  Measures
-- throughput of concurrent atomic modifications on the shared variable
-- types (MVar, IORef, TVar, atomic-primops Counter) and of the various
-- channel implementations, under writer/writer and reader/reader
-- contention.  Meant to be run with @+RTS -N2@ or more.
main = do
    -- Number of operations per benchmark iteration.
    let n = 100000
    --let n = 2000000 -- original suggested value, bugs if exceeded

    procs <- getNumCapabilities
    let procs_div2 = procs `div` 2
    -- NOTE(review): `procs_div2 >= 0` is always True, so this guard is
    -- vacuous and the error below can never fire; presumably the intent
    -- was `procs_div2 >= 1` (i.e. at least two capabilities) -- TODO confirm.
    if procs_div2 >= 0 then return ()
        else error "Run with RTS +N2 or more"

    mv <- newEmptyMVar -- This to be left empty after each test
    mvFull <- newMVar undefined
    -- --
    -- mvWithFinalizer <- newEmptyMVar
    -- mkWeakMVar mvWithFinalizer $ return ()
    -- --
    -- mvFinalizee <- newMVar 'a'
    -- mvWithFinalizer <- newMVar ()
    -- mkWeakMVar mvWithFinalizer $
    --     modifyMVar_ mvFinalizee (const $ return 'b')
    -- --
    tmv <- newEmptyTMVarIO
    tv <- newTVarIO undefined
    ior <- newIORef undefined
    mutv <- newMutVar undefined

    -- One shared counter per variable type under test; all contending
    -- threads in a group hammer the same one.
    counter_mvar <- newMVar (1::Int)
    counter_ioref <- newIORef (1::Int)
    counter_tvar <- newTVarIO (1::Int)
    counter_atomic_counter <- newCounter (1::Int)

    -- Channels shared across bench groups: one group fills them, a later
    -- group drains them (see the "hackish" note in the Chan group below).
    fill_empty_chan <- newChan
    fill_empty_tchan <- newTChanIO
    fill_empty_tqueue <- newTQueueIO
    fill_empty_tbqueue <- newTBQueueIO maxBound
    (fill_empty_fastI, fill_empty_fastO) <- S.newSplitChan
    (fill_empty_splitchannelI, fill_empty_splitchannelO) <- SC.new
#if MIN_VERSION_base(4,7,0)
#else
    fill_empty_lockfree <- MS.newQ
#endif

    defaultMain $
        [ bgroup "Var primitives" $
            -- This gives us an idea of how long a lock is held by these atomic
            -- ops, and the effects of retry/blocking scheduling behavior.
            -- compare this with latency measure in Main1 to get the whole
            -- picture:
            -- Subtract the cost of:
            --   - 2 context switches
            --   - 4 newEmptyMVar
            --   - 4 takeMVar
            --   - 4 putMVar
            -- TODO: also test with N green threads per core.
            [ bgroup ("Throughput on "++(show n)++" concurrent atomic mods") $
                -- just forks some threads all atomically modifying a variable:
                let {-# INLINE mod_test #-}
                    mod_test = mod_test_n n
                    {-# INLINE mod_test_n #-}
                    -- Fork `threads` workers, release them all at once via
                    -- their start MVars, and block until every done MVar is
                    -- filled; each worker performs its share of the n' mods.
                    mod_test_n n' = \threads modf -> do
                        dones <- replicateM threads newEmptyMVar ; starts <- replicateM threads newEmptyMVar
                        mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n' `div` threads) modf >> putMVar done1 ()) $ zip starts dones
                        mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones

                    -- We use this payload to scale contention; on my machine
                    -- timesN values of 1,2,3,4 run at fairly consistent: 15ns,
                    -- 19ns, 29ns, and 37ns (note: 22.4ns for an atomicModifyIORef)
                    {-# NOINLINE payload #-}
                    payload timesN = (evaluate $ (foldr ($) 2 $ replicate timesN sqrt) :: IO Float)

                    -- One bench per atomic-modify primitive, each followed by
                    -- `payload numPL` of pure work to dial back contention.
                    varGroupPayload perProc numPL = [
                        bench "modifyMVar_" $ mod_test (procs*perProc) $
                            (modifyMVar_ counter_mvar (return . (+1)) >> payload numPL)

                      , bench "modifyMVarMasked_" $ mod_test (procs*perProc) $
                            (modifyMVarMasked_ counter_mvar (return . (+1)) >> payload numPL)

                      , bench "atomicModifyIORef'" $ mod_test (procs*perProc) $
                            (atomicModifyIORef' counter_ioref (\x-> (x+1,()) ) >> payload numPL)

                      , bench "atomically modifyTVar'" $ mod_test (procs*perProc) $
                            ((atomically $ modifyTVar' counter_tvar ((+1))) >> payload numPL)

                      , bench "incrCounter (atomic-primops)" $ mod_test (procs*perProc) $
                            (incrCounter 1 counter_atomic_counter >> payload numPL)

                      , bench "atomicModifyIORefCAS (atomic-primops)" $ mod_test (procs*perProc) $
                            (atomicModifyIORefCAS counter_ioref (\x-> (x+1,()) ) >> payload numPL)

                      , bench "atomicModifyIORefCAS' (my CAS loop)" $ mod_test (procs*perProc) $
                            (atomicModifyIORefCAS' counter_ioref (\x-> (x+1,()) ) >> payload numPL)

                      ]

                in [ bgroup "1 thread per HEC, full contention" $
                       [ bench "modifyMVar_" $ mod_test procs $
                           (modifyMVar_ counter_mvar (return . (+1)))

                       , bench "modifyMVarMasked_" $ mod_test procs $
                           (modifyMVarMasked_ counter_mvar (return . (+1)))

                       , bench "atomicModifyIORef'" $ mod_test procs $
                           (atomicModifyIORef' counter_ioref (\x-> (x+1,()) ))

                       , bench "atomically modifyTVar'" $ mod_test procs $
                           (atomically $ modifyTVar' counter_tvar ((+1)))

                       , bench "incrCounter (atomic-primops)" $ mod_test procs $
                           (incrCounter 1 counter_atomic_counter)

                       , bench "atomicModifyIORefCAS (atomic-primops)" $ mod_test procs $
                           (atomicModifyIORefCAS counter_ioref (\x-> (x+1,()) ))

                       , bench "atomicModifyIORefCAS' (my CAS loop)" $ mod_test procs $
                           (atomicModifyIORefCAS' counter_ioref (\x-> (x+1,()) ))

                       -- I want to compare these with the same results above;
                       -- see also TVarExperiment:
                       -- , bench "atomicModifyIORef' x10" $ mod_test_n (10*n) procs $
                       --     (atomicModifyIORef' counter_ioref (\x-> (x+1,()) ))
                       -- , bench "atomically modifyTVar' x10" $ mod_test_n (10*n) procs $
                       --     (atomically $ modifyTVar' counter_tvar ((+1)))
                       ]
                   , bgroup "2 threads per HEC, full contention" $
                       [ bench "modifyMVar_" $ mod_test (procs*2) $
                           (modifyMVar_ counter_mvar (return . (+1)))

                       , bench "modifyMVarMasked_" $ mod_test (procs*2) $
                           (modifyMVarMasked_ counter_mvar (return . (+1)))

                       -- WTF! This is suddenly giving me a stack overflow....
                       -- , bench "atomicModifyIORef'" $ mod_test (procs*2) $
                       --     (atomicModifyIORef' counter_ioref (\x-> (x+1,()) ))

                       , bench "atomically modifyTVar'" $ mod_test (procs*2) $
                           (atomically $ modifyTVar' counter_tvar ((+1)))

                       , bench "incrCounter (atomic-primops)" $ mod_test (procs*2) $
                           (incrCounter 1 counter_atomic_counter)

                       , bench "atomicModifyIORefCAS (atomic-primops)" $ mod_test (procs*2) $
                           (atomicModifyIORefCAS counter_ioref (\x-> (x+1,()) ))

                       ]

                   {- COMMENTING, since the atomicModifyIORef' below is *again*
                      causing stack overflow for no apparent reason TODO why?

                   -- NOTE: adding more threads per-HEC at this point shows
                   -- little difference (very bad MVar locking behavior has
                   -- mostly disappeared)
                   --
                   -- test dialing back the contention:
                   , bgroup "1 threads per HEC, 1 payload" $
                       varGroupPayload 1 1
                   , bgroup "1 threads per HEC, 2 payload" $
                       varGroupPayload 1 2
                   , bgroup "1 threads per HEC, 4 payload" $
                       varGroupPayload 1 4
                   , bgroup "1 threads per HEC, 8 payload" $
                       varGroupPayload 1 8

                   -- this is an attempt to see if a somewhat random delay can
                   -- get rid of (some or all) the very slow runs; hypothesis
                   -- being that those runs get into some bad harmonics and
                   -- contention is slow to resolve.
                   , bgroup "1 thread per HEC, scattered payloads with IORefs" $
                       let benchRandPayloadIORef evry pyld =
                               bench ("atomicModifyIORef' "++(show evry)++" "++(show pyld)) $
                                   mod_test procs $
                                       (atomicModifyIORef' counter_ioref (\x-> (x+1,x) )
                                           >>= \x-> if x `mod` evry == 0 then payload pyld else return 1)
                       in [ benchRandPayloadIORef 2 1
                          , benchRandPayloadIORef 2 4
                          , benchRandPayloadIORef 2 16
                          , benchRandPayloadIORef 8 1
                          , benchRandPayloadIORef 8 4
                          , benchRandPayloadIORef 8 16
                          , benchRandPayloadIORef 32 1
                          , benchRandPayloadIORef 32 4
                          , benchRandPayloadIORef 32 16
                          ]

                   , bgroup "Test Payload" $
                       [ bench "payload x1" $ payload 1
                       , bench "payload x2" $ payload 2
                       , bench "payload x4" $ payload 4
                       , bench "payload x8" $ payload 8
                       ]
                   -}
                   ]
            , bgroup "Misc" $
                -- If the second shows some benefit on just two threads, then
                -- it represents a useful technique for reducing contention:
                [ bench "contentious atomic-maybe-modify IORef" $ atomicMaybeModifyIORef n
                , bench "read first, then maybe contentious atomic-maybe-modify IORef" $ readMaybeAtomicModifyIORef n
                , bench "readForCAS, then CAS (atomic-primops)" $ readMaybeCAS n
                -- NOT RELEVANT:
                -- , bench "Higher contention, contentious atomic-maybe-modify IORef" $ atomicMaybeModifyIORefHiC n
                -- , bench "Higher contention, read first, then maybe contentious atomic-maybe-modify IORef" $ readMaybeAtomicModifyIORefHiC n
                , bench "contentious atomic-maybe-modify TVar" $ atomicMaybeModifyTVar n
                , bench "read first, then maybe contentious atomic-maybe-modify TVar" $ readMaybeAtomicModifyTVar n

                -- we should expect these to be the same:
                , bench "reads against atomicModifyIORefs" $ readsAgainstAtomicModifyIORefs n
                , bench "reads against modifyIORefs" $ readsAgainstNonAtomicModify n
                -- TODO how do these compare with STM?
                ]
            ]
        -- TODO: define these in terms of numCapabilities:
        --   1 r thread 1 w thread: measuring r/w contention
        --   2 w threads ONLY: measure w/w contention, THEN:
        --   2 r threads ONLY: measure r/r contention
        --   more threads: measuring descheduling bottlenecks, context switching overheads (+ above)
        --     above better tested outside criterion, w/ eventlogging
        --     also test equivalents of above on 8-core
        , bgroup "Channel implementations" $
            [ bgroup ("Operations on "++(show n)++" messages") $
                [ bgroup "For scale" $
                    -- For TQueue style chans, test the cost of reverse
                    [ bench "reverse [1..n]" $ nf (\n'-> reverse [1..n']) n
                    , bench "reverse replicate n 1" $ nf (\n'-> replicate n' (1::Int)) n
                    ]
                , bgroup "Chan" $
                    -- this gives us a measure of effects of contention between
                    -- readers and writers when compared with single-threaded
                    -- version:
                    [ bench "async 1 writer 1 readers" $ runtestChanAsync 1 1 n
                    -- NOTE: this is a bit hackish, filling in one test and
                    -- reading in the other; make sure memory usage isn't
                    -- influencing mean:
                    --
                    -- This measures writer/writer contention, in this case I
                    -- think we see a lot of thread blocking/waiting delays
                    , bench ("async "++(show procs)++" writers") $ do
                        dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
                        mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (writeChan fill_empty_chan ()) >> putMVar done1 ()) $ zip starts dones
                        mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
                    -- This measures reader/reader contention:
                    , bench ("async "++(show procs)++" readers") $ do
                        dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
                        mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (readChan fill_empty_chan) >> putMVar done1 ()) $ zip starts dones
                        mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
                    -- This is measuring the effects of bottlenecks caused by
                    -- descheduling, context-switching overhead (forced by
                    -- fairness properties in the case of MVar), as well as
                    -- all of the above; this is probably less than
                    -- informative. Try threadscope on a standalone test:
                    , bench "contention: async 100 writers 100 readers" $ runtestChanAsync 100 100 n
                    ]
                , bgroup "TChan" $
                    [ bench "async 1 writers 1 readers" $ runtestTChanAsync 1 1 n
                    -- This measures writer/writer contention:
                    {- LIVELOCK!!!
                    , bench ("async "++(show procs)++" writers") $ do
                        dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
                        mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (atomically $ writeTChan fill_empty_tchan ()) >> putMVar done1 ()) $ zip starts dones
                        mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
                    -- This measures reader/reader contention:
                    , bench ("async "++(show procs)++" readers") $ do
                        dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
                        mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (atomically $ readTChan fill_empty_tchan) >> putMVar done1 ()) $ zip starts dones
                        mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
                    , bench "contention: async 100 writers 100 readers" $ runtestTChanAsync 100 100 n
                    -}
                    ]
                , bgroup "TQueue" $
                    [ bench "async 1 writers 1 readers" $ runtestTQueueAsync 1 1 n
                    -- This measures writer/writer contention:
                    , bench ("async "++(show procs)++" writers") $ do
                        dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
                        mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (atomically $ writeTQueue fill_empty_tqueue ()) >> putMVar done1 ()) $ zip starts dones
                        mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
                    -- This measures reader/reader contention:
                    , bench ("async "++(show procs)++" readers") $ do
                        dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
                        mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (atomically $ readTQueue fill_empty_tqueue) >> putMVar done1 ()) $ zip starts dones
                        mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
                    , bench "contention: async 100 writers 100 readers" $ runtestTQueueAsync 100 100 n
                    ]
                , bgroup "TBQueue" $
                    [ bench "async 1 writers 1 readers" $ runtestTBQueueAsync 1 1 n
                    -- This measures writer/writer contention:
                    , bench ("async "++(show procs)++" writers") $ do
                        dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
                        mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (atomically $ writeTBQueue fill_empty_tbqueue ()) >> putMVar done1 ()) $ zip starts dones
                        mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
                    -- This measures reader/reader contention:
                    , bench ("async "++(show procs)++" readers") $ do
                        dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
                        mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (atomically $ readTBQueue fill_empty_tbqueue) >> putMVar done1 ()) $ zip starts dones
                        mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
                    , bench "contention: async 100 writers 100 readers" $ runtestTBQueueAsync 100 100 n
                    ]
                -- OTHER CHAN IMPLEMENTATIONS:
                , bgroup "chan-split-fast" $
                    [ bench "async 1 writers 1 readers" $ runtestSplitChanAsync 1 1 n
                    -- This measures writer/writer contention:
                    , bench ("async "++(show procs)++" writers") $ do
                        dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
                        mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (S.writeChan fill_empty_fastI ()) >> putMVar done1 ()) $ zip starts dones
                        mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
                    -- This measures reader/reader contention:
                    , bench ("async "++(show procs)++" readers") $ do
                        dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
                        mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (S.readChan fill_empty_fastO) >> putMVar done1 ()) $ zip starts dones
                        mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
                    , bench "contention: async 100 writers 100 readers" $ runtestSplitChanAsync 100 100 n
                    ]
                , bgroup "split-channel" $
                    [ bench "async 1 writers 1 readers" $ runtestSplitChannelAsync 1 1 n
                    -- This measures writer/writer contention:
                    , bench ("async "++(show procs)++" writers") $ do
                        dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
                        mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (SC.send fill_empty_splitchannelI ()) >> putMVar done1 ()) $ zip starts dones
                        mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
                    -- This measures reader/reader contention:
                    , bench ("async "++(show procs)++" readers") $ do
                        dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
                        mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (SC.receive fill_empty_splitchannelO) >> putMVar done1 ()) $ zip starts dones
                        mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
                    , bench "contention: async 100 writers 100 readers" $ runtestSplitChannelAsync 100 100 n
                    ]
                -- michael-scott queue implementation, using atomic-primops
#if MIN_VERSION_base(4,7,0)
#else
                , bgroup "lockfree-queue" $
                    [ bench "async 1 writer 1 readers" $ runtestLockfreeQueueAsync 1 1 n
                    , bench ("async "++(show procs)++" writers") $ do
                        dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
                        mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (MS.pushL fill_empty_lockfree ()) >> putMVar done1 ()) $ zip starts dones
                        mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
                    , bench ("async "++(show procs)++" readers") $ do
                        dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar
                        mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (msreadR fill_empty_lockfree) >> putMVar done1 ()) $ zip starts dones
                        mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones
                    , bench "contention: async 100 writers 100 readers" $ runtestLockfreeQueueAsync 100 100 n
                    ]
#endif
                -- Chase / Lev work-stealing queue
                -- NOTE: we can have at most 1 writer (pushL); not a general-purpose queue, so don't do more tests
                , bgroup "chaselev-dequeue" $
                    [ bench "async 1 writer 1 readers" $ runtestChaseLevQueueAsync_1_1 n
                    ]
                ]
            ]
        , bgroup "Arrays misc" $
            -- be sure to subtract "cost" of 2 forkIO's and context switch
            [ bench "baseline" $
                do x <- newEmptyMVar
                   y <- newEmptyMVar
                   forkIO $ (replicateM_ 500 $ return ()) >> putMVar x ()
                   forkIO $ (replicateM_ 500 $ return ()) >> putMVar y ()
                   takeMVar x
                   takeMVar y
            , bench "New 32-length MutableArrays x1000 across two threads" $
                do x <- newEmptyMVar
                   y <- newEmptyMVar
                   forkIO $ (replicateM_ 500 $ (P.newArray 32 0 :: IO (P.MutableArray (PrimState IO) Int))) >> putMVar x ()
                   forkIO $ (replicateM_ 500 $ (P.newArray 32 0 :: IO (P.MutableArray (PrimState IO) Int))) >> putMVar y ()
                   takeMVar x
                   takeMVar y
            , bench "New MVar x1000 across two threads" $
                do x <- newEmptyMVar
                   y <- newEmptyMVar
                   forkIO $ (replicateM_ 500 $ (newEmptyMVar :: IO (MVar Int))) >> putMVar x ()
                   forkIO $ (replicateM_ 500 $ (newEmptyMVar :: IO (MVar Int))) >> putMVar y ()
                   takeMVar x
                   takeMVar y
            ]
        ]
    -- to make sure the counter is actually being incremented!:
    cntv <- readCounter counter_atomic_counter
    putStrLn $ "Final counter val is "++(show cntv)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Criterion benchmarks for the
different haskell concurrent channel 2 | implementations in `base` and `stm` and elsewhere, as well as simple var 3 | read/write benchmarks for `MVar`, `IORef`, and `TVar` and others. 4 | 5 | These benchmarks were originally taken from bench/chanbench.hs in the `stm` 6 | package, ported to criterion with some additions. To run them on your machine: 7 | 8 | cabal sandbox init 9 | cabal install 10 | # For HTML reports: 11 | ./.cabal-sandbox/bin/chan-benchmarks -g -o Benchmarks.html +RTS -N 12 | 13 | Feel free to send pull requests with new or improved benchmarks. 14 | 15 | # Sample Results 16 | 17 | Nice HTML output for a sample run performed on: 18 | 19 | $ lscpu 20 | Architecture: i686 21 | CPU op-mode(s): 32-bit, 64-bit 22 | Byte Order: Little Endian 23 | CPU(s): 4 24 | On-line CPU(s) list: 0-3 25 | Thread(s) per core: 2 26 | Core(s) per socket: 2 27 | Socket(s): 1 28 | Vendor ID: GenuineIntel 29 | CPU family: 6 30 | Model: 58 31 | Stepping: 9 32 | CPU MHz: 1400.000 33 | BogoMIPS: 4988.38 34 | Virtualization: VT-x 35 | L1d cache: 32K 36 | L1i cache: 32K 37 | L2 cache: 256K 38 | L3 cache: 4096K 39 | 40 | 41 | ...are at `Benchmarks.chans_sample.html` and `Benchmarks.vars_sample.html`. 42 | 43 | ## Some analysis of primitive operations 44 | 45 | forkIO 309.ns 46 | context switch 2975.ns 47 | 48 | getNumCapabilities 4.1ns 49 | myThreadId 4.7ns 50 | 51 | newIORef 7.19 52 | readIORef 3.74ns 53 | writeIORef 7.02ns 54 | modifyIORef' 7.02ns -- even though this is implemented as read+write?? 55 | atomicModifyIORef' 22.43ns 56 | atomicModifyIORef 53.67ns -- variable; showing cost of lazy creation of (const 'x') thunks? 
57 | 58 | newEmptyMVar 7.32ns 59 | takeMVar 16.21ns 60 | putMVar 9.02ns 61 | modifyMVarMasked_ 35.09ns -- handler overhead ~ 10ns 62 | 63 | newTVarIO 12.96ns 64 | atomically writeTVar 53.35ns 65 | atomically readTVar 54.29ns 66 | readTVarIO 4.13ns 67 | atomically modifyTVar' 63.76ns 68 | 69 | --counter from atomic-primops v0.5 70 | -- NOTE: ACTUALLY 71 | newCounter 11.99ns 72 | incrCounter 9.28ns 73 | 74 | 75 | Throughput of incrementing atomic counter with... 76 | modifyMVar_ 331.29ms / 100000 = 3312.9ns per increment 77 | modifyMVarMasked_ 323.41ms / 100000 3234.1ns -- NOTE: small variance 78 | atomicModifyIORef' 87.66ms / 100000 876.6ns -- NOTE: sort of folded normal distribution beginning at 9.76ms , where the max we saw was 256.12ms 79 | modifyTVar' 19.10ms / 100000 191.0ns -- NOTE: much lower variance, with samples 17 - 21 ms 80 | 81 | incrCounter 0.97ms / 100000 -- NOTE: GARBAGE; COUNTER NOT ATOMIC 82 | 83 | 84 | ## Random resources 85 | 86 | Some discussion of nitty-gritty of `atomicModifyIORef`: 87 | 88 | http://stackoverflow.com/questions/10102881/haskell-how-does-atomicmodifyioref-work 89 | 90 | 91 | ## Random Analysis 92 | 93 | Back-of-envelope look at how primitive var read write cost relates to chan RW 94 | cost. 95 | 96 | As of the current test run, looking at the mean times for the fastest three 97 | contenders on the easiest test (write some, read some), we get the following 98 | mean timings for *one read and write* (although reads and writes might vary 99 | widely) 100 | 101 | Chan 135 ns 102 | TQueue 175 ns 103 | chan-split-fast 88 ns 104 | 105 | Measured timings for an atomic `modify` (or take/put; again not ideal) divided 106 | by 2 (i.e. 
very approx timing for a take/read or put/write): 107 | 108 | MVar 15 ns 109 | TVar 36 ns 110 | 111 | and var creation: 112 | 113 | MVar 24 ns 114 | TVar 21 ns 115 | 116 | Counting var operations, with (count) around slowest path 117 | 118 | chan-split-fast 119 | --------------- 120 | puts takes creates TOTAL 121 | readChan 1(2) 1(3) (1) 30-99 ns 122 | writeChan 1(2) 1 / 30-45 ns 123 | TOTAL for read and write: 60-144 ns 124 | 125 | So if we didn't screw that up: 126 | - yes, read/write timing dominates 127 | - but there might still be room to shave time elsewhere 128 | - in "write some / read some" we don't take the slow reader-blocked path much, as expected 129 | -------------------------------------------------------------------------------- /RetryExperiment.hs: -------------------------------------------------------------------------------- 1 | module Main 2 | where 3 | 4 | import Control.Concurrent 5 | import Control.Concurrent.STM 6 | import Control.Concurrent.STM.TSem 7 | import Control.Monad 8 | 9 | import System.IO 10 | import Debug.Trace 11 | 12 | main = do 13 | hSetBuffering stdout NoBuffering 14 | noMansLand <- replicateM 998 $ newTVarIO 0 15 | t0 <- newTVarIO (1::Int) 16 | t999 <- newTVarIO (-1) 17 | let ts = t0:noMansLand++[t999] 18 | 19 | done <- atomically $ newTSem 0 20 | forkIO $ atomically $ nestedOrElseMap done ts 21 | 22 | -- need enough time here for nestedOrElseMap thread above to move past t0 23 | 24 | -- in this version, the modifications to t0 force nestedOrElseMap to be restarted 25 | forkIO (trace "starting vexing!" $ forever $ atomically $ (modifyTVar' t0 (+1) >> trace "vex" (return ()))) 26 | -- in this version nestedOrElseMap causes this transaction to be restarted and never makes progress: 27 | --forkIO (atomically (trace "starting vexing!" $ forever $ (modifyTVar' t0 (+1) >> trace "vex" (return ())))) 28 | 29 | atomically $ waitTSem done 30 | putStrLn "No livelock! 
Did the t0 counter get incremented?: " 31 | atomically (readTVar t0) >>= print 32 | 33 | nestedOrElseMap :: TSem -> [TVar Int] -> STM () 34 | nestedOrElseMap done ts = trace "nestedOrElseMap starting" $ foldl1 orElse $ map transaction $ zip [(1::Int)..] ts 35 | where transaction (cnt,v) = do 36 | n <- traceShow cnt $ readTVar v 37 | if n < 0 38 | then trace "@" (modifyTVar' v (subtract 1)) >> signalTSem done 39 | else retry 40 | 41 | -- NOTE: this shows at least that we get livelock as we wait on the first transaction (it may also be executed; add Debug.Trace) 42 | -- that's not really working... 43 | -- 44 | -- CONSIDER: 45 | -- The behavior we see ensures that all branches of orElse see the same view of 46 | -- the same variables, but is overzealous! It should do validation for each 47 | -- subtransaction by only checking oldest parent read of each variable *used* 48 | -- in the transaction, and if any changed, then go up only to the last 49 | -- inconsistency) 50 | -- ""If both t1 and t2 execute retry then even though the effects of t1 are 51 | -- thrown away, it could be that a change to a TVar that is only in the 52 | -- access set of t1 will allow the whole transaction to succeed when it is 53 | -- woken. To solve this problem, when a branch on a nested transaction is 54 | -- aborted the access set of the nested transaction is merged as a read set 55 | -- into the parent TRec. Specifically if the TVar is in any TRec up the 56 | -- chain of nested transactions it must be ignored, otherwise it is entered 57 | -- as a new entry (retaining just the read) in the parent TRec."" 58 | -- 59 | -- "aborted"? "read set"? 60 | -- -- is this the culprit? No. Clearly we move on from that branch in test. 61 | -- "A validation failure in the first branch aborts the entire transaction, not just the nested part" 62 | -- "validation"? 
63 | -- "Before a transaction can make its effects visible to other threads 64 | -- it must check that it has seen a consistent view of memory while it 65 | -- was executing. Most of the work is done in 66 | -- validate_and_acquire_ownership by checking that TVars hold their 67 | -- expected values. " 68 | -- 69 | -- when committing, do we force that the read set is unchanged? 70 | -- 71 | -- Soooo 72 | -- The writes to variables from other branches are causing a validation failure 73 | -- and causing whole transaction to reset 74 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /TVarExperiment.hs: -------------------------------------------------------------------------------- 1 | module Main 2 | where 3 | 4 | -- Testing interleaving on IORefs and TVars under contention. 
Most interested 5 | -- in 2 competing HECs 6 | 7 | import Control.Concurrent.STM 8 | import Control.Concurrent 9 | import Data.IORef 10 | import Control.Monad 11 | import Data.List 12 | import System.Environment 13 | 14 | main = do 15 | [msgsS] <- getArgs 16 | 17 | let msgs = read msgsS :: Int -- per thread (with 2 threads) 18 | tmsgs <- testTVar msgs 19 | iormsgs <- testIORef msgs 20 | 21 | -- 69% mean 22 | pprint $ analyze iormsgs 23 | -- 9% mean 24 | pprint $ analyze tmsgs 25 | 26 | pprint (prefix, rest) = 27 | putStrLn ( (show $ round $ (prefix*100))++"% of messages in constant prefix, with "++(show $ round (rest*100))++"% interleaving for rest" ) 28 | 29 | analyze :: [Int] -> (Float, Float) 30 | analyze l@(a:as) = 31 | -- strip off starting list 32 | let lenL = length l 33 | lSuff = dropWhile (/=a) as 34 | lenPrefix = length (a:takeWhile (==a) as) 35 | in ( fromIntegral lenPrefix / fromIntegral lenL 36 | , fromIntegral (flops lSuff) / fromIntegral (lenL - lenPrefix - 1) ) 37 | 38 | flops = subtract 1 . length . 
group 39 | 40 | testTVar :: Int -> IO [Int] 41 | testTVar msgs = do 42 | st <- newTVarIO [] 43 | start1 <- newEmptyMVar 44 | start2 <- newEmptyMVar 45 | vs <- mapM (\(n,start)-> do 46 | v <- newEmptyMVar 47 | forkIO $ takeMVar start >> (replicateM_ msgs $ atomically $ modifyTVar' st (n:)) >> putMVar v () 48 | return v 49 | ) [(1, start1),(2,start2)] 50 | mapM_ (flip putMVar ()) [start2,start1] 51 | mapM_ takeMVar vs -- wait 52 | readTVarIO st 53 | 54 | testIORef :: Int -> IO [Int] 55 | testIORef msgs = do 56 | st <- newIORef [] 57 | start1 <- newEmptyMVar 58 | start2 <- newEmptyMVar 59 | vs <- mapM (\(n,start)-> do 60 | v <- newEmptyMVar 61 | forkIO $ takeMVar start >> (replicateM_ msgs $ atomicModifyIORef' st (\st'-> (n:st',()))) >> putMVar v () 62 | return v 63 | ) [(1, start1),(2,start2)] 64 | mapM_ (flip putMVar ()) [start2,start1] 65 | mapM_ takeMVar vs -- wait 66 | readIORef st 67 | -------------------------------------------------------------------------------- /TestAtomics.hs: -------------------------------------------------------------------------------- 1 | import Control.Concurrent 2 | import Data.Atomics.Counter 3 | import Data.Atomics 4 | import Data.IORef 5 | import Control.Monad 6 | import GHC.Conc 7 | import Control.Exception(evaluate) 8 | import qualified Data.Set as Set 9 | import Data.List 10 | 11 | main = do 12 | testCounterOverflow 13 | testConsistentSuccessFailure 14 | counterTest 15 | {- 16 | testCAS_ABA_0 17 | testCAS_ABA_0_modAfterAtomicMod 18 | testCAS_ABA_0_modBefAtomicMod 19 | testCAS_ABA_1 20 | testCAS_ABA_2 21 | testCAS_ABA_3 22 | 23 | -} 24 | cHUNK_SIZE = 32 25 | maxInt = maxBound :: Int 26 | minInt = minBound :: Int 27 | testCounterOverflow = do 28 | let ourMod = mod -- or something more fancy? 
29 | cntr <- newCounter (maxInt - (cHUNK_SIZE `div` 2)) 30 | spanningCntr <- replicateM cHUNK_SIZE (incrCounter 1 cntr) 31 | -- make sure our test is working 32 | if all (>0) spanningCntr || all (<0) spanningCntr 33 | then error "Sequence meant to span maxBound of counter not actually spanning" 34 | else return () 35 | 36 | let l = map (`ourMod` cHUNK_SIZE) spanningCntr 37 | l' = (dropWhile (/= 0) l) ++ (takeWhile (/= 0) l) 38 | 39 | -- (1) test that we overflow the counter without any breaks and our mod function is working properly: 40 | if l' == [0..(cHUNK_SIZE - 1)] 41 | then putStrLn $ "OK" 42 | else error $ "Uh Oh: "++(show l') 43 | 44 | -- (2) test that Ints and counter overflow in exactly the same way 45 | let spanningInts = take cHUNK_SIZE $ iterate (+1) (maxInt - (cHUNK_SIZE `div` 2) + 1) 46 | if spanningInts == spanningCntr 47 | then putStrLn "OK" 48 | else do putStrLn $ "Ints overflow differently than counter: " 49 | putStrLn $ "Int: "++(show spanningInts) 50 | putStrLn $ "Counter: "++(show spanningCntr) 51 | error "Fail" 52 | 53 | -- We don't use this property 54 | cntr2 <- newCounter maxBound 55 | mbnd <- incrCounter 1 cntr2 56 | if mbnd == minBound 57 | then putStrLn "OK" 58 | else error $ "Incrementing counter at maxbound didn't yield minBound" 59 | 60 | -- (3) test subtraction across boundary: count - newFirstIndex, for window spanning boundary. 61 | cntr3 <- newCounter (maxBound - 1) 62 | let ls = take 30 $ iterate (+1) $ maxBound - 10 63 | cs <- mapM (\l-> fmap (subtract l) $ incrCounter 1 cntr3) ls 64 | if cs == replicate 30 10 65 | then putStrLn "OK" 66 | else error $ "Derp. 
We don't know how subtraction works: "++(show cs) 67 | -- (4) readIORef before fetchAndAdd w/ barriers 68 | 69 | 70 | -- Test these assumptions: 71 | -- 1) If a CAS fails in thread 1 then another CAS (in thread 2, say) succeeded 72 | -- 2) In the case that thread 1's CAS failed, the ticket returned with (False,tk) will contain that newly-written value from thread 2 73 | testConsistentSuccessFailure = do 74 | var <- newIORef "0" 75 | 76 | sem <- newIORef (0::Int) 77 | outs <- replicateM 2 newEmptyMVar 78 | 79 | forkSync sem 2 $ test "a" var (outs!!0) 80 | forkSync sem 2 $ test "b" var (outs!!1) 81 | 82 | mapM takeMVar outs >>= examine 83 | -- w/r/t (2) above: we only try to find an element read along with False 84 | -- which wasn't sent by another thread, which isn't ideal 85 | where attempts = 100000 86 | test tag var out = do 87 | 88 | res <- forM [(1::Int)..attempts] $ \x-> do 89 | let str = (tag++(show x)) 90 | tk <- readForCAS var 91 | (b,tk') <- casIORef var tk str 92 | return (if b then str else peekTicket tk' , b) 93 | putMVar out res 94 | 95 | examine [res1, res2] = do 96 | -- any failures in either should be marked as successes in the other 97 | let (successes1,failures1) = (\(x,y)-> (Set.fromList $ map fst x, map fst y)) $ partition snd res1 98 | (successes2,failures2) = (\(x,y)-> (Set.fromList $ map fst x, map fst y)) $ partition snd res2 99 | ok1 = all (flip Set.member successes2) failures1 100 | ok2 = all (flip Set.member successes1) failures2 101 | if ok1 && ok2 102 | then if length failures1 < (attempts `div` 6) || length failures2 < (attempts `div` 6) 103 | then error "There was not enough contention to trust test. Please retry." 104 | else putStrLn "OK" 105 | else do print res1 106 | print res2 107 | error "FAILURE!" 
108 | 109 | 110 | -- forkSync :: IORef Int -> Int -> IO a -> IO ThreadId 111 | forkSync sem target io = 112 | forkIO $ (busyWait >> io) 113 | where busyWait = 114 | atomicModifyIORef' sem (\n-> (n+1,())) >> wait 115 | wait = do 116 | n <- readIORef sem 117 | unless (n == target) wait 118 | 119 | 120 | counterTest = do 121 | n0 <- testAtomicCount newCounter readCounter incrCounter 122 | n1 <- testAtomicCount newMVar takeMVar (\n v-> modifyMVar_ v (evaluate . (+1)) ) 123 | if n0 /= n1 124 | then putStrLn $ "Counter broken: expecting "++(show n1)++" got "++(show n0) 125 | else putStrLn "OK" 126 | 127 | testAtomicCount new read incr = do 128 | let n = 1000000 129 | procs <- getNumCapabilities 130 | 131 | counter <- new (1::Int) 132 | dones <- replicateM procs newEmptyMVar ; starts <- replicateM procs newEmptyMVar 133 | mapM_ (\(start1,done1)-> forkIO $ takeMVar start1 >> replicateM_ (n `div` procs) (incr 1 counter) >> putMVar done1 ()) $ zip starts dones 134 | mapM_ (\v-> putMVar v ()) starts ; mapM_ (\v-> takeMVar v) dones 135 | 136 | read counter 137 | 138 | -- test ABA issue with these three cases: 139 | -- () 140 | -- Bool 141 | -- {-# NOINLINE True #-} 142 | -- let true = True 143 | 144 | 145 | 146 | -- returns False 147 | testCAS_ABA_0 = do 148 | a <- newIORef () 149 | ta <- readForCAS a 150 | atomicModifyIORef' a (\u-> (u,u)) 151 | (res, _) <- casIORef a ta () 152 | print res 153 | {- same 154 | testCAS_ABA_0_nonstrict = do 155 | a <- newIORef () 156 | ta <- readForCAS a 157 | atomicModifyIORef a (\u-> (u,u)) 158 | (res, _) <- casIORef a ta () 159 | print res 160 | testCAS_ABA_0_u = do 161 | a <- newIORef () 162 | ta <- readForCAS a 163 | atomicModifyIORef' a (const ((),())) 164 | (res, _) <- casIORef a ta () 165 | print res 166 | testCAS_ABA_0_sameu = do 167 | let {-# NOINLINE u #-} 168 | u = () 169 | a <- newIORef u 170 | ta <- readForCAS a 171 | atomicModifyIORef' a (const (u,u)) 172 | (res, _) <- casIORef a ta () 173 | print res 174 | -} 175 | 176 | -- 
returns True 177 | testCAS_ABA_1 = do 178 | a <- newIORef () 179 | ta <- readForCAS a 180 | modifyIORef a (const ()) -- i.e. readIORef >> writeIORef 181 | (res, _) <- casIORef a ta () 182 | print res 183 | 184 | {- same 185 | testCAS_ABA_1_casMod = do 186 | a <- newIORef () 187 | ta <- readForCAS a 188 | atomicModifyIORefCAS_ a id 189 | (res, _) <- casIORef a ta () 190 | print res 191 | testCAS_ABA_1_id = do 192 | a <- newIORef () 193 | ta <- readForCAS a 194 | modifyIORef a id -- i.e. readIORef >> writeIORef 195 | (res, _) <- casIORef a ta () 196 | print res 197 | -} 198 | 199 | -- returns True 200 | -- ... so the issue isn't re-ordering of readForCas and the read in modifyIORef 201 | -- in fact, no combination of the barriers provided seem to work. 202 | testCAS_ABA_2 = do 203 | a <- newIORef () 204 | ta <- readForCAS a 205 | loadLoadBarrier 206 | modifyIORef a (const ()) -- i.e. readIORef >> writeIORef 207 | (res, _) <- casIORef a ta () 208 | print res 209 | 210 | testCAS_ABA_3 = do 211 | barrier <- newIORef () 212 | 213 | a <- newIORef () 214 | ta <- readForCAS a 215 | 216 | atomicModifyIORef' barrier (\u-> (u,u)) -- just a barrier 217 | modifyIORef a (const ()) -- i.e. readIORef >> writeIORef 218 | atomicModifyIORef' barrier (\u-> (u,u)) -- just a barrier 219 | 220 | (res, _) <- casIORef a ta () 221 | print res 222 | 223 | -- INTERESTING!: /adding/ the modifyIORef /after/ the atomicModifyIORef causes this to return True! 224 | testCAS_ABA_0_modAfterAtomicMod = do 225 | barrier <- newIORef () 226 | 227 | a <- newIORef () 228 | ta <- readForCAS a 229 | 230 | atomicModifyIORef' a (\u-> (u,u)) 231 | modifyIORef a (const ()) -- i.e. readIORef >> writeIORef 232 | 233 | (res, _) <- casIORef a ta () 234 | print res 235 | 236 | -- ...whereas this one returns False again 237 | testCAS_ABA_0_modBefAtomicMod = do 238 | barrier <- newIORef () 239 | 240 | a <- newIORef () 241 | ta <- readForCAS a 242 | 243 | modifyIORef a (const ()) -- i.e. 
readIORef >> writeIORef 244 | atomicModifyIORef' a (\u-> (u,u)) 245 | 246 | (res, _) <- casIORef a ta () 247 | print res 248 | -------------------------------------------------------------------------------- /chan-benchmarks.cabal: -------------------------------------------------------------------------------- 1 | -- Initial chan-benchmarks.cabal generated by cabal init. For further 2 | -- documentation, see http://haskell.org/cabal/users-guide/ 3 | 4 | name: chan-benchmarks 5 | version: 0.1.0.0 6 | license: BSD3 7 | license-file: LICENSE 8 | author: Brandon Simmons 9 | maintainer: brandon.m.simmons@gmail.com 10 | build-type: Simple 11 | cabal-version: >=1.10 12 | 13 | -- We provide two executables which run on a single core and all available 14 | -- cores, respectively. Benchmarks to be tested live in Benchmarks.hs. 15 | -- There is some overlap in which tests are run in which, but we omit tests 16 | -- from bench-multi where we suspect running on multiple cores only confuses 17 | -- things, and omit tests from bench-single where a test on a single core wouldn't 18 | -- be relevant (e.g. 
testing contention or the scheduler directly) 19 | 20 | executable bench-single 21 | main-is: Main1.hs 22 | other-modules: Benchmarks 23 | build-depends: base >=4.6 24 | , stm 25 | , async 26 | -- random number generation: 27 | , random 28 | , mwc-random 29 | , criterion 30 | , primitive > 0.5.2.0 31 | , atomic-primops == 0.6 32 | -- other chans: 33 | , chan-split-fast 34 | , split-channel 35 | --TODO (other FIFO chans/queues with blocking read): 36 | --, cml 37 | , chaselev-deque 38 | 39 | -- arrays: 40 | , array 41 | , vector 42 | -- time: 43 | , old-time 44 | , time 45 | -- Since this doesn't support 7.8 yet 46 | if impl(ghc < 7.8.0) 47 | build-depends: lockfree-queue == 0.2.3 48 | 49 | -- Still testing the threaded runtime, but removing OS thread scheduling from 50 | -- the picture: 51 | ghc-options: -O2 -rtsopts -threaded -with-rtsopts=-N1 52 | ghc-options: -fforce-recomp 53 | default-language: Haskell2010 54 | default-extensions: CPP 55 | 56 | executable bench-multi 57 | main-is: MainN.hs 58 | other-modules: Benchmarks 59 | build-depends: base >=4.6 60 | , stm 61 | , async 62 | , criterion 63 | , primitive > 0.5.2.0 64 | , atomic-primops == 0.6 65 | -- other chans: 66 | , chan-split-fast 67 | , split-channel 68 | --TODO (other FIFO chans/queues with blocking read): 69 | --, cml 70 | , chaselev-deque 71 | -- Since this doesn't support 7.8 yet 72 | if impl(ghc < 7.8.0) 73 | build-depends: lockfree-queue == 0.2.3 74 | 75 | -- We fix this to 2 cores, so that we can be a little less careful about how 76 | -- we write and interpret our tests. 
77 | -- These would be interesting to play with on a >2 core machine: 78 | -- -feager-blackholing -qa -qg1 -fllvm 79 | ghc-options: -O2 -rtsopts -threaded -with-rtsopts=-N2 80 | ghc-options: -fforce-recomp 81 | default-language: Haskell2010 82 | default-extensions: CPP 83 | 84 | -- -------------- 85 | 86 | executable test-atomics 87 | main-is: TestAtomics.hs 88 | build-depends: base >=4.6 89 | , stm 90 | , async 91 | , atomic-primops == 0.6 92 | , containers 93 | 94 | ghc-options: -O2 -rtsopts -threaded -with-rtsopts=-N2 95 | default-language: Haskell2010 96 | 97 | -- -------------- 98 | 99 | executable experiment-mvar-fairness 100 | main-is: MVarExperiment.hs 101 | other-modules: Benchmarks 102 | build-depends: base >=4.6 103 | , stm 104 | , async 105 | -- random number generation: 106 | , random 107 | , mwc-random 108 | , criterion 109 | , primitive > 0.5.2.0 110 | , atomic-primops == 0.6 111 | , chan-split-fast 112 | , split-channel 113 | 114 | ghc-options: -O2 -rtsopts -threaded -with-rtsopts=-N1 115 | default-language: Haskell2010 116 | 117 | executable experiment-tvar-interleaving 118 | main-is: TVarExperiment.hs 119 | other-modules: Benchmarks 120 | build-depends: base >=4.6 121 | , stm 122 | , async 123 | -- random number generation: 124 | , random 125 | , mwc-random 126 | , criterion 127 | , primitive > 0.5.2.0 128 | , atomic-primops == 0.6 129 | , chan-split-fast 130 | , split-channel 131 | 132 | ghc-options: -O2 -rtsopts -threaded -with-rtsopts=-N2 133 | default-language: Haskell2010 134 | 135 | executable retry-experiment 136 | main-is: RetryExperiment.hs 137 | other-modules: Benchmarks 138 | build-depends: base >=4.6 139 | , stm 140 | 141 | ghc-options: -O2 -rtsopts -threaded -with-rtsopts=-N2 142 | ghc-options: -Wall 143 | default-language: Haskell2010 144 | --------------------------------------------------------------------------------