├── .gitignore
├── Setup.hs
├── bad_scaling.png
├── bloom_bench.png
├── tests
    ├── serialized
    │   ├── 1_0.64.bytestring
    │   ├── 2_1.64.bytestring
    │   ├── 3_2.64.bytestring
    │   ├── 3_7.64.bytestring
    │   └── 4_10.64.bytestring
    └── Main.hs
├── core-example
    └── Main.hs
├── src
    └── Control
    │   └── Concurrent
    │       ├── BloomFilter.hs
    │       └── BloomFilter
    │           └── Internal.hs
├── .travis.yml
├── README.md
├── LICENSE
├── unagi-bloomfilter.cabal
└── benchmarks
    └── Main.hs


/.gitignore:
--------------------------------------------------------------------------------
1 | dist
2 | 


--------------------------------------------------------------------------------
/Setup.hs:
--------------------------------------------------------------------------------
1 | import Distribution.Simple
2 | main = defaultMain
3 | 


--------------------------------------------------------------------------------
/bad_scaling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jberryman/unagi-bloomfilter/HEAD/bad_scaling.png


--------------------------------------------------------------------------------
/bloom_bench.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jberryman/unagi-bloomfilter/HEAD/bloom_bench.png


--------------------------------------------------------------------------------
/tests/serialized/1_0.64.bytestring:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jberryman/unagi-bloomfilter/HEAD/tests/serialized/1_0.64.bytestring


--------------------------------------------------------------------------------
/tests/serialized/2_1.64.bytestring:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jberryman/unagi-bloomfilter/HEAD/tests/serialized/2_1.64.bytestring


--------------------------------------------------------------------------------
/tests/serialized/3_2.64.bytestring:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jberryman/unagi-bloomfilter/HEAD/tests/serialized/3_2.64.bytestring


--------------------------------------------------------------------------------
/tests/serialized/3_7.64.bytestring:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jberryman/unagi-bloomfilter/HEAD/tests/serialized/3_7.64.bytestring


--------------------------------------------------------------------------------
/tests/serialized/4_10.64.bytestring:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jberryman/unagi-bloomfilter/HEAD/tests/serialized/4_10.64.bytestring


--------------------------------------------------------------------------------
/core-example/Main.hs:
--------------------------------------------------------------------------------
1 | module Main(main) where
2 | 
3 | import qualified Control.Concurrent.BloomFilter as Bloom
4 | 
5 | main = do
6 |     b_5_20 <- Bloom.new (Bloom.SipKey 1 1) 5 20
7 |     p <- Bloom.lookup b_5_20 (1::Int)
8 |     print p
9 | 


--------------------------------------------------------------------------------
/src/Control/Concurrent/BloomFilter.hs:
--------------------------------------------------------------------------------
 1 | module Control.Concurrent.BloomFilter (
 2 | {- | A thread-safe mutable bloom filter. Some additional functionality for
 3 |    advanced users is exposed in "Control.Concurrent.BloomFilter.Internal".
 4 |  -}
 5 |       BloomFilter()
 6 |     , BloomFilterException(..)
 7 |     -- * Creation
 8 |     , new
 9 |     , SipKey(..)
10 |     -- * Operations
11 |     , insert
12 |     , lookup
13 |     -- ** Copying and Combining
14 |     , unionInto
15 |     , intersectionInto
16 |     , clone
17 |     -- * Serialization
18 |     , serialize
19 |     , deserialize
20 |     -- * Utilities
21 |     , fpr
22 | 
23 |   ) where
24 | 
25 | import Control.Concurrent.BloomFilter.Internal
26 | import Prelude hiding (lookup)
27 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # Copied from: https://github.com/hvr/multi-ghc-travis
 2 | language: c
 3 | 
 4 | # explicitly request container-based infrastructure
 5 | sudo: false
 6 | 
 7 | matrix:
 8 |   include:
 9 |     - env: CABALVER=1.18 GHCVER=7.8.4
10 |       addons: {apt: {packages: [cabal-install-1.18,ghc-7.8.4], sources: [hvr-ghc]}}
11 |     - env: CABALVER=1.22 GHCVER=7.10.1
12 |       addons: {apt: {packages: [cabal-install-1.22,ghc-7.10.1],sources: [hvr-ghc]}}
13 |     - env: CABALVER=1.24 GHCVER=8.0.1
14 |       addons: {apt: {packages: [cabal-install-1.24,ghc-8.0.1], sources: [hvr-ghc]}}
15 |     - env: CABALVER=head GHCVER=head
16 |       addons: {apt: {packages: [cabal-install-head,ghc-head],  sources: [hvr-ghc]}}
17 | 
18 |   allow_failures:
19 |    - env: CABALVER=head GHCVER=head
20 | 
21 | before_install:
22 |  - export PATH=/opt/ghc/$GHCVER/bin:/opt/cabal/$CABALVER/bin:$PATH
23 | 
24 | install:
25 |  - travis_retry cabal update
26 |  - sed -i 's/^jobs:/-- jobs:/' ${HOME}/.cabal/config
27 |  - cabal install --only-dependencies --enable-tests -fdev
28 | 
29 | script:
30 |  - cabal configure -fdev --enable-tests -finstrumented
31 |  - cabal build
32 |  - dist/build/tests/tests
33 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # unagi-bloomfilter [![Build Status](https://travis-ci.org/jberryman/unagi-bloomfilter.svg)](https://travis-ci.org/jberryman/unagi-bloomfilter)
 2 | 
 3 | This library implements a fast concurrent bloom filter, based on bloom-1 from
 4 | "Fast Bloom Filters and Their Generalization" by Y Qiao, et al. 
 5 | 
 6 | It's [on hackage](https://hackage.haskell.org/package/unagi-bloomfilter) and
 7 | can be installed with 
 8 | 
 9 |     $ cabal install unagi-bloomfilter
10 | 
11 | A bloom filter is a probabilistic, constant-space, set-like data structure
12 | supporting insertion and membership queries. This implementation is backed by
13 | SipHash so can safely consume untrusted inputs.
14 | 
15 | The implementation here compares favorably with traditional set implementations
16 | in a single-threaded context, e.g. here are 10 inserts or lookups compared
17 | across some sets of different sizes:
18 | 
19 | ![single-threaded](http://i.imgur.com/gei1LW4.png)
20 | 
21 | With the llvm backend benchmarks take around 75-85% of the runtime of the
22 | native code gen.
23 | 
24 | Unfortunately writes in particular don't seem to scale currently; i.e.
25 | distributing writes across multiple threads may be _slower_ than in a
26 | single-threaded context, because of memory effects. We plan to export
27 | functionality that would support using the filter here in a concurrent context
28 | with better memory behavior (e.g. a server that shards to a thread-pool which
29 | handles only a portion of the bloom array).
30 | 
31 | ![concurrent](http://i.imgur.com/RaUSmZB.png)
32 | 
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2015, Brandon Simmons
 2 | 
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 |     * Redistributions of source code must retain the above copyright
 9 |       notice, this list of conditions and the following disclaimer.
10 | 
11 |     * Redistributions in binary form must reproduce the above
12 |       copyright notice, this list of conditions and the following
13 |       disclaimer in the documentation and/or other materials provided
14 |       with the distribution.
15 | 
16 |     * Neither the name of Brandon Simmons nor the names of other
17 |       contributors may be used to endorse or promote products derived
18 |       from this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 


--------------------------------------------------------------------------------
/unagi-bloomfilter.cabal:
--------------------------------------------------------------------------------
  1 | name:                unagi-bloomfilter
  2 | version:             0.1.1.2
  3 | synopsis:            A fast, cache-efficient, concurrent bloom filter
  4 | description:
  5 |   This library implements a fast concurrent bloom filter, based on bloom-1 from
  6 |   "Fast Bloom Filters and Their Generalization" by Y Qiao, et al. 
  7 |   .
  8 |   A bloom filter is a probabilistic, constant-space, set-like data structure
  9 |   supporting insertion and membership queries. This implementation is backed by
 10 |   SipHash so can safely consume untrusted inputs.
 11 |   .
 12 |   The implementation here compares favorably with traditional set
 13 |   implementations in a single-threaded context, e.g. here are 10 inserts or
 14 |   lookups compared across some sets of different sizes:
 15 |   .
 16 |   <<http://i.imgur.com/gei1LW4.png>>
 17 |   .
 18 |   With the llvm backend benchmarks take around 75-85% of the runtime of the
 19 |   native code gen.
 20 |   .
 21 |   Unfortunately writes in particular don't seem to scale currently; i.e.
 22 |   distributing writes across multiple threads may be /slower/ than in a
 23 |   single-threaded context, because of memory effects. We plan to export
 24 |   functionality that would support using the filter here in a concurrent
 25 |   context with better memory behavior (e.g. a server that shards to a
 26 |   thread-pool which handles only a portion of the bloom array).
 27 |   .
 28 |   <<http://i.imgur.com/RaUSmZB.png>>
 29 |   .
 30 | 
 31 | homepage:            http://github.com/jberryman/unagi-bloomfilter
 32 | license:             BSD3
 33 | license-file:        LICENSE
 34 | author:              Brandon Simmons
 35 | maintainer:          brandon.m.simmons@gmail.com
 36 | -- copyright:           
 37 | category:            Concurrency
 38 | build-type:          Simple
 39 | -- extra-source-files:  
 40 | cabal-version:       >=1.10
 41 | 
 42 | source-repository head
 43 |   type:     git
 44 |   location: https://github.com/jberryman/unagi-bloomfilter.git
 45 | 
 46 | Flag dev
 47 |   Description: To build tests, executables and benchmarks do `configure -fdev --enable-tests` and run the built executables by hand (i.e. not with `cabal test` etc.; we put all our different executables in test-suite sections in order to hide their dependencies from hackage)
 48 |   Default: False
 49 |   -- TODO did this solve our issues with having executable sections and hackage deps?:
 50 |   Manual: True
 51 | 
 52 | Flag instrumented
 53 |   Description: Enables assertions in library code. When --enable-library-profiling and --enable-executable-profiling is turned on, you can get stacktraces as well
 54 |   Default: False
 55 |   Manual: True
 56 | 
 57 | library
 58 |   if flag(dev)
 59 |       CPP-Options:     -DEXPORT_INTERNALS
 60 |   if flag(instrumented)
 61 |       CPP-Options:      -DASSERTIONS_ON
 62 |       ghc-options:      -fno-ignore-asserts
 63 |       -- TODO stacktraces don't seem to show anything useful. Maybe because of INLINEs?:
 64 |       -- ghc-prof-options:  "-with-rtsopts=-xc" -fprof-auto -fprof-auto-calls
 65 | 
 66 |   exposed-modules:     Control.Concurrent.BloomFilter
 67 |                      , Control.Concurrent.BloomFilter.Internal
 68 |   -- other-modules:       
 69 |   -- other-extensions:    
 70 |   build-depends:       base >=4.7 && <5
 71 |                      , atomic-primops >= 0.8
 72 |                      , primitive
 73 |                      , bytestring
 74 |                      , hashabler >= 1.3.0
 75 |   hs-source-dirs:      src
 76 |   default-language:    Haskell2010
 77 |   ghc-options: -Wall -fwarn-tabs -O2 -funbox-strict-fields
 78 | 
 79 | test-suite tests
 80 |   ghc-options: -fsimpl-tick-factor=1000
 81 |   type: exitcode-stdio-1.0
 82 |   default-language:    Haskell2010
 83 |   hs-source-dirs:      tests
 84 |   main-is:             Main.hs
 85 |   -- other-modules:
 86 | 
 87 |   ghc-options:         -Wall -O2 -threaded -funbox-strict-fields -fno-ignore-asserts "-with-rtsopts=-N"
 88 |   if flag(instrumented)
 89 |       ghc-prof-options:  "-with-rtsopts=-xc" -fprof-auto -fprof-auto-calls
 90 | 
 91 |   if flag(instrumented)
 92 |       CPP-Options:     -DASSERTIONS_ON
 93 |   if flag(dev)
 94 |       buildable: True
 95 |       build-depends:       base
 96 |                          , QuickCheck
 97 |                          , random
 98 |                          , unagi-bloomfilter
 99 |                          , primitive
100 |                          , bytestring
101 |                          , hashabler
102 |   else 
103 |       buildable: False
104 | 
105 | 
106 | benchmark bench
107 |   type: exitcode-stdio-1.0
108 |   default-language:    Haskell2010
109 |   main-is:             Main.hs
110 |   ghc-options:         -Wall -O2 -threaded -funbox-strict-fields
111 |   ghc-options:         "-with-rtsopts=-N -A50M -qa"
112 |   ghc-options:         -rtsopts
113 |   ghc-prof-options:    -fprof-auto -fprof-auto-calls
114 |   hs-source-dirs:      benchmarks
115 |   if flag(instrumented)
116 |       CPP-Options:     -DASSERTIONS_ON
117 |   if flag(dev)
118 |       buildable: True
119 |       build-depends:   base
120 |                      , criterion
121 |                      , unagi-bloomfilter
122 |                      , unordered-containers
123 |                      , containers
124 |                      , text
125 |                      , deepseq
126 |                      , random
127 |                      , hashabler
128 |   else 
129 |       buildable: False
130 | 
131 | 
132 | executable dev-example
133 |  if !flag(dev)
134 |    buildable: False
135 |  else
136 |    build-depends:       
137 |        base
138 |      , unagi-bloomfilter
139 | 
140 |  -- ghc-options: -ddump-to-file -ddump-simpl -dsuppress-module-prefixes -dsuppress-uniques -ddump-core-stats -ddump-inlinings
141 |  ghc-options: -O2  -rtsopts  
142 |   -- for ghc bug(?) https://ghc.haskell.org/trac/ghc/ticket/11263
143 |   -- (had to bump once again for additional setKMemberBits unrolling) 
144 |  ghc-options: -fsimpl-tick-factor=1000
145 |  
146 |  -- Either do threaded for eventlogging and simple timing...
147 |  -- ghc-options: -threaded -eventlog
148 |  -- and run e.g. with +RTS -N -l
149 | 
150 |  hs-source-dirs: core-example
151 |  main-is: Main.hs
152 |  default-language:    Haskell2010
153 | 
154 | 


--------------------------------------------------------------------------------
/benchmarks/Main.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE CPP , OverloadedStrings #-}
  2 | module Main where
  3 | #  ifdef ASSERTIONS_ON
  4 | #    error "Sorry, please reconfigure without -finstrumented so that we turn off assertions in library code."
  5 | #  endif
  6 | 
  7 | import Criterion.Main
  8 | import Control.DeepSeq
  9 | import Control.Monad
 10 | import Control.Concurrent
 11 | import qualified Data.Text as T
 12 | import Data.List
 13 | 
 14 | import Control.Concurrent.BloomFilter.Internal
 15 | import qualified Control.Concurrent.BloomFilter as Bloom
 16 | import Data.Hashabler
 17 | 
 18 | import qualified Data.Set as Set
 19 | import qualified Data.HashSet as HashSet
 20 | 
 21 | -- import System.IO.Unsafe(unsafePerformIO)
 22 | import System.Random
 23 | 
 24 | -- TODO comparisons with:
 25 | --   - pure Set
 26 | --   - best in class Int (or other specialized) hash map or trie
 27 | --   - general hashmap (of Hashable things)
 28 | --   - the above, wrapped in an IORef or MVar
 29 | 
 30 | main :: IO ()
 31 | main = do
 32 |     assertionsOn <- assertionCanary
 33 |     when assertionsOn $
 34 |       putStrLn  $ "!!! WARNING !!! assertions are enabled in library code and may result in "
 35 |                 ++"slower than realistic benchmarks. Try configuring without -finstrumented"
 36 | 
 37 |     procs <- getNumCapabilities
 38 |     if procs < 2 
 39 |         then putStrLn "!!! WARNING !!!: Some benchmarks are only valid if more than 1 core is available"
 40 |         else return ()
 41 |     
 42 | 
 43 |     -- TODO make this a function will call in 'env'
 44 |     let g = mkStdGen 8973459
 45 |         chars = randoms g :: [Char]
 46 |         fakeWords = go chars
 47 |         go :: [Char] -> [String]
 48 |         go [] = error "noninfinite list"
 49 |         go (s:ss) = let (a,as) = splitAt 3 ss
 50 |                         (b,bs) = splitAt 5 as
 51 |                         (c,cs) = splitAt 5 bs
 52 |                         (d,ds) = splitAt 6 cs
 53 |                         (e,es) = splitAt 8 ds
 54 |                      in [s]:a:b:c:d:e:(go es)
 55 | 
 56 |     let textWords10k = map T.pack $ take 10000 fakeWords
 57 |         (wds5k_0, wds5k_1) = splitAt 5000 textWords10k
 58 |     deepseq textWords10k $ return ()
 59 | 
 60 |     let txt = "orange" :: T.Text
 61 |         -- so half are in set and half are not:
 62 |         txt10New, txt10Mix :: [T.Text]
 63 |         txt10New = take 10 $ reverse textWords10k
 64 |         txt10Mix = concatMap (\(x,y)->[x,y]) $ zip textWords10k (take 5 txt10New)
 65 |     
 66 | 
 67 | 
 68 |     defaultMain [
 69 |       bgroup "internals" [
 70 |           env (Bloom.new (SipKey 1 1) 5 20) $ \ ~b->
 71 |             bench "membershipWordAndBits64" $ nf (membershipWordAndBits64 (Hash64 1)) b
 72 |         , env (Bloom.new (SipKey 1 1) 13 20) $ \ ~b->
 73 |             bench "membershipWordAndBits128" $ nf (membershipWordAndBits128 (Hash128 1 1)) b
 74 |         ],
 75 | 
 76 |       -- For comparing cache behavior with perf, against below:
 77 |       bgroup "HashSet" $
 78 |         [ bench "10K insert" $ whnf (HashSet.fromList) wds5k_0 
 79 |         , env (return $ HashSet.fromList textWords10k) $ \ ~hs ->
 80 |             bench "10K lookups on 5k elems" $ whnf (foldl1' (==) . map (\t->HashSet.member t hs)) textWords10k
 81 |         ],
 82 |       bgroup "Set" $
 83 |         [ bench "10K insert" $ whnf (Set.fromList) textWords10k 
 84 |         , env (return $ Set.fromList textWords10k) $ \ ~hs ->
 85 |             bench "10K lookups on 5k elems" $ whnf (foldl1' (==) . map (\t->Set.member t hs)) textWords10k
 86 |         ],
 87 | 
 88 |       bgroup "different sizes" $
 89 |         let benches b = [
 90 |                 bench "10K inserts" $ whnfIO $ manyInserts b textWords10k
 91 |               , bench "10K lookups" $ whnfIO $ manyLookups b textWords10k
 92 |               ]
 93 |          in
 94 |             [ env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b -> 
 95 |                 bgroup "4096" (benches b)
 96 |             , env (Bloom.new (SipKey 11 22) 3 14) $ \ ~b -> 
 97 |                 bgroup "16384" (benches b)
 98 |             , env (Bloom.new (SipKey 11 22) 3 16) $ \ ~b -> 
 99 |                 bgroup "65536" (benches b)
100 |             , env (Bloom.new (SipKey 11 22) 3 20) $ \ ~b -> 
101 |                 bgroup "1MB" (benches b)
102 |             , env (Bloom.new (SipKey 11 22) 3 24) $ \ ~b -> 
103 |                 bgroup "8MB" (benches b)
104 |             , env (Bloom.new (SipKey 11 22) 3 27) $ \ ~b -> 
105 |                 bgroup "64MB" (benches b)
106 |             ]
107 |       , bgroup "different sizes (concurrency)" $
108 |         {-
109 |           -- TODO factor out cost of 'new' in some better way:
110 |         [ env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b -> 
111 |             bench "bigInsertLookup 15k ops" $  whnfIO (largeInsertQueryBench b wds5k_0 wds5k_1)
112 | 
113 |         , env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b -> 
114 |            bench "bigInsertLookup 15k ops across two threads (4096)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1)
115 |         , env (Bloom.new (SipKey 11 22) 3 14) $ \ ~b -> 
116 |            bench "bigInsertLookup 15k ops across two threads (16384)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1)
117 |         , env (Bloom.new (SipKey 11 22) 3 16) $ \ ~b -> 
118 |            bench "bigInsertLookup 15k ops across two threads (65536)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1)
119 |         , env (Bloom.new (SipKey 11 22) 3 20) $ \ ~b -> 
120 |            bench "bigInsertLookup 15k ops across two threads (1MB)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1)
121 |         , env (Bloom.new (SipKey 11 22) 3 24) $ \ ~b -> 
122 |            bench "bigInsertLookup 15k ops across two threads (8MB)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1)
123 |         , env (Bloom.new (SipKey 11 22) 3 27) $ \ ~b -> 
124 |            bench "bigInsertLookup 15k ops across two threads (64MB)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1)
125 |         -}
126 |         let benches b = [
127 |                 bench "10K inserts, across 2 threads" $ whnfIO $ manyInsertsTwoThreads b wds5k_0 wds5k_1
128 |               , bench "10K lookups, across 2 threads" $ whnfIO $ manyLookupsTwoThreads b wds5k_0 wds5k_1
129 |               ]
130 |          in
131 |             [ env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b -> 
132 |                 bgroup "4096" (benches b)
133 |             , env (Bloom.new (SipKey 11 22) 3 14) $ \ ~b -> 
134 |                 bgroup "16384" (benches b)
135 |             , env (Bloom.new (SipKey 11 22) 3 16) $ \ ~b -> 
136 |                 bgroup "65536" (benches b)
137 |             , env (Bloom.new (SipKey 11 22) 3 20) $ \ ~b -> 
138 |                 bgroup "1MB" (benches b)
139 |             , env (Bloom.new (SipKey 11 22) 3 24) $ \ ~b -> 
140 |                 bgroup "8MB" (benches b)
141 |             , env (Bloom.new (SipKey 11 22) 3 27) $ \ ~b -> 
142 |                 bgroup "64MB" (benches b)
143 |             ]
144 |       , bgroup "lookup insert" [
145 |           bgroup "Int" [
146 |               bench "siphash64_1_3 for comparison" $ whnf (siphash64_1_3 (SipKey 1 1)) (1::Int)
147 |             , bench "siphash128 for comparison" $ whnf (siphash128 (SipKey 1 1)) (1::Int)
148 |             , env (Bloom.new (SipKey 1 1) 3 12) $ \ ~b->
149 |               bgroup "3 12 (64-bit hash)" [
150 | 
151 |                   -- best case, with no cache effects (I think):
152 |                   bench "lookup x1" $ whnfIO (Bloom.lookup b (1::Int))
153 |                 , bench "lookup x10" $ nfIO (mapM_ (Bloom.lookup b) [1..10])
154 |                 , bench "lookup x100" $ nfIO (mapM_ (Bloom.lookup b) [1..100])
155 | 
156 |                 , bench "insert x1" $ whnfIO (Bloom.insert b (1::Int))
157 |                 , bench "insert x10" $ nfIO (mapM_ (Bloom.insert b) [1..10])
158 |                 , bench "insert x100" $ nfIO (mapM_ (Bloom.insert b) [1..100])
159 |               ]
160 |             , env (Bloom.new (SipKey 1 1) 5 20) $ \ ~b->
161 |               bgroup "5 20 (64-bit hash)" [
162 | 
163 |                   -- best case, with no cache effects (I think):
164 |                   bench "lookup x1" $ whnfIO (Bloom.lookup b (1::Int))
165 |                 , bench "lookup x10" $ nfIO (mapM_ (Bloom.lookup b) [1..10])
166 |                 , bench "lookup x100" $ nfIO (mapM_ (Bloom.lookup b) [1..100])
167 | 
168 |                 , bench "insert x1" $ whnfIO (Bloom.insert b (1::Int))
169 |                 , bench "insert x10" $ nfIO (mapM_ (Bloom.insert b) [1..10])
170 |                 , bench "insert x100" $ nfIO (mapM_ (Bloom.insert b) [1..100])
171 |               ]
172 |             , env (Bloom.new (SipKey 1 1) 13 20) $ \ ~b->
173 |               bgroup "13 20 (128-bit hash)" [
174 | 
175 |                   bench "lookup x1" $ whnfIO (Bloom.lookup b (1::Int))
176 |                 , bench "lookup x10" $ nfIO (mapM_ (Bloom.lookup b) [1..10])
177 |                 , bench "lookup x100" $ nfIO (mapM_ (Bloom.lookup b) [1..100])
178 | 
179 |                 , bench "insert x1" $ whnfIO (Bloom.insert b (1::Int))
180 |                 , bench "insert x10" $ nfIO (mapM_ (Bloom.insert b) [1..10])
181 |                 , bench "insert x100" $ nfIO (mapM_ (Bloom.insert b) [1..100])
182 |               ]
183 |           ],
184 |           bgroup "Text" [
185 |               bench "siphash64_1_3 for comparison" $ whnf (siphash64_1_3 (SipKey 1 1)) txt
186 |             , bench "siphash128 for comparison" $ whnf (siphash128 (SipKey 1 1)) txt
187 |             , env (Bloom.new (SipKey 1 1) 3 12) $ \ ~b->
188 |               bgroup "3 12 (64-bit hash)" [
189 | 
190 |                   -- best case, with no cache effects (I think):
191 |                   bench "lookup x1" $ whnfIO (Bloom.lookup b txt)
192 |                 , bench "lookup x10" $ nfIO (mapM_ (Bloom.lookup b) (take 10 textWords10k))
193 |                 , bench "lookup x100" $ nfIO (mapM_ (Bloom.lookup b) (take 100 textWords10k))
194 | 
195 |                 , bench "insert x1" $ whnfIO (Bloom.insert b txt)
196 |                 , bench "insert x10" $ nfIO (mapM_ (Bloom.insert b) (take 10 textWords10k))
197 |                 , bench "insert x100" $ nfIO (mapM_ (Bloom.insert b) (take 100 textWords10k))
198 |               ]
199 |             , env (Bloom.new (SipKey 1 1) 5 20) $ \ ~b->
200 |               bgroup "5 20 (64-bit hash)" [
201 | 
202 |                   -- best case, with no cache effects (I think):
203 |                   bench "lookup x1" $ whnfIO (Bloom.lookup b txt)
204 |                 , bench "lookup x10" $ nfIO (mapM_ (Bloom.lookup b) (take 10 textWords10k))
205 |                 , bench "lookup x100" $ nfIO (mapM_ (Bloom.lookup b) (take 100 textWords10k))
206 | 
207 |                 , bench "insert x1" $ whnfIO (Bloom.insert b txt)
208 |                 , bench "insert x10" $ nfIO (mapM_ (Bloom.insert b) (take 10 textWords10k))
209 |                 , bench "insert x100" $ nfIO (mapM_ (Bloom.insert b) (take 100 textWords10k))
210 |               ]
211 |             , env (Bloom.new (SipKey 1 1) 13 20) $ \ ~b->
212 |               bgroup "13 20 (128-bit hash)" [
213 | 
214 |                   bench "lookup x1" $ whnfIO (Bloom.lookup b txt)
215 |                 , bench "lookup x10" $ nfIO (mapM_ (Bloom.lookup b) (take 10 textWords10k))
216 |                 , bench "lookup x100" $ nfIO (mapM_ (Bloom.lookup b) (take 100 textWords10k))
217 | 
218 |                 , bench "insert x1" $ whnfIO (Bloom.insert b txt)
219 |                 , bench "insert x10" $ nfIO (mapM_ (Bloom.insert b) (take 10 textWords10k))
220 |                 , bench "insert x100" $ nfIO (mapM_ (Bloom.insert b) (take 100 textWords10k))
221 |               ]
222 |           ]
223 | 
224 |         ],
225 |   --
226 |   -- TODO check  TO SEE HOW THINGS LOOK BEFORE AND AFTER UNFOLDING CHANGE,
227 |   --             MAYBE TRY DOING inserts/lookups x10 here.
228 |   --   3x12 insert went from  51.8 to 49  (below)
229 |   --   5x20 insert went from  59.1 to 47.6 (in "lookup insert")
230 |       bgroup "comparisons micro x1 " [
231 |           bench "(just siphash64_1_3 on txt for below)" $ whnf (siphash64_1_3 (SipKey 1 1)) ("orange"::T.Text)
232 |         -- This has 0.3% fpr for 10000 elements, so I think can be fairly compared
233 |         , env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b_text->
234 |           bgroup "unagi-bloomfilter 3 12" [
235 |               bench "insert" $ whnfIO (Bloom.insert b_text txt)
236 |             {- I was concerned that the above might not be valid (perhaps the
237 |              - hashing of the Text value was getting reused?), but the following
238 |              - convinced me it's all right; we can see differences in size of input
239 |              - string reflected in all these benchmarks. I believe bloomInsertPure1
240 |              - reflects the inability to inline Hashable instance machinery (since
241 |              - it must remain polymorphic.
242 |             , bench "Bloom.insert (64)(validation1)" $ whnf (bloomInsertPure1 b_text) txt
243 |             , bench "Bloom.insert (64)(validation2)" $ whnf (bloomInsertPure2 b_text) txt
244 |             , bench "Bloom.insert (64)(validation3)" $ whnfIO (Bloom.insert b_text "ora")
245 |             , bench "Bloom.insert (64)(validation4)" $ whnf (bloomInsertPure1 b_text) "ora"
246 |             , bench "Bloom.insert (64)(validation5)" $ whnf (bloomInsertPure2 b_text) "ora"
247 |             , bench "(validation orange)" $ whnf (siphash64_1_3 (SipKey 1 1)) ("orange"::T.Text)
248 |             , bench "(validation ora)" $ whnf (siphash64_1_3 (SipKey 1 1)) ("ora"::T.Text)
249 |             -}
250 |             , bench "lookup" $ nfIO (Bloom.lookup b_text txt)
251 |           ]
252 | 
253 |         , env (return $ HashSet.fromList $ take 10 textWords10k) $ \ ~hashset10->
254 |           bgroup "HashSet Text (10)" [
255 |               bench "insert" $ whnf (\t-> HashSet.insert t hashset10) txt
256 |             , bench "member" $ nf (\t-> HashSet.member t hashset10) txt
257 |           ]
258 |         , env (return $ HashSet.fromList $ take 100 textWords10k) $ \ ~hashset100->
259 |           bgroup "HashSet Text (100)" [
260 |               bench "insert" $ whnf (\t-> HashSet.insert t hashset100) txt
261 |             , bench "member" $ nf (\t-> HashSet.member t hashset100) txt
262 |           ]
263 |         , env (return $ HashSet.fromList $ take 10000 textWords10k) $ \ ~hashset10000->
264 |           bgroup "HashSet Text (10000)" [
265 |               bench "insert" $ whnf (\t-> HashSet.insert t hashset10000) txt
266 |             , bench "member" $ nf (\t-> HashSet.member t hashset10000) txt
267 |           ]
268 |         , env (return $ Set.fromList $ take 10 textWords10k) $ \ ~set10->
269 |           bgroup "Set Text (10)" [
270 |               bench "insert" $ whnf (\t-> Set.insert t set10) txt
271 |             , bench "member" $ nf (\t-> Set.member t set10) txt
272 |           ]
273 |         , env (return $ Set.fromList $ take 100 textWords10k) $ \ ~set100->
274 |           bgroup "Set Text (100)" [
275 |               bench "insert" $ whnf (\t-> Set.insert t set100) txt
276 |             , bench "member" $ nf (\t-> Set.member t set100) txt
277 |           ]
278 |         , env (return $ Set.fromList $ take 10000 textWords10k) $ \ ~set10000->
279 |           bgroup "Set Text (10000)" [
280 |               bench "insert" $ whnf (\t-> Set.insert t set10000) txt
281 |             , bench "member" $ nf (\t-> Set.member t set10000) txt
282 |           ]
283 |       ],
284 |       
285 |       bgroup "comparisons micro x10" [
286 |         -- This has 0.3% fpr for 10000 elements, so I think can be fairly compared
287 |           env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b_text->
288 |           bgroup "unagi-bloomfilter 3 12" [
289 |               bench "insert" $ whnfIO (mapM (Bloom.insert b_text) txt10New)
290 |             , bench "lookup" $ nfIO (mapM (Bloom.lookup b_text) txt10Mix)
291 |           ]
292 | 
293 |         , env (return $ HashSet.fromList $ take 100 textWords10k) $ \ ~hashset100->
294 |           bgroup "HashSet Text (100)" [
295 |               bench "insert" $ whnf (foldr (\t s-> HashSet.insert t s) hashset100) txt10New
296 |             , bench "member" $ nf (map $ \t-> HashSet.member t hashset100) txt10Mix
297 |           ]
298 |         , env (return $ HashSet.fromList $ take 10000 textWords10k) $ \ ~hashset10000->
299 |           bgroup "HashSet Text (10000)" [
300 |               bench "insert" $ whnf (foldr (\t s-> HashSet.insert t s) hashset10000) txt10New
301 |             , bench "member" $ nf (map $ \t-> HashSet.member t hashset10000) txt10Mix
302 |           ]
303 |         , env (return $ Set.fromList $ take 100 textWords10k) $ \ ~set100->
304 |           bgroup "Set Text (100)" [
305 |               bench "insert" $ whnf (foldr (\t s-> Set.insert t s) set100) txt10New
306 |             , bench "member" $ nf (map $ \t-> Set.member t set100) txt10Mix
307 |           ]
308 |         , env (return $ Set.fromList $ take 10000 textWords10k) $ \ ~set10000->
309 |           bgroup "Set Text (10000)" [
310 |               bench "insert" $ whnf (foldr (\t s-> Set.insert t s) set10000) txt10New
311 |             , bench "member" $ nf (map $ \t-> Set.member t set10000) txt10Mix
312 |           ]
313 |       ],
314 | 
315 |       bgroup "comparisons big" [
316 |           -- TODO large random lookup and insert benchmark, comparing with single-thread and then with work split.
317 |           --      make this how we compare as well?
318 |           --      Do this for various types of elements
319 |       ],
320 | 
321 |       bgroup "combining and creation" [
322 |         -- These timings can be subtracted from union timings:
323 |           bench "new 14" $ whnfIO $ Bloom.new (SipKey 1 1) 3 14
324 |         , bench "new 20" $ whnfIO $ Bloom.new (SipKey 1 1) 3 20
325 | 
326 |         , bench "unionInto (14 -> 14)" $ whnfIO $ unionBench 14 14
327 |         , bench "unionInto (20 -> 14)" $ whnfIO $ unionBench 20 14 -- 20 is 6x
328 |         , bench "unionInto (20 -> 20)" $ whnfIO $ unionBench 20 20
329 |         ]
330 |       ]
331 | 
332 | unionBench :: Int -> Int -> IO ()
333 | unionBench bigl littlel = do
334 |     b1 <- Bloom.new (SipKey 1 1) 3 bigl
335 |     b2 <- Bloom.new (SipKey 1 1) 3 littlel
336 |     b1 `Bloom.unionInto` b2
337 | 
338 | 
339 | instance NFData (BloomFilter a) where
340 |   rnf _ = ()
341 | 
342 | {-
343 | -- TODO fix both of these and compare with Set/HashSet (wrapped in IORef or MVar for second)
344 | largeInsertQueryBench :: Bloom.BloomFilter T.Text -> [T.Text] -> [T.Text] -> IO ()
345 | largeInsertQueryBench b payload antipayload = do
346 |   forM_ payload $ Bloom.insert b
347 |   forM_ (zip payload antipayload) $ \(x,y)-> do
348 |     --- can't test, since we're re-using bloom:
349 |     _xOk <- Bloom.lookup b x
350 |     _yOk <- Bloom.lookup b y  -- usually False
351 |     -- unless (xOk) $ error "largeInsertQueryBench"
352 |     return ()
353 | 
354 | largeInsertQueryBenchTwoThreads :: Bloom.BloomFilter T.Text -> Int -> [T.Text] -> [T.Text] -> IO ()
355 | largeInsertQueryBenchTwoThreads b length_payload payload antipayload = do
356 |   t0 <- newEmptyMVar
357 |   t1 <- newEmptyMVar
358 |   let (payload0,payload1) = splitAt (length_payload `div` 2) payload
359 |   let (antipayload0,antipayload1) = splitAt (length_payload `div` 2) antipayload
360 | 
361 |   let go pld antpld v = do
362 |           forM_ pld $ Bloom.insert b
363 |           forM_ (zip pld antpld) $ \(x,y)-> do
364 |             _xOk <- Bloom.lookup b x
365 |             _yOk <- Bloom.lookup b y  -- usually False
366 |             -- unless (xOk) $ error "largeInsertQueryBench"
367 |             return ()
368 |           putMVar v ()
369 |   void $ forkIO $ go payload0 antipayload0 t0
370 |   void $ forkIO $ go payload1 antipayload1 t1
371 |   takeMVar t0 >> takeMVar t1
372 |   -}
373 | 
374 | 
375 | -- These are mostly to check cache behavior, and I don't expect it to matter
376 | -- whether a bloom filter was already "filled with elements" or not.
377 | manyInserts :: Bloom.BloomFilter T.Text -> [T.Text] -> IO ()
378 | manyInserts b payload = do
379 |   forM_ payload (void . Bloom.insert b)
380 | 
381 | manyLookups :: Bloom.BloomFilter T.Text -> [T.Text] -> IO ()
382 | manyLookups b payload = do
383 |   forM_ payload (void . Bloom.lookup b)
384 | 
385 | manyInsertsTwoThreads :: Bloom.BloomFilter T.Text -> [T.Text] -> [T.Text] -> IO ()
386 | manyInsertsTwoThreads b payload0 payload1 = do
387 |   t0 <- newEmptyMVar
388 |   t1 <- newEmptyMVar
389 |   let go pld v = manyInserts b pld >> putMVar v ()
390 |   void $ forkIO $ go payload0 t0
391 |   void $ forkIO $ go payload1 t1
392 |   takeMVar t0 >> takeMVar t1
393 | 
394 | manyLookupsTwoThreads :: Bloom.BloomFilter T.Text -> [T.Text] -> [T.Text] -> IO ()
395 | manyLookupsTwoThreads b payload0 payload1 = do
396 |   t0 <- newEmptyMVar
397 |   t1 <- newEmptyMVar
398 |   let go pld v = manyLookups b pld >> putMVar v ()
399 |   void $ forkIO $ go payload0 t0
400 |   void $ forkIO $ go payload1 t1
401 |   takeMVar t0 >> takeMVar t1
402 | 
403 | 
404 | {-
405 | -- So we can use whnf, and make sure hashes aren't being cached
406 | {-# NOINLINE bloomInsertPure1 #-}
407 | bloomInsertPure1 :: Hashable a => BloomFilter a -> a -> Bool
408 | bloomInsertPure1 b = unsafePerformIO . Bloom.insert b
409 | 
410 | bloomInsertPure2 :: Hashable a => BloomFilter a -> a -> Bool
411 | bloomInsertPure2 b = unsafePerformIO . Bloom.insert b
412 | 
413 | {-# NOINLINE bloomInsertPure3 #-}
414 | bloomInsertPure3 :: BloomFilter Text -> Text -> Bool
415 | bloomInsertPure3 b = unsafePerformIO . Bloom.insert b
416 | -}
417 | 


--------------------------------------------------------------------------------
/tests/Main.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE CPP, BangPatterns, RecordWildCards, NamedFieldPuns #-}
  2 | module Main (main) where
  3 | 
  4 | import Control.Concurrent.BloomFilter.Internal
  5 | import qualified Control.Concurrent.BloomFilter as Bloom
  6 | import Data.Hashabler
  7 | 
  8 | import Test.QuickCheck hiding ((.&.))
  9 | import Data.Primitive.ByteArray
 10 | import Data.Primitive.MachDeps
 11 | import Data.Bits
 12 | import qualified Data.ByteString as B
 13 | import Control.Monad
 14 | import Control.Concurrent
 15 | import System.IO
 16 | import Data.Word
 17 | import Control.Exception
 18 | import Text.Printf
 19 | import Data.List
 20 | import System.Random
 21 | import Control.Applicative
 22 | import Prelude
 23 | 
 24 | main :: IO ()
 25 | main = do
 26 |     hSetBuffering stdout NoBuffering
 27 | #  ifdef ASSERTIONS_ON
 28 |     checkAssertionsOn
 29 | #  else
 30 |     putStrLn "!!! WARNING !!!: assertions not turned on in library code. configure with -finstrumented (first a `cabal clean` may be necessary) if you want to run tests with assertions enabled (it's good to test with both)"
 31 | #  endif
 32 |     procs <- getNumCapabilities
 33 |     if procs < 2 
 34 |         then putStrLn "!!! WARNING !!!: Some tests are only effective if more than 1 core is available"
 35 |         else return ()
 36 | 
 37 |     -- test helper sanity: --------
 38 |     unless ((fromBits64 $ replicate 64 '1') == (maxBound :: Word64) &&
 39 |              fromBits64 ((replicate 62 '0') ++ "11") == 3) $
 40 |         error "fromBits64 helper borked"
 41 | 
 42 |     -- make output to keep travis happy:
 43 |     void $ forkIO $ forever (putStr "." >> threadDelay (1000*1000))
 44 | 
 45 |     -- uncheckedSetBit: --------
 46 |     quickCheckErr 10000 $ \(Large i) ->
 47 |       all (\b-> ((i::Int) `setBit` b) == (i `uncheckedSetBit` b))
 48 |           [0.. wordSizeInBits-1]
 49 | 
 50 | 
 51 |     -- log2w --------
 52 |     unless ((fromIntegral log2w :: Float)
 53 |               == logBase 2 (fromIntegral wordSizeInBits)
 54 |            && (2^log2w == wordSizeInBits)) $
 55 |         error "log2w /= logBase 2 wordSizeInBits"
 56 | 
 57 |     -- maskLog2wRightmostBits --------
 58 |     let w = (2^^log2w) :: Float
 59 |     unless ((w-1) == fromIntegral maskLog2wRightmostBits) $
 60 |         error "maskLog2wRightmostBits is ill-defined"
 61 |     quickCheckErr 10000 $ \(Large i) ->
 62 |         fromIntegral (i .&. maskLog2wRightmostBits) < w
 63 | 
 64 | 
 65 |     -- hash64Enough: --------
 66 |     let sz33MB = 22
 67 |         kThatJustFits = (64-sz33MB) `div` log2w
 68 |     do newOnlyNeeds64 <- Bloom.new (SipKey 1 1) kThatJustFits sz33MB
 69 |        unless (hash64Enough newOnlyNeeds64) $
 70 |            error "These parameters should have produced a bloom requiring only 64 hash bits"
 71 |        newNeeds128 <- Bloom.new (SipKey 1 1) (kThatJustFits+1) sz33MB
 72 |        unless (not $ hash64Enough newNeeds128) $
 73 |            error "These parameters should have produced a bloom requiring just a bit more than 64-bits!"
 74 | 
 75 |     -- for membershipWordAndBits128  and membershipWordAndBits64:
 76 |     membershipWordTests
 77 | 
 78 |     setKMemberBitsUnrolledTest
 79 | 
 80 |     -- Creation/Insertion/FPR unit tests:
 81 |     createInsertFprTests
 82 |     smallBloomTest
 83 |     insertSaturateTest
 84 |     insertConcurrentTest
 85 |     -- TODO disabled for now; it might be that this was not quite right originally, but we manually deleted the cases that didn't work quite accurately under the old hashing scheme:
 86 |     -- highFprTest
 87 | 
 88 |     expectedExceptionsTest
 89 | 
 90 |     -- combining operations:
 91 |     unionSmokeTest
 92 |     unionTests
 93 |     intersectionTests
 94 |     
 95 |     serializationTests
 96 | 
 97 |     putStrLn "TESTS PASSED"
 98 | 
 99 | setKMemberBitsUnrolledTest :: IO ()
100 | setKMemberBitsUnrolledTest = do
101 |   forM_ [1..10] $ \n ->
102 |     quickCheckErr 1000 $ \(Large wd, Large h)-> 
103 |        setKMemberBits       wd n h ==
104 |        setKMemberBitsRolled wd n h
105 |     
106 | 
107 | -- Test exceptions that should only be possible to raise in untyped interface:
108 | expectedExceptionsTest :: IO ()
109 | expectedExceptionsTest = do
110 |     let assertRaises io = catch (io >> error "Expected BloomFilterException to be raised.")
111 |            (\e -> (e :: BloomFilterException) `seq` return ())
112 |         nw :: Int -> Int -> IO (Bloom.BloomFilter Int)
113 |         nw = Bloom.new (SipKey 1 1)
114 |     -- `k` not > 0
115 |     assertRaises $ nw 0 0
116 |     -- `log2l` not >= 0
117 |     assertRaises $ nw 1 (-1)
118 |     -- `log2l` too damn big
119 |     assertRaises $ nw 1 65
120 |     -- requiring > 128 hash bits:
121 |     assertRaises $ nw ((120 `div` log2w) + 1) 8
122 |     assertRaises $ nw 3 ((128 - 3*log2w) + 1)
123 | 
124 | -- Try to get all bits of a small filter filled and force many configurations:
125 | insertSaturateTest :: IO ()
126 | insertSaturateTest = do
127 |     randKey <- (,) <$> randomIO <*> randomIO
128 |     bl <- Bloom.new (uncurry SipKey randKey) 2 2
129 |     forM_ [(1::Int)..500] $ \el-> do
130 |       void $ Bloom.insert bl el
131 |       truePos <- Bloom.lookup bl el
132 |       unless truePos $
133 |         error $ "insertSaturateTest: Somehow got a false neg after insertion: "++(show (el, randKey))
134 |     forM_ [0..3] $ \ix-> do
135 |       wd <- readByteArray (arr bl) ix
136 |       let fill = popCount (wd :: Word)
137 |       when (fill < (wordSizeInBits - 10)) $
138 |         error $ "Bloomfilter doesn't look like it was saturated like we expected "
139 |                 ++(show fill)++"  "++(show randKey)
140 | 
141 | 
142 | insertConcurrentTest :: IO ()
143 | insertConcurrentTest = do
144 |   let k = 6
145 |   forM_ [0..12] $ \log2l-> do
146 |     let key = SipKey 23452345 (fromIntegral log2l)
147 |     b <- Bloom.new key k log2l
148 |     let szDataBytes = sIZEOF_INT * (floor ((2::Float)^log2l))
149 |     let (payload0,payload1) = splitAt szDataBytes [1..(szDataBytes * 2)]
150 |     done0 <- newEmptyMVar
151 |     done1 <- newEmptyMVar
152 |     void $ forkIO ((forM_ payload0 $ Bloom.insert b) >> putMVar done0 ())
153 |     void $ forkIO ((forM_ payload1 $ Bloom.insert b) >> putMVar done1 ())
154 |     takeMVar done0 >> takeMVar done1
155 | 
156 |     control <- Bloom.new key k log2l
157 |     forM_ (payload0++payload1) $ Bloom.insert control
158 | 
159 |     equalBloom b control
160 | 
161 | 
162 | 
163 | -- Smoke test for very small bloom filters:
164 | smallBloomTest :: IO ()
165 | smallBloomTest =
166 |   forM_ [0..2] $ \ourLog2l-> do
167 |     randKey <- (,) <$> randomIO <*> randomIO
168 |     bl <- Bloom.new (uncurry SipKey randKey) 3 ourLog2l
169 |     forM_ [1,2::Int] $ \el-> do
170 |       likelyNotPresent <- Bloom.insert bl el
171 |       truePos <- Bloom.lookup bl el
172 |       unless truePos $
173 |         error $ "smallBloomTest: Somehow got a false neg after insertion: "++(show (ourLog2l, el, randKey))
174 |       unless likelyNotPresent $
175 |         error $ "smallBloomTest: got unlikely failure, please report: "++(show (ourLog2l, el, randKey))
176 | 
177 | 
178 | membershipWordTests :: IO ()
179 | membershipWordTests = do
180 |     let sz33MB = 22
181 |     -- membershipWordAndBits64 --------
182 |     do let membershipWord = "1101001001001001001011"
183 |        let h | wordSizeInBits == 64 = Hash64 $ fromBits64 $            membershipWord++"   001111 001110 001101 001100 001011 001010 001001"
184 |              | otherwise            = Hash64 $ fromBits64 $ "1111111"++membershipWord++"    01111  01110  01101  01100  01011  01010  01001"
185 |            --                                                    \                     \         7 membership bits (15..9)           /
186 |            --                                                     \____ unused
187 |        newOnlyNeeds64 <- Bloom.new (SipKey 1 1) 7 sz33MB
188 |        assert (hash64Enough newOnlyNeeds64) $ return ()
189 |        let (memberWordOut, wordToOr) =
190 |              membershipWordAndBits64 h newOnlyNeeds64
191 | 
192 |        let memberWordExpected = fromIntegral $ fromBits64 (replicate (64-22) '0' ++ membershipWord)
193 |            wordToOrExpected = fromIntegral $ fromBits64 $ -- casts to 32-bit Int on 32-bit arch:
194 |              "000000000000000000000000000000000000000000000000 1111111 000000000"
195 | 
196 |        unless (memberWordOut == memberWordExpected) $
197 |            error $ "membershipWordAndBits64 memberWord: expected "++(show memberWordExpected)++" but got "++(show memberWordOut)
198 |        unless (wordToOr == wordToOrExpected) $
199 |            error $ "membershipWordAndBits64 wordToOr: expected "++(show wordToOrExpected)++" but got "++(show wordToOr)
200 | 
201 |     do
202 |       -- first test filling exactly 64-bits:
203 |       let membershipWord = "1001"  -- n.b. remaining bits divisible by log2w on both 32 and 64-bits
204 |       let kFilling64 | wordSizeInBits == 64 = 10
205 |                      | otherwise = 12
206 |           memberBitsToSet = take kFilling64 [3..]
207 |       assert (maximum memberBitsToSet <= (wordSizeInBits-1)) $ return ()
208 |       let kPayload = concatMap memberWordPaddedBinStr $ memberBitsToSet
209 |           h = Hash64 $ fromBits64 $
210 |                 membershipWord++kPayload
211 |       newNeedsExactly64 <- Bloom.new (SipKey 1 1) kFilling64 (length membershipWord)
212 |       assert (hash64Enough newNeedsExactly64) $ return ()
213 |       --
214 |       -- shared with next test:
215 |       let wordToOrExpected = foldl' setBit 0 memberBitsToSet
216 |       do
217 |         let memberWordExpected = 9
218 |         let (memberWordOut, wordToOr) =
219 |                membershipWordAndBits64 h newNeedsExactly64
220 | 
221 |         unless (memberWordOut == memberWordExpected) $
222 |             error $ "membershipWordAndBits64-full memberWord: expected "++(show memberWordExpected)++" but got "++(show memberWordOut)
223 |         unless (wordToOr == wordToOrExpected) $
224 |             error $ "membershipWordAndBits64-full wordToOr: expected "++(show wordToOrExpected)++" but got "++(show wordToOr)
225 | 
226 | 
227 |     -- membershipWordAndBits128 --------
228 | 
229 |       -- repeat above, but with one bit more in `l` so we need a single bit from h_1 --------
230 |       let membershipWord' = "10001" --17
231 |           h' = let h_0 = fromBits64 $ take (64-length membershipWord') (cycle "10") ++ membershipWord'
232 |                    h_1 = fromBits64 $ take (64-length kPayload) (cycle "110") ++ kPayload
233 |                 in Hash128 h_0 h_1
234 |       newJustNeeds128 <- Bloom.new (SipKey 1 1) kFilling64 (length membershipWord')
235 |       assert (not $ hash64Enough newJustNeeds128) $ return ()
236 |       do
237 |         let (memberWordOut, wordToOr) =
238 |                membershipWordAndBits128 h' newJustNeeds128
239 |         let memberWordExpected = 17
240 | 
241 |         unless (memberWordOut == memberWordExpected) $
242 |             error $ "membershipWordAndBitsJust128 memberWord: expected "++(show memberWordExpected)++" but got "++(show memberWordOut)
243 |         unless (wordToOr == wordToOrExpected) $
244 |             error $ "membershipWordAndBitsJust128 wordToOr: expected "++(show wordToOrExpected)++" but got "++(show wordToOr)
245 | 
246 | 
247 |       -- need all 128 bits --------
248 |       do let kFillingAll128 | wordSizeInBits == 64 = 20
249 |                             | otherwise            = 24
250 |              memberWordExpected = 170
251 |              membershipWord'' = printf "%08b" memberWordExpected
252 |              memberWords = concatMap memberWordPaddedBinStr [1..kFillingAll128]
253 |              h128 = Hash128 (fromBits64 $ ks_0++membershipWord'') (fromBits64 ks_1)
254 |                       where (ks_0, ks_1) = splitAt ((length memberWords) - 64) memberWords
255 |          newNeedsAll128 <- Bloom.new (SipKey 1 1) kFillingAll128 (length membershipWord'')
256 |          assert (not $ hash64Enough newNeedsAll128) $ return ()
257 |          let wordToOrExpected' = foldl' setBit 0 [1..kFillingAll128]
258 | 
259 |          let (memberWordOut, wordToOr) =
260 |                membershipWordAndBits128 h128 newNeedsAll128
261 |          unless (memberWordOut == memberWordExpected) $
262 |             error $ "membershipWordAndBits128-full memberWord: expected "++(show memberWordExpected)++" but got "++(show memberWordOut)
263 |          unless (wordToOr == wordToOrExpected') $
264 |             error $ "membershipWordAndBits128-full wordToOr: expected "++(show wordToOrExpected')++" but got "++(show wordToOr)
265 | 
266 |       -- need less than 128 bits, with 1s fill  --------
267 |       do let kJustOver = 13
268 |              memberWordExpected = 170
269 |              membershipWord'' = printf "%08b" memberWordExpected
270 |              memberWords = concatMap memberWordPaddedBinStr [1..kJustOver]
271 |              h128 = Hash128 (fromBits64 $ ks_0++pad_0++membershipWord'') (fromBits64 $ pad_1++ks_1)
272 |                       where (ks_0, ks_1) = splitAt ((length memberWords) - 64) memberWords
273 |                             pad_0 = replicate (64 - (length membershipWord'' + length ks_0)) '1'
274 |                             pad_1 = replicate (64 - (length ks_1)) '1'
275 |          newJustOver <- Bloom.new (SipKey 1 1) kJustOver (length membershipWord'')
276 |          assert (not $ hash64Enough newJustOver) $ return ()
277 |          let wordToOrExpected' = foldl' setBit 0 [1..kJustOver]
278 | 
279 |          let (memberWordOut, wordToOr) =
280 |                membershipWordAndBits128 h128 newJustOver
281 |          unless (memberWordOut == memberWordExpected) $
282 |             error $ "membershipWordAndBits128-fullx memberWord: expected "++(show memberWordExpected)++" but got "++(show memberWordOut)
283 |          unless (wordToOr == wordToOrExpected') $
284 |             error $ "membershipWordAndBits128-fullx wordToOr: expected "++(show wordToOrExpected')++" but got "++(show wordToOr)
285 | 
286 | 
287 | 
288 | 
289 | -- set a unique bit in each source and target word, check exact expected individual bits
290 | unionSmokeTest :: IO ()
291 | unionSmokeTest = do
292 |     b2 <- Bloom.new (SipKey 1 1) 3 1
293 |     b8 <- Bloom.new (SipKey 1 1) 3 3
294 |     let wds8 = zip [0..] $ map (2^) $ take 8 [(3::Int)..]
295 |     let f (ix,v) (x1,x2) | even ix = (x1.|.v, x2)
296 |                          | otherwise = (x1, x2.|.v)
297 |     let (expected_1,expected_2) = foldr f (2,4) wds8
298 | 
299 |     writeByteArray (arr b2) 0 (2::Word)
300 |     writeByteArray (arr b2) 1 (4::Word)
301 | 
302 |     forM_ wds8 $ \(ix,v)->
303 |       writeByteArray (arr b8) ix (v::Word)
304 | 
305 |     b8 `unionInto` b2
306 |     actual_1 <- readByteArray (arr b2) 0
307 |     actual_2 <- readByteArray (arr b2) 1
308 |     unless (actual_1 == expected_1 && actual_2 == expected_2 && actual_1 > 0 && actual_2 > 0) $
309 |       error $ "Union insane: "++(show [expected_1,actual_1, expected_2, actual_2])
310 | 
311 |     -- check identical size:
312 |     b8' <- Bloom.new (SipKey 1 1) 3 3
313 |     b8 `unionInto` b8'
314 |     forM_ wds8 $ \(ix,v)-> do
315 |       v' <- readByteArray (arr b8') ix
316 |       unless (v == v') $
317 |         error "Union smoke test on identical length filters failed."
318 | 
319 | unionTests :: IO ()
320 | unionTests = do
321 |   forM_ [19,20] $ \bigl-> forM_ [10..bigl] $ \littlel ->
322 |    forM_ ([3,11,12,13]++ if log2w == 6 then [10] else []) $ \ourk -> do -- for 64 and 128
323 |     b1 <- Bloom.new (SipKey 1 1) ourk bigl
324 |     b2 <- Bloom.new (SipKey 1 1) ourk littlel
325 |     let xs = [200..600] :: [Int]
326 |     let ys = [400..800] :: [Int]
327 |     let nots = [0..199]
328 |     let xsys = [200..800]
329 |     mapM_ (Bloom.insert b1) xs
330 |     mapM_ (Bloom.insert b2) ys
331 |     b1 `Bloom.unionInto` b2
332 |     forM_ nots $ \v-> do
333 |       exsts <- Bloom.lookup b2 v
334 |       when exsts $ error $ (show (bigl,littlel,v,ourk))++": Found unexpected element"
335 |     forM_ xsys $ \v-> do
336 |       exsts <- Bloom.lookup b2 v
337 |       unless exsts $ error $ (show (bigl,littlel,v,ourk))++": Could not find expected element."
338 | 
339 |   forM_ [0..10] $ \bigl-> forM_ [0..bigl] $ \littlel -> do
340 |    forM_ [2,14] $ \ourk -> do
341 |     b1 <- Bloom.new (SipKey 2 2) ourk bigl
342 |     b2 <- Bloom.new (SipKey 2 2) ourk littlel
343 |     void $ Bloom.insert b1 'a'
344 |     void $ Bloom.insert b2 'b'
345 |     b1 `Bloom.unionInto` b2
346 |     forM_ "cdefghijkl" $ \v-> do
347 |       exsts <- Bloom.lookup b2 v
348 |       when exsts $ error $ (show (bigl,littlel,v))++": Found unexpected element"
349 |     forM_ "ab" $ \v-> do
350 |       exsts <- Bloom.lookup b2 v
351 |       unless exsts $ error $ (show (bigl,littlel,v))++": Could not find expected element."
352 | 
353 |   -- another to excercise 128-bit a little more:
354 |   forM_ [16,17] $ \bigl-> 
355 |    forM_ [9..18] $ \ourk -> do
356 |      let littlel = 16
357 |      b1 <- Bloom.new (SipKey 2 2) ourk bigl
358 |      b2 <- Bloom.new (SipKey 2 2) ourk littlel
359 |      void $ Bloom.insert b1 'a'
360 |      void $ Bloom.insert b2 'b'
361 |      b1 `Bloom.unionInto` b2
362 |      forM_ "cdefghijkl" $ \v-> do
363 |        exsts <- Bloom.lookup b2 v
364 |        when exsts $ error $ (show (bigl,littlel,v))++": Found unexpected element"
365 |      forM_ "ab" $ \v-> do
366 |        exsts <- Bloom.lookup b2 v
367 |        unless exsts $ error $ (show (bigl,littlel,v))++": Could not find expected element."
368 | 
369 |   -- and using all 128 bits.
370 |   let kFillingAll128 | wordSizeInBits == 64 = 20
371 |                      | otherwise            = 24
372 |   b1 <- Bloom.new (SipKey 2 2) kFillingAll128 8
373 |   b2 <- Bloom.new (SipKey 2 2) kFillingAll128 7
374 |   void $ Bloom.insert b1 'a'
375 |   void $ Bloom.insert b2 'b'
376 |   b1 `Bloom.unionInto` b2
377 |   forM_ "cdefghijkl" $ \v-> do
378 |     exsts <- Bloom.lookup b2 v
379 |     when exsts $ error $ ": Found unexpected element"
380 |   forM_ "ab" $ \v-> do
381 |     exsts <- Bloom.lookup b2 v
382 |     unless exsts $ error $ ": Could not find expected element."
383 | 
384 | 
385 | -- the union tests are sufficient to test 'combine'. Just do a sanity check here.
386 | intersectionTests :: IO ()
387 | intersectionTests =
388 |   forM_ [18,19] $ \bigl-> forM_ [10..bigl] $ \littlel ->
389 |    forM_ [5,14] $ \ourk -> do
390 |     b1 <- Bloom.new (SipKey 3 1) ourk bigl
391 |     b2 <- Bloom.new (SipKey 3 1) ourk littlel
392 |     let xs = [200..600] :: [Int]
393 |     let ys = [400..800] :: [Int]
394 |     let nots = [199..399]++[601..801]
395 |     let xsys = [400..600]
396 |     mapM_ (Bloom.insert b1) xs
397 |     mapM_ (Bloom.insert b2) ys
398 |     b1 `Bloom.intersectionInto` b2
399 |     forM_ nots $ \v-> do
400 |       exsts <- Bloom.lookup b2 v
401 |       when exsts $ error $ (show (bigl,littlel,v))++": Found unexpected element"
402 |     forM_ xsys $ \v-> do
403 |       exsts <- Bloom.lookup b2 v
404 |       unless exsts $ error $ (show (bigl,littlel,v))++": Could not find expected element."
405 | 
406 | serializationTests :: IO ()
407 | serializationTests = do
408 |   quickCheckErr 1000 $ \(Large wd64)->
409 |     wd64 == (unbytes64 . bytes64 $ wd64)
410 |   equalBloomSane
411 |   -- test log2lFromArraySize:
412 |   forM_ [(1,0), (2,1), (3,2), (4,8), (5, 12)] $ \args-> do
413 |     BloomFilter{..} <- uncurry (Bloom.new (SipKey 848 734783)) args
414 |     log2lCalc <- log2lFromArraySize (sizeofMutableByteArray arr)
415 |     unless (log2l == log2lCalc) $ 
416 |       error $ "log2lFromArraySize mismatch: "++(show (args,log2l,log2lCalc))
417 | 
418 |   serializeRoundtripsTest
419 |   serializeGoldenTests
420 |   issue2Test
421 | 
422 | -- Issue #2: this triggered a bug from a careless bit of floating point arithmetic:
423 | issue2Test :: IO ()
424 | issue2Test = do
425 |   sipKey <- pure $ (Bloom.SipKey 1 1)
426 |   bloom <- Bloom.new sipKey 3 26 :: IO (Bloom.BloomFilter String)
427 |   bloomBS <- Bloom.serialize bloom
428 |   void $ (Bloom.deserialize sipKey bloomBS :: IO (Bloom.BloomFilter String))
429 | 
430 | 
431 | -- For validating that we can serialize and deserialize across machines, and
432 | -- try to handle forwards compatibility (later).
433 | serializeGoldenTests :: IO ()
434 | serializeGoldenTests = do
435 |   if sIZEOF_INT == 8
436 |     then 
437 |       forM_ [(1,0), (2,1), (3,2), (3,7), (4, 10)] $ \(k, log2l)-> do
438 |         let key = SipKey 983745 476835
439 |         b <- Bloom.new key k log2l
440 |         let szDataBytes = sIZEOF_INT * (floor ((2::Float)^log2l))
441 |         payload <- forM [1..(szDataBytes * 2)] $ \x-> do
442 |           void $ Bloom.insert b x
443 |           return x
444 | 
445 |         let path = "tests/serialized/"++(show k)++"_"++(show log2l)++".64.bytestring"
446 |         bSerNow <- unsafeSerialize b
447 |         -- B.writeFile path bSerNow  -- UNCOMMENT TO REGENERATE:
448 |         bSerStored <- B.readFile path
449 |         unless (bSerNow == bSerStored) $ 
450 |           error $ "Deserialized stored bloom did not match: "++path
451 | 
452 |         b' <- Bloom.deserialize key bSerStored
453 |         forM_ payload $ \x-> do
454 |           present <- Bloom.lookup b' x
455 |           unless present $ error $ "Did not find all expected elements in: "++path
456 |         
457 |     else 
458 |       -- TODO. Don't have a 32-bit machine to generate filters from.
459 |       return ()
460 | 
461 | serializeRoundtripsTest :: IO ()
462 | serializeRoundtripsTest = do
463 |   let key = SipKey 87345 8723
464 |   forM_ [(1,0), (2,1), (3,2), (3,7), (3, 14)] $ \(k, log2l)-> do
465 |     b <- Bloom.new key k log2l
466 |     let szDataBytes = sIZEOF_INT * (floor ((2::Float)^log2l))
467 |     forM_ [1..(szDataBytes * 2)] $ \x-> do
468 |       void $ Bloom.insert b x
469 | 
470 |     bSer <- Bloom.serialize b
471 |     bUnsafeSer <- unsafeSerialize b
472 |     unless (bSer == bUnsafeSer) $
473 |       error $ "Unsafe and safe serialize produced different bytestrings with"++(show (k,log2l))
474 |     b'  <- Bloom.deserialize key bSer
475 |     b'U <- Bloom.deserialize key bUnsafeSer
476 |     equalBloom b b'
477 |     equalBloom b b'U
478 | 
479 |     -- Now mangle and unmangle the bytestring to excercise offset/length, etc.
480 |     -- in deserialization:
481 |     let (!dc,!ba) = B.splitAt 5 $ B.reverse bUnsafeSer
482 |         !bax = ba `B.snoc` 0xFF
483 |         !xabcd = B.reverse bax `B.append` B.reverse dc
484 |     let !bUnsafeSer' = B.drop 1 xabcd
485 |     unless (bUnsafeSer == bUnsafeSer') $ error "Didn't mangle/unmangle properly"
486 |     b'U' <- Bloom.deserialize key bUnsafeSer'
487 |     equalBloom b b'U'
488 | 
489 | 
490 | equalBloomSane :: IO ()
491 | equalBloomSane = do
492 |   let key = SipKey 99 100
493 |   forM_ [(1,0), (2,1), (3,2), (4,8), (5, 12)] $ \(k, log2l)-> do
494 |     b0 <- Bloom.new key k log2l
495 |     b1 <- Bloom.new key k log2l
496 |     b2 <- Bloom.new key k log2l
497 |     let szDataBytes = sIZEOF_INT * (floor ((2::Float)^log2l))
498 |     forM_ [1..(szDataBytes * 2)] $ \x-> do
499 |       void $ Bloom.insert b0 x
500 |       void $ Bloom.insert b1 x
501 |       void $ Bloom.insert b2 x
502 |     equalBloom b0 b1
503 |     -- modify in metadata region, and make sure equal:
504 |     writeByteArray (arr b0) szDataBytes (0xFF::Word8)
505 |     writeByteArray (arr b0) (sizeofMutableByteArray (arr b0) -1) (0xFF:: Word8)
506 |     equalBloom b0 b1
507 | 
508 |     -- ensure we catch differences
509 |     writeByteArray (arr b0) (szDataBytes-1) (0x02::Word8) -- last data byte
510 |     throws1 <- try (equalBloom b0 b1)
511 |     writeByteArray (arr b2) (szDataBytes-1) (0x02::Word8) -- last data byte
512 |     equalBloom b0 b2
513 |     writeByteArray (arr b2) 0 (0xFF::Word8)      -- first data byte
514 |     throws2 <- try (equalBloom b0 b2)
515 |     case [throws1 :: Either SomeException (), throws2] of
516 |       [Left _, Left _] ->  return ()
517 |       es -> error $ "equalBloom didn't detect differences: "++(show es)
518 | 
519 |   
520 | 
521 | {-
522 |  A NOTE ON TESTING FPR (from the paper)
523 | 
524 | "consideration when implementing Bloom-1 filters. To
525 | ensure that the results obtained using Eq. (5) are accurate,
526 | a Bloom-1 filter was implemented in Matlab using ideal
527 | hash functions and simulated for the same parameters. In
528 | each simulation, 10,000 Fast Bloom filters are generated
529 | by inserting random elements until the specified load is
530 | achieved. Then their false positive rate is evaluated doing
531 | 10^6 random queries of non member elements. The average
532 | results were checked against those obtained with Eq. (5).
533 | This was done for false positive rates larger than 10-6
534 | . In all cases, differences were smaller than 0.5%."
535 | -}
536 | 
537 | createInsertFprTests :: IO ()
538 | createInsertFprTests =
539 |   let bloomParams = [
540 |         -- all params with low FPR:
541 |         (2, 1, 2),
542 |         (4, 1, 3),
543 |         (500, 8, 3),
544 |         (1000,  8, 10),
545 |         (500, 8, 15),
546 |         (500,  8, 20),
547 |         (5000000,  22, 3)]
548 |    in forM_ bloomParams $ \param@(payloadSz, ourLog2l, ourK)-> do
549 |         let !loadedFpr = fpr payloadSz (2^ourLog2l) ourK wordSizeInBits
550 |             payload = take payloadSz [2,4..] :: [Int]
551 |             antiPayloadSz = 10000
552 |             antiPayload = take antiPayloadSz [1,3..]
553 |         randKey <- (,) <$> randomIO <*> randomIO
554 |         bl <- Bloom.new (uncurry SipKey randKey) ourK ourLog2l
555 | 
556 |         allNeg <- mapM (Bloom.lookup bl) payload
557 |         unless (all not allNeg) $
558 |           error $ "Expected empty: "++(show param)++"\n"++(show randKey)++(show allNeg)
559 | 
560 |         falsesAndFPs <- mapM (Bloom.insert bl) payload
561 |         -- This should on average be less than `loadedFpr` calculated on fully-loaded
562 |         -- bloom filter:
563 |         let insertionFprMeasured =
564 |               (fromIntegral $ length $ filter not falsesAndFPs) / (fromIntegral payloadSz)
565 |         allTruePositives <- mapM (Bloom.lookup bl) payload
566 |         unless (and allTruePositives) $
567 |           error $ "Expected all true positives"++(show param)++"\n"++(show randKey)
568 | 
569 |         falsePs <- mapM (Bloom.lookup bl) antiPayload
570 |         let !loadedFprMeasured =
571 |               (fromIntegral $ length $ filter id falsePs) / (fromIntegral antiPayloadSz)
572 | 
573 |         -- TODO proper statistical measure of accuracy of measured FPR
574 |         unless (all (< 0.01) [insertionFprMeasured, loadedFprMeasured]) $
575 |           error $ "Measured unexpectedly high FPR. Possible fluke; please retry tests: "
576 |                    ++(show param)++"\n"++(show randKey)
577 |         unless ((abs $ loadedFprMeasured - loadedFpr) < 0.005) $
578 |           error $ "Measured FPR deviated from calculated FPR more than we expected: "
579 |                    ++(show param)++"\n"++(show randKey)
580 | 
581 |         allTruePositivesIns <- mapM (Bloom.insert bl) payload
582 |         unless (all not allTruePositivesIns) $
583 |           error $ "Expected all true positives (i.e. insert failures): "
584 |                  ++(show param)++"\n"++(show randKey)
585 | 
586 | 
587 | -- spot check our `fpr` function at higher values:
588 | highFprTest :: IO ()
589 | highFprTest = do
590 |   let bloomParams = [
591 |         -- params with double-digit pct FPR
592 |           (50000, 10, 3)
593 |         , (65000, 10, 3) -- 84.8% calculated . I guess error only affects smaller filters significantly?
594 |         , (5000, 8, 3)
595 |         , (5000, 8, 10)
596 |         , (500, 6, 1)
597 |         , (1000, 5, 2)
598 | 
599 |         , (200, 5, 2) -- low single-digit FPR
600 |         , (500, 6, 2)
601 | 
602 | -- TODO commented values below deviated slightly out of our allowed range after
603 | --      changing to siphash64_1_3. Figure out and add back.
604 | 
605 |         -- for small sizes, high-ish loads
606 |      -- , (625, 4, 2)  -- 51% measured, 48% calculated
607 |         , (625, 3, 2)  -- 83% measured, 78% calculated
608 |         , (700, 3, 2)
609 |      -- , (750, 3, 2)
610 |         , (750, 3, 3)
611 |         -- 50% fp or lower:
612 |      -- , (350, 3, 2)
613 |         , (200, 3, 2)
614 |         , (150, 3, 2)
615 |         -- > 90% fpr
616 |         , (2000, 4, 2)
617 |         , (2000, 4, 3)
618 | 
619 |         , (2500, 5, 2)
620 |         , (2450, 5, 2)
621 |         , (2400, 5, 2)
622 |         ]
623 |    in forM_ bloomParams $ \param@(payloadSz, ourLog2l, ourK)-> do
624 |         let !loadedFpr = fpr payloadSz (2^ourLog2l) ourK wordSizeInBits
625 |             payload = take payloadSz [2,4..] :: [Int]
626 |             antiPayloadSz = 200000
627 |             antiPayload = take antiPayloadSz [1,3..]
628 |         randKey <- (,) <$> randomIO <*> randomIO
629 |         bl <- Bloom.new (uncurry SipKey randKey) ourK ourLog2l
630 |         mapM_ (Bloom.insert bl) payload
631 | 
632 |         falsePs <- mapM (Bloom.lookup bl) antiPayload
633 |         let !loadedFprMeasured =
634 |               (fromIntegral $ length $ filter id falsePs) / (fromIntegral antiPayloadSz)
635 | 
636 |         unless ((abs $ loadedFprMeasured - loadedFpr) < 0.03) $
637 |           error $ "Measured high FPR deviated from calculated FPR more than we expected: "
638 |                    ++(fmtPct loadedFprMeasured)++" "++(fmtPct loadedFpr)
639 |                    ++(show param)++"\n"++(show randKey)
640 | 
641 | 
642 | fmtPct :: Double -> String
643 | fmtPct x = printf "%.2f%%" (x*100)
644 | 
645 | 
646 | # ifdef ASSERTIONS_ON
647 | checkAssertionsOn :: IO ()
648 | checkAssertionsOn = do
649 |     -- Make sure testing environment is sane:
650 |     assertionsWorking <- try $ assert False $ return ()
651 |     assertionsWorkingInLib <- assertionCanary
652 |     case assertionsWorking of
653 |          Left (AssertionFailed _)
654 |            | assertionsWorkingInLib -> putStrLn "Assertions: On"
655 |          _  -> error "Assertions aren't working"
656 | # endif
657 | 
658 | 
659 | -- Test helpers:
660 | fromBits64 :: String -> Word64
661 | fromBits64 bsDirty =
662 |     let bs = zip [0..] $ reverse $ filter (\b-> b == '0' || b == '1') bsDirty
663 |      in if length bs /= 64
664 |           then error "Expecting 64-bits"
665 |           else foldr (\(nth,c) wd-> if c == '1' then (wd `setBit` nth) else wd) 0x00 bs
666 | 
667 | memberWordPaddedBinStr :: Int -> String
668 | memberWordPaddedBinStr n
669 |     | n > wordSizeInBits = error "memberBitStr"
670 |     | otherwise = printf ("%0"++(show log2w)++"b") n
671 | 
672 | 
673 | -- Utilites:  ---------------------------------
674 | quickCheckErr :: Testable prop => Int -> prop -> IO ()
675 | quickCheckErr n p = 
676 |     quickCheckWithResult stdArgs{ maxSuccess = n , chatty = False } p
677 |       >>= maybeErr
678 | 
679 |   where maybeErr (Success _ _ _) = return ()
680 |         maybeErr e = error $ show e
681 | 
682 | -- TODO eventually could replace this with an exported lib function (e.g. Eq on frozen BloomFilters)
683 | --      easiest might just be unsafeSerialize + Eq for ByteString
684 | equalBloom :: BloomFilter a -> BloomFilter a -> IO ()
685 | equalBloom b0 b1 = do
686 |   unless ((key b0, k b0, hash64Enough b0, l_minus1 b0, log2l b0) == (key b1, k b1, hash64Enough b1, l_minus1 b1, log2l b1)) $
687 |     error "Can't compare arrays, since params aren't the same"
688 |   let sz0 = sizeofMutableByteArray (arr b0)
689 |   unless (sz0 == (sizeofMutableByteArray $ arr b1)) $
690 |     error "Array sizes differ"
691 | 
692 |   let dataWords = floor ((2::Float)^(log2l b0))
693 |   forM_ [0..(dataWords -1)] $ \wordIx -> do
694 |     w0 <- readByteArray (arr b0) wordIx 
695 |     w1 <- readByteArray (arr b1) wordIx 
696 |     unless ((w0 :: Word) == w1) $
697 |       error $ "Arrays differ at ix: "++(show wordIx)
698 |   
699 | 
700 | 


--------------------------------------------------------------------------------
/src/Control/Concurrent/BloomFilter/Internal.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns, RecordWildCards, CPP, ScopedTypeVariables, DeriveDataTypeable #-}
  2 | module Control.Concurrent.BloomFilter.Internal (
  3 | {- | Some additional unsafe, low-level, and internal functions are exposed here
  4 |    for advanced users. The API should remain stable, except that functions may
  5 |    be added and no promises are made about the internals of the 'BloomFilter'
  6 |    type itself.
  7 |  -}
  8 |       new
  9 |     , BloomFilter(..)
 10 |     , BloomFilterException(..)
 11 |     , insert
 12 |     , lookup
 13 |     , unionInto
 14 |     , intersectionInto
 15 |     , clone
 16 |     , SipKey(..)
 17 |     , fpr
 18 | 
 19 |     , serialize
 20 |     , unsafeSerialize
 21 |     , deserialize
 22 |     , deserializeByteArray
 23 | 
 24 | # ifdef EXPORT_INTERNALS
 25 |   -- * Internal functions exposed for testing; you shouldn't see these
 26 |     , membershipWordAndBits64, membershipWordAndBits128
 27 |     , maskLog2wRightmostBits
 28 |     , log2w
 29 |     , wordSizeInBits
 30 |     , uncheckedSetBit
 31 |     , isHash64Enough
 32 |     , log2lFromArraySize
 33 |     , assertionCanary
 34 |     , bytes64, unbytes64
 35 |     , setKMemberBits, setKMemberBitsRolled
 36 | # endif
 37 |     )
 38 |     where
 39 | 
 40 | 
 41 | import Data.Bits hiding (unsafeShiftL, unsafeShiftR)
 42 | import qualified Data.Bits as BitsHidden
 43 | import qualified Data.Primitive.ByteArray as P
 44 | import Data.ByteString.Internal
 45 | import GHC.ForeignPtr
 46 | import Foreign.ForeignPtr
 47 | import Foreign.Storable(peekElemOff)
 48 | import Data.Primitive.MachDeps
 49 | import Data.Primitive.Types(Addr(..))
 50 | import Control.Monad.Primitive(RealWorld)
 51 | import Data.Atomics
 52 | import Data.Hashabler
 53 | import Control.Exception
 54 | import Data.Typeable(Typeable)
 55 | import Control.Monad
 56 | import Data.Word(Word64, Word8)
 57 | import Prelude hiding (lookup)
 58 | 
 59 | 
 60 | -- Future operations:
 61 | --   - memory-mapped bloomfilter for durability (which of ACID do we get?). See 'vector-mmap' package?
 62 | --     - allow opening mmap-ed file directly from serialized form?
 63 | --   - approximating number of items, and size of union and intersection
 64 | --     - simply sample some number of words, in order to get the accuracy the user requests
 65 | --   - freezing/pure/ST interface (e.g. Data.BloomFilter)
 66 | --     - API: 
 67 | --         - only allow writes in ST (copying for each write is awful)
 68 | --         - provide a fromList that uses ST, for convenience
 69 | --         - querying and combining can be regural pure interface (Semigroup)
 70 | --          
 71 | --   - bulk reads and writes, for performance: (especially good for pure interface fromList, etc.
 72 | --      fromList implementation possibilities:
 73 | --        1 - allocate new
 74 | --          - unsafeInsert all into new (possibly prefetching next block)
 75 | --          - non-threadsafe union with previous
 76 | --        2 - hash and sort (as list or something)
 77 | --          - memcpy previous
 78 | --          - unsafeInsert in order into new
 79 | --        3 - memcpy previous
 80 | --          - unsafeInsert new, manually prefetching next.
 81 | --     - re-order hashes for in-order cache line access (benchmark this)
 82 | --     - consider prefetching
 83 | --     - combine inter-word reads and writes.
 84 | --   - consider a Scalable Bloom Filter -type approach (or variant):
 85 | --     - CAS of linked list of filters
 86 | --     - possible linearizability issues.
 87 | --     - other operations become tricker or not doable.
 88 | --
 89 | --
 90 | -- Future typed interface:
 91 | --   - parameterize by length, or at least have separate Bloom64 (faster, uses only 64-bit hashes) and Bloom128 functions.
 92 | --      - new takes a type-level nat regardless of whether we carry that around in a parameter
 93 | --   - parameterize by k (no big deal being static)
 94 | --   - for sipkey: 
 95 | --       - use NullaryTypeClasses or some more clever solution
 96 | --         "The conﬁgurations problem is to propagate run-time preferences
 97 | --         throughout a program, allowing multiple concurrent conﬁguration sets
 98 | --         to coexist safely under statically guaranteed separation..."
 99 | --         TODO is this relevant for the other type-level params we imagine?
100 | --         TODO can the two be complimentary?: use a singleton class, but instantiate it dynamically with reflection? per:https://www.schoolofhaskell.com/user/thoughtpolice/using-reflection#dynamically-constructing-type-class-instances
101 | --       - Or use a type lit that corresponds to an environment variable, and tag
102 | --         - the user could shoot himself in the foot by changing the environment and break this scheme
103 | --            but that's probably okay.
104 | --         - what if user wants to pass it as a command line arg or something? Is this equally compatible with windows?
105 | --
106 | --   - `new` variant that ensures fast 64-bit version.
107 | --   - deserializing, will have type ... -> Either String (BloomFilter x y z a)
108 | --     - we can always check equality of value level nats by doing natVal when deserializing
109 | 
110 | 
111 | 
112 | -- TODO: expose functions that would allow users to shard inserts and lookups
113 | --       and use non-atomic write. e.g. maybe an unsafeInsert (non-threadsafe
114 | --       write) as well as functions that take a user-supplied hash directly. 
115 | 
116 | 
117 | 
118 | -- | A mutable bloom filter representing a set of 'Hashable' values of type @a@.
119 | --
120 | -- A bloom filter is a set-like probabilistic hash-based data structure.
121 | -- Elements can be 'insert'-ed and 'lookup'-ed again. The bloom filter
122 | -- takes a constant amount of memory regardless of the size and number of
123 | -- elements inserted, however as the bloom filter \"grows\" the likelihood of
124 | -- false-positives being returned by 'lookup' increases. 'fpr' can be used to
125 | -- estimate the expected false-positive rate.
126 | data BloomFilter a = BloomFilter { key :: !SipKey
127 |                                  , k :: !Int
128 |                                  , hash64Enough :: Bool
129 |                                  -- ^ if we need no more than 64-bits we can use the faster 'siphash64_1_3'
130 |                                  , l_minus1 :: !Word64
131 |                                  , log2l :: !Int
132 |                                  , arr :: !(P.MutableByteArray RealWorld)
133 |                                  }
134 | 
135 | 
136 | -- | Exceptions that may be thrown by operations in this library.
137 | newtype BloomFilterException = BloomFilterException String
138 |     deriving (Show, Typeable)
139 | 
140 | instance Exception BloomFilterException
141 | 
142 | throwBloom :: String -> IO a
143 | throwBloom = throwIO . BloomFilterException
144 | 
145 | -- | Create a new bloom filter of elements of type @a@ with the given hash key
146 | -- and parameters. 'fpr' can be useful for calculating the @k@ parameter, or
147 | -- determining a good filter size.
148 | --
149 | -- The parameters must satisfy the following conditions, otherwise a
150 | -- 'BloomFilterException' will be thrown:
151 | --
152 | --   - @k > 0@
153 | --   - @log2l >= 0 && log2l <= wordSizeInBits@
154 | --   - @log2l + k*(logBase 2 wordSizeInBits) <= 128@
155 | --
156 | -- In addition, performance on 64-bit machines will be best when
157 | -- @log2l + k*(logBase 2 wordSizeInBits) <= 64@ where we require only 64 hash
158 | -- bits for each element. (Performance on 32-bit machines will be worse in all
159 | -- cases, as we're doing 64-bit arithmetic.)
160 | --
161 | -- Example: on a 32-bit machine, the following produces a ~4KB bloom filter of
162 | -- @2^10@ 32-bit words, using 3 bits per element:
163 | --
164 | -- @
165 | --  do key <- read <$> getEnv "THE_SECRET_KEY"
166 | --     Bloom.new key 3 10
167 | -- @
168 | new :: SipKey
169 |     -- ^ The secret key to be used for hashing values for this bloom filter.
170 |     -> Int
171 |     -- ^ @k@: Number of independent bits of @w@ to which we map an element. 3 is a good choice.
172 |     -> Int
173 |     -- ^ @log2l@: The size of the filter, in machine words, as a power of 2. e.g. @3@ means @2^3@ or @8@ machine words.
174 |     -> IO (BloomFilter a)
175 | new key k log2l = do
176 |     -- In typed interface all of these conditions hold:
177 |     let !hash64Enough = isHash64Enough log2l k
178 |     checkParamInvariants k log2l
179 | 
180 |     (arr, sizeDataBytes) <- newBloomArr log2l
181 |     P.fillByteArray arr 0 sizeDataBytes (0x00)
182 | 
183 |     return $ BloomFilter { l_minus1 = (2^log2l)-1, .. }
184 | 
185 | -- factored out for deserialization:
186 | checkParamInvariants :: Int -> Int -> IO ()
187 | checkParamInvariants k log2l = do
188 |     unless (k > 0) $
189 |       throwBloom "in 'new', k must be > 0"
190 |     unless (log2l >= 0) $
191 |       throwBloom "in 'new', log2l must be >= 0"
192 | 
193 | -- We leave a buffer at the end of the data portion of the filter large enough
194 | -- to store metadata for serialization, so we don't have to do any copying for
195 | -- ser/deser. But we don't concern ourselves with populating or maintaining
196 | -- metadata except during our serialization and deserialization routines; that
197 | -- memory may be dirty.
198 | newBloomArr :: Int -> IO (P.MutableByteArray RealWorld, Int)
199 | newBloomArr log2l = do
200 |     let !sizeDataBytes = sIZEOF_INT `uncheckedShiftL` log2l
201 |     -- aligned: we assume atomic reads (no word tearing):
202 |     -- pinned: for performance, and so we can "serialize" to bytestring without
203 |     --         copying, and do other future FFI stuff
204 |     arr <- P.newAlignedPinnedByteArray (sizeDataBytes+sIZEOF_METADATA) aLIGNMENT_INT
205 |     return (arr,sizeDataBytes)
206 | 
207 | log2lFromArraySize :: Int{-bytes-} -> IO Int
208 | log2lFromArraySize sz = 
209 |   either throwBloom return $ do
210 |     let dataSzBytes = sz - sIZEOF_METADATA
211 |         (dataSzWds, z) = dataSzBytes `quotRem` sIZEOF_INT
212 |         isPowerOfTwo n = n .&. (n - 1) == 0 -- when n > 0
213 | 
214 |     unless (dataSzBytes >= sIZEOF_INT) $ Left "Array is not large enough to be a serialized bloom filter"
215 |     unless (isPowerOfTwo dataSzWds && z == 0) $ Left "Array is an unexpected size for a serialized bloom filter"
216 |     return $!
217 |       popCount (dataSzWds - 1) -- logBase 2, when isPowerOfTwo dataSzWds
218 |   
219 | 
220 | 
221 | membershipWordAndBits64 :: Hash64 a -> BloomFilter a -> (Int, Int)
222 | {-# INLINE membershipWordAndBits64 #-}
223 | membershipWordAndBits64 !(Hash64 h) = \ !(BloomFilter{ .. }) ->
224 |   assert (isHash64Enough log2l k) $
225 |     -- From right: take all member bits, then take membership word. NOTE: for
226 |     -- union on different size filters to work we must not e.g. take the
227 |     -- leftmost log2l bits; we need lower-order bits to be the same between the
228 |     -- two filters.
229 |     let !memberWord = fromIntegral $
230 |            l_minus1 .&. (h `uncheckedShiftR` fromIntegral (k*log2w))
231 |         !wordToOr = fst $ setKMemberBits 0x00 k h
232 | 
233 |      in (memberWord, wordToOr)
234 | 
235 | membershipWordAndBits128 :: Hash128 a -> BloomFilter a -> (Int, Int)
236 | {-# INLINE membershipWordAndBits128 #-}
237 | membershipWordAndBits128 (Hash128 h_0 h_1) = \(BloomFilter{ .. }) ->
238 |   assert (not $ isHash64Enough log2l k) $
239 |     -- Isolate member word by taking from lowest bits of h_0, take member bits
240 |     -- starting from right of h_1, possibly using leftmost from h_0 which we
241 |     -- splice onto the end as we shift and consume h_1:
242 |     let !memberWord = fromIntegral $ l_minus1 .&. h_0
243 |         !bitsReqd_h_0 = k*log2w - 64
244 | 
245 |         !wordToOr =
246 |            if bitsReqd_h_0 <= 0
247 |              then fst $ setKMemberBits 0x00 k h_1
248 |              else -- we'll shift right just enough so we can OR with the last bit of h_1 shifted right:
249 |                let !bitsReqd_h_0_withOffs = bitsReqd_h_0 + (64 `rem` log2w)
250 |                    !h_0_alignedMasked =
251 |                      -- clear right:
252 |                      (h_0 `uncheckedShiftR` (64 - bitsReqd_h_0)) -- n.b. conditional guards shift
253 |                        -- align at offset:
254 |                        `uncheckedShiftL` (64 - bitsReqd_h_0_withOffs)
255 |                    !initialKToTake = bitsReqd_h_0_withOffs `quot` log2w
256 |                    (!wordToOrPart0, !h_1_shifted) = setKMemberBits 0x00 initialKToTake h_1
257 | 
258 |                 in assert (initialKToTake > 0) $
259 |                      fst $ setKMemberBits wordToOrPart0 (k-initialKToTake) (h_1_shifted.|.h_0_alignedMasked)
260 | 
261 |      in (memberWord, wordToOr)
262 | 
263 | 
264 | -- To promote pipelining, and inlining. This improves lookup and insert by
265 | -- ~15-30% faster depending on which benchmarks we look at and how we squint.
266 | -- We treat this like a macro and expect it to be reduced in the common case
267 | -- where 'k' is a compile-time literal.
268 | setKMemberBits :: Int -> Int -> Word64 -> (Int, Word64)
269 | {-# INLINE setKMemberBits #-}
270 | setKMemberBits !wd 1 !h = 
271 |   ((wd `uncheckedSetBit` (maskLog2w h))
272 | 
273 |   , h `uncheckedShiftR` (log2w*1)
274 |   )
275 | setKMemberBits !wd 2 !h = 
276 |   (((wd `uncheckedSetBit` (maskLog2w h))
277 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` log2w)))
278 | 
279 |   , h `uncheckedShiftR` (log2w*2)
280 |   )
281 | setKMemberBits !wd 3 !h = 
282 |   ((((wd `uncheckedSetBit` (maskLog2w h))
283 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` log2w)))
284 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*2))))
285 | 
286 |   , h `uncheckedShiftR` (log2w*3)
287 |   )
288 | setKMemberBits !wd 4 !h = 
289 |   (((((wd `uncheckedSetBit` (maskLog2w h))
290 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` log2w)))
291 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*2))))
292 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*3))))
293 | 
294 |   , h `uncheckedShiftR` (log2w*4)
295 |   )
296 | setKMemberBits !wd 5 !h = 
297 |   ((((((wd `uncheckedSetBit` (maskLog2w h))
298 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` log2w)))
299 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*2))))
300 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*3))))
301 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*4))))
302 | 
303 |   , h `uncheckedShiftR` (log2w*5)
304 |   )
305 | setKMemberBits !wd 6 !h = 
306 |   (((((((wd `uncheckedSetBit` (maskLog2w h))
307 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` log2w)))
308 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*2))))
309 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*3))))
310 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*4))))
311 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*5))))
312 | 
313 |   , h `uncheckedShiftR` (log2w*6)
314 |   )
315 | {- TODO at this point we see a big performance hit in 7.10 (in e.g. "lookup insert/Int/3 12 (64-bit hash)/lookup x10")
316 |         presumably because the case is big enough that GHC does something different with it, but we need to look at core
317 |         to find out for sure. Maybe we can turn 'k' into an enumeration One | Two | Three at creation, and that might help
318 | setKMemberBits !wd 7 h = 
319 |   ((((((((wd `uncheckedSetBit` (maskLog2w h))
320 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` log2w)))
321 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*2))))
322 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*3))))
323 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*4))))
324 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*5))))
325 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*6))))
326 | 
327 |   , h `uncheckedShiftR` (log2w*7)
328 |   )
329 | setKMemberBits !wd 8 h = 
330 |   (((((((((wd `uncheckedSetBit` (maskLog2w h))
331 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` log2w)))
332 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*2))))
333 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*3))))
334 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*4))))
335 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*5))))
336 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*6))))
337 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*7))))
338 | 
339 |   , h `uncheckedShiftR` (log2w*8)
340 |   )
341 | setKMemberBits !wd 9 h = 
342 |   ((((((((((wd `uncheckedSetBit` (maskLog2w h))
343 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` log2w)))
344 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*2))))
345 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*3))))
346 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*4))))
347 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*5))))
348 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*6))))
349 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*7))))
350 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*8))))
351 | 
352 |   , h `uncheckedShiftR` (log2w*9)
353 |   )
354 | setKMemberBits !wd 10 h = 
355 |   (((((((((((wd `uncheckedSetBit` (maskLog2w h))
356 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` log2w)))
357 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*2))))
358 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*3))))
359 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*4))))
360 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*5))))
361 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*6))))
362 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*7))))
363 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*8))))
364 |     `uncheckedSetBit` (maskLog2w (h `uncheckedShiftR` (log2w*9))))
365 | 
366 |   , h `uncheckedShiftR` (log2w*10)
367 |   )
368 | -}
369 | -- 10 is all we should ever need since 11*log2w is always > w. If we unroll to
370 | -- 10 this ought to be unreachable, in fact:
371 | setKMemberBits !wd !k !h = setKMemberBitsRolled wd k h where
372 | 
373 | -- The non-unrolled version we fall back to, exposed for testing:
374 | setKMemberBitsRolled :: Int -> Int -> Word64 -> (Int, Word64)
375 | # ifdef ASSERTIONS_ON
376 | -- work around simplifier ticks exhausted bullshit, when compiling tests
377 | # else
378 | {-# INLINE setKMemberBitsRolled #-}
379 | # endif
380 | setKMemberBitsRolled !wd !k !h = go wd k h where
381 |   go wd' 0  h' = (wd', h')
382 |   go wd' k' h' = 
383 |       -- possible cast to 32-bit Int but we only need rightmost 5 or 6 bits:
384 |     let !memberBit = fromIntegral h' .&. maskLog2wRightmostBits
385 |      in go (wd' `uncheckedSetBit` memberBit) (k'-1) (h' `uncheckedShiftR` log2w)
386 | 
387 | maskLog2w :: Word64 -> Int
388 | {-# INLINE maskLog2w #-}
389 | maskLog2w !h = fromIntegral h .&. maskLog2wRightmostBits
390 | 
391 | 
392 | membershipWordAndBitsFor :: (Hashable a)=> BloomFilter a -> a -> (Int, Int)
393 | {-# INLINE membershipWordAndBitsFor #-}
394 | membershipWordAndBitsFor bloom@(BloomFilter{..}) a
395 |     | hash64Enough = membershipWordAndBits64  (siphash64_1_3  key a) bloom
396 |     | otherwise    = membershipWordAndBits128 (siphash128 key a) bloom
397 | 
398 | 
399 | -- True if we can get enough hash bits from a Word64, and a runtime check
400 | -- sanity check of our arguments to 'new'. This is probably in "enough for
401 | -- anyone" territory currently:
402 | isHash64Enough :: Int -> Int -> Bool
403 | {-# INLINE isHash64Enough #-}
404 | isHash64Enough log2l k =
405 |     let bitsReqd = log2l + k*log2w
406 |      in if bitsReqd > 128
407 |           then throw $ BloomFilterException "The passed parameters require over the maximum of 128 hash bits supported. Make sure: (log2l + k*(logBase 2 wordSizeInBits)) <= 128"
408 |           else if (log2l > wordSizeInBits)
409 |                  then throw $ BloomFilterException "You asked for (log2l > 64). We have no way to address memory in that range, and anyway that's way too big."
410 |                  else bitsReqd <= 64
411 | 
412 | maskLog2wRightmostBits :: Int -- 2^log2w - 1
413 | maskLog2wRightmostBits | sIZEOF_INT == 8 = 63
414 |                        | otherwise       = 31
415 | 
416 | wordSizeInBits :: Int
417 | wordSizeInBits = sIZEOF_INT * 8
418 | 
419 | log2w :: Int -- logBase 2 wordSizeInBits
420 | log2w | sIZEOF_INT == 8 = 6
421 |       | otherwise       = 5
422 | 
423 | uncheckedSetBit :: Int -> Int -> Int
424 | {-# INLINE uncheckedSetBit #-}
425 | uncheckedSetBit !x !i = x .|. (1 `uncheckedShiftL` i)
426 | 
427 | uncheckedShiftR :: (Num a, FiniteBits a, Ord a) => a -> Int -> a
428 | {-# INLINE uncheckedShiftR #-}
429 | uncheckedShiftR !a = \ !x->
430 |   assert (a >= 0) $ -- make sure we don't smear sign w/ a bad fromIntegral cast
431 |   assert (x < finiteBitSize a) $
432 |   assert (x >= 0) $
433 |     a `BitsHidden.unsafeShiftR` x
434 | uncheckedShiftL :: (Num a, FiniteBits a, Ord a) => a -> Int -> a
435 | {-# INLINE uncheckedShiftL #-}
436 | uncheckedShiftL !a = \ !x->
437 |   assert (a >= 0) $
438 |   assert (x < finiteBitSize a) $
439 |   assert (x >= 0) $
440 |     a `BitsHidden.unsafeShiftL` x
441 | 
442 | 
443 | -- | /O(size_of_element)/. Atomically insert a new element into the bloom
444 | -- filter.
445 | --
446 | -- This returns 'True' if the element /did not exist/ before the insert, and
447 | -- 'False' if the element did already exist (subject to false-positives; see
448 | -- 'lookup'). Note that this is reversed from @lookup@.
449 | insert :: Hashable a=> BloomFilter a -> a -> IO Bool
450 | {-# INLINE insert #-}
451 | insert bloom@(BloomFilter{..}) = \a-> do
452 |     let (!memberWord, !wordToOr) = membershipWordAndBitsFor bloom a
453 |     oldWord <- fetchOrIntArray arr memberWord wordToOr
454 |     return $! (oldWord .|. wordToOr) /= oldWord
455 | 
456 | -- | /O(size_of_element)/. Look up the value in the bloom filter, returning
457 | -- 'True' if the element is possibly in the set, and 'False' if the element is
458 | -- /certainly not/ in the set.
459 | --
460 | -- The likelihood that this returns 'True' on an element that was not
461 | -- previously 'insert'-ed depends on the parameters the filter was created
462 | -- with, and the number of elements already inserted. The 'fpr' function can
463 | -- help you estimate this.
464 | lookup :: Hashable a=> BloomFilter a -> a -> IO Bool
465 | {-# INLINE lookup #-}
466 | lookup bloom@(BloomFilter{..}) = \a-> do
467 |     let (!memberWord, !wordToOr) = membershipWordAndBitsFor bloom a
468 |     existingWord <- P.readByteArray arr memberWord
469 |     return $! (existingWord .|. wordToOr) == existingWord
470 | 
471 | 
472 | 
473 | 
474 | -- | /O(l_src+l_target)/. Write all elements in the first bloom filter into the
475 | -- second. This operation is lossless; ignoring writes to the source bloom
476 | -- filter that happen during this operation (see below), the target bloom
477 | -- filter will be identical to the filter produced had the elements been
478 | -- inserted into the target originally.
479 | --
480 | -- The source and target must have been created with the same key and
481 | -- @k@-value. In addition the target must not be larger (i.e. the @l@-value)
482 | -- than the source, /and/ they must both use 128/64 bit hashes. This throws a
483 | -- 'BloomFilterException' when those constraints are not met.
484 | --
485 | -- This operation is not linearizable with respect to 'insert'-type operations;
486 | -- elements being written to the source bloomfilter during this operation may
487 | -- or may not make it into the target "at random".
488 | unionInto :: BloomFilter a -- ^ Source, left unmodified.
489 |           -> BloomFilter a -- ^ Target, receiving elements from source.
490 |           -> IO ()
491 | unionInto = combine fetchOrIntArray
492 | 
493 | 
494 | 
495 | -- | /O(l_src+l_target)/. Make @target@ the intersection of the source and
496 | -- target sets.  This operation is "lossy" in that the false positive ratio of
497 | -- target after the operation may be higher than if the elements forming the
498 | -- intersection had been 'insert'-ed directly into target.
499 | --
500 | -- The constraints and comments re. linearizability in 'unionInto' also apply
501 | -- here.
502 | intersectionInto :: BloomFilter a -- ^ Source, left unmodified.
503 |                  -> BloomFilter a -- ^ Target, receiving elements from source.
504 |                  -> IO ()
505 | intersectionInto = combine fetchAndIntArray
506 | 
507 | 
508 | -- internal
509 | combine :: (P.MutableByteArray RealWorld -> Int -> Int -> IO x)
510 |         -> BloomFilter a -> BloomFilter a -> IO ()
511 | {-# INLINE combine #-}
512 | combine f = \src target -> do
513 |     unless (key src == key target) $ throwBloom $
514 |       "SipKey of the source BloomFilter does not match target"
515 |     unless (k src == k target) $ throwBloom $
516 |       "k of the source BloomFilter does not match target"
517 |     unless (log2l src >= log2l target) $ throwBloom $
518 |       "log2l of the source BloomFilter is smaller than the target"
519 |     unless (hash64Enough src == hash64Enough target) $ throwBloom $
520 |       "either the source or target BloomFilter requires 128 hash bits while the other requires 64"
521 | 
522 |     let target_l_minus1 = fromIntegral $ l_minus1 target
523 |         src_l_minus1 = fromIntegral $ l_minus1 src
524 | 
525 |     -- unless source and target are the same size we must "shrink" source to
526 |     -- size of target onto an intermediate array first (necessary to support
527 |     -- the AND for intersection, and also faster because it uses fewer atomic
528 |     -- primops onto target):
529 |     srcArrShrunk <-
530 |       if target_l_minus1 == src_l_minus1
531 |         then return (arr src)
532 |         else assert (target_l_minus1 < src_l_minus1) $ do
533 |           (srcArrShrunk, sizeDataBytes) <- newBloomArr $ log2l target
534 |           -- initialize new array with an efficient copy of first chunk from
535 |           -- source:
536 |           P.copyMutableByteArray srcArrShrunk 0 (arr src) 0 sizeDataBytes
537 | 
538 |           forM_ [(target_l_minus1+1).. src_l_minus1] $ \srcWordIx -> do
539 |             let !targetWordIx = srcWordIx .&. target_l_minus1
540 |             srcWord <- P.readByteArray (arr src) srcWordIx
541 |             assert (targetWordIx <= target_l_minus1) $
542 |               nonatomicFetchOrIntArray srcArrShrunk targetWordIx srcWord
543 | 
544 |           assert (P.sizeofMutableByteArray srcArrShrunk ==
545 |                   P.sizeofMutableByteArray (arr target)) $
546 |             return srcArrShrunk
547 | 
548 |     forM_ [0.. target_l_minus1] $ \ix -> do
549 |       srcWord <- P.readByteArray srcArrShrunk ix
550 |       f (arr target) ix srcWord
551 | 
552 | -- | Create a copy of the input @BloomFilter@.
553 | --
554 | -- This operation is not linearizable with respect to 'insert'-type operations;
555 | -- elements being written to the source bloomfilter during this operation may
556 | -- or may not make it into the target "at random".
557 | clone :: BloomFilter a -> IO (BloomFilter a)
558 | clone BloomFilter{..} = do
559 |     (arrCopy, sizeDataBytes) <- newBloomArr log2l
560 |     P.copyMutableByteArray arrCopy 0 arr 0 sizeDataBytes
561 |     return $ 
562 |       BloomFilter { arr = arrCopy, .. }
563 | 
564 | 
565 | nonatomicFetchOrIntArray :: P.MutableByteArray RealWorld -> Int -> Int -> IO Int
566 | nonatomicFetchOrIntArray ar ix wd = do
567 |   !before <- P.readByteArray ar ix
568 |   P.writeByteArray ar ix (before .|. wd)
569 |   return before
570 | 
571 | 
572 | {-
573 | -- This is the corrected equation from 'Supplementary File: A Comment on “Fast
574 | -- Bloom Filters and Their Generalization”'. Unfortunately since we're going to need to approximate factorial even to calculate this.
575 | -- compute this without moving into log space and probably approximating the
576 | -- factorials which might defeat the purpose of using this more accurate
577 | -- function to begin with.
578 | fpr :: Int  -- ^ @n@: Number of elements in filter
579 |     -> Int  -- ^ @l@: Size of filter, in machine words
580 |     -> Int  -- ^ @k@: Number of bits to map an element to
581 |     -> Float
582 | fpr nI lI kI =
583 |   let w = 64 -- TODO word-size in bits
584 |       n = fromIntegral nI
585 |       l = fromIntegral lI
586 |       k = fromIntegral kI
587 |    in summation 0 n $ \x->
588 |         (combination n x) *
589 |         ((1/l) ** x) *
590 |         ((1 - (1/l)) ** (n-x)) *
591 |         (factorial w / (w ** k*(x+1)) ) *
592 |         (summation 1 w $ \i->
593 |             (summation 1 i $ \j->
594 |                 ((j ** k*x) * i**k) /
595 |                 (factorial (w-i) * factorial j * factorial (i-j))
596 |             )
597 |         )
598 | -}
599 | 
600 | -- This is my attempt at translating the FPR equation from "Fast Bloom Filters
601 | -- and their Generalizations" with the following modifications:
602 | --   - calculations within summation performed in log space
603 | --   - use Stirling's approximation of factorial for larger `n`
604 | --   - scale `n` and `l` together for large `n`; the paper graphs FPR against
605 | --      this ratio so I guess this is justified, and it seems to work.
606 | 
607 | -- | An estimate of the false-positive rate for a bloom-1 filter. For a filter
608 | -- with the provided parameters and having @n@ elements, the value returned
609 | -- here is the percentage, over the course of many queries for elements /not/ in
610 | -- the filter, of those queries which would be expected to return an incorrect
611 | -- @True@ result.
612 | --
613 | -- This function is slow but the complexity is bounded and can accept inputs of
614 | -- any size.
615 | fpr :: Int  -- ^ @n@: Number of elements in filter
616 |     -> Int  -- ^ @l@: Size of filter, in machine words
617 |     -> Int  -- ^ @k@: Number of bits to map an element to
618 |     -> Int  -- ^ @w@: word-size in bits (e.g. 32 or 64)
619 |     -> Double
620 | fpr nI lI kI wI =
621 |     summation 0 n $ \x->
622 |       e ** (
623 |         (logCombination n x) +
624 |         (x * (negate $ log l)) +
625 |         ((n-x) * (log(l-1) - log l)) +
626 |         (k* log(1 - ((1 - (1/w)) ** (x*k))))
627 |         )
628 | 
629 |   where
630 |     n = min 32000 (fromIntegral nI)
631 |     l = fromIntegral lI * (n/fromIntegral nI)
632 | 
633 |     k = fromIntegral kI
634 |     w = fromIntegral wI
635 |     e = exp 1
636 |     --     / x \
637 |     -- log \ y /
638 |     logCombination x y
639 |         | y <= x    = logFactorial x - (logFactorial y + logFactorial (x - y))
640 |         | otherwise = log 0 -- TODO okay?
641 | 
642 |     logFactorial x
643 |         -- TODO memoize in array:
644 |         | x <= 500  = sum $ map log [1..x]
645 |          -- else use Stirling's approximation when error doesn't seem to affect
646 |         | otherwise = x * log x - x + log (sqrt (2*pi*x))
647 | 
648 |     summation low hi = sum . \f-> map f [low..hi]
649 | 
650 | 
651 | 
652 | -- ------------------------------------------------------------------
653 | -- Serialization
654 | -- ------------------------------------------------------------------
655 | 
656 | 
657 | -- For now we just prepare to throw an error if the 
658 | sIZEOF_METADATA, mETADATA_WORDS :: Int
659 | sIZEOF_METADATA = 8*mETADATA_WORDS
660 | mETADATA_WORDS = 8
661 | 
662 | sERIALIZATION_VERSION :: Word64
663 | sERIALIZATION_VERSION = 0
664 |   
665 | 
666 | -- Return metadata for serialization from the ADT
667 | metadataBytes :: StableHashable a=> BloomFilter a -> [Word8]
668 | metadataBytes bl@BloomFilter{..} = 
669 |   let keyHash = hashSipKey key
670 |       bs = 
671 |        [ sERIALIZATION_VERSION -- serialization format version
672 |        , tpHashOf bl           -- for verifying we deserialize to the correct element type
673 |        , hashWord64 keyHash    -- for verifying key (we don't wish to store it)
674 |        , tpHashOf keyHash      -- ensures sanity of the hashing of our key
675 |        , fromIntegral wordSizeInBits
676 |        , fromIntegral k 
677 |        , fromIntegral log2l 
678 |        , 0x0000                -- some padding for the hell of it; we can make the above more compact later too, if we want forward compatibility.
679 |        ] >>= bytes64
680 |    in assert (length bs == sIZEOF_METADATA) $
681 |        bs
682 | 
683 | -- A client may be concerned about keeping her SipKey secret; we have two decent options: 
684 | --   1. store the key in the serialized bloom filter, and force the user to use encryption
685 | --   2. make the user handle managing keys, and store a hash of the key to
686 | --      ensure sanity when the user provides the key again for deserialization
687 | -- We've chosen (2), mostly because if we omit the key the filter is completely
688 | -- opaque and secure, and we wish to experiment with mmap which a user of
689 | -- encryption couldn't use if they wanted to make sure their key never landed
690 | -- on disk.
691 | --
692 | -- We hash the key with itself and presume (read "hope") that this doesn't leak
693 | -- any information about the key:
694 | hashSipKey :: SipKey -> Hash64 (Word64, Word64)
695 | hashSipKey k@(SipKey w0 w1) = siphash64 k (w0, w1)
696 | 
697 | populateMetadata :: StableHashable a=> BloomFilter a -> IO ()
698 | populateMetadata b@BloomFilter{..} = do
699 |     let !sizeDataBytes = sIZEOF_INT `uncheckedShiftL` log2l
700 |     assert (P.sizeofMutableByteArray arr == sizeDataBytes + sIZEOF_METADATA) $ return ()
701 |     forM_ (zip [sizeDataBytes..] $ metadataBytes b) $ 
702 |         uncurry (P.writeByteArray arr)
703 | 
704 | -- | Serialize a bloom filter to a strict @ByteString@, which can be
705 | -- 'deserialize'-ed once again. Only a hash of the 'SipKey' is stored in the
706 | -- serialized format.
707 | --
708 | -- This operation is not linearizable with respect to 'insert'-type operations;
709 | -- elements being written to the source bloomfilter during this operation may
710 | -- or may not make it into the serialized @ByteString@ "at random".
711 | serialize :: StableHashable a=> BloomFilter a -> IO ByteString
712 | serialize bl = clone bl >>= unsafeSerialize
713 | 
714 | -- | Serialize a bloom filter to a strict @ByteString@, which can be
715 | -- 'deserialize'-ed once again. Only a hash of the 'SipKey' is stored in the
716 | -- serialized format. This operation is very fast and does no copying.
717 | --
718 | -- This is unsafe in that the source @BloomFilter@ must not be modified after
719 | -- this operation, otherwise the ByteString will change, breaking referential
720 | -- transparency. Use 'serialize' if uncertain.
721 | unsafeSerialize :: StableHashable a=> BloomFilter a -> IO ByteString
722 | unsafeSerialize b@BloomFilter{..} = do
723 |     populateMetadata b
724 |     let addr = (\(Addr x)-> x) $ P.mutableByteArrayContents arr
725 |         arr' = (\(P.MutableByteArray x) -> x) arr
726 |     return $ 
727 |       PS (ForeignPtr addr (PlainPtr arr')) 0 (P.sizeofMutableByteArray arr)
728 | 
729 | -- | Deserialize a 'BloomFilter' from a @ByteString@ created with 'serialize'
730 | -- or 'unsafeSerialize'. The key that was used to create the bloom filter is
731 | -- not stored for security, and must be provided here. However if the key
732 | -- provided does not match the key it was originally created with, a
733 | -- 'BloomFilterException' will be thrown.
734 | deserialize :: StableHashable a=> SipKey -> ByteString -> IO (BloomFilter a)
735 | deserialize key (PS fp@(ForeignPtr _ arrWrapped) off len) = do
736 |     log2l <- log2lFromArraySize len
737 |     -- It would be possible to create an 'unsafeDeserialize' which could
738 |     -- deserialize without this extra copy, where 'off' and 'len' are unused
739 |     -- (i.e. we can use the MutableByteArray directly), however we still have
740 |     -- the issue of finalizers; I think we would need to keep the ForeignPtr
741 |     -- around and make sure to touch it. However we can still offer our own IO
742 |     -- functions (e.g. an mmap routine) that does no extra copying.
743 |     (arr, _) <- newBloomArr log2l
744 | 
745 |     -- Copy ByteString data to a fresh MutableByteArray:
746 |     case arrWrapped of
747 |         PlainPtr  arrDirty   -> P.copyMutableByteArray arr 0 (P.MutableByteArray arrDirty) off len
748 |         MallocPtr arrDirty _ -> P.copyMutableByteArray arr 0 (P.MutableByteArray arrDirty) off len
749 |         -- If we don't have access to the MutableByteArray we do a slow
750 |         -- byte-at-a-time copy:
751 |         _ -> withForeignPtr fp $ \ptr->
752 |                forM_ (zip (take len [off..]) [0..]) $ \(ptrBytOff,targetBytIx)->
753 |                  peekElemOff ptr ptrBytOff >>= P.writeByteArray arr targetBytIx
754 | 
755 |     touchForeignPtr fp
756 |     deserializeByteArray key arr
757 | 
758 | 
759 | -- | A low-level deserialization routine. This is very fast, and does no copying.
760 | deserializeByteArray :: forall a. StableHashable a=> SipKey -> P.MutableByteArray RealWorld -> IO (BloomFilter a)
761 | deserializeByteArray key arr = do
762 |   let len = P.sizeofMutableByteArray arr
763 |   log2lActual <- log2lFromArraySize len
764 |   let metadataBytesIx = sIZEOF_INT `uncheckedShiftL` log2lActual
765 |   assert (metadataBytesIx + sIZEOF_METADATA == len) $ return ()
766 |   -- read bytes-at-a-time (endianness) and reconstruct metadata:
767 |   byts <- forM (take sIZEOF_METADATA [metadataBytesIx..]) $ P.readByteArray arr
768 |   let go [] = []
769 |       go (b0:b1:b2:b3:b4:b5:b6:b7:bs) = unbytes64 [b0,b1,b2,b3,b4,b5,b6,b7] : go bs
770 |       go _ = error "Bug: somehow sIZEOF_METADATA could not be chunked evenly into Word64s"
771 |   case go byts of
772 |     m@[version,tpHashBlParam,keyHash,tpHashKeyHash,wdSzBits,k64,log2l64,_pad] -> 
773 |       assert (length m == mETADATA_WORDS) $ do
774 |         let log2l = fromIntegral log2l64 
775 |             k = fromIntegral k64
776 |             hash64Enough = isHash64Enough log2l k
777 |             l_minus1 = (2^log2l)-1
778 |             blDirty :: BloomFilter a
779 |             blDirty = BloomFilter{..} -- defined here so we can use as proxy for param below.
780 | 
781 |         let check b = unless b . throwBloom
782 | 
783 |         check (version == sERIALIZATION_VERSION) $ 
784 |           if version > sERIALIZATION_VERSION
785 |             then "This bloomfilter was serialized with a new version of unagi-bloomfilter than the one in use."
786 |             else "This bloomfilter was serialized with an older and incompatible version of unagi-bloomfilter than the one in use."
787 |             -- This for now but we will offer forward compatibility, if possible, should serialization ever need to change.
788 |         check (tpHashBlParam == tpHashOf blDirty)
789 |           "This serialized bloom filter contained elements of a different type than you were expecting, or was created with an incompatible Hashable instance. See StableHashable."
790 |         let keyHashExpected = hashSipKey key
791 |             tpHashKeyHashExpected = tpHashOf keyHashExpected
792 |         check (tpHashKeyHashExpected == tpHashKeyHash) 
793 |           "Could not validate key. This serialized bloom filter was created with an incompatible Hashable instance. See StableHashable."
794 |         check (keyHash == hashWord64 keyHashExpected)
795 |           "The supplied key does not match the key that was used to create the serialized bloom filter."
796 |         check (fromIntegral wdSzBits == wordSizeInBits) $ 
797 |           "Serialized bloom filters are not currently cross-architecture compatible. Word size in bits when the filter was created was "++(show wdSzBits)++", but on the local machine is "++(show wordSizeInBits)
798 |         check (fromIntegral k == k64 && fromIntegral log2l == log2l64) $
799 |           "k or log2l could not fit in Int. This indicates corruption, or a bug: "++(show (k,k64,log2l,log2l64))
800 |         checkParamInvariants k log2l
801 | 
802 |         return blDirty
803 |         
804 |     _ -> error "Bug: somehow we returned the wrong number of metadata words"
805 |       
806 | 
807 |   
808 | tpHashOf :: StableHashable a => proxy a -> Word64
809 | tpHashOf = typeHashWord . typeHashOfProxy
810 | 
811 | bytes64 :: Word64 -> [Word8]
812 | {-# INLINE bytes64 #-}
813 | bytes64 wd = [ shifted 56, shifted 48, shifted 40, shifted 32
814 |              , shifted 24, shifted 16, shifted 8, fromIntegral wd]
815 |      where shifted = fromIntegral . uncheckedShiftR wd
816 | 
817 | unbytes64 :: [Word8] -> Word64
818 | {-# INLINE unbytes64 #-}
819 | unbytes64 [b0,b1,b2,b3,b4,b5,b6,b7] = 
820 |     unshifted b0 56 .|.  unshifted b1 48 .|.  unshifted b2 40 .|.  unshifted b3 32  .|.
821 |     unshifted b4 24 .|.  unshifted b5 16 .|.  unshifted b6 8 .|.  fromIntegral b7
822 |    where unshifted = uncheckedShiftL . fromIntegral
823 | unbytes64 _ = error "unbytes64"
824 |     
825 | 
826 | 
827 | -- ------------------------------------------------------------------
828 | -- Etc.
829 | -- ------------------------------------------------------------------
830 | 
831 | 
832 | # ifdef EXPORT_INTERNALS
833 | -- This could go anywhere, and lets us ensure that assertions are turned on
834 | -- when running test suite.
835 | assertionCanary :: IO Bool
836 | assertionCanary = do
837 |     assertionsWorking <- try $ assert False $ return ()
838 |     return $
839 |       case assertionsWorking of
840 |            Left (AssertionFailed _) -> True
841 |            _                        -> False
842 | # endif
843 | 


--------------------------------------------------------------------------------